1#!/usr/bin/env python
2#
3# Copyright 2016 The Android Open Source Project. All Rights Reserved.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#    http://www.apache.org/licenses/LICENSE-2.0
10#
11#    Unless required by applicable law or agreed to in writing, software
12#    distributed under the License is distributed on an "AS IS" BASIS,
13#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14#    See the License for the specific language governing permissions and
15#    limitations under the License.
16#
17
18"""Generate a C++ data table containing locale data."""
19
20import collections
21import glob
22import os.path
23import sys
24
25
def get_locale_parts(locale):
    """Split a locale into its language, script, and region parts.

    Args:
        locale: An underscore-separated locale name such as 'en',
            'en_US', 'sr_Latn' or 'zh_Hant_TW'.

    Returns:
        A (language, script, region) tuple. Parts that are absent from
        the input are returned as None.
    """
    subtags = locale.split('_')
    count = len(subtags)

    if count >= 3:
        # Anything longer than language_script_region is unexpected.
        assert count == 3
        return tuple(subtags)

    language = subtags[0]
    if count == 1:
        return (language, None, None)

    # Two subtags: a four-character second subtag is treated as a script,
    # anything else as a region.
    second = subtags[1]
    if len(second) == 4:
        return (language, second, None)
    return (language, None, second)
39
40
def read_likely_subtags(input_file_name):
    """Read and parse ICU's likelySubtags.txt.

    Args:
        input_file_name: Path to the likelySubtags.txt file.

    Returns:
        A (likely_script_dict, representative_locales) tuple, where
        likely_script_dict maps each script-less source locale to the
        script of its likely expansion, and representative_locales is a
        frozenset of the expanded locales whose source had no region and
        whose expanded region is neither '001' nor 'ZZ'. Both also carry
        the hard-coded Android additions below.
    """
    likely_script_dict = {
        # Android's additions for pseudo-locales. These internal codes make
        # sure that the pseudo-locales would not match other English or
        # Arabic locales. (We can't use private-use ISO 15924 codes, since
        # they may be used by apps for other purposes.)
        "en_XA": "~~~A",
        "ar_XB": "~~~B",
        # Removed data from later versions of ICU
        "ji": "Hebr", # Old code for Yiddish, still used in Java and Android
    }
    representative_locales = {
        # Android's additions
        "en_Latn_GB", # representative for en_Latn_001
        "es_Latn_MX", # representative for es_Latn_419
        "es_Latn_US", # representative for es_Latn_419 (not the best idea,
                      # but Android has been shipping with it for quite a
                      # while. Fortunately, MX < US, so if both exist, MX
                      # would be chosen.)
    }
    # Read raw bytes and decode explicitly: the Python 2-only unicode()
    # builtin is not needed, and the result does not depend on the
    # platform's default text encoding.
    with open(input_file_name, 'rb') as input_file:
        for raw_line in input_file:
            # Drop surrounding blanks, line terminators and a leading BOM.
            # '\r' is included because binary mode does no newline
            # translation.
            line = raw_line.decode('UTF-8').strip(u' \r\n\uFEFF')
            if str is bytes:
                # Python 2: go back to native byte strings so the parsed
                # keys have the same type as the literal keys above.
                line = line.encode('UTF-8')
            if line.startswith('//'):
                continue  # comment line
            if '{' not in line or '}' not in line:
                continue  # not a 'from{"to"}' mapping line
            from_locale = line[:line.index('{')]
            to_locale = line[line.index('"')+1:line.rindex('"')]
            from_lang, from_scr, from_region = get_locale_parts(from_locale)
            _, to_scr, to_region = get_locale_parts(to_locale)
            if from_lang == 'und':
                continue  # not very useful for our purposes
            if from_region is None and to_region not in ('001', 'ZZ'):
                representative_locales.add(to_locale)
            if from_scr is None:
                likely_script_dict[from_locale] = to_scr
    return likely_script_dict, frozenset(representative_locales)
79
80
81# From packLanguageOrRegion() in ResourceTypes.cpp
def pack_language_or_region(inp, base):
    """Pack a language or region code into a two-byte tuple.

    Args:
        inp: None, a two-character code, or a three-character code.
        base: The character that three-character codes are measured
            against; each character is stored as its offset from it.

    Returns:
        (0, 0) for None; the two character ordinals for a two-character
        code; otherwise a pair of bytes holding the three offsets, with
        the top bit of the first byte set.
    """
    if inp is None:
        return (0, 0)

    if len(inp) == 2:
        high_char, low_char = inp
        return (ord(high_char), ord(low_char))

    assert len(inp) == 3
    origin = ord(base)
    first, second, third = [ord(char) - origin for char in inp]

    # First byte: marker bit, third offset, and the upper bits of the
    # second offset. Second byte: the lower bits of the second offset
    # followed by the first offset, truncated to eight bits.
    high_byte = 0x80 | (third << 2) | (second >> 3)
    low_byte = ((second << 5) | first) & 0xFF
    return (high_byte, low_byte)
97
98
99# From packLanguage() in ResourceTypes.cpp
def pack_language(language):
    """Return the two-byte packed form of a language code.

    Three-letter codes are stored as offsets from 'a'.
    """
    return pack_language_or_region(language, base='a')
103
104
105# From packRegion() in ResourceTypes.cpp
def pack_region(region):
    """Return the two-byte packed form of a region code.

    Three-character codes are stored as offsets from '0'.
    """
    return pack_language_or_region(region, base='0')
109
110
def pack_to_uint32(locale):
    """Pack the language and region of a locale into a 32-bit integer.

    The two packed language bytes occupy the upper half of the result and
    the two packed region bytes the lower half. Any script in the locale
    is ignored.
    """
    language, _, region = get_locale_parts(locale)
    lang_high, lang_low = pack_language(language)
    region_high, region_low = pack_region(region)
    language_bits = (lang_high << 24) | (lang_low << 16)
    region_bits = (region_high << 8) | region_low
    return language_bits | region_bits
117
118
def dump_script_codes(all_scripts):
    """Dump the SCRIPT_CODES table to standard output.

    Args:
        all_scripts: Sequence of four-character script codes. Each entry's
            position is written alongside it as a comment.
    """
    # The single-argument parenthesised form of print produces the same
    # output under the Python 2 print statement and the Python 3 print()
    # function; print('') stands in for a bare `print`.
    print('const char SCRIPT_CODES[][4] = {')
    for index, script in enumerate(all_scripts):
        print("    /* %-2d */ {'%c', '%c', '%c', '%c'}," % (
            index, script[0], script[1], script[2], script[3]))
    print('};')
    print('')
127
128
def dump_script_data(likely_script_dict, all_scripts):
    """Dump the LIKELY_SCRIPTS map to standard output.

    Args:
        likely_script_dict: Maps a locale name to its likely script code.
        all_scripts: Sequence of script codes; each script is emitted as
            its index in this sequence.

    Raises:
        ValueError: If a script in likely_script_dict is not present in
            all_scripts.
    """
    # Single-argument parenthesised print works identically on Python 2
    # and Python 3; print('') replaces a bare `print`.
    print('')
    print('const std::unordered_map<uint32_t, uint8_t> LIKELY_SCRIPTS({')
    for locale in sorted(likely_script_dict.keys()):
        script = likely_script_dict[locale]
        print('    {0x%08Xu, %2du}, // %s -> %s' % (
            pack_to_uint32(locale),
            all_scripts.index(script),
            locale.replace('_', '-'),
            script))
    print('});')
141
142
def pack_to_uint64(locale):
    """Pack a full locale into a 64-bit unsigned integer.

    The upper 32 bits hold the packed language and region; the lower 32
    bits hold the four script characters, first character in the highest
    byte. The locale must therefore include a script.
    """
    _, script, _ = get_locale_parts(locale)
    upper_half = pack_to_uint32(locale) << 32
    script_bits = 0
    for position in range(4):
        script_bits = (script_bits << 8) | ord(script[position])
    return upper_half | script_bits
151
152
def dump_representative_locales(representative_locales):
    """Dump the REPRESENTATIVE_LOCALES set to standard output.

    Args:
        representative_locales: Iterable of full locale names (language,
            script and region); they are emitted in sorted order, each
            packed into a 64-bit value with its name as a comment.
    """
    # Single-argument parenthesised print works identically on Python 2
    # and Python 3; print('') replaces a bare `print`.
    print('')
    print('std::unordered_set<uint64_t> REPRESENTATIVE_LOCALES({')
    for locale in sorted(representative_locales):
        print('    0x%08XLLU, // %s' % (
            pack_to_uint64(locale),
            locale))
    print('});')
162
163
def read_and_dump_likely_data(icu_data_dir):
    """Read ICU's likely-subtags data and dump the tables derived from it.

    Args:
        icu_data_dir: ICU data directory containing misc/likelySubtags.txt.

    Returns:
        The likely-script dictionary that was read.
    """
    subtags_path = os.path.join(icu_data_dir, 'misc', 'likelySubtags.txt')
    parsed = read_likely_subtags(subtags_path)
    likely_script_dict, representative_locales = parsed

    distinct_scripts = set(likely_script_dict.values())
    # Script indices are emitted as uint8_t values.
    assert len(distinct_scripts) <= 256
    all_scripts = sorted(distinct_scripts)

    dump_script_codes(all_scripts)
    dump_script_data(likely_script_dict, all_scripts)
    dump_representative_locales(representative_locales)
    return likely_script_dict
178
def escape_script_variable_name(script):
    """Make a script code usable inside a C++ variable name.

    Every '~' in the input is replaced by '_'.
    """
    return "_".join(script.split("~"))
182
def read_parent_data(icu_data_dir):
    """Read locale parent data from ICU data files.

    Every '*.txt' file one directory below icu_data_dir is scanned; the
    file's base name, without its extension, is taken as the locale.

    Args:
        icu_data_dir: Root of the ICU data directory.

    Returns:
        A dict mapping a locale name to the name of its parent locale.
    """
    pattern = os.path.join(icu_data_dir, '*', '*.txt')
    parents = {}
    for path in glob.glob(pattern):
        file_name = os.path.basename(path)
        locale, _ = os.path.splitext(file_name)
        is_arabic_variant = locale.startswith('ar_')
        with open(path) as data:
            for line in data:
                if '%%Parent' in line:
                    start = line.index('"') + 1
                    end = line.rindex('"')
                    declared = line[start:end]
                    # Record the parent on first sight; different files
                    # shouldn't have different parent info afterwards.
                    known = parents.setdefault(locale, declared)
                    assert known == declared
                elif is_arabic_variant and 'default{"latn"}' in line:
                    # Arabic parent overrides for ASCII digits. Since
                    # Unicode extensions are not supported in
                    # ResourceTypes, we will use ar-015 (Arabic, Northern
                    # Africa) instead of the more correct ar-u-nu-latn.
                    parents[locale] = 'ar_015'
    return parents
205
206
def get_likely_script(locale, likely_script_dict):
    """Return the likely script for a locale.

    Args:
        locale: Underscore-separated locale name.
        likely_script_dict: Maps locale or language names to scripts.

    Returns:
        The script spelled out in a three-part locale; otherwise the
        dictionary entry for the locale itself or, failing that, the
        entry for its language.

    Raises:
        KeyError: If neither the locale nor its language is in the
            dictionary.
    """
    subtags = locale.split('_')
    if len(subtags) == 3:
        # The locale already names its script.
        return subtags[1]
    lookup_key = locale if locale in likely_script_dict else subtags[0]
    return likely_script_dict[lookup_key]
218
219
def dump_parent_data(script_organized_dict):
    """Dump the per-script parent maps and the SCRIPT_PARENTS index.

    Args:
        script_organized_dict: Maps a four-character script code to a dict
            that maps each locale written in that script to its parent
            locale.
    """
    # Single-argument parenthesised print works identically on Python 2
    # and Python 3; print('') replaces a bare `print`.
    sorted_scripts = sorted(script_organized_dict.keys())
    print('')
    for script in sorted_scripts:
        parent_dict = script_organized_dict[script]
        print('const std::unordered_map<uint32_t, uint32_t> %s_PARENTS({'
              % escape_script_variable_name(script.upper()))
        for locale in sorted(parent_dict.keys()):
            parent = parent_dict[locale]
            print('    {0x%08Xu, 0x%08Xu}, // %s -> %s' % (
                pack_to_uint32(locale),
                pack_to_uint32(parent),
                locale.replace('_', '-'),
                parent.replace('_', '-')))
        print('});')
        print('')

    # Index table tying each script code to the map emitted for it above.
    print('const struct {')
    print('    const char script[4];')
    print('    const std::unordered_map<uint32_t, uint32_t>* map;')
    print('} SCRIPT_PARENTS[] = {')
    for script in sorted_scripts:
        print("    {{'%c', '%c', '%c', '%c'}, &%s_PARENTS}," % (
            script[0], script[1], script[2], script[3],
            escape_script_variable_name(script.upper())))
    print('};')
247
248
def dump_parent_tree_depth(parent_dict):
    """Find and dump the depth of the parent tree.

    The depth of a locale is the number of locales on its parent chain,
    counting the locale itself and the final ancestor that has no entry
    in parent_dict.

    Args:
        parent_dict: Maps a locale name to the name of its parent locale.

    Raises:
        ValueError: If the parent chain of some locale loops back on
            itself.
    """
    # A loop-free chain can follow each entry at most once, so it can
    # never be longer than this.
    longest_possible = len(parent_dict) + 1
    max_depth = 1
    for locale in parent_dict:
        depth = 1
        while locale in parent_dict:
            locale = parent_dict[locale]
            depth += 1
            if depth > longest_possible:
                # Without this check a cycle would spin forever.
                raise ValueError(
                    'cycle in locale parent data at %r' % (locale,))
        max_depth = max(max_depth, depth)
    assert max_depth < 5 # Our algorithms assume small max_depth
    # Single-argument parenthesised print works identically on Python 2
    # and Python 3; print('') replaces a bare `print`.
    print('')
    print('const size_t MAX_PARENT_DEPTH = %d;' % max_depth)
261
262
def read_and_dump_parent_data(icu_data_dir, likely_script_dict):
    """Read locale parent data from ICU and dump it grouped by script.

    Args:
        icu_data_dir: Root of the ICU data directory.
        likely_script_dict: Maps locale or language names to scripts; used
            to decide which script group each locale belongs to.
    """
    parent_dict = read_parent_data(icu_data_dir)
    parents_by_script = collections.defaultdict(dict)
    for locale, parent in parent_dict.items():
        # Locales whose parent is 'root' are left out of the per-script
        # maps.
        if parent != 'root':
            script = get_likely_script(locale, likely_script_dict)
            parents_by_script[script][locale] = parent
    dump_parent_data(parents_by_script)
    dump_parent_tree_depth(parent_dict)
275
276
def main():
    """Read the data files from ICU and dump the output to a C++ file.

    The first command-line argument is the root of the source tree; the
    ICU data is expected under external/icu/icu4c/source/data below it.
    The generated C++ is written to standard output.
    """
    if len(sys.argv) < 2:
        # Fail with a usage message instead of a bare IndexError.
        sys.exit('Usage: %s <source-root>' % sys.argv[0])
    source_root = sys.argv[1]
    icu_data_dir = os.path.join(
        source_root,
        'external', 'icu', 'icu4c', 'source', 'data')

    # Single-argument parenthesised print works identically on Python 2
    # and Python 3; print('') replaces a bare `print`.
    print('// Auto-generated by %s' % sys.argv[0])
    print('')
    likely_script_dict = read_and_dump_likely_data(icu_data_dir)
    read_and_dump_parent_data(icu_data_dir, likely_script_dict)


if __name__ == '__main__':
    main()
292