1#!/usr/bin/env python 2 3# Copyright (C) 2015 The Android Open Source Project 4# 5# Licensed under the Apache License, Version 2.0 (the 'License'); 6# you may not use this file except in compliance with the License. 7# You may obtain a copy of the License at 8# 9# http://www.apache.org/licenses/LICENSE-2.0 10# 11# Unless required by applicable law or agreed to in writing, software 12# distributed under the License is distributed on an 'AS IS' BASIS, 13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14# See the License for the specific language governing permissions and 15# limitations under the License. 16 17""" 18Convert hyphen files in standard TeX format (a trio of pat, chr, and hyp) 19into binary format. See doc/hyb_file_format.md for more information. 20 21Usage: mk_hyb_file.py [-v] hyph-foo.pat.txt hyph-foo.hyb 22 23Optional -v parameter turns on verbose debugging. 24 25""" 26 27from __future__ import print_function 28 29import io 30import sys 31import struct 32import math 33import getopt 34 35 36VERBOSE = False 37 38# U+00DF is LATIN SMALL LETTER SHARP S 39# U+1E9E is LATIN CAPITAL LETTER SHARP S 40SHARP_S_TO_DOUBLE = u'\u00dfSS' 41SHARP_S_TO_CAPITAL = u'\u00df\u1e9e' 42 43if sys.version_info[0] >= 3: 44 def unichr(x): 45 return chr(x) 46 47 48# number of bits required to represent numbers up to n inclusive 49def num_bits(n): 50 return 1 + int(math.log(n, 2)) if n > 0 else 0 51 52 53class Node: 54 55 def __init__(self): 56 self.succ = {} 57 self.res = None 58 self.fsm_pat = None 59 self.fail = None 60 61 62# List of free slots, implemented as doubly linked list 63class Freelist: 64 65 def __init__(self): 66 self.first = None 67 self.last = None 68 self.pred = [] 69 self.succ = [] 70 71 def grow(self): 72 this = len(self.pred) 73 self.pred.append(self.last) 74 self.succ.append(None) 75 if self.last is None: 76 self.first = this 77 else: 78 self.succ[self.last] = this 79 self.last = this 80 81 def next(self, cursor): 82 if cursor == 0: 83 cursor = self.first 84 if cursor is None: 85 self.grow() 86 result = self.last 87 else: 88 result = cursor 89 return result, self.succ[result] 90 91 def is_free(self, ix): 92 while ix >= len(self.pred): 93 self.grow() 94 return self.pred[ix] != -1 95 96 def use(self, ix): 97 if self.pred[ix] is None: 98 self.first = self.succ[ix] 99 else: 100 self.succ[self.pred[ix]] = self.succ[ix] 101 if self.succ[ix] is None: 102 self.last = self.pred[ix] 103 else: 104 self.pred[self.succ[ix]] = self.pred[ix] 105 if self.pred[ix] == -1: 106 assert self.pred[ix] != -1, 'double free!' 107 self.pred[ix] = -1 108 109 110def combine(a, b): 111 if a is None: return b 112 if b is None: return a 113 if len(b) < len(a): a, b = b, a 114 res = b[:len(b) - len(a)] 115 for i in range(len(a)): 116 res.append(max(a[i], b[i + len(b) - len(a)])) 117 return res 118 119 120def trim(pattern): 121 for ix in range(len(pattern)): 122 if pattern[ix] != 0: 123 return pattern[ix:] 124 125 126def pat_to_binary(pattern): 127 return b''.join(struct.pack('B', x) for x in pattern) 128 129 130class Hyph: 131 132 def __init__(self): 133 self.root = Node() 134 self.root.str = '<root>' 135 self.node_list = [self.root] 136 137 # Add a pattern (word fragment with numeric codes, such as ".ad4der") 138 def add_pat(self, pat): 139 lastWasLetter = False 140 haveSeenNumber = False 141 result = [] 142 word = '' 143 for c in pat: 144 if c.isdigit(): 145 result.append(int(c)) 146 lastWasLetter = False 147 haveSeenNumber = True 148 else: 149 word += c 150 if lastWasLetter and haveSeenNumber: 151 result.append(0) 152 lastWasLetter = True 153 if lastWasLetter: 154 result.append(0) 155 156 self.add_word_res(word, result) 157 158 # Add an exception (word with hyphens, such as "ta-ble") 159 def add_exception(self, hyph_word): 160 res = [] 161 word = ['.'] 162 need_10 = False 163 for c in hyph_word: 164 if c == '-': 165 res.append(11) 166 need_10 = False 167 else: 168 if need_10: 169 res.append(10) 170 word.append(c) 171 need_10 = True 172 word.append('.') 173 res.append(0) 174 res.append(0) 175 if VERBOSE: 176 print(word, res) 177 self.add_word_res(''.join(word), res) 178 179 def add_word_res(self, word, result): 180 if VERBOSE: 181 print(word, result) 182 183 t = self.root 184 s = '' 185 for c in word: 186 s += c 187 if c not in t.succ: 188 new_node = Node() 189 new_node.str = s 190 self.node_list.append(new_node) 191 t.succ[c] = new_node 192 t = t.succ[c] 193 t.res = result 194 195 def pack(self, node_list, ch_map, use_node=False): 196 size = 0 197 self.node_map = {} 198 nodes = Freelist() 199 edges = Freelist() 200 edge_start = 1 if use_node else 0 201 for node in node_list: 202 succ = sorted([ch_map[c] + edge_start for c in node.succ.keys()]) 203 if len(succ): 204 cursor = 0 205 while True: 206 edge_ix, cursor = edges.next(cursor) 207 ix = edge_ix - succ[0] 208 if (ix >= 0 and nodes.is_free(ix) and 209 all(edges.is_free(ix + s) for s in succ) and 210 ((not use_node) or edges.is_free(ix))): 211 break 212 elif use_node: 213 ix, _ = edges.next(0) 214 nodes.is_free(ix) # actually don't need nodes at all when use_node, 215 # but keep it happy 216 else: 217 ix, _ = nodes.next(0) 218 node.ix = ix 219 self.node_map[ix] = node 220 nodes.use(ix) 221 size = max(size, ix) 222 if use_node: 223 edges.use(ix) 224 for s in succ: 225 edges.use(ix + s) 226 size += max(ch_map.values()) + 1 227 return size 228 229 # return list of nodes in bfs order 230 def bfs(self, ch_map): 231 result = [self.root] 232 ix = 0 233 while ix < len(result): 234 node = result[ix] 235 node.bfs_ix = ix 236 mapped = {} 237 for c, next in node.succ.items(): 238 assert ch_map[c] not in mapped, 'duplicate edge ' + node.str + ' ' + hex(ord(c)) 239 mapped[ch_map[c]] = next 240 for i in sorted(mapped.keys()): 241 result.append(mapped[i]) 242 ix += 1 243 self.bfs_order = result 244 return result 245 246 # suffix compression - convert the trie into an acyclic digraph, merging nodes when 247 # the subtries are identical 248 def dedup(self): 249 uniques = [] 250 dupmap = {} 251 dedup_ix = [0] * len(self.bfs_order) 252 for ix in reversed(range(len(self.bfs_order))): 253 # construct string representation of node 254 node = self.bfs_order[ix] 255 if node.res is None: 256 s = '' 257 else: 258 s = ''.join(str(c) for c in node.res) 259 for c in sorted(node.succ.keys()): 260 succ = node.succ[c] 261 s += ' ' + c + str(dedup_ix[succ.bfs_ix]) 262 if s in dupmap: 263 dedup_ix[ix] = dupmap[s] 264 else: 265 uniques.append(node) 266 dedup_ix[ix] = ix 267 dupmap[s] = dedup_ix[ix] 268 uniques.reverse() 269 if VERBOSE: 270 print(len(uniques), 'unique nodes,', len(self.bfs_order), 'total') 271 return dedup_ix, uniques 272 273 274# load the ".pat" file, which contains patterns such as a1b2c3 275def load(fn): 276 hyph = Hyph() 277 with io.open(fn, encoding='UTF-8') as f: 278 for l in f: 279 pat = l.strip() 280 hyph.add_pat(pat) 281 return hyph 282 283 284# load the ".chr" file, which contains the alphabet and case pairs, eg "aA", "bB" etc. 285def load_chr(fn): 286 ch_map = {'.': 0} 287 with io.open(fn, encoding='UTF-8') as f: 288 for i, l in enumerate(f): 289 l = l.strip() 290 if len(l) > 2: 291 if l == SHARP_S_TO_DOUBLE: 292 # replace with lowercasing from capital letter sharp s 293 l = SHARP_S_TO_CAPITAL 294 else: 295 # lowercase maps to multi-character uppercase sequence, ignore uppercase for now 296 l = l[:1] 297 else: 298 assert len(l) == 2, 'expected 2 chars in chr' 299 for c in l: 300 ch_map[c] = i + 1 301 return ch_map 302 303 304# load exceptions with explicit hyphens 305def load_hyp(hyph, fn): 306 with io.open(fn, encoding='UTF-8') as f: 307 for l in f: 308 hyph.add_exception(l.strip()) 309 310 311def generate_header(alphabet, trie, pattern): 312 alphabet_off = 6 * 4 313 trie_off = alphabet_off + len(alphabet) 314 pattern_off = trie_off + len(trie) 315 file_size = pattern_off + len(pattern) 316 data = [0x62ad7968, 0, alphabet_off, trie_off, pattern_off, file_size] 317 return struct.pack('<6I', *data) 318 319 320def generate_alphabet(ch_map): 321 ch_map = ch_map.copy() 322 del ch_map['.'] 323 min_ch = ord(min(ch_map)) 324 max_ch = ord(max(ch_map)) 325 if max_ch - min_ch < 1024 and max(ch_map.values()) < 256: 326 # generate format 0 327 data = [0] * (max_ch - min_ch + 1) 328 for c, val in ch_map.items(): 329 data[ord(c) - min_ch] = val 330 result = [struct.pack('<3I', 0, min_ch, max_ch + 1)] 331 for b in data: 332 result.append(struct.pack('<B', b)) 333 else: 334 # generate format 1 335 assert max(ch_map.values()) < 2048, 'max number of unique characters exceeded' 336 result = [struct.pack('<2I', 1, len(ch_map))] 337 for c, val in sorted(ch_map.items()): 338 data = (ord(c) << 11) | val 339 result.append(struct.pack('<I', data)) 340 binary = b''.join(result) 341 if len(binary) % 4 != 0: 342 binary += b'\x00' * (4 - len(binary) % 4) 343 return binary 344 345 346# assumes hyph structure has been packed, ie node.ix values have been set 347def generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap): 348 ch_array = [0] * n_trie 349 link_array = [0] * n_trie 350 pat_array = [0] * n_trie 351 link_shift = num_bits(max(ch_map.values())) 352 char_mask = (1 << link_shift) - 1 353 pattern_shift = link_shift + num_bits(n_trie - 1) 354 link_mask = (1 << pattern_shift) - (1 << link_shift) 355 result = [struct.pack('<6I', 0, char_mask, link_shift, link_mask, pattern_shift, n_trie)] 356 357 for node in dedup_nodes: 358 ix = node.ix 359 if node.res is not None: 360 pat_array[ix] = patmap[pat_to_binary(node.res)] 361 for c, next in node.succ.items(): 362 c_num = ch_map[c] 363 link_ix = ix + c_num 364 ch_array[link_ix] = c_num 365 if dedup_ix is None: 366 dedup_next = next 367 else: 368 dedup_next = hyph.bfs_order[dedup_ix[next.bfs_ix]] 369 link_array[link_ix] = dedup_next.ix 370 371 for i in range(n_trie): 372 #print((pat_array[i], link_array[i], ch_array[i])) 373 packed = (pat_array[i] << pattern_shift) | (link_array[i] << link_shift) | ch_array[i] 374 result.append(struct.pack('<I', packed)) 375 return b''.join(result) 376 377 378def generate_pattern(pats): 379 pat_array = [0] 380 patmap = {b'': 0} 381 382 raw_pat_array = [] 383 raw_pat_size = 0 384 raw_patmap = {} 385 386 for pat in pats: 387 if pat is None: 388 continue 389 pat_str = pat_to_binary(pat) 390 if pat_str not in patmap: 391 shift = 0 392 while shift < len(pat) and pat[len(pat) - shift - 1] == 0: 393 shift += 1 394 rawpat = pat_str[:len(pat) - shift] 395 if rawpat not in raw_patmap: 396 raw_patmap[rawpat] = raw_pat_size 397 raw_pat_array.append(rawpat) 398 raw_pat_size += len(rawpat) 399 data = (len(rawpat) << 26) | (shift << 20) | raw_patmap[rawpat] 400 patmap[pat_str] = len(pat_array) 401 pat_array.append(data) 402 data = [0, len(pat_array), 16 + 4 * len(pat_array), raw_pat_size] 403 result = [struct.pack('<4I', *data)] 404 for x in pat_array: 405 result.append(struct.pack('<I', x)) 406 result.extend(raw_pat_array) 407 return patmap, b''.join(result) 408 409 410def generate_hyb_file(hyph, ch_map, hyb_fn): 411 bfs = hyph.bfs(ch_map) 412 dedup_ix, dedup_nodes = hyph.dedup() 413 n_trie = hyph.pack(dedup_nodes, ch_map) 414 alphabet = generate_alphabet(ch_map) 415 patmap, pattern = generate_pattern([n.res for n in hyph.node_list]) 416 trie = generate_trie(hyph, ch_map, n_trie, dedup_ix, dedup_nodes, patmap) 417 header = generate_header(alphabet, trie, pattern) 418 419 with open(hyb_fn, 'wb') as f: 420 f.write(header) 421 f.write(alphabet) 422 f.write(trie) 423 f.write(pattern) 424 425 426# Verify that the file contains the same lines as the lines argument, in arbitrary order 427def verify_file_sorted(lines, fn): 428 file_lines = [l.strip() for l in io.open(fn, encoding='UTF-8')] 429 line_set = set(lines) 430 file_set = set(file_lines) 431 if SHARP_S_TO_DOUBLE in file_set: 432 # ignore difference of double capital letter s and capital letter sharp s 433 file_set.symmetric_difference_update([SHARP_S_TO_DOUBLE, SHARP_S_TO_CAPITAL]) 434 if line_set == file_set: 435 return True 436 for line in line_set - file_set: 437 print(repr(line) + ' in reconstruction, not in file') 438 for line in file_set - line_set: 439 print(repr(line) + ' in file, not in reconstruction') 440 return False 441 442 443def map_to_chr(alphabet_map): 444 result = [] 445 ch_map = {} 446 for val in alphabet_map.values(): 447 chs = [ch for ch in alphabet_map if alphabet_map[ch] == val] 448 # non-cased characters (like Ethopic) are in both, matching chr file 449 lowercase = [ch for ch in chs if not ch.isupper()] 450 uppercase = [ch for ch in chs if not ch.islower()] 451 # print(val, `lowercase`, `uppercase`) 452 assert len(lowercase) == 1, 'expected 1 lowercase character' 453 assert 0 <= len(uppercase) <= 1, 'expected 0 or 1 uppercase character' 454 ch_map[val] = lowercase[0] 455 result.append(''.join(lowercase + uppercase)) 456 ch_map[0] = '.' 457 return (ch_map, result) 458 459 460def get_pattern(pattern_data, ix): 461 pattern_offset = struct.unpack('<I', pattern_data[8:12])[0] 462 entry = struct.unpack('<I', pattern_data[16 + ix * 4: 16 + ix * 4 + 4])[0] 463 pat_len = entry >> 26 464 pat_shift = (entry >> 20) & 0x1f 465 offset = pattern_offset + (entry & 0xfffff) 466 return pattern_data[offset: offset + pat_len] + b'\0' * pat_shift 467 468 469def traverse_trie(ix, s, trie_data, ch_map, pattern_data, patterns, exceptions): 470 (char_mask, link_shift, link_mask, pattern_shift) = struct.unpack('<4I', trie_data[4:20]) 471 node_entry = struct.unpack('<I', trie_data[24 + ix * 4: 24 + ix * 4 + 4])[0] 472 pattern = node_entry >> pattern_shift 473 if pattern: 474 result = [] 475 is_exception = False 476 pat = get_pattern(pattern_data, pattern) 477 for i in range(len(s) + 1): 478 pat_off = i - 1 + len(pat) - len(s) 479 if pat_off < 0: 480 code = 0 481 else: 482 code = struct.unpack('B', pat[pat_off : pat_off + 1])[0] 483 if 1 <= code <= 9: 484 result.append('%d' % code) 485 elif code == 10: 486 is_exception = True 487 elif code == 11: 488 result.append('-') 489 is_exception = True 490 else: 491 assert code == 0, 'unexpected code' 492 if i < len(s): 493 result.append(s[i]) 494 pat_str = ''.join(result) 495 #print(`pat_str`, `pat`) 496 if is_exception: 497 assert pat_str[0] == '.', "expected leading '.'" 498 assert pat_str[-1] == '.', "expected trailing '.'" 499 exceptions.append(pat_str[1:-1]) # strip leading and trailing '.' 500 else: 501 patterns.append(pat_str) 502 for ch in ch_map: 503 edge_entry = struct.unpack('<I', trie_data[24 + (ix + ch) * 4: 24 + (ix + ch) * 4 + 4])[0] 504 link = (edge_entry & link_mask) >> link_shift 505 if link != 0 and ch == (edge_entry & char_mask): 506 sch = s + ch_map[ch] 507 traverse_trie(link, sch, trie_data, ch_map, pattern_data, patterns, exceptions) 508 509 510# Verify the generated binary file by reconstructing the textual representations 511# from the binary hyb file, then checking that they're identical (mod the order of 512# lines within the file, which is irrelevant). This function makes assumptions that 513# are stronger than absolutely necessary (in particular, that the patterns are in 514# lowercase as defined by python islower). 515def verify_hyb_file(hyb_fn, pat_fn, chr_fn, hyp_fn): 516 with open(hyb_fn, 'rb') as f: 517 hyb_data = f.read() 518 header = hyb_data[0: 6 * 4] 519 (magic, version, alphabet_off, trie_off, pattern_off, file_size) = struct.unpack('<6I', header) 520 alphabet_data = hyb_data[alphabet_off:trie_off] 521 trie_data = hyb_data[trie_off:pattern_off] 522 pattern_data = hyb_data[pattern_off:file_size] 523 524 # reconstruct alphabet table 525 alphabet_version = struct.unpack('<I', alphabet_data[:4])[0] 526 alphabet_map = {} 527 if alphabet_version == 0: 528 (min_ch, max_ch) = struct.unpack('<2I', alphabet_data[4:12]) 529 for ch in range(min_ch, max_ch): 530 offset = 12 + ch - min_ch 531 b = struct.unpack('B', alphabet_data[offset : offset + 1])[0] 532 if b != 0: 533 alphabet_map[unichr(ch)] = b 534 else: 535 assert alphabet_version == 1 536 n_entries = struct.unpack('<I', alphabet_data[4:8])[0] 537 for i in range(n_entries): 538 entry = struct.unpack('<I', alphabet_data[8 + 4 * i: 8 + 4 * i + 4])[0] 539 alphabet_map[unichr(entry >> 11)] = entry & 0x7ff 540 541 ch_map, reconstructed_chr = map_to_chr(alphabet_map) 542 543 # EXCEPTION for Armenian (hy), we don't really deal with the uppercase form of U+0587 544 if u'\u0587' in reconstructed_chr: 545 reconstructed_chr.remove(u'\u0587') 546 reconstructed_chr.append(u'\u0587\u0535\u0552') 547 548 assert verify_file_sorted(reconstructed_chr, chr_fn), 'alphabet table not verified' 549 550 # reconstruct trie 551 patterns = [] 552 exceptions = [] 553 traverse_trie(0, '', trie_data, ch_map, pattern_data, patterns, exceptions) 554 555 # EXCEPTION for Bulgarian (bg), which contains an ineffectual line of <0, U+044C, 0> 556 if u'\u044c' in patterns: 557 patterns.remove(u'\u044c') 558 patterns.append(u'0\u044c0') 559 560 assert verify_file_sorted(patterns, pat_fn), 'pattern table not verified' 561 assert verify_file_sorted(exceptions, hyp_fn), 'exception table not verified' 562 563 564def main(): 565 global VERBOSE 566 try: 567 opts, args = getopt.getopt(sys.argv[1:], 'v') 568 except getopt.GetoptError as err: 569 print(str(err)) 570 sys.exit(1) 571 for o, _ in opts: 572 if o == '-v': 573 VERBOSE = True 574 pat_fn, out_fn = args 575 hyph = load(pat_fn) 576 if pat_fn.endswith('.pat.txt'): 577 chr_fn = pat_fn[:-8] + '.chr.txt' 578 ch_map = load_chr(chr_fn) 579 hyp_fn = pat_fn[:-8] + '.hyp.txt' 580 load_hyp(hyph, hyp_fn) 581 generate_hyb_file(hyph, ch_map, out_fn) 582 verify_hyb_file(out_fn, pat_fn, chr_fn, hyp_fn) 583 584if __name__ == '__main__': 585 main() 586