1#!/usr/bin/env python 2# -*- coding: utf-8 -*- 3# 4# compose-parse.py, version 1.3 5# 6# multifunction script that helps manage the compose sequence table in GTK+ (gtk/gtkimcontextsimple.c) 7# the script produces statistics and information about the whole process, run with --help for more. 8# 9# You may need to switch your python installation to utf-8, if you get 'ascii' codec errors. 10# 11# Complain to Simos Xenitellis (simos@gnome.org, http://simos.info/blog) for this craft. 12 13from re import findall, match, split, sub 14from string import atoi 15from unicodedata import normalize 16from urllib import urlretrieve 17from os.path import isfile, getsize 18from copy import copy 19 20import sys 21import getopt 22 23# We grab files off the web, left and right. 24URL_COMPOSE = 'http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre' 25URL_KEYSYMSTXT = "http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt" 26URL_GDKKEYSYMSH = "http://git.gnome.org/browse/gtk%2B/plain/gdk/gdkkeysyms.h" 27URL_UNICODEDATATXT = 'http://www.unicode.org/Public/5.2.0/ucd/UnicodeData.txt' 28FILENAME_COMPOSE_SUPPLEMENTARY = 'gtk-compose-lookaside.txt' 29 30# We currently support keysyms of size 2; once upstream xorg gets sorted, 31# we might produce some tables with size 2 and some with size 4. 32SIZEOFINT = 2 33 34# Current max compose sequence length; in case it gets increased. 35WIDTHOFCOMPOSETABLE = 5 36 37keysymdatabase = {} 38keysymunicodedatabase = {} 39unicodedatabase = {} 40 41headerfile_start = """/* GTK - The GIMP Tool Kit 42 * Copyright (C) 2007, 2008 GNOME Foundation 43 * 44 * This library is free software; you can redistribute it and/or 45 * modify it under the terms of the GNU Lesser General Public 46 * License as published by the Free Software Foundation; either 47 * version 2 of the License, or (at your option) any later version. 48 * 49 * This library is distributed in the hope that it will be useful, 50 * but WITHOUT ANY WARRANTY; without even the implied warranty of 51 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 52 * Lesser General Public License for more details. 53 * 54 * You should have received a copy of the GNU Lesser General Public 55 * License along with this library; if not, write to the 56 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 57 * Boston, MA 02111-1307, USA. 58 */ 59 60/* 61 * File auto-generated from script found at http://bugzilla.gnome.org/show_bug.cgi?id=321896 62 * using the input files 63 * Input : http://gitweb.freedesktop.org/?p=xorg/lib/libX11.git;a=blob_plain;f=nls/en_US.UTF-8/Compose.pre 64 * Input : http://www.cl.cam.ac.uk/~mgk25/ucs/keysyms.txt 65 * Input : http://www.unicode.org/Public/UNIDATA/UnicodeData.txt 66 * 67 * This table is optimised for space and requires special handling to access the content. 68 * This table is used solely by http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimple.c 69 * 70 * The resulting file is placed at http://svn.gnome.org/viewcvs/gtk%2B/trunk/gtk/gtkimcontextsimpleseqs.h 71 * This file is described in bug report http://bugzilla.gnome.org/show_bug.cgi?id=321896 72 */ 73 74/* 75 * Modified by the GTK+ Team and others 2007, 2008. See the AUTHORS 76 * file for a list of people on the GTK+ Team. See the ChangeLog 77 * files for a list of changes. These files are distributed with 78 * GTK+ at ftp://ftp.gtk.org/pub/gtk/. 79 */ 80 81#ifndef __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ 82#define __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ 83 84/* === These are the original comments of the file; we keep for historical purposes === 85 * 86 * The following table was generated from the X compose tables include with 87 * XFree86 4.0 using a set of Perl scripts. Contact Owen Taylor <otaylor@redhat.com> 88 * to obtain the relevant perl scripts. 89 * 90 * The following compose letter letter sequences confliced 91 * Dstroke/dstroke and ETH/eth; resolved to Dstroke (Croation, Vietnamese, Lappish), over 92 * ETH (Icelandic, Faroese, old English, IPA) [ D- -D d- -d ] 93 * Amacron/amacron and ordfeminine; resolved to ordfeminine [ _A A_ a_ _a ] 94 * Amacron/amacron and Atilde/atilde; resolved to atilde [ -A A- a- -a ] 95 * Omacron/Omacron and masculine; resolved to masculine [ _O O_ o_ _o ] 96 * Omacron/omacron and Otilde/atilde; resolved to otilde [ -O O- o- -o ] 97 * 98 * [ Amacron and Omacron are in Latin-4 (Baltic). ordfeminine and masculine are used for 99 * spanish. atilde and otilde are used at least for Portuguese ] 100 * 101 * at and Aring; resolved to Aring [ AA ] 102 * guillemotleft and caron; resolved to guillemotleft [ << ] 103 * ogonek and cedilla; resolved to cedilla [ ,, ] 104 * 105 * This probably should be resolved by first checking an additional set of compose tables 106 * that depend on the locale or selected input method. 107 */ 108 109static const guint16 gtk_compose_seqs_compact[] = {""" 110 111headerfile_end = """}; 112 113#endif /* __GTK_IM_CONTEXT_SIMPLE_SEQS_H__ */ 114""" 115 116def stringtohex(str): return atoi(str, 16) 117 118def factorial(n): 119 if n <= 1: 120 return 1 121 else: 122 return n * factorial(n-1) 123 124def uniq(*args) : 125 """ Performs a uniq operation on a list or lists """ 126 theInputList = [] 127 for theList in args: 128 theInputList += theList 129 theFinalList = [] 130 for elem in theInputList: 131 if elem not in theFinalList: 132 theFinalList.append(elem) 133 return theFinalList 134 135 136 137def all_permutations(seq): 138 """ Borrowed from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/252178 """ 139 """ Produces all permutations of the items of a list """ 140 if len(seq) <=1: 141 yield seq 142 else: 143 for perm in all_permutations(seq[1:]): 144 for i in range(len(perm)+1): 145 #nb str[0:1] works in both string and list contexts 146 yield perm[:i] + seq[0:1] + perm[i:] 147 148def usage(): 149 print """compose-parse available parameters: 150 -h, --help this craft 151 -s, --statistics show overall statistics (both algorithmic, non-algorithmic) 152 -a, --algorithmic show sequences saved with algorithmic optimisation 153 -g, --gtk show entries that go to GTK+ 154 -u, --unicodedatatxt show compose sequences derived from UnicodeData.txt (from unicode.org) 155 -v, --verbose show verbose output 156 -p, --plane1 show plane1 compose sequences 157 -n, --numeric when used with --gtk, create file with numeric values only 158 -e, --gtk-expanded when used with --gtk, create file that repeats first column; not usable in GTK+ 159 --all-sequences when used with --gtk, create file with entries rejected by default 160 Default is to show statistics. 161 """ 162 163try: 164 opts, args = getopt.getopt(sys.argv[1:], "pvgashune", ["help", "algorithmic", "statistics", "unicodedatatxt", 165 "stats", "gtk", "verbose", "plane1", "numeric", "gtk-expanded", "all-sequences"]) 166except: 167 usage() 168 sys.exit(2) 169 170opt_statistics = False 171opt_algorithmic = False 172opt_gtk = False 173opt_unicodedatatxt = False 174opt_verbose = False 175opt_plane1 = False 176opt_numeric = False 177opt_gtkexpanded = False 178opt_allsequences = False 179 180for o, a in opts: 181 if o in ("-h", "--help"): 182 usage() 183 sys.exit() 184 if o in ("-s", "--statistics"): 185 opt_statistics = True 186 if o in ("-a", "--algorithmic"): 187 opt_algorithmic = True 188 if o in ("-g", "--gtk"): 189 opt_gtk = True 190 if o in ("-u", "--unicodedatatxt"): 191 opt_unicodedatatxt = True 192 if o in ("-v", "--verbose"): 193 opt_verbose = True 194 if o in ("-p", "--plane1"): 195 opt_plane1 = True 196 if o in ("-n", "--numeric"): 197 opt_numeric = True 198 if o in ("-e", "--gtk-expanded"): 199 opt_gtkexpanded = True 200 if o == "--all-sequences": 201 opt_allsequences = True 202 203if not opt_algorithmic and not opt_gtk and not opt_unicodedatatxt: 204 opt_statistics = True 205 206def download_hook(blocks_transferred, block_size, file_size): 207 """ A download hook to provide some feedback when downloading """ 208 if blocks_transferred == 0: 209 if file_size > 0: 210 if opt_verbose: 211 print "Downloading", file_size, "bytes: ", 212 else: 213 if opt_verbose: 214 print "Downloading: ", 215 sys.stdout.write('#') 216 sys.stdout.flush() 217 218 219def download_file(url): 220 """ Downloads a file provided a URL. Returns the filename. """ 221 """ Borks on failure """ 222 localfilename = url.split('/')[-1] 223 if not isfile(localfilename) or getsize(localfilename) <= 0: 224 if opt_verbose: 225 print "Downloading ", url, "..." 226 try: 227 urlretrieve(url, localfilename, download_hook) 228 except IOError, (errno, strerror): 229 print "I/O error(%s): %s" % (errno, strerror) 230 sys.exit(-1) 231 except: 232 print "Unexpected error: ", sys.exc_info()[0] 233 sys.exit(-1) 234 print " done." 235 else: 236 if opt_verbose: 237 print "Using cached file for ", url 238 return localfilename 239 240def process_gdkkeysymsh(): 241 """ Opens the gdkkeysyms.h file from GTK+/gdk/gdkkeysyms.h """ 242 """ Fills up keysymdb with contents """ 243 filename_gdkkeysymsh = download_file(URL_GDKKEYSYMSH) 244 try: 245 gdkkeysymsh = open(filename_gdkkeysymsh, 'r') 246 except IOError, (errno, strerror): 247 print "I/O error(%s): %s" % (errno, strerror) 248 sys.exit(-1) 249 except: 250 print "Unexpected error: ", sys.exc_info()[0] 251 sys.exit(-1) 252 253 """ Parse the gdkkeysyms.h file and place contents in keysymdb """ 254 linenum_gdkkeysymsh = 0 255 keysymdb = {} 256 for line in gdkkeysymsh.readlines(): 257 linenum_gdkkeysymsh += 1 258 line = line.strip() 259 if line == "" or not match('^#define GDK_KEY_', line): 260 continue 261 components = split('\s+', line) 262 if len(components) < 3: 263 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\ 264 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line} 265 print "Was expecting 3 items in the line" 266 sys.exit(-1) 267 if not match('^GDK_KEY_', components[1]): 268 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\ 269 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line} 270 print "Was expecting a keysym starting with GDK_KEY_" 271 sys.exit(-1) 272 if match('^0x[0-9a-fA-F]+$', components[2]): 273 unival = long(components[2][2:], 16) 274 if unival == 0: 275 continue 276 keysymdb[components[1][8:]] = unival 277 else: 278 print "Invalid line %(linenum)d in %(filename)s: %(line)s"\ 279 % {'linenum': linenum_gdkkeysymsh, 'filename': filename_gdkkeysymsh, 'line': line} 280 print "Was expecting a hexadecimal number at the end of the line" 281 sys.exit(-1) 282 gdkkeysymsh.close() 283 284 """ Patch up the keysymdb with some of our own stuff """ 285 286 """ This is for a missing keysym from the currently upstream file """ 287 #keysymdb['dead_stroke'] = 0x338 288 289 """ This is for a missing keysym from the currently upstream file """ 290 ###keysymdb['dead_belowring'] = 0x323 291 ###keysymdb['dead_belowmacron'] = 0x331 292 ###keysymdb['dead_belowcircumflex'] = 0x32d 293 ###keysymdb['dead_belowtilde'] = 0x330 294 ###keysymdb['dead_belowbreve'] = 0x32e 295 ###keysymdb['dead_belowdiaeresis'] = 0x324 296 297 """ This is^Wwas preferential treatment for Greek """ 298 # keysymdb['dead_tilde'] = 0x342 299 """ This is^was preferential treatment for Greek """ 300 #keysymdb['combining_tilde'] = 0x342 301 302 """ Fixing VoidSymbol """ 303 keysymdb['VoidSymbol'] = 0xFFFF 304 305 return keysymdb 306 307def process_keysymstxt(): 308 """ Grabs and opens the keysyms.txt file that Markus Kuhn maintains """ 309 """ This file keeps a record between keysyms <-> unicode chars """ 310 filename_keysymstxt = download_file(URL_KEYSYMSTXT) 311 try: 312 keysymstxt = open(filename_keysymstxt, 'r') 313 except IOError, (errno, strerror): 314 print "I/O error(%s): %s" % (errno, strerror) 315 sys.exit(-1) 316 except: 317 print "Unexpected error: ", sys.exc_info()[0] 318 sys.exit(-1) 319 320 """ Parse the keysyms.txt file and place content in keysymdb """ 321 linenum_keysymstxt = 0 322 keysymdb = {} 323 for line in keysymstxt.readlines(): 324 linenum_keysymstxt += 1 325 line = line.strip() 326 if line == "" or match('^#', line): 327 continue 328 components = split('\s+', line) 329 if len(components) < 5: 330 print "Invalid line %(linenum)d in %(filename)s: %(line)s'"\ 331 % {'linenum': linenum_keysymstxt, 'filename': filename_keysymstxt, 'line': line} 332 print "Was expecting 5 items in the line" 333 sys.exit(-1) 334 if match('^U[0-9a-fA-F]+$', components[1]): 335 unival = long(components[1][1:], 16) 336 if unival == 0: 337 continue 338 keysymdb[components[4]] = unival 339 keysymstxt.close() 340 341 """ Patch up the keysymdb with some of our own stuff """ 342 """ This is for a missing keysym from the currently upstream file """ 343 ###keysymdb['dead_belowring'] = 0x323 344 ###keysymdb['dead_belowmacron'] = 0x331 345 ###keysymdb['dead_belowcircumflex'] = 0x32d 346 ###keysymdb['dead_belowtilde'] = 0x330 347 ###keysymdb['dead_belowbreve'] = 0x32e 348 ###keysymdb['dead_belowdiaeresis'] = 0x324 349 350 """ This is preferential treatment for Greek """ 351 """ => we get more savings if used for Greek """ 352 # keysymdb['dead_tilde'] = 0x342 353 """ This is preferential treatment for Greek """ 354 # keysymdb['combining_tilde'] = 0x342 355 356 """ This is for a missing keysym from Markus Kuhn's db """ 357 keysymdb['dead_stroke'] = 0x338 358 """ This is for a missing keysym from Markus Kuhn's db """ 359 keysymdb['Oslash'] = 0x0d8 360 """ This is for a missing keysym from Markus Kuhn's db """ 361 keysymdb['Ssharp'] = 0x1e9e 362 363 """ This is for a missing (recently added) keysym """ 364 keysymdb['dead_psili'] = 0x313 365 """ This is for a missing (recently added) keysym """ 366 keysymdb['dead_dasia'] = 0x314 367 368 """ Allows to import Multi_key sequences """ 369 keysymdb['Multi_key'] = 0xff20 370 371 keysymdb['zerosubscript'] = 0x2080 372 keysymdb['onesubscript'] = 0x2081 373 keysymdb['twosubscript'] = 0x2082 374 keysymdb['threesubscript'] = 0x2083 375 keysymdb['foursubscript'] = 0x2084 376 keysymdb['fivesubscript'] = 0x2085 377 keysymdb['sixsubscript'] = 0x2086 378 keysymdb['sevensubscript'] = 0x2087 379 keysymdb['eightsubscript'] = 0x2088 380 keysymdb['ninesubscript'] = 0x2089 381 keysymdb['dead_doublegrave'] = 0x030F 382 keysymdb['dead_invertedbreve'] = 0x0311 383 384 return keysymdb 385 386def keysymvalue(keysym, file = "n/a", linenum = 0): 387 """ Extracts a value from the keysym """ 388 """ Find the value of keysym, using the data from keysyms """ 389 """ Use file and linenum to when reporting errors """ 390 if keysym == "": 391 return 0 392 if keysymdatabase.has_key(keysym): 393 return keysymdatabase[keysym] 394 elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]): 395 return atoi(keysym[1:], 16) 396 elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]): 397 return atoi(keysym[2:], 16) 398 else: 399 print 'keysymvalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym } 400 #return -1 401 sys.exit(-1) 402 403def keysymunicodevalue(keysym, file = "n/a", linenum = 0): 404 """ Extracts a value from the keysym """ 405 """ Find the value of keysym, using the data from keysyms """ 406 """ Use file and linenum to when reporting errors """ 407 if keysym == "": 408 return 0 409 if keysymunicodedatabase.has_key(keysym): 410 return keysymunicodedatabase[keysym] 411 elif keysym[0] == 'U' and match('[0-9a-fA-F]+$', keysym[1:]): 412 return atoi(keysym[1:], 16) 413 elif keysym[:2] == '0x' and match('[0-9a-fA-F]+$', keysym[2:]): 414 return atoi(keysym[2:], 16) 415 else: 416 print 'keysymunicodevalue: UNKNOWN{%(keysym)s}' % { "keysym": keysym } 417 sys.exit(-1) 418 419def rename_combining(seq): 420 filtered_sequence = [] 421 for ks in seq: 422 if findall('^combining_', ks): 423 ks = sub('^combining_', 'dead_', ks) 424 if ks == 'dead_double_grave': 425 ks = 'dead_doublegrave' 426 if ks == 'dead_inverted_breve': 427 ks = 'dead_invertedbreve' 428 filtered_sequence.append(ks) 429 return filtered_sequence 430 431 432keysymunicodedatabase = process_keysymstxt() 433keysymdatabase = process_gdkkeysymsh() 434 435""" Grab and open the compose file from upstream """ 436filename_compose = download_file(URL_COMPOSE) 437try: 438 composefile = open(filename_compose, 'r') 439except IOError, (errno, strerror): 440 print "I/O error(%s): %s" % (errno, strerror) 441 sys.exit(-1) 442except: 443 print "Unexpected error: ", sys.exc_info()[0] 444 sys.exit(-1) 445 446""" Look if there is a lookaside (supplementary) compose file in the current 447 directory, and if so, open, then merge with upstream Compose file. 448""" 449xorg_compose_sequences_raw = [] 450for seq in composefile.readlines(): 451 xorg_compose_sequences_raw.append(seq) 452 453try: 454 composefile_lookaside = open(FILENAME_COMPOSE_SUPPLEMENTARY, 'r') 455 for seq in composefile_lookaside.readlines(): 456 xorg_compose_sequences_raw.append(seq) 457except IOError, (errno, strerror): 458 if opt_verbose: 459 print "I/O error(%s): %s" % (errno, strerror) 460 print "Did not find lookaside compose file. Continuing..." 461except: 462 print "Unexpected error: ", sys.exc_info()[0] 463 sys.exit(-1) 464 465""" Parse the compose file in xorg_compose_sequences""" 466xorg_compose_sequences = [] 467xorg_compose_sequences_algorithmic = [] 468linenum_compose = 0 469comment_nest_depth = 0 470for line in xorg_compose_sequences_raw: 471 linenum_compose += 1 472 line = line.strip() 473 if match("^XCOMM", line) or match("^#", line): 474 continue 475 476 line = sub(r"\/\*([^\*]*|[\*][^/])\*\/", "", line) 477 478 comment_start = line.find("/*") 479 480 if comment_start >= 0: 481 if comment_nest_depth == 0: 482 line = line[:comment_start] 483 else: 484 line = "" 485 486 comment_nest_depth += 1 487 else: 488 comment_end = line.find("*/") 489 490 if comment_end >= 0: 491 comment_nest_depth -= 1 492 493 if comment_nest_depth < 0: 494 print "Invalid comment %(linenum_compose)d in %(filename)s: \ 495 Closing '*/' without opening '/*'" % { "linenum_compose": linenum_compose, "filename": filename_compose } 496 exit(-1) 497 498 if comment_nest_depth > 0: 499 line = "" 500 else: 501 line = line[comment_end + 2:] 502 503 if line is "": 504 continue 505 506 #line = line[:-1] 507 components = split(':', line) 508 if len(components) != 2: 509 print "Invalid line %(linenum_compose)d in %(filename)s: No sequence\ 510 /value pair found" % { "linenum_compose": linenum_compose, "filename": filename_compose } 511 exit(-1) 512 (seq, val ) = split(':', line) 513 seq = seq.strip() 514 val = val.strip() 515 raw_sequence = findall('\w+', seq) 516 values = split('\s+', val) 517 unichar_temp = split('"', values[0]) 518 unichar = unichar_temp[1] 519 if len(values) == 1: 520 continue 521 codepointstr = values[1] 522 if values[1] == '#': 523 # No codepoints that are >1 characters yet. 524 continue 525 if raw_sequence[0][0] == 'U' and match('[0-9a-fA-F]+$', raw_sequence[0][1:]): 526 raw_sequence[0] = '0x' + raw_sequence[0][1:] 527 if match('^U[0-9a-fA-F]+$', codepointstr): 528 codepoint = long(codepointstr[1:], 16) 529 elif keysymunicodedatabase.has_key(codepointstr): 530 #if keysymdatabase[codepointstr] != keysymunicodedatabase[codepointstr]: 531 #print "DIFFERENCE: 0x%(a)X 0x%(b)X" % { "a": keysymdatabase[codepointstr], "b": keysymunicodedatabase[codepointstr]}, 532 #print raw_sequence, codepointstr 533 codepoint = keysymunicodedatabase[codepointstr] 534 else: 535 print 536 print "Invalid codepoint at line %(linenum_compose)d in %(filename)s:\ 537 %(line)s" % { "linenum_compose": linenum_compose, "filename": filename_compose, "line": line } 538 exit(-1) 539 sequence = rename_combining(raw_sequence) 540 reject_this = False 541 for i in sequence: 542 if keysymvalue(i) > 0xFFFF: 543 reject_this = True 544 if opt_plane1: 545 print sequence 546 break 547 if keysymvalue(i) < 0: 548 reject_this = True 549 break 550 if reject_this: 551 continue 552 if "U0342" in sequence or \ 553 "U0313" in sequence or \ 554 "U0314" in sequence or \ 555 "0x0313" in sequence or \ 556 "0x0342" in sequence or \ 557 "0x0314" in sequence: 558 continue 559 if "dead_belowring" in sequence or\ 560 "dead_currency" in sequence or\ 561 "dead_belowcomma" in sequence or\ 562 "dead_belowmacron" in sequence or\ 563 "dead_belowtilde" in sequence or\ 564 "dead_belowbreve" in sequence or\ 565 "dead_belowdiaeresis" in sequence or\ 566 "dead_belowcircumflex" in sequence: 567 continue 568 #for i in range(len(sequence)): 569 # if sequence[i] == "0x0342": 570 # sequence[i] = "dead_tilde" 571 if "Multi_key" not in sequence: 572 """ Ignore for now >0xFFFF keysyms """ 573 if codepoint < 0xFFFF: 574 original_sequence = copy(sequence) 575 stats_sequence = copy(sequence) 576 base = sequence.pop() 577 basechar = keysymvalue(base, filename_compose, linenum_compose) 578 579 if basechar < 0xFFFF: 580 counter = 1 581 unisequence = [] 582 not_normalised = True 583 skipping_this = False 584 for i in range(0, len(sequence)): 585 """ If the sequence has dead_tilde and is for Greek, we don't do algorithmically 586 because of lack of dead_perispomeni (i.e. conflict) 587 """ 588 bc = basechar 589 """if sequence[-1] == "dead_tilde" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff): 590 skipping_this = True 591 break 592 if sequence[-1] == "dead_horn" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff): 593 skipping_this = True 594 break 595 if sequence[-1] == "dead_ogonek" and (bc >= 0x370 and bc <= 0x3ff) or (bc >= 0x1f00 and bc <= 0x1fff): 596 skipping_this = True 597 break 598 if sequence[-1] == "dead_psili": 599 sequence[i] = "dead_horn" 600 if sequence[-1] == "dead_dasia": 601 sequence[-1] = "dead_ogonek" 602 """ 603 unisequence.append(unichr(keysymunicodevalue(sequence.pop(), filename_compose, linenum_compose))) 604 605 if skipping_this: 606 unisequence = [] 607 for perm in all_permutations(unisequence): 608 # print counter, original_sequence, unichr(basechar) + "".join(perm) 609 # print counter, map(unichr, perm) 610 normalized = normalize('NFC', unichr(basechar) + "".join(perm)) 611 if len(normalized) == 1: 612 # print 'Base: %(base)s [%(basechar)s], produces [%(unichar)s] (0x%(codepoint)04X)' \ 613 # % { "base": base, "basechar": unichr(basechar), "unichar": unichar, "codepoint": codepoint }, 614 # print "Normalized: [%(normalized)s] SUCCESS %(c)d" % { "normalized": normalized, "c": counter } 615 stats_sequence_data = map(keysymunicodevalue, stats_sequence) 616 stats_sequence_data.append(normalized) 617 xorg_compose_sequences_algorithmic.append(stats_sequence_data) 618 not_normalised = False 619 break; 620 counter += 1 621 if not_normalised or opt_allsequences: 622 original_sequence.append(codepoint) 623 xorg_compose_sequences.append(original_sequence) 624 """ print xorg_compose_sequences[-1] """ 625 626 else: 627 print "Error in base char !?!" 628 exit(-2) 629 else: 630 print "OVER", sequence 631 exit(-1) 632 else: 633 sequence.append(codepoint) 634 xorg_compose_sequences.append(sequence) 635 """ print xorg_compose_sequences[-1] """ 636 637def sequence_cmp(x, y): 638 if keysymvalue(x[0]) > keysymvalue(y[0]): 639 return 1 640 elif keysymvalue(x[0]) < keysymvalue(y[0]): 641 return -1 642 elif len(x) > len(y): 643 return 1 644 elif len(x) < len(y): 645 return -1 646 elif keysymvalue(x[1]) > keysymvalue(y[1]): 647 return 1 648 elif keysymvalue(x[1]) < keysymvalue(y[1]): 649 return -1 650 elif len(x) < 4: 651 return 0 652 elif keysymvalue(x[2]) > keysymvalue(y[2]): 653 return 1 654 elif keysymvalue(x[2]) < keysymvalue(y[2]): 655 return -1 656 elif len(x) < 5: 657 return 0 658 elif keysymvalue(x[3]) > keysymvalue(y[3]): 659 return 1 660 elif keysymvalue(x[3]) < keysymvalue(y[3]): 661 return -1 662 elif len(x) < 6: 663 return 0 664 elif keysymvalue(x[4]) > keysymvalue(y[4]): 665 return 1 666 elif keysymvalue(x[4]) < keysymvalue(y[4]): 667 return -1 668 else: 669 return 0 670 671def sequence_unicode_cmp(x, y): 672 if keysymunicodevalue(x[0]) > keysymunicodevalue(y[0]): 673 return 1 674 elif keysymunicodevalue(x[0]) < keysymunicodevalue(y[0]): 675 return -1 676 elif len(x) > len(y): 677 return 1 678 elif len(x) < len(y): 679 return -1 680 elif keysymunicodevalue(x[1]) > keysymunicodevalue(y[1]): 681 return 1 682 elif keysymunicodevalue(x[1]) < keysymunicodevalue(y[1]): 683 return -1 684 elif len(x) < 4: 685 return 0 686 elif keysymunicodevalue(x[2]) > keysymunicodevalue(y[2]): 687 return 1 688 elif keysymunicodevalue(x[2]) < keysymunicodevalue(y[2]): 689 return -1 690 elif len(x) < 5: 691 return 0 692 elif keysymunicodevalue(x[3]) > keysymunicodevalue(y[3]): 693 return 1 694 elif keysymunicodevalue(x[3]) < keysymunicodevalue(y[3]): 695 return -1 696 elif len(x) < 6: 697 return 0 698 elif keysymunicodevalue(x[4]) > keysymunicodevalue(y[4]): 699 return 1 700 elif keysymunicodevalue(x[4]) < keysymunicodevalue(y[4]): 701 return -1 702 else: 703 return 0 704 705def sequence_algorithmic_cmp(x, y): 706 if len(x) < len(y): 707 return -1 708 elif len(x) > len(y): 709 return 1 710 else: 711 for i in range(len(x)): 712 if x[i] < y[i]: 713 return -1 714 elif x[i] > y[i]: 715 return 1 716 return 0 717 718 719xorg_compose_sequences.sort(sequence_cmp) 720 721xorg_compose_sequences_uniqued = [] 722first_time = True 723item = None 724for next_item in xorg_compose_sequences: 725 if first_time: 726 first_time = False 727 item = next_item 728 if sequence_unicode_cmp(item, next_item) != 0: 729 xorg_compose_sequences_uniqued.append(item) 730 item = next_item 731 732xorg_compose_sequences = copy(xorg_compose_sequences_uniqued) 733 734counter_multikey = 0 735for item in xorg_compose_sequences: 736 if findall('Multi_key', "".join(item[:-1])) != []: 737 counter_multikey += 1 738 739xorg_compose_sequences_algorithmic.sort(sequence_algorithmic_cmp) 740xorg_compose_sequences_algorithmic_uniqued = uniq(xorg_compose_sequences_algorithmic) 741 742firstitem = "" 743num_first_keysyms = 0 744zeroes = 0 745num_entries = 0 746num_algorithmic_greek = 0 747for sequence in xorg_compose_sequences: 748 if keysymvalue(firstitem) != keysymvalue(sequence[0]): 749 firstitem = sequence[0] 750 num_first_keysyms += 1 751 zeroes += 6 - len(sequence) + 1 752 num_entries += 1 753 754for sequence in xorg_compose_sequences_algorithmic_uniqued: 755 ch = ord(sequence[-1:][0]) 756 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff: 757 num_algorithmic_greek += 1 758 759 760if opt_algorithmic: 761 for sequence in xorg_compose_sequences_algorithmic_uniqued: 762 letter = "".join(sequence[-1:]) 763 print '0x%(cp)04X, %(uni)s, seq: [ <0x%(base)04X>,' % { 'cp': ord(unicode(letter)), 'uni': letter.encode('utf-8'), 'base': sequence[-2] }, 764 for elem in sequence[:-2]: 765 print "<0x%(keysym)04X>," % { 'keysym': elem }, 766 """ Yeah, verified... We just want to keep the output similar to -u, so we can compare/sort easily """ 767 print "], recomposed as", letter.encode('utf-8'), "verified" 768 769def num_of_keysyms(seq): 770 return len(seq) - 1 771 772def convert_UnotationToHex(arg): 773 if isinstance(arg, str): 774 if match('^U[0-9A-F][0-9A-F][0-9A-F][0-9A-F]$', arg): 775 return sub('^U', '0x', arg) 776 return arg 777 778def addprefix_GDK(arg): 779 if match('^0x', arg): 780 return '%(arg)s, ' % { 'arg': arg } 781 else: 782 return 'GDK_KEY_%(arg)s, ' % { 'arg': arg } 783 784if opt_gtk: 785 first_keysym = "" 786 sequence = [] 787 compose_table = [] 788 ct_second_part = [] 789 ct_sequence_width = 2 790 start_offset = num_first_keysyms * (WIDTHOFCOMPOSETABLE+1) 791 we_finished = False 792 counter = 0 793 794 sequence_iterator = iter(xorg_compose_sequences) 795 sequence = sequence_iterator.next() 796 while True: 797 first_keysym = sequence[0] # Set the first keysym 798 compose_table.append([first_keysym, 0, 0, 0, 0, 0]) 799 while sequence[0] == first_keysym: 800 compose_table[counter][num_of_keysyms(sequence)-1] += 1 801 try: 802 sequence = sequence_iterator.next() 803 except StopIteration: 804 we_finished = True 805 break 806 if we_finished: 807 break 808 counter += 1 809 810 ct_index = start_offset 811 for line_num in range(len(compose_table)): 812 for i in range(WIDTHOFCOMPOSETABLE): 813 occurences = compose_table[line_num][i+1] 814 compose_table[line_num][i+1] = ct_index 815 ct_index += occurences * (i+2) 816 817 for sequence in xorg_compose_sequences: 818 ct_second_part.append(map(convert_UnotationToHex, sequence)) 819 820 print headerfile_start 821 for i in compose_table: 822 if opt_gtkexpanded: 823 print "0x%(ks)04X," % { "ks": keysymvalue(i[0]) }, 824 print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i[1:])) } 825 elif not match('^0x', i[0]): 826 print 'GDK_KEY_%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) } 827 else: 828 print '%(str)s' % { 'str': "".join(map(lambda x : str(x) + ", ", i)) } 829 for i in ct_second_part: 830 if opt_numeric: 831 for ks in i[1:][:-1]: 832 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) }, 833 print '0x%(cp)04X, ' % { 'cp':i[-1] } 834 """ 835 for ks in i[:-1]: 836 print '0x%(seq)04X, ' % { 'seq': keysymvalue(ks) }, 837 print '0x%(cp)04X, ' % { 'cp':i[-1] } 838 """ 839 elif opt_gtkexpanded: 840 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1])), 'cp':i[-1] } 841 else: 842 print '%(seq)s0x%(cp)04X, ' % { 'seq': "".join(map(addprefix_GDK, i[:-1][1:])), 'cp':i[-1] } 843 print headerfile_end 844 845def redecompose(codepoint): 846 (name, decomposition, combiningclass) = unicodedatabase[codepoint] 847 if decomposition[0] == '' or decomposition[0] == '0': 848 return [codepoint] 849 if match('<\w+>', decomposition[0]): 850 numdecomposition = map(stringtohex, decomposition[1:]) 851 return map(redecompose, numdecomposition) 852 numdecomposition = map(stringtohex, decomposition) 853 return map(redecompose, numdecomposition) 854 855def process_unicodedata_file(verbose = False): 856 """ Grab from wget http://www.unicode.org/Public/UNIDATA/UnicodeData.txt """ 857 filename_unicodedatatxt = download_file(URL_UNICODEDATATXT) 858 try: 859 unicodedatatxt = open(filename_unicodedatatxt, 'r') 860 except IOError, (errno, strerror): 861 print "I/O error(%s): %s" % (errno, strerror) 862 sys.exit(-1) 863 except: 864 print "Unexpected error: ", sys.exc_info()[0] 865 sys.exit(-1) 866 for line in unicodedatatxt.readlines(): 867 if line[0] == "" or line[0] == '#': 868 continue 869 line = line[:-1] 870 uniproperties = split(';', line) 871 codepoint = stringtohex(uniproperties[0]) 872 """ We don't do Plane 1 or CJK blocks. The latter require reading additional files. """ 873 if codepoint > 0xFFFF or (codepoint >= 0x4E00 and codepoint <= 0x9FFF) or (codepoint >= 0xF900 and codepoint <= 0xFAFF): 874 continue 875 name = uniproperties[1] 876 category = uniproperties[2] 877 combiningclass = uniproperties[3] 878 decomposition = uniproperties[5] 879 unicodedatabase[codepoint] = [name, split('\s+', decomposition), combiningclass] 880 881 counter_combinations = 0 882 counter_combinations_greek = 0 883 counter_entries = 0 884 counter_entries_greek = 0 885 886 for item in unicodedatabase.keys(): 887 (name, decomposition, combiningclass) = unicodedatabase[item] 888 if decomposition[0] == '': 889 continue 890 print name, "is empty" 891 elif match('<\w+>', decomposition[0]): 892 continue 893 print name, "has weird", decomposition[0] 894 else: 895 sequence = map(stringtohex, decomposition) 896 chrsequence = map(unichr, sequence) 897 normalized = normalize('NFC', "".join(chrsequence)) 898 899 """ print name, sequence, "Combining: ", "".join(chrsequence), normalized, len(normalized), """ 900 decomposedsequence = [] 901 for subseq in map(redecompose, sequence): 902 for seqitem in subseq: 903 if isinstance(seqitem, list): 904 for i in seqitem: 905 if isinstance(i, list): 906 for j in i: 907 decomposedsequence.append(j) 908 else: 909 decomposedsequence.append(i) 910 else: 911 decomposedsequence.append(seqitem) 912 recomposedchar = normalize('NFC', "".join(map(unichr, decomposedsequence))) 913 if len(recomposedchar) == 1 and len(decomposedsequence) > 1: 914 counter_entries += 1 915 counter_combinations += factorial(len(decomposedsequence)-1) 916 ch = item 917 if ch >= 0x370 and ch <= 0x3ff or ch >= 0x1f00 and ch <= 0x1fff: 918 counter_entries_greek += 1 919 counter_combinations_greek += factorial(len(decomposedsequence)-1) 920 if verbose: 921 print "0x%(cp)04X, %(uni)c, seq:" % { 'cp':item, 'uni':unichr(item) }, 922 print "[", 923 for elem in decomposedsequence: 924 print '<0x%(hex)04X>,' % { 'hex': elem }, 925 print "], recomposed as", recomposedchar, 926 if unichr(item) == recomposedchar: 927 print "verified" 928 929 if verbose == False: 930 print "Unicode statistics from UnicodeData.txt" 931 print "Number of entries that can be algorithmically produced :", counter_entries 932 print " of which are for Greek :", counter_entries_greek 933 print "Number of compose sequence combinations requiring :", counter_combinations 934 print " of which are for Greek :", counter_combinations_greek 935 print "Note: We do not include partial compositions, " 936 print "thus the slight discrepancy in the figures" 937 print 938 939if opt_unicodedatatxt: 940 process_unicodedata_file(True) 941 942if opt_statistics: 943 print 944 print "Total number of compose sequences (from file) :", len(xorg_compose_sequences) + len(xorg_compose_sequences_algorithmic) 945 print " of which can be expressed algorithmically :", len(xorg_compose_sequences_algorithmic) 946 print " of which cannot be expressed algorithmically :", len(xorg_compose_sequences) 947 print " of which have Multi_key :", counter_multikey 948 print 949 print "Algorithmic (stats for Xorg Compose file)" 950 print "Number of sequences off due to algo from file (len(array)) :", len(xorg_compose_sequences_algorithmic) 951 print "Number of sequences off due to algo (uniq(sort(array))) :", len(xorg_compose_sequences_algorithmic_uniqued) 952 print " of which are for Greek :", num_algorithmic_greek 953 print 954 process_unicodedata_file() 955 print "Not algorithmic (stats from Xorg Compose file)" 956 print "Number of sequences :", len(xorg_compose_sequences) 957 print "Flat array looks like :", len(xorg_compose_sequences), "rows of 6 integers (2 bytes per int, or 12 bytes per row)" 958 print "Flat array would have taken up (in bytes) :", num_entries * 2 * 6, "bytes from the GTK+ library" 959 print "Number of items in flat array :", len(xorg_compose_sequences) * 6 960 print " of which are zeroes :", zeroes, "or ", (100 * zeroes) / (len(xorg_compose_sequences) * 6), " per cent" 961 print "Number of different first items :", num_first_keysyms 962 print "Number of max bytes (if using flat array) :", num_entries * 2 * 6 963 print "Number of savings :", zeroes * 2 - num_first_keysyms * 2 * 5 964 print 965 print "Memory needs if both algorithmic+optimised table in latest Xorg compose file" 966 print " :", num_entries * 2 * 6 - zeroes * 2 + num_first_keysyms * 2 * 5 967 print 968 print "Existing (old) implementation in GTK+" 969 print "Number of sequences in old gtkimcontextsimple.c :", 691 970 print "The existing (old) implementation in GTK+ takes up :", 691 * 2 * 12, "bytes" 971