#!/usr/bin/env python

# Helpers shared by small command-line test tools: colorizing diffs
# (DiffColorizer / ColorFormatter), zipping files into a diff (ZipDiffer),
# grouping and judging test cases (DiffHelpers / DiffFilters / DiffSinks /
# Test / Stat / Stats), converting between text and "U+XXXX" notation
# (Unicode), and reading / generating MANIFEST files (Manifest).
#
# The module runs on both Python 2 and Python 3; the try/except blocks
# below paper over the differences between the two.

from __future__ import print_function, division, absolute_import

import sys
import os
import re
import difflib
import unicodedata
import errno
from itertools import *
try:
    import unicodedata2 as unicodedata
except Exception:
    pass

try:
    # Python 3: cgi.escape was removed in 3.8 and the cgi module in 3.13.
    from html import escape as _html_escape

    def _escape_markup(s):
        """Escape '&', '<' and '>' only, matching cgi.escape()'s default."""
        return _html_escape(s, quote=False)
except ImportError:
    # Python 2 has no html.escape; fall back to the historical helper.
    import cgi
    _escape_markup = cgi.escape

try:
    izip_longest = izip_longest
except NameError:
    # Python 3 renamed itertools.izip_longest to zip_longest.
    izip_longest = zip_longest

# One marker character per input file; '-' and '+' are the two sides of a diff.
diff_symbols = "-+=*&^%$#@!~/"
diff_colors = ['red', 'green', 'blue']


def codepoints(s):
    """Return a generator over the integer code point of every character of s."""
    return (ord(u) for u in s)


try:
    unichr = unichr

    if sys.maxunicode < 0x10FFFF:
        # workarounds for Python 2 "narrow" builds with UCS2-only support.

        _narrow_unichr = unichr

        def unichr(i):
            """
            Return the unicode character whose Unicode code is the integer 'i'.
            The valid range is 0 to 0x10FFFF inclusive.

            >>> _narrow_unichr(0xFFFF + 1)
            Traceback (most recent call last):
              File "<stdin>", line 1, in ?
            ValueError: unichr() arg not in range(0x10000) (narrow Python build)
            >>> unichr(0xFFFF + 1) == u'\U00010000'
            True
            >>> unichr(1114111) == u'\U0010FFFF'
            True
            >>> unichr(0x10FFFF + 1)
            Traceback (most recent call last):
              File "<stdin>", line 1, in ?
            ValueError: unichr() arg not in range(0x110000)
            """
            try:
                return _narrow_unichr(i)
            except ValueError:
                try:
                    padded_hex_str = hex(i)[2:].zfill(8)
                    escape_str = "\\U" + padded_hex_str
                    return escape_str.decode("unicode-escape")
                except UnicodeDecodeError:
                    raise ValueError('unichr() arg not in range(0x110000)')

        def codepoints(s):
            """Yield the code points of s, combining surrogate pairs.

            A low surrogate that follows a high surrogate is merged with it
            into one supplementary code point; any unpaired surrogate is
            reported as U+FFFD.
            """
            high_surrogate = None
            for u in s:
                cp = ord(u)
                if 0xDC00 <= cp <= 0xDFFF:
                    if high_surrogate:
                        yield 0x10000 + (high_surrogate - 0xD800) * 0x400 + (cp - 0xDC00)
                        high_surrogate = None
                    else:
                        yield 0xFFFD
                else:
                    if high_surrogate:
                        # Previous high surrogate was never completed.
                        yield 0xFFFD
                        high_surrogate = None
                    if 0xD800 <= cp <= 0xDBFF:
                        high_surrogate = cp
                    else:
                        yield cp
                        high_surrogate = None
            if high_surrogate:
                yield 0xFFFD

except NameError:
    unichr = chr

try:
    unicode = unicode
except NameError:
    unicode = str


def tounicode(s, encoding='ascii', errors='strict'):
    """Return s as a unicode string, decoding it first if it is not one."""
    if not isinstance(s, unicode):
        return s.decode(encoding, errors)
    else:
        return s


class ColorFormatter:
    """Namespace of output formatters.

    Each nested class offers the same four static methods: start_color(c),
    end_color(), escape(s) and newline().
    """

    class Null:
        """Formatter that emits no color markup at all."""

        @staticmethod
        def start_color(c):
            return ''

        @staticmethod
        def end_color():
            return ''

        @staticmethod
        def escape(s):
            return s

        @staticmethod
        def newline():
            return '\n'

    class ANSI:
        """Formatter that emits ANSI terminal escape sequences."""

        @staticmethod
        def start_color(c):
            # Raises KeyError for any color other than the three listed.
            return {
                'red': '\033[41;37;1m',
                'green': '\033[42;37;1m',
                'blue': '\033[44;37;1m',
            }[c]

        @staticmethod
        def end_color():
            return '\033[m'

        @staticmethod
        def escape(s):
            return s

        @staticmethod
        def newline():
            return '\n'

    class HTML:
        """Formatter that emits HTML <span> markup."""

        @staticmethod
        def start_color(c):
            return '<span style="background:%s">' % c

        @staticmethod
        def end_color():
            return '</span>'

        @staticmethod
        def escape(s):
            # Escapes '&', '<' and '>' (quotes are left alone).
            return _escape_markup(s)

        @staticmethod
        def newline():
            return '<br/>\n'

    @staticmethod
    def Auto(argv=[], out=sys.stdout):
        """Pick a formatter class from command-line style flags.

        Recognized flags ("--format", "--format=ansi", "--format=html",
        "--no-format") are REMOVED from argv in place. When several are
        present the one checked last wins; with none, ANSI is returned.
        The 'out' parameter is accepted but not used.
        """
        format = ColorFormatter.ANSI
        if "--format" in argv:
            argv.remove("--format")
            format = ColorFormatter.ANSI
        if "--format=ansi" in argv:
            argv.remove("--format=ansi")
            format = ColorFormatter.ANSI
        if "--format=html" in argv:
            argv.remove("--format=html")
            format = ColorFormatter.HTML
        if "--no-format" in argv:
            argv.remove("--no-format")
            format = ColorFormatter.Null
        return format


class DiffColorizer:
    """Highlight the differing tokens between the two sides of a diff."""

    # Group 1 is a run of word characters, group 2 the single separator
    # character (if any) that follows it.
    diff_regex = re.compile('([a-zA-Z0-9_]*)([^a-zA-Z0-9_]?)')

    def __init__(self, formatter, colors=diff_colors, symbols=diff_symbols):
        self.formatter = formatter
        self.colors = colors
        self.symbols = symbols

    def colorize_lines(self, lines):
        """Return the colorized form of a pair of lines.

        'lines' holds exactly two entries (either may be None/empty). Each is
        split into tokens, the token lists are compared with difflib.Differ,
        and tokens present on only one side are wrapped in that side's color.
        Returns a list with one output line per non-empty side, each prefixed
        with its diff symbol.
        """
        lines = (l if l else '' for l in lines)
        # Put every token on its own line so Differ compares token by token.
        ss = [self.diff_regex.sub(r'\1\n\2\n', l).splitlines(True) for l in lines]
        oo = ["", ""]
        st = [False, False]  # whether a color span is currently open per side
        for l in difflib.Differ().compare(*ss):
            if l[0] == '?':
                # Differ's intraline hint lines carry no content.
                continue
            if l[0] == ' ':
                # Common token: close any open spans, then add it to both sides.
                for i in range(2):
                    if st[i]:
                        oo[i] += self.formatter.end_color()
                        st[i] = False
                oo = [o + self.formatter.escape(l[2:]) for o in oo]
                continue
            if l[0] in self.symbols:
                i = self.symbols.index(l[0])
                if not st[i]:
                    oo[i] += self.formatter.start_color(self.colors[i])
                    st[i] = True
                oo[i] += self.formatter.escape(l[2:])
                continue
        for i in range(2):
            if st[i]:
                oo[i] += self.formatter.end_color()
                st[i] = False
        # Drop the newlines that were inserted to separate the tokens.
        oo = [o.replace('\n', '') for o in oo]
        return [s1 + s2 + self.formatter.newline()
                for (s1, s2) in zip(self.symbols, oo) if s2]

    def colorize_diff(self, f):
        """Yield the lines of diff 'f' with changed tokens highlighted.

        Lines whose first character is not a diff symbol are passed through
        (escaped). Symbol-prefixed lines are collected into a pair and
        colorized together once both sides are present, or once a side
        repeats. Only the first two symbols are supported: a line starting
        with a later symbol raises IndexError.
        """
        lines = [None, None]
        for l in f:
            if l[0] not in self.symbols:
                yield self.formatter.escape(l).replace('\n', self.formatter.newline())
                continue
            i = self.symbols.index(l[0])
            if lines[i]:
                # Flush
                for line in self.colorize_lines(lines):
                    yield line
                lines = [None, None]
            lines[i] = l[1:]
            if all(lines):
                # Flush
                for line in self.colorize_lines(lines):
                    yield line
                lines = [None, None]
        if any(lines):
            # Flush
            for line in self.colorize_lines(lines):
                yield line


class ZipDiffer:
    """Produce a diff by walking several files in lockstep."""

    @staticmethod
    def diff_files(files, symbols=diff_symbols):
        """Write a line-by-line comparison of 'files' to standard output.

        Rows where every file has the same line are written once with a
        leading space; otherwise each file's line is written prefixed with
        that file's symbol. On IOError the process exits with status 1,
        printing a message unless the error is EPIPE.
        """
        files = tuple(files)  # in case it's a generator, copy it
        try:
            for lines in izip_longest(*files):
                if all(lines[0] == line for line in lines[1:]):
                    sys.stdout.writelines([" ", lines[0]])
                    continue

                for i, l in enumerate(lines):
                    if l:
                        sys.stdout.writelines([symbols[i], l])
        except IOError as e:
            if e.errno != errno.EPIPE:
                print("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
            sys.exit(1)


class DiffFilters:

    @staticmethod
    def filter_failures(f):
        """Yield only the lines of diff 'f' that belong to failed test cases."""
        for key, lines in DiffHelpers.separate_test_cases(f):
            lines = list(lines)
            if not DiffHelpers.test_passed(lines):
                for l in lines:
                    yield l


class Stat:
    """Running count and summed frequency of the tests added to it."""

    def __init__(self):
        self.count = 0
        self.freq = 0

    def add(self, test):
        self.count += 1
        self.freq += test.freq


class Stats:
    """Passed / failed / total Stat counters for a set of tests."""

    def __init__(self):
        self.passed = Stat()
        self.failed = Stat()
        self.total = Stat()

    def add(self, test):
        self.total.add(test)
        if test.passed:
            self.passed.add(test)
        else:
            self.failed.add(test)

    def mean(self):
        """Fraction of tests that passed; ZeroDivisionError if none were added."""
        return float(self.passed.count) / self.total.count

    def variance(self):
        """Pass fraction times fail fraction."""
        return (float(self.passed.count) / self.total.count) * \
               (float(self.failed.count) / self.total.count)

    def stddev(self):
        return self.variance() ** .5

    def zscore(self, population):
        """Calculate the standard score.
        Population is the Stats for population.
        Self is Stats for sample.
        Returns larger absolute value if sample is highly unlikely to be random.
        Anything outside of -3..+3 is very unlikely to be random.
        See: http://en.wikipedia.org/wiki/Standard_score"""

        return (self.mean() - population.mean()) / population.stddev()


class DiffSinks:

    @staticmethod
    def print_stat(f):
        """Print how many test cases in diff 'f' passed and failed."""
        passed = 0
        failed = 0
        # XXX port to Stats, but that would really slow us down here
        for key, lines in DiffHelpers.separate_test_cases(f):
            if DiffHelpers.test_passed(lines):
                passed += 1
            else:
                failed += 1
        total = passed + failed
        # Empty input has no failure rate to compute; report 0% instead of
        # dividing by zero.
        percent = 100. * failed / total if total else 0.
        print("%d out of %d tests passed.  %d failed (%g%%)" % (passed, total, failed, percent))


class Test:
    """One test case parsed from its diff lines.

    Each line starts with a diff symbol, optionally followed by
    "identifier: ", then a bracketed payload: "(...)" is stored as text,
    "<...>" is parsed into a list of code points, and "[...]" is stored as
    glyphs. The test counts as passed only if every line starts with a space.
    """

    def __init__(self, lines):
        self.freq = 1
        self.passed = True
        self.identifier = None
        self.text = None
        self.unicodes = None
        self.glyphs = None
        for l in lines:
            symbol = l[0]
            if symbol != ' ':
                self.passed = False
            i = 1
            if ':' in l:
                i = l.index(':')
                if not self.identifier:
                    self.identifier = l[1:i]
                i = i + 2  # Skip colon and space
            # j indexes the closing bracket: the last character, or the one
            # before a trailing newline.
            j = -1
            if l[j] == '\n':
                j -= 1
            brackets = l[i] + l[j]
            # Slice up to the closing bracket itself so a line without a
            # trailing newline does not lose its last payload character.
            l = l[i + 1:j]
            if brackets == '()':
                self.text = l
            elif brackets == '<>':
                self.unicodes = Unicode.parse(l)
            elif brackets == '[]':
                # XXX we don't handle failed tests here
                self.glyphs = l


class DiffHelpers:

    @staticmethod
    def separate_test_cases(f):
        '''Reads lines from f, and if the lines have identifiers, ie.
        have a colon character, groups them by identifier,
        yielding lists of all lines with the same identifier.'''

        def identifier(l):
            if ':' in l[1:]:
                return l[1:l.index(':')]
            return l
        return groupby(f, key=identifier)

    @staticmethod
    def test_passed(lines):
        """Return True if the lines of one test case count as a pass.

        A case passes when every line starts with a space, or when any '+'
        line contains one of the marker substrings below.
        """
        lines = list(lines)
        # XXX This is a hack, but does the job for now.
        markers = ("space+0|space+0", "uni25CC", "dottedcircle",
                   "glyph0", "gid0", "notdef")
        if any(marker in l for l in lines if l[0] == '+' for marker in markers):
            return True
        return all(l[0] == ' ' for l in lines)


class FilterHelpers:

    @staticmethod
    def filter_printer_function(filter_callback):
        """Wrap filter_callback so that its results are print()ed."""
        def printer(f):
            for line in filter_callback(f):
                print(line)
        return printer

    @staticmethod
    def filter_printer_function_no_newline(filter_callback):
        """Wrap filter_callback so that its results are written verbatim."""
        def printer(f):
            for line in filter_callback(f):
                sys.stdout.writelines([line])
        return printer


class Ngram:

    @staticmethod
    def generator(n):
        """Return a function yielding every run of n consecutive items as a tuple.

        The returned function carries n as its 'n' attribute.
        """

        def gen(f):
            l = []
            for x in f:
                l.append(x)
                if len(l) == n:
                    yield tuple(l)
                    l[:1] = []  # slide the window forward by one item

        gen.n = n
        return gen


class UtilMains:
    """Command-line entry-point helpers driven by sys.argv."""

    @staticmethod
    def process_multiple_files(callback, mnemonic="FILE"):
        """Call callback(file) for each file named in sys.argv ('-' is stdin).

        With no arguments, standard input is processed. "--help" prints usage
        and exits with status 1. On IOError the process exits with status 1,
        printing a message unless the error is EPIPE.
        """

        if "--help" in sys.argv:
            print("Usage: %s %s..." % (sys.argv[0], mnemonic))
            sys.exit(1)

        try:
            files = sys.argv[1:] if len(sys.argv) > 1 else ['-']
            for s in files:
                f = FileHelpers.open_file_or_stdin(s)
                try:
                    callback(f)
                finally:
                    # Close what was opened here; stdin is not ours to close.
                    if f is not sys.stdin:
                        f.close()
        except IOError as e:
            if e.errno != errno.EPIPE:
                print("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
            sys.exit(1)

    @staticmethod
    def process_multiple_args(callback, mnemonic):
        """Call callback(arg) for each command-line argument.

        With no arguments, or with "--help", prints usage and exits with
        status 1. IOError is handled as in process_multiple_files.
        """

        if len(sys.argv) == 1 or "--help" in sys.argv:
            print("Usage: %s %s..." % (sys.argv[0], mnemonic))
            sys.exit(1)

        try:
            for s in sys.argv[1:]:
                callback(s)
        except IOError as e:
            if e.errno != errno.EPIPE:
                print("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
            sys.exit(1)

    @staticmethod
    def filter_multiple_strings_or_stdin(callback, mnemonic,
                                         separator=" ",
                                         concat_separator=False):
        """Print callback(x) for each argument, or for each line of stdin.

        With no arguments, every line of standard input (minus its trailing
        newline) is converted and printed on its own line. Otherwise the
        converted arguments are printed on one line joined by 'separator';
        if 'concat_separator' is given, the arguments are first joined with
        it into a single string. "--help" prints usage and exits with
        status 1.
        """

        if "--help" in sys.argv:
            print("Usage:\n %s %s...\nor:\n %s\n\nWhen called with no arguments, input is read from standard input."
                  % (sys.argv[0], mnemonic, sys.argv[0]))
            sys.exit(1)

        try:
            if len(sys.argv) == 1:
                while (1):
                    line = sys.stdin.readline()
                    if not len(line):
                        break
                    if line[-1] == '\n':
                        line = line[:-1]
                    print(callback(line))
            else:
                args = sys.argv[1:]
                if concat_separator != False:
                    args = [concat_separator.join(args)]
                print(separator.join(callback(x) for x in (args)))
        except IOError as e:
            if e.errno != errno.EPIPE:
                print("%s: %s: %s" % (sys.argv[0], e.filename, e.strerror), file=sys.stderr)
            sys.exit(1)


class Unicode:
    """Conversions between text and hexadecimal code point notation."""

    @staticmethod
    def decode(s):
        """Return the code points of s as a comma-separated "U+XXXX" list.

        Byte strings are decoded as UTF-8 first.
        """
        return u','.join("U+%04X" % cp for cp in codepoints(tounicode(s, 'utf-8')))

    @staticmethod
    def parse(s):
        """Return the list of integers for the hexadecimal numbers found in s.

        "0x" prefixes, punctuation and the letters x, u, n, i (either case)
        are treated as separators.
        """
        s = re.sub(r"0[xX]", " ", s)
        s = re.sub(r"[<+>{},;&#\\xXuUnNiI\n\t]", " ", s)
        return [int(x, 16) for x in s.split()]

    @staticmethod
    def encode(s):
        """Return the text spelled by the code points listed in s.

        On Python 2 the result is a UTF-8 encoded byte string.
        """
        s = u''.join(unichr(x) for x in Unicode.parse(s))
        if sys.version_info[0] == 2:
            s = s.encode('utf-8')
        return s

    shorthands = {
        "ZERO WIDTH NON-JOINER": "ZWNJ",
        "ZERO WIDTH JOINER": "ZWJ",
        "NARROW NO-BREAK SPACE": "NNBSP",
        "COMBINING GRAPHEME JOINER": "CGJ",
        "LEFT-TO-RIGHT MARK": "LRM",
        "RIGHT-TO-LEFT MARK": "RLM",
        "LEFT-TO-RIGHT EMBEDDING": "LRE",
        "RIGHT-TO-LEFT EMBEDDING": "RLE",
        "POP DIRECTIONAL FORMATTING": "PDF",
        "LEFT-TO-RIGHT OVERRIDE": "LRO",
        "RIGHT-TO-LEFT OVERRIDE": "RLO",
    }

    @staticmethod
    def pretty_name(u):
        """Return a shortened form of the Unicode name of character u.

        Returns "XXX" when the character has no name.
        """
        try:
            s = unicodedata.name(u)
        except ValueError:
            return "XXX"
        s = re.sub(".* LETTER ", "", s)
        s = re.sub(".* VOWEL SIGN (.*)", r"\1-MATRA", s)
        s = re.sub(".* SIGN ", "", s)
        s = re.sub(".* COMBINING ", "", s)
        if re.match(".* VIRAMA", s):
            s = "HALANT"
        if s in Unicode.shorthands:
            s = Unicode.shorthands[s]
        return s

    @staticmethod
    def pretty_names(s):
        """Return the pretty names of the code points listed in s, joined by " + ".

        On Python 2 the result is a UTF-8 encoded byte string, the same
        convention Unicode.encode follows.
        """
        s = re.sub(r"[<+>\\uU]", " ", s)
        s = re.sub(r"0[xX]", " ", s)
        s = [unichr(int(x, 16)) for x in re.split('[, \n]', s) if len(x)]
        s = u' + '.join(Unicode.pretty_name(x) for x in s)
        # Encode only on Python 2; on Python 3 this would hand back bytes.
        if sys.version_info[0] == 2:
            s = s.encode('utf-8')
        return s


class FileHelpers:

    @staticmethod
    def open_file_or_stdin(f):
        """Return sys.stdin for '-', otherwise the file f opened for reading."""
        if f == '-':
            return sys.stdin
        return open(f)


class Manifest:
    """Reading and generating per-directory MANIFEST files."""

    @staticmethod
    def read(s, strict=True):
        """Yield the file paths reachable from s.

        A plain file yields its own normalized path. A directory is expanded
        through the entries of its MANIFEST file, recursively. When s (or a
        directory's MANIFEST) is missing, the process exits with status 1 if
        'strict' is true; otherwise nothing is yielded for it.
        """

        if not os.path.exists(s):
            if strict:
                print("%s: %s does not exist" % (sys.argv[0], s), file=sys.stderr)
                sys.exit(1)
            return

        s = os.path.normpath(s)

        if os.path.isdir(s):

            try:
                with open(os.path.join(s, "MANIFEST")) as m:
                    items = [x.strip() for x in m.readlines()]
                for f in items:
                    if not f:
                        # A blank entry would resolve to this directory
                        # itself and recurse forever.
                        continue
                    for p in Manifest.read(os.path.join(s, f)):
                        yield p
            except IOError:
                if strict:
                    print("%s: %s does not exist" % (sys.argv[0], os.path.join(s, "MANIFEST")), file=sys.stderr)
                    sys.exit(1)
                return
        else:
            yield s

    @staticmethod
    def update_recursive(s):
        """(Re)write a MANIFEST file in s and in every directory below it.

        Each MANIFEST lists the directory's files, then its subdirectories,
        both sorted, leaving out a fixed set of housekeeping names. Symbolic
        links to directories are followed.
        """

        for dirpath, dirnames, filenames in os.walk(s, followlinks=True):

            for f in ["MANIFEST", "README", "LICENSE", "COPYING", "AUTHORS", "SOURCES", "ChangeLog"]:
                if f in dirnames:
                    # Removing in place also stops os.walk from descending.
                    dirnames.remove(f)
                if f in filenames:
                    filenames.remove(f)
            # Sorting in place also fixes the order in which os.walk descends.
            dirnames.sort()
            filenames.sort()
            ms = os.path.join(dirpath, "MANIFEST")
            print(" GEN %s" % ms)
            with open(ms, "w") as m:
                for f in filenames:
                    print(f, file=m)
                for f in dirnames:
                    print(f, file=m)
            # No explicit recursion: os.walk already visits every
            # subdirectory left in dirnames.


if __name__ == '__main__':
    pass