1#!python3 2"""Program to dump contents of Brotli compressed files showing the compression format. 3Jurjen N.E. Bos, 2016. 4I found the following issues with the Brotli format: 5- The distance alphabet has size 16+(48<<POSTFIX), 6 but the last symbols are useless. 7 It could be lowered to 16+(44-POSTFIX<<POSTFIX), and this could matter. 8- The block type code is useless if NBLTYPES==2, you would only need 1 symbol 9 anyway, so why don't you just switch to "the other" type? 10""" 11import struct 12from operator import itemgetter, methodcaller 13from itertools import accumulate, repeat 14from collections import defaultdict, deque 15from functools import partial 16 17class InvalidStream(Exception): pass 18#lookup table 19L, I, D = "literal", "insert©", "distance" 20pL, pI, pD = 'P'+L, 'P'+I, 'P'+D 21 22def outputCharFormatter(c): 23 """Show character in readable format 24 """ 25 #TODO 2: allow hex only output 26 if 32<c<127: return chr(c) 27 elif c==10: return '\\n' 28 elif c==13: return '\\r' 29 elif c==32: return '" "' 30 else: return '\\x{:02x}'.format(c) 31 32def outputFormatter(s): 33 """Show string or char. 34 """ 35 result = '' 36 def formatSubString(s): 37 for c in s: 38 if c==32: yield ' ' 39 else: yield outputCharFormatter(c) 40 if len(result)<200: return ''.join(formatSubString(s)) 41 else: 42 return ''.join(formatSubString(s[:100]))+'...'+ \ 43 ''.join(formatSubString(s[-100:])) 44 45 46class BitStream: 47 """Represent a bytes object. Can read bits and prefix codes the way 48 Brotli does. 49 """ 50 def __init__(self, byteString): 51 self.data = byteString 52 #position in bits: byte pos is pos>>3, bit pos is pos&7 53 self.pos = 0 54 55 def __repr__(self): 56 """Representation 57 >>> olleke 58 BitStream(pos=0:0) 59 """ 60 return "BitStream(pos={:x}:{})".format(self.pos>>3, self.pos&7) 61 62 def read(self, n): 63 """Read n bits from the stream and return as an integer. 64 Produces zero bits beyond the stream. 65 >>> olleke.data[0]==27 66 True 67 >>> olleke.read(5) 68 27 69 70 >>> olleke 71 BitStream(pos=0:5) 72 """ 73 value = self.peek(n) 74 self.pos += n 75 if self.pos>len(self.data)*8: 76 raise ValueError('Read past end of stream') 77 return value 78 79 def peek(self, n): 80 """Peek an n bit integer from the stream without updating the pointer. 81 It is not an error to read beyond the end of the stream. 82 >>> olleke.data[:2]==b'\x1b\x2e' and 0x2e1b==11803 83 True 84 >>> olleke.peek(15) 85 11803 86 >>> hex(olleke.peek(32)) 87 '0x2e1b' 88 """ 89 #read bytes that contain the data: self.data[self.pos>>3:self.pos+n+7>>3] 90 #convert to int: int.from_bytes(..., 'little') 91 #shift out the bits from the first byte: >>(self.pos&7) 92 #mask unwanted bits: & (1<<n)-1 93 return int.from_bytes( 94 self.data[self.pos>>3:self.pos+n+7>>3], 95 'little')>>(self.pos&7) & (1<<n)-1 96 97 def readBytes(self, n): 98 """Read n bytes from the stream on a byte boundary. 99 """ 100 if self.pos&7: raise ValueError('readBytes: need byte boundary') 101 result = self.data[self.pos>>3:(self.pos>>3)+n] 102 self.pos += 8*n 103 return result 104 105#-----------------------Symbol------------------------------------------- 106class Symbol: 107 """A symbol in a code. 108 Refers back to the code that contains it. 109 Index is the place in the alphabet of the symbol. 110 """ 111 __slots__ = 'code', 'index' 112 def __init__(self, code, value): 113 self.code = code 114 self.index = value 115 116 def __repr__(self): 117 return 'Symbol({}, {})'.format(self.code.name, self.index) 118 119 def __len__(self): 120 """Number of bits in the prefix notation of this symbol 121 """ 122 return self.code.length(self.index) 123 124 def __int__(self): 125 return self.index 126 127 #these routines call equivalent routine in Code class 128 def bitPattern(self): 129 """Value of the symbol in the stream 130 """ 131 return self.code.bitPattern(self.index) 132 133 def extraBits(self): 134 """Number of extra bits to read for this symbol 135 """ 136 return self.code.extraBits(self.index) 137 138 def __str__(self): 139 """Short descriptor of the symbol without extra bits. 140 """ 141 return self.code.mnemonic(self.index) 142 143 #requiring optional extra bits, if self.code supports them 144 def value(self, extra=None): 145 """The value used for processing. Can be a tuple. 146 with optional extra bits 147 """ 148 if isinstance(self.code, WithExtra): 149 if not 0<=extra<1<<self.extraBits(): 150 raise ValueError("value: extra value doesn't fit in extraBits") 151 return self.code.value(self.index, extra) 152 if extra is not None: 153 raise ValueError('value: no extra bits for this code') 154 return self.code.value(self.index) 155 156 def explanation(self, extra=None): 157 """Long explanation of the value from the numeric value 158 with optional extra bits 159 Used by Layout.verboseRead when printing the value 160 """ 161 if isinstance(self.code, WithExtra): 162 return self.code.callback(self, extra) 163 return self.code.callback(self) 164 165#========================Code definitions================================== 166class RangeDecoder: 167 """A decoder for the Code class that assumes the symbols 168 are encoded consecutively in binary. 169 It all depends on the "alphabetSize" property. 170 The range runs from 0 to alphabetSize-1. 171 This is the default decoder. 172 """ 173 def __init__(self, *, alphabetSize=None, bitLength=None, **args): 174 if bitLength is not None: alphabetSize = 1<<bitLength 175 if alphabetSize is not None: 176 self.alphabetSize = alphabetSize 177 self.maxLength = (alphabetSize-1).bit_length() 178 179 def __len__(self): 180 return self.alphabetSize 181 182 def __iter__(self): 183 """Produce all symbols. 184 """ 185 return map(partial(Symbol, self), range(len(self))) 186 187 def __getitem__(self, index): 188 if index>=self.alphabetSize: raise ValueError('index out of range') 189 return Symbol(self, index) 190 191 def bitPattern(self, index): 192 return '{:0{}b}'.format(index, self.maxLength) 193 194 def length(self, index): 195 """Encoding length of given symbol. 196 Does not depend on index in this case. 197 """ 198 return self.maxLength 199 200 def decodePeek(self, data): 201 """Find which symbol index matches the given data (from peek, as a number) 202 and return the number of bits decoded. 203 Can also be used to figure out length of a symbol. 204 """ 205 return self.maxLength, Symbol(self, data&(1<<self.maxLength)-1) 206 207class PrefixDecoder: 208 """A decoder for the Code class that uses a prefix code. 209 The code is determined by encoding: 210 encode[p] gives the index corresponding to bit pattern p. 211 Used setDecode(decodeTable) to switch the decoder from the default 212 to a prefix decoder, or pass decodeTable at init. 213 You can also use setLength(lengthTable) 214 to define the encoding from the lengths. 215 The set of symbol values does not need to be consecutive. 216 """ 217 def __init__(self, *, decodeTable=None, **args): 218 if decodeTable is not None: self.setDecode(decodeTable) 219 220 def __len__(self): 221 return len(self.decodeTable) 222 223 def __iter__(self): 224 def revBits(index): 225 return self.bitPattern(index)[::-1] 226 return ( 227 Symbol(self, index) 228 for index in sorted(self.decodeTable.values(), key=revBits) 229 ) 230 231 def __getitem__(self, index): 232 if index not in self.lengthTable: 233 raise ValueError('No symbol {}[{}]'.format( 234 self.__class__.__name__, index)) 235 return Symbol(self, index) 236 237 def bitPattern(self, index): 238 bits = next(b for (b,s) in self.decodeTable.items() if s==index) 239 return '{:0{}b}'.format(bits, self.length(index)) 240 241 def length(self, index): 242 """Encoding length of given symbol. 243 """ 244 return self.lengthTable[index] 245 246 def decodePeek(self, data): 247 """Find which symbol index matches the given data (from peek, as a number) 248 and return the number of bits decoded. 249 Can also be used to figure out length of a symbol. 250 """ 251 #do binary search for word length 252 #invariant: lo<=length<=hi 253 lo, hi = self.minLength, self.maxLength 254 while lo<=hi: 255 mid = lo+hi>>1 256 #note lo<=mid<hi at this point 257 mask = (1<<mid)-1 258 #lets see what happens if we guess length is mid 259 try: index = self.decodeTable[data&mask] 260 except KeyError: 261 #too many bits specified, reduce estimated length 262 hi = mid-1 263 continue 264 #we found a symbol, but there could be a longer match 265 symbolLength = self.lengthTable[index] 266 if symbolLength<=mid: 267 #all bits match, symbol must be right 268 return symbolLength, Symbol(self, index) 269 #there must be more bits to match 270 lo = mid+1 271 return lo, Symbol(self, index) 272 273 #routine to set up the tables 274 def setDecode(self, decodeTable): 275 """Store decodeTable, 276 and compute lengthTable, minLength, maxLength from encodings. 277 """ 278 self.decodeTable = decodeTable 279 #set of symbols with unknown length 280 todo = set(decodeTable) 281 #bit size under investigation 282 maskLength = 0 283 lengthTable = {} 284 while todo: 285 mask = (1<<maskLength)-1 286 #split the encodings that we didn't find yet using b bits 287 splitSymbols = defaultdict(list) 288 for s in todo: splitSymbols[s&mask].append(s) 289 #unique encodings have a length of maskLength bits 290 #set length, and remove from todo list 291 for s,subset in splitSymbols.items(): 292 if len(subset)==1: 293 lengthTable[self.decodeTable[s]] = maskLength 294 todo.remove(s) 295 #now investigate with longer mask 296 maskLength +=1 297 #save result 298 self.lengthTable = lengthTable 299 self.minLength = min(lengthTable.values()) 300 self.maxLength = max(lengthTable.values()) 301 self.switchToPrefix() 302 303 def setLength(self, lengthTable): 304 """Given the bit pattern lengths for symbols given in lengthTable, 305 set decodeTable, minLength, maxLength 306 """ 307 self.lengthTable = lengthTable 308 self.minLength = min(lengthTable.values()) 309 self.maxLength = max(lengthTable.values()) 310 #compute the backwards codes first; then reverse them 311 #compute (backwards) first code for every separate lengths 312 nextCodes = [] 313 #build codes for each length, from right to left 314 code = 0 315 for bits in range(self.maxLength+1): 316 code <<= 1 317 nextCodes.append(code) 318 code += sum(x==bits for x in lengthTable.values()) 319 self.decodeTable = {} 320 #count codes for each length, and store reversed in the table 321 for symbol in sorted(lengthTable): 322 bits = lengthTable[symbol] 323 bitpattern = '{:0{}b}'.format(nextCodes[bits], bits) 324 self.decodeTable[int(bitpattern[::-1], 2)] = symbol 325 nextCodes[bits] += 1 326 self.switchToPrefix() 327 328 def switchToPrefix(self): 329 """This routine makes sure the prefix decoder is activated. 330 """ 331 self.mode = PrefixDecoder 332 333class Code(RangeDecoder, PrefixDecoder): 334 """An alphabet of symbols, that can be read from a stream. 335 If you use setDecode or setLength, you have a prefix code, 336 otherwise you have a range code. 337 Features: 338 code[index] produces symbol with given index 339 value(index): value of symbol 340 mnemonic(index): short description of symbol 341 explanation(index): show meaning of symbol, shown in Layout.verboseRead 342 iter(code): produce all symbols in some order 343 name: show as context in Layout.verboseRead 344 """ 345 name = '?' 346 #callback is a function that gets the symbol and the extra bits 347 #default callback calls explanation 348 def __init__(self, name=None, *, callback=None, description='', **args): 349 """Don't forget to set either alphabetSize or decodeTable 350 """ 351 #set name when provided, otherwise take class variable 352 if name is not None: self.name = name 353 if callback is not None: self.callback = callback 354 self.description = description 355 #mode switch 356 if 'bitLength' in args or 'alphabetSize' in args: 357 self.mode = RangeDecoder 358 RangeDecoder.__init__(self, **args) 359 elif 'decodeTable' in args: 360 self.mode = PrefixDecoder 361 PrefixDecoder.__init__(self, **args) 362 else: 363 super().__init__(**args) 364 365 def __repr__(self): 366 return self.__class__.__name__+' '+self.name 367 368 #the routines that get switched between RangeDecoder and PrefixDecoder 369 def __len__(self): return self.mode.__len__(self) 370 def __iter__(self): return self.mode.__iter__(self) 371 def __getitem__(self, index): return self.mode.__getitem__(self, index) 372 def bitPattern(self, index): return self.mode.bitPattern(self, index) 373 def length(self, index): return self.mode.length(self, index) 374 def decodePeek(self, data): return self.mode.decodePeek(self, data) 375 #general routines 376 def value(self, index, extra=None): 377 """Get value of symbol for computations. 378 Override where needed. 379 """ 380 if extra is not None: 381 raise ValueError('value: no extra for this symbol') 382 return index 383 384 def mnemonic(self, index): 385 """Give mnemonic of symbol. 386 Override where needed. 387 """ 388 return str(self.value(index)) 389 390 def callback(self, symbol): 391 return self.explanation(symbol.index) 392 393 def explanation(self, index): 394 """Long explanation of the value from the numeric value 395 This is a default routine. 396 You can customize in three ways: 397 - set description to add some text 398 - override to get more control 399 - set callback to make it dependent on you local variables 400 """ 401 value = self.value(index) 402 return '{0}{1}: {2}'.format( 403 self.description and self.description+': ', 404 self.bitPattern(index), 405 value, 406 ) 407 408 def extraBits(self, index): 409 return 0 410 411 #Routines that use the decode interface 412 def showCode(self, width=80): 413 """Show all words of the code in a nice format. 414 """ 415 #make table of all symbols with binary strings 416 symbolStrings = [ 417 (self.bitPattern(s.index), self.mnemonic(s.index)) 418 for s in self 419 ] 420 #determine column widths the way Lisp programmers do it 421 leftColWidth, rightColWidth = map(max, map( 422 map, 423 repeat(len), 424 zip(*symbolStrings) 425 )) 426 colwidth = leftColWidth+rightColWidth 427 columns = 81//(colwidth+2) 428 rows = -(-len(symbolStrings)//columns) 429 def justify(bs): 430 b,s = bs 431 return b.rjust(leftColWidth)+':'+s.ljust(rightColWidth) 432 for i in range(rows): 433 print(' '.join(map(justify, symbolStrings[i::rows])).rstrip()) 434 435 def readTuple(self, stream): 436 """Read symbol from stream. Returns symbol, length. 437 """ 438 length, symbol = self.decodePeek(stream.peek(self.maxLength)) 439 stream.pos += length 440 return length, symbol 441 442 def readTupleAndExtra(self, stream): 443 return self.readTuple(stream)+(0, None) 444 445class WithExtra(Code): 446 """Extension for Code so that symbol may have extra bits associated. 447 If you supply an extraTable, you can use extraBits 448 You can define an extraTable, 449 which allows to call extraBits to get the number of extraBits. 450 Otherwise, you can supply extraBits yourself. 451 Routine readTupleAndExtra now reads the extra bits too. 452 Value probably needs to be overridden; see Enumerator. 453 Note: this does not give you an decodeTable. 454 """ 455 #redefine these if you don't want to use an extraTable 456 def extraBits(self, index): 457 """Get the number of extra bits for this symbol. 458 """ 459 return self.extraTable[index] 460 461 def mnemonic(self, index): 462 """This value must be independent of extra. 463 """ 464 return str(index) 465 466 def readTupleAndExtra(self, stream): 467 """Read symbol and extrabits from stream. 468 Returns symbol length, symbol, extraBits, extra 469 >>> olleke.pos = 6 470 >>> MetablockLengthAlphabet().readTupleAndExtra(olleke) 471 (2, Symbol(MLEN, 4), 16, 46) 472 """ 473 length, symbol = self.decodePeek(stream.peek(self.maxLength)) 474 stream.pos += length 475 extraBits = self.extraBits(symbol.index) 476 return length, symbol, extraBits, stream.read(extraBits) 477 478 def explanation(self, index, extra=None): 479 """Expanded version of Code.explanation supporting extra bits. 480 If you don't supply extra, it is not mentioned. 481 """ 482 extraBits = 0 if extra is None else self.extraBits(index) 483 if not hasattr(self, 'extraTable'): 484 formatString = '{0}{3}' 485 lo = hi = value = self.value(index, extra) 486 elif extraBits==0: 487 formatString = '{0}{2}: {3}' 488 lo, hi = self.span(index) 489 value = lo 490 else: 491 formatString = '{0}{1} {2}: {3}-{4}; {3}+{5}={6}' 492 lo, hi = self.span(index) 493 value = lo+extra 494 return formatString.format( 495 self.description and self.description+': ', 496 'x'*extraBits, 497 self.bitPattern(index), 498 lo, hi, 499 extra, 500 value, 501 ) 502 503 def callback(self, symbol, extra): 504 return self.explanation(symbol.index, extra) 505 506class BoolCode(Code): 507 """Same as Code(bitLength=1), but shows a boolean. 508 """ 509 def __init__(self, name=None, **args): 510 super().__init__(name, bitLength=1, **args) 511 512 def value(self, index, extra=None): 513 return bool(super().value(index, extra)) 514 515class Enumerator(WithExtra): 516 """Code that is defined by the ExtraTable. 517 extraTable is a class variable that contains 518 the extraBits of the symbols from 0 519 value0 contains the value of symbol 0 520 encodings is not neccessary, but allowed. 521 Note: place for FixedCode to make sure extraBits works 522 """ 523 def __init__(self, name=None, **args): 524 #if there is no decodeTable to determine length, compute it ourselves 525 if 'decodeTable' not in args: 526 args['alphabetSize'] = len(self.extraTable) 527 super().__init__(name, **args) 528 529 def __len__(self): 530 return len(self.extraTable) 531 532 def __getitem__(self, index): 533 """Faster than PrefixDecoder 534 """ 535 if index>=len(self.extraTable): 536 raise ValueError("No symbol {}[{}]".format( 537 self.__class__.__name__, index)) 538 return Symbol(self, index) 539 540 def value(self, index, extra): 541 """Override if you don't define value0 and extraTable 542 """ 543 lower, upper = self.span(index) 544 value = lower+(extra or 0) 545 if value>upper: 546 raise ValueError('value: extra out of range') 547 return value 548 549 def span(self, index): 550 """Give the range of possible values in a tuple 551 Useful for mnemonic and explanation 552 """ 553 lower = self.value0+sum(1<<x for x in self.extraTable[:index]) 554 upper = lower+(1<<self.extraTable[index]) 555 return lower, upper-1 556 557#======================Code subclasses====================================== 558#Alphabets used in the metablock header---------------------------------- 559#For prefix codes 560class PrefixCodeHeader(WithExtra): 561 """Header of prefix codes. 562 """ 563 def __init__(self, codename): 564 super().__init__('PFX', bitLength=2) 565 #this is the name of the code that it describes 566 self.codename = codename 567 568 def extraBits(self, index): 569 return 2 if index==1 else 0 570 571 def value(self, index, extra): 572 """Returns ('Simple', #codewords) or ('Complex', HSKIP) 573 """ 574 if index==1: 575 if extra>3: 576 raise ValueError('value: extra out of range') 577 return 'Simple', extra+1 578 if extra: 579 raise ValueError('value: extra out of range') 580 return 'Complex', index 581 582 def explanation(self, index, extra): 583 if index==1: 584 return '{} is simple with {} code word{}'.format( 585 self.codename, extra+1, 's' if extra else '') 586 lengths = [1, 2, 3, 4, 0, 5, 17, 6] 587 return '{} is complex with lengths {}...'.format( 588 self.codename, 589 ','.join( 590 map(str, lengths[index:index+5])) 591 ) 592 593class TreeShapeAlhabet(BoolCode): 594 """The bit used to indicate if four word code is "deep" or "wide" 595 """ 596 name = 'SHAPE' 597 def value(self, index): 598 return [(2,2,2,2), (1,2,3,3)][index] 599 600 def explanation(self, index): 601 return str(bool(index))+': lengths {},{},{},{}'.format(*self.value(index)) 602 603class LengthOfLengthAlphabet(Code): 604 """For use in decoding complex code descriptors. 605 >>> lengthOfLengthAlphabet = LengthOfLengthAlphabet('') 606 >>> print(lengthOfLengthAlphabet[2]) 607 coded with 2 bits 608 >>> len(lengthOfLengthAlphabet[0]) 609 2 610 >>> [len(lengthOfLengthAlphabet[x]) for x in range(6)] 611 [2, 4, 3, 2, 2, 4] 612 >>> lengthOfLengthAlphabet.showCode() 613 00:skipped 01:coded with 4 bits 0111:coded with 1 bits 614 10:coded with 3 bits 011:coded with 2 bits 1111:coded with 5 bits 615 """ 616 decodeTable = { 617 0b00:0, 0b10:3, 618 0b0111:1, 0b01:4, 619 0b011:2, 0b1111:5, 620 } 621 622 def __init__(self, name=None, **args): 623 super().__init__(name, decodeTable=self.decodeTable, **args) 624 625 def mnemonic(self, index): 626 if index==0: return 'skipped' 627 return 'coded with {} bits'.format(index) 628 629 def explanation(self, index, extra=None): 630 return self.description+': '+self.mnemonic(index) 631 632class LengthAlphabet(WithExtra): 633 """Length of symbols 634 Used during construction of a code. 635 """ 636 def __init__(self, name): 637 super().__init__(name, alphabetSize=18) 638 639 def extraBits(self, index): 640 return {16:2, 17:3}.get(index, 0) 641 642 def mnemonic(self, index): 643 if index==0: return 'unused' 644 elif index==16: return 'rep xx' 645 elif index==17: return 'zero xxx' 646 else: return 'len {}'.format(index) 647 648 def explanation(self, index, extra): 649 return self.description.format(self[index], extra) 650 651 def value(self, index, extra): 652 #the caller got the length already, so extra is enough 653 return extra 654 655#Stream header 656class WindowSizeAlphabet(Code): 657 """The alphabet used for window size in the stream header. 658 >>> WindowSizeAlphabet()[10].explanation() 659 'windowsize=(1<<10)-16=1008' 660 """ 661 decodeTable = { 662 0b0100001: 10, 0b1100001: 14, 0b0011: 18, 0b1011: 22, 663 0b0110001: 11, 0b1110001: 15, 0b0101: 19, 0b1101: 23, 664 0b1000001: 12, 0b0: 16, 0b0111: 20, 0b1111: 24, 665 0b1010001: 13, 0b0000001: 17, 0b1001: 21, 666 0b0010001: None, 667 } 668 669 name = 'WSIZE' 670 671 def __init__(self, name=None): 672 super().__init__(name, decodeTable=self.decodeTable) 673 674 def value(self, index): 675 #missing value gives index None 676 if index is None: return None 677 return (1<<index)-16 678 679 def explanation(self, index): 680 return 'windowsize=(1<<{})-16={}'.format( 681 index, (1<<index)-16) 682 683#Metablock 684class MetablockLengthAlphabet(WithExtra): 685 """Used for the meta block length; 686 also indicates a block with no data 687 >>> metablockLengthAlphabet = MetablockLengthAlphabet() 688 >>> metablockLengthAlphabet[0]; str(metablockLengthAlphabet[0]) 689 Symbol(MLEN, 0) 690 'empty' 691 >>> metablockLengthAlphabet[3] 692 Traceback (most recent call last): 693 ... 694 ValueError: No symbol MetablockLengthAlphabet[3] 695 >>> print(metablockLengthAlphabet[4]) 696 hhhh00 697 >>> metablockLengthAlphabet[4].value(0x1000) 698 4097 699 >>> metablockLengthAlphabet[5].value(0x1000) 700 Traceback (most recent call last): 701 ... 702 InvalidStream: Zeros in high nibble of MLEN 703 >>> metablockLengthAlphabet[5].explanation(0x12345) 704 'data length: 12345h+1=74566' 705 >>> metablockLengthAlphabet.showCode() 706 00:hhhh00 10:hhhhhh10 01:hhhhh01 11:empty 707 """ 708 decodeTable = {0b11:0, 0b00:4, 0b01:5, 0b10:6} 709 710 name = 'MLEN' 711 def __init__(self, name=None): 712 super().__init__(name, decodeTable=self.decodeTable) 713 714 def extraBits(self, index): 715 return index*4 716 717 def mnemonic(self, index): 718 if index==0: return 'empty' 719 return 'h'*(self.extraBits(index)//4)+self.bitPattern(index) 720 721 def value(self, index, extra): 722 extraBits = self.extraBits(index) 723 if not 0<=extra<1<<extraBits: 724 raise ValueError('value: extra out of range') 725 if index==0: return 0 726 if index>4 and extra>>extraBits-4==0: raise InvalidStream( 727 'Zeros in high nibble of MLEN') 728 return extra+1 729 730 def explanation(self, index, extra): 731 if index==0: return '11: empty block' 732 extraBits = self.extraBits(index) 733 return 'data length: {:0{}x}h+1={}'.format(extra, extraBits//4, extra+1) 734 735 736class ReservedAlphabet(BoolCode): 737 """The reserved bit that must be zero. 738 """ 739 name = 'RSVD' 740 def value(self, index): 741 if index: raise ValueError('Reserved bit is not zero') 742 743 def explanation(self, index): 744 return 'Reserved (must be zero)' 745 746class FillerAlphabet(Code): 747 def __init__(self, *, streamPos): 748 super().__init__('SKIP', bitLength=(-streamPos)&7) 749 750 def explanation(self, index): 751 return '{} bit{} ignored'.format( 752 self.length(index), 753 '' if self.length(index)==1 else 's', 754 ) 755 756class SkipLengthAlphabet(WithExtra): 757 """Used for the skip length in an empty metablock 758 >>> skipLengthAlphabet = SkipLengthAlphabet() 759 >>> skipLengthAlphabet[0]; str(skipLengthAlphabet[0]) 760 Symbol(SKIP, 0) 761 'empty' 762 >>> skipLengthAlphabet[4] 763 Traceback (most recent call last): 764 ... 765 ValueError: index out of range 766 >>> print(skipLengthAlphabet[3]) 767 hhhhhh11 768 >>> skipLengthAlphabet[2].value(0x1000) 769 4097 770 >>> skipLengthAlphabet[3].value(0x1000) 771 Traceback (most recent call last): 772 ... 773 InvalidStream: Zeros in high byte of SKIPBYTES 774 >>> skipLengthAlphabet[3].explanation(0x12345) 775 'skip length: 12345h+1=74566' 776 >>> skipLengthAlphabet.showCode() 777 00:empty 01:hh01 10:hhhh10 11:hhhhhh11 778 """ 779 def __init__(self): 780 super().__init__('SKIP', bitLength=2) 781 782 def extraBits(self, index): 783 return index*8 784 785 def mnemonic(self, index): 786 if index==0: return 'empty' 787 return 'h'*(self.extraBits(index)//4)+self.bitPattern(index) 788 789 def value(self, index, extra): 790 extraBits = self.extraBits(index) 791 if not 0<=extra<1<<extraBits: 792 raise ValueError('value: extra out of range') 793 if index==0: return 0 794 if index>1 and extra>>extraBits-8==0: 795 raise InvalidStream('Zeros in high byte of SKIPBYTES') 796 return extra+1 797 798 def explanation(self, index, extra): 799 if index==0: return '00: no skip' 800 extraBits = self.extraBits(index) 801 return 'skip length: {:{}x}h+1={}'.format(extra, extraBits//8, extra+1) 802 803 804class TypeCountAlphabet(Enumerator): 805 """Used for giving block type counts and tree counts. 806 >>> TypeCountAlphabet(description='').showCode() 807 0:0 0101:xx,0101 1011:xxxxx,1011 808 0001:0001 1101:xxxxxx,1101 0111:xxx,0111 809 1001:xxxx,1001 0011:x,0011 1111:xxxxxxx,1111 810 """ 811 decodeTable = { 812 0b0: 0, 0b1001: 5, 813 0b0001: 1, 0b1011: 6, 814 0b0011: 2, 0b1101: 7, 815 0b0101: 3, 0b1111: 8, 816 0b0111: 4, 817 } 818 819 value0 = 1 820 extraTable = [0, 0, 1, 2, 3, 4, 5, 6, 7] 821 name = 'BT#' 822 823 def __init__(self, name=None, *, description): 824 super().__init__( 825 name, 826 decodeTable=self.decodeTable, 827 description=description) 828 829 def mnemonic(self, index): 830 if index==0: return '0' 831 if index==1: return '0001' 832 return 'x'*(self.extraBits(index))+','+self.bitPattern(index) 833 834 def explanation(self, index, extra): 835 value = self.value(index, extra) 836 description = self.description 837 if value==1: description = description[:-1] 838 return '{}: {} {}'.format( 839 self.mnemonic(index), 840 value, 841 description) 842 843class BlockTypeAlphabet(Code): 844 """The block types; this code works for all three kinds. 845 >>> b = BlockTypeAlphabet('T', NBLTYPES=5) 846 >>> print(*(x for x in b)) 847 prev +1 #0 #1 #2 #3 #4 848 """ 849 def __init__(self, name, NBLTYPES, **args): 850 super().__init__(name, alphabetSize=NBLTYPES+2, **args) 851 self.NBLTYPES = NBLTYPES 852 853 def mnemonic(self, index): 854 if index==0: return 'prev' 855 elif index==1: return '+1' 856 else: return '#'+str(index-2) 857 858 def value(self, index): 859 return index-2 860 861 def explanation(self, index): 862 if index==0: return '0: previous' 863 elif index==1: return '1: increment' 864 else: return 'Set block type to: '+str(index-2) 865 866class BlockCountAlphabet(Enumerator): 867 """Block counts 868 >>> b = BlockCountAlphabet('L') 869 >>> print(b[25]) 870 [24*x]: BC16625-16793840 871 """ 872 873 value0 = 1 874 extraTable = [2,2,2,2,3, 3,3,3,4,4, 4,4,5,5,5, 5,6,6,7,8, 9,10,11,12,13, 24] 875 def __init__(self, name, **args): 876 super().__init__(name, alphabetSize=26, **args) 877 878 def mnemonic(self, index): 879 extraBits = self.extraBits(index) 880 return '{}: BC{}-{}'.format( 881 'x'*extraBits if index<5 else '[{}*x]'.format(extraBits), 882 *self.span(index)) 883 884 def explanation(self, index, extra): 885 return 'Block count: '+super().explanation(index, extra) 886 887class DistanceParamAlphabet(WithExtra): 888 """The distance parameters NPOSTFIX and NDIRECT. 889 Although these are treated as two in the description, this is easier. 890 """ 891 def __init__(self): 892 super().__init__('DIST', bitLength=2) 893 894 def extraBits(self, index): 895 return 4 896 897 def value(self, index, extra): 898 """Returns NPOSTFIX and NDIRECT<<NPOSTFIX 899 """ 900 if extra>15: 901 raise ValueError('value: extra out of range') 902 return index, extra<<index 903 904 def explanation(self, index, extra): 905 return '{} postfix bits and {:04b}<<{}={} direct codes'.format( 906 index, extra, index, extra<<index) 907 908 def mnemonic(self, index): 909 return 'PF'+str(index) 910 911class LiteralContextMode(Code): 912 """For the literal context modes. 913 >>> LiteralContextMode().showCode() 914 00:LSB6 01:MSB6 10:UTF8 11:Signed 915 >>> LiteralContextMode().explanation(2) 916 'Context mode for type 9: 2(UTF8)' 917 """ 918 919 def __init__(self, *, number=9): 920 super().__init__('LC'+str(number), bitLength=2) 921 self.number = number 922 923 def mnemonic(self, index): 924 return ['LSB6', 'MSB6', 'UTF8', 'Signed'][index] 925 926 def explanation(self, index): 927 return 'Context mode for type {}: {}({})'.format( 928 self.number, 929 index, 930 self.mnemonic(index)) 931 932class RLEmaxAlphabet(Enumerator): 933 """Used for describing the run length encoding used for describing context maps. 934 >>> RLEmaxAlphabet().showCode() 935 0:1 1:more 936 """ 937 value0 = 0 938 extraTable = [0, 4] 939 name = 'RLE#' 940 941 def mnemonic(self, index): 942 return ['1', 'more'][index] 943 944 def explanation(self, index, extra): 945 description = self.description and self.description+': ' 946 if index==0: return description+'No RLE coding' 947 return '{}xxxx 1: RLEMAX={}'.format(description, extra+1) 948 949class TreeAlphabet(WithExtra): 950 """The alphabet to enumerate entries (called trees) in the context map. 951 parameters are RLEMAX and NTREES 952 >>> t = TreeAlphabet('', RLEMAX=3, NTREES=5) 953 >>> len(t) 954 8 955 >>> print(t[2]) 956 xx+4 zeroes 957 >>> t[3].explanation(2) 958 '8+010=10 zeroes' 959 >>> t[0].value(0) 960 (1, 0) 961 """ 962 name = 'CMI' 963 def __init__(self, name=None, *, RLEMAX, NTREES, **args): 964 super().__init__(name, alphabetSize=RLEMAX+NTREES, **args) 965 self.RLEMAX = RLEMAX 966 self.NTREES = NTREES 967 968 def extraBits(self, index): 969 if 0<index<=self.RLEMAX: return index 970 return 0 971 972 def mnemonic(self, index): 973 if index==0: return 'map #0' 974 if index<=self.RLEMAX: 975 return '{}+{} zeroes'.format('x'*index, 1<<index) 976 return 'map #{}'.format(index-self.RLEMAX) 977 978 def value(self, index, extra): 979 """Give count and value.""" 980 index = index 981 if index==0: return 1, 0 982 if index<=self.RLEMAX: return (1<<index)+extra, 0 983 return 1, index-self.RLEMAX 984 985 def explanation(self, index, extra): 986 description = self.description and self.description+': ' 987 if index==0: return description+'map #0' 988 if index<=self.RLEMAX: 989 return '{}+{:0{}b}={} zeroes'.format( 990 (1<<index), 991 extra, self.extraBits(index), 992 (1<<index)+extra) 993 return '{}map #{}-{}={}'.format( 994 description, 995 index, self.RLEMAX, index-self.RLEMAX) 996 997#Prefix alphabets for the data stream---------------------------------- 998class LiteralAlphabet(Code): 999 """Alphabet of symbols. 1000 """ 1001 minLength = maxLength = 8 1002 def __init__(self, number): 1003 super().__init__('L'+str(number), alphabetSize=1<<8) 1004 1005 def mnemonic(self, index): 1006 return outputCharFormatter(index) 1007 1008 def value(self, index, extra=None): 1009 return index 1010 1011 def explanation(self, index, extra=None): 1012 return self.mnemonic(index) 1013 1014class InsertLengthAlphabet(Enumerator): 1015 """Intern code for insert counts 1016 """ 1017 value0 = 0 1018 extraTable = [0,0,0,0,0, 0,1,1,2,2, 3,3,4,4,5, 5,6,7,8,9, 10,12,14,24] 1019 1020class CopyLengthAlphabet(Enumerator): 1021 value0 = 2 1022 extraTable = [0,0,0,0,0, 0,0,0,1,1, 2,2,3,3,4, 4,5,5,6,7, 8,9,10,24] 1023 1024class InsertAndCopyAlphabet(WithExtra): 1025 """The insert and copy code 1026 >>> for x in range(0,704,704//13): 1027 ... print('{:10b}'.format(x), InsertAndCopyAlphabet()[x]) 1028 0 I0C2&D=0 1029 110110 I6+xC8&D=0 1030 1101100 I5C22+xxx&D=0 1031 10100010 I4C4 1032 11011000 I3C10+x 1033 100001110 I14+xxC8 1034 101000100 I10+xxC22+xxx 1035 101111010 I98+xxxxxC14+xx 1036 110110000 I6+xC70+xxxxx 1037 111100110 I1090+[10*x]C8 1038 1000011100 I26+xxxC326+[8*x] 1039 1001010010 I322+[8*x]C14+xx 1040 1010001000 I194+[7*x]C70+xxxxx 1041 1010111110 I22594+[24*x]C1094+[10*x] 1042 """ 1043 insertLengthAlphabet = InsertLengthAlphabet(None) 1044 copyLengthAlphabet = CopyLengthAlphabet(None) 1045 1046 def __init__(self, number=''): 1047 super().__init__('IC'+str(number), bitLength=10) 1048 1049 def __len__(self): 1050 return 704 1051 1052 def extraBits(self, index): 1053 insertSymbol, copySymbol, dist0 = self.splitSymbol(index) 1054 return InsertLengthAlphabet.extraTable[insertSymbol.index] + \ 1055 CopyLengthAlphabet.extraTable[copySymbol.index] 1056 1057 def splitSymbol(self, index): 1058 """Give relevant values for computations: 1059 (insertSymbol, copySymbol, dist0flag) 1060 """ 1061 #determine insert and copy upper bits from table 1062 row = [0,0,1,1,2,2,1,3,2,3,3][index>>6] 1063 col = [0,1,0,1,0,1,2,0,2,1,2][index>>6] 1064 #determine inserts and copy sub codes 1065 insertLengthCode = row<<3 | index>>3&7 1066 if row: insertLengthCode -= 8 1067 copyLengthCode = col<<3 | index&7 1068 return ( 1069 Symbol(self.insertLengthAlphabet, insertLengthCode), 1070 Symbol(self.copyLengthAlphabet, copyLengthCode), 1071 row==0 1072 ) 1073 1074 def mnemonic(self, index): 1075 """Make a nice mnemonic 1076 """ 1077 i,c,d0 = self.splitSymbol(index) 1078 iLower, _ = i.code.span(i.index) 1079 iExtra = i.extraBits() 1080 cLower, _ = c.code.span(c.index) 1081 cExtra = c.extraBits() 1082 return 'I{}{}{}C{}{}{}{}'.format( 1083 iLower, 1084 '+' if iExtra else '', 1085 'x'*iExtra if iExtra<6 else '[{}*x]'.format(iExtra), 1086 cLower, 1087 '+' if cExtra else '', 1088 'x'*cExtra if cExtra<6 else '[{}*x]'.format(cExtra), 1089 '&D=0' if d0 else '') 1090 1091 def value(self, index, extra): 1092 i,c,d0 = self.splitSymbol(index) 1093 iExtra = i.extraBits() 1094 ce, ie = extra>>iExtra, extra&(1<<iExtra)-1 1095 insert = i.value(ie) 1096 copy = c.value(ce) 1097 return insert, copy, d0 1098 1099 def explanation(self, index, extra): 1100 insert, copy, d0 = self.value(index, extra) 1101 if d0: return 'Literal: {}, copy: {}, same distance'.format(insert, copy) 1102 else: return 'Literal: {}, copy: {}'.format(insert, copy) 1103 1104class DistanceAlphabet(WithExtra): 1105 """Represent the distance encoding. 1106 Dynamically generated alphabet. 1107 This is what the documentation should have said: 1108 Ignoring offsets for the moment, the "long" encoding works as follows: 1109 Write the distance in binary as follows: 1110 1xy..yz..z, then the distance symbol consists of n..nxz..z 1111 Where: 1112 n is one less than number of bits in y 1113 x is a single bit 1114 y..y are n+1 extra bits (encoded in the bit stream) 1115 z..z is NPOSTFIX bits that are part of the symbol 1116 The offsets are so as to start at the lowest useable value: 1117 if 1xyyyyz = distance +(4<<POSTFIX)-NDIRECT-1 1118 then n..nxz..z is symbol -NDIRECT-16 1119 >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) 1120 >>> print(d[4], d[17], d[34]) 1121 last-1 1 10xx00-5 1122 >>> [str(d[x]) for x in range(26, 32)] 1123 ['10x00-5', '10x01-5', '10x10-5', '10x11-5', '11x00-5', '11x01-5'] 1124 """ 1125 def __init__(self, number, *, NPOSTFIX, NDIRECT): 1126 self.NPOSTFIX = NPOSTFIX 1127 self.NDIRECT = NDIRECT 1128 #set length 1129 #Actually, not all symbols are used, 1130 #only NDIRECT+16+(44-2*POSTFIX<<NPOSTFIX) 1131 super().__init__('D'+str(number), 1132 alphabetSize=self.NDIRECT+16+(48<<self.NPOSTFIX)) 1133 1134 def extraBits(self, index): 1135 """Indicate how many extra bits are needed to interpret symbol 1136 >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) 1137 >>> [d[i].extraBits() for i in range(26)] 1138 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 1139 >>> [d[i].extraBits() for i in range(26,36)] 1140 [1, 1, 1, 1, 1, 1, 1, 1, 2, 2] 1141 """ 1142 if index<16+self.NDIRECT: return 0 1143 return 1 + ((index - self.NDIRECT - 16) >> (self.NPOSTFIX + 1)) 1144 1145 def value(self, dcode, dextra): 1146 """Decode value of symbol together with the extra bits. 1147 >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) 1148 >>> d[34].value(2) 1149 (0, 35) 1150 """ 1151 if dcode<16: 1152 return [(1,0),(2,0),(3,0),(4,0), 1153 (1,-1),(1,+1),(1,-2),(1,+2),(1,-3),(1,+3), 1154 (2,-1),(2,+1),(2,-2),(2,+2),(2,-3),(2,+3) 1155 ][dcode] 1156 if dcode<16+self.NDIRECT: 1157 return (0,dcode-16) 1158 #we use the original formulas, instead of my clear explanation 1159 POSTFIX_MASK = (1 << self.NPOSTFIX) - 1 1160 ndistbits = 1 + ((dcode - self.NDIRECT - 16) >> (self.NPOSTFIX + 1)) 1161 hcode = (dcode - self.NDIRECT - 16) >> self.NPOSTFIX 1162 lcode = (dcode - self.NDIRECT - 16) & POSTFIX_MASK 1163 offset = ((2 + (hcode & 1)) << ndistbits) - 4 1164 distance = ((offset + dextra) << self.NPOSTFIX) + lcode + self.NDIRECT + 1 1165 return (0,distance) 1166 1167 def mnemonic(self, index, verbose=False): 1168 """Give mnemonic representation of meaning. 1169 verbose compresses strings of x's 1170 """ 1171 if index<16: 1172 return ['last', '2last', '3last', '4last', 1173 'last-1', 'last+1', 'last-2', 'last+2', 'last-3', 'last+3', 1174 '2last-1', '2last+1', '2last-2', '2last+2', '2last-3', '2last+3' 1175 ][index] 1176 if index<16+self.NDIRECT: 1177 return str(index-16) 1178 #construct strings like "1xx01-15" 1179 index -= self.NDIRECT+16 1180 hcode = index >> self.NPOSTFIX 1181 lcode = index & (1<<self.NPOSTFIX)-1 1182 if self.NPOSTFIX: formatString = '1{0}{1}{2:0{3}b}{4:+d}' 1183 else: formatString = '1{0}{1}{4:+d}' 1184 return formatString.format( 1185 hcode&1, 1186 'x'*(2+hcode>>1) if hcode<13 or verbose else '[{}*x]'.format(2+hcode>>1), 1187 lcode, self.NPOSTFIX, 1188 self.NDIRECT+1-(4<<self.NPOSTFIX)) 1189 1190 def explanation(self, index, extra): 1191 """ 1192 >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) 1193 >>> d[55].explanation(13) 1194 '11[1101]01-5: [0]+240' 1195 """ 1196 extraBits = self.extraBits(index) 1197 extraString = '[{:0{}b}]'.format(extra, extraBits) 1198 return '{0}: [{1[0]}]{1[1]:+d}'.format( 1199 self.mnemonic(index, True).replace('x'*(extraBits or 1), extraString), 1200 self.value(index, extra)) 1201 1202#Classes for doing actual work------------------------------------------ 1203class ContextModeKeeper: 1204 """For computing the literal context mode. 1205 You feed it characters, and it computes indices in the context map. 1206 """ 1207 def __init__(self, mode): 1208 self.chars = deque([0,0], maxlen=2) 1209 self.mode = mode 1210 1211 def setContextMode(self, mode): 1212 """Switch to given context mode (0..3)""" 1213 self.mode = mode 1214 def getIndex(self): 1215 if self.mode==0: #LSB6 1216 return self.chars[1]&0x3f 1217 elif self.mode==1: #MSB6 1218 return self.chars[1]>>2 1219 elif self.mode==2: #UTF8: character class of previous and a bit of the second 1220 p2,p1 = self.chars 1221 return self.lut0[p1]|self.lut1[p2] 1222 elif self.mode==3: #Signed: initial bits of last two bytes 1223 p2,p1 = self.chars 1224 return self.lut2[p1]<<3|self.lut2[p2] 1225 1226 def add(self, index): 1227 """Adjust the context for output char (as int).""" 1228 self.chars.append(index) 1229 1230 #0: control #16: quote #32: ,:; #48: AEIOU 1231 #4: tab/lf/cr #20: % #36: . #52: BC..Z 1232 #8: space #24: (<[{ #40: = #56: aeiou 1233 #12:!#$&*+-/?@| #28: )>]} #44: 0-9 #60: bc..z 1234 lut0 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 4, 0, 0, 1235 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1236 8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12, 1237 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12, 1238 12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48, 1239 52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12, 1240 12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56, 1241 60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12, 0 1242 ]+[0,1]*32+[2,3]*32 1243 #0: space 1:punctuation 2:digit/upper 3:lower 1244 lut1 = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1245 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1246 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1247 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1248 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1249 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1250 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1251 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0 1252 ]+[0]*96+[2]*32 1253 #initial bits: 8*0, 4*0, 2*0, 1*0, 1*1, 2*1, 4*1, 8*1 1254 lut2 = [0]+[1]*15+[2]*48+[3]*64+[4]*64+[5]*48+[6]*15+[7] 1255 assert len(lut0)==len(lut1)==len(lut2)==256 1256 1257class WordList: 1258 """Word list. 1259 >>> WordList().word(7, 35555) 1260 b'Program to ' 1261 """ 1262 NDBITS = [0, 0, 0, 0, 10, 10, 11, 11, 10, 10, 1263 10, 10, 10, 9, 9, 8, 7, 7, 8, 7, 1264 7, 6, 6, 5, 5] 1265 def __init__(self): 1266 self.file = open('dict', 'rb') 1267 self.compileActions() 1268 1269 def word(self, size, dist): 1270 """Get word 1271 """ 1272 #split dist in index and action 1273 ndbits = self.NDBITS[size] 1274 index = dist&(1<<ndbits)-1 1275 action = dist>>ndbits 1276 #compute position in file 1277 position = sum(n<<self.NDBITS[n] for n in range(4,size))+size*index 1278 self.file.seek(position) 1279 return self.doAction(self.file.read(size), action) 1280 1281 def upperCase1(self, word): 1282 word = word.decode('utf8') 1283 word = word[0].upper()+word[1:] 1284 return word.encode('utf8') 1285 1286 1287 #Super compact form of action table. 1288 #_ means space, .U means UpperCaseAll, U(w) means UpperCaseFirst 1289 actionTable = r""" 1290 0:w 25:w+_for_ 50:w+\n\t 75:w+. This_100:w+ize_ 1291 1:w+_ 26:w[3:] 51:w+: 76:w+, 101:w.U+. 1292 2:_+w+_ 27:w[:-2] 52:_+w+._ 77:.+w+_ 102:\xc2\xa0+w 1293 3:w[1:] 28:w+_a_ 53:w+ed_ 78:U(w)+( 103:_+w+, 1294 4:U(w)+_ 29:w+_that_ 54:w[9:] 79:U(w)+. 104:U(w)+=" 1295 5:w+_the_ 30:_+U(w) 55:w[7:] 80:w+_not_ 105:w.U+=" 1296 6:_+w 31:w+._ 56:w[:-6] 81:_+w+=" 106:w+ous_ 1297 7:s_+w+_ 32:.+w 57:w+( 82:w+er_ 107:w.U+,_ 1298 8:w+_of_ 33:_+w+,_ 58:U(w)+,_ 83:_+w.U+_ 108:U(w)+=\' 1299 9:U(w) 34:w[4:] 59:w[:-8] 84:w+al_ 109:_+U(w)+, 1300 10:w+_and_ 35:w+_with_ 60:w+_at_ 85:_+w.U 110:_+w.U+=" 1301 11:w[2:] 36:w+\' 61:w+ly_ 86:w+=\' 111:_+w.U+,_ 1302 12:w[:-1] 37:w+_from_ 62:_the_+w+_of_ 87:w.U+" 112:_+w.U+, 1303 13:,_+w+_ 38:w+_by_ 63:w[:-5] 88:U(w)+._ 113:w.U+( 1304 14:w+,_ 39:w[5:] 64:w[:-9] 89:_+w+( 114:w.U+._ 1305 15:_+U(w)+_ 40:w[6:] 65:_+U(w)+,_ 90:w+ful_ 115:_+w.U+. 1306 16:w+_in_ 41:_the_+w 66:U(w)+" 91:_+U(w)+._116:w.U+=\' 1307 17:w+_to_ 42:w[:-4] 67:.+w+( 92:w+ive_ 117:_+w.U+._ 1308 18:e_+w+_ 43:w+. The_ 68:w.U+_ 93:w+less_ 118:_+U(w)+=" 1309 19:w+" 44:w.U 69:U(w)+"> 94:w.U+\' 119:_+w.U+=\' 1310 20:w+. 45:w+_on_ 70:w+=" 95:w+est_ 120:_+U(w)+=\' 1311 21:w+"> 46:w+_as_ 71:_+w+. 96:_+U(w)+. 1312 22:w+\n 47:w+_is_ 72:.com/+w 97:w.U+"> 1313 23:w[:-3] 48:w[:-7] 98:_+w+=\' 1314 24:w+] 49:w[:-1]+ing_ 74:U(w)+\' 99:U(w)+, 1315 """ 1316 1317 def compileActions(self): 1318 """Build the action table from the text above 1319 """ 1320 import re 1321 self.actionList = actions = [None]*121 1322 #Action 73, which is too long, looks like this when expanded: 1323 actions[73] = "b' the '+w+b' of the '" 1324 #find out what the columns are 1325 actionLines = self.actionTable.splitlines() 1326 colonPositions = [m.start() 1327 for m in re.finditer(':',actionLines[1]) 1328 ]+[100] 1329 columns = [(colonPositions[i]-3,colonPositions[i+1]-3) 1330 for i in range(len(colonPositions)-1)] 1331 for line in self.actionTable.splitlines(keepends=False): 1332 for start,end in columns: 1333 action = line[start:end] 1334 #skip empty actions 1335 if not action or action.isspace(): continue 1336 #chop it up, and check if the colon is properly placed 1337 index, colon, action = action[:3], action[3], action[4:] 1338 assert colon==':' 1339 #remove filler spaces at right 1340 action = action.rstrip() 1341 #replace space symbols 1342 action = action.replace('_', ' ') 1343 wPos = action.index('w') 1344 #add quotes around left string when present 1345 #translation: any pattern from beginning, up to 1346 #(but not including) a + following by a w later on 1347 action = re.sub(r"^(.*)(?=\+[U(]*w)", r"b'\1'", action) 1348 #add quotes around right string when present 1349 #translation: anything with a w in it, followed by a + 1350 #and a pattern up to the end 1351 #(there is no variable lookbehind assertion, 1352 #so we have to copy the pattern) 1353 action = re.sub(r"(w[[:\-1\]).U]*)\+(.*)$", r"\1+b'\2'", action) 1354 #expand shortcut for uppercaseAll 1355 action = action.replace(".U", ".upper()") 1356 #store action 1357 actions[int(index)] = action 1358 1359 def doAction(self, w, action): 1360 """Perform the proper action 1361 """ 1362 #set environment for the UpperCaseFirst 1363 U = self.upperCase1 1364 return eval(self.actionList[action], locals()) 1365 1366class Layout: 1367 """Class to layout the output. 1368 """ 1369 #display width of hexdata+bitdata 1370 width = 25 1371 #general 1372 def __init__(self, stream): 1373 self.stream = stream 1374 self.bitPtr = self.width 1375 1376 def makeHexData(self, pos): 1377 """Produce hex dump of all data containing the bits 1378 from pos to stream.pos 1379 """ 1380 firstAddress = pos+7>>3 1381 lastAddress = self.stream.pos+7>>3 1382 return ''.join(map('{:02x} '.format, 1383 self.stream.data[firstAddress:lastAddress])) 1384 1385 def formatBitData(self, pos, width1, width2=0): 1386 """Show formatted bit data: 1387 Bytes are separated by commas 1388 whole bytes are displayed in hex 1389 >>> Layout(olleke).formatBitData(6, 2, 16) 1390 '|00h|2Eh,|00' 1391 >>> Layout(olleke).formatBitData(4, 1, 0) 1392 '1' 1393 """ 1394 result = [] 1395 #make empty prefix code explicit 1396 if width1==0: result = ['()', ','] 1397 for width in width1, width2: 1398 #skip empty width2 1399 if width==0: continue 1400 #build result backwards in a list 1401 while width>0: 1402 availableBits = 8-(pos&7) 1403 if width<availableBits: 1404 #read partial byte, beginning nor ending at boundary 1405 data = self.stream.data[pos>>3] >> (pos&7) & (1<<width)-1 1406 result.append('{:0{}b}'.format(data, width)) 1407 elif availableBits<8: 1408 #read rest of byte, ending at boundary 1409 data = self.stream.data[pos>>3] >> (pos&7) 1410 result.append('|{:0{}b}'.format(data, availableBits)) 1411 else: 1412 #read whole byte (in hex), beginning and ending at boundary 1413 data = self.stream.data[pos>>3] 1414 result.append('|{:02X}h'.format(data)) 1415 width -= availableBits 1416 pos += availableBits 1417 #if width overshot from the availableBits subtraction, fix it 1418 pos += width 1419 #add comma to separate fields 1420 result.append(',') 1421 #concatenate pieces, reversed, skipping the last space 1422 return ''.join(result[-2::-1]) 1423 1424 def readPrefixCode(self, alphabet): 1425 """give alphabet the prefix code that is read from the stream 1426 Called for the following alphabets, in this order: 1427 The alphabet in question must have a "logical" order, 1428 otherwise the assignment of symbols doesn't work. 1429 """ 1430 mode, numberOfSymbols = self.verboseRead(PrefixCodeHeader(alphabet.name)) 1431 if mode=='Complex': 1432 #for a complex code, numberOfSymbols means hskip 1433 self.readComplexCode(numberOfSymbols, alphabet) 1434 return alphabet 1435 else: 1436 table = [] 1437 #Set table of lengths for mnemonic function 1438 lengths = [[0], [1,1], [1,2,2], '????'][numberOfSymbols-1] 1439 #adjust mnemonic function of alphabet class 1440 def myMnemonic(index): 1441 return '{} bit{}: {}'.format( 1442 lengths[i], 1443 '' if lengths[i]==1 else 's', 1444 alphabet.__class__.mnemonic(alphabet, index) 1445 ) 1446 alphabet.mnemonic = myMnemonic 1447 for i in range(numberOfSymbols): 1448 table.append(self.verboseRead(alphabet, skipExtra=True).index) 1449 #restore mnemonic 1450 del alphabet.mnemonic 1451 if numberOfSymbols==4: 1452 #read tree shape to redefine lengths 1453 lengths = self.verboseRead(TreeShapeAlhabet()) 1454 #construct the alphabet prefix code 1455 alphabet.setLength(dict(zip(table, lengths))) 1456 return alphabet 1457 1458 def readComplexCode(self, hskip, alphabet): 1459 """Read complex code""" 1460 stream = self.stream 1461 #read the lengths for the length code 1462 lengths = [1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15][hskip:] 1463 codeLengths = {} 1464 total = 0 1465 lol = LengthOfLengthAlphabet('##'+alphabet.name) 1466 #lengthCode will be used for coding the lengths of the new code 1467 #we use it for display until now; definition comes below 1468 lengthCode = LengthAlphabet('#'+alphabet.name) 1469 lengthIter = iter(lengths) 1470 lengthsLeft = len(lengths) 1471 while total<32 and lengthsLeft>0: 1472 lengthsLeft -= 1 1473 newSymbol = next(lengthIter) 1474 lol.description = str(lengthCode[newSymbol]) 1475 length = self.verboseRead(lol) 1476 if length: 1477 codeLengths[newSymbol] = length 1478 total += 32>>length 1479 if total>32: raise ValueError("Stream format") 1480 if len(codeLengths)==1: codeLengths[list(codeLengths.keys())[0]] = 0 1481 #Now set the encoding of the lengthCode 1482 lengthCode.setLength(codeLengths) 1483 print("***** Lengths for {} will be coded as:".format(alphabet.name)) 1484 lengthCode.showCode() 1485 #Now determine the symbol lengths with the lengthCode 1486 symbolLengths = {} 1487 total = 0 1488 lastLength = 8 1489 alphabetIter = iter(alphabet) 1490 while total<32768: 1491 #look ahead to see what is going to happen 1492 length = lengthCode.decodePeek( 1493 self.stream.peek(lengthCode.maxLength))[1].index 1494 #in every branch, set lengthCode.description to explanatory text 1495 #lengthCode calls format(symbol, extra) with this string 1496 if length==0: 1497 symbol = next(alphabetIter) 1498 lengthCode.description = 'symbol {} unused'.format(symbol) 1499 self.verboseRead(lengthCode) 1500 #unused symbol 1501 continue 1502 if length==16: 1503 lengthCode.description = \ 1504 '{1}+3 symbols of length '+str(lastLength) 1505 extra = self.verboseRead(lengthCode) 1506 #scan series of 16s (repeat counts) 1507 #start with repeat count 2 1508 repeat = 2 1509 startSymbol = next(alphabetIter) 1510 endSymbol = next(alphabetIter) 1511 symbolLengths[startSymbol.index] = \ 1512 symbolLengths[endSymbol.index] = lastLength 1513 #count the two just defined symbols 1514 total += 2*32768>>lastLength 1515 #note: loop may end because we're there 1516 #even if a 16 _appears_ to follow 1517 while True: 1518 #determine last symbol 1519 oldRepeat = repeat 1520 repeat = (repeat-2<<2)+extra+3 1521 #read as many symbols as repeat increased 1522 for i in range(oldRepeat, repeat): 1523 endSymbol = next(alphabetIter) 1524 symbolLengths[endSymbol.index] = lastLength 1525 #compute new total; it may be end of loop 1526 total += (repeat-oldRepeat)*32768>>lastLength 1527 if total>=32768: break 1528 #see if there is more to do 1529 length = lengthCode.decodePeek( 1530 self.stream.peek(lengthCode.maxLength))[1].index 1531 if length!=16: break 1532 lengthCode.description = 'total {}+{{1}} symbols'.format( 1533 (repeat-2<<2)+3) 1534 extra = self.verboseRead(lengthCode) 1535 elif length==17: 1536 #read, and show explanation 1537 lengthCode.description = '{1}+3 unused' 1538 extra = self.verboseRead(lengthCode) 1539 #scan series of 17s (groups of zero counts) 1540 #start with repeat count 2 1541 repeat = 2 1542 startSymbol = next(alphabetIter) 1543 endSymbol = next(alphabetIter) 1544 #note: loop will not end with total==32768, 1545 #since total doesn't change here 1546 while True: 1547 #determine last symbol 1548 oldRepeat = repeat 1549 repeat = (repeat-2<<3)+extra+3 1550 #read as many symbols as repeat increases 1551 for i in range(repeat-oldRepeat): 1552 endSymbol = next(alphabetIter) 1553 #see if there is more to do 1554 length = lengthCode.decodePeek( 1555 self.stream.peek(lengthCode.maxLength))[1].index 1556 if length!=17: break 1557 lengthCode.description = 'total {}+{{1}} unused'.format( 1558 (repeat-2<<3)+3) 1559 extra = self.verboseRead(lengthCode) 1560 else: 1561 symbol = next(alphabetIter) 1562 #double braces for format 1563 char = str(symbol) 1564 if char in '{}': char *= 2 1565 lengthCode.description = \ 1566 'Length for {} is {{0.index}} bits'.format(char) 1567 #output is not needed (will be 0) 1568 self.verboseRead(lengthCode) 1569 symbolLengths[symbol.index] = length 1570 total += 32768>>length 1571 lastLength = length 1572 assert total==32768 1573 alphabet.setLength(symbolLengths) 1574 print('End of table. Prefix code '+alphabet.name+':') 1575 alphabet.showCode() 1576 1577 #stream 1578 def processStream(self): 1579 """Process a brotli stream. 1580 """ 1581 print('addr hex{:{}s}binary context explanation'.format( 1582 '', self.width-10)) 1583 print('Stream header'.center(60, '-')) 1584 self.windowSize = self.verboseRead(WindowSizeAlphabet()) 1585 print('Metablock header'.center(60, '=')) 1586 self.ISLAST = False 1587 self.output = bytearray() 1588 while not self.ISLAST: 1589 self.ISLAST = self.verboseRead( 1590 BoolCode('LAST', description="Last block")) 1591 if self.ISLAST: 1592 if self.verboseRead( 1593 BoolCode('EMPTY', description="Empty block")): break 1594 if self.metablockLength(): continue 1595 if not self.ISLAST and self.uncompressed(): continue 1596 print('Block type descriptors'.center(60, '-')) 1597 self.numberOfBlockTypes = {} 1598 self.currentBlockCounts = {} 1599 self.blockTypeCodes = {} 1600 self.blockCountCodes = {} 1601 for blockType in (L,I,D): self.blockType(blockType) 1602 print('Distance code parameters'.center(60, '-')) 1603 self.NPOSTFIX, self.NDIRECT = self.verboseRead(DistanceParamAlphabet()) 1604 self.readLiteralContextModes() 1605 print('Context maps'.center(60, '-')) 1606 self.cmaps = {} 1607 #keep the number of each kind of prefix tree for the last loop 1608 numberOfTrees = {I: self.numberOfBlockTypes[I]} 1609 for blockType in (L,D): 1610 numberOfTrees[blockType] = self.contextMap(blockType) 1611 print('Prefix code lists'.center(60, '-')) 1612 self.prefixCodes = {} 1613 for blockType in (L,I,D): 1614 self.readPrefixArray(blockType, numberOfTrees[blockType]) 1615 self.metablock() 1616 1617 #metablock header 1618 def verboseRead(self, alphabet, context='', skipExtra=False): 1619 """Read symbol and extra from stream and explain what happens. 1620 Returns the value of the symbol 1621 >>> olleke.pos = 0 1622 >>> l = Layout(olleke) 1623 >>> l.verboseRead(WindowSizeAlphabet()) 1624 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 1625 4194288 1626 """ 1627 #TODO 2: verbosity level, e.g. show only codes and maps in header 1628 stream = self.stream 1629 pos = stream.pos 1630 if skipExtra: 1631 length, symbol = alphabet.readTuple(stream) 1632 extraBits, extra = 0, None 1633 else: 1634 length, symbol, extraBits, extra = alphabet.readTupleAndExtra( 1635 stream) 1636 #fields: address, hex data, binary data, name of alphabet, explanation 1637 hexdata = self.makeHexData(pos) 1638 addressField = '{:04x}'.format(pos+7>>3) if hexdata else '' 1639 bitdata = self.formatBitData(pos, length, extraBits) 1640 #bitPtr moves bitdata so that the bytes are easier to read 1641 #jump back to right if a new byte starts 1642 if '|' in bitdata[1:]: 1643 #start over on the right side 1644 self.bitPtr = self.width 1645 fillWidth = self.bitPtr-(len(hexdata)+len(bitdata)) 1646 if fillWidth<0: fillWidth = 0 1647 print('{:<5s} {:<{}s} {:7s} {}'.format( 1648 addressField, 1649 hexdata+' '*fillWidth+bitdata, self.width, 1650 context+alphabet.name, 1651 symbol if skipExtra else symbol.explanation(extra), 1652 )) 1653 #jump to the right if we started with a '|' 1654 #because we didn't jump before printing 1655 if bitdata.startswith('|'): self.bitPtr = self.width 1656 else: self.bitPtr -= len(bitdata) 1657 return symbol if skipExtra else symbol.value(extra) 1658 1659 def metablockLength(self): 1660 """Read MNIBBLES and meta block length; 1661 if empty block, skip block and return true. 1662 """ 1663 self.MLEN = self.verboseRead(MetablockLengthAlphabet()) 1664 if self.MLEN: 1665 return False 1666 #empty block; skip and return False 1667 self.verboseRead(ReservedAlphabet()) 1668 MSKIP = self.verboseRead(SkipLengthAlphabet()) 1669 self.verboseRead(FillerAlphabet(streamPos=self.stream.pos)) 1670 self.stream.pos += 8*MSKIP 1671 print("Skipping to {:x}".format(self.stream.pos>>3)) 1672 return True 1673 1674 def uncompressed(self): 1675 """If true, handle uncompressed data 1676 """ 1677 ISUNCOMPRESSED = self.verboseRead( 1678 BoolCode('UNCMPR', description='Is uncompressed?')) 1679 if ISUNCOMPRESSED: 1680 self.verboseRead(FillerAlphabet(streamPos=self.stream.pos)) 1681 print('Uncompressed data:') 1682 self.output += self.stream.readBytes(self.MLEN) 1683 print(outputFormatter(self.output[-self.MLEN:])) 1684 return ISUNCOMPRESSED 1685 1686 def blockType(self, kind): 1687 """Read block type switch descriptor for given kind of blockType.""" 1688 NBLTYPES = self.verboseRead(TypeCountAlphabet( 1689 'BT#'+kind[0].upper(), 1690 description='{} block types'.format(kind), 1691 )) 1692 self.numberOfBlockTypes[kind] = NBLTYPES 1693 if NBLTYPES>=2: 1694 self.blockTypeCodes[kind] = self.readPrefixCode( 1695 BlockTypeAlphabet('BT'+kind[0].upper(), NBLTYPES)) 1696 self.blockCountCodes[kind] = self.readPrefixCode( 1697 BlockCountAlphabet('BC'+kind[0].upper())) 1698 blockCount = self.verboseRead(self.blockCountCodes[kind]) 1699 else: 1700 blockCount = 1<<24 1701 self.currentBlockCounts[kind] = blockCount 1702 1703 def readLiteralContextModes(self): 1704 """Read literal context modes. 1705 LSB6: lower 6 bits of last char 1706 MSB6: upper 6 bits of last char 1707 UTF8: rougly dependent on categories: 1708 upper 4 bits depend on category of last char: 1709 control/whitespace/space/ punctuation/quote/%/open/close/ 1710 comma/period/=/digits/ VOWEL/CONSONANT/vowel/consonant 1711 lower 2 bits depend on category of 2nd last char: 1712 space/punctuation/digit or upper/lowercase 1713 signed: hamming weight of last 2 chars 1714 """ 1715 print('Context modes'.center(60, '-')) 1716 self.literalContextModes = [] 1717 for i in range(self.numberOfBlockTypes[L]): 1718 self.literalContextModes.append( 1719 self.verboseRead(LiteralContextMode(number=i))) 1720 1721 def contextMap(self, kind): 1722 """Read context maps 1723 Returns the number of differnt values on the context map 1724 (In other words, the number of prefix trees) 1725 """ 1726 NTREES = self.verboseRead(TypeCountAlphabet( 1727 kind[0].upper()+'T#', 1728 description='{} prefix trees'.format(kind))) 1729 mapSize = {L:64, D:4}[kind] 1730 if NTREES<2: 1731 self.cmaps[kind] = [0]*mapSize 1732 else: 1733 #read CMAPkind 1734 RLEMAX = self.verboseRead(RLEmaxAlphabet( 1735 'RLE#'+kind[0].upper(), 1736 description=kind+' context map')) 1737 alphabet = TreeAlphabet('CM'+kind[0].upper(), NTREES=NTREES, RLEMAX=RLEMAX) 1738 cmapCode = self.readPrefixCode(alphabet) 1739 tableSize = mapSize*self.numberOfBlockTypes[kind] 1740 cmap = [] 1741 while len(cmap)<tableSize: 1742 cmapCode.description = 'map {}, entry {}'.format( 1743 *divmod(len(cmap), mapSize)) 1744 count, value = self.verboseRead(cmapCode) 1745 cmap.extend([value]*count) 1746 assert len(cmap)==tableSize 1747 IMTF = self.verboseRead(BoolCode('IMTF', description='Apply inverse MTF')) 1748 if IMTF: 1749 self.IMTF(cmap) 1750 if kind==L: 1751 print('Context maps for literal data:') 1752 for i in range(0, len(cmap), 64): 1753 print(*( 1754 ''.join(map(str, cmap[j:j+8])) 1755 for j in range(i, i+64, 8) 1756 )) 1757 else: 1758 print('Context map for distances:') 1759 print(*( 1760 ''.join(map('{:x}'.format, cmap[i:i+4])) 1761 for i in range(0, len(cmap), 4) 1762 )) 1763 self.cmaps[kind] = cmap 1764 return NTREES 1765 1766 @staticmethod 1767 def IMTF(v): 1768 """In place inverse move to front transform. 1769 """ 1770 #mtf is initialized virtually with range(infinity) 1771 mtf = [] 1772 for i, vi in enumerate(v): 1773 #get old value from mtf. If never seen, take virtual value 1774 try: value = mtf.pop(vi) 1775 except IndexError: value = vi 1776 #put value at front 1777 mtf.insert(0, value) 1778 #replace transformed value 1779 v[i] = value 1780 1781 def readPrefixArray(self, kind, numberOfTrees): 1782 """Read prefix code array""" 1783 prefixes = [] 1784 for i in range(numberOfTrees): 1785 if kind==L: alphabet = LiteralAlphabet(i) 1786 elif kind==I: alphabet = InsertAndCopyAlphabet(i) 1787 elif kind==D: alphabet = DistanceAlphabet( 1788 i, NPOSTFIX=self.NPOSTFIX, NDIRECT=self.NDIRECT) 1789 self.readPrefixCode(alphabet) 1790 prefixes.append(alphabet) 1791 self.prefixCodes[kind] = prefixes 1792 1793 #metablock data 1794 def metablock(self): 1795 """Process the data. 1796 Relevant variables of self: 1797 numberOfBlockTypes[kind]: number of block types 1798 currentBlockTypes[kind]: current block types (=0) 1799 literalContextModes: the context modes for the literal block types 1800 currentBlockCounts[kind]: counters for block types 1801 blockTypeCodes[kind]: code for block type 1802 blockCountCodes[kind]: code for block count 1803 cmaps[kind]: the context maps (not for I) 1804 prefixCodes[kind][#]: the prefix codes 1805 lastDistances: the last four distances 1806 lastChars: the last two chars 1807 output: the result 1808 """ 1809 print('Meta block contents'.center(60, '=')) 1810 self.currentBlockTypes = {L:0, I:0, D:0, pL:1, pI:1, pD:1} 1811 self.lastDistances = deque([17,16,11,4], maxlen=4) 1812 #the current context mode is for block type 0 1813 self.contextMode = ContextModeKeeper(self.literalContextModes[0]) 1814 wordList = WordList() 1815 1816 #setup distance callback function 1817 def distanceCallback(symbol, extra): 1818 "callback function for displaying decoded distance" 1819 index, offset = symbol.value(extra) 1820 if index: 1821 #recent distance 1822 distance = self.lastDistances[-index]+offset 1823 return 'Distance: {}last{:+d}={}'.format(index, offset, distance) 1824 #absolute value 1825 if offset<=maxDistance: 1826 return 'Absolute value: {} (pos {})'.format(offset, maxDistance-offset) 1827 #word list value 1828 action, word = divmod(offset-maxDistance, 1<<wordList.NDBITS[copyLen]) 1829 return '{}-{} gives word {},{} action {}'.format( 1830 offset, maxDistance, copyLen, word, action) 1831 for dpc in self.prefixCodes[D]: dpc.callback = distanceCallback 1832 1833 blockLen = 0 1834 #there we go 1835 while blockLen<self.MLEN: 1836 #get insert© command 1837 litLen, copyLen, dist0Flag = self.verboseRead( 1838 self.prefixCodes[I][ 1839 self.figureBlockType(I)]) 1840 #literal data 1841 for i in range(litLen): 1842 bt = self.figureBlockType(L) 1843 cm = self.contextMode.getIndex() 1844 ct = self.cmaps[L][bt<<6|cm] 1845 char = self.verboseRead( 1846 self.prefixCodes[L][ct], 1847 context='{},{}='.format(bt,cm)) 1848 self.contextMode.add(char) 1849 self.output.append(char) 1850 blockLen += litLen 1851 #check if we're done 1852 if blockLen>=self.MLEN: return 1853 #distance 1854 #distances are computed relative to output length, at most window size 1855 maxDistance = min(len(self.output), self.windowSize) 1856 if dist0Flag: 1857 distance = self.lastDistances[-1] 1858 else: 1859 bt = self.figureBlockType(D) 1860 cm = {2:0, 3:1, 4:2}.get(copyLen, 3) 1861 ct = self.cmaps[D][bt<<2|cm] 1862 index, offset = self.verboseRead( 1863 self.prefixCodes[D][ct], 1864 context='{},{}='.format(bt,cm)) 1865 distance = self.lastDistances[-index]+offset if index else offset 1866 if index==1 and offset==0: 1867 #to make sure distance is not put in last distance list 1868 dist0Flag = True 1869 if distance<=maxDistance: 1870 #copy from output 1871 for i in range( 1872 maxDistance-distance, 1873 maxDistance-distance+copyLen): 1874 self.output.append(self.output[i]) 1875 if not dist0Flag: self.lastDistances.append(distance) 1876 comment = 'Seen before' 1877 else: 1878 #fetch from wordlist 1879 newWord = wordList.word(copyLen, distance-maxDistance-1) 1880 self.output.extend(newWord) 1881 #adjust copyLen to reflect actual new data 1882 copyLen = len(newWord) 1883 comment = 'From wordlist' 1884 blockLen += copyLen 1885 print(' '*40, 1886 comment, 1887 ': "', 1888 outputFormatter(self.output[-copyLen:]), 1889 '"', 1890 sep='') 1891 self.contextMode.add(self.output[-2]) 1892 self.contextMode.add(self.output[-1]) 1893 1894 def figureBlockType(self, kind): 1895 counts, types = self.currentBlockCounts, self.currentBlockTypes 1896 if counts[kind]==0: 1897 newType = self.verboseRead(self.blockTypeCodes[kind]) 1898 if newType==-2: newType = types['P'+kind] 1899 elif newType==-1: 1900 newType = (types[kind]+1)%self.numberOfBlockTypes[kind] 1901 types['P'+kind] = types[kind] 1902 types[kind] = newType 1903 counts[kind] = self.verboseRead(self.blockCountCodes[kind]) 1904 counts[kind] -=1 1905 return types[kind] 1906 1907__test__ = { 1908'BitStream': """ 1909 >>> bs = BitStream(b'Jurjen') 1910 >>> bs.readBytes(2) 1911 b'Ju' 1912 >>> bs.read(6) #r=01110010 1913 50 1914 >>> bs 1915 BitStream(pos=2:6) 1916 >>> bs.peek(5) #j=01101010 1917 9 1918 >>> bs.readBytes(2) 1919 Traceback (most recent call last): 1920 ... 1921 ValueError: readBytes: need byte boundary 1922 """, 1923 1924'Symbol': """ 1925 >>> a=Symbol(MetablockLengthAlphabet(),5) 1926 >>> len(a) 1927 2 1928 >>> int(a) 1929 5 1930 >>> a.bitPattern() 1931 '01' 1932 >>> a.value(200000) 1933 200001 1934 >>> a.explanation(300000) 1935 'data length: 493e0h+1=300001' 1936 """, 1937 1938'RangeDecoder': """ 1939 >>> a=RangeDecoder(bitLength=3) 1940 >>> len(a) 1941 8 1942 >>> a.name='t' 1943 >>> list(a) 1944 [Symbol(t, 0), Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4), Symbol(t, 5), Symbol(t, 6), Symbol(t, 7)] 1945 >>> a[2] 1946 Symbol(t, 2) 1947 >>> a.bitPattern(4) 1948 '100' 1949 >>> a.length(2) 1950 3 1951 >>> a.decodePeek(15) 1952 (3, Symbol(t, 7)) 1953 >>> 1954 1955 """, 1956 1957'PrefixDecoder': """ 1958 >>> a=PrefixDecoder(decodeTable={0:1,1:2,3:3,7:4}) 1959 >>> len(a) 1960 4 1961 >>> a.name='t' 1962 >>> list(a) 1963 [Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4)] 1964 >>> a.decodePeek(22) 1965 (1, Symbol(t, 1)) 1966 >>> a.decodePeek(27) 1967 (3, Symbol(t, 3)) 1968 >>> a.length(1) 1969 1 1970 >>> a.length(4) 1971 3 1972 """, 1973 1974'Code': """ 1975 >>> a=Code('t',alphabetSize=10) 1976 >>> len(a) 1977 10 1978 >>> a.showCode() 1979 0000:0 0001:1 0010:2 0011:3 0100:4 0101:5 0110:6 0111:7 1000:8 1001:9 1980 >>> a.setLength({2:1,3:2,5:3,6:3}) 1981 >>> a.showCode() 1982 0:2 01:3 011:5 111:6 1983 >>> len(a) 1984 4 1985 >>> def callback(i): return 'call{}back'.format(i) 1986 >>> a=Code('t',callback=callback,bitLength=3) 1987 >>> a[6].explanation() 1988 'call6back' 1989 """, 1990 1991'WithExtra': """ 1992 >>> class A(WithExtra): 1993 ... extraTable = [0,1,1,2,2] 1994 >>> a=A('t',alphabetSize=5) 1995 >>> a[1] 1996 Symbol(t, 1) 1997 >>> a.extraBits(2) 1998 1 1999 >>> a.mnemonic(4) 2000 '4' 2001 >>> a.readTupleAndExtra(BitStream(b'\x5b')) 2002 (3, Symbol(t, 3), 2, 3) 2003 """, 2004 2005'BoolCode': """ 2006 >>> BoolCode('test')[0].explanation() 2007 '0: False' 2008 """, 2009 2010'Enumerator': """ 2011 >>> class A(Enumerator): 2012 ... extraTable = [0,1,1,2,2] 2013 ... value0=3 2014 >>> a=A(alphabetLength=5) 2015 >>> a.value(3) 2016 Traceback (most recent call last): 2017 ... 2018 TypeError: value() missing 1 required positional argument: 'extra' 2019 >>> a.explanation(3,4) 2020 'xx 011: 8-11; 8+4=12' 2021 """, 2022 2023'WindowSizeAlphabet': """ 2024 >>> windowSizeAlphabet = WindowSizeAlphabet() 2025 >>> windowSizeAlphabet[0] 2026 Traceback (most recent call last): 2027 ... 2028 ValueError: No symbol WindowSizeAlphabet[0] 2029 >>> len(windowSizeAlphabet) 2030 16 2031 >>> windowSizeAlphabet[21] 2032 Symbol(WSIZE, 21) 2033 >>> windowSizeAlphabet[21].bitPattern() 2034 '1001' 2035 >>> windowSizeAlphabet[21].extraBits() 2036 0 2037 >>> windowSizeAlphabet[21].index 2038 21 2039 >>> windowSizeAlphabet[10].value() 2040 1008 2041 >>> windowSizeAlphabet[10].explanation() 2042 'windowsize=(1<<10)-16=1008' 2043 >>> windowSizeAlphabet.showCode() 2044 0:65520 1100001:16368 1110001:32752 0011:262128 2045 0000001:131056 0010001:None 1001:2097136 1011:4194288 2046 1000001:4080 1010001:8176 0101:524272 0111:1048560 2047 0100001:1008 0110001:2032 1101:8388592 1111:16777200 2048 """, 2049 2050'TypeCountAlphabet': """ 2051 >>> typeCountAlphabet = TypeCountAlphabet(description='bananas') 2052 >>> len(typeCountAlphabet) 2053 9 2054 >>> typeCountAlphabet[3] 2055 Symbol(BT#, 3) 2056 >>> typeCountAlphabet[9] 2057 Traceback (most recent call last): 2058 ... 2059 ValueError: No symbol TypeCountAlphabet[9] 2060 >>> print(typeCountAlphabet[3]) 2061 xx,0101 2062 >>> typeCountAlphabet[8].value(127) 2063 256 2064 >>> typeCountAlphabet[4].explanation(2) 2065 'xxx,0111: 11 bananas' 2066 >>> typeCountAlphabet[0].explanation() 2067 '0: 1 banana' 2068 """, 2069 2070'DistanceParamAlphabet': """ 2071 >>> dpa = DistanceParamAlphabet() 2072 >>> dpa.showCode() 2073 00:PF0 01:PF1 10:PF2 11:PF3 2074 >>> dpa.readTupleAndExtra(BitStream(b'\\x29')) 2075 (2, Symbol(DIST, 1), 4, 10) 2076 >>> dpa.explanation(2, 5) 2077 '2 postfix bits and 0101<<2=20 direct codes' 2078 """, 2079 2080'LiteralAlphabet': """ 2081 >>> LiteralAlphabet(-1).showCode() #doctest: +ELLIPSIS 2082 00000000:\\x00 00110100:4 01101000:h 10011100:\\x9c 11010000:\\xd0 2083 00000001:\\x01 00110101:5 01101001:i 10011101:\\x9d 11010001:\\xd1 2084 00000010:\\x02 00110110:6 01101010:j 10011110:\\x9e 11010010:\\xd2 2085 ... 2086 00101111:/ 01100011:c 10010111:\\x97 11001011:\\xcb 11111111:\\xff 2087 00110000:0 01100100:d 10011000:\\x98 11001100:\\xcc 2088 00110001:1 01100101:e 10011001:\\x99 11001101:\\xcd 2089 00110010:2 01100110:f 10011010:\\x9a 11001110:\\xce 2090 00110011:3 01100111:g 10011011:\\x9b 11001111:\\xcf 2091 """, 2092 2093'BlockCountAlphabet': """ 2094 >>> bc=BlockCountAlphabet('BCL') 2095 >>> len(bc) 2096 26 2097 >>> bs=BitStream(b'\\x40\\x83\\xc8\\x59\\12\\x02') 2098 >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) 2099 'Block count: xx 00000: 1-4; 1+2=3' 2100 >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) 2101 'Block count: xxx 00110: 33-40; 33+0=33' 2102 >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) 2103 'Block count: xxxxxx 10001: 305-368; 305+28=333' 2104 >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) 2105 'Block count: xxxxxxxxxxx 10110: 2289-4336; 2289+1044=3333' 2106 """, 2107 2108'Layout': """ 2109 >>> olleke.pos = 0 2110 >>> l = Layout(olleke) 2111 >>> l.verboseRead(WindowSizeAlphabet()) 2112 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 2113 4194288 2114 >>> l.verboseRead(BoolCode('LAST', description="Last block")) 2115 1 LAST Last block: 1: True 2116 True 2117 >>> l.verboseRead(BoolCode('EMPTY', description="Empty block")) 2118 0 EMPTY Empty block: 0: False 2119 False 2120 >>> l.verboseRead(MetablockLengthAlphabet()) 2121 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47 2122 47 2123 >>> olleke.pos = 76 2124 >>> l = Layout(olleke) 2125 >>> x = l.verboseRead(DistanceAlphabet(0,NPOSTFIX=0,NDIRECT=0), skipExtra=True) 2126 000a 82 10|1100 D0 10[15*x]-3 2127 >>> x.explanation(0x86a3) 2128 '10[1000011010100011]-3: [0]+100000' 2129 """, 2130 2131'olleke': """ 2132 >>> olleke.pos = 0 2133 >>> try: Layout(olleke).processStream() 2134 ... except NotImplementedError: pass 2135 ... #doctest: +REPORT_NDIFF 2136 addr hex binary context explanation 2137 -----------------------Stream header------------------------ 2138 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 2139 ======================Metablock header====================== 2140 1 LAST Last block: 1: True 2141 0 EMPTY Empty block: 0: False 2142 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47 2143 -------------------Block type descriptors------------------- 2144 0003 00 0 BT#L 0: 1 literal block type 2145 0 BT#I 0: 1 insert© block type 2146 0 BT#D 0: 1 distance block type 2147 ------------------Distance code parameters------------------ 2148 0004 44 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes 2149 -----------------------Context modes------------------------ 2150 10 LC0 Context mode for type 0: 2(UTF8) 2151 ------------------------Context maps------------------------ 2152 0 LT# 0: 1 literal prefix tree 2153 0 DT# 0: 1 distance prefix tree 2154 ---------------------Prefix code lists---------------------- 2155 10 PFX L0 is complex with lengths 3,4,0,5,17... 2156 0005 4f 1|0 ##L0 len 3: coded with 3 bits 2157 0111 ##L0 len 4: coded with 1 bits 2158 10 ##L0 unused: coded with 3 bits 2159 0006 d6 0|0 ##L0 len 5: skipped 2160 011 ##L0 zero xxx: coded with 2 bits 2161 ***** Lengths for L0 will be coded as: 2162 0:len 4 01:zero xxx 011:unused 111:len 3 2163 0007 95 1|11,01 #L0 7+3 unused 2164 0 #L0 Length for \\n is 4 bits 2165 001,01 #L0 1+3 unused 2166 0008 44 010,0|1 #L0 total 19+2 unused 2167 0 #L0 Length for " " is 4 bits 2168 0 #L0 Length for ! is 4 bits 2169 0009 cb 011,|01 #L0 3+3 unused 2170 |110,01 #L0 total 35+6 unused 2171 000a 82 0 #L0 Length for K is 4 bits 2172 000,01 #L0 0+3 unused 2173 0 #L0 Length for O is 4 bits 2174 000b 4d 01|1 #L0 symbol P unused 2175 011 #L0 symbol Q unused 2176 0 #L0 Length for R is 4 bits 2177 000c 88 000,|01 #L0 0+3 unused 2178 |100,01 #L0 total 11+4 unused 2179 000d b6 0 #L0 Length for b is 4 bits 2180 011 #L0 symbol c unused 2181 011 #L0 symbol d unused 2182 000e 27 11|1 #L0 Length for e is 3 bits 2183 010,01 #L0 2+3 unused 2184 |0 #L0 Length for k is 4 bits 2185 000f 1f 111 #L0 Length for l is 3 bits 2186 011 #L0 symbol m unused 2187 0 #L0 Length for n is 4 bits 2188 |0 #L0 Length for o is 4 bits 2189 0010 c1 000,01 #L0 0+3 unused 2190 0 #L0 Length for s is 4 bits 2191 0011 b4 0|11 #L0 symbol t unused 2192 0 #L0 Length for u is 4 bits 2193 End of table. Prefix code L0: 2194 000:e 0010:\\n 0110:! 0001:O 0101:b 0011:n 0111:s 2195 100:l 1010:" " 1110:K 1001:R 1101:k 1011:o 1111:u 2196 11,01 PFX IC0 is simple with 4 code words 2197 0012 2a |2Ah|10 IC0 ? bits: I5C4 2198 0013 b5 ec 00|B5h IC0 ? bits: I6+xC7 2199 0015 22 0010|111011 IC0 ? bits: I8+xC5 2200 0016 8c 001100|0010 IC0 ? bits: I0C14+xx 2201 0 SHAPE False: lengths 2,2,2,2 2202 0017 74 10,0|1 PFX D0 is simple with 3 code words 2203 0018 a6 0|01110 D0 1 bit: 2last-3 2204 010011 D0 2 bits: 11xx-3 2205 0019 aa 01010|1 D0 2 bits: 11xxx-3 2206 ====================Meta block contents===================== 2207 |1,01 IC0 Literal: 9, copy: 5 2208 001a 41 0001 0,0=L0 O 2209 100 0,48=L0 l 2210 001b a2 10|0 0,62=L0 l 2211 000 0,63=L0 e 2212 001c a1 1|101 0,59=L0 k 2213 000 0,63=L0 e 2214 |1010 0,59=L0 " " 2215 001d b5 0101 0,11=L0 b 2216 |1011 0,60=L0 o 2217 001e 24 0 0,3=D0 Distance: 2last-3=8 2218 Seen before: "lleke" 2219 0,10 IC0 Literal: 6, copy: 7 2220 |0010 0,59=L0 \\n 2221 001f 89 1001 0,7=L0 R 2222 000 0,52=L0 e 2223 0020 fa 010|1 0,58=L0 b 2224 1111 0,63=L0 u 2225 0021 eb 011|1 0,59=L0 s 2226 11,01 0,3=D0 Absolute value: 12 (pos 8) 2227 Seen before: "olleke\\n" 2228 0022 db 01,1|1 IC0 Literal: 0, copy: 15 2229 |110,11 0,3=D0 Absolute value: 27 (pos 0) 2230 Seen before: "Olleke bolleke\\n" 2231 0023 f8 00 IC0 Literal: 5, copy: 4 2232 1110 0,7=L0 K 2233 0024 2c 00|11 0,52=L0 n 2234 1011 0,62=L0 o 2235 0025 0d 1|00 0,59=L0 l 2236 0110 0,63=L0 ! 2237 """, 2238 2239'file': """ 2240 >>> try: Layout(BitStream( 2241 ... open("H:/Downloads/brotli-master/tests/testdata/10x10y.compressed",'rb') 2242 ... .read())).processStream() 2243 ... except NotImplementedError: pass 2244 addr hex binary context explanation 2245 -----------------------Stream header------------------------ 2246 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 2247 ======================Metablock header====================== 2248 1 LAST Last block: 1: True 2249 0 EMPTY Empty block: 0: False 2250 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20 2251 -------------------Block type descriptors------------------- 2252 0003 00 0 BT#L 0: 1 literal block type 2253 0 BT#I 0: 1 insert© block type 2254 0 BT#D 0: 1 distance block type 2255 ------------------Distance code parameters------------------ 2256 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes 2257 -----------------------Context modes------------------------ 2258 10 LC0 Context mode for type 0: 2(UTF8) 2259 ------------------------Context maps------------------------ 2260 0 LT# 0: 1 literal prefix tree 2261 0 DT# 0: 1 distance prefix tree 2262 ---------------------Prefix code lists---------------------- 2263 0005 b0 0|1,01 PFX L0 is simple with 2 code words 2264 0006 b2 0|1011000 L0 1 bit: X 2265 0007 ea 0|1011001 L0 1 bit: Y 2266 01,01 PFX IC0 is simple with 2 code words 2267 0008 81 0000001|111 IC0 1 bit: I1C9&D=0 2268 0009 47 02 0|47h|1 IC0 1 bit: I1C9 2269 00,01 PFX D0 is simple with 1 code word 2270 000b 8a 010|000 D0 0 bits: 10x-3 2271 ====================Meta block contents===================== 2272 1 IC0 Literal: 1, copy: 9 2273 0 0,0=L0 X 2274 0,() 0,3=D0 Absolute value: 1 (pos 0) 2275 Seen before: "XXXXXXXXX" 2276 0 IC0 Literal: 1, copy: 9, same distance 2277 |1 0,54=L0 Y 2278 Seen before: "YYYYYYYYY" 2279 """, 2280 2281'XY': """ 2282 >>> try: Layout(BitStream(brotli.compress('X'*10+'Y'*10))).processStream() 2283 ... except NotImplementedError: pass 2284 addr hex binary context explanation 2285 -----------------------Stream header------------------------ 2286 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 2287 ======================Metablock header====================== 2288 1 LAST Last block: 1: True 2289 0 EMPTY Empty block: 0: False 2290 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20 2291 -------------------Block type descriptors------------------- 2292 0003 00 0 BT#L 0: 1 literal block type 2293 0 BT#I 0: 1 insert© block type 2294 0 BT#D 0: 1 distance block type 2295 ------------------Distance code parameters------------------ 2296 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes 2297 -----------------------Context modes------------------------ 2298 10 LC0 Context mode for type 0: 2(UTF8) 2299 ------------------------Context maps------------------------ 2300 0 LT# 0: 1 literal prefix tree 2301 0 DT# 0: 1 distance prefix tree 2302 ---------------------Prefix code lists---------------------- 2303 0005 b0 0|1,01 PFX L0 is simple with 2 code words 2304 0006 b2 0|1011000 L0 1 bit: X 2305 0007 82 0|1011001 L0 1 bit: Y 2306 00,01 PFX IC0 is simple with 1 code word 2307 0008 84 0000100|100 IC0 0 bits: I4C6&D=0 2308 0009 00 00,0|1 PFX D0 is simple with 1 code word 2309 000a e0 0|00000 D0 0 bits: last 2310 ====================Meta block contents===================== 2311 () IC0 Literal: 4, copy: 6, same distance 2312 0 0,0=L0 X 2313 0 0,52=L0 X 2314 0 0,54=L0 X 2315 0 0,54=L0 X 2316 Seen before: "XXXXXX" 2317 () IC0 Literal: 4, copy: 6, same distance 2318 1 0,54=L0 Y 2319 1 0,54=L0 Y 2320 |1 0,54=L0 Y 2321 000b 01 1 0,54=L0 Y 2322 Seen before: "YYYYYY" 2323 """, 2324 2325'empty': """ 2326 >>> try: Layout(BitStream(b'\\x81\\x16\\x00\\x58')).processStream() 2327 ... except NotImplementedError: pass 2328 addr hex binary context explanation 2329 -----------------------Stream header------------------------ 2330 0000 81 0000001 WSIZE windowsize=(1<<17)-16=131056 2331 ======================Metablock header====================== 2332 |1 LAST Last block: 1: True 2333 0001 16 0 EMPTY Empty block: 0: False 2334 11 MLEN 11: empty block 2335 0 RSVD Reserved (must be zero) 2336 0002 00 000000|00,01 SKIP skip length: 0h+1=1 2337 |00 SKIP 2 bits ignored 2338 Skipping to 4 2339 """, 2340 2341} 2342 2343if __name__=='__main__': 2344 import sys 2345 if len(sys.argv)>1: 2346 l = Layout(BitStream(open(sys.argv[1],'rb').read())) 2347 l.processStream() 2348 else: 2349 sys.path.append("h:/Persoonlijk/bin") 2350 try: 2351 import brotli 2352 open('brotlidump.br', 'wb').write( 2353 brotli.compress( 2354 open('brotlidump.py', 'r').read() 2355 )) 2356 olleke = BitStream(brotli.compress( 2357 'Olleke bolleke\nRebusolleke\nOlleke bolleke\nKnol!')) 2358 except ImportError: pass 2359 import doctest 2360 doctest.testmod(optionflags=doctest.REPORT_NDIFF 2361 #|doctest.FAIL_FAST 2362 ) 2363