1#!python3 2"""Program to dump contents of Brotli compressed files showing the compression format. 3Jurjen N.E. Bos, 2016. 4I found the following issues with the Brotli format: 5- The distance alphabet has size 16+(48<<POSTFIX), 6 but the last symbols are useless. 7 It could be lowered to 16+(44-POSTFIX<<POSTFIX), and this could matter. 8- The block type code is useless if NBLTYPES==2, you would only need 1 symbol 9 anyway, so why don't you just switch to "the other" type? 10""" 11import struct 12from operator import itemgetter, methodcaller 13from itertools import accumulate, repeat 14from collections import defaultdict, deque 15from functools import partial 16 17class InvalidStream(Exception): pass 18#lookup table 19L, I, D = "literal", "insert©", "distance" 20pL, pI, pD = 'P'+L, 'P'+I, 'P'+D 21 22def outputCharFormatter(c): 23 """Show character in readable format 24 """ 25 #TODO 2: allow hex only output 26 if 32<c<127: return chr(c) 27 elif c==10: return '\\n' 28 elif c==13: return '\\r' 29 elif c==32: return '" "' 30 else: return '\\x{:02x}'.format(c) 31 32def outputFormatter(s): 33 """Show string or char. 34 """ 35 result = '' 36 def formatSubString(s): 37 for c in s: 38 if c==32: yield ' ' 39 else: yield outputCharFormatter(c) 40 if len(result)<200: return ''.join(formatSubString(s)) 41 else: 42 return ''.join(formatSubString(s[:100]))+'...'+ \ 43 ''.join(formatSubString(s[-100:])) 44 45 46class BitStream: 47 """Represent a bytes object. Can read bits and prefix codes the way 48 Brotli does. 49 """ 50 def __init__(self, byteString): 51 self.data = byteString 52 #position in bits: byte pos is pos>>3, bit pos is pos&7 53 self.pos = 0 54 55 def __repr__(self): 56 """Representation 57 >>> olleke 58 BitStream(pos=0:0) 59 """ 60 return "BitStream(pos={:x}:{})".format(self.pos>>3, self.pos&7) 61 62 def read(self, n): 63 """Read n bits from the stream and return as an integer. 64 Produces zero bits beyond the stream. 65 >>> olleke.data[0]==27 66 True 67 >>> olleke.read(5) 68 27 69 70 >>> olleke 71 BitStream(pos=0:5) 72 """ 73 value = self.peek(n) 74 self.pos += n 75 if self.pos>len(self.data)*8: 76 raise ValueError('Read past end of stream') 77 return value 78 79 def peek(self, n): 80 """Peek an n bit integer from the stream without updating the pointer. 81 It is not an error to read beyond the end of the stream. 82 >>> olleke.data[:2]==b'\x1b\x2e' and 0x2e1b==11803 83 True 84 >>> olleke.peek(15) 85 11803 86 >>> hex(olleke.peek(32)) 87 '0x2e1b' 88 """ 89 #read bytes that contain the data: self.data[self.pos>>3:self.pos+n+7>>3] 90 #convert to int: int.from_bytes(..., 'little') 91 #shift out the bits from the first byte: >>(self.pos&7) 92 #mask unwanted bits: & (1<<n)-1 93 return int.from_bytes( 94 self.data[self.pos>>3:self.pos+n+7>>3], 95 'little')>>(self.pos&7) & (1<<n)-1 96 97 def readBytes(self, n): 98 """Read n bytes from the stream on a byte boundary. 99 """ 100 if self.pos&7: raise ValueError('readBytes: need byte boundary') 101 result = self.data[self.pos>>3:(self.pos>>3)+n] 102 self.pos += 8*n 103 return result 104 105#-----------------------Symbol------------------------------------------- 106class Symbol: 107 """A symbol in a code. 108 Refers back to the code that contains it. 109 Index is the place in the alphabet of the symbol. 110 """ 111 __slots__ = 'code', 'index' 112 def __init__(self, code, value): 113 self.code = code 114 self.index = value 115 116 def __repr__(self): 117 return 'Symbol({}, {})'.format(self.code.name, self.index) 118 119 def __len__(self): 120 """Number of bits in the prefix notation of this symbol 121 """ 122 return self.code.length(self.index) 123 124 def __int__(self): 125 return self.index 126 127 #these routines call equivalent routine in Code class 128 def bitPattern(self): 129 """Value of the symbol in the stream 130 """ 131 return self.code.bitPattern(self.index) 132 133 def extraBits(self): 134 """Number of extra bits to read for this symbol 135 """ 136 return self.code.extraBits(self.index) 137 138 def __str__(self): 139 """Short descriptor of the symbol without extra bits. 140 """ 141 return self.code.mnemonic(self.index) 142 143 #requiring optional extra bits, if self.code supports them 144 def value(self, extra=None): 145 """The value used for processing. Can be a tuple. 146 with optional extra bits 147 """ 148 if isinstance(self.code, WithExtra): 149 if not 0<=extra<1<<self.extraBits(): 150 raise ValueError("value: extra value doesn't fit in extraBits") 151 return self.code.value(self.index, extra) 152 if extra is not None: 153 raise ValueError('value: no extra bits for this code') 154 return self.code.value(self.index) 155 156 def explanation(self, extra=None): 157 """Long explanation of the value from the numeric value 158 with optional extra bits 159 Used by Layout.verboseRead when printing the value 160 """ 161 if isinstance(self.code, WithExtra): 162 return self.code.callback(self, extra) 163 return self.code.callback(self) 164 165#========================Code definitions================================== 166class RangeDecoder: 167 """A decoder for the Code class that assumes the symbols 168 are encoded consecutively in binary. 169 It all depends on the "alphabetSize" property. 170 The range runs from 0 to alphabetSize-1. 171 This is the default decoder. 172 """ 173 def __init__(self, *, alphabetSize=None, bitLength=None, **args): 174 if bitLength is not None: alphabetSize = 1<<bitLength 175 if alphabetSize is not None: 176 self.alphabetSize = alphabetSize 177 self.maxLength = (alphabetSize-1).bit_length() 178 179 def __len__(self): 180 return self.alphabetSize 181 182 def __iter__(self): 183 """Produce all symbols. 184 """ 185 return map(partial(Symbol, self), range(len(self))) 186 187 def __getitem__(self, index): 188 if index>=self.alphabetSize: raise ValueError('index out of range') 189 return Symbol(self, index) 190 191 def bitPattern(self, index): 192 return '{:0{}b}'.format(index, self.maxLength) 193 194 def length(self, index): 195 """Encoding length of given symbol. 196 Does not depend on index in this case. 197 """ 198 return self.maxLength 199 200 def decodePeek(self, data): 201 """Find which symbol index matches the given data (from peek, as a number) 202 and return the number of bits decoded. 203 Can also be used to figure out length of a symbol. 204 """ 205 return self.maxLength, Symbol(self, data&(1<<self.maxLength)-1) 206 207class PrefixDecoder: 208 """A decoder for the Code class that uses a prefix code. 209 The code is determined by encoding: 210 encode[p] gives the index corresponding to bit pattern p. 211 Used setDecode(decodeTable) to switch the decoder from the default 212 to a prefix decoder, or pass decodeTable at init. 213 You can also use setLength(lengthTable) 214 to define the encoding from the lengths. 215 The set of symbol values does not need to be consecutive. 216 """ 217 def __init__(self, *, decodeTable=None, **args): 218 if decodeTable is not None: self.setDecode(decodeTable) 219 220 def __len__(self): 221 return len(self.decodeTable) 222 223 def __iter__(self): 224 def revBits(index): 225 return self.bitPattern(index)[::-1] 226 return ( 227 Symbol(self, index) 228 for index in sorted(self.decodeTable.values(), key=revBits) 229 ) 230 231 def __getitem__(self, index): 232 if index not in self.lengthTable: 233 raise ValueError('No symbol {}[{}]'.format( 234 self.__class__.__name__, index)) 235 return Symbol(self, index) 236 237 def bitPattern(self, index): 238 bits = next(b for (b,s) in self.decodeTable.items() if s==index) 239 return '{:0{}b}'.format(bits, self.length(index)) 240 241 def length(self, index): 242 """Encoding length of given symbol. 243 """ 244 return self.lengthTable[index] 245 246 def decodePeek(self, data): 247 """Find which symbol index matches the given data (from peek, as a number) 248 and return the number of bits decoded. 249 Can also be used to figure out length of a symbol. 250 """ 251 #do binary search for word length 252 #invariant: lo<=length<=hi 253 lo, hi = self.minLength, self.maxLength 254 while lo<=hi: 255 mid = lo+hi>>1 256 #note lo<=mid<hi at this point 257 mask = (1<<mid)-1 258 #lets see what happens if we guess length is mid 259 try: index = self.decodeTable[data&mask] 260 except KeyError: 261 #too many bits specified, reduce estimated length 262 hi = mid-1 263 continue 264 #we found a symbol, but there could be a longer match 265 symbolLength = self.lengthTable[index] 266 if symbolLength<=mid: 267 #all bits match, symbol must be right 268 return symbolLength, Symbol(self, index) 269 #there must be more bits to match 270 lo = mid+1 271 return lo, Symbol(self, index) 272 273 #routine to set up the tables 274 def setDecode(self, decodeTable): 275 """Store decodeTable, 276 and compute lengthTable, minLength, maxLength from encodings. 277 """ 278 self.decodeTable = decodeTable 279 #set of symbols with unknown length 280 todo = set(decodeTable) 281 #bit size under investigation 282 maskLength = 0 283 lengthTable = {} 284 while todo: 285 mask = (1<<maskLength)-1 286 #split the encodings that we didn't find yet using b bits 287 splitSymbols = defaultdict(list) 288 for s in todo: splitSymbols[s&mask].append(s) 289 #unique encodings have a length of maskLength bits 290 #set length, and remove from todo list 291 for s,subset in splitSymbols.items(): 292 if len(subset)==1: 293 lengthTable[self.decodeTable[s]] = maskLength 294 todo.remove(s) 295 #now investigate with longer mask 296 maskLength +=1 297 #save result 298 self.lengthTable = lengthTable 299 self.minLength = min(lengthTable.values()) 300 self.maxLength = max(lengthTable.values()) 301 self.switchToPrefix() 302 303 def setLength(self, lengthTable): 304 """Given the bit pattern lengths for symbols given in lengthTable, 305 set decodeTable, minLength, maxLength 306 """ 307 self.lengthTable = lengthTable 308 self.minLength = min(lengthTable.values()) 309 self.maxLength = max(lengthTable.values()) 310 #compute the backwards codes first; then reverse them 311 #compute (backwards) first code for every separate lengths 312 nextCodes = [] 313 #build codes for each length, from right to left 314 code = 0 315 for bits in range(self.maxLength+1): 316 code <<= 1 317 nextCodes.append(code) 318 code += sum(x==bits for x in lengthTable.values()) 319 self.decodeTable = {} 320 #count codes for each length, and store reversed in the table 321 for symbol in sorted(lengthTable): 322 bits = lengthTable[symbol] 323 bitpattern = '{:0{}b}'.format(nextCodes[bits], bits) 324 self.decodeTable[int(bitpattern[::-1], 2)] = symbol 325 nextCodes[bits] += 1 326 self.switchToPrefix() 327 328 def switchToPrefix(self): 329 """This routine makes sure the prefix decoder is activated. 330 """ 331 self.mode = PrefixDecoder 332 333class Code(RangeDecoder, PrefixDecoder): 334 """An alphabet of symbols, that can be read from a stream. 335 If you use setDecode or setLength, you have a prefix code, 336 otherwise you have a range code. 337 Features: 338 code[index] produces symbol with given index 339 value(index): value of symbol 340 mnemonic(index): short description of symbol 341 explanation(index): show meaning of symbol, shown in Layout.verboseRead 342 iter(code): produce all symbols in some order 343 name: show as context in Layout.verboseRead 344 """ 345 name = '?' 346 #callback is a function that gets the symbol and the extra bits 347 #default callback calls explanation 348 def __init__(self, name=None, *, callback=None, description='', **args): 349 """Don't forget to set either alphabetSize or decodeTable 350 """ 351 #set name when provided, otherwise take class variable 352 if name is not None: self.name = name 353 if callback is not None: self.callback = callback 354 self.description = description 355 #mode switch 356 if 'bitLength' in args or 'alphabetSize' in args: 357 self.mode = RangeDecoder 358 RangeDecoder.__init__(self, **args) 359 elif 'decodeTable' in args: 360 self.mode = PrefixDecoder 361 PrefixDecoder.__init__(self, **args) 362 else: 363 super().__init__(**args) 364 365 def __repr__(self): 366 return self.__class__.__name__+' '+self.name 367 368 #the routines that get switched between RangeDecoder and PrefixDecoder 369 def __len__(self): return self.mode.__len__(self) 370 def __iter__(self): return self.mode.__iter__(self) 371 def __getitem__(self, index): return self.mode.__getitem__(self, index) 372 def bitPattern(self, index): return self.mode.bitPattern(self, index) 373 def length(self, index): return self.mode.length(self, index) 374 def decodePeek(self, data): return self.mode.decodePeek(self, data) 375 #general routines 376 def value(self, index, extra=None): 377 """Get value of symbol for computations. 378 Override where needed. 379 """ 380 if extra is not None: 381 raise ValueError('value: no extra for this symbol') 382 return index 383 384 def mnemonic(self, index): 385 """Give mnemonic of symbol. 386 Override where needed. 387 """ 388 return str(self.value(index)) 389 390 def callback(self, symbol): 391 return self.explanation(symbol.index) 392 393 def explanation(self, index): 394 """Long explanation of the value from the numeric value 395 This is a default routine. 396 You can customize in three ways: 397 - set description to add some text 398 - override to get more control 399 - set callback to make it dependent on you local variables 400 """ 401 value = self.value(index) 402 return '{0}{1}: {2}'.format( 403 self.description and self.description+': ', 404 self.bitPattern(index), 405 value, 406 ) 407 408 def extraBits(self, index): 409 return 0 410 411 #Routines that use the decode interface 412 def showCode(self, width=80): 413 """Show all words of the code in a nice format. 414 """ 415 #make table of all symbols with binary strings 416 symbolStrings = [ 417 (self.bitPattern(s.index), self.mnemonic(s.index)) 418 for s in self 419 ] 420 #determine column widths the way Lisp programmers do it 421 leftColWidth, rightColWidth = map(max, map( 422 map, 423 repeat(len), 424 zip(*symbolStrings) 425 )) 426 colwidth = leftColWidth+rightColWidth 427 columns = 81//(colwidth+2) 428 rows = -(-len(symbolStrings)//columns) 429 def justify(bs): 430 b,s = bs 431 return b.rjust(leftColWidth)+':'+s.ljust(rightColWidth) 432 for i in range(rows): 433 print(' '.join(map(justify, symbolStrings[i::rows])).rstrip()) 434 435 def readTuple(self, stream): 436 """Read symbol from stream. Returns symbol, length. 437 """ 438 length, symbol = self.decodePeek(stream.peek(self.maxLength)) 439 stream.pos += length 440 return length, symbol 441 442 def readTupleAndExtra(self, stream): 443 return self.readTuple(stream)+(0, None) 444 445class WithExtra(Code): 446 """Extension for Code so that symbol may have extra bits associated. 447 If you supply an extraTable, you can use extraBits 448 You can define an extraTable, 449 which allows to call extraBits to get the number of extraBits. 450 Otherwise, you can supply extraBits yourself. 451 Routine readTupleAndExtra now reads the extra bits too. 452 Value probably needs to be overridden; see Enumerator. 453 Note: this does not give you an decodeTable. 454 """ 455 #redefine these if you don't want to use an extraTable 456 def extraBits(self, index): 457 """Get the number of extra bits for this symbol. 458 """ 459 return self.extraTable[index] 460 461 def mnemonic(self, index): 462 """This value must be independent of extra. 463 """ 464 return str(index) 465 466 def readTupleAndExtra(self, stream): 467 """Read symbol and extrabits from stream. 468 Returns symbol length, symbol, extraBits, extra 469 >>> olleke.pos = 6 470 >>> MetablockLengthAlphabet().readTupleAndExtra(olleke) 471 (2, Symbol(MLEN, 4), 16, 46) 472 """ 473 length, symbol = self.decodePeek(stream.peek(self.maxLength)) 474 stream.pos += length 475 extraBits = self.extraBits(symbol.index) 476 return length, symbol, extraBits, stream.read(extraBits) 477 478 def explanation(self, index, extra=None): 479 """Expanded version of Code.explanation supporting extra bits. 480 If you don't supply extra, it is not mentioned. 481 """ 482 extraBits = 0 if extra is None else self.extraBits(index) 483 if not hasattr(self, 'extraTable'): 484 formatString = '{0}{3}' 485 lo = hi = value = self.value(index, extra) 486 elif extraBits==0: 487 formatString = '{0}{2}: {3}' 488 lo, hi = self.span(index) 489 value = lo 490 else: 491 formatString = '{0}{1} {2}: {3}-{4}; {3}+{5}={6}' 492 lo, hi = self.span(index) 493 value = lo+extra 494 return formatString.format( 495 self.description and self.description+': ', 496 'x'*extraBits, 497 self.bitPattern(index), 498 lo, hi, 499 extra, 500 value, 501 ) 502 503 def callback(self, symbol, extra): 504 return self.explanation(symbol.index, extra) 505 506class BoolCode(Code): 507 """Same as Code(bitLength=1), but shows a boolean. 508 """ 509 def __init__(self, name=None, **args): 510 super().__init__(name, bitLength=1, **args) 511 512 def value(self, index, extra=None): 513 return bool(super().value(index, extra)) 514 515class Enumerator(WithExtra): 516 """Code that is defined by the ExtraTable. 517 extraTable is a class variable that contains 518 the extraBits of the symbols from 0 519 value0 contains the value of symbol 0 520 encodings is not neccessary, but allowed. 521 Note: place for FixedCode to make sure extraBits works 522 """ 523 def __init__(self, name=None, **args): 524 #if there is no decodeTable to determine length, compute it ourselves 525 if 'decodeTable' not in args: 526 args['alphabetSize'] = len(self.extraTable) 527 super().__init__(name, **args) 528 529 def __len__(self): 530 return len(self.extraTable) 531 532 def __getitem__(self, index): 533 """Faster than PrefixDecoder 534 """ 535 if index>=len(self.extraTable): 536 raise ValueError("No symbol {}[{}]".format( 537 self.__class__.__name__, index)) 538 return Symbol(self, index) 539 540 def value(self, index, extra): 541 """Override if you don't define value0 and extraTable 542 """ 543 lower, upper = self.span(index) 544 value = lower+(extra or 0) 545 if value>upper: 546 raise ValueError('value: extra out of range') 547 return value 548 549 def span(self, index): 550 """Give the range of possible values in a tuple 551 Useful for mnemonic and explanation 552 """ 553 lower = self.value0+sum(1<<x for x in self.extraTable[:index]) 554 upper = lower+(1<<self.extraTable[index]) 555 return lower, upper-1 556 557#======================Code subclasses====================================== 558#Alphabets used in the metablock header---------------------------------- 559#For prefix codes 560class PrefixCodeHeader(WithExtra): 561 """Header of prefix codes. 562 """ 563 def __init__(self, codename): 564 super().__init__('PFX', bitLength=2) 565 #this is the name of the code that it describes 566 self.codename = codename 567 568 def extraBits(self, index): 569 return 2 if index==1 else 0 570 571 def value(self, index, extra): 572 """Returns ('Simple', #codewords) or ('Complex', HSKIP) 573 """ 574 if index==1: 575 if extra>3: 576 raise ValueError('value: extra out of range') 577 return 'Simple', extra+1 578 if extra: 579 raise ValueError('value: extra out of range') 580 return 'Complex', index 581 582 def explanation(self, index, extra): 583 if index==1: 584 return '{} is simple with {} code word{}'.format( 585 self.codename, extra+1, 's' if extra else '') 586 lengths = [1, 2, 3, 4, 0, 5, 17, 6] 587 return '{} is complex with lengths {}...'.format( 588 self.codename, 589 ','.join( 590 map(str, lengths[index:index+5])) 591 ) 592 593class TreeShapeAlhabet(BoolCode): 594 """The bit used to indicate if four word code is "deep" or "wide" 595 """ 596 name = 'SHAPE' 597 def value(self, index): 598 return [(2,2,2,2), (1,2,3,3)][index] 599 600 def explanation(self, index): 601 return str(bool(index))+': lengths {},{},{},{}'.format(*self.value(index)) 602 603class LengthOfLengthAlphabet(Code): 604 """For use in decoding complex code descriptors. 605 >>> lengthOfLengthAlphabet = LengthOfLengthAlphabet('') 606 >>> print(lengthOfLengthAlphabet[2]) 607 coded with 2 bits 608 >>> len(lengthOfLengthAlphabet[0]) 609 2 610 >>> [len(lengthOfLengthAlphabet[x]) for x in range(6)] 611 [2, 4, 3, 2, 2, 4] 612 >>> lengthOfLengthAlphabet.showCode() 613 00:skipped 01:coded with 4 bits 0111:coded with 1 bits 614 10:coded with 3 bits 011:coded with 2 bits 1111:coded with 5 bits 615 """ 616 decodeTable = { 617 0b00:0, 0b10:3, 618 0b0111:1, 0b01:4, 619 0b011:2, 0b1111:5, 620 } 621 622 def __init__(self, name=None, **args): 623 super().__init__(name, decodeTable=self.decodeTable, **args) 624 625 def mnemonic(self, index): 626 if index==0: return 'skipped' 627 return 'coded with {} bits'.format(index) 628 629 def explanation(self, index, extra=None): 630 return self.description+': '+self.mnemonic(index) 631 632class LengthAlphabet(WithExtra): 633 """Length of symbols 634 Used during construction of a code. 635 """ 636 def __init__(self, name): 637 super().__init__(name, alphabetSize=18) 638 639 def extraBits(self, index): 640 return {16:2, 17:3}.get(index, 0) 641 642 def mnemonic(self, index): 643 if index==0: return 'unused' 644 elif index==16: return 'rep xx' 645 elif index==17: return 'zero xxx' 646 else: return 'len {}'.format(index) 647 648 def explanation(self, index, extra): 649 return self.description.format(self[index], extra) 650 651 def value(self, index, extra): 652 #the caller got the length already, so extra is enough 653 return extra 654 655#Stream header 656class WindowSizeAlphabet(Code): 657 """The alphabet used for window size in the stream header. 658 >>> WindowSizeAlphabet()[10].explanation() 659 'windowsize=(1<<10)-16=1008' 660 """ 661 decodeTable = { 662 0b0100001: 10, 0b1100001: 14, 0b0011: 18, 0b1011: 22, 663 0b0110001: 11, 0b1110001: 15, 0b0101: 19, 0b1101: 23, 664 0b1000001: 12, 0b0: 16, 0b0111: 20, 0b1111: 24, 665 0b1010001: 13, 0b0000001: 17, 0b1001: 21, 666 0b0010001: None, 667 } 668 669 name = 'WSIZE' 670 671 def __init__(self, name=None): 672 super().__init__(name, decodeTable=self.decodeTable) 673 674 def value(self, index): 675 #missing value gives index None 676 if index is None: return None 677 return (1<<index)-16 678 679 def explanation(self, index): 680 return 'windowsize=(1<<{})-16={}'.format( 681 index, (1<<index)-16) 682 683#Metablock 684class MetablockLengthAlphabet(WithExtra): 685 """Used for the meta block length; 686 also indicates a block with no data 687 >>> metablockLengthAlphabet = MetablockLengthAlphabet() 688 >>> metablockLengthAlphabet[0]; str(metablockLengthAlphabet[0]) 689 Symbol(MLEN, 0) 690 'empty' 691 >>> metablockLengthAlphabet[3] 692 Traceback (most recent call last): 693 ... 694 ValueError: No symbol MetablockLengthAlphabet[3] 695 >>> print(metablockLengthAlphabet[4]) 696 hhhh00 697 >>> metablockLengthAlphabet[4].value(0x1000) 698 4097 699 >>> metablockLengthAlphabet[5].value(0x1000) 700 Traceback (most recent call last): 701 ... 702 InvalidStream: Zeros in high nibble of MLEN 703 >>> metablockLengthAlphabet[5].explanation(0x12345) 704 'data length: 12345h+1=74566' 705 >>> metablockLengthAlphabet.showCode() 706 00:hhhh00 10:hhhhhh10 01:hhhhh01 11:empty 707 """ 708 decodeTable = {0b11:0, 0b00:4, 0b01:5, 0b10:6} 709 710 name = 'MLEN' 711 def __init__(self, name=None): 712 super().__init__(name, decodeTable=self.decodeTable) 713 714 def extraBits(self, index): 715 return index*4 716 717 def mnemonic(self, index): 718 if index==0: return 'empty' 719 return 'h'*(self.extraBits(index)//4)+self.bitPattern(index) 720 721 def value(self, index, extra): 722 extraBits = self.extraBits(index) 723 if not 0<=extra<1<<extraBits: 724 raise ValueError('value: extra out of range') 725 if index==0: return 0 726 if index>4 and extra>>extraBits-4==0: raise InvalidStream( 727 'Zeros in high nibble of MLEN') 728 return extra+1 729 730 def explanation(self, index, extra): 731 if index==0: return '11: empty block' 732 extraBits = self.extraBits(index) 733 return 'data length: {:0{}x}h+1={}'.format(extra, extraBits//4, extra+1) 734 735 736class ReservedAlphabet(BoolCode): 737 """The reserved bit that must be zero. 738 """ 739 name = 'RSVD' 740 def value(self, index): 741 if index: raise ValueError('Reserved bit is not zero') 742 743 def explanation(self, index): 744 return 'Reserved (must be zero)' 745 746class FillerAlphabet(Code): 747 def __init__(self, *, streamPos): 748 super().__init__('SKIP', bitLength=(-streamPos)&7) 749 750 def explanation(self, index): 751 return '{} bit{} ignored'.format( 752 self.length(index), 753 '' if self.length(index)==1 else 's', 754 ) 755 756class SkipLengthAlphabet(WithExtra): 757 """Used for the skip length in an empty metablock 758 >>> skipLengthAlphabet = SkipLengthAlphabet() 759 >>> skipLengthAlphabet[0]; str(skipLengthAlphabet[0]) 760 Symbol(SKIP, 0) 761 'empty' 762 >>> skipLengthAlphabet[4] 763 Traceback (most recent call last): 764 ... 765 ValueError: index out of range 766 >>> print(skipLengthAlphabet[3]) 767 hhhhhh11 768 >>> skipLengthAlphabet[2].value(0x1000) 769 4097 770 >>> skipLengthAlphabet[3].value(0x1000) 771 Traceback (most recent call last): 772 ... 773 InvalidStream: Zeros in high byte of SKIPBYTES 774 >>> skipLengthAlphabet[3].explanation(0x12345) 775 'skip length: 12345h+1=74566' 776 >>> skipLengthAlphabet.showCode() 777 00:empty 01:hh01 10:hhhh10 11:hhhhhh11 778 """ 779 def __init__(self): 780 super().__init__('SKIP', bitLength=2) 781 782 def extraBits(self, index): 783 return index*8 784 785 def mnemonic(self, index): 786 if index==0: return 'empty' 787 return 'h'*(self.extraBits(index)//4)+self.bitPattern(index) 788 789 def value(self, index, extra): 790 extraBits = self.extraBits(index) 791 if not 0<=extra<1<<extraBits: 792 raise ValueError('value: extra out of range') 793 if index==0: return 0 794 if index>1 and extra>>extraBits-8==0: 795 raise InvalidStream('Zeros in high byte of SKIPBYTES') 796 return extra+1 797 798 def explanation(self, index, extra): 799 if index==0: return '00: no skip' 800 extraBits = self.extraBits(index) 801 return 'skip length: {:{}x}h+1={}'.format(extra, extraBits//8, extra+1) 802 803 804class TypeCountAlphabet(Enumerator): 805 """Used for giving block type counts and tree counts. 806 >>> TypeCountAlphabet(description='').showCode() 807 0:0 0101:xx,0101 1011:xxxxx,1011 808 0001:0001 1101:xxxxxx,1101 0111:xxx,0111 809 1001:xxxx,1001 0011:x,0011 1111:xxxxxxx,1111 810 """ 811 decodeTable = { 812 0b0: 0, 0b1001: 5, 813 0b0001: 1, 0b1011: 6, 814 0b0011: 2, 0b1101: 7, 815 0b0101: 3, 0b1111: 8, 816 0b0111: 4, 817 } 818 819 value0 = 1 820 extraTable = [0, 0, 1, 2, 3, 4, 5, 6, 7] 821 name = 'BT#' 822 823 def __init__(self, name=None, *, description): 824 super().__init__( 825 name, 826 decodeTable=self.decodeTable, 827 description=description) 828 829 def mnemonic(self, index): 830 if index==0: return '0' 831 if index==1: return '0001' 832 return 'x'*(self.extraBits(index))+','+self.bitPattern(index) 833 834 def explanation(self, index, extra): 835 value = self.value(index, extra) 836 description = self.description 837 if value==1: description = description[:-1] 838 return '{}: {} {}'.format( 839 self.mnemonic(index), 840 value, 841 description) 842 843class BlockTypeAlphabet(Code): 844 """The block types; this code works for all three kinds. 845 >>> b = BlockTypeAlphabet('T', NBLTYPES=5) 846 >>> print(*(x for x in b)) 847 prev +1 #0 #1 #2 #3 #4 848 """ 849 def __init__(self, name, NBLTYPES, **args): 850 super().__init__(name, alphabetSize=NBLTYPES+2, **args) 851 self.NBLTYPES = NBLTYPES 852 853 def mnemonic(self, index): 854 if index==0: return 'prev' 855 elif index==1: return '+1' 856 else: return '#'+str(index-2) 857 858 def value(self, index): 859 return index-2 860 861 def explanation(self, index): 862 if index==0: return '0: previous' 863 elif index==1: return '1: increment' 864 else: return 'Set block type to: '+str(index-2) 865 866class BlockCountAlphabet(Enumerator): 867 """Block counts 868 >>> b = BlockCountAlphabet('L') 869 >>> print(b[25]) 870 [24*x]: BC16625-16793840 871 """ 872 873 value0 = 1 874 extraTable = [2,2,2,2,3, 3,3,3,4,4, 4,4,5,5,5, 5,6,6,7,8, 9,10,11,12,13, 24] 875 def __init__(self, name, **args): 876 super().__init__(name, alphabetSize=26, **args) 877 878 def mnemonic(self, index): 879 extraBits = self.extraBits(index) 880 return '{}: BC{}-{}'.format( 881 'x'*extraBits if index<5 else '[{}*x]'.format(extraBits), 882 *self.span(index)) 883 884 def explanation(self, index, extra): 885 return 'Block count: '+super().explanation(index, extra) 886 887class DistanceParamAlphabet(WithExtra): 888 """The distance parameters NPOSTFIX and NDIRECT. 889 Although these are treated as two in the description, this is easier. 890 """ 891 def __init__(self): 892 super().__init__('DIST', bitLength=2) 893 894 def extraBits(self, index): 895 return 4 896 897 def value(self, index, extra): 898 """Returns NPOSTFIX and NDIRECT<<NPOSTFIX 899 """ 900 if extra>15: 901 raise ValueError('value: extra out of range') 902 return index, extra<<index 903 904 def explanation(self, index, extra): 905 return '{} postfix bits and {:04b}<<{}={} direct codes'.format( 906 index, extra, index, extra<<index) 907 908 def mnemonic(self, index): 909 return 'PF'+str(index) 910 911class LiteralContextMode(Code): 912 """For the literal context modes. 913 >>> LiteralContextMode().showCode() 914 00:LSB6 01:MSB6 10:UTF8 11:Signed 915 >>> LiteralContextMode().explanation(2) 916 'Context mode for type 9: 2(UTF8)' 917 """ 918 919 def __init__(self, *, number=9): 920 super().__init__('LC'+str(number), bitLength=2) 921 self.number = number 922 923 def mnemonic(self, index): 924 return ['LSB6', 'MSB6', 'UTF8', 'Signed'][index] 925 926 def explanation(self, index): 927 return 'Context mode for type {}: {}({})'.format( 928 self.number, 929 index, 930 self.mnemonic(index)) 931 932class RLEmaxAlphabet(Enumerator): 933 """Used for describing the run length encoding used for describing context maps. 934 >>> RLEmaxAlphabet().showCode() 935 0:1 1:more 936 """ 937 value0 = 0 938 extraTable = [0, 4] 939 name = 'RLE#' 940 941 def mnemonic(self, index): 942 return ['1', 'more'][index] 943 944 def explanation(self, index, extra): 945 description = self.description and self.description+': ' 946 if index==0: return description+'No RLE coding' 947 return '{}xxxx 1: RLEMAX={}'.format(description, extra+1) 948 949class TreeAlphabet(WithExtra): 950 """The alphabet to enumerate entries (called trees) in the context map. 951 parameters are RLEMAX and NTREES 952 >>> t = TreeAlphabet('', RLEMAX=3, NTREES=5) 953 >>> len(t) 954 8 955 >>> print(t[2]) 956 xx+4 zeroes 957 >>> t[3].explanation(2) 958 '8+010=10 zeroes' 959 >>> t[0].value(0) 960 (1, 0) 961 """ 962 name = 'CMI' 963 def __init__(self, name=None, *, RLEMAX, NTREES, **args): 964 super().__init__(name, alphabetSize=RLEMAX+NTREES, **args) 965 self.RLEMAX = RLEMAX 966 self.NTREES = NTREES 967 968 def extraBits(self, index): 969 if 0<index<=self.RLEMAX: return index 970 return 0 971 972 def mnemonic(self, index): 973 if index==0: return 'map #0' 974 if index<=self.RLEMAX: 975 return '{}+{} zeroes'.format('x'*index, 1<<index) 976 return 'map #{}'.format(index-self.RLEMAX) 977 978 def value(self, index, extra): 979 """Give count and value.""" 980 index = index 981 if index==0: return 1, 0 982 if index<=self.RLEMAX: return (1<<index)+extra, 0 983 return 1, index-self.RLEMAX 984 985 def explanation(self, index, extra): 986 description = self.description and self.description+': ' 987 if index==0: return description+'map #0' 988 if index<=self.RLEMAX: 989 return '{}+{:0{}b}={} zeroes'.format( 990 (1<<index), 991 extra, self.extraBits(index), 992 (1<<index)+extra) 993 return '{}map #{}-{}={}'.format( 994 description, 995 index, self.RLEMAX, index-self.RLEMAX) 996 997#Prefix alphabets for the data stream---------------------------------- 998class LiteralAlphabet(Code): 999 """Alphabet of symbols. 1000 """ 1001 minLength = maxLength = 8 1002 def __init__(self, number): 1003 super().__init__('L'+str(number), alphabetSize=1<<8) 1004 1005 def mnemonic(self, index): 1006 return outputCharFormatter(index) 1007 1008 def value(self, index, extra=None): 1009 return index 1010 1011 def explanation(self, index, extra=None): 1012 return self.mnemonic(index) 1013 1014class InsertLengthAlphabet(Enumerator): 1015 """Intern code for insert counts 1016 """ 1017 value0 = 0 1018 extraTable = [0,0,0,0,0, 0,1,1,2,2, 3,3,4,4,5, 5,6,7,8,9, 10,12,14,24] 1019 1020class CopyLengthAlphabet(Enumerator): 1021 value0 = 2 1022 extraTable = [0,0,0,0,0, 0,0,0,1,1, 2,2,3,3,4, 4,5,5,6,7, 8,9,10,24] 1023 1024class InsertAndCopyAlphabet(WithExtra): 1025 """The insert and copy code 1026 >>> for x in range(0,704,704//13): 1027 ... print('{:10b}'.format(x), InsertAndCopyAlphabet()[x]) 1028 0 I0C2&D=0 1029 110110 I6+xC8&D=0 1030 1101100 I5C22+xxx&D=0 1031 10100010 I4C4 1032 11011000 I3C10+x 1033 100001110 I14+xxC8 1034 101000100 I10+xxC22+xxx 1035 101111010 I98+xxxxxC14+xx 1036 110110000 I6+xC70+xxxxx 1037 111100110 I1090+[10*x]C8 1038 1000011100 I26+xxxC326+[8*x] 1039 1001010010 I322+[8*x]C14+xx 1040 1010001000 I194+[7*x]C70+xxxxx 1041 1010111110 I22594+[24*x]C1094+[10*x] 1042 """ 1043 insertLengthAlphabet = InsertLengthAlphabet(None) 1044 copyLengthAlphabet = CopyLengthAlphabet(None) 1045 1046 def __init__(self, number=''): 1047 super().__init__('IC'+str(number), bitLength=10) 1048 1049 def __len__(self): 1050 return 704 1051 1052 def extraBits(self, index): 1053 insertSymbol, copySymbol, dist0 = self.splitSymbol(index) 1054 return InsertLengthAlphabet.extraTable[insertSymbol.index] + \ 1055 CopyLengthAlphabet.extraTable[copySymbol.index] 1056 1057 def splitSymbol(self, index): 1058 """Give relevant values for computations: 1059 (insertSymbol, copySymbol, dist0flag) 1060 """ 1061 #determine insert and copy upper bits from table 1062 row = [0,0,1,1,2,2,1,3,2,3,3][index>>6] 1063 col = [0,1,0,1,0,1,2,0,2,1,2][index>>6] 1064 #determine inserts and copy sub codes 1065 insertLengthCode = row<<3 | index>>3&7 1066 if row: insertLengthCode -= 8 1067 copyLengthCode = col<<3 | index&7 1068 return ( 1069 Symbol(self.insertLengthAlphabet, insertLengthCode), 1070 Symbol(self.copyLengthAlphabet, copyLengthCode), 1071 row==0 1072 ) 1073 1074 def mnemonic(self, index): 1075 """Make a nice mnemonic 1076 """ 1077 i,c,d0 = self.splitSymbol(index) 1078 iLower, _ = i.code.span(i.index) 1079 iExtra = i.extraBits() 1080 cLower, _ = c.code.span(c.index) 1081 cExtra = c.extraBits() 1082 return 'I{}{}{}C{}{}{}{}'.format( 1083 iLower, 1084 '+' if iExtra else '', 1085 'x'*iExtra if iExtra<6 else '[{}*x]'.format(iExtra), 1086 cLower, 1087 '+' if cExtra else '', 1088 'x'*cExtra if cExtra<6 else '[{}*x]'.format(cExtra), 1089 '&D=0' if d0 else '') 1090 1091 def value(self, index, extra): 1092 i,c,d0 = self.splitSymbol(index) 1093 iExtra = i.extraBits() 1094 ce, ie = extra>>iExtra, extra&(1<<iExtra)-1 1095 insert = i.value(ie) 1096 copy = c.value(ce) 1097 return insert, copy, d0 1098 1099 def explanation(self, index, extra): 1100 insert, copy, d0 = self.value(index, extra) 1101 if d0: return 'Literal: {}, copy: {}, same distance'.format(insert, copy) 1102 else: return 'Literal: {}, copy: {}'.format(insert, copy) 1103 1104class DistanceAlphabet(WithExtra): 1105 """Represent the distance encoding. 1106 Dynamically generated alphabet. 1107 This is what the documentation should have said: 1108 Ignoring offsets for the moment, the "long" encoding works as follows: 1109 Write the distance in binary as follows: 1110 1xy..yz..z, then the distance symbol consists of n..nxz..z 1111 Where: 1112 n is one less than number of bits in y 1113 x is a single bit 1114 y..y are n+1 extra bits (encoded in the bit stream) 1115 z..z is NPOSTFIX bits that are part of the symbol 1116 The offsets are so as to start at the lowest useable value: 1117 if 1xyyyyz = distance +(4<<POSTFIX)-NDIRECT-1 1118 then n..nxz..z is symbol -NDIRECT-16 1119 >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) 1120 >>> print(d[4], d[17], d[34]) 1121 last-1 1 10xx00-5 1122 >>> [str(d[x]) for x in range(26, 32)] 1123 ['10x00-5', '10x01-5', '10x10-5', '10x11-5', '11x00-5', '11x01-5'] 1124 """ 1125 def __init__(self, number, *, NPOSTFIX, NDIRECT): 1126 self.NPOSTFIX = NPOSTFIX 1127 self.NDIRECT = NDIRECT 1128 #set length 1129 #Actually, not all symbols are used, 1130 #only NDIRECT+16+(44-2*POSTFIX<<NPOSTFIX) 1131 super().__init__('D'+str(number), 1132 alphabetSize=self.NDIRECT+16+(48<<self.NPOSTFIX)) 1133 1134 def extraBits(self, index): 1135 """Indicate how many extra bits are needed to interpret symbol 1136 >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) 1137 >>> [d[i].extraBits() for i in range(26)] 1138 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 1139 >>> [d[i].extraBits() for i in range(26,36)] 1140 [1, 1, 1, 1, 1, 1, 1, 1, 2, 2] 1141 """ 1142 if index<16+self.NDIRECT: return 0 1143 return 1 + ((index - self.NDIRECT - 16) >> (self.NPOSTFIX + 1)) 1144 1145 def value(self, dcode, dextra): 1146 """Decode value of symbol together with the extra bits. 1147 >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) 1148 >>> d[34].value(2) 1149 (0, 35) 1150 """ 1151 if dcode<16: 1152 return [(1,0),(2,0),(3,0),(4,0), 1153 (1,-1),(1,+1),(1,-2),(1,+2),(1,-3),(1,+3), 1154 (2,-1),(2,+1),(2,-2),(2,+2),(2,-3),(2,+3) 1155 ][dcode] 1156 if dcode<16+self.NDIRECT: 1157 return (0,dcode-16) 1158 #we use the original formulas, instead of my clear explanation 1159 POSTFIX_MASK = (1 << self.NPOSTFIX) - 1 1160 ndistbits = 1 + ((dcode - self.NDIRECT - 16) >> (self.NPOSTFIX + 1)) 1161 hcode = (dcode - self.NDIRECT - 16) >> self.NPOSTFIX 1162 lcode = (dcode - self.NDIRECT - 16) & POSTFIX_MASK 1163 offset = ((2 + (hcode & 1)) << ndistbits) - 4 1164 distance = ((offset + dextra) << self.NPOSTFIX) + lcode + self.NDIRECT + 1 1165 return (0,distance) 1166 1167 def mnemonic(self, index, verbose=False): 1168 """Give mnemonic representation of meaning. 1169 verbose compresses strings of x's 1170 """ 1171 if index<16: 1172 return ['last', '2last', '3last', '4last', 1173 'last-1', 'last+1', 'last-2', 'last+2', 'last-3', 'last+3', 1174 '2last-1', '2last+1', '2last-2', '2last+2', '2last-3', '2last+3' 1175 ][index] 1176 if index<16+self.NDIRECT: 1177 return str(index-16) 1178 #construct strings like "1xx01-15" 1179 index -= self.NDIRECT+16 1180 hcode = index >> self.NPOSTFIX 1181 lcode = index & (1<<self.NPOSTFIX)-1 1182 if self.NPOSTFIX: formatString = '1{0}{1}{2:0{3}b}{4:+d}' 1183 else: formatString = '1{0}{1}{4:+d}' 1184 return formatString.format( 1185 hcode&1, 1186 'x'*(2+hcode>>1) if hcode<13 or verbose else '[{}*x]'.format(2+hcode>>1), 1187 lcode, self.NPOSTFIX, 1188 self.NDIRECT+1-(4<<self.NPOSTFIX)) 1189 1190 def explanation(self, index, extra): 1191 """ 1192 >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10) 1193 >>> d[55].explanation(13) 1194 '11[1101]01-5: [0]+240' 1195 """ 1196 extraBits = self.extraBits(index) 1197 extraString = '[{:0{}b}]'.format(extra, extraBits) 1198 return '{0}: [{1[0]}]{1[1]:+d}'.format( 1199 self.mnemonic(index, True).replace('x'*(extraBits or 1), extraString), 1200 self.value(index, extra)) 1201 1202#Classes for doing actual work------------------------------------------ 1203class ContextModeKeeper: 1204 """For computing the literal context mode. 1205 You feed it characters, and it computes indices in the context map. 1206 """ 1207 def __init__(self, mode): 1208 self.chars = deque([0,0], maxlen=2) 1209 self.mode = mode 1210 1211 def setContextMode(self, mode): 1212 """Switch to given context mode (0..3)""" 1213 self.mode = mode 1214 def getIndex(self): 1215 if self.mode==0: #LSB6 1216 return self.chars[1]&0x3f 1217 elif self.mode==1: #MSB6 1218 return self.chars[1]>>2 1219 elif self.mode==2: #UTF8: character class of previous and a bit of the second 1220 p2,p1 = self.chars 1221 return self.lut0[p1]|self.lut1[p2] 1222 elif self.mode==3: #Signed: initial bits of last two bytes 1223 p2,p1 = self.chars 1224 return self.lut2[p1]<<3|self.lut2[p2] 1225 1226 def add(self, index): 1227 """Adjust the context for output char (as int).""" 1228 self.chars.append(index) 1229 1230 #0: control #16: quote #32: ,:; #48: AEIOU 1231 #4: tab/lf/cr #20: % #36: . #52: BC..Z 1232 #8: space #24: (<[{ #40: = #56: aeiou 1233 #12:!#$&*+-/?@| #28: )>]} #44: 0-9 #60: bc..z 1234 lut0 = [0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 0, 0, 4, 0, 0, 1235 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1236 8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12, 1237 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12, 1238 12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48, 1239 52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12, 1240 12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56, 1241 60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12, 0 1242 ]+[0,1]*32+[2,3]*32 1243 #0: space 1:punctuation 2:digit/upper 3:lower 1244 lut1 = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1245 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1246 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1247 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1248 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1249 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1250 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1251 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0 1252 ]+[0]*96+[2]*32 1253 #initial bits: 8*0, 4*0, 2*0, 1*0, 1*1, 2*1, 4*1, 8*1 1254 lut2 = [0]+[1]*15+[2]*48+[3]*64+[4]*64+[5]*48+[6]*15+[7] 1255 assert len(lut0)==len(lut1)==len(lut2)==256 1256 1257class WordList: 1258 """Word list. 1259 >>> WordList().word(7, 35555) 1260 b'Program to ' 1261 """ 1262 NDBITS = [0, 0, 0, 0, 10, 10, 11, 11, 10, 10, 1263 10, 10, 10, 9, 9, 8, 7, 7, 8, 7, 1264 7, 6, 6, 5, 5] 1265 def __init__(self): 1266 self.file = open('dict', 'rb') 1267 self.compileActions() 1268 1269 def word(self, size, dist): 1270 """Get word 1271 """ 1272 #split dist in index and action 1273 ndbits = self.NDBITS[size] 1274 index = dist&(1<<ndbits)-1 1275 action = dist>>ndbits 1276 #compute position in file 1277 position = sum(n<<self.NDBITS[n] for n in range(4,size))+size*index 1278 self.file.seek(position) 1279 return self.doAction(self.file.read(size), action) 1280 1281 def upperCase1(self, word): 1282 word = word.decode('utf8') 1283 word = word[0].upper()+word[1:] 1284 return word.encode('utf8') 1285 1286 1287 #Super compact form of action table. 1288 #_ means space, .U means UpperCaseAll, U(w) means UpperCaseFirst 1289 actionTable = r""" 1290 0:w 25:w+_for_ 50:w+\n\t 75:w+. This_100:w+ize_ 1291 1:w+_ 26:w[3:] 51:w+: 76:w+, 101:w.U+. 1292 2:_+w+_ 27:w[:-2] 52:_+w+._ 77:.+w+_ 102:\xc2\xa0+w 1293 3:w[1:] 28:w+_a_ 53:w+ed_ 78:U(w)+( 103:_+w+, 1294 4:U(w)+_ 29:w+_that_ 54:w[9:] 79:U(w)+. 104:U(w)+=" 1295 5:w+_the_ 30:_+U(w) 55:w[7:] 80:w+_not_ 105:w.U+=" 1296 6:_+w 31:w+._ 56:w[:-6] 81:_+w+=" 106:w+ous_ 1297 7:s_+w+_ 32:.+w 57:w+( 82:w+er_ 107:w.U+,_ 1298 8:w+_of_ 33:_+w+,_ 58:U(w)+,_ 83:_+w.U+_ 108:U(w)+=\' 1299 9:U(w) 34:w[4:] 59:w[:-8] 84:w+al_ 109:_+U(w)+, 1300 10:w+_and_ 35:w+_with_ 60:w+_at_ 85:_+w.U 110:_+w.U+=" 1301 11:w[2:] 36:w+\' 61:w+ly_ 86:w+=\' 111:_+w.U+,_ 1302 12:w[:-1] 37:w+_from_ 62:_the_+w+_of_ 87:w.U+" 112:_+w.U+, 1303 13:,_+w+_ 38:w+_by_ 63:w[:-5] 88:U(w)+._ 113:w.U+( 1304 14:w+,_ 39:w[5:] 64:w[:-9] 89:_+w+( 114:w.U+._ 1305 15:_+U(w)+_ 40:w[6:] 65:_+U(w)+,_ 90:w+ful_ 115:_+w.U+. 1306 16:w+_in_ 41:_the_+w 66:U(w)+" 91:_+U(w)+._116:w.U+=\' 1307 17:w+_to_ 42:w[:-4] 67:.+w+( 92:w+ive_ 117:_+w.U+._ 1308 18:e_+w+_ 43:w+. The_ 68:w.U+_ 93:w+less_ 118:_+U(w)+=" 1309 19:w+" 44:w.U 69:U(w)+"> 94:w.U+\' 119:_+w.U+=\' 1310 20:w+. 45:w+_on_ 70:w+=" 95:w+est_ 120:_+U(w)+=\' 1311 21:w+"> 46:w+_as_ 71:_+w+. 96:_+U(w)+. 1312 22:w+\n 47:w+_is_ 72:.com/+w 97:w.U+"> 1313 23:w[:-3] 48:w[:-7] 98:_+w+=\' 1314 24:w+] 49:w[:-1]+ing_ 74:U(w)+\' 99:U(w)+, 1315 """ 1316 1317 def compileActions(self): 1318 """Build the action table from the text above 1319 """ 1320 import re 1321 self.actionList = actions = [None]*121 1322 #Action 73, which is too long, looks like this when expanded: 1323 actions[73] = "b' the '+w+b' of the '" 1324 #find out what the columns are 1325 actionLines = self.actionTable.splitlines() 1326 colonPositions = [m.start() 1327 for m in re.finditer(':',actionLines[1]) 1328 ]+[100] 1329 columns = [(colonPositions[i]-3,colonPositions[i+1]-3) 1330 for i in range(len(colonPositions)-1)] 1331 for line in self.actionTable.splitlines(keepends=False): 1332 for start,end in columns: 1333 action = line[start:end] 1334 #skip empty actions 1335 if not action or action.isspace(): continue 1336 #chop it up, and check if the colon is properly placed 1337 index, colon, action = action[:3], action[3], action[4:] 1338 assert colon==':' 1339 #remove filler spaces at right 1340 action = action.rstrip() 1341 #replace space symbols 1342 action = action.replace('_', ' ') 1343 wPos = action.index('w') 1344 #add quotes around left string when present 1345 #translation: any pattern from beginning, up to 1346 #(but not including) a + following by a w later on 1347 action = re.sub(r"^(.*)(?=\+[U(]*w)", r"b'\1'", action) 1348 #add quotes around right string when present 1349 #translation: anything with a w in it, followed by a + 1350 #and a pattern up to the end 1351 #(there is no variable lookbehind assertion, 1352 #so we have to copy the pattern) 1353 action = re.sub(r"(w[[:\-1\]).U]*)\+(.*)$", r"\1+b'\2'", action) 1354 #expand shortcut for uppercaseAll 1355 action = action.replace(".U", ".upper()") 1356 #store action 1357 actions[int(index)] = action 1358 1359 def doAction(self, w, action): 1360 """Perform the proper action 1361 """ 1362 #set environment for the UpperCaseFirst 1363 U = self.upperCase1 1364 return eval(self.actionList[action], locals()) 1365 1366class Layout: 1367 """Class to layout the output. 1368 """ 1369 #display width of hexdata+bitdata 1370 width = 25 1371 #general 1372 def __init__(self, stream): 1373 self.stream = stream 1374 self.bitPtr = self.width 1375 1376 def makeHexData(self, pos): 1377 """Produce hex dump of all data containing the bits 1378 from pos to stream.pos 1379 """ 1380 firstAddress = pos+7>>3 1381 lastAddress = self.stream.pos+7>>3 1382 return ''.join(map('{:02x} '.format, 1383 self.stream.data[firstAddress:lastAddress])) 1384 1385 def formatBitData(self, pos, width1, width2=0): 1386 """Show formatted bit data: 1387 Bytes are separated by commas 1388 whole bytes are displayed in hex 1389 >>> Layout(olleke).formatBitData(6, 2, 16) 1390 '|00h|2Eh,|00' 1391 >>> Layout(olleke).formatBitData(4, 1, 0) 1392 '1' 1393 """ 1394 result = [] 1395 #make empty prefix code explicit 1396 if width1==0: result = ['()', ','] 1397 for width in width1, width2: 1398 #skip empty width2 1399 if width==0: continue 1400 #build result backwards in a list 1401 while width>0: 1402 availableBits = 8-(pos&7) 1403 if width<availableBits: 1404 #read partial byte, beginning nor ending at boundary 1405 data = self.stream.data[pos>>3] >> (pos&7) & (1<<width)-1 1406 result.append('{:0{}b}'.format(data, width)) 1407 elif availableBits<8: 1408 #read rest of byte, ending at boundary 1409 data = self.stream.data[pos>>3] >> (pos&7) 1410 result.append('|{:0{}b}'.format(data, availableBits)) 1411 else: 1412 #read whole byte (in hex), beginning and ending at boundary 1413 data = self.stream.data[pos>>3] 1414 result.append('|{:02X}h'.format(data)) 1415 width -= availableBits 1416 pos += availableBits 1417 #if width overshot from the availableBits subtraction, fix it 1418 pos += width 1419 #add comma to separate fields 1420 result.append(',') 1421 #concatenate pieces, reversed, skipping the last space 1422 return ''.join(result[-2::-1]) 1423 1424 def readPrefixCode(self, alphabet): 1425 """give alphabet the prefix code that is read from the stream 1426 Called for the following alphabets, in this order: 1427 The alphabet in question must have a "logical" order, 1428 otherwise the assignment of symbols doesn't work. 1429 """ 1430 mode, numberOfSymbols = self.verboseRead(PrefixCodeHeader(alphabet.name)) 1431 if mode=='Complex': 1432 #for a complex code, numberOfSymbols means hskip 1433 self.readComplexCode(numberOfSymbols, alphabet) 1434 return alphabet 1435 else: 1436 table = [] 1437 #Set table of lengths for mnemonic function 1438 lengths = [[0], [1,1], [1,2,2], '????'][numberOfSymbols-1] 1439 #adjust mnemonic function of alphabet class 1440 def myMnemonic(index): 1441 return '{} bit{}: {}'.format( 1442 lengths[i], 1443 '' if lengths[i]==1 else 's', 1444 alphabet.__class__.mnemonic(alphabet, index) 1445 ) 1446 alphabet.mnemonic = myMnemonic 1447 for i in range(numberOfSymbols): 1448 table.append(self.verboseRead(alphabet, skipExtra=True).index) 1449 #restore mnemonic 1450 del alphabet.mnemonic 1451 if numberOfSymbols==4: 1452 #read tree shape to redefine lengths 1453 lengths = self.verboseRead(TreeShapeAlhabet()) 1454 #construct the alphabet prefix code 1455 alphabet.setLength(dict(zip(table, lengths))) 1456 return alphabet 1457 1458 def readComplexCode(self, hskip, alphabet): 1459 """Read complex code""" 1460 stream = self.stream 1461 #read the lengths for the length code 1462 lengths = [1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15][hskip:] 1463 codeLengths = {} 1464 total = 0 1465 lol = LengthOfLengthAlphabet('##'+alphabet.name) 1466 #lengthCode will be used for coding the lengths of the new code 1467 #we use it for display until now; definition comes below 1468 lengthCode = LengthAlphabet('#'+alphabet.name) 1469 lengthIter = iter(lengths) 1470 while total<32: 1471 newSymbol = next(lengthIter) 1472 lol.description = str(lengthCode[newSymbol]) 1473 length = self.verboseRead(lol) 1474 if length: 1475 codeLengths[newSymbol] = length 1476 total += 32>>length 1477 if total>=32: 1478 break 1479 if total>32: raise ValueError("Stream format") 1480 #Now set the encoding of the lengthCode 1481 lengthCode.setLength(codeLengths) 1482 print("***** Lengths for {} will be coded as:".format(alphabet.name)) 1483 lengthCode.showCode() 1484 #Now determine the symbol lengths with the lengthCode 1485 symbolLengths = {} 1486 total = 0 1487 lastLength = 8 1488 alphabetIter = iter(alphabet) 1489 while total<32768: 1490 #look ahead to see what is going to happen 1491 length = lengthCode.decodePeek( 1492 self.stream.peek(lengthCode.maxLength))[1].index 1493 #in every branch, set lengthCode.description to explanatory text 1494 #lengthCode calls format(symbol, extra) with this string 1495 if length==0: 1496 symbol = next(alphabetIter) 1497 lengthCode.description = 'symbol {} unused'.format(symbol) 1498 self.verboseRead(lengthCode) 1499 #unused symbol 1500 continue 1501 if length==16: 1502 lengthCode.description = \ 1503 '{1}+3 symbols of length '+str(lastLength) 1504 extra = self.verboseRead(lengthCode) 1505 #scan series of 16s (repeat counts) 1506 #start with repeat count 2 1507 repeat = 2 1508 startSymbol = next(alphabetIter) 1509 endSymbol = next(alphabetIter) 1510 symbolLengths[startSymbol.index] = \ 1511 symbolLengths[endSymbol.index] = lastLength 1512 #count the two just defined symbols 1513 total += 2*32768>>lastLength 1514 #note: loop may end because we're there 1515 #even if a 16 _appears_ to follow 1516 while True: 1517 #determine last symbol 1518 oldRepeat = repeat 1519 repeat = (repeat-2<<2)+extra+3 1520 #read as many symbols as repeat increased 1521 for i in range(oldRepeat, repeat): 1522 endSymbol = next(alphabetIter) 1523 symbolLengths[endSymbol.index] = lastLength 1524 #compute new total; it may be end of loop 1525 total += (repeat-oldRepeat)*32768>>lastLength 1526 if total>=32768: break 1527 #see if there is more to do 1528 length = lengthCode.decodePeek( 1529 self.stream.peek(lengthCode.maxLength))[1].index 1530 if length!=16: break 1531 lengthCode.description = 'total {}+{{1}} symbols'.format( 1532 (repeat-2<<2)+3) 1533 extra = self.verboseRead(lengthCode) 1534 elif length==17: 1535 #read, and show explanation 1536 lengthCode.description = '{1}+3 unused' 1537 extra = self.verboseRead(lengthCode) 1538 #scan series of 17s (groups of zero counts) 1539 #start with repeat count 2 1540 repeat = 2 1541 startSymbol = next(alphabetIter) 1542 endSymbol = next(alphabetIter) 1543 #note: loop will not end with total==32768, 1544 #since total doesn't change here 1545 while True: 1546 #determine last symbol 1547 oldRepeat = repeat 1548 repeat = (repeat-2<<3)+extra+3 1549 #read as many symbols as repeat increases 1550 for i in range(repeat-oldRepeat): 1551 endSymbol = next(alphabetIter) 1552 #see if there is more to do 1553 length = lengthCode.decodePeek( 1554 self.stream.peek(lengthCode.maxLength))[1].index 1555 if length!=17: break 1556 lengthCode.description = 'total {}+{{1}} unused'.format( 1557 (repeat-2<<3)+3) 1558 extra = self.verboseRead(lengthCode) 1559 else: 1560 symbol = next(alphabetIter) 1561 #double braces for format 1562 char = str(symbol) 1563 if char in '{}': char *= 2 1564 lengthCode.description = \ 1565 'Length for {} is {{0.index}} bits'.format(char) 1566 #output is not needed (will be 0) 1567 self.verboseRead(lengthCode) 1568 symbolLengths[symbol.index] = length 1569 total += 32768>>length 1570 lastLength = length 1571 assert total==32768 1572 alphabet.setLength(symbolLengths) 1573 print('End of table. Prefix code '+alphabet.name+':') 1574 alphabet.showCode() 1575 1576 #stream 1577 def processStream(self): 1578 """Process a brotli stream. 1579 """ 1580 print('addr hex{:{}s}binary context explanation'.format( 1581 '', self.width-10)) 1582 print('Stream header'.center(60, '-')) 1583 self.windowSize = self.verboseRead(WindowSizeAlphabet()) 1584 print('Metablock header'.center(60, '=')) 1585 self.ISLAST = False 1586 self.output = bytearray() 1587 while not self.ISLAST: 1588 self.ISLAST = self.verboseRead( 1589 BoolCode('LAST', description="Last block")) 1590 if self.ISLAST: 1591 if self.verboseRead( 1592 BoolCode('EMPTY', description="Empty block")): break 1593 if self.metablockLength(): continue 1594 if not self.ISLAST and self.uncompressed(): continue 1595 print('Block type descriptors'.center(60, '-')) 1596 self.numberOfBlockTypes = {} 1597 self.currentBlockCounts = {} 1598 self.blockTypeCodes = {} 1599 self.blockCountCodes = {} 1600 for blockType in (L,I,D): self.blockType(blockType) 1601 print('Distance code parameters'.center(60, '-')) 1602 self.NPOSTFIX, self.NDIRECT = self.verboseRead(DistanceParamAlphabet()) 1603 self.readLiteralContextModes() 1604 print('Context maps'.center(60, '-')) 1605 self.cmaps = {} 1606 #keep the number of each kind of prefix tree for the last loop 1607 numberOfTrees = {I: self.numberOfBlockTypes[I]} 1608 for blockType in (L,D): 1609 numberOfTrees[blockType] = self.contextMap(blockType) 1610 print('Prefix code lists'.center(60, '-')) 1611 self.prefixCodes = {} 1612 for blockType in (L,I,D): 1613 self.readPrefixArray(blockType, numberOfTrees[blockType]) 1614 self.metablock() 1615 1616 #metablock header 1617 def verboseRead(self, alphabet, context='', skipExtra=False): 1618 """Read symbol and extra from stream and explain what happens. 1619 Returns the value of the symbol 1620 >>> olleke.pos = 0 1621 >>> l = Layout(olleke) 1622 >>> l.verboseRead(WindowSizeAlphabet()) 1623 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 1624 4194288 1625 """ 1626 #TODO 2: verbosity level, e.g. show only codes and maps in header 1627 stream = self.stream 1628 pos = stream.pos 1629 if skipExtra: 1630 length, symbol = alphabet.readTuple(stream) 1631 extraBits, extra = 0, None 1632 else: 1633 length, symbol, extraBits, extra = alphabet.readTupleAndExtra( 1634 stream) 1635 #fields: address, hex data, binary data, name of alphabet, explanation 1636 hexdata = self.makeHexData(pos) 1637 addressField = '{:04x}'.format(pos+7>>3) if hexdata else '' 1638 bitdata = self.formatBitData(pos, length, extraBits) 1639 #bitPtr moves bitdata so that the bytes are easier to read 1640 #jump back to right if a new byte starts 1641 if '|' in bitdata[1:]: 1642 #start over on the right side 1643 self.bitPtr = self.width 1644 fillWidth = self.bitPtr-(len(hexdata)+len(bitdata)) 1645 if fillWidth<0: fillWidth = 0 1646 print('{:<5s} {:<{}s} {:7s} {}'.format( 1647 addressField, 1648 hexdata+' '*fillWidth+bitdata, self.width, 1649 context+alphabet.name, 1650 symbol if skipExtra else symbol.explanation(extra), 1651 )) 1652 #jump to the right if we started with a '|' 1653 #because we didn't jump before printing 1654 if bitdata.startswith('|'): self.bitPtr = self.width 1655 else: self.bitPtr -= len(bitdata) 1656 return symbol if skipExtra else symbol.value(extra) 1657 1658 def metablockLength(self): 1659 """Read MNIBBLES and meta block length; 1660 if empty block, skip block and return true. 1661 """ 1662 self.MLEN = self.verboseRead(MetablockLengthAlphabet()) 1663 if self.MLEN: 1664 return False 1665 #empty block; skip and return False 1666 self.verboseRead(ReservedAlphabet()) 1667 MSKIP = self.verboseRead(SkipLengthAlphabet()) 1668 self.verboseRead(FillerAlphabet(streamPos=self.stream.pos)) 1669 self.stream.pos += 8*MSKIP 1670 print("Skipping to {:x}".format(self.stream.pos>>3)) 1671 return True 1672 1673 def uncompressed(self): 1674 """If true, handle uncompressed data 1675 """ 1676 ISUNCOMPRESSED = self.verboseRead( 1677 BoolCode('UNCMPR', description='Is uncompressed?')) 1678 if ISUNCOMPRESSED: 1679 self.verboseRead(FillerAlphabet(streamPos=self.stream.pos)) 1680 print('Uncompressed data:') 1681 self.output += self.stream.readBytes(self.MLEN) 1682 print(outputFormatter(self.output[-self.MLEN:])) 1683 return ISUNCOMPRESSED 1684 1685 def blockType(self, kind): 1686 """Read block type switch descriptor for given kind of blockType.""" 1687 NBLTYPES = self.verboseRead(TypeCountAlphabet( 1688 'BT#'+kind[0].upper(), 1689 description='{} block types'.format(kind), 1690 )) 1691 self.numberOfBlockTypes[kind] = NBLTYPES 1692 if NBLTYPES>=2: 1693 self.blockTypeCodes[kind] = self.readPrefixCode( 1694 BlockTypeAlphabet('BT'+kind[0].upper(), NBLTYPES)) 1695 self.blockCountCodes[kind] = self.readPrefixCode( 1696 BlockCountAlphabet('BC'+kind[0].upper())) 1697 blockCount = self.verboseRead(self.blockCountCodes[kind]) 1698 else: 1699 blockCount = 1<<24 1700 self.currentBlockCounts[kind] = blockCount 1701 1702 def readLiteralContextModes(self): 1703 """Read literal context modes. 1704 LSB6: lower 6 bits of last char 1705 MSB6: upper 6 bits of last char 1706 UTF8: rougly dependent on categories: 1707 upper 4 bits depend on category of last char: 1708 control/whitespace/space/ punctuation/quote/%/open/close/ 1709 comma/period/=/digits/ VOWEL/CONSONANT/vowel/consonant 1710 lower 2 bits depend on category of 2nd last char: 1711 space/punctuation/digit or upper/lowercase 1712 signed: hamming weight of last 2 chars 1713 """ 1714 print('Context modes'.center(60, '-')) 1715 self.literalContextModes = [] 1716 for i in range(self.numberOfBlockTypes[L]): 1717 self.literalContextModes.append( 1718 self.verboseRead(LiteralContextMode(number=i))) 1719 1720 def contextMap(self, kind): 1721 """Read context maps 1722 Returns the number of differnt values on the context map 1723 (In other words, the number of prefix trees) 1724 """ 1725 NTREES = self.verboseRead(TypeCountAlphabet( 1726 kind[0].upper()+'T#', 1727 description='{} prefix trees'.format(kind))) 1728 mapSize = {L:64, D:4}[kind] 1729 if NTREES<2: 1730 self.cmaps[kind] = [0]*mapSize 1731 else: 1732 #read CMAPkind 1733 RLEMAX = self.verboseRead(RLEmaxAlphabet( 1734 'RLE#'+kind[0].upper(), 1735 description=kind+' context map')) 1736 alphabet = TreeAlphabet('CM'+kind[0].upper(), NTREES=NTREES, RLEMAX=RLEMAX) 1737 cmapCode = self.readPrefixCode(alphabet) 1738 tableSize = mapSize*self.numberOfBlockTypes[kind] 1739 cmap = [] 1740 while len(cmap)<tableSize: 1741 cmapCode.description = 'map {}, entry {}'.format( 1742 *divmod(len(cmap), mapSize)) 1743 count, value = self.verboseRead(cmapCode) 1744 cmap.extend([value]*count) 1745 assert len(cmap)==tableSize 1746 IMTF = self.verboseRead(BoolCode('IMTF', description='Apply inverse MTF')) 1747 if IMTF: 1748 self.IMTF(cmap) 1749 if kind==L: 1750 print('Context maps for literal data:') 1751 for i in range(0, len(cmap), 64): 1752 print(*( 1753 ''.join(map(str, cmap[j:j+8])) 1754 for j in range(i, i+64, 8) 1755 )) 1756 else: 1757 print('Context map for distances:') 1758 print(*( 1759 ''.join(map('{:x}'.format, cmap[i:i+4])) 1760 for i in range(0, len(cmap), 4) 1761 )) 1762 self.cmaps[kind] = cmap 1763 return NTREES 1764 1765 @staticmethod 1766 def IMTF(v): 1767 """In place inverse move to front transform. 1768 """ 1769 #mtf is initialized virtually with range(infinity) 1770 mtf = [] 1771 for i, vi in enumerate(v): 1772 #get old value from mtf. If never seen, take virtual value 1773 try: value = mtf.pop(vi) 1774 except IndexError: value = vi 1775 #put value at front 1776 mtf.insert(0, value) 1777 #replace transformed value 1778 v[i] = value 1779 1780 def readPrefixArray(self, kind, numberOfTrees): 1781 """Read prefix code array""" 1782 prefixes = [] 1783 for i in range(numberOfTrees): 1784 if kind==L: alphabet = LiteralAlphabet(i) 1785 elif kind==I: alphabet = InsertAndCopyAlphabet(i) 1786 elif kind==D: alphabet = DistanceAlphabet( 1787 i, NPOSTFIX=self.NPOSTFIX, NDIRECT=self.NDIRECT) 1788 self.readPrefixCode(alphabet) 1789 prefixes.append(alphabet) 1790 self.prefixCodes[kind] = prefixes 1791 1792 #metablock data 1793 def metablock(self): 1794 """Process the data. 1795 Relevant variables of self: 1796 numberOfBlockTypes[kind]: number of block types 1797 currentBlockTypes[kind]: current block types (=0) 1798 literalContextModes: the context modes for the literal block types 1799 currentBlockCounts[kind]: counters for block types 1800 blockTypeCodes[kind]: code for block type 1801 blockCountCodes[kind]: code for block count 1802 cmaps[kind]: the context maps (not for I) 1803 prefixCodes[kind][#]: the prefix codes 1804 lastDistances: the last four distances 1805 lastChars: the last two chars 1806 output: the result 1807 """ 1808 print('Meta block contents'.center(60, '=')) 1809 self.currentBlockTypes = {L:0, I:0, D:0, pL:1, pI:1, pD:1} 1810 self.lastDistances = deque([17,16,11,4], maxlen=4) 1811 #the current context mode is for block type 0 1812 self.contextMode = ContextModeKeeper(self.literalContextModes[0]) 1813 wordList = WordList() 1814 1815 #setup distance callback function 1816 def distanceCallback(symbol, extra): 1817 "callback function for displaying decoded distance" 1818 index, offset = symbol.value(extra) 1819 if index: 1820 #recent distance 1821 distance = self.lastDistances[-index]+offset 1822 return 'Distance: {}last{:+d}={}'.format(index, offset, distance) 1823 #absolute value 1824 if offset<=maxDistance: 1825 return 'Absolute value: {} (pos {})'.format(offset, maxDistance-offset) 1826 #word list value 1827 action, word = divmod(offset-maxDistance, 1<<wordList.NDBITS[copyLen]) 1828 return '{}-{} gives word {},{} action {}'.format( 1829 offset, maxDistance, copyLen, word, action) 1830 for dpc in self.prefixCodes[D]: dpc.callback = distanceCallback 1831 1832 blockLen = 0 1833 #there we go 1834 while blockLen<self.MLEN: 1835 #get insert© command 1836 litLen, copyLen, dist0Flag = self.verboseRead( 1837 self.prefixCodes[I][ 1838 self.figureBlockType(I)]) 1839 #literal data 1840 for i in range(litLen): 1841 bt = self.figureBlockType(L) 1842 cm = self.contextMode.getIndex() 1843 ct = self.cmaps[L][bt<<6|cm] 1844 char = self.verboseRead( 1845 self.prefixCodes[L][ct], 1846 context='{},{}='.format(bt,cm)) 1847 self.contextMode.add(char) 1848 self.output.append(char) 1849 blockLen += litLen 1850 #check if we're done 1851 if blockLen>=self.MLEN: return 1852 #distance 1853 #distances are computed relative to output length, at most window size 1854 maxDistance = min(len(self.output), self.windowSize) 1855 if dist0Flag: 1856 distance = self.lastDistances[-1] 1857 else: 1858 bt = self.figureBlockType(D) 1859 cm = {2:0, 3:1, 4:2}.get(copyLen, 3) 1860 ct = self.cmaps[D][bt<<2|cm] 1861 index, offset = self.verboseRead( 1862 self.prefixCodes[D][ct], 1863 context='{},{}='.format(bt,cm)) 1864 distance = self.lastDistances[-index]+offset if index else offset 1865 if index==1 and offset==0: 1866 #to make sure distance is not put in last distance list 1867 dist0Flag = True 1868 if distance<=maxDistance: 1869 #copy from output 1870 for i in range( 1871 maxDistance-distance, 1872 maxDistance-distance+copyLen): 1873 self.output.append(self.output[i]) 1874 if not dist0Flag: self.lastDistances.append(distance) 1875 comment = 'Seen before' 1876 else: 1877 #fetch from wordlist 1878 newWord = wordList.word(copyLen, distance-maxDistance-1) 1879 self.output.extend(newWord) 1880 #adjust copyLen to reflect actual new data 1881 copyLen = len(newWord) 1882 comment = 'From wordlist' 1883 blockLen += copyLen 1884 print(' '*40, 1885 comment, 1886 ': "', 1887 outputFormatter(self.output[-copyLen:]), 1888 '"', 1889 sep='') 1890 self.contextMode.add(self.output[-2]) 1891 self.contextMode.add(self.output[-1]) 1892 1893 def figureBlockType(self, kind): 1894 counts, types = self.currentBlockCounts, self.currentBlockTypes 1895 if counts[kind]==0: 1896 newType = self.verboseRead(self.blockTypeCodes[kind]) 1897 if newType==-2: newType = types['P'+kind] 1898 elif newType==-1: 1899 newType = (types[kind]+1)%self.numberOfBlockTypes[kind] 1900 types['P'+kind] = types[kind] 1901 types[kind] = newType 1902 counts[kind] = self.verboseRead(self.blockCountCodes[kind]) 1903 counts[kind] -=1 1904 return types[kind] 1905 1906__test__ = { 1907'BitStream': """ 1908 >>> bs = BitStream(b'Jurjen') 1909 >>> bs.readBytes(2) 1910 b'Ju' 1911 >>> bs.read(6) #r=01110010 1912 50 1913 >>> bs 1914 BitStream(pos=2:6) 1915 >>> bs.peek(5) #j=01101010 1916 9 1917 >>> bs.readBytes(2) 1918 Traceback (most recent call last): 1919 ... 1920 ValueError: readBytes: need byte boundary 1921 """, 1922 1923'Symbol': """ 1924 >>> a=Symbol(MetablockLengthAlphabet(),5) 1925 >>> len(a) 1926 2 1927 >>> int(a) 1928 5 1929 >>> a.bitPattern() 1930 '01' 1931 >>> a.value(200000) 1932 200001 1933 >>> a.explanation(300000) 1934 'data length: 493e0h+1=300001' 1935 """, 1936 1937'RangeDecoder': """ 1938 >>> a=RangeDecoder(bitLength=3) 1939 >>> len(a) 1940 8 1941 >>> a.name='t' 1942 >>> list(a) 1943 [Symbol(t, 0), Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4), Symbol(t, 5), Symbol(t, 6), Symbol(t, 7)] 1944 >>> a[2] 1945 Symbol(t, 2) 1946 >>> a.bitPattern(4) 1947 '100' 1948 >>> a.length(2) 1949 3 1950 >>> a.decodePeek(15) 1951 (3, Symbol(t, 7)) 1952 >>> 1953 1954 """, 1955 1956'PrefixDecoder': """ 1957 >>> a=PrefixDecoder(decodeTable={0:1,1:2,3:3,7:4}) 1958 >>> len(a) 1959 4 1960 >>> a.name='t' 1961 >>> list(a) 1962 [Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4)] 1963 >>> a.decodePeek(22) 1964 (1, Symbol(t, 1)) 1965 >>> a.decodePeek(27) 1966 (3, Symbol(t, 3)) 1967 >>> a.length(1) 1968 1 1969 >>> a.length(4) 1970 3 1971 """, 1972 1973'Code': """ 1974 >>> a=Code('t',alphabetSize=10) 1975 >>> len(a) 1976 10 1977 >>> a.showCode() 1978 0000:0 0001:1 0010:2 0011:3 0100:4 0101:5 0110:6 0111:7 1000:8 1001:9 1979 >>> a.setLength({2:1,3:2,5:3,6:3}) 1980 >>> a.showCode() 1981 0:2 01:3 011:5 111:6 1982 >>> len(a) 1983 4 1984 >>> def callback(i): return 'call{}back'.format(i) 1985 >>> a=Code('t',callback=callback,bitLength=3) 1986 >>> a[6].explanation() 1987 'call6back' 1988 """, 1989 1990'WithExtra': """ 1991 >>> class A(WithExtra): 1992 ... extraTable = [0,1,1,2,2] 1993 >>> a=A('t',alphabetSize=5) 1994 >>> a[1] 1995 Symbol(t, 1) 1996 >>> a.extraBits(2) 1997 1 1998 >>> a.mnemonic(4) 1999 '4' 2000 >>> a.readTupleAndExtra(BitStream(b'\x5b')) 2001 (3, Symbol(t, 3), 2, 3) 2002 """, 2003 2004'BoolCode': """ 2005 >>> BoolCode('test')[0].explanation() 2006 '0: False' 2007 """, 2008 2009'Enumerator': """ 2010 >>> class A(Enumerator): 2011 ... extraTable = [0,1,1,2,2] 2012 ... value0=3 2013 >>> a=A(alphabetLength=5) 2014 >>> a.value(3) 2015 Traceback (most recent call last): 2016 ... 2017 TypeError: value() missing 1 required positional argument: 'extra' 2018 >>> a.explanation(3,4) 2019 'xx 011: 8-11; 8+4=12' 2020 """, 2021 2022'WindowSizeAlphabet': """ 2023 >>> windowSizeAlphabet = WindowSizeAlphabet() 2024 >>> windowSizeAlphabet[0] 2025 Traceback (most recent call last): 2026 ... 2027 ValueError: No symbol WindowSizeAlphabet[0] 2028 >>> len(windowSizeAlphabet) 2029 16 2030 >>> windowSizeAlphabet[21] 2031 Symbol(WSIZE, 21) 2032 >>> windowSizeAlphabet[21].bitPattern() 2033 '1001' 2034 >>> windowSizeAlphabet[21].extraBits() 2035 0 2036 >>> windowSizeAlphabet[21].index 2037 21 2038 >>> windowSizeAlphabet[10].value() 2039 1008 2040 >>> windowSizeAlphabet[10].explanation() 2041 'windowsize=(1<<10)-16=1008' 2042 >>> windowSizeAlphabet.showCode() 2043 0:65520 1100001:16368 1110001:32752 0011:262128 2044 0000001:131056 0010001:None 1001:2097136 1011:4194288 2045 1000001:4080 1010001:8176 0101:524272 0111:1048560 2046 0100001:1008 0110001:2032 1101:8388592 1111:16777200 2047 """, 2048 2049'TypeCountAlphabet': """ 2050 >>> typeCountAlphabet = TypeCountAlphabet(description='bananas') 2051 >>> len(typeCountAlphabet) 2052 9 2053 >>> typeCountAlphabet[3] 2054 Symbol(BT#, 3) 2055 >>> typeCountAlphabet[9] 2056 Traceback (most recent call last): 2057 ... 2058 ValueError: No symbol TypeCountAlphabet[9] 2059 >>> print(typeCountAlphabet[3]) 2060 xx,0101 2061 >>> typeCountAlphabet[8].value(127) 2062 256 2063 >>> typeCountAlphabet[4].explanation(2) 2064 'xxx,0111: 11 bananas' 2065 >>> typeCountAlphabet[0].explanation() 2066 '0: 1 banana' 2067 """, 2068 2069'DistanceParamAlphabet': """ 2070 >>> dpa = DistanceParamAlphabet() 2071 >>> dpa.showCode() 2072 00:PF0 01:PF1 10:PF2 11:PF3 2073 >>> dpa.readTupleAndExtra(BitStream(b'\\x29')) 2074 (2, Symbol(DIST, 1), 4, 10) 2075 >>> dpa.explanation(2, 5) 2076 '2 postfix bits and 0101<<2=20 direct codes' 2077 """, 2078 2079'LiteralAlphabet': """ 2080 >>> LiteralAlphabet(-1).showCode() #doctest: +ELLIPSIS 2081 00000000:\\x00 00110100:4 01101000:h 10011100:\\x9c 11010000:\\xd0 2082 00000001:\\x01 00110101:5 01101001:i 10011101:\\x9d 11010001:\\xd1 2083 00000010:\\x02 00110110:6 01101010:j 10011110:\\x9e 11010010:\\xd2 2084 ... 2085 00101111:/ 01100011:c 10010111:\\x97 11001011:\\xcb 11111111:\\xff 2086 00110000:0 01100100:d 10011000:\\x98 11001100:\\xcc 2087 00110001:1 01100101:e 10011001:\\x99 11001101:\\xcd 2088 00110010:2 01100110:f 10011010:\\x9a 11001110:\\xce 2089 00110011:3 01100111:g 10011011:\\x9b 11001111:\\xcf 2090 """, 2091 2092'BlockCountAlphabet': """ 2093 >>> bc=BlockCountAlphabet('BCL') 2094 >>> len(bc) 2095 26 2096 >>> bs=BitStream(b'\\x40\\x83\\xc8\\x59\\12\\x02') 2097 >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) 2098 'Block count: xx 00000: 1-4; 1+2=3' 2099 >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) 2100 'Block count: xxx 00110: 33-40; 33+0=33' 2101 >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) 2102 'Block count: xxxxxx 10001: 305-368; 305+28=333' 2103 >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3]) 2104 'Block count: xxxxxxxxxxx 10110: 2289-4336; 2289+1044=3333' 2105 """, 2106 2107'Layout': """ 2108 >>> olleke.pos = 0 2109 >>> l = Layout(olleke) 2110 >>> l.verboseRead(WindowSizeAlphabet()) 2111 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 2112 4194288 2113 >>> l.verboseRead(BoolCode('LAST', description="Last block")) 2114 1 LAST Last block: 1: True 2115 True 2116 >>> l.verboseRead(BoolCode('EMPTY', description="Empty block")) 2117 0 EMPTY Empty block: 0: False 2118 False 2119 >>> l.verboseRead(MetablockLengthAlphabet()) 2120 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47 2121 47 2122 >>> olleke.pos = 76 2123 >>> l = Layout(olleke) 2124 >>> x = l.verboseRead(DistanceAlphabet(0,NPOSTFIX=0,NDIRECT=0), skipExtra=True) 2125 000a 82 10|1100 D0 10[15*x]-3 2126 >>> x.explanation(0x86a3) 2127 '10[1000011010100011]-3: [0]+100000' 2128 """, 2129 2130'olleke': """ 2131 >>> olleke.pos = 0 2132 >>> try: Layout(olleke).processStream() 2133 ... except NotImplementedError: pass 2134 ... #doctest: +REPORT_NDIFF 2135 addr hex binary context explanation 2136 -----------------------Stream header------------------------ 2137 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 2138 ======================Metablock header====================== 2139 1 LAST Last block: 1: True 2140 0 EMPTY Empty block: 0: False 2141 0001 2e 00 |00h|2Eh,|00 MLEN data length: 002eh+1=47 2142 -------------------Block type descriptors------------------- 2143 0003 00 0 BT#L 0: 1 literal block type 2144 0 BT#I 0: 1 insert© block type 2145 0 BT#D 0: 1 distance block type 2146 ------------------Distance code parameters------------------ 2147 0004 44 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes 2148 -----------------------Context modes------------------------ 2149 10 LC0 Context mode for type 0: 2(UTF8) 2150 ------------------------Context maps------------------------ 2151 0 LT# 0: 1 literal prefix tree 2152 0 DT# 0: 1 distance prefix tree 2153 ---------------------Prefix code lists---------------------- 2154 10 PFX L0 is complex with lengths 3,4,0,5,17... 2155 0005 4f 1|0 ##L0 len 3: coded with 3 bits 2156 0111 ##L0 len 4: coded with 1 bits 2157 10 ##L0 unused: coded with 3 bits 2158 0006 d6 0|0 ##L0 len 5: skipped 2159 011 ##L0 zero xxx: coded with 2 bits 2160 ***** Lengths for L0 will be coded as: 2161 0:len 4 01:zero xxx 011:unused 111:len 3 2162 0007 95 1|11,01 #L0 7+3 unused 2163 0 #L0 Length for \\n is 4 bits 2164 001,01 #L0 1+3 unused 2165 0008 44 010,0|1 #L0 total 19+2 unused 2166 0 #L0 Length for " " is 4 bits 2167 0 #L0 Length for ! is 4 bits 2168 0009 cb 011,|01 #L0 3+3 unused 2169 |110,01 #L0 total 35+6 unused 2170 000a 82 0 #L0 Length for K is 4 bits 2171 000,01 #L0 0+3 unused 2172 0 #L0 Length for O is 4 bits 2173 000b 4d 01|1 #L0 symbol P unused 2174 011 #L0 symbol Q unused 2175 0 #L0 Length for R is 4 bits 2176 000c 88 000,|01 #L0 0+3 unused 2177 |100,01 #L0 total 11+4 unused 2178 000d b6 0 #L0 Length for b is 4 bits 2179 011 #L0 symbol c unused 2180 011 #L0 symbol d unused 2181 000e 27 11|1 #L0 Length for e is 3 bits 2182 010,01 #L0 2+3 unused 2183 |0 #L0 Length for k is 4 bits 2184 000f 1f 111 #L0 Length for l is 3 bits 2185 011 #L0 symbol m unused 2186 0 #L0 Length for n is 4 bits 2187 |0 #L0 Length for o is 4 bits 2188 0010 c1 000,01 #L0 0+3 unused 2189 0 #L0 Length for s is 4 bits 2190 0011 b4 0|11 #L0 symbol t unused 2191 0 #L0 Length for u is 4 bits 2192 End of table. Prefix code L0: 2193 000:e 0010:\\n 0110:! 0001:O 0101:b 0011:n 0111:s 2194 100:l 1010:" " 1110:K 1001:R 1101:k 1011:o 1111:u 2195 11,01 PFX IC0 is simple with 4 code words 2196 0012 2a |2Ah|10 IC0 ? bits: I5C4 2197 0013 b5 ec 00|B5h IC0 ? bits: I6+xC7 2198 0015 22 0010|111011 IC0 ? bits: I8+xC5 2199 0016 8c 001100|0010 IC0 ? bits: I0C14+xx 2200 0 SHAPE False: lengths 2,2,2,2 2201 0017 74 10,0|1 PFX D0 is simple with 3 code words 2202 0018 a6 0|01110 D0 1 bit: 2last-3 2203 010011 D0 2 bits: 11xx-3 2204 0019 aa 01010|1 D0 2 bits: 11xxx-3 2205 ====================Meta block contents===================== 2206 |1,01 IC0 Literal: 9, copy: 5 2207 001a 41 0001 0,0=L0 O 2208 100 0,48=L0 l 2209 001b a2 10|0 0,62=L0 l 2210 000 0,63=L0 e 2211 001c a1 1|101 0,59=L0 k 2212 000 0,63=L0 e 2213 |1010 0,59=L0 " " 2214 001d b5 0101 0,11=L0 b 2215 |1011 0,60=L0 o 2216 001e 24 0 0,3=D0 Distance: 2last-3=8 2217 Seen before: "lleke" 2218 0,10 IC0 Literal: 6, copy: 7 2219 |0010 0,59=L0 \\n 2220 001f 89 1001 0,7=L0 R 2221 000 0,52=L0 e 2222 0020 fa 010|1 0,58=L0 b 2223 1111 0,63=L0 u 2224 0021 eb 011|1 0,59=L0 s 2225 11,01 0,3=D0 Absolute value: 12 (pos 8) 2226 Seen before: "olleke\\n" 2227 0022 db 01,1|1 IC0 Literal: 0, copy: 15 2228 |110,11 0,3=D0 Absolute value: 27 (pos 0) 2229 Seen before: "Olleke bolleke\\n" 2230 0023 f8 00 IC0 Literal: 5, copy: 4 2231 1110 0,7=L0 K 2232 0024 2c 00|11 0,52=L0 n 2233 1011 0,62=L0 o 2234 0025 0d 1|00 0,59=L0 l 2235 0110 0,63=L0 ! 2236 """, 2237 2238'file': """ 2239 >>> try: Layout(BitStream( 2240 ... open("H:/Downloads/brotli-master/tests/testdata/10x10y.compressed",'rb') 2241 ... .read())).processStream() 2242 ... except NotImplementedError: pass 2243 addr hex binary context explanation 2244 -----------------------Stream header------------------------ 2245 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 2246 ======================Metablock header====================== 2247 1 LAST Last block: 1: True 2248 0 EMPTY Empty block: 0: False 2249 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20 2250 -------------------Block type descriptors------------------- 2251 0003 00 0 BT#L 0: 1 literal block type 2252 0 BT#I 0: 1 insert© block type 2253 0 BT#D 0: 1 distance block type 2254 ------------------Distance code parameters------------------ 2255 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes 2256 -----------------------Context modes------------------------ 2257 10 LC0 Context mode for type 0: 2(UTF8) 2258 ------------------------Context maps------------------------ 2259 0 LT# 0: 1 literal prefix tree 2260 0 DT# 0: 1 distance prefix tree 2261 ---------------------Prefix code lists---------------------- 2262 0005 b0 0|1,01 PFX L0 is simple with 2 code words 2263 0006 b2 0|1011000 L0 1 bit: X 2264 0007 ea 0|1011001 L0 1 bit: Y 2265 01,01 PFX IC0 is simple with 2 code words 2266 0008 81 0000001|111 IC0 1 bit: I1C9&D=0 2267 0009 47 02 0|47h|1 IC0 1 bit: I1C9 2268 00,01 PFX D0 is simple with 1 code word 2269 000b 8a 010|000 D0 0 bits: 10x-3 2270 ====================Meta block contents===================== 2271 1 IC0 Literal: 1, copy: 9 2272 0 0,0=L0 X 2273 0,() 0,3=D0 Absolute value: 1 (pos 0) 2274 Seen before: "XXXXXXXXX" 2275 0 IC0 Literal: 1, copy: 9, same distance 2276 |1 0,54=L0 Y 2277 Seen before: "YYYYYYYYY" 2278 """, 2279 2280'XY': """ 2281 >>> try: Layout(BitStream(brotli.compress('X'*10+'Y'*10))).processStream() 2282 ... except NotImplementedError: pass 2283 addr hex binary context explanation 2284 -----------------------Stream header------------------------ 2285 0000 1b 1011 WSIZE windowsize=(1<<22)-16=4194288 2286 ======================Metablock header====================== 2287 1 LAST Last block: 1: True 2288 0 EMPTY Empty block: 0: False 2289 0001 13 00 |00h|13h,|00 MLEN data length: 0013h+1=20 2290 -------------------Block type descriptors------------------- 2291 0003 00 0 BT#L 0: 1 literal block type 2292 0 BT#I 0: 1 insert© block type 2293 0 BT#D 0: 1 distance block type 2294 ------------------Distance code parameters------------------ 2295 0004 a4 0|000,00 DIST 0 postfix bits and 0000<<0=0 direct codes 2296 -----------------------Context modes------------------------ 2297 10 LC0 Context mode for type 0: 2(UTF8) 2298 ------------------------Context maps------------------------ 2299 0 LT# 0: 1 literal prefix tree 2300 0 DT# 0: 1 distance prefix tree 2301 ---------------------Prefix code lists---------------------- 2302 0005 b0 0|1,01 PFX L0 is simple with 2 code words 2303 0006 b2 0|1011000 L0 1 bit: X 2304 0007 82 0|1011001 L0 1 bit: Y 2305 00,01 PFX IC0 is simple with 1 code word 2306 0008 84 0000100|100 IC0 0 bits: I4C6&D=0 2307 0009 00 00,0|1 PFX D0 is simple with 1 code word 2308 000a e0 0|00000 D0 0 bits: last 2309 ====================Meta block contents===================== 2310 () IC0 Literal: 4, copy: 6, same distance 2311 0 0,0=L0 X 2312 0 0,52=L0 X 2313 0 0,54=L0 X 2314 0 0,54=L0 X 2315 Seen before: "XXXXXX" 2316 () IC0 Literal: 4, copy: 6, same distance 2317 1 0,54=L0 Y 2318 1 0,54=L0 Y 2319 |1 0,54=L0 Y 2320 000b 01 1 0,54=L0 Y 2321 Seen before: "YYYYYY" 2322 """, 2323 2324'empty': """ 2325 >>> try: Layout(BitStream(b'\\x81\\x16\\x00\\x58')).processStream() 2326 ... except NotImplementedError: pass 2327 addr hex binary context explanation 2328 -----------------------Stream header------------------------ 2329 0000 81 0000001 WSIZE windowsize=(1<<17)-16=131056 2330 ======================Metablock header====================== 2331 |1 LAST Last block: 1: True 2332 0001 16 0 EMPTY Empty block: 0: False 2333 11 MLEN 11: empty block 2334 0 RSVD Reserved (must be zero) 2335 0002 00 000000|00,01 SKIP skip length: 0h+1=1 2336 |00 SKIP 2 bits ignored 2337 Skipping to 4 2338 """, 2339 2340} 2341 2342if __name__=='__main__': 2343 import sys 2344 if len(sys.argv)>1: 2345 l = Layout(BitStream(open(sys.argv[1],'rb').read())) 2346 l.processStream() 2347 else: 2348 sys.path.append("h:/Persoonlijk/bin") 2349 try: 2350 import brotli 2351 open('brotlidump.br', 'wb').write( 2352 brotli.compress( 2353 open('brotlidump.py', 'r').read() 2354 )) 2355 olleke = BitStream(brotli.compress( 2356 'Olleke bolleke\nRebusolleke\nOlleke bolleke\nKnol!')) 2357 except ImportError: pass 2358 import doctest 2359 doctest.testmod(optionflags=doctest.REPORT_NDIFF 2360 #|doctest.FAIL_FAST 2361 ) 2362