• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!python3
2"""Program to dump contents of Brotli compressed files showing the compression format.
3Jurjen N.E. Bos, 2016.
4I found the following issues with the Brotli format:
5- The distance alphabet has size 16+(48<<POSTFIX),
6  but the last symbols are useless.
7  It could be lowered to 16+(44-POSTFIX<<POSTFIX), and this could matter.
8- The block type code is useless if NBLTYPES==2, you would only need 1 symbol
9  anyway, so why don't you just switch to "the other" type?
10"""
11import struct
12from operator import itemgetter, methodcaller
13from itertools import accumulate, repeat
14from collections import defaultdict, deque
15from functools import partial
16
17class InvalidStream(Exception): pass
18#lookup table
19L, I, D = "literal", "insert&copy", "distance"
20pL, pI, pD = 'P'+L, 'P'+I, 'P'+D
21
22def outputCharFormatter(c):
23    """Show character in readable format
24    """
25    #TODO 2: allow hex only output
26    if 32<c<127: return chr(c)
27    elif c==10: return '\\n'
28    elif c==13: return '\\r'
29    elif c==32: return '" "'
30    else: return '\\x{:02x}'.format(c)
31
32def outputFormatter(s):
33    """Show string or char.
34    """
35    result = ''
36    def formatSubString(s):
37        for c in s:
38            if c==32: yield ' '
39            else: yield outputCharFormatter(c)
40    if len(result)<200: return ''.join(formatSubString(s))
41    else:
42        return ''.join(formatSubString(s[:100]))+'...'+ \
43               ''.join(formatSubString(s[-100:]))
44
45
46class BitStream:
47    """Represent a bytes object. Can read bits and prefix codes the way
48    Brotli does.
49    """
50    def __init__(self, byteString):
51        self.data = byteString
52        #position in bits: byte pos is pos>>3, bit pos is pos&7
53        self.pos = 0
54
55    def __repr__(self):
56        """Representation
57        >>> olleke
58        BitStream(pos=0:0)
59        """
60        return "BitStream(pos={:x}:{})".format(self.pos>>3, self.pos&7)
61
62    def read(self, n):
63        """Read n bits from the stream and return as an integer.
64        Produces zero bits beyond the stream.
65        >>> olleke.data[0]==27
66        True
67        >>> olleke.read(5)
68        27
69
70        >>> olleke
71        BitStream(pos=0:5)
72        """
73        value = self.peek(n)
74        self.pos += n
75        if self.pos>len(self.data)*8:
76            raise ValueError('Read past end of stream')
77        return value
78
79    def peek(self, n):
80        """Peek an n bit integer from the stream without updating the pointer.
81        It is not an error to read beyond the end of the stream.
82        >>> olleke.data[:2]==b'\x1b\x2e' and 0x2e1b==11803
83        True
84        >>> olleke.peek(15)
85        11803
86        >>> hex(olleke.peek(32))
87        '0x2e1b'
88        """
89        #read bytes that contain the data: self.data[self.pos>>3:self.pos+n+7>>3]
90        #convert to int: int.from_bytes(..., 'little')
91        #shift out the bits from the first byte: >>(self.pos&7)
92        #mask unwanted bits: & (1<<n)-1
93        return int.from_bytes(
94            self.data[self.pos>>3:self.pos+n+7>>3],
95            'little')>>(self.pos&7) & (1<<n)-1
96
97    def readBytes(self, n):
98        """Read n bytes from the stream on a byte boundary.
99        """
100        if self.pos&7: raise ValueError('readBytes: need byte boundary')
101        result = self.data[self.pos>>3:(self.pos>>3)+n]
102        self.pos += 8*n
103        return result
104
105#-----------------------Symbol-------------------------------------------
106class Symbol:
107    """A symbol in a code.
108    Refers back to the code that contains it.
109    Index is the place in the alphabet of the symbol.
110    """
111    __slots__ = 'code', 'index'
112    def __init__(self, code, value):
113        self.code = code
114        self.index = value
115
116    def __repr__(self):
117        return 'Symbol({}, {})'.format(self.code.name, self.index)
118
119    def __len__(self):
120        """Number of bits in the prefix notation of this symbol
121        """
122        return self.code.length(self.index)
123
124    def __int__(self):
125        return self.index
126
127    #these routines call equivalent routine in Code class
128    def bitPattern(self):
129        """Value of the symbol in the stream
130        """
131        return self.code.bitPattern(self.index)
132
133    def extraBits(self):
134        """Number of extra bits to read for this symbol
135        """
136        return self.code.extraBits(self.index)
137
138    def __str__(self):
139        """Short descriptor of the symbol without extra bits.
140        """
141        return self.code.mnemonic(self.index)
142
143    #requiring optional extra bits, if self.code supports them
144    def value(self, extra=None):
145        """The value used for processing. Can be a tuple.
146        with optional extra bits
147        """
148        if isinstance(self.code, WithExtra):
149            if not 0<=extra<1<<self.extraBits():
150                raise ValueError("value: extra value doesn't fit in extraBits")
151            return self.code.value(self.index, extra)
152        if extra is not None:
153            raise ValueError('value: no extra bits for this code')
154        return self.code.value(self.index)
155
156    def explanation(self, extra=None):
157        """Long explanation of the value from the numeric value
158        with optional extra bits
159        Used by Layout.verboseRead when printing the value
160        """
161        if isinstance(self.code, WithExtra):
162            return self.code.callback(self, extra)
163        return self.code.callback(self)
164
165#========================Code definitions==================================
166class RangeDecoder:
167    """A decoder for the Code class that assumes the symbols
168    are encoded consecutively in binary.
169    It all depends on the "alphabetSize" property.
170    The range runs from 0 to alphabetSize-1.
171    This is the default decoder.
172    """
173    def __init__(self, *, alphabetSize=None, bitLength=None, **args):
174        if bitLength is not None: alphabetSize = 1<<bitLength
175        if alphabetSize is not None:
176            self.alphabetSize = alphabetSize
177            self.maxLength = (alphabetSize-1).bit_length()
178
179    def __len__(self):
180        return self.alphabetSize
181
182    def __iter__(self):
183        """Produce all symbols.
184        """
185        return map(partial(Symbol, self), range(len(self)))
186
187    def __getitem__(self, index):
188        if index>=self.alphabetSize: raise ValueError('index out of range')
189        return Symbol(self, index)
190
191    def bitPattern(self, index):
192        return '{:0{}b}'.format(index, self.maxLength)
193
194    def length(self, index):
195        """Encoding length of given symbol.
196        Does not depend on index in this case.
197        """
198        return self.maxLength
199
200    def decodePeek(self, data):
201        """Find which symbol index matches the given data (from peek, as a number)
202        and return the number of bits decoded.
203        Can also be used to figure out length of a symbol.
204        """
205        return self.maxLength, Symbol(self, data&(1<<self.maxLength)-1)
206
207class PrefixDecoder:
208    """A decoder for the Code class that uses a prefix code.
209    The code is determined by encoding:
210    encode[p] gives the index corresponding to bit pattern p.
211    Used setDecode(decodeTable) to switch the decoder from the default
212    to a prefix decoder, or pass decodeTable at init.
213    You can also use setLength(lengthTable)
214    to define the encoding from the lengths.
215    The set of symbol values does not need to be consecutive.
216    """
217    def __init__(self, *, decodeTable=None, **args):
218        if decodeTable is not None: self.setDecode(decodeTable)
219
220    def __len__(self):
221        return len(self.decodeTable)
222
223    def __iter__(self):
224        def revBits(index):
225            return self.bitPattern(index)[::-1]
226        return (
227            Symbol(self, index)
228            for index in sorted(self.decodeTable.values(), key=revBits)
229            )
230
231    def __getitem__(self, index):
232        if index not in self.lengthTable:
233            raise ValueError('No symbol {}[{}]'.format(
234                self.__class__.__name__, index))
235        return Symbol(self, index)
236
237    def bitPattern(self, index):
238        bits = next(b for (b,s) in self.decodeTable.items() if s==index)
239        return '{:0{}b}'.format(bits, self.length(index))
240
241    def length(self, index):
242        """Encoding length of given symbol.
243        """
244        return self.lengthTable[index]
245
246    def decodePeek(self, data):
247        """Find which symbol index matches the given data (from peek, as a number)
248        and return the number of bits decoded.
249        Can also be used to figure out length of a symbol.
250        """
251        #do binary search for word length
252        #invariant: lo<=length<=hi
253        lo, hi = self.minLength, self.maxLength
254        while lo<=hi:
255            mid = lo+hi>>1
256            #note lo<=mid<hi at this point
257            mask = (1<<mid)-1
258            #lets see what happens if we guess length is mid
259            try: index = self.decodeTable[data&mask]
260            except KeyError:
261                #too many bits specified, reduce estimated length
262                hi = mid-1
263                continue
264            #we found a symbol, but there could be a longer match
265            symbolLength = self.lengthTable[index]
266            if symbolLength<=mid:
267                #all bits match, symbol must be right
268                return symbolLength, Symbol(self, index)
269            #there must be more bits to match
270            lo = mid+1
271        return lo, Symbol(self, index)
272
273    #routine to set up the tables
274    def setDecode(self, decodeTable):
275        """Store decodeTable,
276        and compute lengthTable, minLength, maxLength from encodings.
277        """
278        self.decodeTable = decodeTable
279        #set of symbols with unknown length
280        todo = set(decodeTable)
281        #bit size under investigation
282        maskLength = 0
283        lengthTable = {}
284        while todo:
285            mask = (1<<maskLength)-1
286            #split the encodings that we didn't find yet using b bits
287            splitSymbols = defaultdict(list)
288            for s in todo: splitSymbols[s&mask].append(s)
289            #unique encodings have a length of maskLength bits
290            #set length, and remove from todo list
291            for s,subset in splitSymbols.items():
292                if len(subset)==1:
293                    lengthTable[self.decodeTable[s]] = maskLength
294                    todo.remove(s)
295            #now investigate with longer mask
296            maskLength +=1
297        #save result
298        self.lengthTable = lengthTable
299        self.minLength = min(lengthTable.values())
300        self.maxLength = max(lengthTable.values())
301        self.switchToPrefix()
302
303    def setLength(self, lengthTable):
304        """Given the bit pattern lengths for symbols given in lengthTable,
305        set decodeTable, minLength, maxLength
306        """
307        self.lengthTable = lengthTable
308        self.minLength = min(lengthTable.values())
309        self.maxLength = max(lengthTable.values())
310        #compute the backwards codes first; then reverse them
311        #compute (backwards) first code for every separate lengths
312        nextCodes = []
313        #build codes for each length, from right to left
314        code = 0
315        for bits in range(self.maxLength+1):
316            code <<= 1
317            nextCodes.append(code)
318            code += sum(x==bits for x in lengthTable.values())
319        self.decodeTable = {}
320        #count codes for each length, and store reversed in the table
321        for symbol in sorted(lengthTable):
322            bits = lengthTable[symbol]
323            bitpattern = '{:0{}b}'.format(nextCodes[bits], bits)
324            self.decodeTable[int(bitpattern[::-1], 2)] = symbol
325            nextCodes[bits] += 1
326        self.switchToPrefix()
327
328    def switchToPrefix(self):
329        """This routine makes sure the prefix decoder is activated.
330        """
331        self.mode = PrefixDecoder
332
333class Code(RangeDecoder, PrefixDecoder):
334    """An alphabet of symbols, that can be read from a stream.
335    If you use setDecode or setLength, you have a prefix code,
336    otherwise you have a range code.
337    Features:
338    code[index] produces symbol with given index
339    value(index): value of symbol
340    mnemonic(index): short description of symbol
341    explanation(index): show meaning of symbol, shown in Layout.verboseRead
342    iter(code): produce all symbols in some order
343    name: show as context in Layout.verboseRead
344    """
345    name = '?'
346    #callback is a function that gets the symbol and the extra bits
347    #default callback calls explanation
348    def __init__(self, name=None, *, callback=None, description='', **args):
349        """Don't forget to set either alphabetSize or decodeTable
350        """
351        #set name when provided, otherwise take class variable
352        if name is not None: self.name = name
353        if callback is not None: self.callback = callback
354        self.description = description
355        #mode switch
356        if 'bitLength' in args or 'alphabetSize' in args:
357            self.mode = RangeDecoder
358            RangeDecoder.__init__(self, **args)
359        elif 'decodeTable' in args:
360            self.mode = PrefixDecoder
361            PrefixDecoder.__init__(self, **args)
362        else:
363            super().__init__(**args)
364
365    def __repr__(self):
366        return self.__class__.__name__+' '+self.name
367
368    #the routines that get switched between RangeDecoder and PrefixDecoder
369    def __len__(self): return self.mode.__len__(self)
370    def __iter__(self): return self.mode.__iter__(self)
371    def __getitem__(self, index): return self.mode.__getitem__(self, index)
372    def bitPattern(self, index): return self.mode.bitPattern(self, index)
373    def length(self, index): return self.mode.length(self, index)
374    def decodePeek(self, data): return self.mode.decodePeek(self, data)
375    #general routines
376    def value(self, index, extra=None):
377        """Get value of symbol for computations.
378        Override where needed.
379        """
380        if extra is not None:
381            raise ValueError('value: no extra for this symbol')
382        return index
383
384    def mnemonic(self, index):
385        """Give mnemonic of symbol.
386        Override where needed.
387        """
388        return str(self.value(index))
389
390    def callback(self, symbol):
391        return self.explanation(symbol.index)
392
393    def explanation(self, index):
394        """Long explanation of the value from the numeric value
395        This is a default routine.
396        You can customize in three ways:
397        - set description to add some text
398        - override to get more control
399        - set callback to make it dependent on you local variables
400        """
401        value = self.value(index)
402        return '{0}{1}: {2}'.format(
403            self.description and self.description+': ',
404            self.bitPattern(index),
405            value,
406            )
407
408    def extraBits(self, index):
409        return 0
410
411    #Routines that use the decode interface
412    def showCode(self, width=80):
413        """Show all words of the code in a nice format.
414        """
415        #make table of all symbols with binary strings
416        symbolStrings = [
417            (self.bitPattern(s.index), self.mnemonic(s.index))
418            for s in self
419            ]
420        #determine column widths the way Lisp programmers do it
421        leftColWidth, rightColWidth = map(max, map(
422            map,
423            repeat(len),
424            zip(*symbolStrings)
425            ))
426        colwidth = leftColWidth+rightColWidth
427        columns = 81//(colwidth+2)
428        rows = -(-len(symbolStrings)//columns)
429        def justify(bs):
430            b,s = bs
431            return b.rjust(leftColWidth)+':'+s.ljust(rightColWidth)
432        for i in range(rows):
433            print(' '.join(map(justify, symbolStrings[i::rows])).rstrip())
434
435    def readTuple(self, stream):
436        """Read symbol from stream. Returns symbol, length.
437        """
438        length, symbol = self.decodePeek(stream.peek(self.maxLength))
439        stream.pos += length
440        return length, symbol
441
442    def readTupleAndExtra(self, stream):
443        return self.readTuple(stream)+(0, None)
444
445class WithExtra(Code):
446    """Extension for Code so that symbol may have extra bits associated.
447    If you supply an extraTable, you can use extraBits
448    You can define an extraTable,
449    which allows to call extraBits to get the number of extraBits.
450    Otherwise, you can supply extraBits yourself.
451    Routine readTupleAndExtra now reads the extra bits too.
452    Value probably needs to be overridden; see Enumerator.
453    Note: this does not give you an decodeTable.
454    """
455    #redefine these if you don't want to use an extraTable
456    def extraBits(self, index):
457        """Get the number of extra bits for this symbol.
458        """
459        return self.extraTable[index]
460
461    def mnemonic(self, index):
462        """This value must be independent of extra.
463        """
464        return str(index)
465
466    def readTupleAndExtra(self, stream):
467        """Read symbol and extrabits from stream.
468        Returns symbol length, symbol, extraBits, extra
469        >>> olleke.pos = 6
470        >>> MetablockLengthAlphabet().readTupleAndExtra(olleke)
471        (2, Symbol(MLEN, 4), 16, 46)
472        """
473        length, symbol = self.decodePeek(stream.peek(self.maxLength))
474        stream.pos += length
475        extraBits = self.extraBits(symbol.index)
476        return length, symbol, extraBits, stream.read(extraBits)
477
478    def explanation(self, index, extra=None):
479        """Expanded version of Code.explanation supporting extra bits.
480        If you don't supply extra, it is not mentioned.
481        """
482        extraBits = 0 if extra is None else self.extraBits(index)
483        if not hasattr(self, 'extraTable'):
484            formatString = '{0}{3}'
485            lo = hi = value = self.value(index, extra)
486        elif extraBits==0:
487            formatString = '{0}{2}: {3}'
488            lo, hi = self.span(index)
489            value = lo
490        else:
491            formatString = '{0}{1} {2}: {3}-{4}; {3}+{5}={6}'
492            lo, hi = self.span(index)
493            value = lo+extra
494        return formatString.format(
495            self.description and self.description+': ',
496            'x'*extraBits,
497            self.bitPattern(index),
498            lo, hi,
499            extra,
500            value,
501            )
502
503    def callback(self, symbol, extra):
504        return self.explanation(symbol.index, extra)
505
506class BoolCode(Code):
507    """Same as Code(bitLength=1), but shows a boolean.
508    """
509    def __init__(self, name=None, **args):
510        super().__init__(name, bitLength=1, **args)
511
512    def value(self, index, extra=None):
513        return bool(super().value(index, extra))
514
515class Enumerator(WithExtra):
516    """Code that is defined by the ExtraTable.
517    extraTable is a class variable that contains
518    the extraBits of the symbols from 0
519    value0 contains the value of symbol 0
520    encodings is not neccessary, but allowed.
521    Note: place for FixedCode to make sure extraBits works
522    """
523    def __init__(self, name=None, **args):
524        #if there is no decodeTable to determine length, compute it ourselves
525        if 'decodeTable' not in args:
526            args['alphabetSize'] = len(self.extraTable)
527        super().__init__(name, **args)
528
529    def __len__(self):
530        return len(self.extraTable)
531
532    def __getitem__(self, index):
533        """Faster than PrefixDecoder
534        """
535        if index>=len(self.extraTable):
536            raise ValueError("No symbol {}[{}]".format(
537                self.__class__.__name__, index))
538        return Symbol(self, index)
539
540    def value(self, index, extra):
541        """Override if you don't define value0 and extraTable
542        """
543        lower, upper = self.span(index)
544        value = lower+(extra or 0)
545        if value>upper:
546            raise ValueError('value: extra out of range')
547        return value
548
549    def span(self, index):
550        """Give the range of possible values in a tuple
551        Useful for mnemonic and explanation
552        """
553        lower = self.value0+sum(1<<x for x in self.extraTable[:index])
554        upper = lower+(1<<self.extraTable[index])
555        return lower, upper-1
556
557#======================Code subclasses======================================
558#Alphabets used in the metablock header----------------------------------
559#For prefix codes
560class PrefixCodeHeader(WithExtra):
561    """Header of prefix codes.
562    """
563    def __init__(self, codename):
564        super().__init__('PFX', bitLength=2)
565        #this is the name of the code that it describes
566        self.codename = codename
567
568    def extraBits(self, index):
569        return 2 if index==1 else 0
570
571    def value(self, index, extra):
572        """Returns ('Simple', #codewords) or ('Complex', HSKIP)
573        """
574        if index==1:
575            if extra>3:
576                raise ValueError('value: extra out of range')
577            return 'Simple', extra+1
578        if extra:
579            raise ValueError('value: extra out of range')
580        return 'Complex', index
581
582    def explanation(self, index, extra):
583        if index==1:
584            return '{} is simple with {} code word{}'.format(
585                self.codename, extra+1, 's' if extra else '')
586        lengths = [1, 2, 3, 4, 0, 5, 17, 6]
587        return '{} is complex with lengths {}...'.format(
588            self.codename,
589            ','.join(
590                map(str, lengths[index:index+5]))
591            )
592
593class TreeShapeAlhabet(BoolCode):
594    """The bit used to indicate if four word code is "deep" or "wide"
595    """
596    name = 'SHAPE'
597    def value(self, index):
598        return [(2,2,2,2), (1,2,3,3)][index]
599
600    def explanation(self, index):
601        return str(bool(index))+': lengths {},{},{},{}'.format(*self.value(index))
602
603class LengthOfLengthAlphabet(Code):
604    """For use in decoding complex code descriptors.
605    >>> lengthOfLengthAlphabet = LengthOfLengthAlphabet('')
606    >>> print(lengthOfLengthAlphabet[2])
607    coded with 2 bits
608    >>> len(lengthOfLengthAlphabet[0])
609    2
610    >>> [len(lengthOfLengthAlphabet[x]) for x in range(6)]
611    [2, 4, 3, 2, 2, 4]
612    >>> lengthOfLengthAlphabet.showCode()
613      00:skipped             01:coded with 4 bits 0111:coded with 1 bits
614      10:coded with 3 bits  011:coded with 2 bits 1111:coded with 5 bits
615    """
616    decodeTable = {
617         0b00:0,     0b10:3,
618       0b0111:1,     0b01:4,
619        0b011:2,   0b1111:5,
620       }
621
622    def __init__(self, name=None, **args):
623        super().__init__(name, decodeTable=self.decodeTable, **args)
624
625    def mnemonic(self, index):
626        if index==0: return 'skipped'
627        return 'coded with {} bits'.format(index)
628
629    def explanation(self, index, extra=None):
630        return self.description+': '+self.mnemonic(index)
631
632class LengthAlphabet(WithExtra):
633    """Length of symbols
634    Used during construction of a code.
635    """
636    def __init__(self, name):
637        super().__init__(name, alphabetSize=18)
638
639    def extraBits(self, index):
640        return {16:2, 17:3}.get(index, 0)
641
642    def mnemonic(self, index):
643        if index==0: return 'unused'
644        elif index==16: return 'rep xx'
645        elif index==17: return 'zero xxx'
646        else: return 'len {}'.format(index)
647
648    def explanation(self, index, extra):
649        return self.description.format(self[index], extra)
650
651    def value(self, index, extra):
652        #the caller got the length already, so extra is enough
653        return extra
654
655#Stream header
656class WindowSizeAlphabet(Code):
657    """The alphabet used for window size in the stream header.
658    >>> WindowSizeAlphabet()[10].explanation()
659    'windowsize=(1<<10)-16=1008'
660    """
661    decodeTable = {
662        0b0100001: 10,   0b1100001: 14,   0b0011: 18,   0b1011: 22,
663        0b0110001: 11,   0b1110001: 15,   0b0101: 19,   0b1101: 23,
664        0b1000001: 12,         0b0: 16,   0b0111: 20,   0b1111: 24,
665        0b1010001: 13,   0b0000001: 17,   0b1001: 21,
666        0b0010001: None,
667        }
668
669    name = 'WSIZE'
670
671    def __init__(self, name=None):
672        super().__init__(name, decodeTable=self.decodeTable)
673
674    def value(self, index):
675        #missing value gives index None
676        if index is None: return None
677        return (1<<index)-16
678
679    def explanation(self, index):
680        return 'windowsize=(1<<{})-16={}'.format(
681            index, (1<<index)-16)
682
683#Metablock
684class MetablockLengthAlphabet(WithExtra):
685    """Used for the meta block length;
686    also indicates a block with no data
687    >>> metablockLengthAlphabet = MetablockLengthAlphabet()
688    >>> metablockLengthAlphabet[0]; str(metablockLengthAlphabet[0])
689    Symbol(MLEN, 0)
690    'empty'
691    >>> metablockLengthAlphabet[3]
692    Traceback (most recent call last):
693        ...
694    ValueError: No symbol MetablockLengthAlphabet[3]
695    >>> print(metablockLengthAlphabet[4])
696    hhhh00
697    >>> metablockLengthAlphabet[4].value(0x1000)
698    4097
699    >>> metablockLengthAlphabet[5].value(0x1000)
700    Traceback (most recent call last):
701        ...
702    InvalidStream: Zeros in high nibble of MLEN
703    >>> metablockLengthAlphabet[5].explanation(0x12345)
704    'data length: 12345h+1=74566'
705    >>> metablockLengthAlphabet.showCode()
706    00:hhhh00   10:hhhhhh10 01:hhhhh01  11:empty
707    """
708    decodeTable = {0b11:0, 0b00:4, 0b01:5, 0b10:6}
709
710    name = 'MLEN'
711    def __init__(self, name=None):
712        super().__init__(name, decodeTable=self.decodeTable)
713
714    def extraBits(self, index):
715        return index*4
716
717    def mnemonic(self, index):
718        if index==0: return 'empty'
719        return 'h'*(self.extraBits(index)//4)+self.bitPattern(index)
720
721    def value(self, index, extra):
722        extraBits = self.extraBits(index)
723        if not 0<=extra<1<<extraBits:
724            raise ValueError('value: extra out of range')
725        if index==0: return 0
726        if index>4 and extra>>extraBits-4==0: raise InvalidStream(
727            'Zeros in high nibble of MLEN')
728        return extra+1
729
730    def explanation(self, index, extra):
731        if index==0: return '11: empty block'
732        extraBits = self.extraBits(index)
733        return 'data length: {:0{}x}h+1={}'.format(extra, extraBits//4, extra+1)
734
735
736class ReservedAlphabet(BoolCode):
737    """The reserved bit that must be zero.
738    """
739    name = 'RSVD'
740    def value(self, index):
741        if index: raise ValueError('Reserved bit is not zero')
742
743    def explanation(self, index):
744        return 'Reserved (must be zero)'
745
746class FillerAlphabet(Code):
747    def __init__(self, *, streamPos):
748        super().__init__('SKIP', bitLength=(-streamPos)&7)
749
750    def explanation(self, index):
751        return '{} bit{} ignored'.format(
752            self.length(index),
753            '' if self.length(index)==1 else 's',
754            )
755
756class SkipLengthAlphabet(WithExtra):
757    """Used for the skip length in an empty metablock
758    >>> skipLengthAlphabet = SkipLengthAlphabet()
759    >>> skipLengthAlphabet[0]; str(skipLengthAlphabet[0])
760    Symbol(SKIP, 0)
761    'empty'
762    >>> skipLengthAlphabet[4]
763    Traceback (most recent call last):
764        ...
765    ValueError: index out of range
766    >>> print(skipLengthAlphabet[3])
767    hhhhhh11
768    >>> skipLengthAlphabet[2].value(0x1000)
769    4097
770    >>> skipLengthAlphabet[3].value(0x1000)
771    Traceback (most recent call last):
772        ...
773    InvalidStream: Zeros in high byte of SKIPBYTES
774    >>> skipLengthAlphabet[3].explanation(0x12345)
775    'skip length: 12345h+1=74566'
776    >>> skipLengthAlphabet.showCode()
777    00:empty    01:hh01     10:hhhh10   11:hhhhhh11
778    """
779    def __init__(self):
780        super().__init__('SKIP', bitLength=2)
781
782    def extraBits(self, index):
783        return index*8
784
785    def mnemonic(self, index):
786        if index==0: return 'empty'
787        return 'h'*(self.extraBits(index)//4)+self.bitPattern(index)
788
789    def value(self, index, extra):
790        extraBits = self.extraBits(index)
791        if not 0<=extra<1<<extraBits:
792            raise ValueError('value: extra out of range')
793        if index==0: return 0
794        if index>1 and extra>>extraBits-8==0:
795            raise InvalidStream('Zeros in high byte of SKIPBYTES')
796        return extra+1
797
798    def explanation(self, index, extra):
799        if index==0: return '00: no skip'
800        extraBits = self.extraBits(index)
801        return 'skip length: {:{}x}h+1={}'.format(extra, extraBits//8, extra+1)
802
803
804class TypeCountAlphabet(Enumerator):
805    """Used for giving block type counts and tree counts.
806    >>> TypeCountAlphabet(description='').showCode()
807       0:0            0101:xx,0101      1011:xxxxx,1011
808    0001:0001         1101:xxxxxx,1101  0111:xxx,0111
809    1001:xxxx,1001    0011:x,0011       1111:xxxxxxx,1111
810    """
811    decodeTable = {
812             0b0: 0,   0b1001: 5,
813          0b0001: 1,   0b1011: 6,
814          0b0011: 2,   0b1101: 7,
815          0b0101: 3,   0b1111: 8,
816          0b0111: 4,
817          }
818
819    value0 = 1
820    extraTable = [0, 0, 1, 2, 3, 4, 5, 6, 7]
821    name = 'BT#'
822
823    def __init__(self, name=None, *, description):
824        super().__init__(
825            name,
826            decodeTable=self.decodeTable,
827            description=description)
828
829    def mnemonic(self, index):
830        if index==0: return '0'
831        if index==1: return '0001'
832        return 'x'*(self.extraBits(index))+','+self.bitPattern(index)
833
834    def explanation(self, index, extra):
835        value = self.value(index, extra)
836        description = self.description
837        if value==1: description = description[:-1]
838        return '{}: {} {}'.format(
839            self.mnemonic(index),
840            value,
841            description)
842
843class BlockTypeAlphabet(Code):
844    """The block types; this code works for all three kinds.
845    >>> b = BlockTypeAlphabet('T', NBLTYPES=5)
846    >>> print(*(x for x in b))
847    prev +1 #0 #1 #2 #3 #4
848    """
849    def __init__(self, name, NBLTYPES, **args):
850        super().__init__(name, alphabetSize=NBLTYPES+2, **args)
851        self.NBLTYPES = NBLTYPES
852
853    def mnemonic(self, index):
854        if index==0: return 'prev'
855        elif index==1: return '+1'
856        else: return '#'+str(index-2)
857
858    def value(self, index):
859        return index-2
860
861    def explanation(self, index):
862        if index==0: return '0: previous'
863        elif index==1: return '1: increment'
864        else: return 'Set block type to: '+str(index-2)
865
866class BlockCountAlphabet(Enumerator):
867    """Block counts
868    >>> b = BlockCountAlphabet('L')
869    >>> print(b[25])
870    [24*x]: BC16625-16793840
871    """
872
873    value0 = 1
874    extraTable = [2,2,2,2,3, 3,3,3,4,4, 4,4,5,5,5, 5,6,6,7,8, 9,10,11,12,13, 24]
875    def __init__(self, name, **args):
876        super().__init__(name, alphabetSize=26, **args)
877
878    def mnemonic(self, index):
879        extraBits = self.extraBits(index)
880        return '{}: BC{}-{}'.format(
881            'x'*extraBits if index<5 else '[{}*x]'.format(extraBits),
882            *self.span(index))
883
884    def explanation(self, index, extra):
885        return 'Block count: '+super().explanation(index, extra)
886
887class DistanceParamAlphabet(WithExtra):
888    """The distance parameters NPOSTFIX and NDIRECT.
889    Although these are treated as two in the description, this is easier.
890    """
891    def __init__(self):
892        super().__init__('DIST', bitLength=2)
893
894    def extraBits(self, index):
895        return 4
896
897    def value(self, index, extra):
898        """Returns NPOSTFIX and NDIRECT<<NPOSTFIX
899        """
900        if extra>15:
901            raise ValueError('value: extra out of range')
902        return index, extra<<index
903
904    def explanation(self, index, extra):
905        return '{} postfix bits and {:04b}<<{}={} direct codes'.format(
906            index, extra, index, extra<<index)
907
908    def mnemonic(self, index):
909        return 'PF'+str(index)
910
911class LiteralContextMode(Code):
912    """For the literal context modes.
913    >>> LiteralContextMode().showCode()
914    00:LSB6   01:MSB6   10:UTF8   11:Signed
915    >>> LiteralContextMode().explanation(2)
916    'Context mode for type 9: 2(UTF8)'
917    """
918
919    def __init__(self, *, number=9):
920        super().__init__('LC'+str(number), bitLength=2)
921        self.number = number
922
923    def mnemonic(self, index):
924        return ['LSB6', 'MSB6', 'UTF8', 'Signed'][index]
925
926    def explanation(self, index):
927        return 'Context mode for type {}: {}({})'.format(
928            self.number,
929            index,
930            self.mnemonic(index))
931
932class RLEmaxAlphabet(Enumerator):
933    """Used for describing the run length encoding used for describing context maps.
934    >>> RLEmaxAlphabet().showCode()
935    0:1    1:more
936    """
937    value0 = 0
938    extraTable = [0, 4]
939    name = 'RLE#'
940
941    def mnemonic(self, index):
942        return ['1', 'more'][index]
943
944    def explanation(self, index, extra):
945        description = self.description and self.description+': '
946        if index==0: return description+'No RLE coding'
947        return '{}xxxx 1: RLEMAX={}'.format(description, extra+1)
948
949class TreeAlphabet(WithExtra):
950    """The alphabet to enumerate entries (called trees) in the context map.
951    parameters are RLEMAX and NTREES
952    >>> t = TreeAlphabet('', RLEMAX=3, NTREES=5)
953    >>> len(t)
954    8
955    >>> print(t[2])
956    xx+4 zeroes
957    >>> t[3].explanation(2)
958    '8+010=10 zeroes'
959    >>> t[0].value(0)
960    (1, 0)
961    """
962    name = 'CMI'
963    def __init__(self, name=None, *, RLEMAX, NTREES, **args):
964        super().__init__(name, alphabetSize=RLEMAX+NTREES, **args)
965        self.RLEMAX = RLEMAX
966        self.NTREES = NTREES
967
968    def extraBits(self, index):
969        if 0<index<=self.RLEMAX: return index
970        return 0
971
972    def mnemonic(self, index):
973        if index==0: return 'map #0'
974        if index<=self.RLEMAX:
975            return '{}+{} zeroes'.format('x'*index, 1<<index)
976        return 'map #{}'.format(index-self.RLEMAX)
977
978    def value(self, index, extra):
979        """Give count and value."""
980        index = index
981        if index==0: return 1, 0
982        if index<=self.RLEMAX: return (1<<index)+extra, 0
983        return 1, index-self.RLEMAX
984
985    def explanation(self, index, extra):
986        description = self.description and self.description+': '
987        if index==0: return description+'map #0'
988        if index<=self.RLEMAX:
989            return '{}+{:0{}b}={} zeroes'.format(
990                (1<<index),
991                extra, self.extraBits(index),
992                (1<<index)+extra)
993        return '{}map #{}-{}={}'.format(
994            description,
995            index, self.RLEMAX, index-self.RLEMAX)
996
997#Prefix alphabets for the data stream----------------------------------
998class LiteralAlphabet(Code):
999    """Alphabet of symbols.
1000    """
1001    minLength = maxLength = 8
1002    def __init__(self, number):
1003        super().__init__('L'+str(number), alphabetSize=1<<8)
1004
1005    def mnemonic(self, index):
1006        return outputCharFormatter(index)
1007
1008    def value(self, index, extra=None):
1009        return index
1010
1011    def explanation(self, index, extra=None):
1012        return self.mnemonic(index)
1013
1014class InsertLengthAlphabet(Enumerator):
1015    """Intern code for insert counts
1016    """
1017    value0 = 0
1018    extraTable = [0,0,0,0,0, 0,1,1,2,2, 3,3,4,4,5, 5,6,7,8,9, 10,12,14,24]
1019
1020class CopyLengthAlphabet(Enumerator):
1021    value0 = 2
1022    extraTable = [0,0,0,0,0, 0,0,0,1,1, 2,2,3,3,4, 4,5,5,6,7, 8,9,10,24]
1023
1024class InsertAndCopyAlphabet(WithExtra):
1025    """The insert and copy code
1026    >>> for x in range(0,704,704//13):
1027    ...    print('{:10b}'.format(x), InsertAndCopyAlphabet()[x])
1028             0 I0C2&D=0
1029        110110 I6+xC8&D=0
1030       1101100 I5C22+xxx&D=0
1031      10100010 I4C4
1032      11011000 I3C10+x
1033     100001110 I14+xxC8
1034     101000100 I10+xxC22+xxx
1035     101111010 I98+xxxxxC14+xx
1036     110110000 I6+xC70+xxxxx
1037     111100110 I1090+[10*x]C8
1038    1000011100 I26+xxxC326+[8*x]
1039    1001010010 I322+[8*x]C14+xx
1040    1010001000 I194+[7*x]C70+xxxxx
1041    1010111110 I22594+[24*x]C1094+[10*x]
1042    """
1043    insertLengthAlphabet = InsertLengthAlphabet(None)
1044    copyLengthAlphabet = CopyLengthAlphabet(None)
1045
1046    def __init__(self, number=''):
1047        super().__init__('IC'+str(number), bitLength=10)
1048
1049    def __len__(self):
1050        return 704
1051
1052    def extraBits(self, index):
1053        insertSymbol, copySymbol, dist0 = self.splitSymbol(index)
1054        return InsertLengthAlphabet.extraTable[insertSymbol.index] + \
1055            CopyLengthAlphabet.extraTable[copySymbol.index]
1056
1057    def splitSymbol(self, index):
1058        """Give relevant values for computations:
1059        (insertSymbol, copySymbol, dist0flag)
1060        """
1061        #determine insert and copy upper bits from table
1062        row = [0,0,1,1,2,2,1,3,2,3,3][index>>6]
1063        col = [0,1,0,1,0,1,2,0,2,1,2][index>>6]
1064        #determine inserts and copy sub codes
1065        insertLengthCode = row<<3 | index>>3&7
1066        if row: insertLengthCode -= 8
1067        copyLengthCode = col<<3 | index&7
1068        return (
1069            Symbol(self.insertLengthAlphabet, insertLengthCode),
1070            Symbol(self.copyLengthAlphabet, copyLengthCode),
1071            row==0
1072            )
1073
1074    def mnemonic(self, index):
1075        """Make a nice mnemonic
1076        """
1077        i,c,d0 = self.splitSymbol(index)
1078        iLower, _ = i.code.span(i.index)
1079        iExtra = i.extraBits()
1080        cLower, _ = c.code.span(c.index)
1081        cExtra = c.extraBits()
1082        return 'I{}{}{}C{}{}{}{}'.format(
1083            iLower,
1084            '+' if iExtra else '',
1085            'x'*iExtra if iExtra<6 else '[{}*x]'.format(iExtra),
1086            cLower,
1087            '+' if cExtra else '',
1088            'x'*cExtra if cExtra<6 else '[{}*x]'.format(cExtra),
1089            '&D=0' if d0 else '')
1090
1091    def value(self, index, extra):
1092        i,c,d0 = self.splitSymbol(index)
1093        iExtra = i.extraBits()
1094        ce, ie = extra>>iExtra, extra&(1<<iExtra)-1
1095        insert = i.value(ie)
1096        copy = c.value(ce)
1097        return insert, copy, d0
1098
1099    def explanation(self, index, extra):
1100        insert, copy, d0 = self.value(index, extra)
1101        if d0: return 'Literal: {}, copy: {}, same distance'.format(insert, copy)
1102        else: return 'Literal: {}, copy: {}'.format(insert, copy)
1103
1104class DistanceAlphabet(WithExtra):
1105    """Represent the distance encoding.
1106    Dynamically generated alphabet.
1107    This is what the documentation should have said:
1108    Ignoring offsets for the moment, the "long" encoding works as follows:
1109    Write the distance in binary as follows:
1110    1xy..yz..z, then the distance symbol consists of n..nxz..z
1111    Where:
1112    n is one less than number of bits in y
1113    x is a single bit
1114    y..y are n+1 extra bits (encoded in the bit stream)
1115    z..z is NPOSTFIX bits that are part of the symbol
1116    The offsets are so as to start at the lowest useable value:
1117    if 1xyyyyz = distance +(4<<POSTFIX)-NDIRECT-1
1118    then n..nxz..z is symbol -NDIRECT-16
1119    >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
1120    >>> print(d[4], d[17], d[34])
1121    last-1 1 10xx00-5
1122    >>> [str(d[x]) for x in range(26, 32)]
1123    ['10x00-5', '10x01-5', '10x10-5', '10x11-5', '11x00-5', '11x01-5']
1124    """
1125    def __init__(self, number, *, NPOSTFIX, NDIRECT):
1126        self.NPOSTFIX = NPOSTFIX
1127        self.NDIRECT = NDIRECT
1128        #set length
1129        #Actually, not all symbols are used,
1130        #only NDIRECT+16+(44-2*POSTFIX<<NPOSTFIX)
1131        super().__init__('D'+str(number),
1132            alphabetSize=self.NDIRECT+16+(48<<self.NPOSTFIX))
1133
1134    def extraBits(self, index):
1135        """Indicate how many extra bits are needed to interpret symbol
1136        >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
1137        >>> [d[i].extraBits() for i in range(26)]
1138        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1139        >>> [d[i].extraBits() for i in range(26,36)]
1140        [1, 1, 1, 1, 1, 1, 1, 1, 2, 2]
1141        """
1142        if index<16+self.NDIRECT: return 0
1143        return 1 + ((index - self.NDIRECT - 16) >> (self.NPOSTFIX + 1))
1144
1145    def value(self, dcode, dextra):
1146        """Decode value of symbol together with the extra bits.
1147        >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
1148        >>> d[34].value(2)
1149        (0, 35)
1150        """
1151        if dcode<16:
1152            return [(1,0),(2,0),(3,0),(4,0),
1153                    (1,-1),(1,+1),(1,-2),(1,+2),(1,-3),(1,+3),
1154                    (2,-1),(2,+1),(2,-2),(2,+2),(2,-3),(2,+3)
1155                ][dcode]
1156        if dcode<16+self.NDIRECT:
1157            return (0,dcode-16)
1158        #we use the original formulas, instead of my clear explanation
1159        POSTFIX_MASK = (1 << self.NPOSTFIX) - 1
1160        ndistbits = 1 + ((dcode - self.NDIRECT - 16) >> (self.NPOSTFIX + 1))
1161        hcode = (dcode - self.NDIRECT - 16) >> self.NPOSTFIX
1162        lcode = (dcode - self.NDIRECT - 16) & POSTFIX_MASK
1163        offset = ((2 + (hcode & 1)) << ndistbits) - 4
1164        distance = ((offset + dextra) << self.NPOSTFIX) + lcode + self.NDIRECT + 1
1165        return (0,distance)
1166
1167    def mnemonic(self, index, verbose=False):
1168        """Give mnemonic representation of meaning.
1169        verbose compresses strings of x's
1170        """
1171        if index<16:
1172            return ['last', '2last', '3last', '4last',
1173                'last-1', 'last+1', 'last-2', 'last+2', 'last-3', 'last+3',
1174                '2last-1', '2last+1', '2last-2', '2last+2', '2last-3', '2last+3'
1175                ][index]
1176        if index<16+self.NDIRECT:
1177            return str(index-16)
1178        #construct strings like "1xx01-15"
1179        index -= self.NDIRECT+16
1180        hcode = index >> self.NPOSTFIX
1181        lcode = index & (1<<self.NPOSTFIX)-1
1182        if self.NPOSTFIX: formatString = '1{0}{1}{2:0{3}b}{4:+d}'
1183        else: formatString = '1{0}{1}{4:+d}'
1184        return formatString.format(
1185            hcode&1,
1186            'x'*(2+hcode>>1) if hcode<13 or verbose else '[{}*x]'.format(2+hcode>>1),
1187            lcode, self.NPOSTFIX,
1188            self.NDIRECT+1-(4<<self.NPOSTFIX))
1189
1190    def explanation(self, index, extra):
1191        """
1192        >>> d = DistanceAlphabet('D', NPOSTFIX=2, NDIRECT=10)
1193        >>> d[55].explanation(13)
1194        '11[1101]01-5: [0]+240'
1195        """
1196        extraBits = self.extraBits(index)
1197        extraString = '[{:0{}b}]'.format(extra, extraBits)
1198        return '{0}: [{1[0]}]{1[1]:+d}'.format(
1199            self.mnemonic(index, True).replace('x'*(extraBits or 1), extraString),
1200            self.value(index, extra))
1201
1202#Classes for doing actual work------------------------------------------
1203class ContextModeKeeper:
1204    """For computing the literal context mode.
1205    You feed it characters, and it computes indices in the context map.
1206    """
1207    def __init__(self, mode):
1208        self.chars = deque([0,0], maxlen=2)
1209        self.mode = mode
1210
1211    def setContextMode(self, mode):
1212        """Switch to given context mode (0..3)"""
1213        self.mode = mode
1214    def getIndex(self):
1215        if self.mode==0:  #LSB6
1216            return self.chars[1]&0x3f
1217        elif self.mode==1: #MSB6
1218            return self.chars[1]>>2
1219        elif self.mode==2: #UTF8: character class of previous and a bit of the second
1220            p2,p1 = self.chars
1221            return self.lut0[p1]|self.lut1[p2]
1222        elif self.mode==3: #Signed: initial bits of last two bytes
1223            p2,p1 = self.chars
1224            return self.lut2[p1]<<3|self.lut2[p2]
1225
1226    def add(self, index):
1227        """Adjust the context for output char (as int)."""
1228        self.chars.append(index)
1229
1230    #0: control     #16: quote  #32: ,:;  #48: AEIOU
1231    #4: tab/lf/cr   #20: %      #36: .    #52: BC..Z
1232    #8: space       #24: (<[{   #40: =    #56: aeiou
1233    #12:!#$&*+-/?@| #28: )>]}   #44: 0-9  #60: bc..z
1234    lut0 = [0,  0,  0,  0,  0,  0,  0,  0,  0,  4,  4,  0,  0,  4,  0,  0,
1235            0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
1236            8, 12, 16, 12, 12, 20, 12, 16, 24, 28, 12, 12, 32, 12, 36, 12,
1237           44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 32, 32, 24, 40, 28, 12,
1238           12, 48, 52, 52, 52, 48, 52, 52, 52, 48, 52, 52, 52, 52, 52, 48,
1239           52, 52, 52, 52, 52, 48, 52, 52, 52, 52, 52, 24, 12, 28, 12, 12,
1240           12, 56, 60, 60, 60, 56, 60, 60, 60, 56, 60, 60, 60, 60, 60, 56,
1241           60, 60, 60, 60, 60, 56, 60, 60, 60, 60, 60, 24, 12, 28, 12,  0
1242           ]+[0,1]*32+[2,3]*32
1243    #0: space  1:punctuation  2:digit/upper 3:lower
1244    lut1 = [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1245             0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1246             0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1247             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
1248             1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1249             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
1250             1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1251             3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 0
1252           ]+[0]*96+[2]*32
1253    #initial bits: 8*0, 4*0, 2*0, 1*0, 1*1, 2*1, 4*1, 8*1
1254    lut2 = [0]+[1]*15+[2]*48+[3]*64+[4]*64+[5]*48+[6]*15+[7]
1255    assert len(lut0)==len(lut1)==len(lut2)==256
1256
1257class WordList:
1258    """Word list.
1259    >>> WordList().word(7, 35555)
1260    b'Program to '
1261    """
1262    NDBITS = [0,  0,  0,  0, 10, 10, 11, 11, 10, 10,
1263             10, 10, 10,  9,  9,  8,  7,  7,  8,  7,
1264              7,  6,  6,  5,  5]
1265    def __init__(self):
1266        self.file = open('dict', 'rb')
1267        self.compileActions()
1268
1269    def word(self, size, dist):
1270        """Get word
1271        """
1272        #split dist in index and action
1273        ndbits = self.NDBITS[size]
1274        index = dist&(1<<ndbits)-1
1275        action = dist>>ndbits
1276        #compute position in file
1277        position = sum(n<<self.NDBITS[n] for n in range(4,size))+size*index
1278        self.file.seek(position)
1279        return self.doAction(self.file.read(size), action)
1280
1281    def upperCase1(self, word):
1282        word = word.decode('utf8')
1283        word = word[0].upper()+word[1:]
1284        return word.encode('utf8')
1285
1286
1287    #Super compact form of action table.
1288    #_ means space, .U means UpperCaseAll, U(w) means UpperCaseFirst
1289    actionTable = r"""
1290        0:w        25:w+_for_     50:w+\n\t       75:w+. This_100:w+ize_
1291        1:w+_      26:w[3:]       51:w+:          76:w+,      101:w.U+.
1292        2:_+w+_    27:w[:-2]      52:_+w+._       77:.+w+_    102:\xc2\xa0+w
1293        3:w[1:]    28:w+_a_       53:w+ed_        78:U(w)+(   103:_+w+,
1294        4:U(w)+_   29:w+_that_    54:w[9:]        79:U(w)+.   104:U(w)+="
1295        5:w+_the_  30:_+U(w)      55:w[7:]        80:w+_not_  105:w.U+="
1296        6:_+w      31:w+._        56:w[:-6]       81:_+w+="   106:w+ous_
1297        7:s_+w+_   32:.+w         57:w+(          82:w+er_    107:w.U+,_
1298        8:w+_of_   33:_+w+,_      58:U(w)+,_      83:_+w.U+_  108:U(w)+=\'
1299        9:U(w)     34:w[4:]       59:w[:-8]       84:w+al_    109:_+U(w)+,
1300       10:w+_and_  35:w+_with_    60:w+_at_       85:_+w.U    110:_+w.U+="
1301       11:w[2:]    36:w+\'        61:w+ly_        86:w+=\'    111:_+w.U+,_
1302       12:w[:-1]   37:w+_from_    62:_the_+w+_of_ 87:w.U+"    112:_+w.U+,
1303       13:,_+w+_   38:w+_by_      63:w[:-5]       88:U(w)+._  113:w.U+(
1304       14:w+,_     39:w[5:]       64:w[:-9]       89:_+w+(    114:w.U+._
1305       15:_+U(w)+_ 40:w[6:]       65:_+U(w)+,_    90:w+ful_   115:_+w.U+.
1306       16:w+_in_   41:_the_+w     66:U(w)+"       91:_+U(w)+._116:w.U+=\'
1307       17:w+_to_   42:w[:-4]      67:.+w+(        92:w+ive_   117:_+w.U+._
1308       18:e_+w+_   43:w+. The_    68:w.U+_        93:w+less_  118:_+U(w)+="
1309       19:w+"      44:w.U         69:U(w)+">      94:w.U+\'   119:_+w.U+=\'
1310       20:w+.      45:w+_on_      70:w+="         95:w+est_   120:_+U(w)+=\'
1311       21:w+">     46:w+_as_      71:_+w+.        96:_+U(w)+.
1312       22:w+\n     47:w+_is_      72:.com/+w      97:w.U+">
1313       23:w[:-3]   48:w[:-7]                      98:_+w+=\'
1314       24:w+]      49:w[:-1]+ing_ 74:U(w)+\'      99:U(w)+,
1315        """
1316
1317    def compileActions(self):
1318        """Build the action table from the text above
1319        """
1320        import re
1321        self.actionList = actions = [None]*121
1322        #Action 73, which is too long, looks like this when expanded:
1323        actions[73] = "b' the '+w+b' of the '"
1324        #find out what the columns are
1325        actionLines = self.actionTable.splitlines()
1326        colonPositions = [m.start()
1327            for m in re.finditer(':',actionLines[1])
1328            ]+[100]
1329        columns = [(colonPositions[i]-3,colonPositions[i+1]-3)
1330            for i in range(len(colonPositions)-1)]
1331        for line in self.actionTable.splitlines(keepends=False):
1332            for start,end in columns:
1333                action = line[start:end]
1334                #skip empty actions
1335                if not action or action.isspace(): continue
1336                #chop it up, and check if the colon is properly placed
1337                index, colon, action = action[:3], action[3], action[4:]
1338                assert colon==':'
1339                #remove filler spaces at right
1340                action = action.rstrip()
1341                #replace space symbols
1342                action = action.replace('_', ' ')
1343                wPos = action.index('w')
1344                #add quotes around left string when present
1345                #translation: any pattern from beginning, up to
1346                #(but not including) a + following by a w later on
1347                action = re.sub(r"^(.*)(?=\+[U(]*w)", r"b'\1'", action)
1348                #add quotes around right string when present
1349                #translation: anything with a w in it, followed by a +
1350                #and a pattern up to the end
1351                #(there is no variable lookbehind assertion,
1352                #so we have to copy the pattern)
1353                action = re.sub(r"(w[[:\-1\]).U]*)\+(.*)$", r"\1+b'\2'", action)
1354                #expand shortcut for uppercaseAll
1355                action = action.replace(".U", ".upper()")
1356                #store action
1357                actions[int(index)] = action
1358
1359    def doAction(self, w, action):
1360        """Perform the proper action
1361        """
1362        #set environment for the UpperCaseFirst
1363        U = self.upperCase1
1364        return eval(self.actionList[action], locals())
1365
1366class Layout:
1367    """Class to layout the output.
1368    """
1369    #display width of hexdata+bitdata
1370    width = 25
1371    #general
1372    def __init__(self, stream):
1373        self.stream = stream
1374        self.bitPtr = self.width
1375
1376    def makeHexData(self, pos):
1377        """Produce hex dump of all data containing the bits
1378        from pos to stream.pos
1379        """
1380        firstAddress = pos+7>>3
1381        lastAddress = self.stream.pos+7>>3
1382        return ''.join(map('{:02x} '.format,
1383            self.stream.data[firstAddress:lastAddress]))
1384
1385    def formatBitData(self, pos, width1, width2=0):
1386        """Show formatted bit data:
1387        Bytes are separated by commas
1388        whole bytes are displayed in hex
1389        >>> Layout(olleke).formatBitData(6, 2, 16)
1390        '|00h|2Eh,|00'
1391        >>> Layout(olleke).formatBitData(4, 1, 0)
1392        '1'
1393        """
1394        result = []
1395        #make empty prefix code explicit
1396        if width1==0: result = ['()', ',']
1397        for width in width1, width2:
1398            #skip empty width2
1399            if width==0: continue
1400            #build result backwards in a list
1401            while width>0:
1402                availableBits = 8-(pos&7)
1403                if width<availableBits:
1404                    #read partial byte, beginning nor ending at boundary
1405                    data = self.stream.data[pos>>3] >> (pos&7) & (1<<width)-1
1406                    result.append('{:0{}b}'.format(data, width))
1407                elif availableBits<8:
1408                    #read rest of byte, ending at boundary
1409                    data = self.stream.data[pos>>3] >> (pos&7)
1410                    result.append('|{:0{}b}'.format(data, availableBits))
1411                else:
1412                    #read whole byte (in hex), beginning and ending at boundary
1413                    data = self.stream.data[pos>>3]
1414                    result.append('|{:02X}h'.format(data))
1415                width -= availableBits
1416                pos += availableBits
1417            #if width overshot from the availableBits subtraction, fix it
1418            pos += width
1419            #add comma to separate fields
1420            result.append(',')
1421        #concatenate pieces, reversed, skipping the last space
1422        return ''.join(result[-2::-1])
1423
1424    def readPrefixCode(self, alphabet):
1425        """give alphabet the prefix code that is read from the stream
1426        Called for the following alphabets, in this order:
1427        The alphabet in question must have a "logical" order,
1428        otherwise the assignment of symbols doesn't work.
1429        """
1430        mode, numberOfSymbols = self.verboseRead(PrefixCodeHeader(alphabet.name))
1431        if mode=='Complex':
1432            #for a complex code, numberOfSymbols means hskip
1433            self.readComplexCode(numberOfSymbols, alphabet)
1434            return alphabet
1435        else:
1436            table = []
1437            #Set table of lengths for mnemonic function
1438            lengths = [[0], [1,1], [1,2,2], '????'][numberOfSymbols-1]
1439            #adjust mnemonic function of alphabet class
1440            def myMnemonic(index):
1441                return '{} bit{}: {}'.format(
1442                    lengths[i],
1443                    '' if lengths[i]==1 else 's',
1444                    alphabet.__class__.mnemonic(alphabet, index)
1445                    )
1446            alphabet.mnemonic = myMnemonic
1447            for i in range(numberOfSymbols):
1448                table.append(self.verboseRead(alphabet, skipExtra=True).index)
1449            #restore mnemonic
1450            del alphabet.mnemonic
1451            if numberOfSymbols==4:
1452                #read tree shape to redefine lengths
1453                lengths = self.verboseRead(TreeShapeAlhabet())
1454            #construct the alphabet prefix code
1455            alphabet.setLength(dict(zip(table, lengths)))
1456        return alphabet
1457
1458    def readComplexCode(self, hskip, alphabet):
1459        """Read complex code"""
1460        stream = self.stream
1461        #read the lengths for the length code
1462        lengths = [1,2,3,4,0,5,17,6,16,7,8,9,10,11,12,13,14,15][hskip:]
1463        codeLengths = {}
1464        total = 0
1465        lol = LengthOfLengthAlphabet('##'+alphabet.name)
1466        #lengthCode will be used for coding the lengths of the new code
1467        #we use it for display until now; definition comes below
1468        lengthCode = LengthAlphabet('#'+alphabet.name)
1469        lengthIter = iter(lengths)
1470        lengthsLeft = len(lengths)
1471        while total<32 and lengthsLeft>0:
1472            lengthsLeft -= 1
1473            newSymbol = next(lengthIter)
1474            lol.description = str(lengthCode[newSymbol])
1475            length = self.verboseRead(lol)
1476            if length:
1477                codeLengths[newSymbol] = length
1478                total += 32>>length
1479        if total>32: raise ValueError("Stream format")
1480        if len(codeLengths)==1: codeLengths[list(codeLengths.keys())[0]] = 0
1481        #Now set the encoding of the lengthCode
1482        lengthCode.setLength(codeLengths)
1483        print("***** Lengths for {} will be coded as:".format(alphabet.name))
1484        lengthCode.showCode()
1485        #Now determine the symbol lengths with the lengthCode
1486        symbolLengths = {}
1487        total = 0
1488        lastLength = 8
1489        alphabetIter = iter(alphabet)
1490        while total<32768:
1491            #look ahead to see what is going to happen
1492            length = lengthCode.decodePeek(
1493                self.stream.peek(lengthCode.maxLength))[1].index
1494            #in every branch, set lengthCode.description to explanatory text
1495            #lengthCode calls format(symbol, extra) with this string
1496            if length==0:
1497                symbol = next(alphabetIter)
1498                lengthCode.description = 'symbol {} unused'.format(symbol)
1499                self.verboseRead(lengthCode)
1500                #unused symbol
1501                continue
1502            if length==16:
1503                lengthCode.description = \
1504                    '{1}+3 symbols of length '+str(lastLength)
1505                extra = self.verboseRead(lengthCode)
1506                #scan series of 16s (repeat counts)
1507                #start with repeat count 2
1508                repeat = 2
1509                startSymbol = next(alphabetIter)
1510                endSymbol = next(alphabetIter)
1511                symbolLengths[startSymbol.index] = \
1512                    symbolLengths[endSymbol.index] = lastLength
1513                #count the two just defined symbols
1514                total += 2*32768>>lastLength
1515                #note: loop may end because we're there
1516                #even if a 16 _appears_ to follow
1517                while True:
1518                    #determine last symbol
1519                    oldRepeat = repeat
1520                    repeat = (repeat-2<<2)+extra+3
1521                    #read as many symbols as repeat increased
1522                    for i in range(oldRepeat, repeat):
1523                        endSymbol = next(alphabetIter)
1524                        symbolLengths[endSymbol.index] = lastLength
1525                    #compute new total; it may be end of loop
1526                    total += (repeat-oldRepeat)*32768>>lastLength
1527                    if total>=32768: break
1528                    #see if there is more to do
1529                    length = lengthCode.decodePeek(
1530                        self.stream.peek(lengthCode.maxLength))[1].index
1531                    if length!=16: break
1532                    lengthCode.description = 'total {}+{{1}} symbols'.format(
1533                        (repeat-2<<2)+3)
1534                    extra = self.verboseRead(lengthCode)
1535            elif length==17:
1536                #read, and show explanation
1537                lengthCode.description = '{1}+3 unused'
1538                extra = self.verboseRead(lengthCode)
1539                #scan series of 17s (groups of zero counts)
1540                #start with repeat count 2
1541                repeat = 2
1542                startSymbol = next(alphabetIter)
1543                endSymbol = next(alphabetIter)
1544                #note: loop will not end with total==32768,
1545                #since total doesn't change here
1546                while True:
1547                    #determine last symbol
1548                    oldRepeat = repeat
1549                    repeat = (repeat-2<<3)+extra+3
1550                    #read as many symbols as repeat increases
1551                    for i in range(repeat-oldRepeat):
1552                        endSymbol = next(alphabetIter)
1553                    #see if there is more to do
1554                    length = lengthCode.decodePeek(
1555                        self.stream.peek(lengthCode.maxLength))[1].index
1556                    if length!=17: break
1557                    lengthCode.description = 'total {}+{{1}} unused'.format(
1558                        (repeat-2<<3)+3)
1559                    extra = self.verboseRead(lengthCode)
1560            else:
1561                symbol = next(alphabetIter)
1562                #double braces for format
1563                char = str(symbol)
1564                if char in '{}': char *= 2
1565                lengthCode.description = \
1566                    'Length for {} is {{0.index}} bits'.format(char)
1567                #output is not needed (will be 0)
1568                self.verboseRead(lengthCode)
1569                symbolLengths[symbol.index] = length
1570                total += 32768>>length
1571                lastLength = length
1572        assert total==32768
1573        alphabet.setLength(symbolLengths)
1574        print('End of table. Prefix code '+alphabet.name+':')
1575        alphabet.showCode()
1576
1577    #stream
1578    def processStream(self):
1579        """Process a brotli stream.
1580        """
1581        print('addr  hex{:{}s}binary context explanation'.format(
1582            '', self.width-10))
1583        print('Stream header'.center(60, '-'))
1584        self.windowSize = self.verboseRead(WindowSizeAlphabet())
1585        print('Metablock header'.center(60, '='))
1586        self.ISLAST = False
1587        self.output = bytearray()
1588        while not self.ISLAST:
1589            self.ISLAST = self.verboseRead(
1590                BoolCode('LAST', description="Last block"))
1591            if self.ISLAST:
1592                if self.verboseRead(
1593                    BoolCode('EMPTY', description="Empty block")): break
1594            if self.metablockLength(): continue
1595            if not self.ISLAST and self.uncompressed(): continue
1596            print('Block type descriptors'.center(60, '-'))
1597            self.numberOfBlockTypes = {}
1598            self.currentBlockCounts = {}
1599            self.blockTypeCodes = {}
1600            self.blockCountCodes = {}
1601            for blockType in (L,I,D): self.blockType(blockType)
1602            print('Distance code parameters'.center(60, '-'))
1603            self.NPOSTFIX, self.NDIRECT = self.verboseRead(DistanceParamAlphabet())
1604            self.readLiteralContextModes()
1605            print('Context maps'.center(60, '-'))
1606            self.cmaps = {}
1607            #keep the number of each kind of prefix tree for the last loop
1608            numberOfTrees = {I: self.numberOfBlockTypes[I]}
1609            for blockType in (L,D):
1610                numberOfTrees[blockType] = self.contextMap(blockType)
1611            print('Prefix code lists'.center(60, '-'))
1612            self.prefixCodes = {}
1613            for blockType in (L,I,D):
1614                self.readPrefixArray(blockType, numberOfTrees[blockType])
1615            self.metablock()
1616
1617    #metablock header
1618    def verboseRead(self, alphabet, context='', skipExtra=False):
1619        """Read symbol and extra from stream and explain what happens.
1620        Returns the value of the symbol
1621        >>> olleke.pos = 0
1622        >>> l = Layout(olleke)
1623        >>> l.verboseRead(WindowSizeAlphabet())
1624        0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
1625        4194288
1626        """
1627        #TODO 2: verbosity level, e.g. show only codes and maps in header
1628        stream = self.stream
1629        pos = stream.pos
1630        if skipExtra:
1631            length, symbol = alphabet.readTuple(stream)
1632            extraBits, extra = 0, None
1633        else:
1634            length, symbol, extraBits, extra = alphabet.readTupleAndExtra(
1635                stream)
1636        #fields: address, hex data, binary data, name of alphabet, explanation
1637        hexdata = self.makeHexData(pos)
1638        addressField = '{:04x}'.format(pos+7>>3) if hexdata else ''
1639        bitdata = self.formatBitData(pos, length, extraBits)
1640        #bitPtr moves bitdata so that the bytes are easier to read
1641        #jump back to right if a new byte starts
1642        if '|' in bitdata[1:]:
1643            #start over on the right side
1644            self.bitPtr = self.width
1645        fillWidth = self.bitPtr-(len(hexdata)+len(bitdata))
1646        if fillWidth<0: fillWidth = 0
1647        print('{:<5s} {:<{}s} {:7s} {}'.format(
1648            addressField,
1649            hexdata+' '*fillWidth+bitdata, self.width,
1650            context+alphabet.name,
1651            symbol if skipExtra else symbol.explanation(extra),
1652            ))
1653        #jump to the right if we started with a '|'
1654        #because we didn't jump before printing
1655        if bitdata.startswith('|'): self.bitPtr = self.width
1656        else: self.bitPtr -= len(bitdata)
1657        return symbol if skipExtra else symbol.value(extra)
1658
1659    def metablockLength(self):
1660        """Read MNIBBLES and meta block length;
1661        if empty block, skip block and return true.
1662        """
1663        self.MLEN = self.verboseRead(MetablockLengthAlphabet())
1664        if self.MLEN:
1665            return False
1666        #empty block; skip and return False
1667        self.verboseRead(ReservedAlphabet())
1668        MSKIP = self.verboseRead(SkipLengthAlphabet())
1669        self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
1670        self.stream.pos += 8*MSKIP
1671        print("Skipping to {:x}".format(self.stream.pos>>3))
1672        return True
1673
1674    def uncompressed(self):
1675        """If true, handle uncompressed data
1676        """
1677        ISUNCOMPRESSED = self.verboseRead(
1678            BoolCode('UNCMPR', description='Is uncompressed?'))
1679        if ISUNCOMPRESSED:
1680            self.verboseRead(FillerAlphabet(streamPos=self.stream.pos))
1681            print('Uncompressed data:')
1682            self.output += self.stream.readBytes(self.MLEN)
1683            print(outputFormatter(self.output[-self.MLEN:]))
1684        return ISUNCOMPRESSED
1685
1686    def blockType(self, kind):
1687        """Read block type switch descriptor for given kind of blockType."""
1688        NBLTYPES = self.verboseRead(TypeCountAlphabet(
1689            'BT#'+kind[0].upper(),
1690            description='{} block types'.format(kind),
1691            ))
1692        self.numberOfBlockTypes[kind] = NBLTYPES
1693        if NBLTYPES>=2:
1694            self.blockTypeCodes[kind] = self.readPrefixCode(
1695                BlockTypeAlphabet('BT'+kind[0].upper(), NBLTYPES))
1696            self.blockCountCodes[kind] = self.readPrefixCode(
1697                BlockCountAlphabet('BC'+kind[0].upper()))
1698            blockCount = self.verboseRead(self.blockCountCodes[kind])
1699        else:
1700            blockCount = 1<<24
1701        self.currentBlockCounts[kind] = blockCount
1702
1703    def readLiteralContextModes(self):
1704        """Read literal context modes.
1705        LSB6: lower 6 bits of last char
1706        MSB6: upper 6 bits of last char
1707        UTF8: rougly dependent on categories:
1708            upper 4 bits depend on category of last char:
1709                control/whitespace/space/ punctuation/quote/%/open/close/
1710                comma/period/=/digits/ VOWEL/CONSONANT/vowel/consonant
1711            lower 2 bits depend on category of 2nd last char:
1712                space/punctuation/digit or upper/lowercase
1713        signed: hamming weight of last 2 chars
1714        """
1715        print('Context modes'.center(60, '-'))
1716        self.literalContextModes = []
1717        for i in range(self.numberOfBlockTypes[L]):
1718            self.literalContextModes.append(
1719                self.verboseRead(LiteralContextMode(number=i)))
1720
1721    def contextMap(self, kind):
1722        """Read context maps
1723        Returns the number of differnt values on the context map
1724        (In other words, the number of prefix trees)
1725        """
1726        NTREES = self.verboseRead(TypeCountAlphabet(
1727            kind[0].upper()+'T#',
1728            description='{} prefix trees'.format(kind)))
1729        mapSize = {L:64, D:4}[kind]
1730        if NTREES<2:
1731            self.cmaps[kind] = [0]*mapSize
1732        else:
1733            #read CMAPkind
1734            RLEMAX = self.verboseRead(RLEmaxAlphabet(
1735                'RLE#'+kind[0].upper(),
1736                description=kind+' context map'))
1737            alphabet = TreeAlphabet('CM'+kind[0].upper(), NTREES=NTREES, RLEMAX=RLEMAX)
1738            cmapCode = self.readPrefixCode(alphabet)
1739            tableSize = mapSize*self.numberOfBlockTypes[kind]
1740            cmap = []
1741            while len(cmap)<tableSize:
1742                cmapCode.description = 'map {}, entry {}'.format(
1743                    *divmod(len(cmap), mapSize))
1744                count, value = self.verboseRead(cmapCode)
1745                cmap.extend([value]*count)
1746            assert len(cmap)==tableSize
1747            IMTF = self.verboseRead(BoolCode('IMTF', description='Apply inverse MTF'))
1748            if IMTF:
1749                self.IMTF(cmap)
1750            if kind==L:
1751                print('Context maps for literal data:')
1752                for i in range(0, len(cmap), 64):
1753                    print(*(
1754                        ''.join(map(str, cmap[j:j+8]))
1755                        for j in range(i, i+64, 8)
1756                        ))
1757            else:
1758                print('Context map for distances:')
1759                print(*(
1760                    ''.join(map('{:x}'.format, cmap[i:i+4]))
1761                    for i in range(0, len(cmap), 4)
1762                    ))
1763            self.cmaps[kind] = cmap
1764        return NTREES
1765
1766    @staticmethod
1767    def IMTF(v):
1768        """In place inverse move to front transform.
1769        """
1770        #mtf is initialized virtually with range(infinity)
1771        mtf = []
1772        for i, vi in enumerate(v):
1773            #get old value from mtf. If never seen, take virtual value
1774            try: value = mtf.pop(vi)
1775            except IndexError: value = vi
1776            #put value at front
1777            mtf.insert(0, value)
1778            #replace transformed value
1779            v[i] = value
1780
1781    def readPrefixArray(self, kind, numberOfTrees):
1782        """Read prefix code array"""
1783        prefixes = []
1784        for i in range(numberOfTrees):
1785            if kind==L: alphabet = LiteralAlphabet(i)
1786            elif kind==I: alphabet = InsertAndCopyAlphabet(i)
1787            elif kind==D: alphabet = DistanceAlphabet(
1788                i, NPOSTFIX=self.NPOSTFIX, NDIRECT=self.NDIRECT)
1789            self.readPrefixCode(alphabet)
1790            prefixes.append(alphabet)
1791        self.prefixCodes[kind] = prefixes
1792
1793    #metablock data
1794    def metablock(self):
1795        """Process the data.
1796        Relevant variables of self:
1797        numberOfBlockTypes[kind]: number of block types
1798        currentBlockTypes[kind]: current block types (=0)
1799        literalContextModes: the context modes for the literal block types
1800        currentBlockCounts[kind]: counters for block types
1801        blockTypeCodes[kind]: code for block type
1802        blockCountCodes[kind]: code for block count
1803        cmaps[kind]: the context maps (not for I)
1804        prefixCodes[kind][#]: the prefix codes
1805        lastDistances: the last four distances
1806        lastChars: the last two chars
1807        output: the result
1808        """
1809        print('Meta block contents'.center(60, '='))
1810        self.currentBlockTypes = {L:0, I:0, D:0, pL:1, pI:1, pD:1}
1811        self.lastDistances = deque([17,16,11,4], maxlen=4)
1812        #the current context mode is for block type 0
1813        self.contextMode = ContextModeKeeper(self.literalContextModes[0])
1814        wordList = WordList()
1815
1816        #setup distance callback function
1817        def distanceCallback(symbol, extra):
1818            "callback function for displaying decoded distance"
1819            index, offset = symbol.value(extra)
1820            if index:
1821                #recent distance
1822                distance = self.lastDistances[-index]+offset
1823                return 'Distance: {}last{:+d}={}'.format(index, offset, distance)
1824            #absolute value
1825            if offset<=maxDistance:
1826                return 'Absolute value: {} (pos {})'.format(offset, maxDistance-offset)
1827            #word list value
1828            action, word = divmod(offset-maxDistance, 1<<wordList.NDBITS[copyLen])
1829            return '{}-{} gives word {},{} action {}'.format(
1830                offset, maxDistance, copyLen, word, action)
1831        for dpc in self.prefixCodes[D]: dpc.callback = distanceCallback
1832
1833        blockLen = 0
1834        #there we go
1835        while blockLen<self.MLEN:
1836            #get insert&copy command
1837            litLen, copyLen, dist0Flag = self.verboseRead(
1838                self.prefixCodes[I][
1839                    self.figureBlockType(I)])
1840            #literal data
1841            for i in range(litLen):
1842                bt = self.figureBlockType(L)
1843                cm = self.contextMode.getIndex()
1844                ct = self.cmaps[L][bt<<6|cm]
1845                char = self.verboseRead(
1846                    self.prefixCodes[L][ct],
1847                    context='{},{}='.format(bt,cm))
1848                self.contextMode.add(char)
1849                self.output.append(char)
1850            blockLen += litLen
1851            #check if we're done
1852            if blockLen>=self.MLEN: return
1853            #distance
1854            #distances are computed relative to output length, at most window size
1855            maxDistance = min(len(self.output), self.windowSize)
1856            if dist0Flag:
1857                distance = self.lastDistances[-1]
1858            else:
1859                bt = self.figureBlockType(D)
1860                cm = {2:0, 3:1, 4:2}.get(copyLen, 3)
1861                ct = self.cmaps[D][bt<<2|cm]
1862                index, offset = self.verboseRead(
1863                    self.prefixCodes[D][ct],
1864                    context='{},{}='.format(bt,cm))
1865                distance = self.lastDistances[-index]+offset if index else offset
1866                if index==1 and offset==0:
1867                    #to make sure distance is not put in last distance list
1868                    dist0Flag = True
1869            if distance<=maxDistance:
1870                #copy from output
1871                for i in range(
1872                        maxDistance-distance,
1873                        maxDistance-distance+copyLen):
1874                    self.output.append(self.output[i])
1875                if not dist0Flag: self.lastDistances.append(distance)
1876                comment = 'Seen before'
1877            else:
1878                #fetch from wordlist
1879                newWord = wordList.word(copyLen, distance-maxDistance-1)
1880                self.output.extend(newWord)
1881                #adjust copyLen to reflect actual new data
1882                copyLen = len(newWord)
1883                comment = 'From wordlist'
1884            blockLen += copyLen
1885            print(' '*40,
1886                comment,
1887                ': "',
1888                outputFormatter(self.output[-copyLen:]),
1889                '"',
1890                sep='')
1891            self.contextMode.add(self.output[-2])
1892            self.contextMode.add(self.output[-1])
1893
1894    def figureBlockType(self, kind):
1895        counts, types = self.currentBlockCounts, self.currentBlockTypes
1896        if counts[kind]==0:
1897            newType = self.verboseRead(self.blockTypeCodes[kind])
1898            if newType==-2: newType = types['P'+kind]
1899            elif newType==-1:
1900                newType = (types[kind]+1)%self.numberOfBlockTypes[kind]
1901            types['P'+kind] = types[kind]
1902            types[kind] = newType
1903            counts[kind] = self.verboseRead(self.blockCountCodes[kind])
1904        counts[kind] -=1
1905        return types[kind]
1906
1907__test__ = {
1908'BitStream': """
1909    >>> bs = BitStream(b'Jurjen')
1910    >>> bs.readBytes(2)
1911    b'Ju'
1912    >>> bs.read(6) #r=01110010
1913    50
1914    >>> bs
1915    BitStream(pos=2:6)
1916    >>> bs.peek(5)  #j=01101010
1917    9
1918    >>> bs.readBytes(2)
1919    Traceback (most recent call last):
1920        ...
1921    ValueError: readBytes: need byte boundary
1922    """,
1923
1924'Symbol': """
1925    >>> a=Symbol(MetablockLengthAlphabet(),5)
1926    >>> len(a)
1927    2
1928    >>> int(a)
1929    5
1930    >>> a.bitPattern()
1931    '01'
1932    >>> a.value(200000)
1933    200001
1934    >>> a.explanation(300000)
1935    'data length: 493e0h+1=300001'
1936    """,
1937
1938'RangeDecoder': """
1939    >>> a=RangeDecoder(bitLength=3)
1940    >>> len(a)
1941    8
1942    >>> a.name='t'
1943    >>> list(a)
1944    [Symbol(t, 0), Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4), Symbol(t, 5), Symbol(t, 6), Symbol(t, 7)]
1945    >>> a[2]
1946    Symbol(t, 2)
1947    >>> a.bitPattern(4)
1948    '100'
1949    >>> a.length(2)
1950    3
1951    >>> a.decodePeek(15)
1952    (3, Symbol(t, 7))
1953    >>>
1954
1955    """,
1956
1957'PrefixDecoder': """
1958    >>> a=PrefixDecoder(decodeTable={0:1,1:2,3:3,7:4})
1959    >>> len(a)
1960    4
1961    >>> a.name='t'
1962    >>> list(a)
1963    [Symbol(t, 1), Symbol(t, 2), Symbol(t, 3), Symbol(t, 4)]
1964    >>> a.decodePeek(22)
1965    (1, Symbol(t, 1))
1966    >>> a.decodePeek(27)
1967    (3, Symbol(t, 3))
1968    >>> a.length(1)
1969    1
1970    >>> a.length(4)
1971    3
1972    """,
1973
1974'Code': """
1975    >>> a=Code('t',alphabetSize=10)
1976    >>> len(a)
1977    10
1978    >>> a.showCode()
1979    0000:0 0001:1 0010:2 0011:3 0100:4 0101:5 0110:6 0111:7 1000:8 1001:9
1980    >>> a.setLength({2:1,3:2,5:3,6:3})
1981    >>> a.showCode()
1982      0:2  01:3 011:5 111:6
1983    >>> len(a)
1984    4
1985    >>> def callback(i): return 'call{}back'.format(i)
1986    >>> a=Code('t',callback=callback,bitLength=3)
1987    >>> a[6].explanation()
1988    'call6back'
1989    """,
1990
1991'WithExtra': """
1992    >>> class A(WithExtra):
1993    ...    extraTable = [0,1,1,2,2]
1994    >>> a=A('t',alphabetSize=5)
1995    >>> a[1]
1996    Symbol(t, 1)
1997    >>> a.extraBits(2)
1998    1
1999    >>> a.mnemonic(4)
2000    '4'
2001    >>> a.readTupleAndExtra(BitStream(b'\x5b'))
2002    (3, Symbol(t, 3), 2, 3)
2003    """,
2004
2005'BoolCode': """
2006    >>> BoolCode('test')[0].explanation()
2007    '0: False'
2008    """,
2009
2010'Enumerator': """
2011    >>> class A(Enumerator):
2012    ...    extraTable = [0,1,1,2,2]
2013    ...    value0=3
2014    >>> a=A(alphabetLength=5)
2015    >>> a.value(3)
2016    Traceback (most recent call last):
2017        ...
2018    TypeError: value() missing 1 required positional argument: 'extra'
2019    >>> a.explanation(3,4)
2020    'xx 011: 8-11; 8+4=12'
2021    """,
2022
2023'WindowSizeAlphabet': """
2024    >>> windowSizeAlphabet = WindowSizeAlphabet()
2025    >>> windowSizeAlphabet[0]
2026    Traceback (most recent call last):
2027        ...
2028    ValueError: No symbol WindowSizeAlphabet[0]
2029    >>> len(windowSizeAlphabet)
2030    16
2031    >>> windowSizeAlphabet[21]
2032    Symbol(WSIZE, 21)
2033    >>> windowSizeAlphabet[21].bitPattern()
2034    '1001'
2035    >>> windowSizeAlphabet[21].extraBits()
2036    0
2037    >>> windowSizeAlphabet[21].index
2038    21
2039    >>> windowSizeAlphabet[10].value()
2040    1008
2041    >>> windowSizeAlphabet[10].explanation()
2042    'windowsize=(1<<10)-16=1008'
2043    >>> windowSizeAlphabet.showCode()
2044          0:65520    1100001:16368    1110001:32752       0011:262128
2045    0000001:131056   0010001:None        1001:2097136     1011:4194288
2046    1000001:4080     1010001:8176        0101:524272      0111:1048560
2047    0100001:1008     0110001:2032        1101:8388592     1111:16777200
2048    """,
2049
2050'TypeCountAlphabet': """
2051    >>> typeCountAlphabet = TypeCountAlphabet(description='bananas')
2052    >>> len(typeCountAlphabet)
2053    9
2054    >>> typeCountAlphabet[3]
2055    Symbol(BT#, 3)
2056    >>> typeCountAlphabet[9]
2057    Traceback (most recent call last):
2058        ...
2059    ValueError: No symbol TypeCountAlphabet[9]
2060    >>> print(typeCountAlphabet[3])
2061    xx,0101
2062    >>> typeCountAlphabet[8].value(127)
2063    256
2064    >>> typeCountAlphabet[4].explanation(2)
2065    'xxx,0111: 11 bananas'
2066    >>> typeCountAlphabet[0].explanation()
2067    '0: 1 banana'
2068    """,
2069
2070'DistanceParamAlphabet': """
2071    >>> dpa = DistanceParamAlphabet()
2072    >>> dpa.showCode()
2073    00:PF0 01:PF1 10:PF2 11:PF3
2074    >>> dpa.readTupleAndExtra(BitStream(b'\\x29'))
2075    (2, Symbol(DIST, 1), 4, 10)
2076    >>> dpa.explanation(2, 5)
2077    '2 postfix bits and 0101<<2=20 direct codes'
2078    """,
2079
2080'LiteralAlphabet': """
2081    >>> LiteralAlphabet(-1).showCode()   #doctest: +ELLIPSIS
2082    00000000:\\x00 00110100:4    01101000:h    10011100:\\x9c 11010000:\\xd0
2083    00000001:\\x01 00110101:5    01101001:i    10011101:\\x9d 11010001:\\xd1
2084    00000010:\\x02 00110110:6    01101010:j    10011110:\\x9e 11010010:\\xd2
2085    ...
2086    00101111:/    01100011:c    10010111:\\x97 11001011:\\xcb 11111111:\\xff
2087    00110000:0    01100100:d    10011000:\\x98 11001100:\\xcc
2088    00110001:1    01100101:e    10011001:\\x99 11001101:\\xcd
2089    00110010:2    01100110:f    10011010:\\x9a 11001110:\\xce
2090    00110011:3    01100111:g    10011011:\\x9b 11001111:\\xcf
2091    """,
2092
2093'BlockCountAlphabet': """
2094    >>> bc=BlockCountAlphabet('BCL')
2095    >>> len(bc)
2096    26
2097    >>> bs=BitStream(b'\\x40\\x83\\xc8\\x59\\12\\x02')
2098    >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
2099    'Block count: xx 00000: 1-4; 1+2=3'
2100    >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
2101    'Block count: xxx 00110: 33-40; 33+0=33'
2102    >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
2103    'Block count: xxxxxx 10001: 305-368; 305+28=333'
2104    >>> x = bc.readTupleAndExtra(bs); x[1].explanation(x[3])
2105    'Block count: xxxxxxxxxxx 10110: 2289-4336; 2289+1044=3333'
2106    """,
2107
2108'Layout': """
2109    >>> olleke.pos = 0
2110    >>> l = Layout(olleke)
2111    >>> l.verboseRead(WindowSizeAlphabet())
2112    0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
2113    4194288
2114    >>> l.verboseRead(BoolCode('LAST', description="Last block"))
2115                              1     LAST    Last block: 1: True
2116    True
2117    >>> l.verboseRead(BoolCode('EMPTY', description="Empty block"))
2118                             0      EMPTY   Empty block: 0: False
2119    False
2120    >>> l.verboseRead(MetablockLengthAlphabet())
2121    0001  2e 00        |00h|2Eh,|00 MLEN    data length: 002eh+1=47
2122    47
2123    >>> olleke.pos = 76
2124    >>> l = Layout(olleke)
2125    >>> x = l.verboseRead(DistanceAlphabet(0,NPOSTFIX=0,NDIRECT=0), skipExtra=True)
2126    000a  82                10|1100 D0      10[15*x]-3
2127    >>> x.explanation(0x86a3)
2128    '10[1000011010100011]-3: [0]+100000'
2129    """,
2130
2131'olleke': """
2132    >>> olleke.pos = 0
2133    >>> try: Layout(olleke).processStream()
2134    ... except NotImplementedError: pass
2135    ... #doctest: +REPORT_NDIFF
2136    addr  hex               binary context explanation
2137    -----------------------Stream header------------------------
2138    0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
2139    ======================Metablock header======================
2140                              1     LAST    Last block: 1: True
2141                             0      EMPTY   Empty block: 0: False
2142    0001  2e 00        |00h|2Eh,|00 MLEN    data length: 002eh+1=47
2143    -------------------Block type descriptors-------------------
2144    0003  00                      0 BT#L    0: 1 literal block type
2145                                 0  BT#I    0: 1 insert&copy block type
2146                                0   BT#D    0: 1 distance block type
2147    ------------------Distance code parameters------------------
2148    0004  44               0|000,00 DIST    0 postfix bits and 0000<<0=0 direct codes
2149    -----------------------Context modes------------------------
2150                         10         LC0     Context mode for type 0: 2(UTF8)
2151    ------------------------Context maps------------------------
2152                        0           LT#     0: 1 literal prefix tree
2153                       0            DT#     0: 1 distance prefix tree
2154    ---------------------Prefix code lists----------------------
2155                     10             PFX     L0 is complex with lengths 3,4,0,5,17...
2156    0005  4f                    1|0 ##L0    len 3: coded with 3 bits
2157                            0111    ##L0    len 4: coded with 1 bits
2158                          10        ##L0    unused: coded with 3 bits
2159    0006  d6                    0|0 ##L0    len 5: skipped
2160                             011    ##L0    zero xxx: coded with 2 bits
2161    ***** Lengths for L0 will be coded as:
2162      0:len 4     01:zero xxx 011:unused   111:len 3
2163    0007  95                1|11,01 #L0     7+3 unused
2164                           0        #L0     Length for \\n is 4 bits
2165                     001,01         #L0     1+3 unused
2166    0008  44                010,0|1 #L0     total 19+2 unused
2167                           0        #L0     Length for " " is 4 bits
2168                          0         #L0     Length for ! is 4 bits
2169    0009  cb                011,|01 #L0     3+3 unused
2170                     |110,01        #L0     total 35+6 unused
2171    000a  82                      0 #L0     Length for K is 4 bits
2172                            000,01  #L0     0+3 unused
2173                           0        #L0     Length for O is 4 bits
2174    000b  4d                   01|1 #L0     symbol P unused
2175                            011     #L0     symbol Q unused
2176                           0        #L0     Length for R is 4 bits
2177    000c  88                000,|01 #L0     0+3 unused
2178                     |100,01        #L0     total 11+4 unused
2179    000d  b6                      0 #L0     Length for b is 4 bits
2180                               011  #L0     symbol c unused
2181                            011     #L0     symbol d unused
2182    000e  27                   11|1 #L0     Length for e is 3 bits
2183                         010,01     #L0     2+3 unused
2184                       |0           #L0     Length for k is 4 bits
2185    000f  1f                    111 #L0     Length for l is 3 bits
2186                             011    #L0     symbol m unused
2187                            0       #L0     Length for n is 4 bits
2188                          |0        #L0     Length for o is 4 bits
2189    0010  c1                 000,01 #L0     0+3 unused
2190                            0       #L0     Length for s is 4 bits
2191    0011  b4                   0|11 #L0     symbol t unused
2192                              0     #L0     Length for u is 4 bits
2193    End of table. Prefix code L0:
2194     000:e   0010:\\n  0110:!   0001:O   0101:b   0011:n   0111:s
2195     100:l   1010:" " 1110:K   1001:R   1101:k   1011:o   1111:u
2196                         11,01      PFX     IC0 is simple with 4 code words
2197    0012  2a                |2Ah|10 IC0     ? bits: I5C4
2198    0013  b5 ec              00|B5h IC0     ? bits: I6+xC7
2199    0015  22            0010|111011 IC0     ? bits: I8+xC5
2200    0016  8c            001100|0010 IC0     ? bits: I0C14+xx
2201                       0            SHAPE   False: lengths 2,2,2,2
2202    0017  74                 10,0|1 PFX     D0 is simple with 3 code words
2203    0018  a6                0|01110 D0      1 bit: 2last-3
2204                      010011        D0      2 bits: 11xx-3
2205    0019  aa                01010|1 D0      2 bits: 11xxx-3
2206    ====================Meta block contents=====================
2207                       |1,01        IC0     Literal: 9, copy: 5
2208    001a  41                   0001 0,0=L0  O
2209                            100     0,48=L0 l
2210    001b  a2                   10|0 0,62=L0 l
2211                            000     0,63=L0 e
2212    001c  a1                  1|101 0,59=L0 k
2213                           000      0,63=L0 e
2214                      |1010         0,59=L0 " "
2215    001d  b5                   0101 0,11=L0 b
2216                          |1011     0,60=L0 o
2217    001e  24                      0 0,3=D0  Distance: 2last-3=8
2218                                            Seen before: "lleke"
2219                              0,10  IC0     Literal: 6, copy: 7
2220                         |0010      0,59=L0 \\n
2221    001f  89                   1001 0,7=L0  R
2222                            000     0,52=L0 e
2223    0020  fa                  010|1 0,58=L0 b
2224                          1111      0,63=L0 u
2225    0021  eb                  011|1 0,59=L0 s
2226                         11,01      0,3=D0  Absolute value: 12 (pos 8)
2227                                            Seen before: "olleke\\n"
2228    0022  db                 01,1|1 IC0     Literal: 0, copy: 15
2229                      |110,11       0,3=D0  Absolute value: 27 (pos 0)
2230                                            Seen before: "Olleke bolleke\\n"
2231    0023  f8                     00 IC0     Literal: 5, copy: 4
2232                             1110   0,7=L0  K
2233    0024  2c                  00|11 0,52=L0 n
2234                          1011      0,62=L0 o
2235    0025  0d                   1|00 0,59=L0 l
2236                           0110     0,63=L0 !
2237    """,
2238
2239'file': """
2240    >>> try: Layout(BitStream(
2241    ... open("H:/Downloads/brotli-master/tests/testdata/10x10y.compressed",'rb')
2242    ...     .read())).processStream()
2243    ... except NotImplementedError: pass
2244    addr  hex               binary context explanation
2245    -----------------------Stream header------------------------
2246    0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
2247    ======================Metablock header======================
2248                              1     LAST    Last block: 1: True
2249                             0      EMPTY   Empty block: 0: False
2250    0001  13 00        |00h|13h,|00 MLEN    data length: 0013h+1=20
2251    -------------------Block type descriptors-------------------
2252    0003  00                      0 BT#L    0: 1 literal block type
2253                                 0  BT#I    0: 1 insert&copy block type
2254                                0   BT#D    0: 1 distance block type
2255    ------------------Distance code parameters------------------
2256    0004  a4               0|000,00 DIST    0 postfix bits and 0000<<0=0 direct codes
2257    -----------------------Context modes------------------------
2258                         10         LC0     Context mode for type 0: 2(UTF8)
2259    ------------------------Context maps------------------------
2260                        0           LT#     0: 1 literal prefix tree
2261                       0            DT#     0: 1 distance prefix tree
2262    ---------------------Prefix code lists----------------------
2263    0005  b0                 0|1,01 PFX     L0 is simple with 2 code words
2264    0006  b2              0|1011000 L0      1 bit: X
2265    0007  ea              0|1011001 L0      1 bit: Y
2266                     01,01          PFX     IC0 is simple with 2 code words
2267    0008  81            0000001|111 IC0     1 bit: I1C9&D=0
2268    0009  47 02             0|47h|1 IC0     1 bit: I1C9
2269                       00,01        PFX     D0 is simple with 1 code word
2270    000b  8a                010|000 D0      0 bits: 10x-3
2271    ====================Meta block contents=====================
2272                           1        IC0     Literal: 1, copy: 9
2273                          0         0,0=L0  X
2274                      0,()          0,3=D0  Absolute value: 1 (pos 0)
2275                                            Seen before: "XXXXXXXXX"
2276                     0              IC0     Literal: 1, copy: 9, same distance
2277                   |1               0,54=L0 Y
2278                                            Seen before: "YYYYYYYYY"
2279    """,
2280
2281'XY': """
2282    >>> try: Layout(BitStream(brotli.compress('X'*10+'Y'*10))).processStream()
2283    ... except NotImplementedError: pass
2284    addr  hex               binary context explanation
2285    -----------------------Stream header------------------------
2286    0000  1b                   1011 WSIZE   windowsize=(1<<22)-16=4194288
2287    ======================Metablock header======================
2288                              1     LAST    Last block: 1: True
2289                             0      EMPTY   Empty block: 0: False
2290    0001  13 00        |00h|13h,|00 MLEN    data length: 0013h+1=20
2291    -------------------Block type descriptors-------------------
2292    0003  00                      0 BT#L    0: 1 literal block type
2293                                 0  BT#I    0: 1 insert&copy block type
2294                                0   BT#D    0: 1 distance block type
2295    ------------------Distance code parameters------------------
2296    0004  a4               0|000,00 DIST    0 postfix bits and 0000<<0=0 direct codes
2297    -----------------------Context modes------------------------
2298                         10         LC0     Context mode for type 0: 2(UTF8)
2299    ------------------------Context maps------------------------
2300                        0           LT#     0: 1 literal prefix tree
2301                       0            DT#     0: 1 distance prefix tree
2302    ---------------------Prefix code lists----------------------
2303    0005  b0                 0|1,01 PFX     L0 is simple with 2 code words
2304    0006  b2              0|1011000 L0      1 bit: X
2305    0007  82              0|1011001 L0      1 bit: Y
2306                     00,01          PFX     IC0 is simple with 1 code word
2307    0008  84            0000100|100 IC0     0 bits: I4C6&D=0
2308    0009  00                 00,0|1 PFX     D0 is simple with 1 code word
2309    000a  e0                0|00000 D0      0 bits: last
2310    ====================Meta block contents=====================
2311                          ()        IC0     Literal: 4, copy: 6, same distance
2312                         0          0,0=L0  X
2313                        0           0,52=L0 X
2314                       0            0,54=L0 X
2315                      0             0,54=L0 X
2316                                            Seen before: "XXXXXX"
2317                    ()              IC0     Literal: 4, copy: 6, same distance
2318                   1                0,54=L0 Y
2319                  1                 0,54=L0 Y
2320                |1                  0,54=L0 Y
2321    000b  01                      1 0,54=L0 Y
2322                                            Seen before: "YYYYYY"
2323    """,
2324
2325'empty': """
2326    >>> try: Layout(BitStream(b'\\x81\\x16\\x00\\x58')).processStream()
2327    ... except NotImplementedError: pass
2328    addr  hex               binary context explanation
2329    -----------------------Stream header------------------------
2330    0000  81                0000001 WSIZE   windowsize=(1<<17)-16=131056
2331    ======================Metablock header======================
2332                          |1        LAST    Last block: 1: True
2333    0001  16                      0 EMPTY   Empty block: 0: False
2334                                11  MLEN    11: empty block
2335                               0    RSVD    Reserved (must be zero)
2336    0002  00           000000|00,01 SKIP    skip length: 0h+1=1
2337                    |00             SKIP    2 bits ignored
2338    Skipping to 4
2339    """,
2340
2341}
2342
2343if __name__=='__main__':
2344    import sys
2345    if len(sys.argv)>1:
2346        l = Layout(BitStream(open(sys.argv[1],'rb').read()))
2347        l.processStream()
2348    else:
2349        sys.path.append("h:/Persoonlijk/bin")
2350        try:
2351            import brotli
2352            open('brotlidump.br', 'wb').write(
2353                brotli.compress(
2354                    open('brotlidump.py', 'r').read()
2355                ))
2356            olleke = BitStream(brotli.compress(
2357                'Olleke bolleke\nRebusolleke\nOlleke bolleke\nKnol!'))
2358        except ImportError: pass
2359        import doctest
2360        doctest.testmod(optionflags=doctest.REPORT_NDIFF
2361            #|doctest.FAIL_FAST
2362            )
2363