#!/usr/bin/env python3
# Copyright 2019 The PDFium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Encodes binary data using one or more PDF stream filters.

This tool helps with the common task of converting binary data into ASCII PDF
streams. In test PDFs (and the corresponding .in files), we often want the
contents to be plain (or mostly plain) ASCII.

Requires Python 3 (mainly for Ascii85 support). This should be fine for a
manually-run script.
"""

import argparse
import base64
import collections
import collections.abc
import io
import sys
import zlib


class _PdfStream:
    """Base class for write-only encoder streams, plus the filter registry.

    An instance forwards everything written to it to `out_buffer`. Subclasses
    that implement a PDF filter define `name` (the PDF filter name, starting
    with '/') and `aliases` (alternative command-line spellings) and are
    registered with RegisterFilter().
    """

    # Registered filter classes, in registration order (used for help text).
    _unique_filter_classes = []
    # Lower-cased '/name', 'name' and aliases -> filter class.
    _filter_classes = {}

    @staticmethod
    def GetFilterByName(name):
        """Returns the filter class registered for `name`.

        Tolerates any case-insensitive match for "/Name" or "Name", or an
        alias.

        Raises:
          KeyError: if no registered filter matches `name`.
        """
        key_name = name.lower()
        if key_name and key_name[0] == '/':
            # Drop the leading slash. (Slicing with [:1] would keep *only*
            # the slash, so that no "/Name" spelling could ever match.)
            key_name = key_name[1:]

        filter_class = _PdfStream._filter_classes.get(key_name)
        if not filter_class:
            raise KeyError(name)

        return filter_class

    @staticmethod
    def RegisterFilter(filter_class):
        """Registers `filter_class` under its name (with/without '/') and aliases."""
        assert filter_class not in _PdfStream._unique_filter_classes
        _PdfStream._unique_filter_classes.append(filter_class)

        assert filter_class.name[0] == '/'
        lower_name = filter_class.name.lower()
        _PdfStream._filter_classes[lower_name] = filter_class
        _PdfStream._filter_classes[lower_name[1:]] = filter_class

        for alias in filter_class.aliases:
            _PdfStream._filter_classes[alias.lower()] = filter_class

    @staticmethod
    def GetHelp():
        """Returns a human-readable list of registered filters and aliases."""
        lines = ['Available filters:\n']
        lines.extend(
            ' {} (aliases: {})\n'.format(filter_class.name,
                                         ', '.join(filter_class.aliases))
            for filter_class in _PdfStream._unique_filter_classes)
        return ''.join(lines)

    def __init__(self, out_buffer, **kwargs):
        # Extra keyword arguments (e.g. wrapcol) are meant for other stream
        # classes in the chain and are deliberately ignored here.
        del kwargs
        self.buffer = out_buffer

    def write(self, data):
        self.buffer.write(data)

    def flush(self):
        self.buffer.flush()

    def close(self):
        self.buffer.close()


class _SinkPdfStream(_PdfStream):
    """Terminal stream that collects everything written into memory."""

    def __init__(self):
        super().__init__(io.BytesIO())

    def close(self):
        # Don't call io.BytesIO.close(); this deallocates the written data.
        self.flush()

    def getbuffer(self):
        """Returns a view of the bytes collected so far."""
        return self.buffer.getbuffer()


class _AsciiPdfStream(_PdfStream):
    """Stream for ASCII output that optionally wraps lines at `wrapcol`.

    A `wrapcol` of 0 disables wrapping. `column` tracks how many characters
    have been written on the current output line.
    """

    def __init__(self, out_buffer, wrapcol=0, **kwargs):
        super().__init__(out_buffer, **kwargs)
        self.wrapcol = wrapcol
        self.column = 0

    def write(self, data):
        if not self.wrapcol:
            self.buffer.write(data)
            return

        # Fill up whatever room is left on the current line first.
        tail = self.wrapcol - self.column
        self.buffer.write(data[:tail])
        if tail >= len(data):
            self.column += len(data)
            return

        # Everything else goes out as newline-prefixed lines of `wrapcol`.
        for start in range(tail, len(data), self.wrapcol):
            self.buffer.write(b'\n')
            self.buffer.write(data[start:start + self.wrapcol])

        # Length of the last line written: in 1..wrapcol (a completely full
        # last line yields wrapcol, not 0).
        remaining = len(data) - tail
        self.column = self.wrapcol - (-remaining % self.wrapcol)


class _Ascii85DecodePdfStream(_AsciiPdfStream):
    """Encodes written data so that /ASCII85Decode recovers it."""

    name = '/ASCII85Decode'
    aliases = ('ascii85', 'base85')

    def __init__(self, out_buffer, **kwargs):
        super().__init__(out_buffer, **kwargs)
        # Input bytes (fewer than 4) held back until a full group is available.
        self.trailer = b''

    def write(self, data):
        # Need to write ASCII85 in units of 4.
        data = self.trailer + data
        # Use an explicit split index: slicing with a negative remainder
        # breaks when the remainder is 0 (data[:-0] is empty), which would
        # hold back *all* of the data instead of none of it.
        whole_length = len(data) - len(data) % 4
        super().write(base64.a85encode(data[:whole_length]))
        self.trailer = data[whole_length:]

    def close(self):
        super().write(base64.a85encode(self.trailer))
        # Avoid breaking the end-of-data marker (but still try to wrap).
        if self.wrapcol and self.column > self.wrapcol - 2:
            self.buffer.write(b'\n')
        self.buffer.write(b'~>')
        self.buffer.close()


class _AsciiHexDecodePdfStream(_AsciiPdfStream):
    """Encodes written data as upper-case hexadecimal for /ASCIIHexDecode."""

    name = '/ASCIIHexDecode'
    aliases = ('base16', 'hex', 'hexadecimal')

    def write(self, data):
        super().write(base64.b16encode(data))


class _FlateDecodePdfStream(_PdfStream):
    """Compresses written data with zlib so that /FlateDecode recovers it."""

    name = '/FlateDecode'
    aliases = ('deflate', 'flate', 'zlib')

    def __init__(self, out_buffer, **kwargs):
        super().__init__(out_buffer, **kwargs)
        self.deflate = zlib.compressobj(level=9, memLevel=9)

    def write(self, data):
        self.buffer.write(self.deflate.compress(data))

    def flush(self):
        self.buffer.write(self.deflate.flush(zlib.Z_NO_FLUSH))

    def close(self):
        # A default flush() finishes the compressed stream.
        self.buffer.write(self.deflate.flush())
        self.buffer.close()


_PdfStream.RegisterFilter(_Ascii85DecodePdfStream)
_PdfStream.RegisterFilter(_AsciiHexDecodePdfStream)
_PdfStream.RegisterFilter(_FlateDecodePdfStream)

_DEFAULT_FILTERS = (_Ascii85DecodePdfStream, _FlateDecodePdfStream)


def _FilterArgument(name):
    """argparse `type` callable: maps a filter name to its filter class.

    Converts the KeyError for an unknown name into an ArgumentTypeError so
    that argparse reports a usage error instead of a traceback.
    """
    try:
        return _PdfStream.GetFilterByName(name)
    except KeyError:
        raise argparse.ArgumentTypeError(
            'unknown filter {!r}'.format(name)) from None


def _ParseCommandLine(argv):
    """Parses `argv` (without the program name) into an argparse namespace.

    The returned namespace has `raw`, `length`, `wrap`, `filter` (a non-empty
    sequence of filter classes), `infile` and `outfile`. Invalid arguments
    are reported through argparse, which exits the process.
    """
    arg_parser = argparse.ArgumentParser(
        description='Encodes binary data using one or more PDF stream filters.',
        epilog=_PdfStream.GetHelp(),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_parser.add_argument(
        '-r',
        '--raw',
        action='store_true',
        help='output raw bytes (no PDF stream header or trailer)')
    arg_parser.add_argument(
        '-l',
        '--length',
        action='store_true',
        help='output actual /Length, instead of {{streamlen}}')
    arg_parser.add_argument(
        '-w',
        '--wrap',
        default=80,
        type=int,
        help='wrap ASCII lines at COLUMN; defaults to 80 (0 = off)',
        metavar='COLUMN')
    arg_parser.add_argument(
        '-f',
        '--filter',
        action='append',
        type=_FilterArgument,
        help=('one or more filters, in decoding order; defaults to ' + ' '.join(
            [f.name for f in _DEFAULT_FILTERS])),
        metavar='NAME')
    # The files are opened in text mode; the underlying binary `.buffer` is
    # what actually gets read and written (see main()).
    arg_parser.add_argument(
        'infile',
        nargs='?',
        default=sys.stdin,
        type=argparse.FileType('r'),
        help='input file; use - for standard input (default)')
    arg_parser.add_argument(
        'outfile',
        nargs='?',
        default=sys.stdout,
        type=argparse.FileType('w'),
        help='output file; use - for standard output (default)')
    args = arg_parser.parse_intermixed_args(argv)
    args.filter = args.filter or _DEFAULT_FILTERS
    # Not an assert: user input must still be validated under `python -O`.
    if args.wrap < 0:
        arg_parser.error('--wrap COLUMN must be non-negative')
    return args


def _WrapWithFilters(out_buffer, filter_classes, **kwargs):
    """Wraps `out_buffer` in each filter in turn; returns the outermost stream.

    The last class in `filter_classes` sees the written data first.
    """
    for filter_class in filter_classes:
        out_buffer = filter_class(out_buffer, **kwargs)
    return out_buffer


def _CopyBytes(in_buffer, out_buffer):
    """Copies `in_buffer` to `out_buffer` in chunks until end of input."""
    data = bytearray(io.DEFAULT_BUFFER_SIZE)
    while True:
        data_length = in_buffer.readinto(data)
        if not data_length:
            return
        out_buffer.write(data[:data_length])


def _WritePdfStreamObject(out_buffer,
                          data,
                          entries,
                          raw=False,
                          use_streamlen=False):
    """Writes `data` to `out_buffer`, framed as a PDF stream unless `raw`.

    Args:
      out_buffer: binary file-like object to write to.
      data: bytes-like encoded stream contents.
      entries: mapping of stream dictionary keys (without '/') to values; a
        'Length' entry is added to it when `raw` is false.
      raw: if true, write only `data`, with no dictionary or keywords.
      use_streamlen: if true, write the {{streamlen}} template token in place
        of the /Length entry.
    """
    if not raw:
        out_buffer.write(b'<<\n')
        entries['Length'] = len(data)
        for k, v in entries.items():
            v = _EncodePdfValue(v)
            if k == 'Length' and use_streamlen:
                out_buffer.write(b' {{streamlen}}\n')
            else:
                out_buffer.write(' /{} {}\n'.format(k, v).encode('ascii'))
        out_buffer.write(b'>>\nstream\n')

    out_buffer.write(data)

    if not raw:
        # Indexing bytes-like data yields an int, which never equals the str
        # '\n'; compare the last byte as bytes so the check actually works.
        if data and bytes(data[-1:]) != b'\n':
            out_buffer.write(b'\n')
        out_buffer.write(b'endstream\n')


def _EncodePdfValue(value):
    """Returns `value` formatted for a PDF dictionary.

    A sequence of strings becomes a PDF array ('[a b c]'). Any other value,
    including a single string, is returned unchanged.
    """
    # str/bytes are Sequences too, but must not be split into characters.
    if (isinstance(value, collections.abc.Sequence) and
            not isinstance(value, (str, bytes, bytearray))):
        value = '[' + ' '.join(value) + ']'
    return value


def main(argv):
    """Encodes the input file and writes the PDF stream to the output file.

    Args:
      argv: command-line arguments, without the program name.

    Returns:
      None, which sys.exit() treats as success.
    """
    args = _ParseCommandLine(argv)

    encoded_sink = _SinkPdfStream()
    with args.infile:
        out_buffer = _WrapWithFilters(
            encoded_sink, args.filter, wrapcol=args.wrap)
        _CopyBytes(args.infile.buffer, out_buffer)
        out_buffer.close()

    entries = collections.OrderedDict()
    entries['Filter'] = [f.name for f in args.filter]
    # Close the output even if writing fails part-way.
    with args.outfile:
        _WritePdfStreamObject(
            args.outfile.buffer,
            data=encoded_sink.getbuffer(),
            entries=entries,
            raw=args.raw,
            use_streamlen=not args.length)
    return None


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))