• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python3
2# Copyright 2019 The PDFium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5"""Encodes binary data using one or more PDF stream filters.
6
7This tool helps with the common task of converting binary data into ASCII PDF
8streams. In test PDFs (and the corresponding .in files), we often want the
9contents to be plain (or mostly plain) ASCII.
10
11Requires Python 3 (mainly for Ascii85 support). This should be fine for a
12manually-run script.
13"""
14
15import argparse
16import base64
17import collections
18import collections.abc
19import io
20import sys
21import zlib
22
23
class _PdfStream:
  """Base class for PDF stream filter encoders, and registry of filters.

  An instance wraps an output buffer and forwards write(), flush() and close()
  to it unchanged; subclasses override these to transform the data on its way
  through. The static methods maintain a class-level registry mapping filter
  names and aliases to the classes passed to RegisterFilter().
  """
  # Registered filter classes in registration order (used for the help text).
  _unique_filter_classes = []
  # Maps lower-cased "/name", "name" and every alias to its filter class.
  _filter_classes = {}

  @staticmethod
  def GetFilterByName(name):
    """Looks up a registered filter class.

    Args:
      name: The filter name, written as "/Name" or "Name", or one of the
        filter's aliases. Matching is case-insensitive.

    Returns:
      The filter class registered under `name`.

    Raises:
      KeyError: No filter is registered under `name`.
    """
    key_name = name.lower()
    # Drop the leading slash so "/Name" and "Name" share one lookup key.
    if key_name.startswith('/'):
      key_name = key_name[1:]

    filter_class = _PdfStream._filter_classes.get(key_name)
    if not filter_class:
      raise KeyError(name)

    return filter_class

  @staticmethod
  def RegisterFilter(filter_class):
    """Adds `filter_class` to the registry.

    Args:
      filter_class: A class with a `name` attribute starting with "/" and an
        `aliases` iterable of alternative names. It must not have been
        registered before.
    """
    assert filter_class not in _PdfStream._unique_filter_classes
    _PdfStream._unique_filter_classes.append(filter_class)

    assert filter_class.name[0] == '/'
    lower_name = filter_class.name.lower()
    # Register the name both with and without its leading slash.
    _PdfStream._filter_classes[lower_name] = filter_class
    _PdfStream._filter_classes[lower_name[1:]] = filter_class

    for alias in filter_class.aliases:
      _PdfStream._filter_classes[alias.lower()] = filter_class

  @staticmethod
  def GetHelp():
    """Returns help text listing each registered filter and its aliases."""
    lines = ['Available filters:\n']
    lines.extend(
        '  {} (aliases: {})\n'.format(filter_class.name,
                                      ', '.join(filter_class.aliases))
        for filter_class in _PdfStream._unique_filter_classes)
    return ''.join(lines)

  def __init__(self, out_buffer, **kwargs):
    """Wraps `out_buffer`; extra keyword arguments are accepted and ignored."""
    del kwargs
    self.buffer = out_buffer

  def write(self, data):
    """Writes `data` to the wrapped buffer."""
    self.buffer.write(data)

  def flush(self):
    """Flushes the wrapped buffer."""
    self.buffer.flush()

  def close(self):
    """Closes the wrapped buffer."""
    self.buffer.close()
74
75
class _SinkPdfStream(_PdfStream):
  """Terminal stream that collects everything written to it in memory."""

  def __init__(self):
    sink = io.BytesIO()
    super().__init__(sink)

  def close(self):
    """Flushes instead of closing, so the collected bytes stay available."""
    # io.BytesIO.close() would deallocate the written data, so only flush.
    self.flush()

  def getbuffer(self):
    """Returns the underlying io.BytesIO's getbuffer() view of the data."""
    view = self.buffer.getbuffer()
    return view
87
88
class _AsciiPdfStream(_PdfStream):
  """Stream that optionally breaks its output into lines of `wrapcol` bytes."""

  def __init__(self, out_buffer, wrapcol=0, **kwargs):
    super().__init__(out_buffer, **kwargs)
    self.wrapcol = wrapcol
    # Number of bytes already written on the current output line.
    self.column = 0

  def write(self, data):
    """Writes `data`, inserting a newline each time a line reaches wrapcol."""
    width = self.wrapcol
    if not width:
      # Wrapping is disabled; pass the data straight through.
      self.buffer.write(data)
      return

    total = len(data)
    # Space left on the current line before it reaches the wrap column.
    room = width - self.column
    self.buffer.write(data[:room])
    if total <= room:
      self.column += total
      return

    # Everything beyond the current line goes out in `width`-sized lines.
    for start in range(room, total, width):
      self.buffer.write(b'\n')
      self.buffer.write(data[start:start + width])

    # Length of the last line written, in the range 1..width (a full final
    # line counts as `width`, not 0).
    overflow = total - room
    self.column = (overflow - 1) % width + 1
113
114
class _Ascii85DecodePdfStream(_AsciiPdfStream):
  """Stream that Ascii85-encodes data for the /ASCII85Decode filter."""
  name = '/ASCII85Decode'
  aliases = ('ascii85', 'base85')

  def __init__(self, out_buffer, **kwargs):
    super().__init__(out_buffer, **kwargs)
    # Bytes held back until a complete 4-byte group is available.
    self.trailer = b''

  def write(self, data):
    """Encodes all complete 4-byte groups; keeps the remainder for later."""
    # Need to write ASCII85 in units of 4.
    data = self.trailer + data
    # Split at an explicit index. A negative slice such as data[:-n] is empty
    # when n == 0, which would hold back all of the data instead of none.
    whole_length = len(data) - len(data) % 4
    super().write(base64.a85encode(data[:whole_length]))
    self.trailer = data[whole_length:]

  def close(self):
    """Encodes the held-back bytes, writes the "~>" marker, and closes."""
    super().write(base64.a85encode(self.trailer))
    # Avoid breaking the end-of-data marker (but still try to wrap).
    if self.wrapcol and self.column > self.wrapcol - 2:
      self.buffer.write(b'\n')
    self.buffer.write(b'~>')
    self.buffer.close()
137
138
class _AsciiHexDecodePdfStream(_AsciiPdfStream):
  """Stream that base16-encodes data for the /ASCIIHexDecode filter."""
  name = '/ASCIIHexDecode'
  aliases = ('base16', 'hex', 'hexadecimal')

  def __init__(self, out_buffer, **kwargs):
    super().__init__(out_buffer, **kwargs)

  def write(self, data):
    """Writes the base16 encoding of `data` through the wrapping writer."""
    encoded = base64.b16encode(data)
    super().write(encoded)
148
149
class _FlateDecodePdfStream(_PdfStream):
  """Stream that zlib-compresses data for the /FlateDecode filter."""
  name = '/FlateDecode'
  aliases = ('deflate', 'flate', 'zlib')

  def __init__(self, out_buffer, **kwargs):
    super().__init__(out_buffer, **kwargs)
    self.deflate = zlib.compressobj(level=9, memLevel=9)

  def write(self, data):
    """Compresses `data` and writes whatever output the compressor returns."""
    compressed = self.deflate.compress(data)
    self.buffer.write(compressed)

  def flush(self):
    """Writes the result of a Z_NO_FLUSH flush of the compressor."""
    pending = self.deflate.flush(zlib.Z_NO_FLUSH)
    self.buffer.write(pending)

  def close(self):
    """Writes the compressor's final flush() output, then closes the buffer."""
    remainder = self.deflate.flush()
    self.buffer.write(remainder)
    self.buffer.close()
167
168
# Filters applied when no --filter option is given, in decoding order.
_DEFAULT_FILTERS = (_Ascii85DecodePdfStream, _FlateDecodePdfStream)

# Make every filter class selectable by name or alias. The registration order
# is also the order in which the filters are listed in the help text.
_PdfStream.RegisterFilter(_Ascii85DecodePdfStream)
_PdfStream.RegisterFilter(_AsciiHexDecodePdfStream)
_PdfStream.RegisterFilter(_FlateDecodePdfStream)
174
175
def _ParseCommandLine(argv):
  """Parses the command-line arguments.

  Args:
    argv: List of argument strings, without the program name.

  Returns:
    The argparse namespace with attributes `raw`, `length`, `wrap`, `filter`,
    `infile` and `outfile`. `filter` is _DEFAULT_FILTERS when no --filter
    option was given.

  Raises:
    SystemExit: Raised by argparse after printing a usage error, including
      for an unknown filter name or a negative --wrap value.
  """

  def _FilterByName(name):
    # argparse reports only ArgumentTypeError, TypeError and ValueError from a
    # type callable as usage errors; a KeyError would escape as a traceback.
    try:
      return _PdfStream.GetFilterByName(name)
    except KeyError:
      raise argparse.ArgumentTypeError(
          'unknown filter: {!r}'.format(name)) from None

  arg_parser = argparse.ArgumentParser(
      description='Encodes binary data using one or more PDF stream filters.',
      epilog=_PdfStream.GetHelp(),
      formatter_class=argparse.RawDescriptionHelpFormatter)
  arg_parser.add_argument(
      '-r',
      '--raw',
      action='store_true',
      help='output raw bytes (no PDF stream header or trailer)')
  arg_parser.add_argument(
      '-l',
      '--length',
      action='store_true',
      help='output actual /Length, instead of {{streamlen}}')
  arg_parser.add_argument(
      '-w',
      '--wrap',
      default=80,
      type=int,
      help='wrap ASCII lines at COLUMN; defaults to 80 (0 = off)',
      metavar='COLUMN')
  arg_parser.add_argument(
      '-f',
      '--filter',
      action='append',
      type=_FilterByName,
      help=('one or more filters, in decoding order; defaults to ' + ' '.join(
          [f.name for f in _DEFAULT_FILTERS])),
      metavar='NAME')
  arg_parser.add_argument(
      'infile',
      nargs='?',
      default=sys.stdin,
      type=argparse.FileType('r'),
      help='input file; use - for standard input (default)')
  arg_parser.add_argument(
      'outfile',
      nargs='?',
      default=sys.stdout,
      type=argparse.FileType('w'),
      help='output file; use - for standard output (default)')
  args = arg_parser.parse_intermixed_args(argv)
  args.filter = args.filter or _DEFAULT_FILTERS
  # Validate explicitly rather than with assert, which is stripped under -O.
  if args.wrap < 0:
    arg_parser.error('--wrap COLUMN must be non-negative')
  return args
222
223
def _WrapWithFilters(out_buffer, filter_classes, **kwargs):
  """Wraps `out_buffer` in one filter instance per class, in order.

  Args:
    out_buffer: The innermost stream, which receives the final output.
    filter_classes: Iterable of callables invoked as f(stream, **kwargs); each
      one wraps the result of the previous one.
    **kwargs: Keyword arguments passed to every filter constructor.

  Returns:
    The outermost stream, or `out_buffer` itself if there are no filters.
  """
  wrapped = out_buffer
  for make_filter in filter_classes:
    wrapped = make_filter(wrapped, **kwargs)
  return wrapped
228
229
def _CopyBytes(in_buffer, out_buffer):
  """Copies `in_buffer` to `out_buffer` until readinto() returns nothing.

  Args:
    in_buffer: Binary source that supports readinto().
    out_buffer: Destination that supports write().
  """
  chunk = bytearray(io.DEFAULT_BUFFER_SIZE)
  count = in_buffer.readinto(chunk)
  while count:
    # Only the first `count` bytes of the reusable chunk are fresh data.
    out_buffer.write(chunk[:count])
    count = in_buffer.readinto(chunk)
237
238
def _WritePdfStreamObject(out_buffer,
                          data,
                          entries,
                          raw=False,
                          use_streamlen=False):
  """Writes `data` to `out_buffer`, by default as a PDF stream object.

  Args:
    out_buffer: Binary file-like object to write to.
    data: Bytes-like stream contents, written unmodified.
    entries: Mutable mapping of stream dictionary keys (without the leading
      slash) to values. Unless `raw` is set, a 'Length' entry holding
      len(data) is stored into it.
    raw: If true, writes only `data`, without the dictionary or the
      stream/endstream keywords.
    use_streamlen: If true, writes the {{streamlen}} placeholder in place of
      the /Length entry.
  """
  if not raw:
    out_buffer.write(b'<<\n')
    entries['Length'] = len(data)
    for k, v in entries.items():
      v = _EncodePdfValue(v)
      if k == 'Length' and use_streamlen:
        out_buffer.write(b'  {{streamlen}}\n')
      else:
        out_buffer.write('  /{} {}\n'.format(k, v).encode('ascii'))
    out_buffer.write(b'>>\nstream\n')

  out_buffer.write(data)

  if not raw:
    # Indexing a bytes-like object yields an int, so compare against the
    # newline's byte value; comparing with the str '\n' is always unequal.
    if data and data[-1] != ord('\n'):
      out_buffer.write(b'\n')
    out_buffer.write(b'endstream\n')
261
262
def _EncodePdfValue(value):
  """Encodes a Python value for output inside a PDF dictionary.

  Args:
    value: The value to encode.

  Returns:
    For a non-string sequence, a PDF array string such as '[a b c]' whose
    elements are encoded recursively and separated by spaces. Any other value,
    including str and bytes-like values, is returned unchanged.
  """
  # str and bytes are Sequences too, but they are scalars here; joining one
  # would split it into separate characters.
  if isinstance(value, (str, bytes, bytearray)):
    return value
  if isinstance(value, collections.abc.Sequence):
    # str() lets arrays hold numbers and nested arrays, not only strings.
    return '[' + ' '.join(str(_EncodePdfValue(item)) for item in value) + ']'
  return value
267
268
def main(argv):
  """Encodes the input file and writes the result to the output file.

  Args:
    argv: List of command-line argument strings, without the program name.

  Returns:
    None, which sys.exit() treats as a successful exit status.
  """
  args = _ParseCommandLine(argv)

  # Close the output file even if reading, encoding or writing fails.
  with args.outfile:
    encoded_sink = _SinkPdfStream()
    with args.infile:
      out_buffer = _WrapWithFilters(
          encoded_sink, args.filter, wrapcol=args.wrap)
      # The files are opened in text mode; .buffer gives the binary layer.
      _CopyBytes(args.infile.buffer, out_buffer)
      out_buffer.close()

    entries = collections.OrderedDict()
    entries['Filter'] = [f.name for f in args.filter]
    _WritePdfStreamObject(
        args.outfile.buffer,
        data=encoded_sink.getbuffer(),
        entries=entries,
        raw=args.raw,
        use_streamlen=not args.length)
287
288
if __name__ == '__main__':
  # Run only when executed as a script; main()'s result is the exit status.
  exit_status = main(sys.argv[1:])
  sys.exit(exit_status)
291