#!/usr/bin/env python
# Copyright 2014 The PDFium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Expands a hand-written PDF testcase (template) into a valid PDF file.

There are several places in a PDF file where byte-offsets are required. This
script replaces {{name}}-style variables in the input with calculated results:

  {{include path/to/file}} - inserts file's contents into stream.
  {{header}} - expands to the header comment required for PDF files.
  {{xref}} - expands to a generated xref table, noting the offset.
  {{trailer}} - expands to a standard trailer with "1 0 R" as the /Root.
  {{startxref}} - expands to a startxref directive followed by correct offset.
  {{object x y}} - expands to |x y obj| declaration, noting the offset.
  {{streamlen}} - expands to |/Length n|.
"""

import io
import optparse
import os
import re
import sys


class StreamLenState:
  """States of the scanner that measures the body of a {{streamlen}} stream."""
  # No {{streamlen}} is pending.
  START = 1
  # A {{streamlen}} was seen; waiting for the 'stream' keyword line.
  FIND_STREAM = 2
  # Inside the stream body; accumulating bytes until 'endstream'.
  FIND_ENDSTREAM = 3


class TemplateProcessor:
  """Expands template tokens line by line while tracking byte offsets.

  Usage is two-pass: feed every line to preprocess_line() first (this measures
  the stream bodies that follow each {{streamlen}}), then feed the same lines,
  in the same order, to process_line() and write out what it returns.

  All lines, tokens and replacements are byte strings, because the offsets
  written into the xref table are byte offsets into the output file.
  """

  HEADER_TOKEN = b'{{header}}'
  HEADER_REPLACEMENT = b'%PDF-1.7\n%\xa0\xf2\xa4\xf4'

  XREF_TOKEN = b'{{xref}}'
  XREF_REPLACEMENT = b'xref\n%d %d\n'

  XREF_REPLACEMENT_N = b'%010d %05d n \n'
  XREF_REPLACEMENT_F = b'0000000000 65535 f \n'
  # XREF rows must be exactly 20 bytes - space required.
  assert len(XREF_REPLACEMENT_F) == 20

  TRAILER_TOKEN = b'{{trailer}}'
  TRAILER_REPLACEMENT = b'trailer <<\n /Root 1 0 R\n /Size %d\n>>'

  STARTXREF_TOKEN = b'{{startxref}}'
  STARTXREF_REPLACEMENT = b'startxref\n%d'

  OBJECT_PATTERN = br'\{\{object\s+(\d+)\s+(\d+)\}\}'
  OBJECT_REPLACEMENT = br'\1 \2 obj'

  STREAMLEN_TOKEN = b'{{streamlen}}'
  STREAMLEN_REPLACEMENT = b'/Length %d'

  def __init__(self):
    self.streamlen_state = StreamLenState.START
    # Measured stream lengths, in the order their {{streamlen}} tokens appear.
    self.streamlens = []
    # Number of output bytes produced so far by process_line().
    self.offset = 0
    # Output offset at which the xref table was emitted.
    self.xref_offset = 0
    self.max_object_number = 0
    # Maps object number -> (byte offset, generation number).
    self.objects = {}

  def insert_xref_entry(self, object_number, generation_number):
    """Records that |object_number| starts at the current output offset."""
    self.objects[object_number] = (self.offset, generation_number)
    self.max_object_number = max(self.max_object_number, object_number)

  def generate_xref_table(self):
    """Returns the xref section covering objects 0..max_object_number.

    Object numbers that were never declared get a free ('f') row; declared
    ones get an in-use ('n') row with their recorded offset and generation.
    """
    rows = [self.XREF_REPLACEMENT % (0, self.max_object_number + 1)]
    for number in range(0, self.max_object_number + 1):
      if number in self.objects:
        rows.append(self.XREF_REPLACEMENT_N % self.objects[number])
      else:
        rows.append(self.XREF_REPLACEMENT_F)
    return b''.join(rows)

  def preprocess_line(self, line):
    """First pass: measures the stream body following each {{streamlen}}.

    The length counted is every byte of every line strictly between the
    'stream' line and the 'endstream' line, line terminators included.
    """
    if self.STREAMLEN_TOKEN in line:
      # A second {{streamlen}} before the previous stream ended is a
      # template error.
      assert self.streamlen_state == StreamLenState.START
      self.streamlen_state = StreamLenState.FIND_STREAM
      self.streamlens.append(0)
      return

    if (self.streamlen_state == StreamLenState.FIND_STREAM and
        line.rstrip() == b'stream'):
      self.streamlen_state = StreamLenState.FIND_ENDSTREAM
      return

    if self.streamlen_state == StreamLenState.FIND_ENDSTREAM:
      if line.rstrip() == b'endstream':
        self.streamlen_state = StreamLenState.START
      else:
        self.streamlens[-1] += len(line)

  def process_line(self, line):
    """Second pass: returns |line| with its tokens expanded.

    Also advances self.offset by the length of the returned line, so lines
    must be processed in output order.
    """
    if self.HEADER_TOKEN in line:
      line = line.replace(self.HEADER_TOKEN, self.HEADER_REPLACEMENT)
    if self.STREAMLEN_TOKEN in line:
      sub = self.STREAMLEN_REPLACEMENT % self.streamlens.pop(0)
      # Plain replacement: the token is a literal, not a regular expression.
      line = line.replace(self.STREAMLEN_TOKEN, sub)
    if self.XREF_TOKEN in line:
      self.xref_offset = self.offset
      # The generated table replaces the whole line, not just the token.
      line = self.generate_xref_table()
    if self.TRAILER_TOKEN in line:
      replacement = self.TRAILER_REPLACEMENT % (self.max_object_number + 1)
      line = line.replace(self.TRAILER_TOKEN, replacement)
    if self.STARTXREF_TOKEN in line:
      replacement = self.STARTXREF_REPLACEMENT % self.xref_offset
      line = line.replace(self.STARTXREF_TOKEN, replacement)
    # Object declarations are only recognised at the very start of a line.
    match = re.match(self.OBJECT_PATTERN, line)
    if match:
      self.insert_xref_entry(int(match.group(1)), int(match.group(2)))
      line = re.sub(self.OBJECT_PATTERN, self.OBJECT_REPLACEMENT, line)
    self.offset += len(line)
    return line


def expand_file(infile, output_path):
  """Expands the template read from |infile| and writes it to |output_path|.

  Args:
    infile: iterable of byte-string lines (e.g. a binary file or io.BytesIO)
        with all {{include}} directives already resolved.
    output_path: path of the PDF file to create.

  An IOError while writing is reported on stderr and not re-raised.
  """
  processor = TemplateProcessor()
  try:
    with open(output_path, 'wb') as outfile:
      # Buffer the input so that it can be replayed for the second pass.
      preprocessed = io.BytesIO()
      for line in infile:
        preprocessed.write(line)
        processor.preprocess_line(line)
      preprocessed.seek(0)
      for line in preprocessed:
        outfile.write(processor.process_line(line))
  except IOError:
    # The only file opened here is the output, so that is what gets named.
    sys.stderr.write('failed to process %s\n' % output_path)


def insert_includes(input_path, output_file, visited_set):
  """Copies |input_path| to |output_file|, expanding {{include}} lines.

  Args:
    input_path: path of the template (or included file) to read.
    output_file: binary file-like object receiving the expanded lines.
    visited_set: set of normalized paths currently being expanded; used to
        detect and skip circular inclusion.

  Raises:
    IOError: if |input_path| or a file it includes cannot be read.
  """
  input_path = os.path.normpath(input_path)
  if input_path in visited_set:
    sys.stderr.write('Circular inclusion %s, ignoring\n' % input_path)
    return
  visited_set.add(input_path)
  try:
    with open(input_path, 'rb') as infile:
      for line in infile:
        match = re.match(br'\s*\{\{include\s+(.+)\}\}', line)
        if match:
          included = match.group(1)
          if not isinstance(included, str):
            # Lines are bytes but paths are text on Python 3.
            included = included.decode('utf-8')
          # Included paths are relative to the including file.
          insert_includes(
              os.path.join(os.path.dirname(input_path), included),
              output_file, visited_set)
        else:
          output_file.write(line)
  except IOError:
    sys.stderr.write('failed to include %s\n' % input_path)
    raise
  visited_set.discard(input_path)


def main():
  """Expands each template named on the command line into a .pdf file.

  The output is written next to the template unless --output-dir is given.
  """
  parser = optparse.OptionParser()
  parser.add_option('--output-dir', default='')
  options, args = parser.parse_args()
  for testcase_path in args:
    testcase_filename = os.path.basename(testcase_path)
    testcase_root, _ = os.path.splitext(testcase_filename)
    output_dir = os.path.dirname(testcase_path)
    if options.output_dir:
      output_dir = options.output_dir
    intermediate_stream = io.BytesIO()
    insert_includes(testcase_path, intermediate_stream, set())
    intermediate_stream.seek(0)
    output_path = os.path.join(output_dir, testcase_root + '.pdf')
    expand_file(intermediate_stream, output_path)
  return 0


if __name__ == '__main__':
  sys.exit(main())