#!/usr/bin/env python3
# Copyright 2014 The PDFium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Expands a hand-written PDF testcase (template) into a valid PDF file.

There are several places in a PDF file where byte-offsets are required. This
script replaces {{name}}-style variables in the input with calculated results:

  {{include path/to/file}} - inserts file's contents into stream.
  {{header}} - expands to the header comment required for PDF files.
  {{xref}} - expands to a generated xref table, noting the offset.
  {{trailer}} - expands to a standard trailer with "1 0 R" as the /Root.
  {{trailersize}} - expands to `/Size n`, to be used in non-standard trailers.
  {{startxref}} - expands to a startxref directive followed by correct offset.
  {{startxrefobj x y}} - expands to a startxref directive followed by correct
                         offset pointing to the start of `x y obj`.
  {{object x y}} - expands to `x y obj` declaration, noting the offset.
  {{streamlen}} - expands to `/Length n`.
"""

import io
import optparse
import os
import re
import sys

# Line Endings.
WINDOWS_LINE_ENDING = b'\r\n'
UNIX_LINE_ENDING = b'\n'

# List of extensions whose line endings should be modified after parsing.
EXTENSION_OVERRIDE_LINE_ENDINGS = [
    '.js',
    '.fragment',
    '.in',
    '.xml',
]


class StreamLenState:
  """States of the scanner that measures the stream following {{streamlen}}."""
  # Not currently measuring a stream; a {{streamlen}} token may appear.
  START = 1
  # Saw {{streamlen}}; waiting for the `stream` keyword line.
  FIND_STREAM = 2
  # Inside the stream body; counting bytes until the `endstream` line.
  FIND_ENDSTREAM = 3


class TemplateProcessor:
  """Expands template tokens line by line while tracking byte offsets.

  Usage is two passes over the same lines: feed every line to
  preprocess_line() first (this measures stream lengths), then feed every
  line to process_line() and emit what it returns.
  """

  HEADER_TOKEN = b'{{header}}'
  HEADER_REPLACEMENT = b'%PDF-1.7\n%\xa0\xf2\xa4\xf4'

  XREF_TOKEN = b'{{xref}}'
  XREF_REPLACEMENT = b'xref\n%d %d\n'

  XREF_REPLACEMENT_N = b'%010d %05d n \n'
  XREF_REPLACEMENT_F = b'0000000000 65535 f \n'
  # XREF rows must be exactly 20 bytes - space required.
  assert len(XREF_REPLACEMENT_F) == 20

  TRAILER_TOKEN = b'{{trailer}}'
  TRAILER_REPLACEMENT = b'trailer <<\n /Root 1 0 R\n /Size %d\n>>'

  TRAILERSIZE_TOKEN = b'{{trailersize}}'
  TRAILERSIZE_REPLACEMENT = b'/Size %d'

  STARTXREF_TOKEN = b'{{startxref}}'
  STARTXREF_REPLACEMENT = b'startxref\n%d'

  # Regex patterns/templates are raw literals so that `\{`, `\s`, `\d` and
  # `\g` reach the re module untouched instead of being (invalid) string
  # escapes.
  STARTXREFOBJ_PATTERN = rb'\{\{startxrefobj\s+(\d+)\s+(\d+)\}\}'

  OBJECT_PATTERN = rb'\{\{object\s+(\d+)\s+(\d+)\}\}'
  OBJECT_REPLACEMENT = rb'\g<1> \g<2> obj'

  STREAMLEN_TOKEN = b'{{streamlen}}'
  STREAMLEN_REPLACEMENT = b'/Length %d'

  def __init__(self):
    self.streamlen_state = StreamLenState.START
    # Measured stream lengths, in the order their {{streamlen}} tokens appear.
    self.streamlens = []
    # Number of output bytes produced so far by process_line().
    self.offset = 0
    # Output offset at which the {{xref}} table was emitted.
    self.xref_offset = 0
    self.max_object_number = 0
    # Maps object number -> (byte offset, generation number).
    self.objects = {}

  def insert_xref_entry(self, object_number, generation_number):
    """Records that `object_number` starts at the current output offset."""
    self.objects[object_number] = (self.offset, generation_number)
    self.max_object_number = max(self.max_object_number, object_number)

  def generate_xref_table(self):
    """Returns the xref table as bytes, covering objects 0..max_object_number.

    Objects that were recorded get an in-use (`n`) row; every other number in
    the range gets the fixed free (`f`) row.
    """
    entry_count = self.max_object_number + 1
    rows = [self.XREF_REPLACEMENT % (0, entry_count)]
    for object_number in range(entry_count):
      if object_number in self.objects:
        rows.append(self.XREF_REPLACEMENT_N % self.objects[object_number])
      else:
        rows.append(self.XREF_REPLACEMENT_F)
    return b''.join(rows)

  def preprocess_line(self, line):
    """First pass: measures the stream that follows each {{streamlen}} token.

    The measured length is the total size of the lines strictly between the
    `stream` line and the `endstream` line.
    """
    if self.STREAMLEN_TOKEN in line:
      # A second {{streamlen}} before the previous stream ended is an error.
      assert self.streamlen_state == StreamLenState.START
      self.streamlen_state = StreamLenState.FIND_STREAM
      self.streamlens.append(0)
      return

    if (self.streamlen_state == StreamLenState.FIND_STREAM and
        line.rstrip() == b'stream'):
      self.streamlen_state = StreamLenState.FIND_ENDSTREAM
      return

    if self.streamlen_state == StreamLenState.FIND_ENDSTREAM:
      if line.rstrip() == b'endstream':
        self.streamlen_state = StreamLenState.START
      else:
        self.streamlens[-1] += len(line)

  def process_line(self, line):
    """Second pass: expands the tokens in `line` and returns the result.

    Also advances self.offset by the length of the returned bytes, so that
    offsets recorded for later lines refer to positions in the output.
    """
    if self.HEADER_TOKEN in line:
      line = line.replace(self.HEADER_TOKEN, self.HEADER_REPLACEMENT)
    if self.STREAMLEN_TOKEN in line:
      # Lengths were queued by preprocess_line() in order of appearance.
      sub = self.STREAMLEN_REPLACEMENT % self.streamlens.pop(0)
      # The token is a literal, not a regex, so use a plain replace.
      line = line.replace(self.STREAMLEN_TOKEN, sub)
    if self.XREF_TOKEN in line:
      self.xref_offset = self.offset
      # The generated table replaces the whole line, not just the token.
      line = self.generate_xref_table()
    if self.TRAILER_TOKEN in line:
      replacement = self.TRAILER_REPLACEMENT % (self.max_object_number + 1)
      line = line.replace(self.TRAILER_TOKEN, replacement)
    if self.TRAILERSIZE_TOKEN in line:
      replacement = self.TRAILERSIZE_REPLACEMENT % (self.max_object_number + 1)
      line = line.replace(self.TRAILERSIZE_TOKEN, replacement)
    if self.STARTXREF_TOKEN in line:
      replacement = self.STARTXREF_REPLACEMENT % self.xref_offset
      line = line.replace(self.STARTXREF_TOKEN, replacement)
    # {{object x y}} is only recognized at the very start of a line.
    match = re.match(self.OBJECT_PATTERN, line)
    if match:
      self.insert_xref_entry(int(match.group(1)), int(match.group(2)))
      line = re.sub(self.OBJECT_PATTERN, self.OBJECT_REPLACEMENT, line)
    # {{startxrefobj x y}} must refer to an object that was already emitted.
    match = re.match(self.STARTXREFOBJ_PATTERN, line)
    if match:
      (offset, generation_number) = self.objects[int(match.group(1))]
      assert int(match.group(2)) == generation_number
      replacement = self.STARTXREF_REPLACEMENT % offset
      line = re.sub(self.STARTXREFOBJ_PATTERN, replacement, line)
    self.offset += len(line)
    return line


def expand_file(infile, output_path):
  """Expands the template read from `infile` and writes it to `output_path`.

  Args:
    infile: iterable of bytes lines (e.g. a binary file object or BytesIO).
    output_path: path of the PDF file to create.

  Returns:
    True on success. False if an IOError occurred, in which case a message
    has been printed to stderr.
  """
  processor = TemplateProcessor()
  try:
    with open(output_path, 'wb') as outfile:
      # Buffer the input so it can be read twice: once to measure streams,
      # once to expand tokens.
      preprocessed = io.BytesIO()
      for line in infile:
        preprocessed.write(line)
        processor.preprocess_line(line)
      preprocessed.seek(0)
      for line in preprocessed:
        outfile.write(processor.process_line(line))
  except IOError:
    # Only the output path is known here; there is no input path in scope.
    print('failed to process %s' % output_path, file=sys.stderr)
    return False
  return True


def insert_includes(input_path, output_file, visited_set):
  """Copies `input_path` to `output_file`, expanding {{include}} directives.

  Included paths are resolved relative to the directory of the including
  file, and are expanded recursively.

  Args:
    input_path: path of the file to copy.
    output_file: writable binary file object receiving the result.
    visited_set: set of normalized paths currently being expanded; used to
      detect and skip circular inclusion.

  Raises:
    IOError: if a file cannot be read (after printing a message to stderr).
  """
  input_path = os.path.normpath(input_path)
  if input_path in visited_set:
    print('Circular inclusion %s, ignoring' % input_path, file=sys.stderr)
    return
  visited_set.add(input_path)
  try:
    _, file_extension = os.path.splitext(input_path)
    override_line_endings = (file_extension in EXTENSION_OVERRIDE_LINE_ENDINGS)

    # Only updated for files whose line endings are overridden; for all
    # other files it stays False and a newline is always appended below.
    end_of_file_line_ending = False
    with open(input_path, 'rb') as infile:
      for line in infile:
        match = re.match(rb'\s*\{\{include\s+(.+)\}\}', line)
        if match:
          insert_includes(
              os.path.join(
                  os.path.dirname(input_path),
                  match.group(1).decode('utf-8')), output_file, visited_set)
        else:
          if override_line_endings:
            # Replace CRLF with LF line endings for .in files.
            if line.endswith(WINDOWS_LINE_ENDING):
              line = line.removesuffix(WINDOWS_LINE_ENDING) + UNIX_LINE_ENDING
              end_of_file_line_ending = True
            else:
              end_of_file_line_ending = line.endswith(UNIX_LINE_ENDING)
          output_file.write(line)

    # Ensure the include ends on its own line.
    if not end_of_file_line_ending:
      output_file.write(UNIX_LINE_ENDING)
  except IOError:
    print('failed to include %s' % input_path, file=sys.stderr)
    raise
  # Allow the same file to be included again from a sibling directive.
  visited_set.discard(input_path)


def main():
  """Expands each template given on the command line into a .pdf file.

  The output goes next to the template unless --output-dir is given.

  Returns:
    0 if every file was written, 1 if any output file could not be written.
  """
  parser = optparse.OptionParser()
  parser.add_option('--output-dir', default='')
  options, args = parser.parse_args()
  all_succeeded = True
  for testcase_path in args:
    testcase_filename = os.path.basename(testcase_path)
    testcase_root, _ = os.path.splitext(testcase_filename)
    output_dir = os.path.dirname(testcase_path)
    if options.output_dir:
      output_dir = options.output_dir
    intermediate_stream = io.BytesIO()
    insert_includes(testcase_path, intermediate_stream, set())
    intermediate_stream.seek(0)
    output_path = os.path.join(output_dir, testcase_root + '.pdf')
    if not expand_file(intermediate_stream, output_path):
      all_succeeded = False
  return 0 if all_succeeded else 1


if __name__ == '__main__':
  sys.exit(main())