#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Flattens a HTML file by inlining its external resources.

This is a small script that takes a HTML file, looks for src attributes
and inlines the specified file, producing one HTML file with no external
dependencies. It recursively inlines the included files.
"""

import os
import re
import sys
import base64
import mimetypes

from grit import lazy_re
from grit import util

DIST_DEFAULT = 'chromium'
DIST_ENV_VAR = 'CHROMIUM_BUILD'
DIST_SUBSTR = '%DISTRIBUTION%'

# Matches beginning of an "if" block with trailing spaces.
_BEGIN_IF_BLOCK = lazy_re.compile(
    r'<if [^>]*?expr="(?P<expression>[^"]*)"[^>]*?>\s*')

# Matches ending of an "if" block with preceding spaces.
_END_IF_BLOCK = lazy_re.compile(r'\s*</if>')

# Used by DoInline to replace various links with inline content.
_STYLESHEET_RE = lazy_re.compile(
    r'<link rel="stylesheet"[^>]+?href="(?P<filename>[^"]*)".*?>(\s*</link>)?',
    re.DOTALL)
_INCLUDE_RE = lazy_re.compile(
    r'<include[^>]+?src="(?P<filename>[^"\']*)".*?>(\s*</include>)?',
    re.DOTALL)
_SRC_RE = lazy_re.compile(
    r'<(?!script)(?:[^>]+?\s)src=(?P<quote>")(?P<filename>[^"\']*)\1',
    re.MULTILINE)
# Both fragments must be raw strings: in a plain string literal '\1' is the
# octal escape for chr(1), not a backreference to the 'quote' group, and the
# pattern would never match a real icon link.
_ICON_RE = lazy_re.compile(
    r'<link rel="icon"\s(?:[^>]+?\s)?'
    r'href=(?P<quote>")(?P<filename>[^"\']*)\1',
    re.MULTILINE)


def FixupMimeType(mime_type):
  """Helper function that normalizes platform differences in the mime type
  returned by the Python's mimetypes.guess_type API.

  Args:
    mime_type: mime type string (or None) as returned by guess_type.

  Returns:
    the normalized mime type, or |mime_type| unchanged (including None) when
    no normalization is known for it.
  """
  mappings = {
    'image/x-png': 'image/png'
  }
  return mappings.get(mime_type, mime_type)


def GetDistribution():
  """Helper function that gets the distribution we are building.

  Reads the DIST_ENV_VAR environment variable and falls back to DIST_DEFAULT
  when it is not set. A value with a leading underscore (and at least one
  more character) has the underscore stripped and is lowercased.

  Returns:
    string
  """
  distribution = DIST_DEFAULT
  if DIST_ENV_VAR in os.environ:
    distribution = os.environ[DIST_ENV_VAR]
    if len(distribution) > 1 and distribution[0] == '_':
      distribution = distribution[1:].lower()
  return distribution


def SrcInlineAsDataURL(
    src_match, base_path, distribution, inlined_files, names_only=False,
    filename_expansion_function=None):
  """regex replace function.

  Takes a regex match for src="filename", attempts to read the file
  at 'filename' and returns the src attribute with the file inlined
  as a data URI. If it finds DIST_SUBSTR string in file name, replaces
  it with distribution.

  Args:
    src_match: regex match object with a 'filename' named capturing group
    base_path: path to look for files in
    distribution: string that should replace DIST_SUBSTR
    inlined_files: The path of the opened file is added to this set.
    names_only: If true, the function will not read the file but just return "".
        It will still add the filename to |inlined_files|.
    filename_expansion_function: optional function(filename) applied to the
        matched filename before anything else is done with it.

  Returns:
    string: the original matched text when the filename looks like a URL,
    "" when |names_only| is set, otherwise the matched text with the
    filename replaced by a base64 data URI.
  """
  filename = src_match.group('filename')
  if filename_expansion_function:
    filename = filename_expansion_function(filename)

  if ':' in filename:
    # filename is probably a URL, which we don't want to bother inlining
    return src_match.group(0)

  filename = filename.replace(DIST_SUBSTR, distribution)
  filepath = os.path.normpath(os.path.join(base_path, filename))
  inlined_files.add(filepath)

  if names_only:
    return ""

  mimetype = FixupMimeType(mimetypes.guess_type(filename)[0]) or 'text/plain'
  inline_data = base64.standard_b64encode(util.ReadFile(filepath, util.BINARY))

  # Keep everything of the match that surrounds the filename (attribute name,
  # quotes, url( ... ) wrapper) and swap only the filename itself.
  prefix = src_match.string[src_match.start():src_match.start('filename')]
  suffix = src_match.string[src_match.end('filename'):src_match.end()]
  return '%sdata:%s;base64,%s%s' % (prefix, mimetype, inline_data, suffix)


class InlinedData(object):
  """Helper class holding the results from DoInline().

  Holds the inlined data and the set of filenames of all the inlined
  files.
  """
  def __init__(self, inlined_data, inlined_files):
    self.inlined_data = inlined_data
    self.inlined_files = inlined_files


def DoInline(
    input_filename, grd_node, allow_external_script=False, names_only=False,
    rewrite_function=None, filename_expansion_function=None):
  """Helper function that inlines the resources in a specified file.

  Reads input_filename, finds all the src attributes and attempts to
  inline the files they are referring to, then returns the result and
  the set of inlined files.

  Args:
    input_filename: name of file to read in
    grd_node: html node from the grd file for this include tag. When None,
        every <if> condition is treated as satisfied.
    allow_external_script: when true, <script src="..."> tags are left
        untouched instead of being inlined.
    names_only: None will be returned for the inlined contents (faster).
    rewrite_function: function(filepath, text, distribution) which will be
        called to rewrite html content before inlining images.
    filename_expansion_function: function(filename) which will be called to
        rewrite filenames before attempting to read them.
  Returns:
    an InlinedData holding the inlined data as a string (None when
    |names_only| is set) and the set of filenames of all the inlined files
  """
  if filename_expansion_function:
    input_filename = filename_expansion_function(input_filename)
  input_filepath = os.path.dirname(input_filename)
  distribution = GetDistribution()

  # Keep track of all the files we inline.
  inlined_files = set()

  def SrcReplace(src_match, filepath=input_filepath,
                 inlined_files=inlined_files):
    """Helper function to provide SrcInlineAsDataURL with the base file path"""
    return SrcInlineAsDataURL(
        src_match, filepath, distribution, inlined_files, names_only=names_only,
        filename_expansion_function=filename_expansion_function)

  def GetFilepath(src_match, base_path=input_filepath):
    """Helper function that resolves the 'filename' group of a match to a
    normalized path under |base_path|, or None if it looks like a URL.
    """
    filename = src_match.group('filename')

    if ':' in filename:
      # filename is probably a URL, which we don't want to bother inlining
      return None

    filename = filename.replace(DIST_SUBSTR, distribution)
    if filename_expansion_function:
      filename = filename_expansion_function(filename)
    return os.path.normpath(os.path.join(base_path, filename))

  def IsConditionSatisfied(src_match):
    """Helper function that evaluates the expr of an <if> match on grd_node."""
    expression = src_match.group('expression')
    return grd_node is None or grd_node.EvaluateCondition(expression)

  def CheckConditionalElements(text):
    """Helper function to conditionally inline inner elements"""
    while True:
      begin_if = _BEGIN_IF_BLOCK.search(text)
      if begin_if is None:
        return text

      condition_satisfied = IsConditionSatisfied(begin_if)
      leading = text[0:begin_if.start()]
      content_start = begin_if.end()

      # Find matching "if" block end. |count| is the current nesting depth;
      # it reaches zero at the </if> that closes |begin_if|.
      count = 1
      pos = begin_if.end()
      while True:
        end_if = _END_IF_BLOCK.search(text, pos)
        if end_if is None:
          raise Exception('Unmatched <if>')

        next_if = _BEGIN_IF_BLOCK.search(text, pos)
        if next_if is None or next_if.start() >= end_if.end():
          count = count - 1
          if count == 0:
            break
          pos = end_if.end()
        else:
          count = count + 1
          pos = next_if.end()

      content = text[content_start:end_if.start()]
      trailing = text[end_if.end():]

      if condition_satisfied:
        text = leading + CheckConditionalElements(content) + trailing
      else:
        text = leading + trailing

  def InlineFileContents(src_match, pattern, inlined_files=inlined_files):
    """Helper function to inline external files of various types"""
    filepath = GetFilepath(src_match)
    if filepath is None:
      return src_match.group(0)
    inlined_files.add(filepath)

    if names_only:
      inlined_files.update(GetResourceFilenames(
          filepath,
          allow_external_script,
          rewrite_function,
          filename_expansion_function=filename_expansion_function))
      return ""

    # NOTE(review): unlike the names_only branch above, rewrite_function is
    # not forwarded here. Left as-is to preserve behavior; confirm whether
    # that asymmetry is intended.
    return pattern % InlineToString(
        filepath, grd_node, allow_external_script,
        filename_expansion_function=filename_expansion_function)

  def InlineIncludeFiles(src_match):
    """Helper function to directly inline generic external files (without
       wrapping them with any kind of tags).
    """
    return InlineFileContents(src_match, '%s')

  def InlineScript(match):
    """Helper function to inline external script files"""
    attrs = (match.group('attrs1') + match.group('attrs2')).strip()
    if attrs:
      attrs = ' ' + attrs
    return InlineFileContents(match, '<script' + attrs + '>%s</script>')

  def InlineCSSText(text, css_filepath):
    """Helper function that inlines external resources in CSS text"""
    filepath = os.path.dirname(css_filepath)
    # Allow custom modifications before inlining images.
    if rewrite_function:
      text = rewrite_function(filepath, text, distribution)
    text = InlineCSSImages(text, filepath)
    return InlineCSSImports(text, filepath)

  def InlineCSSFile(src_match, pattern, base_path=input_filepath):
    """Helper function to inline external CSS files.

    Args:
      src_match: A regular expression match with a named group named "filename".
      pattern: The pattern to replace with the contents of the CSS file.
      base_path: The base path to use for resolving the CSS file.

    Returns:
      The text that should replace the reference to the CSS file.
    """
    filepath = GetFilepath(src_match, base_path)
    if filepath is None:
      return src_match.group(0)

    # Even if names_only is set, the CSS file needs to be opened, because it
    # can link to images that need to be added to the file set.
    inlined_files.add(filepath)
    # When resolving CSS files we need to pass in the path so that relative URLs
    # can be resolved.
    return pattern % InlineCSSText(util.ReadFile(filepath, util.BINARY),
                                   filepath)

  def InlineCSSImages(text, filepath=input_filepath):
    """Helper function that inlines external images in CSS backgrounds."""
    # Replace contents of url() for css attributes: content, background,
    # or *-image.
    return re.sub(r'(content|background|[\w-]*-image):[^;]*' +
                  r'(url\((?P<quote1>"|\'|)[^"\'()]*(?P=quote1)\)|' +
                  r'image-set\(' +
                  r'([ ]*url\((?P<quote2>"|\'|)[^"\'()]*(?P=quote2)\)' +
                  r'[ ]*[0-9.]*x[ ]*(,[ ]*)?)+\))',
                  lambda m: InlineCSSUrls(m, filepath),
                  text)

  def InlineCSSUrls(src_match, filepath=input_filepath):
    """Helper function that inlines each url on a CSS image rule match."""
    # Replace contents of url() references in matches.
    return re.sub(r'url\((?P<quote>"|\'|)(?P<filename>[^"\'()]*)(?P=quote)\)',
                  lambda m: SrcReplace(m, filepath),
                  src_match.group(0))

  def InlineCSSImports(text, filepath=input_filepath):
    """Helper function that inlines CSS files included via the @import
       directive.
    """
    return re.sub(r'@import\s+url\((?P<quote>"|\'|)(?P<filename>[^"\'()]*)' +
                  r'(?P=quote)\);',
                  lambda m: InlineCSSFile(m, '%s', filepath),
                  text)


  flat_text = util.ReadFile(input_filename, util.BINARY)

  # Check conditional elements, remove unsatisfied ones from the file. We do
  # this twice. The first pass is so that we don't even bother calling
  # InlineScript, InlineCSSFile and InlineIncludeFiles on text we're eventually
  # going to throw out anyway.
  flat_text = CheckConditionalElements(flat_text)

  if not allow_external_script:
    # We need to inline css and js before we inline images so that image
    # references get inlined in the css and js
    flat_text = re.sub(r'<script (?P<attrs1>.*?)src="(?P<filename>[^"\']*)"' +
                       r'(?P<attrs2>.*?)></script>',
                       InlineScript,
                       flat_text)

  flat_text = _STYLESHEET_RE.sub(
      lambda m: InlineCSSFile(m, '<style>%s</style>'),
      flat_text)

  flat_text = _INCLUDE_RE.sub(InlineIncludeFiles, flat_text)

  # Check conditional elements, second pass. This catches conditionals in any
  # of the text we just inlined.
  flat_text = CheckConditionalElements(flat_text)

  # Allow custom modifications before inlining images.
  if rewrite_function:
    flat_text = rewrite_function(input_filepath, flat_text, distribution)

  flat_text = _SRC_RE.sub(SrcReplace, flat_text)

  # TODO(arv): Only do this inside <style> tags.
  flat_text = InlineCSSImages(flat_text)

  flat_text = _ICON_RE.sub(SrcReplace, flat_text)

  if names_only:
    flat_text = None  # Would contain garbage if the flag is set anyway.
  return InlinedData(flat_text, inlined_files)


def InlineToString(input_filename, grd_node, allow_external_script=False,
                   rewrite_function=None, filename_expansion_function=None):
  """Inlines the resources in a specified file and returns it as a string.

  Args:
    input_filename: name of file to read in
    grd_node: html node from the grd file for this include tag
    allow_external_script: passed through to DoInline.
    rewrite_function: passed through to DoInline.
    filename_expansion_function: passed through to DoInline.
  Returns:
    the inlined data as a string
  Raises:
    Exception: if DoInline raised an IOError; the message names the file that
        failed and the file being flattened.
  """
  try:
    return DoInline(
        input_filename,
        grd_node,
        allow_external_script=allow_external_script,
        rewrite_function=rewrite_function,
        filename_expansion_function=filename_expansion_function).inlined_data
  except IOError as e:
    raise Exception("Failed to open %s while trying to flatten %s. (%s)" %
                    (e.filename, input_filename, e.strerror))


def InlineToFile(input_filename, output_filename, grd_node):
  """Inlines the resources in a specified file and writes it.

  Reads input_filename, finds all the src attributes and attempts to
  inline the files they are referring to, then writes the result
  to output_filename.

  Args:
    input_filename: name of file to read in
    output_filename: name of file to be written to
    grd_node: html node from the grd file for this include tag
  Returns:
    None
  """
  inlined_data = InlineToString(input_filename, grd_node)
  with open(output_filename, 'wb') as out_file:
    # The data is a single string, so write it in one call rather than
    # letting writelines() iterate over it character by character.
    out_file.write(inlined_data)


def GetResourceFilenames(filename,
                         allow_external_script=False,
                         rewrite_function=None,
                         filename_expansion_function=None):
  """For a grd file, returns a set of all the files that would be inline.

  Runs DoInline in names_only mode with no grd node.

  Raises:
    Exception: if DoInline raised an IOError; the message names the file that
        failed and the file being flattened.
  """
  try:
    return DoInline(
        filename,
        None,
        names_only=True,
        allow_external_script=allow_external_script,
        rewrite_function=rewrite_function,
        filename_expansion_function=filename_expansion_function).inlined_files
  except IOError as e:
    raise Exception("Failed to open %s while trying to flatten %s. (%s)" %
                    (e.filename, filename, e.strerror))


def main():
  """Command-line entry point: html_inline.py inputfile outputfile."""
  if len(sys.argv) <= 2:
    # Single-argument parenthesized form prints identically under the print
    # statement and the print function.
    print("Flattens a HTML file by inlining its external resources.\n")
    print("html_inline.py inputfile outputfile")
  else:
    InlineToFile(sys.argv[1], sys.argv[2], None)

if __name__ == '__main__':
  main()