1# Copyright (c) 2013 The Chromium Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5from __future__ import absolute_import 6from __future__ import division 7from __future__ import print_function 8 9import os 10import sys 11 12from py_vulcanize import html_generation_controller 13from py_vulcanize import js_utils 14from py_vulcanize import module 15from py_vulcanize import strip_js_comments 16import six 17 18 19def _AddToPathIfNeeded(path): 20 if path not in sys.path: 21 sys.path.insert(0, path) 22 23 24def _InitBeautifulSoup(): 25 catapult_path = os.path.abspath( 26 os.path.join(os.path.dirname(__file__), 27 os.path.pardir, os.path.pardir, os.path.pardir)) 28 bs_path = os.path.join(catapult_path, 'third_party', 'beautifulsoup4') 29 _AddToPathIfNeeded(bs_path) 30 31 html5lib_path = os.path.join(catapult_path, 'third_party', 'html5lib-python') 32 _AddToPathIfNeeded(html5lib_path) 33 34 six_path = os.path.join(catapult_path, 'third_party', 'six') 35 _AddToPathIfNeeded(six_path) 36 37 38_InitBeautifulSoup() 39import bs4 40 41class Script(object): 42 43 def __init__(self, soup): 44 if not soup: 45 raise module.DepsException('Script object created without soup') 46 self._soup = soup 47 48 def AppendJSContentsToFile(self, f, *args, **kwargs): 49 raise NotImplementedError() 50 51class InlineScript(Script): 52 53 def __init__(self, soup): 54 super(InlineScript, self).__init__(soup) 55 self._stripped_contents = None 56 self._open_tags = None 57 self.is_external = False 58 59 @property 60 def contents(self): 61 return six.text_type(self._soup.string) 62 63 @property 64 def stripped_contents(self): 65 if not self._stripped_contents: 66 self._stripped_contents = strip_js_comments.StripJSComments( 67 self.contents) 68 return self._stripped_contents 69 70 @property 71 def open_tags(self): 72 if self._open_tags: 73 return self._open_tags 74 open_tags = [] 75 cur = self._soup.parent 76 while cur: 77 if isinstance(cur, bs4.BeautifulSoup): 78 break 79 80 open_tags.append(_Tag(cur.name, cur.attrs)) 81 cur = cur.parent 82 83 open_tags.reverse() 84 assert open_tags[-1].tag == 'script' 85 del open_tags[-1] 86 87 self._open_tags = open_tags 88 return self._open_tags 89 90 def AppendJSContentsToFile(self, f, *args, **kwargs): 91 js = self.contents 92 escaped_js = js_utils.EscapeJSIfNeeded(js) 93 f.write(escaped_js) 94 f.write('\n') 95 96class ExternalScript(Script): 97 98 def __init__(self, soup): 99 super(ExternalScript, self).__init__(soup) 100 if 'src' not in soup.attrs: 101 raise Exception("{0} is not an external script.".format(soup)) 102 self.is_external = True 103 self._loaded_raw_script = None 104 105 @property 106 def loaded_raw_script(self): 107 if self._loaded_raw_script: 108 return self._loaded_raw_script 109 110 return None 111 112 @loaded_raw_script.setter 113 def loaded_raw_script(self, value): 114 self._loaded_raw_script = value 115 116 @property 117 def src(self): 118 return self._soup.attrs['src'] 119 120 def AppendJSContentsToFile(self, 121 f, 122 use_include_tags_for_scripts, 123 dir_for_include_tag_root): 124 raw_script = self.loaded_raw_script 125 if not raw_script: 126 return 127 128 if use_include_tags_for_scripts: 129 rel_filename = os.path.relpath(raw_script.filename, 130 dir_for_include_tag_root) 131 f.write("""<include src="%s">\n""" % rel_filename) 132 else: 133 f.write(js_utils.EscapeJSIfNeeded(raw_script.contents)) 134 f.write('\n') 135 136def _CreateSoupWithoutHeadOrBody(html): 137 soupCopy = bs4.BeautifulSoup(html, 'html5lib') 138 soup = bs4.BeautifulSoup() 139 soup.reset() 140 if soupCopy.head: 141 for n in soupCopy.head.contents: 142 n.extract() 143 soup.append(n) 144 if soupCopy.body: 145 for n in soupCopy.body.contents: 146 n.extract() 147 soup.append(n) 148 return soup 149 150 151class HTMLModuleParserResults(object): 152 153 def __init__(self, html): 154 self._soup = bs4.BeautifulSoup(html, 'html5lib') 155 self._inline_scripts = None 156 self._scripts = None 157 158 @property 159 def scripts_external(self): 160 tags = self._soup.findAll('script', src=True) 161 return [t['src'] for t in tags] 162 163 @property 164 def inline_scripts(self): 165 if not self._inline_scripts: 166 tags = self._soup.findAll('script', src=None) 167 self._inline_scripts = [InlineScript(t.string) for t in tags] 168 return self._inline_scripts 169 170 @property 171 def scripts(self): 172 if not self._scripts: 173 self._scripts = [] 174 script_elements = self._soup.findAll('script') 175 for element in script_elements: 176 if 'src' in element.attrs: 177 self._scripts.append(ExternalScript(element)) 178 else: 179 self._scripts.append(InlineScript(element)) 180 return self._scripts 181 182 @property 183 def imports(self): 184 tags = self._soup.findAll('link', rel='import') 185 return [t['href'] for t in tags] 186 187 @property 188 def stylesheets(self): 189 tags = self._soup.findAll('link', rel='stylesheet') 190 return [t['href'] for t in tags] 191 192 @property 193 def inline_stylesheets(self): 194 tags = self._soup.findAll('style') 195 return [six.text_type(t.string) for t in tags] 196 197 def YieldHTMLInPieces(self, controller, minify=False): 198 yield self.GenerateHTML(controller, minify) 199 200 def GenerateHTML(self, controller, minify=False, prettify=False): 201 soup = _CreateSoupWithoutHeadOrBody(six.text_type(self._soup)) 202 203 # Remove declaration. 204 for x in soup.contents: 205 if isinstance(x, bs4.Doctype): 206 x.extract() 207 208 # Remove declaration. 209 for x in soup.contents: 210 if isinstance(x, bs4.Declaration): 211 x.extract() 212 213 # Remove all imports. 214 imports = soup.findAll('link', rel='import') 215 for imp in imports: 216 imp.extract() 217 218 # Remove all script links. 219 scripts_external = soup.findAll('script', src=True) 220 for script in scripts_external: 221 script.extract() 222 223 # Remove all in-line scripts. 224 scripts_external = soup.findAll('script', src=None) 225 for script in scripts_external: 226 script.extract() 227 228 # Process all in-line styles. 229 inline_styles = soup.findAll('style') 230 for style in inline_styles: 231 html = controller.GetHTMLForInlineStylesheet(six.text_type(style.string)) 232 if html: 233 ns = soup.new_tag('style') 234 ns.append(bs4.NavigableString(html)) 235 style.replaceWith(ns) 236 else: 237 style.extract() 238 239 # Rewrite all external stylesheet hrefs or remove, as needed. 240 stylesheet_links = soup.findAll('link', rel='stylesheet') 241 for stylesheet_link in stylesheet_links: 242 html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href']) 243 if html: 244 tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style') 245 assert len(tmp) == 1 246 stylesheet_link.replaceWith(tmp[0]) 247 else: 248 stylesheet_link.extract() 249 250 # Remove comments if minifying. 251 if minify: 252 comments = soup.findAll( 253 text=lambda text: isinstance(text, bs4.Comment)) 254 for comment in comments: 255 comment.extract() 256 if prettify: 257 return soup.prettify('utf-8').strip() 258 259 # We are done. 260 return six.text_type(soup).strip() 261 262 @property 263 def html_contents_without_links_and_script(self): 264 return self.GenerateHTML( 265 html_generation_controller.HTMLGenerationController()) 266 267 268class _Tag(object): 269 270 def __init__(self, tag, attrs): 271 self.tag = tag 272 self.attrs = attrs 273 274 def __repr__(self): 275 attr_string = ' '.join('%s="%s"' % (x[0], x[1]) for x in self.attrs) 276 return '<%s %s>' % (self.tag, attr_string) 277 278 279class HTMLModuleParser(): 280 281 def Parse(self, html): 282 if html is None: 283 html = '' 284 else: 285 if html.find('< /script>') != -1: 286 raise Exception('Escape script tags with <\/script>') 287 288 return HTMLModuleParserResults(html) 289