# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Parses an HTML module into its scripts, imports and stylesheets."""

import os
import sys

from py_vulcanize import js_utils
from py_vulcanize import module
from py_vulcanize import strip_js_comments
from py_vulcanize import html_generation_controller


def _AddToPathIfNeeded(path):
  """Prepends |path| to sys.path unless it is already listed there."""
  if path not in sys.path:
    sys.path.insert(0, path)


def _InitBeautifulSoup():
  """Puts the third_party copies of bs4, html5lib and six on sys.path.

  The third_party directory is located three levels above the directory
  that contains this file.
  """
  catapult_path = os.path.abspath(
      os.path.join(os.path.dirname(__file__),
                   os.path.pardir, os.path.pardir, os.path.pardir))
  for dir_name in ('beautifulsoup4', 'html5lib-python', 'six'):
    _AddToPathIfNeeded(os.path.join(catapult_path, 'third_party', dir_name))


_InitBeautifulSoup()
import bs4  # Must come after _InitBeautifulSoup(), which fixes up sys.path.

try:
  _text_type = unicode  # Python 2.
except NameError:
  _text_type = str  # Python 3 has no separate unicode builtin.


class Script(object):
  """Base class for a script found in a parsed HTML document."""

  def __init__(self, soup):
    """Stores the parsed node backing this script.

    Raises:
      module.DepsException: if |soup| is falsy (e.g. None).
    """
    if not soup:
      raise module.DepsException('Script object created without soup')
    self._soup = soup

  def AppendJSContentsToFile(self, f, *args, **kwargs):
    """Writes this script's JS to |f|; subclasses must override."""
    raise NotImplementedError()


class InlineScript(Script):
  """A script whose source is embedded in the HTML document."""

  def __init__(self, soup):
    super(InlineScript, self).__init__(soup)
    self._stripped_contents = None
    self._open_tags = None
    self.is_external = False

  @property
  def contents(self):
    """The script text, converted to a text (unicode) string."""
    return _text_type(self._soup.string)

  @property
  def stripped_contents(self):
    """The script text with JS comments stripped; computed once."""
    # Compare against None so an empty result is cached as well.
    if self._stripped_contents is None:
      self._stripped_contents = strip_js_comments.StripJSComments(
          self.contents)
    return self._stripped_contents

  @property
  def open_tags(self):
    """List of _Tag for the ancestors of the script, outermost first.

    The walk starts at the parent of the backing node and stops at the
    BeautifulSoup document root. The <script> tag itself is never part of
    the result.
    """
    if self._open_tags is not None:
      return self._open_tags

    open_tags = []
    cur = self._soup.parent
    while cur is not None and not isinstance(cur, bs4.BeautifulSoup):
      open_tags.append(_Tag(cur.name, cur.attrs))
      cur = cur.parent
    open_tags.reverse()

    # When the backing node is the script's text node (as created by
    # HTMLModuleParserResults.inline_scripts), the innermost ancestor is the
    # <script> tag itself, so drop it. When the backing node is the <script>
    # element (as created by HTMLModuleParserResults.scripts), the walk
    # already started above it and there is nothing to drop.
    if open_tags and open_tags[-1].tag == 'script':
      del open_tags[-1]

    self._open_tags = open_tags
    return self._open_tags

  def AppendJSContentsToFile(self, f, *args, **kwargs):
    """Writes the escaped script text followed by a newline to |f|.

    Extra positional and keyword arguments are accepted and ignored.
    """
    f.write(js_utils.EscapeJSIfNeeded(self.contents))
    f.write('\n')


class ExternalScript(Script):
  """A <script> element that references its source through 'src'."""

  def __init__(self, soup):
    """Wraps |soup|, which must carry a 'src' attribute.

    Raises:
      module.DepsException: if |soup| is falsy.
      Exception: if |soup| has no 'src' attribute.
    """
    super(ExternalScript, self).__init__(soup)
    if 'src' not in soup.attrs:
      raise Exception("{0} is not an external script.".format(soup))
    self.is_external = True
    self._loaded_raw_script = None

  @property
  def loaded_raw_script(self):
    """The raw script assigned through the setter, or None if unset/falsy."""
    return self._loaded_raw_script or None

  @loaded_raw_script.setter
  def loaded_raw_script(self, value):
    self._loaded_raw_script = value

  @property
  def src(self):
    """Value of the element's 'src' attribute."""
    return self._soup.attrs['src']

  def AppendJSContentsToFile(self,
                             f,
                             use_include_tags_for_scripts,
                             dir_for_include_tag_root):
    """Writes the loaded script, or an <include> tag for it, to |f|.

    Nothing is written when no raw script has been loaded.

    Args:
      f: object with a write() method.
      use_include_tags_for_scripts: if true, write an <include> tag whose
          src is the raw script's filename relative to
          |dir_for_include_tag_root| instead of the script contents.
      dir_for_include_tag_root: base directory for the relative path.
    """
    raw_script = self.loaded_raw_script
    if not raw_script:
      return

    if use_include_tags_for_scripts:
      rel_filename = os.path.relpath(raw_script.filename,
                                     dir_for_include_tag_root)
      f.write("""<include src="%s">\n""" % rel_filename)
    else:
      f.write(js_utils.EscapeJSIfNeeded(raw_script.contents))
      f.write('\n')


def _CreateSoupWithoutHeadOrBody(html):
  """Parses |html| and returns a soup holding only head and body children.

  The children of <head> (if any) come first, followed by the children of
  <body> (if any), all re-parented directly under a fresh, empty soup.
  """
  soupCopy = bs4.BeautifulSoup(html, 'html5lib')
  soup = bs4.BeautifulSoup()
  soup.reset()
  for container in (soupCopy.head, soupCopy.body):
    if not container:
      continue
    # extract() removes the node from container.contents, so walk a snapshot;
    # iterating the live list would skip every other child.
    for n in list(container.contents):
      n.extract()
      soup.append(n)
  return soup


class HTMLModuleParserResults(object):
  """Queryable view over one HTML document parsed with html5lib."""

  def __init__(self, html):
    self._soup = bs4.BeautifulSoup(html, 'html5lib')
    self._inline_scripts = None
    self._scripts = None

  @property
  def scripts_external(self):
    """List of 'src' values of all <script> tags that have one."""
    tags = self._soup.findAll('script', src=True)
    return [t['src'] for t in tags]

  @property
  def inline_scripts(self):
    """List of InlineScript for <script> tags without 'src'; cached.

    Each InlineScript is built from the tag's .string, so the Script
    constructor raises module.DepsException when that value is falsy.
    NOTE(review): presumably that happens for an empty <script></script>,
    whose .string would be None -- confirm against bs4.
    """
    if self._inline_scripts is None:
      tags = self._soup.findAll('script', src=None)
      self._inline_scripts = [InlineScript(t.string) for t in tags]
    return self._inline_scripts

  @property
  def scripts(self):
    """List of all scripts in document order; cached.

    Tags with a 'src' attribute become ExternalScript, all others
    InlineScript. Both are built from the <script> element itself.
    """
    if self._scripts is None:
      self._scripts = [
          ExternalScript(element) if 'src' in element.attrs
          else InlineScript(element)
          for element in self._soup.findAll('script')]
    return self._scripts

  @property
  def imports(self):
    """List of 'href' values of all <link rel="import"> tags."""
    tags = self._soup.findAll('link', rel='import')
    return [t['href'] for t in tags]

  @property
  def stylesheets(self):
    """List of 'href' values of all <link rel="stylesheet"> tags."""
    tags = self._soup.findAll('link', rel='stylesheet')
    return [t['href'] for t in tags]

  @property
  def inline_stylesheets(self):
    """List of the text of every <style> tag, as text strings."""
    tags = self._soup.findAll('style')
    return [_text_type(t.string) for t in tags]

  def YieldHTMLInPieces(self, controller, minify=False):
    """Yields the output of GenerateHTML() as a single piece."""
    yield self.GenerateHTML(controller, minify)

  def GenerateHTML(self, controller, minify=False, prettify=False):
    """Returns the document's HTML with imports and scripts removed.

    Args:
      controller: object providing GetHTMLForInlineStylesheet(text) and
          GetHTMLForStylesheetHRef(href). A falsy return value removes the
          corresponding style/link element from the output.
      minify: if true, HTML comments are removed.
      prettify: if true, return soup.prettify('utf-8') instead of the plain
          text rendering.

    Returns:
      The generated HTML with leading/trailing whitespace stripped.
    """
    soup = _CreateSoupWithoutHeadOrBody(_text_type(self._soup))

    # Remove doctypes and declarations. Walk a snapshot because extract()
    # mutates soup.contents.
    for x in list(soup.contents):
      if isinstance(x, (bs4.Doctype, bs4.Declaration)):
        x.extract()

    # Remove all imports.
    for imp in soup.findAll('link', rel='import'):
      imp.extract()

    # Remove all script links.
    for script in soup.findAll('script', src=True):
      script.extract()

    # Remove all in-line scripts.
    for script in soup.findAll('script', src=None):
      script.extract()

    # Process all in-line styles.
    for style in soup.findAll('style'):
      html = controller.GetHTMLForInlineStylesheet(_text_type(style.string))
      if html:
        ns = soup.new_tag('style')
        ns.append(bs4.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    for stylesheet_link in soup.findAll('link', rel='stylesheet'):
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        # The controller is expected to hand back exactly one <style> tag.
        tmp = bs4.BeautifulSoup(html, 'html5lib').findAll('style')
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, bs4.Comment))
      for comment in comments:
        comment.extract()

    if prettify:
      return soup.prettify('utf-8').strip()

    # We are done.
    return _text_type(soup).strip()

  @property
  def html_contents_without_links_and_script(self):
    """GenerateHTML() output using a default HTMLGenerationController."""
    return self.GenerateHTML(
        html_generation_controller.HTMLGenerationController())


class _Tag(object):
  """Name and attributes of an element, as recorded for open_tags."""

  def __init__(self, tag, attrs):
    self.tag = tag
    self.attrs = attrs

  def __repr__(self):
    attrs = self.attrs
    # bs4 exposes attributes as a dict; iterating it directly would yield
    # only the keys. A plain sequence of (name, value) pairs is accepted too.
    pairs = attrs.items() if hasattr(attrs, 'items') else attrs
    attr_string = ' '.join('%s="%s"' % (name, value) for name, value in pairs)
    return '<%s %s>' % (self.tag, attr_string)


class HTMLModuleParser(object):
  """Entry point that turns HTML text into HTMLModuleParserResults."""

  def Parse(self, html):
    """Parses |html|; None is treated as an empty document.

    Raises:
      Exception: if |html| contains the literal text '< /script>'.
    """
    if html is None:
      html = ''
    elif '< /script>' in html:
      # Raw string: '\/' is not a valid escape sequence in a normal literal.
      raise Exception(r'Escape script tags with <\/script>')

    return HTMLModuleParserResults(html)