# Copyright (c) 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Parses an HTML module and exposes its scripts, imports and stylesheets."""

import os
import sys

from tvcm import module
from tvcm import strip_js_comments
from tvcm import html_generation_controller


def _InitBeautifulSoup():
  """Puts the bundled third_party/beautifulsoup directory on sys.path.

  The directory is resolved relative to the parent of this file's directory
  and is inserted at the front of sys.path. Does nothing if it is already
  present.
  """
  tvcm_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
  bs_path = os.path.join(tvcm_path, 'third_party', 'beautifulsoup')
  if bs_path in sys.path:
    return
  sys.path.insert(0, bs_path)


# These imports must come after _InitBeautifulSoup() has adjusted sys.path.
_InitBeautifulSoup()
import BeautifulSoup
import polymer_soup


class InlineScript(object):
  """Wraps the text node of an inline <script> element.

  Args:
    soup: the parsed node holding the script text. Its `string` attribute is
        read for the contents and its `parent` chain for the open tags.

  Raises:
    module.DepsException: if soup is falsy (e.g. None).
  """

  def __init__(self, soup):
    if not soup:
      raise module.DepsException('InlineScript created without soup')
    self._soup = soup
    # Lazily computed caches; None means "not computed yet".
    self._stripped_contents = None
    self._open_tags = None

  @property
  def contents(self):
    """The script text as a unicode string."""
    #TODO(nednguyen): change other places to use unicode() instead of str().
    return unicode(self._soup.string)

  @property
  def stripped_contents(self):
    """The script text after strip_js_comments.StripJSComments (cached)."""
    # Compare against None so that an empty result is cached too instead of
    # being recomputed on every access.
    if self._stripped_contents is None:
      self._stripped_contents = strip_js_comments.StripJSComments(
          self.contents)
    return self._stripped_contents

  @property
  def open_tags(self):
    """List of _Tag for the ancestors enclosing the <script>, outermost first.

    The <script> tag itself is not included. The result is cached.
    """
    # Compare against None so that an empty ancestor list is cached as well.
    if self._open_tags is not None:
      return self._open_tags

    # Walk up from the text node until the document root (or no parent).
    open_tags = []
    cur = self._soup.parent
    while cur:
      if isinstance(cur, BeautifulSoup.BeautifulSoup):
        break

      open_tags.append(_Tag(cur.name, cur.attrs))
      cur = cur.parent

    # The walk collected innermost-first; flip it, then drop the innermost
    # entry, which must be the <script> tag that owns the text node.
    open_tags.reverse()
    assert open_tags[-1].tag == 'script'
    del open_tags[-1]

    self._open_tags = open_tags
    return self._open_tags


def _IsDoctype(x):
  """Returns True if x is a Declaration equal to 'DOCTYPE html'/'DOCTYPE HTML'."""
  if not isinstance(x, BeautifulSoup.Declaration):
    return False
  return x == 'DOCTYPE html' or x == 'DOCTYPE HTML'


class HTMLModuleParserResults(object):
  """Query interface over an HTML document parsed with PolymerSoup.

  Args:
    html: the HTML text to parse.
  """

  def __init__(self, html):
    self._soup = polymer_soup.PolymerSoup(html)
    # Lazily computed list of InlineScript; None means "not computed yet".
    self._inline_scripts = None

  @property
  def has_decl(self):
    """True if exactly one top-level node is an HTML doctype declaration."""
    decls = [x for x in self._soup.contents
             if _IsDoctype(x)]
    return len(decls) == 1

  @property
  def scripts_external(self):
    """The src attribute of every <script> element that has one."""
    tags = self._soup.findAll('script', src=True)
    return [t['src'] for t in tags]

  @property
  def inline_scripts(self):
    """An InlineScript for every <script> element without src (cached).

    Raises:
      module.DepsException: from InlineScript, if a matching <script> has a
          falsy `string`.
    """
    # Compare against None so that a document without inline scripts does
    # not trigger a new search of the tree on every access.
    if self._inline_scripts is None:
      tags = self._soup.findAll('script', src=None)
      self._inline_scripts = [InlineScript(t.string) for t in tags]
    return self._inline_scripts

  @property
  def imports(self):
    """The href attribute of every <link rel="import"> element."""
    tags = self._soup.findAll('link', rel='import')
    return [t['href'] for t in tags]

  @property
  def stylesheets(self):
    """The href attribute of every <link rel="stylesheet"> element."""
    tags = self._soup.findAll('link', rel='stylesheet')
    return [t['href'] for t in tags]

  @property
  def inline_stylesheets(self):
    """str() of the `string` of every <style> element."""
    tags = self._soup.findAll('style')
    return [str(t.string) for t in tags]

  def YieldHTMLInPieces(self, controller, minify=False):
    """Generator that yields the result of GenerateHTML as a single piece."""
    yield self.GenerateHTML(controller, minify)

  def GenerateHTML(self, controller, minify=False):
    """Returns the document's HTML with scripts, imports and doctype removed.

    Works on a re-parsed copy, so the stored parse tree is left untouched.

    Args:
      controller: object providing GetHTMLForInlineStylesheet(contents) and
          GetHTMLForStylesheetHRef(href). A falsy return value from either
          causes the corresponding element to be removed.
      minify: if True, HTML comments are removed as well.

    Returns:
      The resulting markup as a str with surrounding whitespace stripped.
    """
    soup = polymer_soup.PolymerSoup(str(self._soup))

    # Remove declaration. Iterate over a snapshot: extract() mutates
    # soup.contents, and removing items from the list being iterated would
    # skip the node that follows each removed one.
    for x in list(soup.contents):
      if _IsDoctype(x):
        x.extract()

    # Remove all imports.
    imports = soup.findAll('link', rel='import')
    for imp in imports:
      imp.extract()

    # Remove all script links.
    scripts_external = soup.findAll('script', src=True)
    for script in scripts_external:
      script.extract()

    # Remove all in-line scripts.
    scripts_inline = soup.findAll('script', src=None)
    for script in scripts_inline:
      script.extract()

    # Process all in-line styles.
    inline_styles = soup.findAll('style')
    for style in inline_styles:
      html = controller.GetHTMLForInlineStylesheet(str(style.string))
      if html:
        ns = BeautifulSoup.Tag(soup, 'style')
        ns.append(BeautifulSoup.NavigableString(html))
        style.replaceWith(ns)
      else:
        style.extract()

    # Rewrite all external stylesheet hrefs or remove, as needed.
    stylesheet_links = soup.findAll('link', rel='stylesheet')
    for stylesheet_link in stylesheet_links:
      html = controller.GetHTMLForStylesheetHRef(stylesheet_link['href'])
      if html:
        # The controller's replacement markup must parse to one element.
        tmp = polymer_soup.PolymerSoup(html).findChildren()
        assert len(tmp) == 1
        stylesheet_link.replaceWith(tmp[0])
      else:
        stylesheet_link.extract()

    # Remove comments if minifying.
    if minify:
      comments = soup.findAll(
          text=lambda text: isinstance(text, BeautifulSoup.Comment))
      for comment in comments:
        comment.extract()

    # We are done.
    return str(soup).strip()

  @property
  def html_contents_without_links_and_script(self):
    """GenerateHTML output using a default HTMLGenerationController."""
    return self.GenerateHTML(
        html_generation_controller.HTMLGenerationController())


class _Tag(object):
  """Plain record of a tag name and its attributes.

  Args:
    tag: the tag name.
    attrs: iterable of (name, value) pairs, as indexed by __repr__.
  """

  def __init__(self, tag, attrs):
    self.tag = tag
    self.attrs = attrs

  def __repr__(self):
    attr_string = ' '.join(['%s="%s"' % (x[0], x[1]) for x in self.attrs])
    return '<%s %s>' % (self.tag, attr_string)


class HTMLModuleParser(object):
  """Entry point that turns HTML text into HTMLModuleParserResults."""

  def Parse(self, html):
    """Parses html and returns an HTMLModuleParserResults.

    Args:
      html: the HTML text, or None, which is treated as an empty document.

    Raises:
      Exception: if html contains the literal text '< /script>'.
    """
    if html is None:
      html = ''
    else:
      if html.find('< /script>') != -1:
        # Raw string: '\/' is not a valid escape sequence in a normal literal.
        raise Exception(r'Escape script tags with <\/script>')

    return HTMLModuleParserResults(html)