#!/usr/bin/python
# -*- coding: utf-8 -*-
""" $Id: htmldiff,v 1.62 2016/10/06 10:46:19 dom Exp $
"""

import atexit
import cgi
import http_auth
import httplib
import os
import re
import surbl
import sys
import tempfile
import tidy
import urlparse

from subprocess import Popen, PIPE

CONTENT_TYPE = "text/html;charset=utf-8"

Page = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US">
<head><title>HTML Diff service</title>
<link rel="stylesheet" href="http://www.w3.org/StyleSheets/base" />
</head>
<body>

<p><a href="http://www.w3.org/"><img src="http://www.w3.org/Icons/w3c_home" alt="W3C"/></a> <a href="http://www.w3.org/2003/Editors">W3C Editors homepage</a></p>

<h1>Create Diff between HTML pages</h1>
"""
Page2 = """
<form method="GET">
<p>Address of reference document: <input name="doc1" type="url" value="%s" style="width:100%%"/></p>
<p>Address of new document: <input name="doc2" value="%s" style="width:100%%"/></p>
<p><input type="submit" value="get Diff"/></p>
</form>

<p><strong>Tip</strong>: if the document uses the W3C convention on linking to its previous version, you can specify only the address of the new document; the previous version will be detected automatically.</p>
<h2>Diff markings</h2>
<p>This service relies on <a href="https://www.gnu.org/software/diffutils/">GNU diff</a>. The differences found are roughly marked as follows:</p>
<ul>
<li>deleted text is shown in pink with down-arrows (as styled for a &lt;del&gt; element),</li>
<li>replaced text is shown in green with bi-directional arrows,</li>
<li>newly inserted text is shown in yellow with up-arrows (as styled for an &lt;ins&gt; element).</li>
</ul>
<address>
script $Revision: 1.62 $ of $Date: 2016/10/06 10:46:19 $<br />
by <a href="http://www.w3.org/People/Dom/">Dominique Hazaël-Massieux</a><br />based on <a href="https://dev.w3.org/cvsweb/2009/htmldiff/htmldiff.pl">Shane McCarron’s Perl script</a> wrapped in a <a href="http://dev.w3.org/cvsweb/2009/htmldiff/">Python CGI</a>
</address>
</body>
</html>
"""

def checkInputUrl(url):
    """Refuse file: URLs, schemeless addresses and URLs listed in SURBL."""
    checker = surbl.SurblChecker('/usr/local/share/surbl/two-level-tlds', '/afs/w3.org/pub/WWW/Systems/Server/debian/generic/usr/local/etc/surbl.whitelist')

    if url[:5] == 'file:' or len(urlparse.urlparse(url)[0]) < 2:
        print "Status: 403"
        print "Content-Type: text/plain"
        print
        print "sorry, I decline to handle file: addresses"
        sys.exit()
    elif checker.isMarkedAsSpam(url):
        print "Status: 403"
        print "Content-Type: text/plain; charset=utf-8"
        print
        print "sorry, this URL matches a record known in SURBL. See http://www.surbl.org/"
        sys.exit()

def copyHeader(copy_func, source, key, header_name=None):
    value = source.get(key)
    if not value:
        return False
    elif header_name is None:
        header_name = key
    copy_func(header_name, value)
    return True

def setupRequest(source_headers):
    opener = http_auth.ProxyAuthURLopener()
    copyHeader(opener.addheader, source_headers, 'If-Modified-Since')
    copyHeader(opener.addheader, os.environ, 'REMOTE_ADDR', 'X_Forward_IP_Addr')
    return opener

def tidyFile(file):
    # options for tidy
    options = dict(tidy_mark=0, show_warnings=0, quiet=1, char_encoding='utf8')
    html5 = re.search(r"<!doctype\s+html\s*>", file.read(4096),
                      re.IGNORECASE)
    file.seek(0)
    html5_options = {"add_xml_space": "no",
                     "output_xhtml": "no",
                     "tidy_mark": "no",
                     "new_blocklevel_tags": 'article,aside,canvas,dialog,details,figcaption,figure,footer,header,hgroup,menu,nav,section,main,summary,math,semantics,mrow,mfenced,mtable,mtr,mtd,mi,mn,msub,mo,mfrac,munderover,mtext,svg,g,image,rect,text,desc,line,path,polygon,ellipse,tspan,defs,feoffset,fecolormatrix,filter,fegaussianblur,feblend,marker,circle',
                     "new_inline_tags": 'video,audio,canvas,ruby,rt,rp,time,meter,progress,track,source,emu-val,emu-nt,emu-t,mark',
                     "break_before_br": "no",
                     "vertical_space": "no",
                     "enclose_text": "no",
                     "numeric_entities": "yes",
                     "wrap": "1000",
                     "wrap_attributes": "no",
                     "drop_empty_paras": "no"
                     }
    if html5:
        options.update(html5_options)
    newtidy = tidy.parseString(file.read(), **options)
    if len(newtidy.errors) > 0:
        if not html5:
            file.seek(0)
            options.update(html5_options)
            newtidy = tidy.parseString(file.read(), **options)
    file.close()
    file = tempfile.NamedTemporaryFile(
        mode='w+', prefix='htmldiff-', suffix='.html')
    atexit.register(file.close)
    file.write(str(newtidy))
    file.flush()
    file.seek(0)
    return (file, newtidy.errors)

def matchPredecessorRel(rel):
    return rel and "predecessor-version" in rel.lower().split(" ")

def mirrorURL(url, opener):
    try:
        filename, headers = opener.retrieve(url)
    except IOError, error:
        opener.error = "I/O error: %s %s" % (error.errno, error.strerror)
    except httplib.InvalidURL:
        opener.error = "Invalid URL submitted"
    except AttributeError:  # ProxyAuthURLopener returned None.
        pass                # There's already an error set.
    else:
        atexit.register(os.unlink, filename)
        file = open(filename)
        if headers.has_key("content-encoding") and headers["content-encoding"] == "gzip":
            import gzip
            from StringIO import StringIO
            data = StringIO(file.read())
            file.close()
            file = gzip.GzipFile(fileobj=data)
        file, errors = tidyFile(file)
        if len(errors) == 0:
            return (file, headers)
        else:
            opener.error = "Tidy errors: %s" % (str(errors))
    return (None, {})

def showPage(url1='', url2='', error_html='', **headers):
    for name, value in headers.items():
        print "%s: %s" % (name.replace('_', '-'), value)
    print
    print Page
    print error_html
    print Page2 % (url1, url2)
    sys.exit()

def serveRequest():
    """Handle the CGI request: fetch both documents, tidy them, and print the htmldiff output."""
    fields = cgi.FieldStorage()

    if not fields.has_key('doc2'):
        showPage(Content_Type=CONTENT_TYPE)
    # if doc1 is not specified, we load doc2 to check if it has a previous version link
    doc2 = fields['doc2'].value
    checkInputUrl(doc2)
    url_opener2 = setupRequest(fields.headers)
    newdoc, newheaders = mirrorURL(doc2, url_opener2)
    if fields.has_key('doc1'):
        doc1 = fields['doc1'].value
    elif newdoc is not None:
        from BeautifulSoup import BeautifulSoup

        soup = BeautifulSoup(newdoc.read())
        newdoc.seek(0)
        try:
            doc1 = soup.find(text=re.compile("Previous Version", re.IGNORECASE)).findNext(name="a", attrs={"href": True})["href"]
        except:
            try:
                doc1 = soup.find(name=["a", "link"], attrs={"href": True, "rel": matchPredecessorRel})["href"]
            except:
                doc1 = None
    else:
        doc1 = None
    if not doc1:
        showPage(Content_Type=CONTENT_TYPE)

    checkInputUrl(doc1)
    esc1 = cgi.escape(doc1, True)
    esc2 = cgi.escape(doc2, True)
    urlcomponents1 = urlparse.urlparse(doc1)
    urlcomponents2 = urlparse.urlparse(doc2)
    # if same domain, we can use the same urlopener
    # otherwise, we create a separate one
    if urlcomponents2[1] == urlcomponents1[1]:
        url_opener = url_opener2
    else:
        url_opener = setupRequest(fields.headers)

    refdoc, refheaders = mirrorURL(doc1, url_opener)
    if not (refdoc and newdoc):
        http_error = ""
        url = ""
        if not refdoc:
            http_error = url_opener.error
            url = esc1
        else:
            http_error = url_opener2.error
            url = esc2
        if re.match("^[1234][0-9][0-9] ", http_error):
            print "Status: %s" % (http_error)
        error = "<p style='color:#FF0000'>An error (%s) occurred trying to get <a href='%s'>%s</a>.</p>" % (cgi.escape(http_error), url, url)
        showPage(esc1, esc2, error, Content_Type=CONTENT_TYPE)

    print "Content-Type: text/html"
    if newheaders.has_key('Content-Type'):
        contentType = cgi.parse_header(newheaders["Content-Type"])
        if contentType[1].has_key('charset'):
            charset = contentType[1]['charset'].lower()
            #if charset == "iso-8859-1":
            #    options["char_encoding"] = 'latin1'

    for proxy_header in ('Last-Modified', 'Expires'):
        if copyHeader(lambda header, value: sys.stdout.write("%s: %s" % (header, value)), newheaders, proxy_header):
            print
    print
    p = Popen(["/usr/local/bin/htmldiff", refdoc.name, newdoc.name],
              stdin=PIPE, stdout=PIPE, stderr=PIPE)
    sys.stdout.flush()
    sys.stderr.flush()
    (out, err) = p.communicate()
    p.stdin.close()
    if err:
        error = "<p style='color:#FF0000'>An error occurred when running <code>htmldiff</code> on the documents:</p><pre>%s</pre>" % (cgi.escape(err),)
        showPage(esc1, esc2, error)
    else:
        print out
if __name__ == '__main__':
    if os.environ.has_key('SCRIPT_NAME'):
        serveRequest()