• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3""" $Id: htmldiff,v 1.62 2016/10/06 10:46:19 dom Exp $
4"""
5
6import atexit
7import cgi
8import http_auth
9import httplib
10import os
11import re
12import surbl
13import sys
14import tempfile
15import tidy
16import urlparse
17
18from subprocess import Popen, PIPE
19
# Content-Type header value used for every HTML page this CGI emits.
CONTENT_TYPE = "text/html;charset=utf-8"

# First half of the static HTML shell: document head and heading.  showPage()
# prints this, then an optional error message, then Page2 below — the split
# exists so errors can be injected between the heading and the form.
Page = """
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US">
<head><title>HTML Diff service</title>
<link rel="stylesheet" href="http://www.w3.org/StyleSheets/base" />
</head>
<body>

<p><a href="http://www.w3.org/"><img src="http://www.w3.org/Icons/w3c_home" alt="W3C"/></a> <a href="http://www.w3.org/2003/Editors">W3C Editors homepage</a></p>

<h1>Create Diff between HTML pages</h1>
"""
# Second half of the shell: the input form (two %s slots for the doc1/doc2
# URL values — note the %% escapes for literal CSS percent signs) plus the
# explanatory text and footer.
Page2 = """
<form method="GET">
<p>Address of reference document: <input name="doc1" type="url" value="%s" style="width:100%%"/></p>
<p>Address of new document: <input name="doc2" value="%s"  style="width:100%%"/></p>
<p><input type="submit" value="get Diff"/></p>
</form>

<p><strong>Tip</strong>: if the document uses the W3C convention on linking to its previous version, you can specify only the address of the new document — the previous link will be automatically detected.</p>
<h2>Diff markings</h2>
<p>This service relies on <a href="https://www.gnu.org/software/diffutils/">GNU diff</a>. The found differences are roughly marked as follow:
<ul>
<li>deleted text is shown in pink with down-arrows (as styled for a &lt;del> element)</li>
<li>where there is replacement, it’s shown in green with bi-directional arrows,</li>
<li>where there is newly inserted text, it’s yellow with up arrows (&lt;ins> element)</li>
</ul>
<address>
script $Revision: 1.62 $ of $Date: 2016/10/06 10:46:19 $<br />
by <a href="http://www.w3.org/People/Dom/">Dominique Hazaël-Massieux</a><br />based on <a href="https://dev.w3.org/cvsweb/2009/htmldiff/htmldiff.pl">Shane McCarron’ Perl script</a> wrapped in a <a href="http://dev.w3.org/cvsweb/2009/htmldiff/">Python CGI</a>
</address>
</body>
</html>
"""
56
def checkInputUrl(url):
    """Refuse to handle dangerous or spammy URLs.

    Blocks file: addresses, URLs whose scheme is missing or bogus
    (shorter than two characters), and URLs listed in SURBL.  On refusal
    a 403 response is written to stdout and the script exits; otherwise
    the function returns None and processing continues.
    """
    checker = surbl.SurblChecker('/usr/local/share/surbl/two-level-tlds','/afs/w3.org/pub/WWW/Systems/Server/debian/generic/usr/local/etc/surbl.whitelist')

    # urlparse lower-cases the scheme, so comparing the parsed scheme also
    # rejects "FILE:", "File:" etc., which the previous case-sensitive
    # "url[:5] == 'file:'" prefix test let through to the URL opener.
    scheme = urlparse.urlparse(url)[0]
    if scheme == 'file' or len(scheme) < 2:
        sys.stdout.write("Status: 403\n")
        sys.stdout.write("Content-Type: text/plain\n")
        sys.stdout.write("\n")
        sys.stdout.write("sorry, I decline to handle file: addresses\n")
        sys.exit()
    elif checker.isMarkedAsSpam(url):
        sys.stdout.write("Status: 403\n")
        sys.stdout.write("Content-Type: text/plain; charset=utf-8\n")
        sys.stdout.write("\n")
        sys.stdout.write("sorry, this URL matches a record known in SURBL. See http://www.surbl.org/\n")
        sys.exit()
72
def copyHeader(copy_func, source, key, header_name=None):
    """Copy one header value from *source* through *copy_func*.

    Looks up *key* in the mapping *source*; when a non-empty value is
    found, calls copy_func(header_name or key, value).  Returns True when
    a value was copied, False otherwise.
    """
    value = source.get(key)
    if not value:
        return False
    copy_func(key if header_name is None else header_name, value)
    return True
81
def setupRequest(source_headers):
    """Build a proxy-authenticating URL opener for an incoming request.

    Forwards the client's If-Modified-Since header unchanged and its
    address (REMOTE_ADDR) as an X_Forward_IP_Addr header.
    """
    opener = http_auth.ProxyAuthURLopener()
    add = opener.addheader
    copyHeader(add, source_headers, 'If-Modified-Since')
    copyHeader(add, os.environ, 'REMOTE_ADDR', 'X_Forward_IP_Addr')
    return opener
87
def tidyFile(file):
    """Run HTML Tidy over an open file object.

    Returns (tidied, errors): a fresh NamedTemporaryFile (registered for
    closing at interpreter exit) holding the tidied markup, positioned at
    offset 0, and the list of tidy errors from the last parse.  The input
    file is closed.
    """
    # Baseline tidy options.
    options = dict(tidy_mark=0,show_warnings=0,quiet=1,char_encoding='utf8')
    # Sniff the first 4 KB for an HTML5 doctype.
    html5 = re.search(r"<!doctype\s+html\s*>", file.read(4096),
                      re.IGNORECASE)
    file.seek(0)
    # Extra options needed for HTML5/MathML/SVG documents, whose elements
    # predate this tidy build's vocabulary.
    html5_options = {
        "add_xml_space": "no",
        "output_xhtml": "no",
        "tidy_mark": "no",
        "new_blocklevel_tags": 'article,aside,canvas,dialog,details,figcaption,figure,footer,header,hgroup,menu,nav,section,main,summary,math,semantics,mrow,mfenced,mtable,mtr,mtd,mi,mn,msub,mo,mfrac,munderover,mtext,svg,g,image,rect,text,desc,line,path,polygon,ellipse,tspan,defs,feoffset,fecolormatrix,filter,fegaussianblur,feblend,marker,circle',
        "new_inline_tags": 'video,audio,canvas,ruby,rt,rp,time,meter,progress,track,source,emu-val,emu-nt,emu-t,mark',
        "break_before_br": "no",
        "vertical_space": "no",
        "enclose_text": "no",
        "numeric_entities": "yes",
        "wrap": "1000",
        "wrap_attributes": "no",
        "drop_empty_paras": "no",
    }
    if html5:
        options.update(html5_options)
    newtidy = tidy.parseString(file.read(), **options)
    # A non-HTML5 parse that produced errors gets one retry with the
    # HTML5 options switched on.
    if len(newtidy.errors) > 0 and not html5:
        file.seek(0)
        options.update(html5_options)
        newtidy = tidy.parseString(file.read(), **options)
    file.close()
    tidied = tempfile.NamedTemporaryFile(
        mode='w+', prefix='htmldiff-', suffix='.html')
    atexit.register(tidied.close)
    tidied.write(str(newtidy))
    tidied.flush()
    tidied.seek(0)
    return (tidied, newtidy.errors)
123
def matchPredecessorRel(rel):
    """Return a true value when a rel attribute contains the
    "predecessor-version" link relation (case-insensitive).

    *rel* may be None (attribute absent); any false value is passed
    through unchanged.  Tokens are split on arbitrary whitespace — HTML
    defines rel as a space-separated token list — where the previous
    split(" ") missed tab/newline separators and produced empty tokens
    on runs of spaces.
    """
    return rel and "predecessor-version" in rel.lower().split()
126
def mirrorURL(url, opener):
    """Download *url* via *opener* and tidy the result.

    On success returns (open_file, headers) where open_file is the tidied
    local copy.  On any failure, records a message in opener.error and
    returns (None, {}).
    """
    try:
        filename, headers = opener.retrieve(url)
    except IOError as error:
        opener.error = "I/O error: %s %s" % (error.errno, error.strerror)
    except httplib.InvalidURL:
        opener.error = "Invalid URL submitted"
    except AttributeError:
        # ProxyAuthURLopener returned None; it has already set an error.
        pass
    else:
        atexit.register(os.unlink, filename)
        file = open(filename)
        # Transparently decompress gzip-encoded responses before tidying.
        if headers.get("content-encoding") == "gzip":
            import gzip
            from StringIO import StringIO
            data = StringIO(file.read())
            file.close()
            file = gzip.GzipFile(fileobj=data)
        file, errors = tidyFile(file)
        if not errors:
            return (file, headers)
        opener.error = "Tidy errors: %s" % (str(errors),)
    return (None, {})
151
def showPage(url1='', url2='', error_html='', **headers):
    """Emit the service page and terminate the script.

    Writes each keyword argument as an HTTP header (underscores mapped to
    dashes, e.g. Content_Type -> Content-Type), then a blank line, then
    the page shell with *error_html* injected between Page and the form
    (Page2, pre-filled with url1/url2).  Never returns: calls sys.exit().
    """
    out = sys.stdout.write
    for name, value in headers.items():
        out("%s: %s\n" % (name.replace('_', '-'), value))
    out("\n")
    out("%s\n" % Page)
    out("%s\n" % error_html)
    out("%s\n" % (Page2 % (url1, url2)))
    sys.exit()
160
def serveRequest():
    """CGI entry point: diff two HTML documents.

    Reads doc1/doc2 from the query string, mirrors and tidies both
    documents, runs the external htmldiff script on the local copies and
    streams its output.  When doc1 is absent, tries to discover it from
    doc2's "Previous Version" link or a rel="predecessor-version" link
    (W3C convention).  Falls back to the input form (showPage) on missing
    parameters or errors.
    """
    fields = cgi.FieldStorage()

    # No new document at all: just show the input form.
    if 'doc2' not in fields:
        showPage(Content_Type=CONTENT_TYPE)
    # if doc1 is not specified, we load doc2 to check if it has a previous
    # version link
    doc2 = fields['doc2'].value
    checkInputUrl(doc2)
    url_opener2 = setupRequest(fields.headers)
    newdoc, newheaders = mirrorURL(doc2, url_opener2)
    if 'doc1' in fields:
        doc1 = fields['doc1'].value
    elif newdoc is not None:
        from BeautifulSoup import BeautifulSoup

        soup = BeautifulSoup(newdoc.read())
        newdoc.seek(0)
        try:
            # W3C spec convention: a "Previous Version" label followed by a link.
            doc1 = soup.find(text=re.compile("Previous Version",re.IGNORECASE)).findNext(name="a", attrs={"href":True})["href"]
        except (AttributeError, TypeError, KeyError):
            try:
                # BUG FIX: the attrs key was the bare name `rel` — an undefined
                # variable whose NameError the old bare `except:` silently
                # swallowed, so this fallback could never match.  It must be
                # the string "rel".
                doc1 = soup.find(name=["a", "link"],
                                 attrs={"href": True, "rel": matchPredecessorRel})["href"]
            except (AttributeError, TypeError, KeyError):
                # find() returned None (TypeError on subscript) or no match.
                doc1 = None
    else:
        doc1 = None
    if not doc1:
        showPage(Content_Type=CONTENT_TYPE)

    checkInputUrl(doc1)
    esc1 = cgi.escape(doc1, True)
    esc2 = cgi.escape(doc2, True)
    # if same domain, we can use the same urlopener
    # otherwise, we create a separate one
    if urlparse.urlparse(doc1)[1] == urlparse.urlparse(doc2)[1]:
        url_opener = url_opener2
    else:
        url_opener = setupRequest(fields.headers)

    refdoc, refheaders = mirrorURL(doc1, url_opener)
    if not (refdoc and newdoc):
        # One of the fetches failed: report whichever opener holds the error.
        if not refdoc:
            http_error, url = url_opener.error, esc1
        else:
            http_error, url = url_opener2.error, esc2
        # Propagate an upstream HTTP status code ("404 Not Found", ...)
        # when the recorded error starts with one.
        if re.match("^[1234][0-9][0-9] ", http_error):
            sys.stdout.write("Status: %s\n" % (http_error))
        error = "<p style='color:#FF0000'>An error (%s) occured trying to get <a href='%s'>%s</a>.</p>" % (cgi.escape(http_error), url, url)
        showPage(esc1, esc2, error, Content_Type=CONTENT_TYPE)

    sys.stdout.write("Content-Type: text/html\n")
    # (The old dead charset-sniffing block was removed: it parsed
    # newheaders' Content-Type into a local that was never used.)
    # Forward cache-relevant headers from the new document to the client.
    for proxy_header in ('Last-Modified', 'Expires'):
        if copyHeader(lambda header, value: sys.stdout.write("%s: %s" % (header, value)),
                      newheaders, proxy_header):
            sys.stdout.write("\n")
    sys.stdout.write("\n")
    p = Popen(["/usr/local/bin/htmldiff", refdoc.name, newdoc.name],
              stdin=PIPE, stdout=PIPE, stderr=PIPE)
    sys.stdout.flush()
    sys.stderr.flush()
    (out, err) = p.communicate()
    # communicate() waits for the process and closes its pipes; the old
    # explicit p.stdin.close() afterwards was redundant.
    if err:
        error = "<p style='color:#FF0000'>An error occured when running <code>htmldiff</code> on the documents:</p><pre>%s</pre>" % (cgi.escape(err),)
        showPage(esc1, esc2, error)
    else:
        sys.stdout.write("%s\n" % (out,))
if __name__ == '__main__':
    # Only act when invoked as a CGI script (the web server sets SCRIPT_NAME).
    if 'SCRIPT_NAME' in os.environ:
        serveRequest()
243