• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2#
3# Modified from the htmldiff script developed by Dominique Hazaël-Massieux
4# for the http://services.w3.org/htmldiff website. That script did not
5# include a copyright statement.
6
7import atexit
8import os
9import re
10import sys
11import tempfile
12import tidy
13
14from subprocess import Popen, PIPE
15
def tidyFile(filename):
    """Run HTML Tidy over *filename* and return the result as a temp file.

    The first 4096 characters of the input are searched for an HTML5
    doctype (``<!doctype html>``, case-insensitive).  When it is found the
    HTML5-specific Tidy options are applied from the start.  Otherwise the
    document is tidied with the base options, and if Tidy reports any
    errors it is tidied a second time with the HTML5 options added.

    Any errors left after the final pass are reported on stderr; they do
    not abort the run.

    Returns an open ``tempfile.NamedTemporaryFile`` (text mode, positioned
    at the start) holding ``str()`` of the tidied document.  Its ``close``
    is registered with ``atexit``, so the file stays available by name
    until the interpreter exits.
    """
    # Read the input once; the ``with`` block guarantees the handle is
    # closed even if reading fails, and the buffer is reused for the
    # doctype sniff and for every Tidy pass.
    with open(filename, 'r') as ifp:
        content = ifp.read()

    # option for tidy
    options = dict(tidy_mark=0, show_warnings=0, quiet=1,
                   char_encoding='utf8')
    html5_options = {'add_xml_space': 'no',
                     'output_xhtml': 'no',
                     'tidy_mark': 'no',
                     'new_blocklevel_tags': 'article,aside,canvas,dialog,details,figcaption,figure,footer,header,hgroup,menu,nav,section,main,summary,math,semantics,mrow,mfenced,mtable,mtr,mtd,mi,mn,msub,mo,mfrac,munderover,mtext,svg,g,image,rect,text,desc,line,path,polygon,ellipse,tspan,defs,feoffset,fecolormatrix,filter,fegaussianblur,feblend,marker,circle',
                     'new_inline_tags': 'video,audio,canvas,ruby,rt,rp,time,meter,progress,track,source,emu-val,emu-nt,emu-t,mark',
                     'break_before_br': 'no',
                     'vertical_space': 'no',
                     'enclose_text': 'no',
                     'numeric_entities': 'yes',
                     'wrap': '1000',
                     'wrap_attributes': 'no',
                     'drop_empty_paras': 'no'
                     }

    # Only the head of the document is inspected for the doctype.
    html5 = re.search(r"<!doctype\s+html\s*>", content[:4096],
                      re.IGNORECASE)
    if html5:
        options.update(html5_options)
    newtidy = tidy.parseString(content, **options)

    # No HTML5 doctype but Tidy reported errors: retry with the HTML5
    # options added on top of the base options.
    if newtidy.errors and not html5:
        options.update(html5_options)
        newtidy = tidy.parseString(content, **options)

    fp = tempfile.NamedTemporaryFile(
           mode='w+', prefix='htmldiff-', suffix='.html')
    # Registered before writing so the temp file is closed at exit even
    # if the write below fails.
    atexit.register(fp.close)
    fp.write(str(newtidy))
    fp.flush()
    fp.seek(0)

    # sys.stderr.write('tidyFile: tempfile name %s\n' % fp.name)

    if newtidy.errors:
        sys.stderr.write('tidyFile: tidy.parseString error: %s\n' % str(newtidy.errors))
    return fp
59
if __name__ == '__main__':
    # Command-line entry point: tidy both input documents, then hand the
    # tidied temp files to htmldiff.pl (looked up next to this script) and
    # relay its output.
    if len(sys.argv) < 3:
        sys.stderr.write('tidy: need args file1 file2\n')
        sys.exit(1)

    refdoc = tidyFile(sys.argv[1])
    newdoc = tidyFile(sys.argv[2])

    scriptdir = os.path.abspath(os.path.dirname(sys.argv[0]))
    perlscript = os.path.join(scriptdir, 'htmldiff.pl')

    p = Popen([perlscript, refdoc.name, newdoc.name],
              stdin=PIPE, stdout=PIPE, stderr=PIPE)
    sys.stdout.flush()
    sys.stderr.flush()
    # communicate() closes the child's stdin itself, so no explicit
    # close is needed afterwards.
    (out, err) = p.communicate()

    # The pipes deliver bytes on Python 3 (and plain str on Python 2), so
    # the data is passed through to the underlying binary stream where one
    # exists rather than printing a b'...' repr.
    if err:
        # Any stderr output from htmldiff.pl is treated as a failure.
        raw_stderr = getattr(sys.stderr, 'buffer', sys.stderr)
        raw_stderr.write(
            b'htmldiff: An error occured when running htmldiff.pl on the documents: '
            + err + b'\n')
        raw_stderr.flush()
        sys.exit(1)
    else:
        raw_stdout = getattr(sys.stdout, 'buffer', sys.stdout)
        raw_stdout.write(out)
        # Keep the trailing newline that print() used to add.
        raw_stdout.write(b'\n')
        raw_stdout.flush()
82