• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/perl
2#
3# htmldiff - present a diff marked version of two html documents
4#
5# Copyright (c) 1998-2006 MACS, Inc.
6#
7# Copyright (c) 2007 SiSco, Inc.
8#
9# Permission is hereby granted, free of charge, to any person obtaining
10# a copy of this software and associated documentation files (the
11# "Software"), to deal in the Software without restriction, including
12# without limitation the rights to use, copy, modify, merge, publish,
13# distribute, sublicense, and/or sell copies of the Software, and to
14# permit persons to whom the Software is furnished to do so, subject to
15# the following conditions:
16#
17# The above copyright notice and this permission notice shall be
18# included in all copies or substantial portions of the Software.
19#
20# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27#
28# See http://www.themacs.com for more information.
29#
30# usage: htmldiff [[-c] [-l] [-o] [-t] oldversion newversion [output]]
31#
32# -c - disable metahtml comment processing
33# -o - disable outputting of old text
34# -l - use navindex to create sequence of diffs
# -t - add a button to the output that shows/hides the old content
35# oldversion - the previous version of the document
36# newversion - the newer version of the document
37# output - a filename to place the output in. If omitted, the output goes to
38#          standard output.
39#
40# if invoked with no options or arguments, operates as a CGI script. It then
41# takes the following parameters:
42#
43# oldfile - the URL of the original file
44# newfile - the URL of the new file
45# mhtml - a flag to indicate whether it should be aware of MetaHTML comments.
46#
47# requires GNU diff utility
48# also requires the perl module Getopt::Std (and, when run as a CGI script, CGI and LWP::UserAgent)
49#
50# NOTE: The markup created by htmldiff may not validate against the HTML 4.0
51# DTD. This is because the algorithm is relatively simple, and there are
52# places in the markup content model where the span element is not allowed.
53# Htmldiff is NOT aware of these places.
54#
55# $Source: /u/sources/public/2009/htmldiff/htmldiff.pl,v $
56# $Revision: 1.3 $
57#
58# $Log: htmldiff.pl,v $
59# Revision 1.3  2016/10/24 15:06:51  dom
60# Summary: Use nav script always
61#
62# Revision 1.2  2016/10/24 15:04:28  dom
63# Add navigation script
64#
65# Revision 1.1  2014-01-06 08:04:51  dom
66# added copy of htmldiff perl script since aptest.com repo no longer available
67#
68# Revision 1.5  2008/03/05 13:23:16  ahby
69# Fixed a problem with leading whitespace before markup.
70#
71# Revision 1.4  2007/12/13 13:09:16  ahby
72# Updated copyright and license.
73#
74# Revision 1.3  2007/12/13 12:53:34  ahby
75# Changed use of span to ins and del
76#
77# Revision 1.2  2002/02/13 16:27:23  ahby
78# Changed processing model.
79# Improved handling of old text and changed styles.
80#
81# Revision 1.1  2000/07/12 12:20:04  ahby
82# Updated to remove empty spans - this fixes validation problems under
83# strict.
84#
85# Revision 1.11  1999/12/08 19:46:45  ahby
86# Fixed validation errors introduced by placing markup where it didn't
87# belong.
88#
89# Revision 1.10  1999/10/18 13:42:58  ahby
90# Added -o to the usage message.
91#
92# Revision 1.9  1999/05/04 12:29:11  ahby
93# Added an option to turn off the display of old text.
94#
95# Revision 1.8  1999/04/09 14:37:27  ahby
96# Fixed a perl syntax error.
97#
98# Revision 1.7  1999/04/09 14:35:49  ahby
99# Added reference to MACS homepage.
100#
101# Revision 1.6  1999/04/09 14:35:09  ahby
102# Added comment about validity of generated markup.
103#
104# Revision 1.5  1999/02/22 22:17:54  ahby
105# Changed to use stylesheets.
106# Changed to rely upon span.
107# Changed to work around content model problems.
108#
109# Revision 1.4  1999/02/08 02:32:22  ahby
110# Added a copyright statement.
111#
112# Revision 1.3  1999/02/08 02:30:40  ahby
113# Added header processing.
114#
115# Revision 1.2  1998/12/10 17:31:31  ahby
116# Fixed to escape less-thans in change blocks and to not permit change
117# markup within specific elements (like TITLE).
118#
119# Revision 1.1  1998/11/26 00:09:22  ahby
120# Initial revision
121#
122#
123
124use Getopt::Std;
125
# usage - print the command-line synopsis to STDERR and terminate.
#
# Called when option parsing fails or fewer than two file arguments are
# given.  The synopsis lists every option accepted by getopts("clto"), and
# the process exits with a non-zero status so that callers can detect the
# error.
sub usage {
	print STDERR "usage: htmldiff [-c] [-l] [-o] [-t] oldversion newversion [output]\n";
	exit 1;
}
130
# url_encode - percent-encode control and high characters in a string.
#
# url_encode(string)
#
# Each character in the ranges 0x00-0x1F and 0x7F-0xFF is replaced by
# "%xx", where xx is its code as two lowercase hex digits.  Every other
# character (including space and "%") is passed through untouched.
# Returns the encoded copy; the caller's string is not modified.
sub url_encode {
    my ($text) = @_;
    my $escape = sub { return sprintf('%%%02x', ord($_[0])); };
    $text =~ s{([\x00-\x1f\x7F-\xFF])}{$escape->($1)}eg;
    return $text;
}
137
138# markit - diff-mark the streams
139#
140# markit(file1, file2)
141#
142# markit relies upon GNUdiff to mark up the text.
143#
144# The markup is encoded using special control sequences:
145#
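146#   a block wrapped in control-a is deleted (old) text
147#   a block wrapped in control-b is old text (reserved; the diff formats below never emit it)
148#   a block wrapped in control-c is new text, and one wrapped in control-d is changed text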
149#
150# The main processing loop attempts to wrap the text blocks in appropriate
151# SPANs based upon the type of text that it is.
152#
153# When the loop encounters a < in the text, it stops the span. Then it outputs
154# the element that is defined, then it restarts the span.
155
# markit - run diff over two pre-split documents and mark up the changes
#
# markit(file1, file2)
#
#   file1 - path of the split form of the old document
#   file2 - path of the split form of the new document
#
# Returns the merged text with removed lines wrapped in <del> and added or
# changed lines wrapped in <ins>.  Dies if diff cannot be started, and warns
# if diff exits with a status above 1.
#
# Reads the option globals $opt_o (do not output old text) and $opt_l (wrap
# every marked block in an <a tabindex="..."> anchor).
sub markit {
	my ($file1, $file2) = @_;
	my $retval = "";

	# Group formats handed to diff.  Every group is bracketed by a line
	# holding a single control character: \001 around old text, \003 around
	# new text and \004 around the new half of a changed group.
	my $old="%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'";
	my $new="%c'\012'%c'\003'%c'\012'%>%c'\012'%c'\003'%c'\012'";
	my $unchanged="%=";
	my $changed="%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'%c'\004'%c'\012'%>%c'\012'%c'\004'%c'\012'";
	if ($opt_o) {
		$old = "";
		$changed = "%c'\012'%c'\004'%c'\012'%>%c'\012'%c'\004'%c'\012'";
	}

	# opening and closing markup, indexed by state number
	my @span;
	$span[1]="<del class=\"diff-old\">";
	$span[2]="<del class=\"diff-old\">";
	$span[3]="<ins class=\"diff-new\">";
	$span[4]="<ins class=\"diff-chg\">";

	my @diffEnd ;
	$diffEnd[1] = '</del>';
	$diffEnd[2] = '</del>';
	$diffEnd[3] = '</ins>';
	$diffEnd[4] = '</ins>';

	my $diffcounter = 0;

	# List-form pipe open: the arguments go straight to diff without passing
	# through a shell, so nothing in the formats or file names needs quoting.
	open(my $diff, '-|', 'diff', '-d',
		"--old-group-format=$old",
		"--new-group-format=$new",
		"--changed-group-format=$changed",
		"--unchanged-group-format=$unchanged",
		'--', $file1, $file2) || die("Diff failed: $!");

	my $state = 0;
	my $temp = "";

# strategy:
#
# process the output of diff...
#
# a line with control A-D means the start/end of the corresponding ordinal
# state (1-4). Resting state is state 0.
#
# While in a state, accumulate the contents for that state. When exiting the
# state, determine if it is appropriate to emit the contents with markup or
# not (basically, if the accumulated buffer contains only empty lines or lines
# with markup, then we don't want to emit the wrappers.  We don't need them.
#
# Note that if there is markup in the "old" block, that markup is silently
# removed.  It isn't really that interesting, and it messes up the output
# something fierce.

	while (<$diff>) {
		my $anchor = $opt_l ? qq[<a tabindex="$diffcounter">] : "" ;
		my $anchorEnd = $opt_l ? q[</a>] : "" ;
		if ($state == 0) {	# if we are resting and we find a marker,
							# then we must be entering a block
			if (m/^([\001-\004])/) {
				$state = ord($1);
				$_ = "";
			}
		} else {
			# if we are in "old" state, remove markup
			if (($state == 1) || ($state == 2)) {
				# strip each tag on its own so that the text between two
				# tags on the same line is kept
				s/<[^>]*>//g;
				s/</&lt;/g; # escape any remaining STAG or ETAGs
				s/>/&gt;/g;
			}
			# if we found another marker, we must be exiting the state
			if (m/^([\001-\004])/) {
				if ($temp ne "") {
					$_ = $span[$state] . $anchor . $temp . $anchorEnd . $diffEnd[$state] . "\n";
					$temp = "";
				} else {
					$_ = "" ;
				}
				$state = 0;
			} elsif (m/^\s*</) { # otherwise, is this line markup?
				# if it is markup AND we haven't seen anything else yet,
				# then we will emit the markup
				if ($temp eq "") {
					$retval .= $_;
					$_ = "";
				} else {	# we wrap it with the state switches and hold it
					s/^/$anchorEnd$diffEnd[$state]/;
					s/$/$span[$state]$anchor/;
					$temp .= $_;
					$_ = "";
				}
			} elsif (m/.+/) {	# plain text: hold it until the block ends
				$temp .= $_;
				$_ = "";
			}
		}

		s/[\001-\004]//g;	# drop any marker characters that are left
		if ($_ !~ m/^$/) {
			$retval .= $_;
		}
		$diffcounter++;
	}
	close $diff;
	# diff exits with 0 (no differences) or 1 (differences); anything
	# higher means it ran into trouble and the output may be incomplete
	if (($? >> 8) > 1) {
		warn("diff exited with status " . ($? >> 8) . "\n");
	}

	# remove wrappers that ended up holding nothing but newlines ...
	foreach my $i (1 .. 4) {
		my $open = quotemeta($span[$i]);
		my $end = quotemeta($diffEnd[$i]);
		$retval =~ s/$open\n+$end//g;
	}
	# ... and any wrapper left open at the very end of the output
	foreach my $i (1 .. 4) {
		my $open = quotemeta($span[$i]);
		$retval =~ s/$open\n*$//g;
	}
	return $retval;
}
292
# splitit - split an HTML document into a header file and a diff-able body
#
# splitit(filename, headertmp)
#
#   filename  - path of the HTML document to read
#   headertmp - path that receives the document header
#
# Every line up to and including the first one containing "<body" is the
# header.  When the global $stripheader is true the header is written to
# headertmp, with the diff stylesheet and navigation script inserted in front
# of the line holding </head>; with $opt_t a show/hide script and button are
# added as well.
#
# The rest of the document is returned as a string.  Comments are removed.
# Lines inside <pre> (and, when $mhtmlcomments is set, lines starting with
# ";;;") are kept as they are; all other lines are broken up so that each
# tag and each word ends up on a line of its own, which is the form markit
# expects.
#
# Dies if the input file cannot be opened or the header file cannot be
# written.
sub splitit {
	my ($filename, $headertmp) = @_;
	my $inheader=0;		# never set: heading detection is disabled below
	my $preformatted=0;
	my $inelement=0;
	my $retval = "";
	my $styles = q(<style type='text/css'>
.diff-old-a {
  font-size: smaller;
  color: red;
}

.diff-new { background-color: yellow; }
.diff-chg { background-color: lime; }
.diff-new:before,
.diff-new:after
    { content: "\2191" }
.diff-chg:before, .diff-chg:after
    { content: "\2195" }
.diff-old { text-decoration: line-through; background-color: #FBB; }
.diff-old:before,
.diff-old:after
    { content: "\2193" }
:focus { border: thin red solid}
</style>
<script src="https://www.w3.org/2016/10/htmldiff-nav.js"></script>);
	if ($opt_t) {
		$styles .= q(
<script type="text/javascript">
<!--
function setOldDisplay() {
	for ( var s = 0; s < document.styleSheets.length; s++ ) {
		var css = document.styleSheets[s];
		var mydata ;
		try { mydata = css.cssRules ;
		if ( ! mydata ) mydata = css.rules;
		for ( var r = 0; r < mydata.length; r++ ) {
			if ( mydata[r].selectorText == '.diff-old' ) {
				mydata[r].style.display = ( mydata[r].style.display == '' ) ? 'none'
: '';
				return;
			}
		}
		} catch(e) {} ;
	}
}
-->
</script>
);

	}

	# $header stays undefined when the header is not being saved, and is
	# reset to undef once the header file has been closed
	my $header;
	if ($stripheader) {
		open($header, '>', $headertmp)
			|| die("Header file $headertmp cannot be written: $!");
	}

	my $incomment = 0;
	my $inhead = 1;
	open(my $in, '<', $filename) || die("File $filename cannot be opened: $!");
	while (<$in>) {
		if ($inhead == 1) {
			if (m/<\/head/i) {
				print {$header} $styles if $header;
			}
			print {$header} $_ if $header;
			if (m/<body/i) {
				# the line holding <body> is the last header line
				$inhead = 0;
				if ($header) {
					if ($opt_t) {
						print {$header} q(
<form action=""><input type="button" onclick="setOldDisplay()" value="Show/Hide Old Content" /></form>
);
					}
					close($header)
						|| die("Header file $headertmp cannot be written: $!");
					$header = undef;
				}
			}
		} else {
			if ($incomment) {
				if (m;-->;) {
					$incomment = 0;
					s/.*-->//;
				} else {
					next;
				}
			}
			if (m;<!--;) {
				# remove the comments that are complete on this line ...
				while (m;<!--.*-->;) {
					s/<!--.*?-->//;
				}
				# ... and the start of one that runs on to later lines
				if (m;<!--; ) {
					$incomment = 1;
					s/<!--.*//;
				}
			}
			if (m/<pre/i) {
				$preformatted = 1;
			}
			if (m/<\/pre>/i) {
				$preformatted = 0;
			}
			if ($preformatted) {
				$retval .= $_;
			} elsif ($mhtmlcomments && /^;;;/) {
				$retval .= $_;
			} else {
				foreach my $element (split(' ')) {
					if ($inheader == 0) {
						# put every tag on a line of its own, but keep
						# punctuation attached to the tag it follows
						$element =~ s/</\n</g;
						$element =~ s/^\n//;
						$element =~ s/>/>\n/g;
						$element =~ s/\n$//;
						$element =~ s/>\n([.,:!]+)/>$1/g;
					}
					if ($element =~ m/<\/H[1-6]>/i) {
						$inheader = 0;
					}
					$retval .= $element;
					# track whether we are inside a tag that was split
					# across several whitespace-separated pieces
					$inelement += ($element =~ tr/<//);
					$inelement -= ($element =~ tr/>//);
					if ($inelement < 0) {
						$inelement = 0;
					}
					if (($inelement == 0) && ($inheader == 0)) {
						$retval .= "\n";
					} else {
						$retval .= " ";
					}
				}
			}
		}
	}
	$retval .= "\n";
	close $in;
	# a document without a <body> tag never closed the header above; close
	# it here so that the header is on disk before the caller reads it back
	if ($header) {
		close($header) || die("Header file $headertmp cannot be written: $!");
	}
	return $retval;
}
434
# MetaHTML comment lines (those starting with ";;;") are passed through
# unsplit by default; the -c option turns this off.
$mhtmlcomments = 1;

# cli - run htmldiff as a command-line tool
#
# Parses the options -c, -l, -t and -o, then expects two or three arguments
# in @ARGV: the old document, the new document and an optional output file.
# Both documents are split into the temporary files named by the globals
# $tmp1/$headertmp1 and $tmp2/$headertmp2, and the split bodies are compared
# with markit.  The result is preceded by the header of the new document when
# $stripheader is set, and is written to the output file or, if none was
# given, to standard output.
#
# Prints the usage message and exits when the arguments are invalid; dies
# when a temporary or output file cannot be read or written.
sub cli {
	getopts("clto") || usage();

	$mhtmlcomments = 0 if $opt_c;

	usage() if @ARGV < 2;

	my ($file1, $file2, $file3) = @ARGV;

	# split each document: the body goes into the diff input file, the
	# header is saved by splitit itself
	foreach my $job ([$file1, $headertmp1, $tmp1], [$file2, $headertmp2, $tmp2]) {
		my ($source, $headertmp, $bodytmp) = @$job;
		my $body = splitit($source, $headertmp);
		open(my $out, '>', $bodytmp)
			|| die("Temporary file $bodytmp cannot be written: $!");
		print {$out} $body;
		close($out) || die("Temporary file $bodytmp cannot be written: $!");
	}

	my $output = "";

	if ($stripheader) {
		# the result starts with the header of the new document
		open(my $header, '<', $headertmp2)
			|| die("Header file $headertmp2 cannot be opened: $!");
		$output .= join("", <$header>);
		close($header);
	}

	$output .= markit($tmp1, $tmp2);

	if (defined($file3) && $file3 ne "") {
		open(my $out, '>', $file3)
			|| die("Output file $file3 cannot be written: $!");
		print {$out} $output;
		close($out) || die("Output file $file3 cannot be written: $!");
	} else {
		print $output;
	}
}
478
# cgi - run htmldiff as a CGI script
#
# Takes the request parameters "oldfile" and "newfile", each the URL of a
# document.  Both documents are downloaded into temporary files, split with
# splitit and compared with markit.  The result is preceded by the header of
# the new document when $stripheader is set, gets a <base> element pointing
# at the new document if it has none, and is printed as an nph text/html
# response.
#
# If a document cannot be fetched, an error page is printed instead and the
# function returns early.  The downloaded files are removed in both cases.
#
# The "mhtml" parameter described in the file header is not acted upon here.
#
# NOTE(review): the URLs come straight from the request.  Depending on how
# LWP is set up, schemes other than http/https (such as file:) may be
# fetched - consider restricting them with $ua->protocols_allowed.
sub cgi {
	# loaded here, at run time, so that command-line use does not need them
	require CGI;
	require LWP::UserAgent;
	require HTTP::Request;

	my $query = CGI->new;
	my $url1 = $query->param("oldfile");
	my $url2 = $query->param("newfile");

	my $file1 = "/tmp/htdcgi1.$$";
	my $file2 = "/tmp/htdcgi2.$$";

	my $ua = LWP::UserAgent->new;
	$ua->agent("MACS, Inc. HTMLdiff/0.9 " . $ua->agent);

	# download both documents; the response content is stored in the file
	my @responses;
	foreach my $fetch ([$url1, $file1], [$url2, $file2]) {
		my ($url, $file) = @$fetch;
		my $res = $ua->request(HTTP::Request->new(GET => $url), $file);
		if ($res->is_error) {
			# the URL is user input: escape it before echoing it back
			my $shown = $query->escapeHTML(defined($url) ? $url : "");
			print $query->header(-type=>'text/html',-nph=>1,-status=>'502 Bad Gateway');
			print $res->error_as_HTML();
			print "<p>The URL $shown could not be found.  Please check it and try again.</p>";
			unlink $file1, $file2;
			return;
		}
		push @responses, $res;
	}

	# split each document: the body goes into the diff input file, the
	# header is saved by splitit itself
	foreach my $job ([$file1, $headertmp1, $tmp1], [$file2, $headertmp2, $tmp2]) {
		my ($source, $headertmp, $bodytmp) = @$job;
		my $body = splitit($source, $headertmp);
		open(my $out, '>', $bodytmp)
			|| die("Temporary file $bodytmp cannot be written: $!");
		print {$out} $body;
		close($out) || die("Temporary file $bodytmp cannot be written: $!");
	}

	my $output = "";

	if ($stripheader) {
		# the result starts with the header of the new document
		open(my $header, '<', $headertmp2)
			|| die("Header file $headertmp2 cannot be opened: $!");
		$output .= join("", <$header>);
		close($header);
	}

	$output .= markit($tmp1, $tmp2);

	# relative links in the result must resolve against the new document
	my $base = $responses[1]->base;

	if ($base !~ /\/$/) {
		$base =~ s/[^\/]*$//;
	}

	if ( $output !~ /<base/i ) {
		$output =~ s/<head>/<head>\n<base href="$base">/i ||
		$output =~ s/<html>/<html>\n<base href="$base">/i ;
	}

	print $query->header(-type=>'text/html',-nph=>1);
	print $output;

	unlink $file1;
	unlink $file2;

}
554
# Working files shared by cli(), cgi() and splitit().  The names are made
# unique per process with the process id.
# NOTE(review): names of this form in /tmp are predictable; consider
# creating them with File::Temp.
$tmp1="/tmp/htdtmp1.$$";
$headertmp1="/tmp/htdhtmp1.$$";
$tmp2="/tmp/htdtmp2.$$";
$headertmp2="/tmp/htdhtmp2.$$";
$stripheader = 1;

# Remove the working files when the program ends.  Doing this in an END
# block means they are also removed when cli() or cgi() dies part way.
END {
	unlink grep { defined } ($tmp1, $headertmp1, $tmp2, $headertmp2);
}

if (@ARGV == 0) {
	cgi();		# if no arguments, we must be operating as a cgi script
} else {
	cli();		# if there are arguments, then we are operating as a CLI
}
571