#!/usr/bin/perl
#
# htmldiff - present a diff marked version of two html documents
#
# Copyright (c) 1998-2006 MACS, Inc.
#
# Copyright (c) 2007 SiSco, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# See http://www.themacs.com for more information.
#
# usage: htmldiff [[-c] [-l] [-o] [-t] oldversion newversion [output]]
#
# -c - disable metahtml comment processing
# -o - disable outputting of old text
# -l - use navindex to create sequence of diffs
# -t - add a "Show/Hide Old Content" button and its script to the output
# oldversion - the previous version of the document
# newversion - the newer version of the document
# output - a filename to place the output in. If omitted, the output goes to
#          standard output.
#
# if invoked with no options or arguments, operates as a CGI script. It then
# takes the following parameters:
#
# oldfile - the URL of the original file
# newfile - the URL of the new file
# mhtml - a flag to indicate whether it should be aware of MetaHTML comments.
#
# requires GNU diff utility
# also requires the perl modules Getopt::Std
# (CGI mode additionally uses the CGI and LWP::UserAgent modules)
#
# NOTE: The markup created by htmldiff may not validate against the HTML 4.0
# DTD. This is because the algorithm is relatively simple, and there are
# places in the markup content model where the span element is not allowed.
# Htmldiff is NOT aware of these places.
#
# $Source: /u/sources/public/2009/htmldiff/htmldiff.pl,v $
# $Revision: 1.3 $
#
# $Log: htmldiff.pl,v $
# Revision 1.3 2016/10/24 15:06:51 dom
# Summary: Use nav script always
#
# Revision 1.2 2016/10/24 15:04:28 dom
# Add navigation script
#
# Revision 1.1 2014-01-06 08:04:51 dom
# added copy of htmldiff perl script since aptest.com repo no longer available
#
# Revision 1.5 2008/03/05 13:23:16 ahby
# Fixed a problem with leading whitespace before markup.
#
# Revision 1.4 2007/12/13 13:09:16 ahby
# Updated copyright and license.
#
# Revision 1.3 2007/12/13 12:53:34 ahby
# Changed use of span to ins and del
#
# Revision 1.2 2002/02/13 16:27:23 ahby
# Changed processing model.
# Improved handling of old text and changed styles.
#
# Revision 1.1 2000/07/12 12:20:04 ahby
# Updated to remove empty spans - this fixes validation problems under
# strict.
#
# Revision 1.11 1999/12/08 19:46:45 ahby
# Fixed validation errors introduced by placing markup where it didn't
# belong.
#
# Revision 1.10 1999/10/18 13:42:58 ahby
# Added -o to the usage message.
#
# Revision 1.9 1999/05/04 12:29:11 ahby
# Added an option to turn off the display of old text.
#
# Revision 1.8 1999/04/09 14:37:27 ahby
# Fixed a perl syntax error.
#
# Revision 1.7 1999/04/09 14:35:49 ahby
# Added reference to MACS homepage.
#
# Revision 1.6 1999/04/09 14:35:09 ahby
# Added comment about validity of generated markup.
#
# Revision 1.5 1999/02/22 22:17:54 ahby
# Changed to use stylesheets.
# Changed to rely upon span.
# Changed to work around content model problems.
#
# Revision 1.4 1999/02/08 02:32:22 ahby
# Added a copyright statement.
#
# Revision 1.3 1999/02/08 02:30:40 ahby
# Added header processing.
#
# Revision 1.2 1998/12/10 17:31:31 ahby
# Fixed to escape less-thans in change blocks and to not permit change
# markup within specific elements (like TITLE).
#
# Revision 1.1 1998/11/26 00:09:22 ahby
# Initial revision
#
#

use Getopt::Std;

# usage - print the command-line synopsis to STDERR and terminate.
#
# Called when option parsing fails or too few file arguments are given.
# Exits with a non-zero status so that calling scripts can detect the
# misuse. Never returns.

sub usage {
    print STDERR "htmldiff [-c] [-l] [-o] [-t] oldversion newversion [output]\n";
    exit 1;
}

# url_encode - percent-encode control and non-ASCII bytes
#
# url_encode(string)
#
# Returns a copy of the string in which every character in the ranges
# 0x00-0x1F and 0x7F-0xFF is replaced by "%" followed by its two-digit
# lowercase hexadecimal code. All other characters (including "%", space
# and other URL-reserved punctuation) are returned unchanged.

sub url_encode {
    my ($str) = @_;
    $str =~ s/([\x00-\x1f\x7F-\xFF])/sprintf('%%%02x', ord($1))/eg;
    return $str;
}

# markit - diff-mark the streams
#
# markit(file1, file2)
#
# markit relies upon GNU diff to mark up the text.
#
# The markup is encoded using special control sequences, each emitted on a
# line of its own before and after the block it delimits:
#
# a block wrapped in control-a is old (deleted) text
# a block wrapped in control-c is new text
# a block wrapped in control-d is the new side of changed text
#
# (control-b is also accepted as old text, but none of the diff formats
# in use emits it.)
#
# The main processing loop attempts to wrap the text blocks in appropriate
# del/ins elements based upon the type of text that it is.
#
# When the loop encounters a < in the text, it stops the element. Then it
# outputs the markup that is defined, then it restarts the element.

use strict;
use warnings;

# Command-line flags; populated by Getopt::Std::getopts() in cli().
our ($opt_c, $opt_l, $opt_o, $opt_t);

# Script-wide settings and scratch file names shared by the subs below.
our ($mhtmlcomments, $stripheader);
our ($tmp1, $headertmp1, $tmp2, $headertmp2);

$mhtmlcomments = 1;    # pass MetaHTML ";;;" lines through untouched (-c disables)
$stripheader   = 1;    # copy everything up to <body> into a header file

$tmp1       = "/tmp/htdtmp1.$$";
$headertmp1 = "/tmp/htdhtmp1.$$";
$tmp2       = "/tmp/htdtmp2.$$";
$headertmp2 = "/tmp/htdhtmp2.$$";

# _html_escape - make a string safe for use in HTML text or attribute values
#
# Returns a copy of the argument with &, <, > and " replaced by character
# entities. An undefined argument is treated as the empty string.

sub _html_escape {
    my ($text) = @_;
    $text = '' unless defined $text;
    $text =~ s/&/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}

# _write_file - write a string to a file, replacing any existing content.
# Dies if the file cannot be opened or written completely.

sub _write_file {
    my ($path, $content) = @_;
    open(my $out_fh, '>', $path) or die("File $path cannot be written: $!");
    print {$out_fh} $content;
    close($out_fh) or die("File $path cannot be written: $!");
    return;
}

# _read_file - return the whole content of a file as one string.
# Dies if the file cannot be opened.

sub _read_file {
    my ($path) = @_;
    open(my $in_fh, '<', $path) or die("File $path cannot be opened: $!");
    my $content = "";
    while (my $line = <$in_fh>) {
        $content .= $line;
    }
    close($in_fh);
    return $content;
}

# markit - run GNU diff over two pre-split files and return marked-up HTML
#
# markit(file1, file2)
#
# file1/file2 are the names of files produced from splitit() output (one
# token or tag per line). diff is asked to bracket each old, new and
# changed group with lines holding a single control character; the loop
# below turns those groups into <del>/<ins> elements.
#
# Honors $opt_o (old text is not output) and $opt_l (each emitted block is
# wrapped in an <a tabindex="N"> anchor). Returns the marked-up text.
# Dies if diff cannot be started.

sub markit {
    my ($file1, $file2) = @_;
    my $retval = "";

    # GNU diff line-group formats. %c'X' emits the single character X, so
    # every group is preceded and followed by a line containing only its
    # marker: \001 = old text, \003 = new text, \004 = new side of a change.
    my $old       = "%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'";
    my $new       = "%c'\012'%c'\003'%c'\012'%>%c'\012'%c'\003'%c'\012'";
    my $unchanged = "%=";
    my $changed   = "%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'%c'\004'%c'\012'%>%c'\012'%c'\004'%c'\012'";
    if ($opt_o) {
        # -o: suppress old text entirely
        $old     = "";
        $changed = "%c'\012'%c'\004'%c'\012'%>%c'\012'%c'\004'%c'\012'";
    }

    # Opening and closing tags indexed by state number (1-4); state 0 is
    # the resting state and has no tags.
    my @open_tag = (
        undef,
        '<del class="diff-old">',
        '<del class="diff-old">',
        '<ins class="diff-new">',
        '<ins class="diff-chg">',
    );
    my @close_tag = (undef, '</del>', '</del>', '</ins>', '</ins>');

    my $diffcounter = 0;

    # List-form pipe open: the arguments reach diff exactly as written,
    # without passing through a shell.
    open(my $diff_fh, '-|', 'diff', '-d',
        "--old-group-format=$old",
        "--new-group-format=$new",
        "--changed-group-format=$changed",
        "--unchanged-group-format=$unchanged",
        '--', $file1, $file2)
        or die("Diff failed: $!");

    my $state = 0;     # 0 = resting, 1-4 = inside the corresponding block
    my $temp  = "";    # content accumulated for the current block

    # strategy:
    #
    # process the output of diff...
    #
    # a line with control A-D means the start/end of the corresponding
    # ordinal state (1-4). Resting state is state 0.
    #
    # While in a state, accumulate the contents for that state. When exiting
    # the state, determine if it is appropriate to emit the contents with
    # markup or not (basically, if the accumulated buffer contains only empty
    # lines or lines with markup, then we don't want to emit the wrappers).
    #
    # Note that if there is markup in the "old" block, that markup is
    # silently removed. It isn't really that interesting, and it messes up
    # the output something fierce.

    while (my $line = <$diff_fh>) {
        my $anchor    = $opt_l ? qq[<a tabindex="$diffcounter">] : "";
        my $anchorEnd = $opt_l ? q[</a>] : "";

        if ($state == 0) {
            # resting: a marker line means we are entering a block
            if ($line =~ m/^([\001-\004])/) {
                $state = ord($1);
                $line  = "";
            }
        }
        else {
            my $open  = $open_tag[$state];
            my $close = $close_tag[$state];

            if ($state == 1 || $state == 2) {
                # old text: drop every complete tag (but keep the text
                # between tags), then escape any stray angle brackets
                $line =~ s/<[^>]*>//g;
                $line =~ s/</&lt;/g;
                $line =~ s/>/&gt;/g;
            }

            if ($line =~ m/^([\001-\004])/) {
                # another marker: leaving the block, emit what was gathered
                if ($temp ne "") {
                    $line = $open . $anchor . $temp . $anchorEnd . $close . "\n";
                    $temp = "";
                }
                else {
                    $line = "";
                }
                $state = 0;
            }
            elsif ($line =~ m/^\s*</) {
                # the line is markup
                if ($temp eq "") {
                    # nothing gathered yet: emit the markup unwrapped
                    $retval .= $line;
                }
                else {
                    # close the element before the markup and reopen it
                    # after (before the trailing newline), then hold it
                    $line =~ s/^/$anchorEnd$close/;
                    $line =~ s/$/$open$anchor/;
                    $temp .= $line;
                }
                $line = "";
            }
            elsif ($line =~ m/.+/) {
                # ordinary non-empty text: hold it for the block
                $temp .= $line;
                $line = "";
            }
        }

        $line =~ tr/\001-\004//d;    # never let marker characters leak out
        if ($line !~ m/^$/) {
            $retval .= $line;
        }
        $diffcounter++;
    }
    # diff exits non-zero whenever the inputs differ, so the close status
    # is deliberately not treated as an error.
    close($diff_fh);

    # Remove elements that ended up containing nothing but newlines ...
    for my $state_no (1 .. 4) {
        my ($open, $close) = ($open_tag[$state_no], $close_tag[$state_no]);
        $retval =~ s/\Q$open\E\n+\Q$close\E//g;
    }
    # ... and an opening tag left dangling at the very end of the output.
    for my $state_no (1 .. 4) {
        my $open = $open_tag[$state_no];
        $retval =~ s/\Q$open\E\n*$//g;
    }
    return $retval;
}

# splitit - split an HTML file into a header file and a diff-able body
#
# splitit(filename, headertmp)
#
# Everything up to and including the line containing <body is copied to
# the file named by headertmp (only when $stripheader is set); the diff
# stylesheet and navigation script are inserted before the line that
# contains </head, and with -t the show/hide script and button are added.
#
# The rest of the document is returned as a string with HTML comments
# removed and, outside <pre> blocks and MetaHTML ";;;" lines, each
# whitespace-separated word and each tag placed on a line of its own.
#
# Dies if the input file cannot be opened or the header file cannot be
# created.

sub splitit {
    my ($filename, $headertmp) = @_;
    my $preformatted = 0;
    my $inelement    = 0;    # count of "<" not yet matched by ">"
    my $retval       = "";
    my $styles = q(<style type='text/css'>
.diff-old-a {
  font-size: smaller;
  color: red;
}

.diff-new { background-color: yellow; }
.diff-chg { background-color: lime; }
.diff-new:before,
.diff-new:after
    { content: "\2191" }
.diff-chg:before, .diff-chg:after
    { content: "\2195" }
.diff-old { text-decoration: line-through; background-color: #FBB; }
.diff-old:before,
.diff-old:after
    { content: "\2193" }
:focus { border: thin red solid}
</style>
<script src="https://www.w3.org/2016/10/htmldiff-nav.js"></script>);
    if ($opt_t) {
        $styles .= q(
<script type="text/javascript">
<!--
function setOldDisplay() {
    for ( var s = 0; s < document.styleSheets.length; s++ ) {
        var css = document.styleSheets[s];
        var mydata ;
        try { mydata = css.cssRules ;
        if ( ! mydata ) mydata = css.rules;
        for ( var r = 0; r < mydata.length; r++ ) {
            if ( mydata[r].selectorText == '.diff-old' ) {
                mydata[r].style.display = ( mydata[r].style.display == '' ) ? 'none'
: '';
                return;
            }
        }
        } catch(e) {} ;
    }
}
-->
</script>
);

    }

    my $header_fh;
    if ($stripheader) {
        open($header_fh, '>', $headertmp)
            or die("Header file $headertmp cannot be written: $!");
    }

    my $incomment = 0;
    my $inhead    = 1;
    open(my $in_fh, '<', $filename)
        or die("File $filename cannot be opened: $!");
    LINE:
    while (my $line = <$in_fh>) {
        if ($inhead == 1) {
            if ($line =~ m/<\/head/i) {
                # styles go in just ahead of the line that closes the head
                print {$header_fh} $styles if $header_fh;
            }
            print {$header_fh} $line if $header_fh;
            if ($line =~ m/<body/i) {
                # the <body line is the last line of the header
                $inhead = 0;
                if ($header_fh) {
                    if ($opt_t) {
                        print {$header_fh} q(
<form action=""><input type="button" onclick="setOldDisplay()" value="Show/Hide Old Content" /></form>
);
                    }
                    close($header_fh);
                    $header_fh = undef;
                }
            }
            next LINE;
        }

        # strip HTML comments, including ones spanning several lines
        if ($incomment) {
            if ($line =~ m/-->/) {
                $incomment = 0;
                $line =~ s/.*-->//;
            }
            else {
                next LINE;
            }
        }
        if ($line =~ m/<!--/) {
            while ($line =~ m/<!--.*-->/) {
                $line =~ s/<!--.*?-->//;
            }
            if ($line =~ m/<!--/) {
                # comment opens here and continues on later lines
                $incomment = 1;
                $line =~ s/<!--.*//;
            }
        }

        if ($line =~ m/<pre/i) {
            $preformatted = 1;
        }
        if ($line =~ m/<\/pre>/i) {
            $preformatted = 0;
        }

        if ($preformatted) {
            $retval .= $line;
        }
        elsif ($mhtmlcomments && $line =~ /^;;;/) {
            $retval .= $line;
        }
        else {
            # NOTE: special handling of H1-H6 content was disabled in the
            # original code, so headings are split like everything else.
            foreach my $element (split(' ', $line)) {
                # put every tag on a line of its own ...
                $element =~ s/</\n</g;
                $element =~ s/^\n//;
                $element =~ s/>/>\n/g;
                $element =~ s/\n$//;
                # ... but keep punctuation attached to the tag it follows
                $element =~ s/>\n([.,:!]+)/>$1/g;

                $retval .= $element;

                # Words inside an unfinished tag (its attributes) stay on
                # the same line, separated by single spaces.
                $inelement += ($element =~ tr/<//);
                $inelement -= ($element =~ tr/>//);
                if ($inelement < 0) {
                    $inelement = 0;
                }
                $retval .= ($inelement == 0) ? "\n" : " ";
            }
        }
    }
    $retval .= "\n";
    close($in_fh);
    # No <body line was seen: the header file is still open.
    close($header_fh) if $header_fh;
    return $retval;
}

# cli - command-line entry point
#
# Parses the -c, -l, -t and -o flags, splits both input files, diffs them
# and writes the header of the new file followed by the marked-up body to
# the third file argument, or to standard output when none is given.
# Calls usage() (which exits) on bad options or fewer than two files.

sub cli {
    getopts("clto") || usage();

    if ($opt_c) { $mhtmlcomments = 0; }

    if (@ARGV < 2) { usage(); }

    my ($file1, $file2, $file3) = @ARGV;

    _write_file($tmp1, splitit($file1, $headertmp1));
    _write_file($tmp2, splitit($file2, $headertmp2));

    my $output = "";
    if ($stripheader) {
        $output .= _read_file($headertmp2);
    }
    $output .= markit($tmp1, $tmp2);

    if ($file3) {
        _write_file($file3, $output);
    }
    else {
        print $output;
    }
    return;
}

# _report_fetch_failure - emit a complete CGI error page for a failed
# download: the HTTP header, the response's own error page, and a note
# naming the (escaped) URL.

sub _report_fetch_failure {
    my ($query, $response, $url) = @_;
    print $query->header(-type => 'text/html', -nph => 1);
    print $response->error_as_HTML();
    print "<p>The URL " . _html_escape($url)
        . " could not be found. Please check it and try again.</p>";
    return;
}

# cgi - CGI entry point
#
# Reads the "oldfile" and "newfile" request parameters (both URLs),
# downloads them, diffs them as cli() does, adds a <base> element pointing
# at the new document's location unless the output already has one, and
# prints the result with an NPH text/html header. On a download failure an
# error page is printed instead.
#
# The CGI, LWP::UserAgent and HTTP::Request modules are loaded here, at
# run time, so that command-line use does not need them installed.

sub cgi {
    require CGI;
    require LWP::UserAgent;
    require HTTP::Request;

    my $query = CGI->new;
    my $url1  = $query->param("oldfile");
    my $url2  = $query->param("newfile");
    # NOTE(review): the file header documents "mhtml" as the switch for
    # MetaHTML comment handling, but its value has never been applied to
    # $mhtmlcomments -- confirm the intended values before wiring it up.
    my $mhtml = $query->param("mhtml");

    my $file1 = "/tmp/htdcgi1.$$";
    my $file2 = "/tmp/htdcgi2.$$";

    my $ua = LWP::UserAgent->new;
    $ua->agent("MACS, Inc. HTMLdiff/0.9 " . $ua->agent);
    # The URLs come straight from the request; never let them name local
    # files on the server.
    $ua->protocols_forbidden(['file']);

    # Fetch both documents; the second argument makes LWP store the
    # response body in that file.
    my $res1 = $ua->request(HTTP::Request->new(GET => $url1), $file1);
    if ($res1->is_error) {
        _report_fetch_failure($query, $res1, $url1);
        unlink $file1;
        return;
    }

    my $res2 = $ua->request(HTTP::Request->new(GET => $url2), $file2);
    if ($res2->is_error) {
        _report_fetch_failure($query, $res2, $url2);
        unlink $file1, $file2;
        return;
    }

    _write_file($tmp1, splitit($file1, $headertmp1));
    _write_file($tmp2, splitit($file2, $headertmp2));

    my $output = "";
    if ($stripheader) {
        $output .= _read_file($headertmp2);
    }
    $output .= markit($tmp1, $tmp2);

    # Relative links in the new document must keep resolving against its
    # original location: use the directory part of its base URL.
    my $base = "" . $res2->base;
    if ($base !~ /\/$/) {
        $base =~ s/[^\/]*$//;
    }
    $base = _html_escape($base);

    if ($output !~ /<base/i) {
        $output =~ s/<head>/<head>\n<base href="$base">/i
            || $output =~ s/<html>/<html>\n<base href="$base">/i;
    }

    print $query->header(-type => 'text/html', -nph => 1);
    print $output;

    unlink $file1;
    unlink $file2;
    return;
}

if (@ARGV == 0) {
    cgi();    # if no arguments, we must be operating as a cgi script
}
else {
    cli();    # if there are arguments, then we are operating as a CLI
}

unlink $tmp1;
unlink $headertmp1;
unlink $tmp2;
unlink $headertmp2;