#!/usr/bin/perl
#
# htmldiff - present a diff marked version of two html documents
#
# Copyright (c) 1998-2006 MACS, Inc.
#
# Copyright (c) 2007 SiSco, Inc.
#
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
# See http://www.themacs.com for more information.
#
# usage: htmldiff [[-c] [-l] [-o] [-t] oldversion newversion [output]]
#
#   -c - disable metahtml comment processing
#   -o - disable outputting of old text
#   -l - use navindex to create sequence of diffs
#   -t - add a button to the output that shows/hides the old content
#   oldversion - the previous version of the document
#   newversion - the newer version of the document
#   output - a filename to place the output in. If omitted, the output goes to
#            standard output.
#
# If invoked with no options or arguments, operates as a CGI script. It then
# takes the following parameters:
#
#   oldfile - the URL of the original file
#   newfile - the URL of the new file
#   mhtml - a flag to indicate whether it should be aware of MetaHTML comments.
#
# requires GNU diff utility
# also requires the perl modules Getopt::Std
#
# NOTE: The markup created by htmldiff may not validate against the HTML 4.0
# DTD. This is because the algorithm is relatively simple, and there are
# places in the markup content model where the span element is not allowed.
# Htmldiff is NOT aware of these places.
#
# $Source: /u/sources/public/2009/htmldiff/htmldiff.pl,v $
# $Revision: 1.3 $
#
# $Log: htmldiff.pl,v $
# Revision 1.3 2016/10/24 15:06:51 dom
# Summary: Use nav script always
#
# Revision 1.2 2016/10/24 15:04:28 dom
# Add navigation script
#
# Revision 1.1 2014-01-06 08:04:51 dom
# added copy of htmldiff perl script since aptest.com repo no longer available
#
# Revision 1.5 2008/03/05 13:23:16 ahby
# Fixed a problem with leading whitespace before markup.
#
# Revision 1.4 2007/12/13 13:09:16 ahby
# Updated copyright and license.
#
# Revision 1.3 2007/12/13 12:53:34 ahby
# Changed use of span to ins and del
#
# Revision 1.2 2002/02/13 16:27:23 ahby
# Changed processing model.
# Improved handling of old text and changed styles.
#
# Revision 1.1 2000/07/12 12:20:04 ahby
# Updated to remove empty spans - this fixes validation problems under
# strict.
#
# Revision 1.11 1999/12/08 19:46:45 ahby
# Fixed validation errors introduced by placing markup where it didn't
# belong.
#
# Revision 1.10 1999/10/18 13:42:58 ahby
# Added -o to the usage message.
#
# Revision 1.9 1999/05/04 12:29:11 ahby
# Added an option to turn off the display of old text.
#
# Revision 1.8 1999/04/09 14:37:27 ahby
# Fixed a perl syntax error.
#
# Revision 1.7 1999/04/09 14:35:49 ahby
# Added reference to MACS homepage.
#
# Revision 1.6 1999/04/09 14:35:09 ahby
# Added comment about validity of generated markup.
#
# Revision 1.5 1999/02/22 22:17:54 ahby
# Changed to use stylesheets.
# Changed to rely upon span.
# Changed to work around content model problems.
#
# Revision 1.4 1999/02/08 02:32:22 ahby
# Added a copyright statement.
#
# Revision 1.3 1999/02/08 02:30:40 ahby
# Added header processing.
#
# Revision 1.2 1998/12/10 17:31:31 ahby
# Fixed to escape less-thans in change blocks and to not permit change
# markup within specific elements (like TITLE).
#
# Revision 1.1 1998/11/26 00:09:22 ahby
# Initial revision
#
#

use strict;
use warnings;

use File::Temp ();
use Getopt::Std;

# Command-line switches; Getopt::Std::getopts() fills these in from cli().
our ($opt_c, $opt_l, $opt_o, $opt_t);

# When true, splitit() passes lines starting with ";;;" through untouched.
# Cleared by the -c switch.
our $mhtmlcomments = 1;

# When true, splitit() copies everything up to and including the <body> line
# into a side file, and the new document's copy is prepended to the output.
our $stripheader = 1;

# Scratch files live in a private directory instead of predictable
# /tmp/<name>.$$ paths, which other local users could pre-create.
# CLEANUP removes the directory when the process exits.
our $tmpdir     = File::Temp::tempdir('htmldiff.XXXXXX', TMPDIR => 1, CLEANUP => 1);
our $tmp1       = "$tmpdir/htdtmp1";
our $headertmp1 = "$tmpdir/htdhtmp1";
our $tmp2       = "$tmpdir/htdtmp2";
our $headertmp2 = "$tmpdir/htdhtmp2";

# usage - print the command-line synopsis on STDERR and terminate.
#
# Exits with a non-zero status so that a calling script can tell a usage
# error from a successful run.
sub usage {
    print STDERR "htmldiff [-c] [-l] [-o] [-t] oldversion newversion [output]\n";
    exit 1;
}

# url_encode - percent-encode control and non-ASCII bytes of a string.
#
# url_encode(str)
#
# Only bytes 0x00-0x1f and 0x7f-0xff are replaced (by %xx, lower-case hex);
# spaces and URL-reserved characters are left alone. Returns the encoded
# copy. Nothing in this file calls it.
sub url_encode {
    my $str = shift;
    $str =~ s/([\x00-\x1f\x7F-\xFF])/sprintf('%%%02x', ord($1))/eg;
    return $str;
}

# escape_old_text - neutralise markup on one line of an "old" block.
#
# Removes everything from the first "<" to the last ">" on the line, then
# turns any remaining angle brackets into entities so that fragments of old
# tags cannot leak into the generated document as live markup.
sub escape_old_text {
    my ($line) = @_;
    $line =~ s/<.*>//;        # get rid of any old markup
    $line =~ s/</&lt;/g;      # escape any remaining STAG or ETAGs
    $line =~ s/>/&gt;/g;
    return $line;
}

# markit - diff-mark the streams
#
# markit(file1, file2)
#
# Runs GNU diff over the two pre-split files (see splitit) and returns the
# merged text with changes wrapped in <del>/<ins> elements.
#
# diff is asked to put a marker line around every group of changed lines:
#
#   a block wrapped in control-a lines is old (removed) text
#   a block wrapped in control-c lines is new (added) text
#   a block wrapped in control-d lines is the new half of a changed block
#
# Control-b (state 2) is recognised but none of the formats below emit it.
#
# strategy:
#
# A line starting with control A-D means the start/end of the corresponding
# ordinal state (1-4). Resting state is state 0.
#
# While in a state, accumulate the contents for that state. When exiting the
# state, emit the accumulated text with markup only if there is any; a block
# that held nothing but empty lines or markup needs no wrapper.
#
# When a line of markup shows up inside a block, the wrapper is closed before
# it and reopened after it, so the wrapper never spans an element boundary.
#
# Note that markup in an "old" block is removed. It isn't really that
# interesting, and it messes up the output something fierce.
sub markit {
    my ($file1, $file2) = @_;
    my $retval = "";

    # The \0NN escapes are expanded by Perl, so diff receives the literal
    # newline / control characters between the quotes of each %c'...'.
    my $old       = "%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'";
    my $new       = "%c'\012'%c'\003'%c'\012'%>%c'\012'%c'\003'%c'\012'";
    my $unchanged = "%=";
    my $changed   = "%c'\012'%c'\001'%c'\012'%<%c'\012'%c'\001'%c'\012'"
                  . "%c'\004'%c'\012'%>%c'\012'%c'\004'%c'\012'";
    if ($opt_o) {
        # -o: leave the old text out of the result altogether.
        $old     = "";
        $changed = "%c'\012'%c'\004'%c'\012'%>%c'\012'%c'\004'%c'\012'";
    }

    # Opening and closing tags, indexed by state.
    my @open_tag;
    $open_tag[1] = "<del class=\"diff-old\">";
    $open_tag[2] = "<del class=\"diff-old\">";
    $open_tag[3] = "<ins class=\"diff-new\">";
    $open_tag[4] = "<ins class=\"diff-chg\">";

    my @close_tag;
    $close_tag[1] = '</del>';
    $close_tag[2] = '</del>';
    $close_tag[3] = '</ins>';
    $close_tag[4] = '</ins>';

    my $diffcounter = 0;

    # List-form pipe open: the arguments go straight to diff, so neither the
    # format strings nor the file names pass through a shell.
    open(my $diff, '-|', 'diff', '-d',
        "--old-group-format=$old",
        "--new-group-format=$new",
        "--changed-group-format=$changed",
        "--unchanged-group-format=$unchanged",
        '--', $file1, $file2)
        or die("Diff failed: $!");

    my $state = 0;     # 0 = resting, 1-4 = inside the block of that kind
    my $temp  = "";    # text accumulated for the current block

    while (my $line = <$diff>) {
        my $next_counter = $diffcounter + 1;
        # With -l every change is wrapped in a link to the following change.
        my $anchor = $opt_l
            ? qq[<a tabindex="$diffcounter" id="diff-$diffcounter" href="#diff-$next_counter">]
            : "";
        my $anchor_end = $opt_l ? q[</a>] : "";

        if ($state == 0) {
            # if we are resting and we find a marker,
            # then we must be entering a block
            if ($line =~ m/^([\001-\004])/) {
                $state = ord($1);
                $line  = "";
            }
        }
        else {
            # if we are in "old" state, remove markup
            if ($state == 1 || $state == 2) {
                $line = escape_old_text($line);
            }

            if ($line =~ m/^[\001-\004]/) {
                # if we found another marker, we must be exiting the state
                if ($temp ne "") {
                    $line = $open_tag[$state] . $anchor . $temp
                          . $anchor_end . $close_tag[$state] . "\n";
                    $temp = "";
                    $diffcounter++;
                }
                else {
                    $line = "";
                }
                $state = 0;
            }
            elsif ($line =~ m/^\s*</) {
                # otherwise, is this line markup?
                if ($temp eq "") {
                    # markup before any text of the block: emit it as is
                    $retval .= $line;
                }
                else {
                    # close the wrapper before the markup, reopen it after,
                    # and keep holding the lot until the block ends
                    my $eol = ($line =~ s/\n\z//) ? "\n" : "";
                    $temp .= $anchor_end . $close_tag[$state] . $line
                           . $open_tag[$state] . $anchor . $eol;
                    $diffcounter++;
                }
                $line = "";
            }
            elsif ($line =~ m/./) {
                $temp .= $line;
                $line = "";
            }
        }

        # Strip stray markers; lines that end up empty are dropped.
        $line =~ tr/\001-\004//d;
        $retval .= $line if $line !~ m/^$/;
    }

    # diff exits with 1 when the inputs differ; only larger values are errors.
    # Best effort, as before: report the problem but return what was read.
    close($diff);
    if (($? >> 8) > 1) {
        warn "diff exited with status " . ($? >> 8) . "\n";
    }

    # Remove wrappers that enclose nothing but newlines...
    for my $kind (1 .. 4) {
        my $open  = quotemeta $open_tag[$kind];
        my $close = quotemeta $close_tag[$kind];
        $retval =~ s/$open\n+$close//g;
    }
    # ...and a wrapper left open at the very end of the text.
    for my $kind (1 .. 4) {
        my $open = quotemeta $open_tag[$kind];
        $retval =~ s/$open\n*\z//g;
    }
    return $retval;
}

# splitit - break an HTML document into diff-friendly lines
#
# splitit(filename, headertmp)
#
# Reads filename and returns its body as text in which every tag and every
# whitespace-separated word sits on a line of its own, so that the line-based
# diff run by markit works at word/tag granularity.
#
# Everything up to and including the line containing "<body" is not returned.
# When $stripheader is set it is written to headertmp instead, with the diff
# style sheet inserted in front of the line containing "</head" and, for
# -t / -l, the extra controls appended after the <body> line.
#
# In the body, HTML comments are removed, lines from one containing "<pre" up
# to (not including) the line containing "</pre>" are kept verbatim, and so
# are lines starting with ";;;" while $mhtmlcomments is set.
#
# Dies if either file cannot be opened.
sub splitit {
    my ($filename, $headertmp) = @_;
    my $preformatted = 0;     # inside a <pre> element
    my $inelement    = 0;     # count of "<" not yet balanced by ">"
    my $retval       = "";
    my $styles = q(<style type='text/css'>
.diff-old-a {
  font-size: smaller;
  color: red;
}
.diff-new a { text-decoration: none; }
.diff-new { background-color: yellow; }
.diff-chg { background-color: lime; }
.diff-chg a { text-decoration: none; }
.diff-new:before,
.diff-new:after
    { content: "\2191" }
.diff-chg:before, .diff-chg:after
    { content: "\2195" }
.diff-old { text-decoration: line-through; background-color: #FBB; }
.diff-old:before,
.diff-old:after
    { content: "\2193" }
.diff-old a { text-decoration: none; }
:focus { border: thin red solid}
</style>
<script src="https://www.w3.org/2016/10/htmldiff-nav.js"></script>);
    if ($opt_t) {
        $styles .= q(
<script type="text/javascript">
<!--
function setOldDisplay() {
    for ( var s = 0; s < document.styleSheets.length; s++ ) {
        var css = document.styleSheets[s];
        var mydata ;
        try { mydata = css.cssRules ;
        if ( ! mydata ) mydata = css.rules;
        for ( var r = 0; r < mydata.length; r++ ) {
            if ( mydata[r].selectorText == '.diff-old' ) {
                mydata[r].style.display = ( mydata[r].style.display == '' ) ? 'none'
: '';
                return;
            }
        }
        } catch(e) {} ;
    }
}
-->
</script>
);
    }

    # Stays undefined when headers are not being kept; every write below is
    # guarded on it.
    my $header_fh;
    if ($stripheader) {
        open($header_fh, '>', $headertmp)
            or die("File $headertmp cannot be opened: $!");
    }

    my $incomment = 0;    # inside an HTML comment that spans lines
    my $inhead    = 1;    # the <body> line has not been seen yet
    open(my $in, '<', $filename)
        or die("File $filename cannot be opened: $!");

    LINE:
    while (my $line = <$in>) {
        if ($inhead) {
            if ($header_fh && $line =~ m/<\/head/i) {
                print {$header_fh} $styles;
            }
            if ($line =~ m/<body/i) {
                $inhead = 0;
                if ($header_fh) {
                    print {$header_fh} $line;
                    if ($opt_t) {
                        print {$header_fh} q(
<form action=""><input type="button" onclick="setOldDisplay()" value="Show/Hide Old Content" /></form>
);
                    }
                    if ($opt_l) {
                        print {$header_fh} q(
  <p><em>NOTE: Click highlighted diff text to jump to the following difference.</em></p>
  );
                    }
                    close($header_fh)
                        or die("File $headertmp cannot be closed: $!");
                    undef $header_fh;
                }
            }
            elsif ($header_fh) {
                print {$header_fh} $line;
            }
            next LINE;
        }

        # Drop HTML comments, including ones that run over several lines.
        if ($incomment) {
            next LINE if $line !~ m/-->/;
            $incomment = 0;
            $line =~ s/.*-->//;
        }
        if ($line =~ m/<!--/) {
            1 while $line =~ s/<!--.*?-->//;    # comments closed on this line
            if ($line =~ m/<!--/) {             # one left open: drop its start
                $incomment = 1;
                $line =~ s/<!--.*//;
            }
        }

        $preformatted = 1 if $line =~ m/<pre/i;
        $preformatted = 0 if $line =~ m/<\/pre>/i;

        if ($preformatted || ($mhtmlcomments && $line =~ m/^;;;/)) {
            $retval .= $line;
            next LINE;
        }

        my @tokens = split(' ', $line);
        for my $element (@tokens) {
            # Put every tag on a line of its own...
            $element =~ s/</\n</g;
            $element =~ s/^\n//;
            $element =~ s/>/>\n/g;
            $element =~ s/\n$//;
            # ...but keep punctuation attached to the tag it follows.
            $element =~ s/>\n([.,:!]+)/>$1/g;

            $retval .= $element;

            # Tokens of a tag that contains spaces stay on one line: join
            # with a blank while a "<" is still waiting for its ">".
            $inelement += ($element =~ tr/<//) - ($element =~ tr/>//);
            $inelement = 0 if $inelement < 0;
            $retval .= $inelement == 0 ? "\n" : " ";
        }
    }
    $retval .= "\n";
    close($in);
    # If no <body> line was found the header handle is still open; it is
    # closed (and flushed) when $header_fh goes out of scope here, before any
    # caller reads the header file back.
    return $retval;
}

# write_file - replace the contents of path with content; dies on failure.
sub write_file {
    my ($path, $content) = @_;
    open(my $out, '>', $path) or die("File $path cannot be written: $!");
    print {$out} $content or die("File $path cannot be written: $!");
    close($out) or die("File $path cannot be closed: $!");
    return;
}

# read_file - return the whole contents of path; dies if it cannot be opened.
sub read_file {
    my ($path) = @_;
    open(my $in, '<', $path) or die("File $path cannot be opened: $!");
    local $/;    # slurp mode
    my $content = <$in>;
    close($in);
    return defined $content ? $content : "";
}

# diff_documents - produce the marked-up document for two local HTML files.
#
# Splits both files, runs markit over the results and, when $stripheader is
# set, puts the header saved from the new file in front of the result.
sub diff_documents {
    my ($file1, $file2) = @_;

    write_file($tmp1, splitit($file1, $headertmp1));
    write_file($tmp2, splitit($file2, $headertmp2));

    my $output = $stripheader ? read_file($headertmp2) : "";
    $output .= markit($tmp1, $tmp2);
    return $output;
}

# cli - command-line entry point.
#
# Parses the switches, diffs the two named files and writes the result to
# the optional third argument, or to standard output when it is omitted.
sub cli {
    getopts("clto") || usage();

    $mhtmlcomments = 0 if $opt_c;

    usage() if @ARGV < 2;

    my ($file1, $file2, $file3) = @ARGV;
    my $output = diff_documents($file1, $file2);

    if (defined $file3 && length $file3) {
        write_file($file3, $output);
    }
    else {
        print $output;
    }
    return;
}

# fetch_document - download url into file for cgi().
#
# Returns the response object on success. On failure an error page is sent
# to the client and nothing (undef) is returned.
sub fetch_document {
    my ($query, $ua, $url, $file) = @_;

    my $res = $ua->request(HTTP::Request->new(GET => $url), $file);
    return $res if !$res->is_error;

    # The success path answers with an NPH header, so the error page needs
    # one as well; the URL is request data and is escaped before echoing.
    my $shown = $query->escapeHTML(defined $url ? $url : "");
    print $query->header(-type => 'text/html', -nph => 1,
                         -status => $res->status_line);
    print $res->error_as_HTML();
    print "<p>The URL $shown could not be found. Please check it and try again.</p>";
    return;
}

# cgi - CGI entry point.
#
# Fetches the documents named by the "oldfile" and "newfile" request
# parameters, diffs them and prints the result as an HTML response. A <base>
# element pointing at the new document is added unless the output already
# has one, so that relative links keep working.
#
# NOTE(review): the file header documents an "mhtml" request parameter, but
# nothing is done with it here - confirm whether it should drive
# $mhtmlcomments.
sub cgi {
    # Loaded at run time so that command-line use does not need the web
    # modules to be installed.
    require CGI;
    require LWP::UserAgent;
    require HTTP::Request;

    my $query = CGI->new;
    my $url1  = $query->param("oldfile");
    my $url2  = $query->param("newfile");

    my $file1 = "$tmpdir/htdcgi1";
    my $file2 = "$tmpdir/htdcgi2";

    my $ua = LWP::UserAgent->new;
    $ua->agent("MACS, Inc. HTMLdiff/0.9 " . $ua->agent);

    my $res1 = fetch_document($query, $ua, $url1, $file1);
    my $res2 = defined $res1
        ? fetch_document($query, $ua, $url2, $file2)
        : undef;

    if (defined $res2) {
        my $output = diff_documents($file1, $file2);

        # Base URL of the new document, cut back to its directory.
        my $base = "" . $res2->base;
        if ($base !~ /\/$/) {
            $base =~ s/[^\/]*$//;
        }

        if ($output !~ /<base/i) {
            my $base_attr = $query->escapeHTML($base);
            $output =~ s/<head>/<head>\n<base href="$base_attr">/i
                or $output =~ s/<html>/<html>\n<base href="$base_attr">/i;
        }

        print $query->header(-type => 'text/html', -nph => 1);
        print $output;
    }

    # Remove the downloads on the error paths as well.
    unlink $file1, $file2;
    return;
}

if (@ARGV == 0) {
    cgi();    # if no arguments, we must be operating as a cgi script
}
else {
    cli();    # if there are arguments, then we are operating as a CLI
}

unlink $tmp1, $headertmp1, $tmp2, $headertmp2;