#! /usr/bin/perl -w

# Script to turn PCRE2 man pages into HTML
#
# The man page source is read from STDIN and the HTML is written to STDOUT.
# Leading arguments that start with "-" are options; only "-toc" is
# recognized (it requests a table of contents built from the .SH headings).
# The first remaining argument is used as the page name in the HTML title
# and top-level heading.

use strict;
use warnings;

# Conversion state, shared between the main loop and the paragraph helpers.

my $innf      = 0;    # True between .nf and .fi
my $inpara    = 0;    # True while a <P> written by new_para() is open
my $inpre     = 0;    # True while a <pre> is open inside the paragraph
my $wrotetext = 0;    # True once text has been output since end_para()
my $toc       = 0;    # True if -toc was given
my $ref       = 1;    # Number used for the next TOCn/SECn anchor pair

# The page body is accumulated in memory while the table of contents (if
# any) is written straight to STDOUT, so that the TOC precedes the body.

my $body = '';
my $body_fh;


# Subroutine to handle font changes and other escapes
#
# Argument: one line (or fragment) of man page text
# Returns:  the text with < and > turned into HTML entities, \fI..\fR and
#           \fB..\fR (or ..\fP) turned into <i>/<b> elements, \e turned into
#           a backslash, and "(c)" after "Copyright " turned into &copy;
#
# The < and > escaping must come first so that the tags inserted by the
# font substitutions are not themselves escaped.

sub do_line {
    my ($s) = @_;

    $s =~ s/</&lt;/g;                          # Deal with < and >
    $s =~ s/>/&gt;/g;
    $s =~ s{\\fI(.*?)\\f[RP]}{<i>$1</i>}g;
    $s =~ s{\\fB(.*?)\\f[RP]}{<b>$1</b>}g;
    $s =~ s{\\e}{\\}g;
    $s =~ s/(?<=Copyright )\(c\)/&copy;/g;
    return $s;
}

# Subroutine to ensure not in a paragraph
#
# Closes any open literal section and paragraph in the body output, and
# resets the paragraph, literal and "text written" flags.

sub end_para {
    if ($inpara) {
        print {$body_fh} "</PRE>\n" if $inpre;
        print {$body_fh} "</P>\n";
    }
    $inpara = $inpre = 0;
    $wrotetext = 0;
    return;
}

# Subroutine to start a new paragraph
#
# Ends the current paragraph (if any) and opens a new one in the body output.

sub new_para {
    end_para();
    print {$body_fh} "<P>\n";
    $inpara = 1;
    return;
}


# Main program

sub main {

    # Options. This code is inside a sub, so the shift must name @ARGV
    # explicitly (a bare shift here would operate on @_).

    while (@ARGV && $ARGV[0] =~ /^-/) {
        $toc = 1 if $ARGV[0] eq "-toc";
        shift @ARGV;
    }

    my $page = defined $ARGV[0] ? $ARGV[0] : '';

    # Initial output to STDOUT

    print <<"END_HEADER";
<html>
<head>
<title>$page specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>$page man page</h1>
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>
<p>
This page is part of the PCRE2 HTML documentation. It was generated
automatically from the original man page. If there is any nonsense in it,
please consult the man page, in case the conversion went wrong.
<br>
END_HEADER

    print "<ul>\n" if $toc;

    open($body_fh, '>', \$body)
        or die "Can't open in-memory buffer for output: $!\n";

    LINE:
    while (<STDIN>) {

        # Handle lines beginning with a dot

        if (/^\./) {

            # Some of the PCRE2 man pages used to contain instances of .br.
            # However, they should have all been removed because they cause
            # trouble in some (other) automated systems that translate man
            # pages to HTML. Complain if we find .br or .in (another macro
            # that is deprecated).

            if (/^\.br/ || /^\.in/) {
                print STDERR "\n*** Deprecated macro encountered - rewrite needed\n";
                print STDERR "*** $_\n";
                die "*** Processing abandoned\n";
            }

            # Instead of .br, relevant "literal" sections are enclosed in
            # .nf/.fi.

            elsif (/^\.nf/) {
                $innf = 1;
            }

            elsif (/^\.fi/) {
                $innf = 0;
            }

            # Handling .sp is subtle. If it is inside a literal section, do
            # nothing if the next line is a non literal text line; similarly,
            # if not inside a literal section, do nothing if a literal
            # follows, unless we are inside a .nf/.fi section or about to
            # enter one. The point being that the <pre> and </pre> that
            # delimit literal sections will do the spacing. Always skip if no
            # previous output.

            elsif (/^\.sp/) {
                if ($wrotetext) {
                    $_ = <STDIN>;

                    # If .sp was the last line of input, carry on as though
                    # an empty line followed it.
                    $_ = '' unless defined $_;

                    if ($inpre) {
                        print {$body_fh} "\n" if (/^[\s.]/);
                    }
                    else {
                        print {$body_fh} "<br>\n<br>\n"
                            if ($innf || /^\.nf/ || !/^[\s.]/);
                    }
                    redo LINE;    # Now process the lookahead line we just read
                }
            }

            elsif (/^\.TP/ || /^\.PP/ || /^\.P/) {
                new_para();
            }

            elsif (/^\.SH\s*("?)(.*)\1/) {
                my $heading = $2;

                # Ignore the NAME section: skip the heading and the single
                # line that follows it.
                if ($heading =~ /^NAME\b/) {
                    my $name_line = <STDIN>;
                    next LINE;
                }

                end_para();
                my $title = do_line($heading);
                if ($toc) {

                    # The title is passed as a %s argument rather than being
                    # interpolated into the format, so that a "%" in a
                    # heading is output literally. Every section heading
                    # links back to the first TOC entry.
                    printf "<li><a name=\"TOC%d\" href=\"#SEC%d\">%s</a>\n",
                        $ref, $ref, $title;
                    printf {$body_fh}
                        "<br><a name=\"SEC%d\" href=\"#TOC1\">%s</a><br>\n",
                        $ref, $title;
                    $ref++;
                }
                else {
                    print {$body_fh} "<br><b>\n$title\n</b><br>\n";
                }
            }

            elsif (/^\.SS\s*("?)(.*)\1/) {
                end_para();
                my $title = do_line($2);
                print {$body_fh} "<br><b>\n$title\n</b><br>\n";
            }

            elsif (/^\.B\s*(.*)/) {
                new_para() if (!$inpara);
                $_ = do_line($1);
                s/"(.*?)"/$1/g;    # Remove quotes around macro arguments
                print {$body_fh} "<b>$_</b>\n";
                $wrotetext = 1;
            }

            elsif (/^\.I\s*(.*)/) {
                new_para() if (!$inpara);
                $_ = do_line($1);
                s/"(.*?)"/$1/g;    # Remove quotes around macro arguments
                print {$body_fh} "<i>$_</i>\n";
                $wrotetext = 1;
            }

            # A comment that starts "HREF" takes the next line as a name that
            # is turned into a hyperlink, using the text given, which might
            # be in a special font. If it ends in () or (digits) or
            # punctuation, they aren't part of the link.

            elsif (/^\.\\"\s*HREF/) {
                $_ = <STDIN>;
                $_ = '' unless defined $_;
                chomp;
                $_ = do_line($_);
                $_ =~ s/\s+$//;
                if (/^(?:<.>)?([^<(]+)(?:\(\))?(?:<\/.>)?(?:\(\d+\))?[.,;:]?$/) {
                    print {$body_fh} "<a href=\"$1.html\">$_</a>\n";
                }
                else {

                    # No usable name: output the text without a link instead
                    # of a link to ".html".
                    print STDERR "*** Cannot derive a link target from HREF text \"$_\"\n";
                    print {$body_fh} "$_\n";
                }
            }

            # A comment that starts "HTML" inserts literal HTML

            elsif (/^\.\\"\s*HTML\s*(.*)/) {
                print {$body_fh} $1;
            }

            # A comment that starts < inserts that HTML at the end of the
            # *next* input line - so as not to get a newline between them.

            elsif (/^\.\\"\s*(<.*>)/) {
                my $markup = $1;
                $_ = <STDIN>;
                $_ = '' unless defined $_;
                chomp;
                $_ = do_line($_);
                $_ =~ s/\s+$//;
                print {$body_fh} "$_$markup\n";
            }

            # A comment that starts JOIN joins the next two lines together,
            # with one space between them. Then that line is processed. This
            # is used in some displays where two lines are needed for the
            # "man" version. JOINSH works the same, except that it assumes
            # this is a shell command, so removes continuation backslashes.

            elsif (/^\.\\"\s*JOIN(SH)?/) {
                my $is_shell = defined $1;
                my $one = <STDIN>;
                my $two = <STDIN>;
                $one = '' unless defined $one;
                $two = '' unless defined $two;
                $one =~ s/\s*\\e\s*$// if $is_shell;
                chomp($one);
                $two =~ s/^\s+//;
                $_ = "$one $two";
                redo LINE;    # Process the joined lines
            }

            # .EX/.EE are used in the pcre2demo page to bracket the entire
            # program, which is unmodified except for turning backslash into
            # "\e".

            elsif (/^\.EX\s*$/) {
                print {$body_fh} "<PRE>\n";
                while (<STDIN>) {
                    last if /^\.EE\s*$/;
                    s/\\e/\\/g;
                    s/&/&amp;/g;    # Must precede the < and > escapes
                    s/</&lt;/g;
                    s/>/&gt;/g;
                    print {$body_fh} $_;
                }
                print {$body_fh} "</PRE>\n";
            }

            # Ignore anything not recognized

            next LINE;
        }

        # Line does not begin with a dot. Replace blank lines with new
        # paragraphs

        if (/^\s*$/) {
            end_para() if ($wrotetext);
            next LINE;
        }

        # Convert fonts changes and output an ordinary line. Ensure that
        # indented lines are marked as literal.

        $_ = do_line($_);
        new_para() if (!$inpara);

        if (/^\s/) {
            if (!$inpre) {
                print {$body_fh} "<pre>\n";
                $inpre = 1;
            }
        }
        elsif ($inpre) {
            print {$body_fh} "</pre>\n";
            $inpre = 0;
        }

        # Add <br> to the end of a non-literal line if we are within .nf/.fi

        $_ .= "<br>\n" if (!$inpre && $innf);

        print {$body_fh} $_;
        $wrotetext = 1;
    }

    # The TOC, if present, will have been written - terminate it

    print "</ul>\n" if ($toc);

    # Copy the remainder to the standard output

    close($body_fh) or die "Can't close in-memory buffer: $!\n";
    print $body;

    print <<"END_FOOTER";
<p>
Return to the <a href="index.html">PCRE2 index page</a>.
</p>
END_FOOTER

    return;
}

# Run the conversion only when executed as a script, so that the subroutines
# above can be loaded (for example by a test) without reading STDIN.

main() unless caller;

1;

# End