// Copyright 2005 Google Inc.
// All Rights Reserved.
//
// msamuel@google.com

// Usage:
// 1) include this source file in an html page via
//    <script type=text/javascript src=prettify.js></script>
// 2) define style rules.  See the example page for examples.
// 3) mark the <pre> and <code> tags in your source with class=prettyprint.
//    You can also use the (html deprecated) <xmp> tag, but the pretty printer
//    needs to do more substantial DOM manipulations to support that, so some
//    css styles may not be preserved.

// Change log:
// cbeust, 2006/08/22
//   Java annotations (start with "@") are now captured as literals ("lit")
//

/**
 * The set of words treated as keywords, as an object mapping each keyword
 * to true.  The keyword lists of all target languages are merged into this
 * one set.
 */
var PR_keywords = {};

/** initialize the keyword list for our target languages. */
(function () {
  // NOTE: the original list contained the fragment
  // "using declaration, using directive", which registered the non-keywords
  // "declaration," and "directive".  Only "using" is kept.
  var CPP_KEYWORDS = (
    "bool break case catch char class const const_cast continue default " +
    "delete deprecated dllexport dllimport do double dynamic_cast else enum " +
    "explicit extern false float for friend goto if inline int long mutable " +
    "naked namespace new noinline noreturn nothrow novtable operator private " +
    "property protected public register reinterpret_cast return selectany " +
    "short signed sizeof static static_cast struct switch template this " +
    "thread throw true try typedef typeid typename union unsigned using " +
    "uuid virtual void volatile while typeof");
  var JAVA_KEYWORDS = (
    "abstract default goto package synchronized boolean do if private this " +
    "break double implements protected throw byte else import public throws " +
    "case enum instanceof return transient catch extends int short try char " +
    "final interface static void class finally long strictfp volatile const " +
    "float native super while continue for new switch");
  var PYTHON_KEYWORDS = (
    "and assert break class continue def del elif else except exec finally " +
    "for from global if import in is lambda not or pass print raise return " +
    "try while yield");
  var JSCRIPT_KEYWORDS = (
    "abstract boolean break byte case catch char class const continue " +
    "debugger default delete do double else enum export extends false final " +
    "finally float for function goto if implements import in instanceof int " +
    "interface long native new null package private protected public return " +
    "short static super switch synchronized this throw throws transient " +
    "true try typeof var void volatile while with NaN Infinity");
  var PERL_KEYWORDS = (
    "foreach require sub unless until use elsif BEGIN END");
  var SH_KEYWORDS = (
    "if then do else fi end");
  var KEYWORDS = [CPP_KEYWORDS, JAVA_KEYWORDS, PYTHON_KEYWORDS,
                  JSCRIPT_KEYWORDS, PERL_KEYWORDS, SH_KEYWORDS];
  for (var k = 0; k < KEYWORDS.length; k++) {
    var kw = KEYWORDS[k].split(' ');
    for (var i = 0; i < kw.length; i++) {
      // skip the empty strings that split() yields for repeated spaces
      if (kw[i]) { PR_keywords[kw[i]] = true; }
    }
  }
}).call(this);

// token style names.  correspond to css classes
/** token style for a string literal */
var PR_STRING = 'str';
/** token style for a keyword */
var PR_KEYWORD = 'kwd';
/** token style for a comment */
var PR_COMMENT = 'com';
/** token style for a type */
var PR_TYPE = 'typ';
/** token style for a literal value.  e.g. 1, null, true. */
var PR_LITERAL = 'lit';
/** token style for a punctuation string. */
var PR_PUNCTUATION = 'pun';
/** token style for plain text. */
var PR_PLAIN = 'pln';

/** token style for an sgml tag. */
var PR_TAG = 'tag';
/** token style for a markup declaration such as a DOCTYPE. */
var PR_DECLARATION = 'dec';
/** token style for embedded source. */
var PR_SOURCE = 'src';
/** token style for an sgml attribute name. */
var PR_ATTRIB_NAME = 'atn';
/** token style for an sgml attribute value. */
var PR_ATTRIB_VALUE = 'atv';

/** the position of the end of a token during. */
/**
 * The position of the end of a token.  A division of a string into
 * n tokens can be represented as a series n - 1 token ends, as long as
 * runs of whitespace warrant their own token.
 * @constructor
 * @param end the numeric position of the end of the token.
 * @param style the style of the token that ends at that position.  May be
 *   null, but must not be undefined.
 * @throws Error if style is undefined or end is not a number.
 * @private
 */
function PR_TokenEnd(end, style) {
  if (undefined === style) { throw new Error('BAD'); }
  if ('number' != typeof(end)) { throw new Error('BAD'); }
  this.end = end;
  this.style = style;
}

/** debugging representation: the end position, plus the style when truthy. */
PR_TokenEnd.prototype.toString = function () {
  return '[PR_TokenEnd ' + this.end +
    (this.style ? ':' + this.style : '') + ']';
};


/** a chunk of text with a style.  These are used to represent both the output
 * from the lexing functions as well as intermediate results.
 * @constructor
 * @param token the token text
 * @param style one of the token styles defined in designdoc-template, or null
 *   for a styleless token, such as an embedded html tag.
 * @throws Error if style is undefined.
 * @private
 */
function PR_Token(token, style) {
  if (undefined === style) { throw new Error('BAD'); }
  this.token = token;
  this.style = style;
}

/** debugging representation: the token text, plus the style when truthy. */
PR_Token.prototype.toString = function () {
  return '[PR_Token ' + this.token + (this.style ? ':' + this.style : '') + ']';
};


/** a helper class that decodes common html entities used to escape source and
 * markup punctuation characters in html.
 */
/** a helper class that decodes common html entities used to escape source and
 * markup punctuation characters in html.
 * After a call to decode, {@code ch} holds the decoded character and
 * {@code next} the index of the first character after it.
 * @constructor
 * @private
 */
function PR_DecodeHelper() {
  this.next = 0;
  this.ch = '\0';
}

/**
 * Decodes the character or entity starting at index i of s.
 * Recognizes &lt; &gt; &quot; &apos; and &amp; (case-insensitively).  Any
 * other '&' is returned as a literal '&' that consumes one character.
 * @param s the string to read from.
 * @param i the index in s at which to start decoding.
 * @return the decoded character, which is also stored in this.ch.
 */
PR_DecodeHelper.prototype.decode = function (s, i) {
  var next = i + 1;
  var ch = s.charAt(i);
  if ('&' == ch) {
    var semi = s.indexOf(';', next);
    // The longest names handled below (quot, apos) are 4 characters, so the
    // semicolon may be up to 4 characters past the '&'.  The original
    // strict '<' comparison made those two branches unreachable.
    if (semi >= 0 && semi <= next + 4) {
      var entityName = s.substring(next, semi).toLowerCase();
      next = semi + 1;
      if ('lt' == entityName) {
        ch = '<';
      } else if ('gt' == entityName) {
        ch = '>';
      } else if ('quot' == entityName) {
        ch = '"';
      } else if ('apos' == entityName) {
        ch = '\'';
      } else if ('amp' == entityName) {
        ch = '&';
      } else {
        // unknown entity: treat the '&' as a single literal character
        next = i + 1;
      }
    }
  }
  this.next = next;
  this.ch = ch;
  return this.ch;
};


// some string utilities

/** true iff ch is an ASCII letter. */
function PR_isWordChar(ch) {
  return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z');
}

/** true iff ch is an ASCII letter or one of _ $ @. */
function PR_isIdentifierStart(ch) {
  return PR_isWordChar(ch) || ch == '_' || ch == '$' || ch == '@';
}

/** true iff ch is an identifier start character or an ASCII digit. */
function PR_isIdentifierPart(ch) {
  return PR_isIdentifierStart(ch) || PR_isDigitChar(ch);
}

/** true iff ch is a tab, space, carriage return or newline. */
function PR_isSpaceChar(ch) {
  // indexOf('') is 0, so the empty string (what charAt returns past the end
  // of a string) must be excluded explicitly.
  return ch !== '' && "\t \r\n".indexOf(ch) >= 0;
}

/** true iff ch is an ASCII digit. */
function PR_isDigitChar(ch) {
  return ch >= '0' && ch <= '9';
}

/** returns s without leading and trailing tabs, spaces, CRs and newlines. */
function PR_trim(s) {
  var i = 0, j = s.length - 1;
  while (i <= j && PR_isSpaceChar(s.charAt(i))) { ++i; }
  while (j > i && PR_isSpaceChar(s.charAt(j))) { --j; }
  return s.substring(i, j + 1);
}

/** true iff s begins with prefix. */
function PR_startsWith(s, prefix) {
  return s.length >= prefix.length && prefix == s.substring(0, prefix.length);
}

/** true iff s ends with suffix. */
function PR_endsWith(s, suffix) {
  return s.length >= suffix.length &&
    suffix == s.substring(s.length - suffix.length, s.length);
}

/** true iff prefix matches the first prefix characters in chars[0:len]. */
/** true iff prefix matches the first prefix characters in chars[0:len].
 * @param chars an array of single characters.
 * @param len the number of valid entries at the start of chars.
 * @param prefix the string to compare against.
 * @private
 */
function PR_prefixMatch(chars, len, prefix) {
  if (len < prefix.length) { return false; }
  for (var i = 0, n = prefix.length; i < n; ++i) {
    if (prefix.charAt(i) != chars[i]) { return false; }
  }
  return true;
}

/** used to convert html special characters embedded in XMP tags into html.
 * Escapes &, < and > as &amp;, &lt; and &gt;.
 */
function PR_textToHtml(str) {
  // '&' must be escaped first so that the entities produced for '<' and '>'
  // are not escaped a second time.
  return str.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
}


/** split markup into chunks of html tags (style null) and
 * plain text (style {@link #PR_PLAIN}).
 *
 * A tag is '<', an optional '/', a letter, and then everything up to and
 * including the next '>'.
 *
 * @param s a String of html.
 * @return an Array of PR_Tokens of style PR_PLAIN and null.
 * @private
 */
function PR_chunkify(s) {
  var chunks = new Array();
  // 0: in text, 1: seen '<', 2: seen '</', 3: inside a tag
  var state = 0;
  var start = 0;   // start of the text not yet emitted
  var pos = -1;    // index of the '<' of the tag being considered
  for (var i = 0, n = s.length; i < n; ++i) {
    var ch = s.charAt(i);
    switch (state) {
      case 0:
        if ('<' == ch) { state = 1; }
        break;
      case 1:
        pos = i - 1;
        if ('/' == ch) { state = 2; }
        else if (PR_isWordChar(ch)) { state = 3; }
        else if ('<' == ch) { state = 1; }
        else { state = 0; }
        break;
      case 2:
        if (PR_isWordChar(ch)) { state = 3; }
        else if ('<' == ch) { state = 1; }
        else { state = 0; }
        break;
      case 3:
        if ('>' == ch) {
          if (pos > start) {
            chunks.push(new PR_Token(s.substring(start, pos), PR_PLAIN));
          }
          chunks.push(new PR_Token(s.substring(pos, i + 1), null));
          start = i + 1;
          pos = -1;
          state = 0;
        }
        break;
    }
  }
  // whatever follows the last complete tag is plain text
  if (s.length > start) {
    chunks.push(new PR_Token(s.substring(start, s.length), PR_PLAIN));
  }
  return chunks;
}

/** splits chunks around entities. */
/** splits chunks around entities.
 * Each entity (an '&', then '#' or a letter, up to the next ';') found in a
 * PR_PLAIN chunk becomes a token of its own with a null style.
 * @param chunks an Array of PR_Tokens.
 * @return an Array of PR_Tokens.  Chunks that are not PR_PLAIN, and plain
 *   chunks in which nothing was split, are passed through as is.
 * @private
 */
function PR_splitEntities(chunks) {
  var chunksOut = new Array();
  for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
    var chunk = chunks[ci];
    if (PR_PLAIN != chunk.style) {
      chunksOut.push(chunk);
      continue;
    }
    var s = chunk.token;
    var pos = 0;     // end of the text already emitted from this chunk
    var start = 0;   // index of the '&' of the entity being scanned
    // 0: in text, 1: seen '&', 2: inside an entity name.
    // Reset per chunk: start indexes into this chunk only, so an entity
    // cannot continue from a previous chunk.
    var state = 0;
    for (var i = 0; i < s.length; ++i) {
      var ch = s.charAt(i);
      switch (state) {
        case 0:
          if ('&' == ch) { state = 1; }
          break;
        case 1:
          if ('#' == ch || PR_isWordChar(ch)) {
            start = i - 1;
            state = 2;
          } else if ('&' != ch) {
            // a second '&' may itself start an entity, so stay in state 1
            state = 0;
          }
          break;
        case 2:
          if (';' == ch) {
            if (start > pos) {
              chunksOut.push(
                  new PR_Token(s.substring(pos, start), chunk.style));
            }
            chunksOut.push(new PR_Token(s.substring(start, i + 1), null));
            pos = i + 1;
            state = 0;
          }
          break;
      }
    }
    if (s.length > pos) {
      chunksOut.push(pos ?
                     new PR_Token(s.substring(pos, s.length), chunk.style) :
                     chunk);
    }
  }
  return chunksOut;
}

/** walk the tokenEnds list and the chunk list in parallel to generate a list
 * of split tokens.
 * @param chunks an Array of PR_Tokens whose text, concatenated, is the text
 *   that the positions in tokenEnds refer to.
 * @param tokenEnds an Array of PR_TokenEnds in increasing order of position.
 * @return an Array of PR_Tokens.  Text from a chunk with a null style keeps
 *   the null style when the rest of that chunk is emitted whole; other text
 *   takes the style of the token end that covers it.
 * @private
 */
function PR_splitChunks(chunks, tokenEnds) {
  var tokens = new Array();  // the output

  var ci = 0;  // index into chunks
  // position of beginning of amount written so far in absolute space.
  var posAbs = 0;
  // position of amount written so far in chunk space
  var posChunk = 0;

  // current chunk
  var chunk = new PR_Token('', null);

  for (var ei = 0, ne = tokenEnds.length; ei < ne; ++ei) {
    var tokenEnd = tokenEnds[ei];
    var end = tokenEnd.end;

    var tokLen = end - posAbs;
    var remainingInChunk = chunk.token.length - posChunk;
    while (remainingInChunk <= tokLen) {
      if (remainingInChunk > 0) {
        tokens.push(
            new PR_Token(chunk.token.substring(posChunk, chunk.token.length),
                         null == chunk.style ? null : tokenEnd.style));
      }
      posAbs += remainingInChunk;
      posChunk = 0;
      tokLen = end - posAbs;

      if (ci >= chunks.length) {
        // Out of chunks.  Switch to an empty chunk and stop.  The original
        // kept the last chunk here, which looped forever when that chunk
        // was empty (e.g. for empty input).
        chunk = new PR_Token('', null);
        remainingInChunk = 0;
        break;
      }
      chunk = chunks[ci++];
      remainingInChunk = chunk.token.length;
    }

    // the token ends inside the current chunk: emit the part that belongs
    // to it and stay in the chunk.
    if (tokLen > 0 && remainingInChunk > 0) {
      tokens.push(
          new PR_Token(chunk.token.substring(posChunk, posChunk + tokLen),
                       tokenEnd.style));
      posAbs += tokLen;
      posChunk += tokLen;
    }
  }

  return tokens;
}

/** splits markup tokens into declarations, tags, and source chunks.
 * @param chunks an Array of PR_Tokens.  Only PR_PLAIN chunks are scanned;
 *   the others only advance the absolute position.
 * @return an Array of PR_TokenEnds in absolute space.
 * @private
 */
function PR_splitMarkup(chunks) {
  // A state machine to split out declarations, tags, etc.
  // This state machine deals with absolute space in the text, indexed by k,
  // and position in the current chunk, indexed by i, to generate a list of
  // the ends of tokens.
  // Absolute space is calculated by considering the chunks as appended into
  // one big string, as they were before being split.

  // Known failure cases
  // Server side scripting sections such as <?...?> in attributes.
  // i.e. <span class="<? foo ?>">
  // Handling this would require a stack, and we don't use PHP.

  // The output: a list of pairs of PR_TokenEnd instances
  var tokenEnds = new Array();

  // FSM state variable
  //  0: text           1: seen '<'          2: seen '<!'
  //  3: declaration    4,5,6: comment       7: seen '</'
  //  8: tag            9,10: <? ... ?>      11,12: <% ... %>
  var state = 0;
  var k = 0;  // position in absolute space of the start of the current chunk
  var tokenStart = -1;  // the start of the current token

  // Try to find a closing tag for any open <style> or <script> tags
  // We can't do this at a later stage because then the following case
  // would fail:
  // <script>document.writeln('<!--');</script>

  // We use tokenChars[:tokenCharsI] to accumulate the tag name so that we
  // can check whether to enter into a no scripting section when the tag ends.
  var tokenChars = new Array(12);
  var tokenCharsI = 0;
  // if non null, the tag prefix that we need to see to break out.
  var endScriptTag = null;
  var decodeHelper = new PR_DecodeHelper();

  for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
    var chunk = chunks[ci];
    if (PR_PLAIN != chunk.style) {
      k += chunk.token.length;
      continue;
    }

    var s = chunk.token;

    for (var i = 0, n = s.length; i < n; /* i = next at bottom */) {
      decodeHelper.decode(s, i);
      var ch = decodeHelper.ch;
      var next = decodeHelper.next;

      // set to a style when a token ends at this character
      var tokenStyle = null;
      switch (state) {
        case 0:
          if ('<' == ch) { state = 1; }
          break;
        case 1:
          tokenCharsI = 0;
          if ('/' == ch) {  // only consider close tags if we're in script/style
            state = 7;
          } else if (null == endScriptTag) {
            if ('!' == ch) {
              state = 2;
            } else if (PR_isWordChar(ch)) {
              state = 8;
            } else if ('?' == ch) {
              state = 9;
            } else if ('%' == ch) {
              state = 11;
            } else if ('<' != ch) {
              state = 0;
            }
          } else if ('<' != ch) {
            state = 0;
          }
          break;
        case 2:
          if ('-' == ch) {
            state = 4;
          } else if (PR_isWordChar(ch)) {
            state = 3;
          } else if ('<' == ch) {
            state = 1;
          } else {
            state = 0;
          }
          break;
        case 3:
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_DECLARATION;
          }
          break;
        case 4:
          if ('-' == ch) { state = 5; }
          break;
        case 5:
          if ('-' == ch) { state = 6; }
          break;
        case 6:
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_COMMENT;
          } else if ('-' == ch) {
            state = 6;
          } else {
            state = 4;
          }
          break;
        case 7:
          if (PR_isWordChar(ch)) {
            state = 8;
          } else if ('<' == ch) {
            state = 1;
          } else {
            state = 0;
          }
          break;
        case 8:
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_TAG;
          }
          break;
        case 9:
          if ('?' == ch) { state = 10; }
          break;
        case 10:
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_SOURCE;
          } else if ('?' != ch) {
            state = 9;
          }
          break;
        case 11:
          if ('%' == ch) { state = 12; }
          break;
        case 12:
          if ('>' == ch) {
            state = 0;
            tokenStyle = PR_SOURCE;
          } else if ('%' != ch) {
            state = 11;
          }
          break;
      }

      if (tokenCharsI < tokenChars.length) {
        tokenChars[tokenCharsI++] = ch.toLowerCase();
      }
      if (1 == state) { tokenStart = k + i; }
      i = next;
      if (null != tokenStyle) {
        if (endScriptTag) {
          if (PR_prefixMatch(tokenChars, tokenCharsI, endScriptTag)) {
            endScriptTag = null;
          }
        } else {
          if (PR_prefixMatch(tokenChars, tokenCharsI, 'script')) {
            endScriptTag = '/script';
          } else if (PR_prefixMatch(tokenChars, tokenCharsI, 'style')) {
            endScriptTag = '/style';
          } else if (PR_prefixMatch(tokenChars, tokenCharsI, 'xmp')) {
            endScriptTag = '/xmp';
          }
        }
        // disallow the tag if endScriptTag is set and this was not an open
        // tag.
        if (endScriptTag && tokenCharsI && '/' == tokenChars[0]) {
          tokenStyle = null;
        }
        if (null != tokenStyle) {
          // the text before the token is plain, then comes the token itself
          tokenEnds.push(new PR_TokenEnd(tokenStart, PR_PLAIN));
          tokenEnds.push(new PR_TokenEnd(k + next, tokenStyle));
        }
      }
    }
    k += chunk.token.length;
  }
  tokenEnds.push(new PR_TokenEnd(k, PR_PLAIN));

  return tokenEnds;
}

/** splits the given string into comment, string, and "other" tokens.
 * @return an array of PR_Tokens with style in
 * (PR_STRING, PR_COMMENT, PR_PLAIN, null)
 * The result array may contain spurious zero length tokens. Ignore them.
 */
/** splits the given string into comment, string, and "other" tokens.
 *
 * Strings are delimited by ", ' or `, with \ as the escape character.
 * Comments are // and # to the end of the line, and C style block comments.
 * A string or comment that is still open at the end of the input is closed
 * there and keeps its string or comment style.
 *
 * @param chunks an Array of PR_Tokens.  Only PR_PLAIN chunks are scanned.
 * @return an array of PR_Tokens with style in
 *   (PR_STRING, PR_COMMENT, PR_PLAIN, null)
 *   The result array may contain spurious zero length tokens.  Ignore them.
 *
 * @private
 */
function PR_splitStringAndCommentTokens(chunks) {
  // a state machine to split out comments, strings, and other stuff
  var tokenEnds = new Array();  // positions of ends of tokens in absolute space
  // FSM state variable
  //  0: other             1: in string       2: after \ in a string
  //  3: seen '/'          4: line comment    5: block comment
  //  6: seen '*' inside a block comment
  var state = 0;
  var delim = -1;  // string delimiter
  var k = 0;  // absolute position of beginning of current chunk
  for (var ci = 0, nc = chunks.length; ci < nc; ++ci) {
    var chunk = chunks[ci];
    var s = chunk.token;
    if (PR_PLAIN == chunk.style) {
      for (var i = 0, n = s.length; i < n; ++i) {
        var ch = s.charAt(i);
        switch (state) {
          case 0:
            if (ch == '"' || ch == '\'' || ch == '`') {
              tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
              state = 1;
              delim = ch;
            } else if (ch == '/') {
              state = 3;
            } else if (ch == '#') {
              tokenEnds.push(new PR_TokenEnd(k + i, PR_PLAIN));
              state = 4;
            }
            break;
          case 1:
            if (ch == delim) {
              state = 0;
              tokenEnds.push(new PR_TokenEnd(k + i + 1, PR_STRING));
            } else if (ch == '\\') {
              state = 2;
            }
            break;
          case 2:
            // the escaped character is part of the string whatever it is
            state = 1;
            break;
          case 3:
            if (ch == '/') {
              state = 4;
              tokenEnds.push(new PR_TokenEnd(k + i - 1, PR_PLAIN));
            } else if (ch == '*') {
              state = 5;
              tokenEnds.push(new PR_TokenEnd(k + i - 1, PR_PLAIN));
            } else {
              state = 0;
              // next loop will reenter state 0 with the same value of i, so
              // ch will be reconsidered as start of new token.
              --i;
            }
            break;
          case 4:
            if (ch == '\r' || ch == '\n') {
              state = 0;
              tokenEnds.push(new PR_TokenEnd(k + i, PR_COMMENT));
            }
            break;
          case 5:
            if (ch == '*') {
              state = 6;
            }
            break;
          case 6:
            if (ch == '/') {
              state = 0;
              tokenEnds.push(new PR_TokenEnd(k + i + 1, PR_COMMENT));
            } else if (ch != '*') {
              state = 5;
            }
            break;
        }
      }
    }
    k += s.length;
  }

  // a token ends at the end.  Its style depends on what was still open.
  var endStyle;
  switch (state) {
    case 1: case 2:
      endStyle = PR_STRING;
      break;
    case 4: case 5: case 6:
      endStyle = PR_COMMENT;
      break;
    default:
      endStyle = PR_PLAIN;
      break;
  }
  tokenEnds.push(new PR_TokenEnd(k, endStyle));

  return PR_splitChunks(chunks, tokenEnds);
}

/** used by lexSource to split a non string, non comment token.
 * Splits s into runs of whitespace, identifiers, number literals and
 * punctuation, and appends one styled PR_Token per run to outlist.
 * @param s the text to split.
 * @param outlist the Array that the tokens are appended to.
 * @private
 */
function PR_splitNonStringNonCommentToken(s, outlist) {
  var pos = 0;
  var state = 0;
  for (var i = 0; i <= s.length; i++) {
    var ch = s.charAt(i);
    // the next state.
    // if set to -1 then it will cause a reentry to state 0 without consuming
    // another character.
    var nstate = state;

    if (i == s.length) {
      // nstate will not be equal to state, so it will append the token
      nstate = -2;
    } else {
      switch (state) {
        case 0:  // whitespace state
          if (PR_isIdentifierStart(ch)) {
            nstate = 1;
          } else if (PR_isDigitChar(ch)) {
            nstate = 2;
          } else if (!PR_isSpaceChar(ch)) {
            nstate = 3;
          }
          if (nstate && pos < i) {
            var t = s.substring(pos, i);
            outlist.push(new PR_Token(t, PR_PLAIN));
            pos = i;
          }
          break;
        case 1:  // identifier state
          if (!PR_isIdentifierPart(ch)) {
            nstate = -1;
          }
          break;
        case 2:  // number literal state
          // handle numeric literals like
          // 0x7f 300UL 100_000

          // this does not treat floating point values as a single literal
          //   0.1 and 3e-6
          // are each split into multiple tokens
          if (!(PR_isDigitChar(ch) || PR_isWordChar(ch) || ch == '_')) {
            nstate = -1;
          }
          break;
        case 3:  // punctuation state
          if (PR_isIdentifierStart(ch) || PR_isDigitChar(ch) ||
              PR_isSpaceChar(ch)) {
            nstate = -1;
          }
          break;
      }
    }

    if (nstate != state) {
      if (nstate < 0) {
        if (i > pos) {
          var t = s.substring(pos, i);
          var ch0 = t.charAt(0);
          var style;
          if (PR_isIdentifierStart(ch0)) {
            // Own properties only: a plain lookup would also match members
            // inherited from Object.prototype such as "toString".
            if (Object.prototype.hasOwnProperty.call(PR_keywords, t)) {
              style = PR_KEYWORD;
            }
            else if (ch0 == '@') {
              style = PR_LITERAL;
            } else {
              // Treat any word that starts with an uppercase character and
              // contains at least one lowercase character as a type, or
              // ends with _t.
              // This works perfectly for Java, pretty well for C++, and
              // passably for Python.  The _t catches C structs.
              var isType = false;
              if (ch0 >= 'A' && ch0 <= 'Z') {
                for (var j = 1; j < t.length; j++) {
                  var ch1 = t.charAt(j);
                  if (ch1 >= 'a' && ch1 <= 'z') {
                    isType = true;
                    break;
                  }
                }
              }
              // checked for every identifier, not only capitalized ones, so
              // that names such as size_t are caught
              if (!isType && t.length >= 2 &&
                  t.substring(t.length - 2) == '_t') {
                isType = true;
              }
              style = isType ? PR_TYPE : PR_PLAIN;
            }
          } else if (PR_isDigitChar(ch0)) {
            style = PR_LITERAL;
          } else if (!PR_isSpaceChar(ch0)) {
            style = PR_PUNCTUATION;
          } else {
            style = PR_PLAIN;
          }
          pos = i;
          outlist.push(new PR_Token(t, style));
        }

        state = 0;
        if (nstate == -1) {
          // don't increment.  This allows us to use state 0 to redispatch based
          // on the current character.
          i--;
          continue;
        }
      }
      state = nstate;
    }
  }
}

/** split a group of chunks of markup.
 * @param chunks an Array of PR_Tokens.
 * @return chunks itself when it is empty or missing, otherwise the chunks
 *   split at the token ends found by PR_splitMarkup.
 * @private
 */
function PR_tokenizeMarkup(chunks) {
  if (!(chunks && chunks.length)) { return chunks; }

  var tokenEnds = PR_splitMarkup(chunks);
  return PR_splitChunks(chunks, tokenEnds);
}

/** split tags attributes and their values out from the tag name, and
 * recursively lex source chunks.
 */
/** split tags attributes and their values out from the tag name, and
 * recursively lex source chunks.
 * @param tokens an Array of PR_Tokens.  Only PR_TAG tokens are split; the
 *   others are passed through.
 * @return an Array of PR_Tokens in which tag text is divided into PR_TAG,
 *   PR_ATTRIB_NAME and PR_ATTRIB_VALUE tokens.
 * @private
 */
function PR_splitTagAttributes(tokens) {
  var tokensOut = new Array();
  // 0: before '<'            1: in the tag name       2: between attributes
  // 3: in an attribute name  4: after a name          5: after '='
  // 6: in a quoted value     7: in an unquoted value
  var state = 0;
  var stateStyle = PR_TAG;
  var delim = null;  // attribute delimiter for quoted value state.
  var decodeHelper = new PR_DecodeHelper();
  for (var ci = 0; ci < tokens.length; ++ci) {
    var tok = tokens[ci];
    if (PR_TAG == tok.style) {
      var s = tok.token;
      var start = 0;
      for (var i = 0; i < s.length; /* i = next at bottom */) {
        decodeHelper.decode(s, i);
        var ch = decodeHelper.ch;
        var next = decodeHelper.next;

        var emitEnd = null;  // null or position of end of chunk to emit.
        var nextStyle = null;  // null or next value of stateStyle
        if (ch == '>') {
          if (PR_TAG != stateStyle) {
            emitEnd = i;
            nextStyle = PR_TAG;
          }
        } else {
          switch (state) {
            case 0:
              if ('<' == ch) { state = 1; }
              break;
            case 1:
              if (PR_isSpaceChar(ch)) { state = 2; }
              break;
            case 2:
              if (!PR_isSpaceChar(ch)) {
                nextStyle = PR_ATTRIB_NAME;
                emitEnd = i;
                state = 3;
              }
              break;
            case 3:
              if ('=' == ch) {
                emitEnd = i;
                nextStyle = PR_TAG;
                state = 5;
              } else if (PR_isSpaceChar(ch)) {
                emitEnd = i;
                nextStyle = PR_TAG;
                state = 4;
              }
              break;
            case 4:
              if ('=' == ch) {
                state = 5;
              } else if (!PR_isSpaceChar(ch)) {
                emitEnd = i;
                nextStyle = PR_ATTRIB_NAME;
                state = 3;
              }
              break;
            case 5:
              if ('"' == ch || '\'' == ch) {
                emitEnd = i;
                nextStyle = PR_ATTRIB_VALUE;
                state = 6;
                delim = ch;
              } else if (!PR_isSpaceChar(ch)) {
                emitEnd = i;
                nextStyle = PR_ATTRIB_VALUE;
                state = 7;
              }
              break;
            case 6:
              if (ch == delim) {
                emitEnd = next;
                nextStyle = PR_TAG;
                state = 2;
              }
              break;
            case 7:
              if (PR_isSpaceChar(ch)) {
                emitEnd = i;
                nextStyle = PR_TAG;
                state = 2;
              }
              break;
          }
        }
        // NOTE: an emitEnd of 0 is deliberately treated like null here.
        // state is not reset at '>', so the '<' of a tag that directly
        // follows another can request a style change at position 0, and
        // ignoring it keeps that '<' styled as part of the tag.
        if (emitEnd) {
          if (emitEnd > start) {
            tokensOut.push(
                new PR_Token(s.substring(start, emitEnd), stateStyle));
            start = emitEnd;
          }
          stateStyle = nextStyle;
        }
        i = next;
      }
      if (s.length > start) {
        tokensOut.push(new PR_Token(s.substring(start, s.length), stateStyle));
      }
    } else {
      if (tok.style) {
        state = 0;
        stateStyle = PR_TAG;
      }
      tokensOut.push(tok);
    }
  }
  return tokensOut;
}

/** identify regions of markup that are really source code, and recursively
 * lex them.
 *
 * Source is the inside of <? ... ?> and <% ... %> sections and whatever
 * follows an open script, style or xmp tag up to the next close tag.  The
 * lexed source is wrapped in a span of class embsrc.
 *
 * @param tokens an Array of PR_Tokens.  It is not modified.
 * @return an Array of PR_Tokens.
 * @private
 */
function PR_splitSourceNodes(tokens) {
  var tokensOut = new Array();
  // when we see a <script> tag, store '/' here so that we know to end the
  // source processing
  var endScriptTag = null;
  var decodeHelper = new PR_DecodeHelper();

  var sourceChunks = null;

  // lexes the collected source chunks, if any, into tokensOut
  function flushSource() {
    if (!sourceChunks) { return; }
    var sourceTokens = PR_lexSource(sourceChunks);
    tokensOut.push(new PR_Token('<span class=embsrc>', null));
    for (var si = 0, ns = sourceTokens.length; si < ns; ++si) {
      tokensOut.push(sourceTokens[si]);
    }
    tokensOut.push(new PR_Token('</span>', null));
    sourceChunks = null;
  }

  for (var ci = 0, nc = tokens.length; ci < nc; ++ci) {
    var tok = tokens[ci];
    if (null == tok.style) {
      // Keep styleless tokens in document order: with the source being
      // collected if there is any, otherwise straight to the output.
      // (The original appended them to the input array, so they were lost.)
      (sourceChunks || tokensOut).push(tok);
      continue;
    }

    var s = tok.token;

    if (null == endScriptTag) {
      if (PR_SOURCE == tok.style) {
        // split off any starting and trailing <?, <%
        if ('<' == decodeHelper.decode(s, 0)) {
          decodeHelper.decode(s, decodeHelper.next);
          if ('%' == decodeHelper.ch || '?' == decodeHelper.ch) {
            endScriptTag = decodeHelper.ch;
            tokensOut.push(new PR_Token(s.substring(0, decodeHelper.next),
                                        PR_TAG));
            s = s.substring(decodeHelper.next, s.length);
          }
        }
      } else if (PR_TAG == tok.style) {
        if ('<' == decodeHelper.decode(s, 0) &&
            '/' != s.charAt(decodeHelper.next)) {
          var tagContent = s.substring(decodeHelper.next).toLowerCase();
          // FIXME(msamuel): this does not mirror exactly the code in
          // in PR_splitMarkup that defers splitting tags inside script and
          // style blocks.
          if (PR_startsWith(tagContent, 'script') ||
              PR_startsWith(tagContent, 'style') ||
              PR_startsWith(tagContent, 'xmp')) {
            endScriptTag = '/';
          }
        }
      }
    }

    if (null != endScriptTag) {
      var endTok = null;
      if (PR_SOURCE == tok.style) {
        if (endScriptTag == '%' || endScriptTag == '?') {
          var pos = s.lastIndexOf(endScriptTag);
          if (pos >= 0 && '>' == decodeHelper.decode(s, pos + 1) &&
              s.length == decodeHelper.next) {
            endTok = new PR_Token(s.substring(pos, s.length), PR_TAG);
            s = s.substring(0, pos);
          }
        }
        if (null == sourceChunks) { sourceChunks = new Array(); }
        sourceChunks.push(new PR_Token(s, PR_PLAIN));
      } else if (PR_PLAIN == tok.style) {
        if (null == sourceChunks) { sourceChunks = new Array(); }
        sourceChunks.push(tok);
      } else if (PR_TAG == tok.style) {
        // if it starts with </ then it must be the end tag.
        if ('<' == decodeHelper.decode(tok.token, 0) &&
            tok.token.length > decodeHelper.next &&
            '/' == decodeHelper.decode(tok.token, decodeHelper.next)) {
          endTok = tok;
        } else {
          tokensOut.push(tok);
        }
      } else {
        if (sourceChunks) {
          sourceChunks.push(tok);
        } else {
          // push remaining tag and attribute tokens from the opening tag
          tokensOut.push(tok);
        }
      }
      if (endTok) {
        flushSource();
        tokensOut.push(endTok);
        endScriptTag = null;
      }
    } else {
      tokensOut.push(tok);
    }
  }
  // a source section that was never closed must not be dropped
  flushSource();
  return tokensOut;
}

/** splits the quotes from an attribute value. */
/** splits the quotes from an attribute value.
 *   ['"foo"'] -> ['"', 'foo', '"']
 *
 * The opening quote is looked for at the start of the first PR_PLAIN token
 * and the matching closing quote at the end of the last one.  Both may be
 * written as entities.  The quotes are emitted as PR_ATTRIB_VALUE tokens.
 *
 * @param tokens an Array of PR_Tokens.  It is not modified.
 * @return tokens itself when there is no plain token or the value does not
 *   start with a quote, otherwise a new Array of PR_Tokens.
 * @private
 */
function PR_splitAttributeQuotes(tokens) {
  var firstPlain = null, lastPlain = null;
  for (var i = 0; i < tokens.length; ++i) {
    if (PR_PLAIN == tokens[i].style) {
      firstPlain = i;
      break;
    }
  }
  for (var i = tokens.length; --i >= 0;) {
    if (PR_PLAIN == tokens[i].style) {
      lastPlain = i;
      break;
    }
  }
  if (null == firstPlain) { return tokens; }

  var decodeHelper = new PR_DecodeHelper();
  var fs = tokens[firstPlain].token;
  var fc = decodeHelper.decode(fs, 0);
  if ('"' != fc && '\'' != fc) {
    return tokens;
  }
  var fpos = decodeHelper.next;

  var ls = tokens[lastPlain].token;
  // First try an entity that ends the token ...
  var lc = null;
  var lpos = ls.lastIndexOf('&');
  if (lpos >= 0) {
    lc = decodeHelper.decode(ls, lpos);
    if (decodeHelper.next != ls.length) { lc = null; }
  }
  // ... and otherwise the last character.  Without this fallback any '&'
  // earlier in the token would hide a plain closing quote.
  if (lc != fc) {
    lpos = ls.length - 1;
    lc = lpos >= 0 ? decodeHelper.decode(ls, lpos) : null;
  }
  // No closing quote if it does not match the opening one, or if it is the
  // opening quote itself (a value that consists of a single quote).
  if (lc != fc || (lastPlain == firstPlain && lpos < fpos)) {
    lc = null;
    lpos = ls.length;
  }

  var tokensOut = new Array();
  for (var i = 0; i < firstPlain; ++i) {
    tokensOut.push(tokens[i]);
  }
  tokensOut.push(new PR_Token(fs.substring(0, fpos), PR_ATTRIB_VALUE));
  if (lastPlain == firstPlain) {
    tokensOut.push(new PR_Token(fs.substring(fpos, lpos), PR_PLAIN));
  } else {
    tokensOut.push(new PR_Token(fs.substring(fpos, fs.length), PR_PLAIN));
    for (var i = firstPlain + 1; i < lastPlain; ++i) {
      tokensOut.push(tokens[i]);
    }
    // the body of the last plain token belongs in the output, in order
    if (lc) {
      tokensOut.push(new PR_Token(ls.substring(0, lpos), PR_PLAIN));
    } else {
      tokensOut.push(tokens[lastPlain]);
    }
  }
  if (lc) {
    tokensOut.push(
        new PR_Token(ls.substring(lpos, ls.length), PR_ATTRIB_VALUE));
  }
  for (var i = lastPlain + 1; i < tokens.length; ++i) {
    tokensOut.push(tokens[i]);
  }
  return tokensOut;
}

/** identify attribute values that really contain source code and recursively
 * lex them.
 */
/** identify attribute values that really contain source code and recursively
 * lex them.
 *
 * The values of attributes whose name starts with "on", and of the style
 * attribute, are treated as source.  The lexed value is wrapped in a span of
 * class embsrc.
 *
 * @param tokens an Array of PR_Tokens.
 * @return an Array of PR_Tokens.
 * @private
 */
function PR_splitSourceAttributes(tokens) {
  var tokensOut = new Array();

  var sourceChunks = null;
  var inSource = false;
  var name = '';

  // lexes the collected attribute value, if any, into tokensOut
  function flushSource() {
    if (!sourceChunks) { return; }
    tokensOut.push(new PR_Token('<span class=embsrc>', null));
    var sourceTokens =
      PR_lexSource(PR_splitAttributeQuotes(sourceChunks));
    for (var si = 0, ns = sourceTokens.length; si < ns; ++si) {
      tokensOut.push(sourceTokens[si]);
    }
    tokensOut.push(new PR_Token('</span>', null));
    sourceChunks = null;
  }

  for (var ci = 0, nc = tokens.length; ci < nc; ++ci) {
    var tok = tokens[ci];
    var outList = tokensOut;
    if (PR_TAG == tok.style) {
      if (inSource) {
        // tag text after the value ends the source section
        inSource = false;
        name = '';
        flushSource();
      } else if (name && tok.token.indexOf('=') >= 0) {
        var nameLower = name.toLowerCase();
        if (PR_startsWith(nameLower, 'on') || 'style' == nameLower) {
          inSource = true;
        }
      } else {
        name = '';
      }
    } else if (PR_ATTRIB_NAME == tok.style) {
      name += tok.token;
    } else if (PR_ATTRIB_VALUE == tok.style) {
      if (inSource) {
        if (null == sourceChunks) { sourceChunks = new Array(); }
        outList = sourceChunks;
        tok = new PR_Token(tok.token, PR_PLAIN);
      }
    } else {
      if (sourceChunks) {
        outList = sourceChunks;
      }
    }
    outList.push(tok);
  }
  // a value that is still being collected at the end must not be dropped
  flushSource();
  return tokensOut;
}

/** returns a list of PR_Token objects given chunks of source code.
 *
 * This code assumes that < tokens are html escaped, but " are not.
 * It will do a reasonable job with <, but will not recognize an "
 * as starting a string.
 *
 * This code treats ", ', and ` as string delimiters, and \ as a string escape.
 * It does not recognize double delimiter escapes, or perl's qq() style
 * strings.
 *
 * It recognizes C, C++, and shell style comments.
 */
1103 * 1104 * @param chunks PR_Tokens with style in (null, PR_PLAIN) 1105 */ 1106function PR_lexSource(chunks) { 1107 // positions of ends of tokens in order 1108 var tokensIn = PR_splitStringAndCommentTokens(chunks); 1109 1110 // split entities out so that we know to treat them as single units. 1111 tokensIn = PR_splitEntities(tokensIn); 1112 1113 // split non comment|string tokens on whitespace and word boundaries 1114 var tokensOut = new Array(); 1115 for (var i = 0; i < tokensIn.length; ++i) { 1116 var tok = tokensIn[i]; 1117 var t = tok.token; 1118 var s = tok.style; 1119 1120 if (PR_PLAIN == s) { 1121 PR_splitNonStringNonCommentToken(t, tokensOut); 1122 continue; 1123 } 1124 tokensOut.push(tok); 1125 } 1126 1127 return tokensOut; 1128} 1129 1130/** returns a list of PR_Token objects given a string of markup. 1131 * 1132 * This code assumes that < tokens are html escaped, but " are not. 1133 * It will do a reasonable job with <, but will not recognize an " 1134 * as starting a string. 1135 * 1136 * This code recognizes a number of constructs. 1137 * <!-- ... --> comment 1138 * <!\w ... > declaration 1139 * <\w ... > tag 1140 * </\w ... > tag 1141 * <?...?> embedded source 1142 * &[#\w]...; entity 1143 * 1144 * It does not recognize %foo; entities. 1145 * 1146 * It will recurse into any <style>, <script>, and on* attributes using 1147 * PR_lexSource. 1148 */ 1149function PR_lexMarkup(chunks) { 1150 // This function works as follows: 1151 // 1) Start by splitting the markup into text and tag chunks 1152 // Input: String s 1153 // Output: List<PR_Token> where style in (PR_PLAIN, null) 1154 // 2) Then split the text chunks further into comments, declarations, 1155 // tags, etc. 1156 // After each split, consider whether the token is the start of an 1157 // embedded source section, i.e. is an open <script> tag. If it is, 1158 // find the corresponding close token, and don't bother to lex in between. 
// Input: List<String> 1160 // Output: List<PR_Token> with style in (PR_TAG, PR_PLAIN, PR_SOURCE, null) 1161 // 3) Finally go over each tag token and split out attribute names and values. 1162 // Input: List<PR_Token> 1163 // Output: List<PR_Token> where style in 1164 // (PR_TAG, PR_PLAIN, PR_SOURCE, NAME, VALUE, null) 1165 var tokensOut = PR_tokenizeMarkup(chunks); 1166 tokensOut = PR_splitTagAttributes(tokensOut); 1167 tokensOut = PR_splitSourceNodes(tokensOut); 1168 tokensOut = PR_splitSourceAttributes(tokensOut); 1169 return tokensOut; 1170} 1171 1172/** classify the string as either source or markup and lex appropriately. */ 1173function PR_lexOne(s) { 1174 var chunks = PR_chunkify(s); 1175 // treat it as markup if the first non whitespace character is a < and the 1176 // last non-whitespace character is a > 1177 var isMarkup = false; 1178 for (var i = 0; i < chunks.length; ++i) { 1179 if (PR_PLAIN == chunks[i].style) { 1180 if (PR_startsWith(PR_trim(chunks[i].token), '<')) { 1181 for (var j = chunks.length; --j >= 0;) { 1182 if (PR_PLAIN == chunks[j].style) { 1183 isMarkup = PR_endsWith(PR_trim(chunks[j].token), '>'); 1184 break; 1185 } 1186 } 1187 } 1188 break; 1189 } 1190 } 1191 return isMarkup ? PR_lexMarkup(chunks) : PR_lexSource(chunks); 1192} 1193 1194/** pretty print a chunk of code. 1195 * 1196 * @param s code as html 1197 * @return code as html, but prettier 1198 */ 1199function prettyPrintOne(s) { 1200 try { 1201 var tokens = PR_lexOne(s); 1202 var out = ''; 1203 var lastStyle = null; 1204 for (var i = 0; i < tokens.length; i++) { 1205 var t = tokens[i]; 1206 if (t.style != lastStyle) { 1207 if (lastStyle != null) { 1208 out += '</span>'; 1209 } 1210 if (t.style != null) { 1211 out += '<span class=' + t.style + '>'; 1212 } 1213 lastStyle = t.style; 1214 } 1215 var html = t.token; 1216 if (null != t.style) { 1217 // This interacts badly with the wiki which introduces paragraph tags 1218 // into pre blocks for some strange reason. 
// It's necessary for IE though which seems to lose the preformattedness 1220 // of <pre> tags when their innerHTML is assigned. 1221 html = html.replace(/(?:\r\n?)|\n/g, '<br>').replace(/ /g, ' '); 1222 } 1223 out += html; 1224 } 1225 if (lastStyle != null) { 1226 out += '</span>'; 1227 } 1228 return out; 1229 } catch (e) { 1230 //alert(e.stack); // DISABLE in production 1231 return s; 1232 } 1233} 1234 1235/** find all the < pre > and < code > tags in the DOM with class=prettyprint and 1236 * prettify them. 1237 */ 1238function prettyPrint() { 1239 // fetch a list of nodes to rewrite 1240 var codeSegments = [ 1241 document.getElementsByTagName('pre'), 1242 document.getElementsByTagName('code'), 1243 document.getElementsByTagName('xmp') ]; 1244 var elements = []; 1245 for (var i = 0; i < codeSegments.length; ++i) { 1246 for (var j = 0; j < codeSegments[i].length; ++j) { 1247 elements.push(codeSegments[i][j]); 1248 } 1249 } 1250 codeSegments = null; 1251 1252 // the loop is broken into a series of continuations to make sure that we 1253 // don't make the browser unresponsive when rewriting a large page. 1254 var k = 0; 1255 1256 function doWork() { 1257 var endTime = new Date().getTime() + 250; 1258 for (; k < elements.length && new Date().getTime() < endTime; k++) { 1259 var cs = elements[k]; 1260 if (cs.className && cs.className.indexOf('prettyprint') >= 0) { 1261 1262 // make sure this is not nested in an already prettified element 1263 var nested = false; 1264 for (var p = cs.parentNode; p != null; p = p.parentNode) { 1265 if ((p.tagName == 'pre' || p.tagName == 'code' || 1266 p.tagName == 'xmp') && 1267 p.className && p.className.indexOf('prettyprint') >= 0) { 1268 nested = true; 1269 break; 1270 } 1271 } 1272 if (!nested) { 1273 // XMP tags contain unescaped entities so require special handling.
1274 var isRawContent = 'XMP' == cs.tagName; 1275 1276 // fetch the content as a snippet of properly escaped HTML 1277 var content = cs.innerHTML; 1278 if (isRawContent) { 1279 content = PR_textToHtml(content); 1280 } 1281 1282 // do the pretty printing 1283 var newContent = prettyPrintOne(content); 1284 1285 // push the prettified html back into the tag. 1286 if (!isRawContent) { 1287 // just replace the old html with the new 1288 cs.innerHTML = newContent; 1289 } else { 1290 // we need to change the tag to a <pre> since <xmp>s do not allow 1291 // embedded tags such as the span tags used to attach styles to 1292 // sections of source code. 1293 var pre = document.createElement('PRE'); 1294 for (var i = 0; i < cs.attributes.length; ++i) { 1295 var a = cs.attributes[i]; 1296 if (a.specified) { 1297 pre.setAttribute(a.name, a.value); 1298 } 1299 } 1300 pre.innerHTML = newContent; 1301 // remove the old 1302 cs.parentNode.replaceChild(pre, cs); 1303 } 1304 } 1305 } 1306 } 1307 if (k < elements.length) { 1308 // finish up in a continuation 1309 setTimeout(doWork, 250); 1310 } 1311 } 1312 1313 doWork(); 1314} 1315