# Copyright (c) 2002-2009  International Business Machines Corporation and
# others. All Rights Reserved.
#
#  file:  line.txt
#
#         Line Breaking Rules
#         Implement default line breaking as defined by
#         Unicode Standard Annex #14 Revision 24 for Unicode 5.2
#         http://www.unicode.org/reports/tr14/



#
#  Character Classes defined by TR 14.
#

# Rule-compiler options.
# NOTE(review): !!chain and !!LBCMNoChain are not described in this file.  A later
#               comment (at the AL_FOLLOW definitions) says chaining is disabled with CM,
#               which is presumably what !!LBCMNoChain controls -- confirm.
!!chain;
!!LBCMNoChain;


!!lookAheadHardBreak;
#
#  !!lookAheadHardBreak    Described here because it is (as yet) undocumented elsewhere
#                          and only used for the line break rules.
#
#           It is used in the implementation of rule LB 10
#           which says to treat any combining mark that is not attached to a base
#           character as if it were of class AL  (alphabetic).
#
#           The problem occurs in the reverse rules.
#
#           Consider a sequence like, with correct breaks as shown
#               LF  ID  CM  AL  AL
#                  ^       ^       ^
#           Then consider the sequence without the initial ID (ideographic)
#               LF  CM  AL  AL
#                  ^           ^
#           Our CM, which in the first example was attached to the ideograph,
#           is now unattached, becomes an alpha, and joins in with the other
#           alphas.
#
#           When iterating forwards, these sequences do not present any problems.
#           When iterating backwards, we need to look ahead when encountering
#           a CM to see whether it attaches to something further on or not.
#           (Look-ahead in a reverse rule is looking towards the start)
#
#           If the CM is unattached, we need to force a break.
#
#           !!lookAheadHardBreak forces the run time state machine to
#           stop immediately when a look ahead rule ( '/' operator) matches,
#           and set the match position to that of the look-ahead operator,
#           no matter what other rules may be in play at the time.
#
#           See rule LB 19 for an example.
#

# One variable per line-break property value; each is the set of characters
# carrying that LineBreak value.
$AI = [:LineBreak = Ambiguous:];
$AL = [:LineBreak = Alphabetic:];
$BA = [:LineBreak = Break_After:];
$BB = [:LineBreak = Break_Before:];
$BK = [:LineBreak = Mandatory_Break:];
$B2 = [:LineBreak = Break_Both:];
$CB = [:LineBreak = Contingent_Break:];
$CL = [:LineBreak = Close_Punctuation:];
$CM = [:LineBreak = Combining_Mark:];
$CP = [:LineBreak = Close_Parenthesis:];
$CR = [:LineBreak = Carriage_Return:];
$EX = [:LineBreak = Exclamation:];
$GL = [:LineBreak = Glue:];
$HY = [:LineBreak = Hyphen:];
$H2 = [:LineBreak = H2:];
$H3 = [:LineBreak = H3:];
$ID = [:LineBreak = Ideographic:];
$IN = [:LineBreak = Inseperable:];      # NOTE(review): spelling is as found; presumably an accepted
                                        #               property-value alias -- confirm before "correcting".
$IS = [:LineBreak = Infix_Numeric:];
$JL = [:LineBreak = JL:];
$JV = [:LineBreak = JV:];
$JT = [:LineBreak = JT:];
$LF = [:LineBreak = Line_Feed:];
$NL = [:LineBreak = Next_Line:];
$NS = [:LineBreak = Nonstarter:];
$NU = [:LineBreak = Numeric:];
$OP = [:LineBreak = Open_Punctuation:];
$PO = [:LineBreak = Postfix_Numeric:];
$PR = [:LineBreak = Prefix_Numeric:];
$QU = [:LineBreak = Quotation:];
$SA = [:LineBreak = Complex_Context:];
$SG = [:LineBreak = Surrogate:];
$SP = [:LineBreak = Space:];
$SY = [:LineBreak = Break_Symbols:];
$WJ = [:LineBreak = Word_Joiner:];
$XX = [:LineBreak = Unknown:];
$ZW = [:LineBreak = ZWSpace:];

#   Dictionary character set, for triggering language-based break engines. Currently
#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
#   5.0 or later as the definition of Complex_Context was corrected to include all
#   characters requiring dictionary break.

$dictionary = [:LineBreak = Complex_Context:];

#
#  Rule LB1.  By default, treat AI  (characters with ambiguous east Asian width),
#                               SA  (South East Asian: Thai, Lao, Khmer)
#                               SG  (Unpaired Surrogates)
#                               XX  (Unknown, unassigned)
#                         as $AL  (Alphabetic)
#
$ALPlus = [$AL $AI $SA $SG $XX];

#
#  Combining Marks.   X $CM*  behaves as if it were X.  Rule LB 9.
#
#  Each $XXcm variable below is the base class followed by zero or more combining marks.
#  Note that $ALcm is built on $ALPlus, not on $AL alone.
#
$ALcm = $ALPlus $CM*;
$BAcm = $BA $CM*;
$BBcm = $BB $CM*;
$B2cm = $B2 $CM*;
$CLcm = $CL $CM*;
$CPcm = $CP $CM*;
$EXcm = $EX $CM*;
$GLcm = $GL $CM*;
$HYcm = $HY $CM*;
$H2cm = $H2 $CM*;
$H3cm = $H3 $CM*;
$IDcm = $ID $CM*;
$INcm = $IN $CM*;
$IScm = $IS $CM*;
$JLcm = $JL $CM*;
$JVcm = $JV $CM*;
$JTcm = $JT $CM*;
$NScm = $NS $CM*;
$NUcm = $NU $CM*;
$OPcm = $OP $CM*;
$POcm = $PO $CM*;
$PRcm = $PR $CM*;
$QUcm = $QU $CM*;
$SYcm = $SY $CM*;
$WJcm = $WJ $CM*;

## -------------------------------------------------

!!forward;

#
#  Each class of character can stand by itself as an unbroken token, with trailing combining stuff
#
$ALPlus $CM+;
$BA $CM+;
$BB $CM+;
$B2 $CM+;
$CL $CM+;
$CP $CM+;
$EX $CM+;
$GL $CM+;
$HY $CM+;
$H2 $CM+;
$H3 $CM+;
$ID $CM+;
$IN $CM+;
$IS $CM+;
$JL $CM+;
$JV $CM+;
$JT $CM+;
$NS $CM+;
$NU $CM+;
$OP $CM+;
$PO $CM+;
$PR $CM+;
$QU $CM+;
$SY $CM+;
$WJ $CM+;

#
# CAN_CM  is the set of characters that may combine with CM combining chars.
#         Note that Linebreak UAX 14's concept of a combining char and the rules
#         for what they can combine with are _very_ different from the rest of Unicode.
#
#         Note that $CM itself is left out of this set.  If CM is needed as a base
#         it must be listed separately in the rule.
#
$CAN_CM  = [^$SP $BK $CR $LF $NL $ZW $CM];       # Bases that can   take CMs
$CANT_CM = [ $SP $BK $CR $LF $NL $ZW $CM];       # Bases that can't take CMs

#
# AL_FOLLOW  set of chars that can unconditionally follow an AL
#            Needed in rules where stand-alone $CM s are treated as AL.
#            Chaining is disabled with CM because it causes other failures,
#            so for this one case we need to manually list out longer sequences.
#
# The AL_FOLLOW sets are split in two: members that are not followed by $CM* in the
# rules that use them (hard breaks, ZW, SP) and members that are.
$AL_FOLLOW_NOCM = [$BK $CR $LF $NL $ZW $SP];
$AL_FOLLOW_CM   = [$CL $CP $EX $IS $SY $WJ $GL $OP $QU $BA $HY $NS $IN $NU $ALPlus];
$AL_FOLLOW      = [$AL_FOLLOW_NOCM $AL_FOLLOW_CM];


#
#  Rule LB 4, 5    Mandatory (Hard) breaks.
#
#  NOTE(review): {100} is presumably the rule-status tag reported for hard line
#                breaks -- confirm against the break iterator API.
#
$LB4Breaks    = [$BK $CR $LF $NL];
$LB4NonBreaks = [^$BK $CR $LF $NL];
$CR $LF {100};

#
#  LB 6    Do not break before hard line breaks.
#
$LB4NonBreaks?  $LB4Breaks {100};    # LB 6  do not break before hard breaks.
$CAN_CM $CM*    $LB4Breaks {100};
$CM+            $LB4Breaks {100};

# LB 7         x SP
#              x ZW
$LB4NonBreaks [$SP $ZW];
$CAN_CM $CM*  [$SP $ZW];
$CM+          [$SP $ZW];

#
# LB 8         Break after zero width space
#
$LB8Breaks    = [$LB4Breaks $ZW];
$LB8NonBreaks = [[$LB4NonBreaks] - [$ZW]];


# LB 9     Combining marks.      X   $CM needs to behave like X, where X is not $SP, $BK $CR $LF $NL
#                                $CM not covered by the above needs to behave like $AL
#                                See definition of $CAN_CM.

$CAN_CM $CM+;                   #  Stick together any combining sequences that don't match other rules.
$CM+;

#
# LB 11  Do not break before or after WORD JOINER & related characters.
#
$CAN_CM $CM*    $WJcm;
$LB8NonBreaks   $WJcm;
$CM+            $WJcm;

$WJcm $CANT_CM;
$WJcm $CAN_CM $CM*;

#
# LB 12  Do not break after NBSP and related characters.
#         GL  x
#
$GLcm $CAN_CM $CM*;
$GLcm $CANT_CM;

#
# LB 12a  Do not break before NBSP and related characters ...
#            [^SP BA HY] x GL
#
[[$LB8NonBreaks] - [$SP $BA $HY]] $CM* $GLcm;
# Stand-alone CM (behaving as AL, by rule 10) before GL.  The variable reference needs
# its '$': a bare GLcm is read as the literal characters G, L, c, m, so the rule would
# never match a Glue character.
$CM+ $GLcm;



#
# LB 13   Don't break before ']' or '!' or ';' or '/', even after spaces.
#
# One group of three rules per class (CL, CP, EX, IS, SY): after any LB 8 non-break
# character, after a base with combining marks, and after stand-alone combining marks.
$LB8NonBreaks $CL;
$CAN_CM $CM*  $CL;
$CM+          $CL;              # by rule 10, stand-alone CM behaves as AL

$LB8NonBreaks $CP;
$CAN_CM $CM*  $CP;
$CM+          $CP;              # by rule 10, stand-alone CM behaves as AL

$LB8NonBreaks $EX;
$CAN_CM $CM*  $EX;
$CM+          $EX;              # by rule 10, stand-alone CM behaves as AL

$LB8NonBreaks $IS;
$CAN_CM $CM*  $IS;
$CM+          $IS;              # by rule 10, stand-alone CM behaves as AL

$LB8NonBreaks $SY;
$CAN_CM $CM*  $SY;
$CM+          $SY;              # by rule 10, stand-alone CM behaves as AL


#
# LB 14  Do not break after OP, even after spaces
#
$OPcm $SP* $CAN_CM $CM*;
$OPcm $SP* $CANT_CM;

$OPcm $SP+ $CM+ $AL_FOLLOW?;    # by rule 10, stand-alone CM behaves as AL

# LB 15
$QUcm $SP* $OPcm;

# LB 16
($CLcm | $CPcm) $SP* $NScm;

# LB 17
$B2cm $SP* $B2cm;

#
# LB 18  Break after spaces.
#
$LB18NonBreaks = [$LB8NonBreaks - [$SP]];
$LB18Breaks    = [$LB8Breaks $SP];


# LB 19
#         x QU
$LB18NonBreaks $CM* $QUcm;
$CM+                $QUcm;

#         QU  x
$QUcm .?;
$QUcm $LB18NonBreaks $CM*;    # Don't let a combining mark go onto $CR, $BK, etc.
                              #  TODO:  I don't think this rule is needed.


# LB 20
#        <break>  $CB
#        $CB   <break>

$LB20NonBreaks = [$LB18NonBreaks - $CB];

# LB 21        x   (BA | HY | NS)
#           BB x
#
$LB20NonBreaks $CM* ($BAcm | $HYcm | $NScm);

$BBcm [^$CB];                                  #  $BB  x
$BBcm $LB20NonBreaks $CM*;

# LB 22
$ALcm    $INcm;
$CM+     $INcm;     #  by rule 10, any otherwise unattached CM behaves as AL
$IDcm    $INcm;
$INcm    $INcm;
$NUcm    $INcm;


# $LB 23
$IDcm  $POcm;
$ALcm  $NUcm;       # includes $LB19
$CM+   $NUcm;       # Rule 10, any otherwise unattached CM behaves as AL
$NUcm  $ALcm;

#
# LB 24
#
$PRcm $IDcm;
$PRcm $ALcm;
$POcm $ALcm;

#
# LB 25   Numbers.
#
#   Shape of the rule: optional PR/PO prefix, optional OP/HY, a required NU, any run
#   of NU/SY/IS, optional CL/CP, optional PR/PO suffix (each with trailing CMs).
#
($PRcm | $POcm)? ($OPcm | $HYcm)? $NUcm ($NUcm | $SYcm | $IScm)* ($CLcm | $CPcm)? ($PRcm | $POcm)?;

# LB 26  Do not break a Korean syllable
#
$JLcm ($JLcm | $JVcm | $H2cm | $H3cm);
($JVcm | $H2cm) ($JVcm | $JTcm);
($JTcm | $H3cm) $JTcm;

# LB 27  Treat Korean syllable block the same as ID  (don't break it)
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $INcm;
($JLcm | $JVcm | $JTcm | $H2cm | $H3cm) $POcm;
$PRcm ($JLcm | $JVcm | $JTcm | $H2cm | $H3cm);


# LB 28   Do not break between alphabetics
#
$ALcm $ALcm;
$CM+ $ALcm;      # The $CM+ is from rule 10, an unattached CM is treated as AL

# LB 29
$IScm $ALcm;

# LB 30
($ALcm | $NUcm) $OPcm;
$CM+ $OPcm;      # The $CM+ is from rule 10, an unattached CM is treated as AL.
$CPcm ($ALcm | $NUcm);


#
#  Reverse Rules.
#
## -------------------------------------------------

!!reverse;

# Reverse counterparts of the forward "class followed by CMs" rules: reverse rules
# are written in the order the text is encountered when moving backwards, so the
# combining marks come first and the base class second.
$CM+ $ALPlus;
$CM+ $BA;
$CM+ $BB;
$CM+ $B2;
$CM+ $CL;
$CM+ $CP;
$CM+ $EX;
$CM+ $GL;
$CM+ $HY;
$CM+ $H2;
$CM+ $H3;
$CM+ $ID;
$CM+ $IN;
$CM+ $IS;
$CM+ $JL;
$CM+ $JV;
$CM+ $JT;
$CM+ $NS;
$CM+ $NU;
$CM+ $OP;
$CM+ $PO;
$CM+ $PR;
$CM+ $QU;
$CM+ $SY;
$CM+ $WJ;
$CM+;


#
#  Sequences of the form  (shown forwards)
#      [CANT_CM]  <break>  [CM]  [whatever]
#  The CM needs to behave as an AL
#
$AL_FOLLOW $CM+ / (
          [$BK $CR $LF $NL $ZW {eof}]  |
          $SP+ $CM+ $SP   |
          $SP+ $CM* ([^$OP $CM $SP] | [$AL {eof}]));   # if LB 14 will match, need to suppress this break.
                                               #  LB14 says    OP SP* x .
                                               #    becomes    OP SP* x AL
                                               #    becomes    OP SP* x CM+ AL_FOLLOW
                                               #
                                               # Further note:  the $AL in [$AL {eof}] is only to work around
                                               #                a rule compiler bug which complains about
                                               #                empty sets otherwise.

#
#  Sequences of the form  (shown forwards)
#      [CANT_CM]  <break> [CM]  <break>  [PR]
#  The CM needs to behave as an AL
#  This rule is concerned about getting the second of the two <breaks> in place.
#

[$PR ] / $CM+ [$BK $CR $LF $NL $ZW $SP {eof}];



# LB 4, 5, 6

$LB4Breaks [$LB4NonBreaks-$CM];
$LB4Breaks $CM+ $CAN_CM;
$LF $CR;


# LB 7         x SP
#              x ZW
[$SP $ZW] [$LB4NonBreaks-$CM];
[$SP $ZW] $CM+ $CAN_CM;

# LB 8 Break after zero width space


# LB 9,10  Combining marks.
#    X   $CM needs to behave like X, where X is not $SP or controls.
#    $CM not covered by the above needs to behave like $AL
# Stick together any combining sequences that don't match other rules.
$CM+ $CAN_CM;


# LB 11
$CM* $WJ $CM* $CAN_CM;
$CM* $WJ      [$LB8NonBreaks-$CM];

     $CANT_CM $CM* $WJ;
$CM* $CAN_CM  $CM* $WJ;

# LB 12a
#      [^SP BA HY] x GL
#
$CM* $GL $CM* [$LB8NonBreaks-[$CM $SP $BA $HY]];

# LB 12
#     GL  x
#
$CANT_CM $CM* $GL;
$CM* $CAN_CM $CM* $GL;


# LB 13
$CL $CM+ $CAN_CM;
$CP $CM+ $CAN_CM;
$EX $CM+ $CAN_CM;
$IS $CM+ $CAN_CM;
$SY $CM+ $CAN_CM;

$CL [$LB8NonBreaks-$CM];
$CP [$LB8NonBreaks-$CM];
$EX [$LB8NonBreaks-$CM];
$IS [$LB8NonBreaks-$CM];
$SY [$LB8NonBreaks-$CM];

# Rule 13 & 14 taken together for an edge case.
#   Match this, shown forward
#     OP SP+  ($CM+ behaving as $AL) (CL | CP | EX | IS | SY)
#   This really wants to chain at the $CM+ (which is acting as an $AL)
#   except for $CM chaining being disabled.
[$CL $CP $EX $IS $SY] $CM+ $SP+ $CM* $OP;

# LB 14    OP SP* x
#
$CM* $CAN_CM    $SP* $CM* $OP;
     $CANT_CM   $SP* $CM* $OP;
$AL_FOLLOW? $CM+ $SP $SP* $CM* $OP;     #  by LB 10, behaves like $AL_FOLLOW? $AL $SP* $CM* $OP

     $AL_FOLLOW_NOCM $CM+ $SP+ $CM* $OP;
$CM* $AL_FOLLOW_CM   $CM+ $SP+ $CM* $OP;
$SY $CM $SP+ $OP;   # TODO:  Experiment.  Remove.



# LB 15
$CM* $OP $SP* $CM* $QU;

# LB 16
$CM* $NS $SP* $CM* ($CL | $CP);

# LB 17
$CM* $B2 $SP* $CM* $B2;

# LB 18  break after spaces
#        Nothing explicit needed here.


#
# LB 19
#
$CM* $QU $CM* $CAN_CM;                #   . x QU
$CM* $QU      $LB18NonBreaks;


$CM* $CAN_CM  $CM* $QU;               #   QU x .
     $CANT_CM $CM* $QU;

#
#  LB 20  Break before and after CB.
#         nothing needed here.
#

# LB 21
$CM* ($BA | $HY | $NS) $CM* [$LB20NonBreaks-$CM];     #  . x (BA | HY | NS)

$CM* [$LB20NonBreaks-$CM] $CM* $BB;                   #  BB x .
[^$CB] $CM* $BB;                                      #



# LB 22
$CM* $IN $CM* $ALPlus;
$CM* $IN $CM* $ID;
$CM* $IN $CM* $IN;
$CM* $IN $CM* $NU;

# LB 23
$CM* $PO $CM* $ID;
$CM* $NU $CM* $ALPlus;
$CM* $ALPlus $CM* $NU;

# LB 24
$CM* $ID $CM* $PR;
$CM* $ALPlus $CM* $PR;
$CM* $ALPlus $CM* $PO;


# LB 25
#   Mirror image of the forward LB 25 number rule: the elements appear in the opposite
#   order, each preceded by $CM* rather than followed by it.
($CM* ($PR | $PO))? ($CM* ($CL | $CP))? ($CM* ($NU | $IS | $SY))* $CM* $NU ($CM* ($OP | $HY))? ($CM* ($PR | $PO))?;

# LB 26
$CM* ($H3 | $H2 | $JV | $JL) $CM* $JL;
$CM* ($JT | $JV) $CM* ($H2 | $JV);
$CM* $JT $CM* ($H3 | $JT);

# LB 27
$CM* $IN $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* $PO $CM* ($H3 | $H2 | $JT | $JV | $JL);
$CM* ($H3 | $H2 | $JT | $JV | $JL) $CM* $PR;

# LB 28
$CM* $ALPlus $CM* $ALPlus;


# LB 29
$CM* $ALPlus $CM* $IS;

# LB 30
$CM* $OP $CM* ($ALPlus | $NU);
$CM* ($ALPlus | $NU) $CM* $CP;


## -------------------------------------------------

!!safe_reverse;

# LB 9
$CM+ [^$CM $BK $CR $LF $NL $ZW $SP];
$CM+ $SP / .;

# LB 14
$SP+ $CM* $OP;

# LB 15
$SP+ $CM* $QU;

# LB 16
$SP+ $CM* ($CL | $CP);

# LB 17
$SP+ $CM* $B2;

# LB 25
($CM* ($IS | $SY))+ $CM* $NU;
($CL | $CP) $CM* ($NU | $IS | $SY);

# For dictionary-based break
$dictionary $dictionary;

## -------------------------------------------------

!!safe_forward;

# Skip forward over all character classes that are involved in
#   rules containing patterns with possibly more than one char
#   of context.
#
#  It might be slightly more efficient to have specific rules
#  instead of one generic one, but only if we could
#  turn off rule chaining.  We don't want to move more
#  than necessary.
#
#  NOTE(review): the leading set below includes $SP but the negated trailing set does
#                not list it -- confirm this asymmetry is intentional.
#
[$CM $OP $QU $CL $CP $B2 $PR $HY $SP $dictionary]+ [^$CM $OP $QU $CL $CP $B2 $PR $HY $dictionary];
$dictionary $dictionary;
