#
#   Copyright (C) 2002-2010, International Business Machines Corporation
#   and others. All Rights Reserved.
#
#   file:  word.txt
#
#   ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX-29 Revision 16 for Unicode 6.0
#
#   Note:  Updates to word.txt will usually need to be merged into
#          word_POSIX.txt and word_ja.txt also.

##############################################################################
#
#  Character class definitions from TR 29
#
##############################################################################

!!chain;


#
#  Character Class Definitions.
#
#  Each variable below is the set of characters carrying one value of the
#  Word_Break property.  They are grouped by the rules that mention them.
#

#  Classes named by rules 3 and 4.
$CR           = [\p{Word_Break = CR}];
$LF           = [\p{Word_Break = LF}];
$Newline      = [\p{Word_Break = Newline}];

#  Classes that rule 4 attaches to the preceding character.
$Extend       = [\p{Word_Break = Extend}];
$Format       = [\p{Word_Break = Format}];

#  Classes that rules 5 - 13b keep together.
$ALetter      = [\p{Word_Break = ALetter}];
$Numeric      = [\p{Word_Break = Numeric}];
$Katakana     = [\p{Word_Break = Katakana}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];

#  Classes allowed between two letters or two digits (rules 6, 7, 11, 12).
$MidLetter    = [\p{Word_Break = MidLetter}];
$MidNum       = [\p{Word_Break = MidNum}];
$MidNumLet    = [\p{Word_Break = MidNumLet}];


#   Dictionary character set.  Its members trigger the language-based
#   (dictionary) break engines.  It is currently limited to
#   LineBreak=Complex_Context, and so only works with Unicode 5.0 or later,
#   where the definition of Complex_Context was corrected to include all
#   characters requiring dictionary break.

$dictionary   = [:LineBreak = Complex_Context:];
$Control      = [\p{Grapheme_Cluster_Break = Control}];

#   The default ALetter class does not include the dictionary characters, so
#   they are added here, minus any that are also Extend or Control.
$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];

#
#  Rule 4    Ignore Format and Extend characters,
#            except when they appear at the beginning of a region of text.
#
#  "Ex" forms: one character of the base class followed by any run of
#  Format / Extend characters, which rule 4 treats as part of that character.
#
$ALetterEx      = $ALetterPlus  ($Format | $Extend)*;
$NumericEx      = $Numeric      ($Format | $Extend)*;
$KatakanaEx     = $Katakana     ($Format | $Extend)*;
$ExtendNumLetEx = $ExtendNumLet ($Format | $Extend)*;
$MidLetterEx    = $MidLetter    ($Format | $Extend)*;
$MidNumEx       = $MidNum       ($Format | $Extend)*;
$MidNumLetEx    = $MidNumLet    ($Format | $Extend)*;

#  Hiragana and ideographs are not Word_Break classes; they are defined here
#  by script and by the Ideographic property so they can carry a status value.
$Hiragana       = [\p{script=Hiragana}];
$HiraganaEx     = $Hiragana     ($Format | $Extend)*;
$Ideographic    = [\p{Ideographic}];
$IdeographicEx  = $Ideographic  ($Format | $Extend)*;

## -------------------------------------------------

!!forward;


# Rule 3 - CR x LF
#
$CR $LF;

# Rule 4 - ignore Format and Extend characters, except when they appear at the
#          beginning of a region of text.  This rule applies when the text
#          starts with a group of Format chars, or with a "word" made of a
#          single char that is in none of the listed word break categories,
#          followed by format char(s).
[^$CR $LF $Newline]? ($Format | $Extend)+;

# Single-character matches.  The number in braces is the status value
# attached to the rule.
$NumericEx     {100};
$ALetterEx     {200};

# Note: the status values on the next three rules override those from
#       rule 5 by virtue of being numerically larger.
$KatakanaEx    {300};
$HiraganaEx    {300};
$IdeographicEx {400};

#
# rule 5
#    Do not break between most letters.
#    (rule 5: letter followed by letter)
$ALetterEx $ALetterEx {200};

# rule 6 and 7
#    A letter, one MidLetter or MidNumLet character, then a letter.
$ALetterEx ($MidNumLetEx | $MidLetterEx) $ALetterEx {200};

# rule 8
#    Digit followed by digit.
$NumericEx $NumericEx {100};

# rule 9
#    Letter followed by digit.
$ALetterEx $NumericEx {200};

# rule 10
#    Digit followed by letter.
$NumericEx $ALetterEx {200};

# rule 11 and 12
#    A digit, one MidNum or MidNumLet character, then a digit.
$NumericEx ($MidNumLetEx | $MidNumEx) $NumericEx {100};

# rule 13
#    Katakana followed by Katakana.
$KatakanaEx $KatakanaEx {300};

# rule 13a
#    ExtendNumLet after a letter, digit, Katakana or another ExtendNumLet.
$NumericEx      $ExtendNumLetEx {100};
$ALetterEx      $ExtendNumLetEx {200};
$ExtendNumLetEx $ExtendNumLetEx {200};
$KatakanaEx     $ExtendNumLetEx {300};

# rule 13b
#    ExtendNumLet before a letter, digit or Katakana.
$ExtendNumLetEx $NumericEx      {100};
$ExtendNumLetEx $ALetterEx      {200};
$ExtendNumLetEx $KatakanaEx     {300};



## -------------------------------------------------

!!reverse;

#  Backward-reading "Ex" forms: the run of Extend / Format characters comes
#  first, then the base character it is attached to.
$BackALetterEx      = ($Extend | $Format)* $ALetterPlus;
$BackNumericEx      = ($Extend | $Format)* $Numeric;
$BackKatakanaEx     = ($Extend | $Format)* $Katakana;
$BackExtendNumLetEx = ($Extend | $Format)* $ExtendNumLet;
$BackMidLetterEx    = ($Extend | $Format)* $MidLetter;
$BackMidNumEx       = ($Extend | $Format)* $MidNum;
$BackMidNumLetEx    = ($Extend | $Format)* $MidNumLet;

# rule 3
$LF $CR;

# rule 4
($Extend | $Format)* [^$CR $LF $Newline]?;

# rule 5

$BackALetterEx $BackALetterEx;

# rule 6 and 7

$BackALetterEx ($BackMidNumLetEx | $BackMidLetterEx) $BackALetterEx;


# rule 8

$BackNumericEx $BackNumericEx;

# rule 9

$BackNumericEx $BackALetterEx;

# rule 10

$BackALetterEx $BackNumericEx;

# rule 11 and 12

$BackNumericEx ($BackMidNumLetEx | $BackMidNumEx) $BackNumericEx;

# rule 13

$BackKatakanaEx $BackKatakanaEx;

# rules 13 a/b
#    One rule per pair, laid out like the forward 13a/b rules.
#
$BackExtendNumLetEx $BackALetterEx;
$BackExtendNumLetEx $BackNumericEx;
$BackExtendNumLetEx $BackKatakanaEx;
$BackExtendNumLetEx $BackExtendNumLetEx;

$BackALetterEx      $BackExtendNumLetEx;
$BackNumericEx      $BackExtendNumLetEx;
$BackKatakanaEx     $BackExtendNumLetEx;

## -------------------------------------------------

!!safe_reverse;

# rule 4
#    A run of Format / Extend characters, plus at most one further character.
($Format | $Extend)+ .?;

# rule 6
#    A mid-letter character followed (reading backward) by a letter.
($MidNumLet | $MidLetter) $BackALetterEx;

# rule 11
#    A mid-number character followed (reading backward) by a digit.
($MidNumLet | $MidNum) $BackNumericEx;

# For dictionary-based break
#    Two adjacent dictionary characters.
$dictionary $dictionary;

## -------------------------------------------------

!!safe_forward;

# rule 4
#    A run of Format / Extend characters, plus at most one further character.
($Format | $Extend)+ .?;

# rule 6
#    A mid-letter character followed by a letter.
($MidNumLetEx | $MidLetterEx) $ALetterEx;

# rule 11
#    A mid-number character followed by a digit.
($MidNumLetEx | $MidNumEx) $NumericEx;

# For dictionary-based break
#    Two adjacent dictionary characters.
$dictionary $dictionary;
