#
# Copyright (C) 2002-2013, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file:  word.txt
#
# ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
#
# Note:  Updates to word.txt will usually need to be merged into
#        word_POSIX.txt also.

##############################################################################
#
#  Character class definitions from TR 29
#
##############################################################################

!!chain;


#
#  Character Class Definitions.
#
#  Most classes are taken directly from the Word_Break property.  $Hiragana
#  and $Han are the exceptions: they are selected by the bare property value
#  name instead of a Word_Break value.
#

$CR                 = [\p{Word_Break = CR}];
$LF                 = [\p{Word_Break = LF}];
$Newline            = [\p{Word_Break = Newline}];
$Extend             = [\p{Word_Break = Extend}];
$Format             = [\p{Word_Break = Format}];
$Hiragana           = [\p{Hiragana}];
$Katakana           = [\p{Word_Break = Katakana}];
$Han                = [\p{Han}];
$ALetter            = [\p{Word_Break = ALetter}];
$MidNumLet          = [\p{Word_Break = MidNumLet}];
$MidLetter          = [\p{Word_Break = MidLetter}];
$MidNum             = [\p{Word_Break = MidNum}];
$Numeric            = [\p{Word_Break = Numeric}];
$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];


#   Dictionary character set, for triggering language-based break engines.
#   Currently limited to LineBreak=Complex_Context.  Note that this set only
#   works in Unicode 5.0 or later, as the definition of Complex_Context was
#   corrected to include all characters requiring dictionary break.

$Control        = [\p{Grapheme_Cluster_Break = Control}];
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [\p{LineBreak = Complex_Context}];
$KanaKanji      = [$Han $Hiragana $Katakana];
$dictionaryCJK  = [$KanaKanji $HangulSyllable];
$dictionary     = [$ComplexContext $dictionaryCJK];

# $ALetterPlus: ALetter with the CJK dictionary characters removed, plus the
# Complex_Context characters that are neither Extend nor Control.
$ALetterPlus    = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];


#
#  Rule 4    Ignore Format and Extend characters,
#            except when they appear at the beginning of a region of text.
#
#  Each "Ex" variable is a base character class followed by any run of
#  Extend / Format characters.  The shared tail is factored out into
#  $ExtendFormatRun.
#
#  TODO: check if handling of katakana in dictionary makes rules incorrect/void
$ExtendFormatRun      = ($Extend | $Format)*;

$KatakanaEx           = $Katakana           $ExtendFormatRun;
$ALetterEx            = $ALetterPlus        $ExtendFormatRun;
$MidNumLetEx          = $MidNumLet          $ExtendFormatRun;
$MidLetterEx          = $MidLetter          $ExtendFormatRun;
$MidNumEx             = $MidNum             $ExtendFormatRun;
$NumericEx            = $Numeric            $ExtendFormatRun;
$ExtendNumLetEx       = $ExtendNumLet       $ExtendFormatRun;
$Regional_IndicatorEx = $Regional_Indicator $ExtendFormatRun;

$Ideographic          = [\p{Ideographic}];
$HiraganaEx           = $Hiragana           $ExtendFormatRun;
$IdeographicEx        = $Ideographic        $ExtendFormatRun;

## -------------------------------------------------

!!forward;


# Rule 3 - CR x LF
#
$CR $LF;

# Rule 4 - ignore Format and Extend characters, except when they appear at the
#          beginning of a region of text.  This rule comes into play when the
#          start of text begins with a group of Format chars, or with a "word"
#          consisting of a single char that is not in any of the listed word
#          break categories followed by format char(s), or is not a CJK
#          dictionary character.
[^$CR $LF $Newline]? ($Extend | $Format)+;

# Single-item rules.  Each one assigns the rule status given in braces.
# Note: the {400} values on the Katakana and Hiragana rules override those
#       from rule 5 by virtue of being numerically larger.
$NumericEx      {100};
$ALetterEx      {200};
$HangulSyllable {200};
$KatakanaEx     {400};
$HiraganaEx     {400};
$IdeographicEx  {400};

#
# rule 5
#    Do not break between most letters.
#
$ALetterEx $ALetterEx {200};

# rule 6 and 7
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};

# rule 8
$NumericEx $NumericEx {100};

# rule 9
$ALetterEx $NumericEx {200};

# rule 10
$NumericEx $ALetterEx {200};

# rule 11 and 12
$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};

# rule 13
#    Status changed from 300 to 400 to be consistent with the
#    $KanaKanji $KanaKanji rule.
#    See also TestRuleStatus in intltest/rbbiapts.cpp
$KatakanaEx $KatakanaEx {400};

# rule 13a/b
$ALetterEx       $ExtendNumLetEx {200};    #  (13a)
$NumericEx       $ExtendNumLetEx {100};    #  (13a)
$KatakanaEx      $ExtendNumLetEx {400};    #  (13a)
$ExtendNumLetEx  $ExtendNumLetEx {200};    #  (13a)

$ExtendNumLetEx  $ALetterEx  {200};        #  (13b)
$ExtendNumLetEx  $NumericEx  {100};        #  (13b)
$ExtendNumLetEx  $KatakanaEx {400};        #  (13b)

# rule 13c
$Regional_IndicatorEx $Regional_IndicatorEx;

# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable {200};
$KanaKanji $KanaKanji {400};    # different rule status if both kana and kanji found


## -------------------------------------------------

!!reverse;

# Reverse counterparts of the "Ex" classes: the run of Format / Extend
# characters comes first and the base character class last.  The shared
# prefix is factored out into $FormatExtendRun.
$FormatExtendRun          = ($Format | $Extend)*;

$BackALetterEx            = $FormatExtendRun $ALetterPlus;
$BackMidNumLetEx          = $FormatExtendRun $MidNumLet;
$BackNumericEx            = $FormatExtendRun $Numeric;
$BackMidNumEx             = $FormatExtendRun $MidNum;
$BackMidLetterEx          = $FormatExtendRun $MidLetter;
$BackKatakanaEx           = $FormatExtendRun $Katakana;
# NOTE(review): no rule in this file references $BackHiraganaEx.
$BackHiraganaEx           = $FormatExtendRun $Hiragana;
$BackExtendNumLetEx       = $FormatExtendRun $ExtendNumLet;
$BackRegional_IndicatorEx = $FormatExtendRun $Regional_Indicator;

# rule 3
$LF $CR;

# rule 4
($Format | $Extend)* [^$CR $LF $Newline]?;

# rule 5
$BackALetterEx $BackALetterEx;

# rule 6 and 7
$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;

# rule 8
$BackNumericEx $BackNumericEx;

# rule 9
$BackNumericEx $BackALetterEx;

# rule 10
$BackALetterEx $BackNumericEx;

# rule 11 and 12
$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;

# rule 13
$BackKatakanaEx $BackKatakanaEx;

# rules 13 a/b
#
$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;

# rule 13c
$BackRegional_IndicatorEx $BackRegional_IndicatorEx;

# special handling for CJK characters: chain for later dictionary segmentation
$HangulSyllable $HangulSyllable;
$KanaKanji $KanaKanji;    # different rule status if both kanji and kana found

## -------------------------------------------------

!!safe_reverse;

# rule 4
($Extend | $Format)+ .?;

# rule 6
($MidLetter | $MidNumLet) $BackALetterEx;

# rule 11
($MidNum | $MidNumLet) $BackNumericEx;

# For dictionary-based break
$dictionary $dictionary;

## -------------------------------------------------

!!safe_forward;

# rule 4
($Extend | $Format)+ .?;

# rule 6
($MidLetterEx | $MidNumLetEx) $ALetterEx;

# rule 11
($MidNumEx | $MidNumLetEx) $NumericEx;

# For dictionary-based break
$dictionary $dictionary;