#
# Copyright (C) 2002-2013, International Business Machines Corporation
# and others. All Rights Reserved.
#
# file: word_ja.txt
#
# ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
#
# Note: Updates to word.txt will usually need to be merged into
#       word_POSIX.txt also.

##############################################################################
#
#  Character class definitions from TR 29
#
##############################################################################

!!chain;


#
#  Character Class Definitions.
#

$CR                 = [\p{Word_Break = CR}];
$LF                 = [\p{Word_Break = LF}];
$Newline            = [\p{Word_Break = Newline}];
$Extend             = [\p{Word_Break = Extend}];
$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
$Format             = [\p{Word_Break = Format}];
$Katakana           = [\p{Word_Break = Katakana}];
$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
$ALetter            = [\p{Word_Break = ALetter}];
$Single_Quote       = [\p{Word_Break = Single_Quote}];
$Double_Quote       = [\p{Word_Break = Double_Quote}];
# Remove two full stop characters from $MidNumLet and add them to $MidNum
# to break a hostname into its components at the cost of breaking
# 'e.g.' and 'i.e.' as well.
# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12.
# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected
# while rules 6/7 are reverted to the old behavior we want.
$MidNumLet          = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];
$MidLetter          = [\p{Word_Break = MidLetter}];
$MidNum             = [\p{Word_Break = MidNum}[\u002E \uFF0E]];
$Numeric            = [\p{Word_Break = Numeric}[\uff10-\uff19]]; #includes fullwidth digits
$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];

$Han                = [:Han:];
$Hiragana           = [:Hiragana:];


#   Dictionary character set, for triggering language-based break engines. Currently
#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
#   5.0 or later as the definition of Complex_Context was corrected to include all
#   characters requiring dictionary break.

$Control        = [\p{Grapheme_Cluster_Break = Control}];
$HangulSyllable = [\uac00-\ud7a3];
$ComplexContext = [:LineBreak = Complex_Context:];
# Note: $KanaKanji is defined here but is not part of $dictionary and is not
# referenced by any rule in this file; Han, Hiragana and Katakana runs are
# handled by the rule 13 entries below instead.
$KanaKanji      = [$Han $Hiragana $Katakana];
$dictionary     = [$ComplexContext];

$ALetterPlus    = [$ALetter [$ComplexContext-$Extend-$Control]];


#
#  Rules 4    Ignore Format and Extend characters,
#             except when they appear at the beginning of a region of text.
#
# TODO: check if handling of katakana in dictionary makes rules incorrect/void
$KatakanaEx           = $Katakana           ($Extend | $Format)*;
$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend | $Format)*;
$ALetterEx            = $ALetterPlus        ($Extend | $Format)*;
$Single_QuoteEx       = $Single_Quote       ($Extend | $Format)*;
$Double_QuoteEx       = $Double_Quote       ($Extend | $Format)*;
$MidNumLetEx          = $MidNumLet          ($Extend | $Format)*;
$MidLetterEx          = $MidLetter          ($Extend | $Format)*;
$MidNumEx             = $MidNum             ($Extend | $Format)*;
$NumericEx            = $Numeric            ($Extend | $Format)*;
$ExtendNumLetEx       = $ExtendNumLet       ($Extend | $Format)*;
$Regional_IndicatorEx = $Regional_Indicator ($Extend | $Format)*;

$Ideographic          = [\p{Ideographic} [\u3005 \u3007 \u303B]];
$HiraganaEx           = $Hiragana           ($Extend | $Format)*;
$IdeographicEx        = $Ideographic        ($Extend | $Format)*;

## -------------------------------------------------

!!forward;

# The {nnn} tags on the rules below are rule status values
# (see TestRuleStatus in intltest/rbbiapts.cpp).


# Rule 3 - CR x LF
#
$CR $LF;

# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
#          of a region of Text.   The rule here comes into play when the start of text
#          begins with a group of Format chars, or with a "word" consisting of a single
#          char that is not in any of the listed word break categories followed by
#          format char(s), or is not a CJK dictionary character.
[^$CR $LF $Newline]? ($Extend | $Format)+;

$NumericEx       {100};
$ALetterEx       {200};
$HangulSyllable  {200};
$Hebrew_LetterEx {200};
$KatakanaEx      {400};       # note:  these status values override those from rule 5
$HiraganaEx      {400};       #        by virtue of being numerically larger.
$IdeographicEx   {400};       #

#
# rule 5
#    Do not break between most letters.
#
($ALetterEx | $Hebrew_LetterEx) ($ALetterEx | $Hebrew_LetterEx) {200};

# rule 6 and 7
($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};

# rule 7a
$Hebrew_LetterEx $Single_QuoteEx {200};

# rule 7b and 7c
$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};

# rule 8

$NumericEx $NumericEx {100};

# rule 9

($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};

# rule 10

$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};

# rule 11 and 12

$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};

# rule 13
# to be consistent with $KanaKanji $KanaKanji, changed
# from 300 to 400.
# See also TestRuleStatus in intltest/rbbiapts.cpp
$KatakanaEx    $KatakanaEx    {400};
$HiraganaEx    $HiraganaEx    {400};
$IdeographicEx $IdeographicEx {400};

# rule 13a/b

$ALetterEx       $ExtendNumLetEx {200};    #  (13a)
$Hebrew_LetterEx $ExtendNumLetEx {200};    #  (13a)
$NumericEx       $ExtendNumLetEx {100};    #  (13a)
$KatakanaEx      $ExtendNumLetEx {400};    #  (13a)
$ExtendNumLetEx  $ExtendNumLetEx {200};    #  (13a)

# All 13b alternatives use the ...Ex form so that Extend/Format characters
# trailing the second element are absorbed, matching the 13a rules above.
$ExtendNumLetEx  $ALetterEx       {200};   #  (13b)
$ExtendNumLetEx  $Hebrew_LetterEx {200};   #  (13b)
$ExtendNumLetEx  $NumericEx       {100};   #  (13b)
$ExtendNumLetEx  $KatakanaEx      {400};   #  (13b)

# rule 13c

$Regional_IndicatorEx $Regional_IndicatorEx;

## -------------------------------------------------

!!reverse;

# Mirror images of the forward ...Ex variables: the optional Format/Extend
# run comes first because the text is scanned backwards.
$BackHebrew_LetterEx      = ($Format | $Extend)* $Hebrew_Letter;
$BackALetterEx            = ($Format | $Extend)* $ALetterPlus;
$BackSingle_QuoteEx       = ($Format | $Extend)* $Single_Quote;
$BackDouble_QuoteEx       = ($Format | $Extend)* $Double_Quote;
$BackMidNumLetEx          = ($Format | $Extend)* $MidNumLet;
$BackNumericEx            = ($Format | $Extend)* $Numeric;
$BackMidNumEx             = ($Format | $Extend)* $MidNum;
$BackMidLetterEx          = ($Format | $Extend)* $MidLetter;
$BackKatakanaEx           = ($Format | $Extend)* $Katakana;
$BackHiraganaEx           = ($Format | $Extend)* $Hiragana;
$BackIdeographicEx        = ($Format | $Extend)* $Ideographic;
$BackExtendNumLetEx       = ($Format | $Extend)* $ExtendNumLet;
$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;

# rule 3
$LF $CR;

# rule 4
($Format | $Extend)* [^$CR $LF $Newline]?;

# rule 5

($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx);

# rule 6 and 7

($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx);

# rule 7a
$BackSingle_QuoteEx $BackHebrew_LetterEx;

# Rule 7b and 7c
$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx;

# rule 8

$BackNumericEx $BackNumericEx;

# rule 9

$BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx);

# rule 10

($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx;

# rule 11 and 12

$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx;

# rule 13

$BackKatakanaEx    $BackKatakanaEx;
$BackHiraganaEx    $BackHiraganaEx;
$BackIdeographicEx $BackIdeographicEx;

# rules 13 a/b
#
$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;

# rule 13c

$BackRegional_IndicatorEx $BackRegional_IndicatorEx;

## -------------------------------------------------

!!safe_reverse;

# rule 4
($Extend | $Format)+ .?;

# rule 6
($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);

# rule 7b
$Double_Quote $BackHebrew_LetterEx;


# rule 11
($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;

# For dictionary-based break
$dictionary $dictionary;

## -------------------------------------------------

!!safe_forward;

# rule 4
($Extend | $Format)+ .?;

# rule 6
($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);

# rule 7b
$Double_QuoteEx $Hebrew_LetterEx;

# rule 11
($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;

# For dictionary-based break
$dictionary $dictionary;

# Skip over potentially very long words or numbers.
# Not necessary to reach a safe point, but avoids potential performance
# problems.
($ALetter | $Numeric) ($ALetter | $Numeric);