#
#   Copyright (C) 2002-2010, International Business Machines Corporation
#   and others. All Rights Reserved.
#
#   file:  word_ja.txt
#
#   ICU Word Break Rules
#      See Unicode Standard Annex #29.
#      These rules are based on UAX-29 Revision 16 for Unicode 6.0
#
# Note: Updates to word.txt will usually need to be merged into
#       word_POSIX.txt and word_ja.txt also.

##############################################################################
#
#  Character class definitions from TR 29
#
##############################################################################

# Enable rule chaining for every rule in this file.
!!chain;


#
#  Character Class Definitions.
#
#  One set per Word_Break property value used by the rules below.
#

$CR           = [\p{Word_Break = CR}];
$LF           = [\p{Word_Break = LF}];
$Newline      = [\p{Word_Break = Newline}];
$Extend       = [\p{Word_Break = Extend}];
$Format       = [\p{Word_Break = Format}];
$Katakana     = [\p{Word_Break = Katakana}];
$ALetter      = [\p{Word_Break = ALetter}];
$MidNumLet    = [\p{Word_Break = MidNumLet}];
$MidLetter    = [\p{Word_Break = MidLetter}];
$MidNum       = [\p{Word_Break = MidNum}];
$Numeric      = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];


#   Dictionary character set, for triggering language-based break engines. Currently
#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
#   5.0 or later as the definition of Complex_Context was corrected to include all
#   characters requiring dictionary break.

$dictionary   = [:LineBreak = Complex_Context:];
$Control      = [\p{Grapheme_Cluster_Break = Control}];

# Note: default ALetter does not include the dictionary characters, so they are
#       added here, minus anything that is also Extend or Control.
$ALetterPlus  = [$ALetter [$dictionary-$Extend-$Control]];

#
#  Rule 4 - Ignore Format and Extend characters,
#           except when they appear at the beginning of a region of text.
#
#  Each *Ex variable is its base class followed by any number of trailing
#  Extend or Format characters, so those characters stay attached to the
#  character they follow.
#
$KatakanaEx     = $Katakana     ($Extend |  $Format)*;
$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
$MidNumEx       = $MidNum       ($Extend |  $Format)*;
$NumericEx      = $Numeric      ($Extend |  $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;

# Additional classes used by this Japanese variant: Hiragana by script, and
# Ideographic characters plus U+3005, U+3007 and U+303B.
$Hiragana       = [\p{script=Hiragana}];
$Ideographic    = [\p{Ideographic} [\u3005 \u3007 \u303B]];
$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;

## -------------------------------------------------

!!forward;


# Rule 3 - CR x LF
#
$CR $LF;

# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
#          of a region of Text.   The rule here comes into play when the start of text
#          begins with a group of Format chars, or with a "word" consisting of a single
#          char that is not in any of the listed word break categories followed by
#          format char(s).
[^$CR $LF $Newline]? ($Extend |  $Format)+;

# Single-character matches. The number in braces is the rule status value
# reported for a match (100/200/300/400 line up with ICU's UBRK_WORD_NUMBER,
# UBRK_WORD_LETTER, UBRK_WORD_KANA and UBRK_WORD_IDEO tags in ubrk.h).
$NumericEx      {100};
$ALetterEx      {200};
$KatakanaEx     {300};       # note:  these status values override those from rule 5
$HiraganaEx     {300};       #        by virtue of being numerically larger.
$IdeographicEx  {400};       #

#
# rule 5
#    Do not break between most letters.
#
$ALetterEx $ALetterEx {200};

# rule 6 and 7
#    Letters joined by a single MidLetter or MidNumLet character stay together.
$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};

# rule 8
#    Do not break within a run of digits.

$NumericEx $NumericEx {100};

# rule 9
#    Letter followed by digit.

$ALetterEx $NumericEx {200};

# rule 10
#    Digit followed by letter.

$NumericEx $ALetterEx {200};

# rule 11 and 12
#    Digits joined by a single MidNum or MidNumLet character stay together.

$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};

# rule 13
#    Runs of Katakana, of Hiragana, and of ideographs are each kept together.

$KatakanaEx    $KatakanaEx    {300};
$HiraganaEx    $HiraganaEx    {300};
$IdeographicEx $IdeographicEx {400};


# rule 13a/b
#    ExtendNumLet characters attach to a preceding (13a) or following (13b)
#    letter, digit or Katakana character, and to each other.

$ALetterEx      $ExtendNumLetEx {200};    #  (13a)
$NumericEx      $ExtendNumLetEx {100};    #  (13a)
$KatakanaEx     $ExtendNumLetEx {300};    #  (13a)
$ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)

$ExtendNumLetEx $ALetterEx      {200};    #  (13b)
$ExtendNumLetEx $NumericEx      {100};    #  (13b)
$ExtendNumLetEx $KatakanaEx     {300};    #  (13b)



## -------------------------------------------------

!!reverse;

# The Back*Ex variables mirror the forward *Ex variables: the optional run of
# Format/Extend characters comes first, then the base character.
$BackALetterEx      = ($Format | $Extend)* $ALetterPlus;
$BackMidNumLetEx    = ($Format | $Extend)* $MidNumLet;
$BackNumericEx      = ($Format | $Extend)* $Numeric;
$BackMidNumEx       = ($Format | $Extend)* $MidNum;
$BackMidLetterEx    = ($Format | $Extend)* $MidLetter;
$BackKatakanaEx     = ($Format | $Extend)* $Katakana;
$BackHiraganaEx     = ($Format | $Extend)* $Hiragana;
$BackIdeographicEx  = ($Format | $Extend)* $Ideographic;
$BackExtendNumLetEx = ($Format | $Extend)* $ExtendNumLet;

# rule 3
$LF $CR;

# rule 4
($Format | $Extend)*  [^$CR $LF $Newline]?;

# rule 5

$BackALetterEx $BackALetterEx;

# rule 6 and 7

$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;


# rule 8

$BackNumericEx $BackNumericEx;

# rule 9

$BackNumericEx $BackALetterEx;

# rule 10

$BackALetterEx $BackNumericEx;

# rule 11 and 12

$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;

# rule 13

$BackKatakanaEx $BackKatakanaEx;
# Reverse-direction runs of Hiragana and of ideographs are kept together.
$BackHiraganaEx $BackHiraganaEx;
$BackIdeographicEx $BackIdeographicEx;



# rules 13 a/b
#
$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;

## -------------------------------------------------

!!safe_reverse;

# rule 4
#    (same Extend/Format pattern as the "rule 4" entry under !!safe_forward)
($Extend | $Format)+ .?;

# rule 6
($MidLetter | $MidNumLet) $BackALetterEx;

# rule 11
($MidNum | $MidNumLet) $BackNumericEx;

# For dictionary-based break
$dictionary $dictionary;

## -------------------------------------------------

!!safe_forward;

# rule 4
($Extend | $Format)+ .?;

# rule 6
($MidLetterEx | $MidNumLetEx) $ALetterEx;

# rule 11
($MidNumEx | $MidNumLetEx) $NumericEx;

# For dictionary-based break
$dictionary $dictionary;
