• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Copyright (C) 2002-2013, International Business Machines Corporation
3# and others. All Rights Reserved.
4#
5# file:  word_ja.txt
6#
7# ICU Word Break Rules
8#      See Unicode Standard Annex #29.
9#      These rules are based on UAX #29 Revision 22 for Unicode Version 6.3
10#
11# Note:  Updates to word.txt will usually need to be merged into
12#        word_POSIX.txt also.
13
14##############################################################################
15#
16#  Character class definitions from TR 29
17#
18##############################################################################
19
20!!chain;    # ICU RBBI option: enable rule chaining for the rules in this file
21
22
23#
24#  Character Class Definitions.
25#  Each set is a single Word_Break property value unless its comment says otherwise.
26
27$CR                 = [\p{Word_Break = CR}];
28$LF                 = [\p{Word_Break = LF}];
29$Newline            = [\p{Word_Break = Newline}];
30$Extend             = [\p{Word_Break = Extend}];
31$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
32$Format             = [\p{Word_Break = Format}];
33$Katakana           = [\p{Word_Break = Katakana}];
34$Hebrew_Letter      = [\p{Word_Break = Hebrew_Letter}];
35$ALetter            = [\p{Word_Break = ALetter}];
36$Single_Quote       = [\p{Word_Break = Single_Quote}];
37$Double_Quote       = [\p{Word_Break = Double_Quote}];
38# Remove two full stop characters from $MidNumLet and add them to $MidNum
39# to break a hostname into its components at the cost of breaking
40# 'e.g.' and 'i.e.' as well.
41# $MidNumLet is used in rules 6/7 (rules of our interest) and rules 11/12.
42# Because it's OR'd with $MidNum in rules 11/12, rules 11/12 are not affected
43# while rules 6/7 are reverted to the old behavior we want.
44$MidNumLet    = [[\p{Word_Break = MidNumLet}] - [\u002E \uFF0E]];    # MidNumLet minus U+002E and U+FF0E
45$MidLetter          = [\p{Word_Break = MidLetter}];
46$MidNum       = [\p{Word_Break = MidNum}[\u002E \uFF0E]];    # MidNum plus the two full stops removed above
47$Numeric      = [\p{Word_Break = Numeric}[\uff10-\uff19]]; # includes fullwidth digits U+FF10..U+FF19
48$ExtendNumLet       = [\p{Word_Break = ExtendNumLet}];
49
50$Han                = [:Han:];         # script-based set; only referenced by $KanaKanji in this file
51$Hiragana           = [:Hiragana:];    # script-based set; base of $HiraganaEx / $BackHiraganaEx
52
53
54#   Dictionary character set, for triggering language-based break engines. Currently
55#   limited to LineBreak=Complex_Context. Note that this set only works in Unicode
56#   5.0 or later as the definition of Complex_Context was corrected to include all
57#   characters requiring dictionary break.
58
59$Control        = [\p{Grapheme_Cluster_Break = Control}];    # only used to carve controls out of $ALetterPlus
60$HangulSyllable = [\uac00-\ud7a3];    # matched as a single-character {200} rule in !!forward
61$ComplexContext = [:LineBreak = Complex_Context:];
62$KanaKanji      = [$Han $Hiragana $Katakana];    # NOTE(review): not referenced by any rule visible in this file
63$dictionary     = [$ComplexContext];    # used by the "$dictionary $dictionary" safe rules
64
65$ALetterPlus  = [$ALetter [$ComplexContext-$Extend-$Control]];    # ALetter plus complex-context chars that are neither Extend nor Control
66
67
68#
69#  Rules 4    Ignore Format and Extend characters,
70#             except when they appear at the beginning of a region of text.
71#  Each *Ex variable below is a base class followed by any run of Extend / Format characters.
72# TODO: check if handling of katakana in dictionary makes rules incorrect/void
73$KatakanaEx           = $Katakana           ($Extend |  $Format)*;
74$Hebrew_LetterEx      = $Hebrew_Letter      ($Extend |  $Format)*;
75$ALetterEx            = $ALetterPlus        ($Extend |  $Format)*;    # built on $ALetterPlus, not plain $ALetter
76$Single_QuoteEx       = $Single_Quote       ($Extend |  $Format)*;
77$Double_QuoteEx       = $Double_Quote       ($Extend |  $Format)*;
78$MidNumLetEx          = $MidNumLet          ($Extend |  $Format)*;
79$MidLetterEx          = $MidLetter          ($Extend |  $Format)*;
80$MidNumEx             = $MidNum             ($Extend |  $Format)*;
81$NumericEx            = $Numeric            ($Extend |  $Format)*;
82$ExtendNumLetEx       = $ExtendNumLet       ($Extend |  $Format)*;
83$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format)*;
84
85$Ideographic    = [\p{Ideographic} [\u3005 \u3007 \u303B]];    # Ideographic property plus three explicitly added code points
86$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
87$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
88
89## -------------------------------------------------
90
91!!forward;    # start of the forward-iteration rule section
92
93
94# Rule 3 - CR x LF
95#
96$CR $LF;
97
98# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
99#          of a region of Text.   The rule here comes into play when the start of text
100#          begins with a group of Format chars, or with a "word" consisting of a single
101#          char that is not in any of the listed word break categories followed by
102#          format char(s), or is not a CJK dictionary character.
103[^$CR $LF $Newline]? ($Extend |  $Format)+;
104
105$NumericEx {100};        # single-character rules; {nnn} is the rule status tag for the match
106$ALetterEx {200};        # (presumably 100 = number, 200 = letter, 400 = ideographic; confirm against ICU's UWordBreak)
107$HangulSyllable {200};   # Hangul syllables get the same tag as letters
108$Hebrew_LetterEx{200};
109$KatakanaEx {400};       # note:  these status values override those from rule 5
110$HiraganaEx {400};       #        by virtue of being numerically larger.
111$IdeographicEx {400};    #
112
113#
114# rule 5
115#    Do not break between most letters.
116#
117($ALetterEx | $Hebrew_LetterEx)  ($ALetterEx | $Hebrew_LetterEx) {200};
118
119# rule 6 and 7 - letters on both sides of one MidLetter / MidNumLet / Single_Quote
120($ALetterEx | $Hebrew_LetterEx) ($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx) {200};
121
122# rule 7a - Hebrew letter followed by a single quote
123$Hebrew_LetterEx $Single_QuoteEx {200};
124
125# rule 7b and 7c - Hebrew letters on both sides of a double quote
126$Hebrew_LetterEx $Double_QuoteEx $Hebrew_LetterEx {200};
127
128# rule 8 - digit followed by digit
129
130$NumericEx $NumericEx {100};
131
132# rule 9 - letter followed by digit
133
134($ALetterEx | $Hebrew_LetterEx) $NumericEx {200};
135
136# rule 10 - digit followed by letter
137
138$NumericEx ($ALetterEx | $Hebrew_LetterEx) {200};
139
140# rule 11 and 12 - digits on both sides of one MidNum / MidNumLet / Single_Quote
141
142$NumericEx ($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx {100};
143
144# rule 13
145# to be consistent with $KanaKanji $KanaKanji, changed
146# from 300 to 400.
147# See also TestRuleStatus in intltest/rbbiapts.cpp
148$KatakanaEx  $KatakanaEx {400};
149$HiraganaEx  $HiraganaEx {400};
150$IdeographicEx  $IdeographicEx {400};
151
152# rule 13a/b - keep ExtendNumLet attached to the letters, digits and Katakana around it
153
154$ALetterEx       $ExtendNumLetEx {200};    #  (13a)
155$Hebrew_LetterEx $ExtendNumLetEx {200};    #  (13a)
156$NumericEx       $ExtendNumLetEx {100};    #  (13a)
157$KatakanaEx      $ExtendNumLetEx {400};    #  (13a)
158$ExtendNumLetEx  $ExtendNumLetEx {200};    #  (13a)
159
160$ExtendNumLetEx  $ALetterEx      {200};    #  (13b)
161$ExtendNumLetEx  $Hebrew_LetterEx {200};    #  (13b)  uses the Ex form like every other 13a/13b operand
162$ExtendNumLetEx  $NumericEx      {100};    #  (13b)
163$ExtendNumLetEx  $KatakanaEx     {400};    #  (13b)
164
165# rule 13c - regional indicator followed by regional indicator (no status tag)
166
167$Regional_IndicatorEx $Regional_IndicatorEx;
168
169## -------------------------------------------------
170
171!!reverse;    # start of the reverse-iteration rule section
172
173$BackHebrew_LetterEx      = ($Format | $Extend)* $Hebrew_Letter;    # Back*Ex: the Extend/Format run comes first, then the base class
174$BackALetterEx            = ($Format | $Extend)* $ALetterPlus;      # built on $ALetterPlus, like the forward $ALetterEx
175$BackSingle_QuoteEx       = ($Format | $Extend)* $Single_Quote;
176$BackDouble_QuoteEx       = ($Format | $Extend)* $Double_Quote;
177$BackMidNumLetEx          = ($Format | $Extend)* $MidNumLet;
178$BackNumericEx            = ($Format | $Extend)* $Numeric;
179$BackMidNumEx             = ($Format | $Extend)* $MidNum;
180$BackMidLetterEx          = ($Format | $Extend)* $MidLetter;
181$BackKatakanaEx           = ($Format | $Extend)* $Katakana;
182$BackHiraganaEx           = ($Format | $Extend)* $Hiragana;
183$BackIdeographicEx        = ($Format | $Extend)* $Ideographic;
184$BackExtendNumLetEx       = ($Format | $Extend)* $ExtendNumLet;
185$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
186
187# rule 3 - forward "$CR $LF" with its operands in reverse order
188$LF $CR;
189
190# rule 4 - Extend/Format run, then at most one character that is not CR, LF or Newline
191($Format | $Extend)*  [^$CR $LF $Newline]?;
192
193# rule 5 - letter next to letter
194
195($BackALetterEx | $BackHebrew_LetterEx) ($BackALetterEx | $BackHebrew_LetterEx);
196
197# rule 6 and 7 - letters on both sides of one MidLetter / MidNumLet / Single_Quote
198
199($BackALetterEx | $BackHebrew_LetterEx) ($BackMidLetterEx | $BackMidNumLetEx | $BackSingle_QuoteEx) ($BackALetterEx | $BackHebrew_LetterEx);
200
201# rule 7a - operands reversed relative to the forward "$Hebrew_LetterEx $Single_QuoteEx"
202$BackSingle_QuoteEx $BackHebrew_LetterEx;
203
204# Rule 7b and 7c - Hebrew letters on both sides of a double quote
205$BackHebrew_LetterEx $BackDouble_QuoteEx $BackHebrew_LetterEx;
206
207# rule 8 - digit next to digit
208
209$BackNumericEx $BackNumericEx;
210
211# rule 9 - operands reversed relative to the forward "letter digit" rule
212
213$BackNumericEx ($BackALetterEx | $BackHebrew_LetterEx);
214
215# rule 10 - operands reversed relative to the forward "digit letter" rule
216
217($BackALetterEx | $BackHebrew_LetterEx) $BackNumericEx;
218
219# rule 11 and 12 - digits on both sides of one MidNum / MidNumLet / Single_Quote
220
221$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx | $BackSingle_QuoteEx) $BackNumericEx;
222
223# rule 13 - runs of Katakana, Hiragana or ideographs
224
225$BackKatakanaEx $BackKatakanaEx;
226$BackHiraganaEx $BackHiraganaEx;
227$BackIdeographicEx $BackIdeographicEx;
228
229# rules 13 a/b
230#
231$BackExtendNumLetEx ($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);    #  (13a)
232($BackALetterEx | $BackHebrew_LetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;    #  (13b)
233
234# rule 13c - regional indicator next to regional indicator
235
236$BackRegional_IndicatorEx $BackRegional_IndicatorEx;
237
238## -------------------------------------------------
239
240!!safe_reverse;    # start of the safe-reverse rule section
241
242# rule 4 - a run of Extend/Format characters plus at most one more character
243($Extend | $Format)+ .?;
244
245# rule 6 - a mid-word character followed (in reverse order) by a letter
246($MidLetter | $MidNumLet | $Single_Quote) ($BackALetterEx | $BackHebrew_LetterEx);
247
248# rule 7b - a double quote followed (in reverse order) by a Hebrew letter
249$Double_Quote $BackHebrew_LetterEx;
250
251
252# rule 11 - a mid-number character followed (in reverse order) by a digit
253($MidNum | $MidNumLet | $Single_Quote) $BackNumericEx;
254
255# For dictionary-based break
256$dictionary $dictionary;
257
258## -------------------------------------------------
259
260!!safe_forward;    # start of the safe-forward rule section
261
262# rule 4 - a run of Extend/Format characters plus at most one more character
263($Extend | $Format)+ .?;
264
265# rule 6 - a mid-word character followed by a letter
266($MidLetterEx | $MidNumLetEx | $Single_QuoteEx) ($ALetterEx | $Hebrew_LetterEx);
267
268# rule 7b - a double quote followed by a Hebrew letter
269$Double_QuoteEx $Hebrew_LetterEx;
270
271# rule 11 - a mid-number character followed by a digit
272($MidNumEx | $MidNumLetEx | $Single_QuoteEx) $NumericEx;
273
274# For dictionary-based break
275$dictionary $dictionary;
276
277# Skip over potentially very long words or numbers.
278# Not necessary to reach a safe point, but avoids potential performance
279# problems.
280($ALetter | $Numeric) ($ALetter | $Numeric);
281