• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#
2# Copyright (C) 2002-2013, International Business Machines Corporation
3# and others. All Rights Reserved.
4#
5# file:  word.txt
6#
7# ICU Word Break Rules
8#      See Unicode Standard Annex #29.
9#      These rules are based on UAX #29 Revision 20 for Unicode Version 6.2
10#
11# Note:  Updates to word.txt will usually need to be merged into
12#        word_POSIX.txt also.
13
14##############################################################################
15#
16#  Character class definitions from TR 29
17#
18##############################################################################
19
20!!chain;
21# NOTE(review): "!!chain" is an option directive, not a rule; it presumably turns on
22#               rule chaining for the rule sections below -- confirm in the ICU RBBI docs.
23#
24#  Character Class Definitions.
25#  Each set is named after the Word_Break property value it selects, except
26#  $Hiragana and $Han, which use the short [:Name:] set notation instead.
27$CR           = [\p{Word_Break = CR}];
28$LF           = [\p{Word_Break = LF}];
29$Newline      = [\p{Word_Break = Newline}];
30$Extend       = [\p{Word_Break = Extend}];
31$Format       = [\p{Word_Break = Format}];
32$Hiragana     = [:Hiragana:];
33$Katakana     = [\p{Word_Break = Katakana}];
34$Han          = [:Han:];
35$ALetter      = [\p{Word_Break = ALetter}];
36$MidNumLet    = [\p{Word_Break = MidNumLet}];
37$MidLetter    = [\p{Word_Break = MidLetter}];
38$MidNum       = [\p{Word_Break = MidNum}];
39$Numeric      = [\p{Word_Break = Numeric}];
40$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
41$Regional_Indicator = [\p{Word_Break = Regional_Indicator}];
42
43
44#   Dictionary character set, for triggering language-based break engines.
45#   $dictionary is the union of LineBreak=Complex_Context and the CJK sets
46#   ($Han, $Hiragana, $Katakana and the range U+AC00..U+D7A3). Note that the
47#   Complex_Context part only works in Unicode 5.0 or later, as the definition of
48#   Complex_Context was corrected to include all characters requiring dictionary break.
49$Control        = [\p{Grapheme_Cluster_Break = Control}];
50$HangulSyllable = [\uac00-\ud7a3];
51$ComplexContext = [:LineBreak = Complex_Context:];
52$KanaKanji      = [$Han $Hiragana $Katakana];
53$dictionaryCJK  = [$KanaKanji $HangulSyllable];
54$dictionary     = [$ComplexContext $dictionaryCJK];
55# $ALetterPlus = ALetter minus the CJK dictionary characters, plus the Complex_Context
56# characters that are neither Extend nor Control (leaves CJK scripts out of ALetterPlus).
57$ALetterPlus  = [$ALetter-$dictionaryCJK [$ComplexContext-$Extend-$Control]];
58
59
60#
61#  Rule 4     Ignore Format and Extend characters,
62#             except when they appear at the beginning of a region of text.
63#             Each ...Ex name below is its base class followed by any run of Extend/Format.
64# TODO: check if handling of katakana in dictionary makes rules incorrect/void
65$KatakanaEx     = $Katakana     ($Extend |  $Format)*;
66$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;
67$MidNumLetEx    = $MidNumLet    ($Extend |  $Format)*;
68$MidLetterEx    = $MidLetter    ($Extend |  $Format)*;
69$MidNumEx       = $MidNum       ($Extend |  $Format)*;
70$NumericEx      = $Numeric      ($Extend |  $Format)*;
71$ExtendNumLetEx = $ExtendNumLet ($Extend |  $Format)*;
72$Regional_IndicatorEx = $Regional_Indicator ($Extend |  $Format)*;
73# Hiragana and Ideographic get the same trailing Extend/Format treatment.
74$Ideographic    = [\p{Ideographic}];
75$HiraganaEx     = $Hiragana     ($Extend |  $Format)*;
76$IdeographicEx  = $Ideographic  ($Extend |  $Format)*;
77
78## -------------------------------------------------
79
80!!forward;
81# Forward rules. A trailing {N} tags a rule with the status value N
82# (see the note next to the {400} single-class rules below).
83# Rule 3 - CR x LF
84#
85$CR $LF;
86
87# Rule 4 - ignore Format and Extend characters, except when they appear at the beginning
88#          of a region of Text.   The rule here comes into play when the start of text
89#          begins with a group of Format chars, or with a "word" consisting of a single
90#          char that is not in any of the listed word break categories followed by
91#          format char(s), or is not a CJK dictionary character.
92[^$CR $LF $Newline]? ($Extend |  $Format)+;
93# Single-class rules, each tagged with a status value.
94$NumericEx {100};
95$ALetterEx {200};
96$HangulSyllable {200};
97$KatakanaEx {400};       # note:  these status values override those from rule 5
98$HiraganaEx {400};       #        by virtue of being numerically larger.
99$IdeographicEx {400};    #
100
101#
102# rule 5
103#    Do not break between most letters.
104#
105$ALetterEx $ALetterEx {200};
106
107# rule 6 and 7 - letter, then one MidLetter or MidNumLet, then letter
108$ALetterEx ($MidLetterEx | $MidNumLetEx) $ALetterEx {200};
109
110# rule 8 - number followed by number
111
112$NumericEx $NumericEx {100};
113
114# rule 9 - letter followed by number
115
116$ALetterEx $NumericEx {200};
117
118# rule 10 - number followed by letter
119
120$NumericEx $ALetterEx {200};
121
122# rule 11 and 12 - number, then one MidNum or MidNumLet, then number
123
124$NumericEx ($MidNumEx | $MidNumLetEx) $NumericEx {100};
125
126# rule 13 - katakana followed by katakana
127# to be consistent with $KanaKanji $KanaKanji, changed
128# from 300 to 400.
129# See also TestRuleStatus in intltest/rbbiapts.cpp
130$KatakanaEx  $KatakanaEx {400};
131
132# rule 13a/b - ExtendNumLet after (13a) or before (13b) a letter, number or katakana
133
134$ALetterEx      $ExtendNumLetEx {200};    #  (13a)
135$NumericEx      $ExtendNumLetEx {100};    #  (13a)
136$KatakanaEx     $ExtendNumLetEx {400};    #  (13a)
137$ExtendNumLetEx $ExtendNumLetEx {200};    #  (13a)
138
139$ExtendNumLetEx $ALetterEx  {200};    #  (13b)
140$ExtendNumLetEx $NumericEx  {100};    #  (13b)
141$ExtendNumLetEx $KatakanaEx {400};    #  (13b)
142
143# rule 13c - regional indicator followed by regional indicator (no status tag)
144
145$Regional_IndicatorEx $Regional_IndicatorEx;
146
147# special handling for CJK characters: chain for later dictionary segmentation
148$HangulSyllable $HangulSyllable {200};
149$KanaKanji $KanaKanji {400}; # different rule status if both kana and kanji found
150
151
152## -------------------------------------------------
153
154!!reverse;
155# Back...Ex mirrors the forward ...Ex definitions: the Format/Extend run comes first.
156$BackALetterEx            = ($Format | $Extend)* $ALetterPlus;
157$BackMidNumLetEx          = ($Format | $Extend)* $MidNumLet;
158$BackNumericEx            = ($Format | $Extend)* $Numeric;
159$BackMidNumEx             = ($Format | $Extend)* $MidNum;
160$BackMidLetterEx          = ($Format | $Extend)* $MidLetter;
161$BackKatakanaEx           = ($Format | $Extend)* $Katakana;
162$BackHiraganaEx           = ($Format | $Extend)* $Hiragana;
163$BackExtendNumLetEx       = ($Format | $Extend)* $ExtendNumLet;
164$BackRegional_IndicatorEx = ($Format | $Extend)* $Regional_Indicator;
165# NOTE(review): $BackHiraganaEx is not referenced by any rule visible in this file.
166# rule 3 - LF then CR (operands swapped relative to the forward rule)
167$LF $CR;
168
169# rule 4 - Format/Extend run, then optionally one char that is not CR, LF or Newline
170($Format | $Extend)*  [^$CR $LF $Newline]?;
171
172# rule 5 - letter followed by letter
173
174$BackALetterEx $BackALetterEx;
175
176# rule 6 and 7 - letter, then one MidLetter or MidNumLet, then letter
177
178$BackALetterEx ($BackMidLetterEx | $BackMidNumLetEx) $BackALetterEx;
179
180
181# rule 8 - number followed by number
182
183$BackNumericEx $BackNumericEx;
184
185# rule 9 - operands swapped relative to the forward rule ($ALetterEx $NumericEx)
186
187$BackNumericEx $BackALetterEx;
188
189# rule 10 - operands swapped relative to the forward rule ($NumericEx $ALetterEx)
190
191$BackALetterEx $BackNumericEx;
192
193# rule 11 and 12 - number, then one MidNum or MidNumLet, then number
194
195$BackNumericEx ($BackMidNumEx | $BackMidNumLetEx) $BackNumericEx;
196
197# rule 13 - katakana followed by katakana
198
199$BackKatakanaEx $BackKatakanaEx;
200
201# rules 13 a/b
202#    ExtendNumLet next to a letter, number, katakana or another ExtendNumLet.
203$BackExtendNumLetEx ($BackALetterEx | $BackNumericEx | $BackKatakanaEx | $BackExtendNumLetEx);
204($BackALetterEx | $BackNumericEx | $BackKatakanaEx) $BackExtendNumLetEx;
205
206# rule 13c - regional indicator followed by regional indicator
207
208$BackRegional_IndicatorEx $BackRegional_IndicatorEx;
209
210# special handling for CJK characters: chain for later dictionary segmentation
211$HangulSyllable $HangulSyllable;
212$KanaKanji $KanaKanji; # forward counterpart carries {400}; no status tag on the reverse rule
213
214## -------------------------------------------------
215
216!!safe_reverse;
217
218# rule 4 - a run of Extend/Format, optionally followed by any one character
219($Extend | $Format)+ .?;
220
221# rule 6 - one MidLetter or MidNumLet, then a (reverse-order) letter
222($MidLetter | $MidNumLet) $BackALetterEx;
223
224# rule 11 - one MidNum or MidNumLet, then a (reverse-order) number
225($MidNum | $MidNumLet) $BackNumericEx;
226
227# For dictionary-based break: two adjacent dictionary characters
228$dictionary $dictionary;
229
230## -------------------------------------------------
231
232!!safe_forward;
233
234# rule 4 - a run of Extend/Format, optionally followed by any one character
235($Extend | $Format)+ .?;
236
237# rule 6 - one MidLetter or MidNumLet, then a letter
238($MidLetterEx | $MidNumLetEx) $ALetterEx;
239
240# rule 11 - one MidNum or MidNumLet, then a number
241($MidNumEx | $MidNumLetEx) $NumericEx;
242
243# For dictionary-based break: two adjacent dictionary characters
244$dictionary $dictionary;
245