1 /*
2 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3 * Copyright (C) 2007 Apple Inc. All rights reserved.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 *
20 */
21
22 #include "config.h"
23 #include "TextBreakIterator.h"
24
25 #include "PlatformString.h"
26 #include "TextBreakIteratorInternalICU.h"
27
28 #include <unicode/ubrk.h>
29 #include <wtf/Assertions.h>
30
31 namespace WebCore {
32
setUpIterator(bool & createdIterator,TextBreakIterator * & iterator,UBreakIteratorType type,const UChar * string,int length)33 static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
34 UBreakIteratorType type, const UChar* string, int length)
35 {
36 if (!string)
37 return 0;
38
39 if (!createdIterator) {
40 UErrorCode openStatus = U_ZERO_ERROR;
41 iterator = static_cast<TextBreakIterator*>(ubrk_open(type, currentTextBreakLocaleID(), 0, 0, &openStatus));
42 createdIterator = true;
43 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
44 }
45 if (!iterator)
46 return 0;
47
48 UErrorCode setTextStatus = U_ZERO_ERROR;
49 ubrk_setText(iterator, string, length, &setTextStatus);
50 if (U_FAILURE(setTextStatus))
51 return 0;
52
53 return iterator;
54 }
55
characterBreakIterator(const UChar * string,int length)56 TextBreakIterator* characterBreakIterator(const UChar* string, int length)
57 {
58 static bool createdCharacterBreakIterator = false;
59 static TextBreakIterator* staticCharacterBreakIterator;
60 return setUpIterator(createdCharacterBreakIterator,
61 staticCharacterBreakIterator, UBRK_CHARACTER, string, length);
62 }
63
wordBreakIterator(const UChar * string,int length)64 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
65 {
66 static bool createdWordBreakIterator = false;
67 static TextBreakIterator* staticWordBreakIterator;
68 return setUpIterator(createdWordBreakIterator,
69 staticWordBreakIterator, UBRK_WORD, string, length);
70 }
71
lineBreakIterator(const UChar * string,int length)72 TextBreakIterator* lineBreakIterator(const UChar* string, int length)
73 {
74 static bool createdLineBreakIterator = false;
75 static TextBreakIterator* staticLineBreakIterator;
76 return setUpIterator(createdLineBreakIterator,
77 staticLineBreakIterator, UBRK_LINE, string, length);
78 }
79
sentenceBreakIterator(const UChar * string,int length)80 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
81 {
82 static bool createdSentenceBreakIterator = false;
83 static TextBreakIterator* staticSentenceBreakIterator;
84 return setUpIterator(createdSentenceBreakIterator,
85 staticSentenceBreakIterator, UBRK_SENTENCE, string, length);
86 }
87
textBreakFirst(TextBreakIterator * bi)88 int textBreakFirst(TextBreakIterator* bi)
89 {
90 return ubrk_first(bi);
91 }
92
textBreakNext(TextBreakIterator * bi)93 int textBreakNext(TextBreakIterator* bi)
94 {
95 return ubrk_next(bi);
96 }
97
textBreakPreceding(TextBreakIterator * bi,int pos)98 int textBreakPreceding(TextBreakIterator* bi, int pos)
99 {
100 return ubrk_preceding(bi, pos);
101 }
102
textBreakFollowing(TextBreakIterator * bi,int pos)103 int textBreakFollowing(TextBreakIterator* bi, int pos)
104 {
105 return ubrk_following(bi, pos);
106 }
107
textBreakCurrent(TextBreakIterator * bi)108 int textBreakCurrent(TextBreakIterator* bi)
109 {
110 return ubrk_current(bi);
111 }
112
isTextBreak(TextBreakIterator * bi,int pos)113 bool isTextBreak(TextBreakIterator* bi, int pos)
114 {
115 return ubrk_isBoundary(bi, pos);
116 }
117
118 #ifndef BUILDING_ON_TIGER
setUpIteratorWithRules(bool & createdIterator,TextBreakIterator * & iterator,const char * breakRules,const UChar * string,int length)119 static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator,
120 const char* breakRules, const UChar* string, int length)
121 {
122 if (!string)
123 return 0;
124
125 if (!createdIterator) {
126 UParseError parseStatus;
127 UErrorCode openStatus = U_ZERO_ERROR;
128 String rules(breakRules);
129 iterator = static_cast<TextBreakIterator*>(ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus));
130 createdIterator = true;
131 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
132 }
133 if (!iterator)
134 return 0;
135
136 UErrorCode setTextStatus = U_ZERO_ERROR;
137 ubrk_setText(iterator, string, length, &setTextStatus);
138 if (U_FAILURE(setTextStatus))
139 return 0;
140
141 return iterator;
142 }
143 #endif // BUILDING_ON_TIGER
144
cursorMovementIterator(const UChar * string,int length)145 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
146 {
147 #ifdef BUILDING_ON_TIGER
148 // ICU 3.2 cannot compile the below rules.
149 return characterBreakIterator(string, length);
150 #else
151 // This rule set is based on character-break iterator rules of ICU 4.0
152 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
153 // The major differences from the original ones are listed below:
154 // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
155 // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
156 // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
157 // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
158 static const char* kRules =
159 "$CR = [\\p{Grapheme_Cluster_Break = CR}];"
160 "$LF = [\\p{Grapheme_Cluster_Break = LF}];"
161 "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
162 "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks
163 "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
164 "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
165 "$L = [\\p{Grapheme_Cluster_Break = L}];"
166 "$V = [\\p{Grapheme_Cluster_Break = V}];"
167 "$T = [\\p{Grapheme_Cluster_Break = T}];"
168 "$LV = [\\p{Grapheme_Cluster_Break = LV}];"
169 "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];"
170 "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha
171 "$HinV = \\u094D;" // Devanagari Sign Virama
172 "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha
173 "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha
174 "$BenV = \\u09CD;" // Bengali Sign Virama
175 "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha
176 "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha
177 "$PanV = \\u0A4D;" // Gurmukhi Sign Virama
178 "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha
179 "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha
180 "$GujV = \\u0ACD;" // Gujarati Sign Virama
181 "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha
182 "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha
183 "$OriV = \\u0B4D;" // Oriya Sign Virama
184 "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha
185 "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha
186 "$TelV = \\u0C4D;" // Telugu Sign Virama
187 "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha
188 "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha
189 "$KanV = \\u0CCD;" // Kannada Sign Virama
190 "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter A,...,Ha
191 "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha
192 "$MalV = \\u0D4D;" // Malayalam Sign Virama
193 "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter A,...,Ha
194 "!!chain;"
195 "!!forward;"
196 "$CR $LF;"
197 "$L ($L | $V | $LV | $LVT);"
198 "($LV | $V) ($V | $T);"
199 "($LVT | $T) $T;"
200 "[^$Control $CR $LF] $Extend;"
201 "[^$Control $CR $LF] $SpacingMark;"
202 "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward)
203 "$Ben0 $BenV $Ben1;" // Bengali Virama (forward)
204 "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward)
205 "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward)
206 "$Ori0 $OriV $Ori1;" // Oriya Virama (forward)
207 "$Tel0 $TelV $Tel1;" // Telugu Virama (forward)
208 "$Kan0 $KanV $Kan1;" // Kannada Virama (forward)
209 "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward)
210 "!!reverse;"
211 "$LF $CR;"
212 "($L | $V | $LV | $LVT) $L;"
213 "($V | $T) ($LV | $V);"
214 "$T ($LVT | $T);"
215 "$Extend [^$Control $CR $LF];"
216 "$SpacingMark [^$Control $CR $LF];"
217 "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward)
218 "$Ben1 $BenV $Ben0;" // Bengali Virama (backward)
219 "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward)
220 "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward)
221 "$Ori1 $OriV $Ori0;" // Gujarati Virama (backward)
222 "$Tel1 $TelV $Tel0;" // Telugu Virama (backward)
223 "$Kan1 $KanV $Kan0;" // Kannada Virama (backward)
224 "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward)
225 "!!safe_reverse;"
226 "!!safe_forward;";
227 static bool createdCursorMovementIterator = false;
228 static TextBreakIterator* staticCursorMovementIterator;
229 return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length);
230 #endif // BUILDING_ON_TIGER
231 }
232
233 }
234