• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3  * Copyright (C) 2007 Apple Inc. All rights reserved.
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either
8  * version 2 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public License
16  * along with this library; see the file COPYING.LIB.  If not, write to
17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  * Boston, MA 02110-1301, USA.
19  *
20  */
21 
22 #include "config.h"
23 #include "TextBreakIterator.h"
24 
25 #include "PlatformString.h"
26 #include "TextBreakIteratorInternalICU.h"
27 
28 #include <unicode/ubrk.h>
29 #include <wtf/Assertions.h>
30 
31 namespace WebCore {
32 
setUpIterator(bool & createdIterator,TextBreakIterator * & iterator,UBreakIteratorType type,const UChar * string,int length)33 static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
34     UBreakIteratorType type, const UChar* string, int length)
35 {
36     if (!string)
37         return 0;
38 
39     if (!createdIterator) {
40         UErrorCode openStatus = U_ZERO_ERROR;
41         iterator = static_cast<TextBreakIterator*>(ubrk_open(type, currentTextBreakLocaleID(), 0, 0, &openStatus));
42         createdIterator = true;
43         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
44     }
45     if (!iterator)
46         return 0;
47 
48     UErrorCode setTextStatus = U_ZERO_ERROR;
49     ubrk_setText(iterator, string, length, &setTextStatus);
50     if (U_FAILURE(setTextStatus))
51         return 0;
52 
53     return iterator;
54 }
55 
characterBreakIterator(const UChar * string,int length)56 TextBreakIterator* characterBreakIterator(const UChar* string, int length)
57 {
58     static bool createdCharacterBreakIterator = false;
59     static TextBreakIterator* staticCharacterBreakIterator;
60     return setUpIterator(createdCharacterBreakIterator,
61         staticCharacterBreakIterator, UBRK_CHARACTER, string, length);
62 }
63 
wordBreakIterator(const UChar * string,int length)64 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
65 {
66     static bool createdWordBreakIterator = false;
67     static TextBreakIterator* staticWordBreakIterator;
68     return setUpIterator(createdWordBreakIterator,
69         staticWordBreakIterator, UBRK_WORD, string, length);
70 }
71 
lineBreakIterator(const UChar * string,int length)72 TextBreakIterator* lineBreakIterator(const UChar* string, int length)
73 {
74     static bool createdLineBreakIterator = false;
75     static TextBreakIterator* staticLineBreakIterator;
76     return setUpIterator(createdLineBreakIterator,
77         staticLineBreakIterator, UBRK_LINE, string, length);
78 }
79 
sentenceBreakIterator(const UChar * string,int length)80 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
81 {
82     static bool createdSentenceBreakIterator = false;
83     static TextBreakIterator* staticSentenceBreakIterator;
84     return setUpIterator(createdSentenceBreakIterator,
85         staticSentenceBreakIterator, UBRK_SENTENCE, string, length);
86 }
87 
textBreakFirst(TextBreakIterator * bi)88 int textBreakFirst(TextBreakIterator* bi)
89 {
90     return ubrk_first(bi);
91 }
92 
textBreakNext(TextBreakIterator * bi)93 int textBreakNext(TextBreakIterator* bi)
94 {
95     return ubrk_next(bi);
96 }
97 
textBreakPreceding(TextBreakIterator * bi,int pos)98 int textBreakPreceding(TextBreakIterator* bi, int pos)
99 {
100     return ubrk_preceding(bi, pos);
101 }
102 
textBreakFollowing(TextBreakIterator * bi,int pos)103 int textBreakFollowing(TextBreakIterator* bi, int pos)
104 {
105     return ubrk_following(bi, pos);
106 }
107 
textBreakCurrent(TextBreakIterator * bi)108 int textBreakCurrent(TextBreakIterator* bi)
109 {
110     return ubrk_current(bi);
111 }
112 
isTextBreak(TextBreakIterator * bi,int pos)113 bool isTextBreak(TextBreakIterator* bi, int pos)
114 {
115     return ubrk_isBoundary(bi, pos);
116 }
117 
118 #ifndef BUILDING_ON_TIGER
setUpIteratorWithRules(bool & createdIterator,TextBreakIterator * & iterator,const char * breakRules,const UChar * string,int length)119 static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator,
120     const char* breakRules, const UChar* string, int length)
121 {
122     if (!string)
123         return 0;
124 
125     if (!createdIterator) {
126         UParseError parseStatus;
127         UErrorCode openStatus = U_ZERO_ERROR;
128         String rules(breakRules);
129         iterator = static_cast<TextBreakIterator*>(ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus));
130         createdIterator = true;
131         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
132     }
133     if (!iterator)
134         return 0;
135 
136     UErrorCode setTextStatus = U_ZERO_ERROR;
137     ubrk_setText(iterator, string, length, &setTextStatus);
138     if (U_FAILURE(setTextStatus))
139         return 0;
140 
141     return iterator;
142 }
143 #endif // BUILDING_ON_TIGER
144 
cursorMovementIterator(const UChar * string,int length)145 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
146 {
147 #ifdef BUILDING_ON_TIGER
148     // ICU 3.2 cannot compile the below rules.
149     return characterBreakIterator(string, length);
150 #else
151     // This rule set is based on character-break iterator rules of ICU 4.0
152     // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
153     // The major differences from the original ones are listed below:
154     // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
155     // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
156     // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
157     // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
158     static const char* kRules =
159         "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
160         "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
161         "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
162         "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
163         "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
164         "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
165         "$L       = [\\p{Grapheme_Cluster_Break = L}];"
166         "$V       = [\\p{Grapheme_Cluster_Break = V}];"
167         "$T       = [\\p{Grapheme_Cluster_Break = T}];"
168         "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
169         "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
170         "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
171         "$HinV    = \\u094D;"              // Devanagari Sign Virama
172         "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
173         "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
174         "$BenV    = \\u09CD;"              // Bengali Sign Virama
175         "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
176         "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
177         "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
178         "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
179         "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
180         "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
181         "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
182         "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
183         "$OriV    = \\u0B4D;"              // Oriya Sign Virama
184         "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
185         "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
186         "$TelV    = \\u0C4D;"              // Telugu Sign Virama
187         "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha
188         "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
189         "$KanV    = \\u0CCD;"              // Kannada Sign Virama
190         "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter A,...,Ha
191         "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
192         "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
193         "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter A,...,Ha
194         "!!chain;"
195         "!!forward;"
196         "$CR $LF;"
197         "$L ($L | $V | $LV | $LVT);"
198         "($LV | $V) ($V | $T);"
199         "($LVT | $T) $T;"
200         "[^$Control $CR $LF] $Extend;"
201         "[^$Control $CR $LF] $SpacingMark;"
202         "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
203         "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
204         "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
205         "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
206         "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
207         "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
208         "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
209         "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
210         "!!reverse;"
211         "$LF $CR;"
212         "($L | $V | $LV | $LVT) $L;"
213         "($V | $T) ($LV | $V);"
214         "$T ($LVT | $T);"
215         "$Extend      [^$Control $CR $LF];"
216         "$SpacingMark [^$Control $CR $LF];"
217         "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
218         "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
219         "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
220         "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
221         "$Ori1 $OriV $Ori0;"               // Gujarati Virama (backward)
222         "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
223         "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
224         "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
225         "!!safe_reverse;"
226         "!!safe_forward;";
227     static bool createdCursorMovementIterator = false;
228     static TextBreakIterator* staticCursorMovementIterator;
229     return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length);
230 #endif // BUILDING_ON_TIGER
231 }
232 
233 }
234