1 /*
2 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3 * Copyright (C) 2007 Apple Inc. All rights reserved.
4 * Copyright (C) 2008 Jürg Billeter <j@bitron.ch>
5 * Copyright (C) 2008 Dominik Röttsches <dominik.roettsches@access-company.com>
6 * Copyright (C) 2010 Igalia S.L.
7 *
8 * This library is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Library General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This library is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Library General Public License for more details.
17 *
18 * You should have received a copy of the GNU Library General Public License
19 * along with this library; see the file COPYING.LIB. If not, write to
20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
21 * Boston, MA 02110-1301, USA.
22 *
23 */
24
25 #include "config.h"
26
27 #include "TextBreakIterator.h"
28
29 #include "GOwnPtr.h"
30 #include <pango/pango.h>
31 using namespace std;
32
// True when the Unicode code point lies in the range U+10000..U+10FFFF, i.e.
// when its UTF-16 encoding needs two code units (a surrogate pair).
// The argument is parenthesized so that compound expressions (for example a
// conditional expression) expand correctly. It is still evaluated twice, so
// do not pass an expression with side effects.
#define UTF8_IS_SURROGATE(character) ((character) >= 0x10000 && (character) <= 0x10FFFF)
34
35 namespace WebCore {
36
37 class CharacterIterator {
38 public:
39 bool setText(const UChar* string, int length);
getText()40 const gchar* getText() { return m_utf8.get(); }
getLength()41 int getLength() { return m_length; }
getSize()42 glong getSize() { return m_size; }
43 void setIndex(int index);
getIndex()44 int getIndex() { return m_index; }
45 void setUTF16Index(int index);
getUTF16Index()46 int getUTF16Index() { return m_utf16Index; }
getUTF16Length()47 int getUTF16Length() { return m_utf16Length; }
48 int first();
49 int last();
50 int next();
51 int previous();
52 private:
53 int characterSize(int index);
54
55 GOwnPtr<char> m_utf8;
56 int m_length;
57 long m_size;
58 int m_index;
59 int m_utf16Index;
60 int m_utf16Length;
61 };
62
characterSize(int index)63 int CharacterIterator::characterSize(int index)
64 {
65 if (index == m_length || index < 0)
66 return 0;
67 if (m_length == m_utf16Length)
68 return 1;
69
70 gchar* indexPtr = g_utf8_offset_to_pointer(m_utf8.get(), index);
71 gunichar character = g_utf8_get_char(indexPtr);
72 return UTF8_IS_SURROGATE(character) ? 2 : 1;
73 }
74
setText(const UChar * string,int length)75 bool CharacterIterator::setText(const UChar* string, int length)
76 {
77 long utf8Size = 0;
78 m_utf8.set(g_utf16_to_utf8(string, length, 0, &utf8Size, 0));
79 if (!utf8Size)
80 return false;
81
82 m_utf16Length = length;
83 m_length = g_utf8_strlen(m_utf8.get(), utf8Size);
84 m_size = utf8Size;
85 m_index = 0;
86 m_utf16Index = 0;
87
88 return true;
89 }
90
setIndex(int index)91 void CharacterIterator::setIndex(int index)
92 {
93 if (index == m_index)
94 return;
95 if (index <= 0)
96 m_index = m_utf16Index = 0;
97 else if (index >= m_length) {
98 m_index = m_length;
99 m_utf16Index = m_utf16Length;
100 } else if (m_length == m_utf16Length)
101 m_index = m_utf16Index = index;
102 else {
103 m_index = index;
104 int utf16Index = 0;
105 int utf8Index = 0;
106 while (utf8Index < index) {
107 utf16Index += characterSize(utf8Index);
108 utf8Index++;
109 }
110 m_utf16Index = utf16Index;
111 }
112 }
113
setUTF16Index(int index)114 void CharacterIterator::setUTF16Index(int index)
115 {
116 if (index == m_utf16Index)
117 return;
118 if (index <= 0)
119 m_utf16Index = m_index = 0;
120 else if (index >= m_utf16Length) {
121 m_utf16Index = m_utf16Length;
122 m_index = m_length;
123 } else if (m_length == m_utf16Length)
124 m_utf16Index = m_index = index;
125 else {
126 m_utf16Index = index;
127 int utf16Index = 0;
128 int utf8Index = 0;
129 while (utf16Index < index) {
130 utf16Index += characterSize(utf8Index);
131 utf8Index++;
132 }
133 m_index = utf8Index;
134 }
135 }
136
first()137 int CharacterIterator::first()
138 {
139 m_index = m_utf16Index = 0;
140 return m_index;
141 }
142
last()143 int CharacterIterator::last()
144 {
145 m_index = m_length;
146 m_utf16Index = m_utf16Length;
147 return m_index;
148 }
149
next()150 int CharacterIterator::next()
151 {
152 int next = m_index + 1;
153
154 if (next <= m_length) {
155 m_utf16Index = min(m_utf16Index + characterSize(m_index), m_utf16Length);
156 m_index = next;
157 } else {
158 m_index = TextBreakDone;
159 m_utf16Index = TextBreakDone;
160 }
161
162 return m_index;
163 }
164
previous()165 int CharacterIterator::previous()
166 {
167 int previous = m_index - 1;
168
169 if (previous >= 0) {
170 m_utf16Index = max(m_utf16Index - characterSize(previous), 0);
171 m_index = previous;
172 } else {
173 m_index = TextBreakDone;
174 m_utf16Index = TextBreakDone;
175 }
176
177 return m_index;
178 }
179
// Kind of boundary a TextBreakIterator reports. The values are spelled out
// and match the implicit numbering of the declaration order.
enum UBreakIteratorType {
    UBRK_CHARACTER = 0, // stops at cursor positions
    UBRK_WORD = 1,      // stops at word starts and word ends
    UBRK_LINE = 2,      // stops at line-break opportunities
    UBRK_SENTENCE = 3   // stops at sentence boundaries
};
186
187 class TextBreakIterator {
188 public:
189 UBreakIteratorType m_type;
190 PangoLogAttr* m_logAttrs;
191 CharacterIterator m_charIterator;
192 };
193
setUpIterator(bool & createdIterator,TextBreakIterator * & iterator,UBreakIteratorType type,const UChar * string,int length)194 static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
195 UBreakIteratorType type, const UChar* string, int length)
196 {
197 if (!string)
198 return 0;
199
200 if (!createdIterator) {
201 iterator = new TextBreakIterator();
202 createdIterator = true;
203 }
204 if (!iterator)
205 return 0;
206
207 if (!iterator->m_charIterator.setText(string, length))
208 return 0;
209
210 int charLength = iterator->m_charIterator.getLength();
211
212 iterator->m_type = type;
213 if (createdIterator)
214 g_free(iterator->m_logAttrs);
215 iterator->m_logAttrs = g_new0(PangoLogAttr, charLength + 1);
216 pango_get_log_attrs(iterator->m_charIterator.getText(), iterator->m_charIterator.getSize(),
217 -1, 0, iterator->m_logAttrs, charLength + 1);
218
219 return iterator;
220 }
221
characterBreakIterator(const UChar * string,int length)222 TextBreakIterator* characterBreakIterator(const UChar* string, int length)
223 {
224 static bool createdCharacterBreakIterator = false;
225 static TextBreakIterator* staticCharacterBreakIterator;
226 return setUpIterator(createdCharacterBreakIterator, staticCharacterBreakIterator, UBRK_CHARACTER, string, length);
227 }
228
cursorMovementIterator(const UChar * string,int length)229 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
230 {
231 // FIXME: This needs closer inspection to achieve behaviour identical to the ICU version.
232 return characterBreakIterator(string, length);
233 }
234
wordBreakIterator(const UChar * string,int length)235 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
236 {
237 static bool createdWordBreakIterator = false;
238 static TextBreakIterator* staticWordBreakIterator;
239 return setUpIterator(createdWordBreakIterator, staticWordBreakIterator, UBRK_WORD, string, length);
240 }
241
242 static bool createdLineBreakIterator = false;
243 static TextBreakIterator* staticLineBreakIterator;
244
acquireLineBreakIterator(const UChar * string,int length)245 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length)
246 {
247 TextBreakIterator* lineBreakIterator = 0;
248 if (!createdLineBreakIterator || staticLineBreakIterator) {
249 setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length);
250 swap(staticLineBreakIterator, lineBreakIterator);
251 }
252
253 if (!lineBreakIterator) {
254 bool createdNewLineBreakIterator = false;
255 setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length);
256 }
257
258 return lineBreakIterator;
259 }
260
releaseLineBreakIterator(TextBreakIterator * iterator)261 void releaseLineBreakIterator(TextBreakIterator* iterator)
262 {
263 ASSERT(createdLineBreakIterator);
264 ASSERT(iterator);
265
266 if (!staticLineBreakIterator)
267 staticLineBreakIterator = iterator;
268 else
269 delete iterator;
270 }
271
sentenceBreakIterator(const UChar * string,int length)272 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
273 {
274 static bool createdSentenceBreakIterator = false;
275 static TextBreakIterator* staticSentenceBreakIterator;
276 return setUpIterator(createdSentenceBreakIterator, staticSentenceBreakIterator, UBRK_SENTENCE, string, length);
277 }
278
textBreakFirst(TextBreakIterator * iterator)279 int textBreakFirst(TextBreakIterator* iterator)
280 {
281 iterator->m_charIterator.first();
282 return iterator->m_charIterator.getUTF16Index();
283 }
284
textBreakLast(TextBreakIterator * iterator)285 int textBreakLast(TextBreakIterator* iterator)
286 {
287 // TextBreakLast is not meant to find just any break according to bi->m_type
288 // but really the one near the last character.
289 // (cmp ICU documentation for ubrk_first and ubrk_last)
290 // From ICU docs for ubrk_last:
291 // "Determine the index immediately beyond the last character in the text being scanned."
292
293 // So we should advance or traverse back based on bi->m_logAttrs cursor positions.
294 // If last character position in the original string is a whitespace,
295 // traverse to the left until the first non-white character position is found
296 // and return the position of the first white-space char after this one.
297 // Otherwise return m_length, as "the first character beyond the last" is outside our string.
298
299 bool whiteSpaceAtTheEnd = true;
300 int nextWhiteSpacePos = iterator->m_charIterator.getLength();
301
302 int pos = iterator->m_charIterator.last();
303 while (pos >= 0 && whiteSpaceAtTheEnd) {
304 if (iterator->m_logAttrs[pos].is_cursor_position) {
305 if (whiteSpaceAtTheEnd = iterator->m_logAttrs[pos].is_white)
306 nextWhiteSpacePos = pos;
307 }
308 pos = iterator->m_charIterator.previous();
309 }
310 iterator->m_charIterator.setIndex(nextWhiteSpacePos);
311 return iterator->m_charIterator.getUTF16Index();
312 }
313
textBreakNext(TextBreakIterator * iterator)314 int textBreakNext(TextBreakIterator* iterator)
315 {
316 while (iterator->m_charIterator.next() != TextBreakDone) {
317 int index = iterator->m_charIterator.getIndex();
318
319 // FIXME: UBRK_WORD case: Single multibyte characters (i.e. white space around them), such as the euro symbol €,
320 // are not marked as word_start & word_end as opposed to the way ICU does it.
321 // This leads to - for example - different word selection behaviour when right clicking.
322
323 if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break)
324 || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end))
325 || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position)
326 || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) {
327 break;
328 }
329 }
330 return iterator->m_charIterator.getUTF16Index();
331 }
332
textBreakPrevious(TextBreakIterator * iterator)333 int textBreakPrevious(TextBreakIterator* iterator)
334 {
335 while (iterator->m_charIterator.previous() != TextBreakDone) {
336 int index = iterator->m_charIterator.getIndex();
337
338 if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break)
339 || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end))
340 || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position)
341 || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) {
342 break;
343 }
344 }
345 return iterator->m_charIterator.getUTF16Index();
346 }
347
textBreakPreceding(TextBreakIterator * iterator,int offset)348 int textBreakPreceding(TextBreakIterator* iterator, int offset)
349 {
350 if (offset > iterator->m_charIterator.getUTF16Length())
351 return TextBreakDone;
352 if (offset < 0)
353 return 0;
354 iterator->m_charIterator.setUTF16Index(offset);
355 return textBreakPrevious(iterator);
356 }
357
textBreakFollowing(TextBreakIterator * iterator,int offset)358 int textBreakFollowing(TextBreakIterator* iterator, int offset)
359 {
360 if (offset > iterator->m_charIterator.getUTF16Length())
361 return TextBreakDone;
362 if (offset < 0)
363 return 0;
364 iterator->m_charIterator.setUTF16Index(offset);
365 return textBreakNext(iterator);
366 }
367
textBreakCurrent(TextBreakIterator * iterator)368 int textBreakCurrent(TextBreakIterator* iterator)
369 {
370 return iterator->m_charIterator.getUTF16Index();
371 }
372
isTextBreak(TextBreakIterator * iterator,int offset)373 bool isTextBreak(TextBreakIterator* iterator, int offset)
374 {
375 if (!offset)
376 return true;
377 if (offset > iterator->m_charIterator.getUTF16Length())
378 return false;
379
380 iterator->m_charIterator.setUTF16Index(offset);
381
382 int index = iterator->m_charIterator.getIndex();
383 iterator->m_charIterator.previous();
384 textBreakNext(iterator);
385 return iterator->m_charIterator.getIndex() == index;
386 }
387
388 }
389