1 /*
2 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 *
20 */
21
22 #include "config.h"
23 #include "platform/text/TextBreakIterator.h"
24
25 #include "platform/text/TextBreakIteratorInternalICU.h"
26 #include "wtf/Assertions.h"
27 #include "wtf/HashMap.h"
28 #include "wtf/PassOwnPtr.h"
29 #include "wtf/ThreadSpecific.h"
30 #include "wtf/ThreadingPrimitives.h"
31 #include "wtf/text/AtomicString.h"
32 #include "wtf/text/CString.h"
33 #include "wtf/text/WTFString.h"
34 #include <unicode/rbbi.h>
35 #include <unicode/ubrk.h>
36
37 using namespace WTF;
38 using namespace std;
39
40 namespace WebCore {
41
42 class LineBreakIteratorPool {
43 WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool);
44 public:
sharedPool()45 static LineBreakIteratorPool& sharedPool()
46 {
47 static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = new WTF::ThreadSpecific<LineBreakIteratorPool>;
48 return **pool;
49 }
50
create()51 static PassOwnPtr<LineBreakIteratorPool> create() { return adoptPtr(new LineBreakIteratorPool); }
52
take(const AtomicString & locale)53 icu::BreakIterator* take(const AtomicString& locale)
54 {
55 icu::BreakIterator* iterator = 0;
56 for (size_t i = 0; i < m_pool.size(); ++i) {
57 if (m_pool[i].first == locale) {
58 iterator = m_pool[i].second;
59 m_pool.remove(i);
60 break;
61 }
62 }
63
64 if (!iterator) {
65 UErrorCode openStatus = U_ZERO_ERROR;
66 bool localeIsEmpty = locale.isEmpty();
67 iterator = icu::BreakIterator::createLineInstance(localeIsEmpty ? icu::Locale(currentTextBreakLocaleID()) : icu::Locale(locale.utf8().data()), openStatus);
68 // locale comes from a web page and it can be invalid, leading ICU
69 // to fail, in which case we fall back to the default locale.
70 if (!localeIsEmpty && U_FAILURE(openStatus)) {
71 openStatus = U_ZERO_ERROR;
72 iterator = icu::BreakIterator::createLineInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
73 }
74
75 if (U_FAILURE(openStatus)) {
76 WTF_LOG_ERROR("icu::BreakIterator construction failed with status %d", openStatus);
77 return 0;
78 }
79 }
80
81 ASSERT(!m_vendedIterators.contains(iterator));
82 m_vendedIterators.set(iterator, locale);
83 return iterator;
84 }
85
put(icu::BreakIterator * iterator)86 void put(icu::BreakIterator* iterator)
87 {
88 ASSERT_ARG(iterator, m_vendedIterators.contains(iterator));
89
90 if (m_pool.size() == capacity) {
91 delete(m_pool[0].second);
92 m_pool.remove(0);
93 }
94
95 m_pool.append(Entry(m_vendedIterators.take(iterator), iterator));
96 }
97
98 private:
LineBreakIteratorPool()99 LineBreakIteratorPool() { }
100
101 static const size_t capacity = 4;
102
103 typedef pair<AtomicString, icu::BreakIterator*> Entry;
104 typedef Vector<Entry, capacity> Pool;
105 Pool m_pool;
106 HashMap<icu::BreakIterator*, AtomicString> m_vendedIterators;
107
108 friend WTF::ThreadSpecific<LineBreakIteratorPool>::operator LineBreakIteratorPool*();
109 };
110
111 enum TextContext { NoContext, PriorContext, PrimaryContext };
112
113 const int textBufferCapacity = 16;
114
115 typedef struct {
116 UText text;
117 UChar buffer[textBufferCapacity];
118 } UTextWithBuffer;
119
// Pins |index| in place: negative values become 0, and non-negative values
// above |limit| become |limit|. Returns the adjusted index.
// Note that a negative index is set to 0 even when |limit| is below zero.
static inline int64_t textPinIndex(int64_t& index, int64_t limit)
{
    if (index < 0) {
        index = 0;
        return index;
    }
    if (index > limit)
        index = limit;
    return index;
}
128
textNativeLength(UText * text)129 static inline int64_t textNativeLength(UText* text)
130 {
131 return text->a + text->b;
132 }
133
134 // Relocate pointer from source into destination as required.
textFixPointer(const UText * source,UText * destination,const void * & pointer)135 static void textFixPointer(const UText* source, UText* destination, const void*& pointer)
136 {
137 if (pointer >= source->pExtra && pointer < static_cast<char*>(source->pExtra) + source->extraSize) {
138 // Pointer references source extra buffer.
139 pointer = static_cast<char*>(destination->pExtra) + (static_cast<const char*>(pointer) - static_cast<const char*>(source->pExtra));
140 } else if (pointer >= source && pointer < reinterpret_cast<const char*>(source) + source->sizeOfStruct) {
141 // Pointer references source text structure, but not source extra buffer.
142 pointer = reinterpret_cast<char*>(destination) + (static_cast<const char*>(pointer) - reinterpret_cast<const char*>(source));
143 }
144 }
145
textClone(UText * destination,const UText * source,UBool deep,UErrorCode * status)146 static UText* textClone(UText* destination, const UText* source, UBool deep, UErrorCode* status)
147 {
148 ASSERT_UNUSED(deep, !deep);
149 if (U_FAILURE(*status))
150 return 0;
151 int32_t extraSize = source->extraSize;
152 destination = utext_setup(destination, extraSize, status);
153 if (U_FAILURE(*status))
154 return destination;
155 void* extraNew = destination->pExtra;
156 int32_t flags = destination->flags;
157 int sizeToCopy = min(source->sizeOfStruct, destination->sizeOfStruct);
158 memcpy(destination, source, sizeToCopy);
159 destination->pExtra = extraNew;
160 destination->flags = flags;
161 memcpy(destination->pExtra, source->pExtra, extraSize);
162 textFixPointer(source, destination, destination->context);
163 textFixPointer(source, destination, destination->p);
164 textFixPointer(source, destination, destination->q);
165 ASSERT(!destination->r);
166 const void * chunkContents = static_cast<const void*>(destination->chunkContents);
167 textFixPointer(source, destination, chunkContents);
168 destination->chunkContents = static_cast<const UChar*>(chunkContents);
169 return destination;
170 }
171
textExtract(UText *,int64_t,int64_t,UChar *,int32_t,UErrorCode * errorCode)172 static int32_t textExtract(UText*, int64_t, int64_t, UChar*, int32_t, UErrorCode* errorCode)
173 {
174 // In the present context, this text provider is used only with ICU functions
175 // that do not perform an extract operation.
176 ASSERT_NOT_REACHED();
177 *errorCode = U_UNSUPPORTED_ERROR;
178 return 0;
179 }
180
textClose(UText * text)181 static void textClose(UText* text)
182 {
183 text->context = 0;
184 }
185
textGetContext(const UText * text,int64_t nativeIndex,UBool forward)186 static inline TextContext textGetContext(const UText* text, int64_t nativeIndex, UBool forward)
187 {
188 if (!text->b || nativeIndex > text->b)
189 return PrimaryContext;
190 if (nativeIndex == text->b)
191 return forward ? PrimaryContext : PriorContext;
192 return PriorContext;
193 }
194
textLatin1GetCurrentContext(const UText * text)195 static inline TextContext textLatin1GetCurrentContext(const UText* text)
196 {
197 if (!text->chunkContents)
198 return NoContext;
199 return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext;
200 }
201
textLatin1MoveInPrimaryContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)202 static void textLatin1MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
203 {
204 ASSERT(text->chunkContents == text->pExtra);
205 if (forward) {
206 ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength);
207 text->chunkNativeStart = nativeIndex;
208 text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar);
209 if (text->chunkNativeLimit > nativeLength)
210 text->chunkNativeLimit = nativeLength;
211 } else {
212 ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength);
213 text->chunkNativeLimit = nativeIndex;
214 text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar);
215 if (text->chunkNativeStart < text->b)
216 text->chunkNativeStart = text->b;
217 }
218 int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
219 // Ensure chunk length is well defined if computed length exceeds int32_t range.
220 ASSERT(length <= numeric_limits<int32_t>::max());
221 text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
222 text->nativeIndexingLimit = text->chunkLength;
223 text->chunkOffset = forward ? 0 : text->chunkLength;
224 StringImpl::copyChars(const_cast<UChar*>(text->chunkContents), static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), static_cast<unsigned>(text->chunkLength));
225 }
226
textLatin1SwitchToPrimaryContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)227 static void textLatin1SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
228 {
229 ASSERT(!text->chunkContents || text->chunkContents == text->q);
230 text->chunkContents = static_cast<const UChar*>(text->pExtra);
231 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
232 }
233
textLatin1MoveInPriorContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)234 static void textLatin1MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
235 {
236 ASSERT(text->chunkContents == text->q);
237 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
238 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
239 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
240 text->chunkNativeStart = 0;
241 text->chunkNativeLimit = text->b;
242 text->chunkLength = text->b;
243 text->nativeIndexingLimit = text->chunkLength;
244 int64_t offset = nativeIndex - text->chunkNativeStart;
245 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
246 ASSERT(offset <= numeric_limits<int32_t>::max());
247 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
248 }
249
textLatin1SwitchToPriorContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)250 static void textLatin1SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
251 {
252 ASSERT(!text->chunkContents || text->chunkContents == text->pExtra);
253 text->chunkContents = static_cast<const UChar*>(text->q);
254 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
255 }
256
textInChunkOrOutOfRange(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward,UBool & isAccessible)257 static inline bool textInChunkOrOutOfRange(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward, UBool& isAccessible)
258 {
259 if (forward) {
260 if (nativeIndex >= text->chunkNativeStart && nativeIndex < text->chunkNativeLimit) {
261 int64_t offset = nativeIndex - text->chunkNativeStart;
262 // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
263 ASSERT(offset <= numeric_limits<int32_t>::max());
264 text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
265 isAccessible = TRUE;
266 return true;
267 }
268 if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) {
269 text->chunkOffset = text->chunkLength;
270 isAccessible = FALSE;
271 return true;
272 }
273 } else {
274 if (nativeIndex > text->chunkNativeStart && nativeIndex <= text->chunkNativeLimit) {
275 int64_t offset = nativeIndex - text->chunkNativeStart;
276 // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
277 ASSERT(offset <= numeric_limits<int32_t>::max());
278 text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
279 isAccessible = TRUE;
280 return true;
281 }
282 if (nativeIndex <= 0 && !text->chunkNativeStart) {
283 text->chunkOffset = 0;
284 isAccessible = FALSE;
285 return true;
286 }
287 }
288 return false;
289 }
290
textLatin1Access(UText * text,int64_t nativeIndex,UBool forward)291 static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward)
292 {
293 if (!text->context)
294 return FALSE;
295 int64_t nativeLength = textNativeLength(text);
296 UBool isAccessible;
297 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
298 return isAccessible;
299 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
300 TextContext currentContext = textLatin1GetCurrentContext(text);
301 TextContext newContext = textGetContext(text, nativeIndex, forward);
302 ASSERT(newContext != NoContext);
303 if (newContext == currentContext) {
304 if (currentContext == PrimaryContext) {
305 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
306 } else {
307 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
308 }
309 } else if (newContext == PrimaryContext) {
310 textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
311 } else {
312 ASSERT(newContext == PriorContext);
313 textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
314 }
315 return TRUE;
316 }
317
318 static const struct UTextFuncs textLatin1Funcs = {
319 sizeof(UTextFuncs),
320 0, 0, 0,
321 textClone,
322 textNativeLength,
323 textLatin1Access,
324 textExtract,
325 0, 0, 0, 0,
326 textClose,
327 0, 0, 0,
328 };
329
textInit(UText * text,const UTextFuncs * funcs,const void * string,unsigned length,const UChar * priorContext,int priorContextLength)330 static void textInit(UText* text, const UTextFuncs* funcs, const void* string, unsigned length, const UChar* priorContext, int priorContextLength)
331 {
332 text->pFuncs = funcs;
333 text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS;
334 text->context = string;
335 text->p = string;
336 text->a = length;
337 text->q = priorContext;
338 text->b = priorContextLength;
339 }
340
textOpenLatin1(UTextWithBuffer * utWithBuffer,const LChar * string,unsigned length,const UChar * priorContext,int priorContextLength,UErrorCode * status)341 static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, const LChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
342 {
343 if (U_FAILURE(*status))
344 return 0;
345
346 if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
347 *status = U_ILLEGAL_ARGUMENT_ERROR;
348 return 0;
349 }
350 UText* text = utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status);
351 if (U_FAILURE(*status)) {
352 ASSERT(!text);
353 return 0;
354 }
355 textInit(text, &textLatin1Funcs, string, length, priorContext, priorContextLength);
356 return text;
357 }
358
textUTF16GetCurrentContext(const UText * text)359 static inline TextContext textUTF16GetCurrentContext(const UText* text)
360 {
361 if (!text->chunkContents)
362 return NoContext;
363 return text->chunkContents == text->p ? PrimaryContext : PriorContext;
364 }
365
textUTF16MoveInPrimaryContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)366 static void textUTF16MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
367 {
368 ASSERT(text->chunkContents == text->p);
369 ASSERT_UNUSED(forward, forward ? nativeIndex >= text->b : nativeIndex > text->b);
370 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
371 text->chunkNativeStart = text->b;
372 text->chunkNativeLimit = nativeLength;
373 int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
374 // Ensure chunk length is well defined if computed length exceeds int32_t range.
375 ASSERT(length <= numeric_limits<int32_t>::max());
376 text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
377 text->nativeIndexingLimit = text->chunkLength;
378 int64_t offset = nativeIndex - text->chunkNativeStart;
379 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
380 ASSERT(offset <= numeric_limits<int32_t>::max());
381 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
382 }
383
textUTF16SwitchToPrimaryContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)384 static void textUTF16SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
385 {
386 ASSERT(!text->chunkContents || text->chunkContents == text->q);
387 text->chunkContents = static_cast<const UChar*>(text->p);
388 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
389 }
390
textUTF16MoveInPriorContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)391 static void textUTF16MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
392 {
393 ASSERT(text->chunkContents == text->q);
394 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
395 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
396 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
397 text->chunkNativeStart = 0;
398 text->chunkNativeLimit = text->b;
399 text->chunkLength = text->b;
400 text->nativeIndexingLimit = text->chunkLength;
401 int64_t offset = nativeIndex - text->chunkNativeStart;
402 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
403 ASSERT(offset <= numeric_limits<int32_t>::max());
404 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
405 }
406
textUTF16SwitchToPriorContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)407 static void textUTF16SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
408 {
409 ASSERT(!text->chunkContents || text->chunkContents == text->p);
410 text->chunkContents = static_cast<const UChar*>(text->q);
411 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
412 }
413
textUTF16Access(UText * text,int64_t nativeIndex,UBool forward)414 static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward)
415 {
416 if (!text->context)
417 return FALSE;
418 int64_t nativeLength = textNativeLength(text);
419 UBool isAccessible;
420 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
421 return isAccessible;
422 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
423 TextContext currentContext = textUTF16GetCurrentContext(text);
424 TextContext newContext = textGetContext(text, nativeIndex, forward);
425 ASSERT(newContext != NoContext);
426 if (newContext == currentContext) {
427 if (currentContext == PrimaryContext) {
428 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
429 } else {
430 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
431 }
432 } else if (newContext == PrimaryContext) {
433 textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
434 } else {
435 ASSERT(newContext == PriorContext);
436 textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
437 }
438 return TRUE;
439 }
440
441 static const struct UTextFuncs textUTF16Funcs = {
442 sizeof(UTextFuncs),
443 0, 0, 0,
444 textClone,
445 textNativeLength,
446 textUTF16Access,
447 textExtract,
448 0, 0, 0, 0,
449 textClose,
450 0, 0, 0,
451 };
452
textOpenUTF16(UText * text,const UChar * string,unsigned length,const UChar * priorContext,int priorContextLength,UErrorCode * status)453 static UText* textOpenUTF16(UText* text, const UChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
454 {
455 if (U_FAILURE(*status))
456 return 0;
457
458 if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
459 *status = U_ILLEGAL_ARGUMENT_ERROR;
460 return 0;
461 }
462
463 text = utext_setup(text, 0, status);
464 if (U_FAILURE(*status)) {
465 ASSERT(!text);
466 return 0;
467 }
468 textInit(text, &textUTF16Funcs, string, length, priorContext, priorContextLength);
469 return text;
470 }
471
472 static UText emptyText = UTEXT_INITIALIZER;
473
wordBreakIterator(const LChar * string,int length)474 static TextBreakIterator* wordBreakIterator(const LChar* string, int length)
475 {
476 UErrorCode errorCode = U_ZERO_ERROR;
477 static TextBreakIterator* breakIter = 0;
478 if (!breakIter) {
479 breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
480 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
481 if (!breakIter)
482 return 0;
483 }
484
485 UTextWithBuffer textLocal;
486 textLocal.text = emptyText;
487 textLocal.text.extraSize = sizeof(textLocal.buffer);
488 textLocal.text.pExtra = textLocal.buffer;
489
490 UErrorCode openStatus = U_ZERO_ERROR;
491 UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus);
492 if (U_FAILURE(openStatus)) {
493 WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
494 return 0;
495 }
496
497 UErrorCode setTextStatus = U_ZERO_ERROR;
498 breakIter->setText(text, setTextStatus);
499 if (U_FAILURE(setTextStatus))
500 WTF_LOG_ERROR("BreakIterator::seText failed with status %d", setTextStatus);
501
502 utext_close(text);
503
504 return breakIter;
505 }
506
setText16(TextBreakIterator * iter,const UChar * string,int length)507 static void setText16(TextBreakIterator* iter, const UChar* string, int length)
508 {
509 UErrorCode errorCode = U_ZERO_ERROR;
510 UText uText = UTEXT_INITIALIZER;
511 utext_openUChars(&uText, string, length, &errorCode);
512 if (U_FAILURE(errorCode))
513 return;
514 iter->setText(&uText, errorCode);
515 }
516
wordBreakIterator(const UChar * string,int length)517 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
518 {
519 UErrorCode errorCode = U_ZERO_ERROR;
520 static TextBreakIterator* breakIter = 0;
521 if (!breakIter) {
522 breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
523 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
524 if (!breakIter)
525 return 0;
526 }
527 setText16(breakIter, string, length);
528 return breakIter;
529 }
530
wordBreakIterator(const String & string,int start,int length)531 TextBreakIterator* wordBreakIterator(const String& string, int start, int length)
532 {
533 if (string.isEmpty())
534 return 0;
535 if (string.is8Bit())
536 return wordBreakIterator(string.characters8() + start, length);
537 return wordBreakIterator(string.characters16() + start, length);
538 }
539
acquireLineBreakIterator(const LChar * string,int length,const AtomicString & locale,const UChar * priorContext,unsigned priorContextLength)540 TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
541 {
542 TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
543 if (!iterator)
544 return 0;
545
546 UTextWithBuffer textLocal;
547 textLocal.text = emptyText;
548 textLocal.text.extraSize = sizeof(textLocal.buffer);
549 textLocal.text.pExtra = textLocal.buffer;
550
551 UErrorCode openStatus = U_ZERO_ERROR;
552 UText* text = textOpenLatin1(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
553 if (U_FAILURE(openStatus)) {
554 WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
555 return 0;
556 }
557
558 UErrorCode setTextStatus = U_ZERO_ERROR;
559 iterator->setText(text, setTextStatus);
560 if (U_FAILURE(setTextStatus)) {
561 WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
562 return 0;
563 }
564
565 utext_close(text);
566
567 return iterator;
568 }
569
acquireLineBreakIterator(const UChar * string,int length,const AtomicString & locale,const UChar * priorContext,unsigned priorContextLength)570 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
571 {
572 TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
573 if (!iterator)
574 return 0;
575
576 UText textLocal = UTEXT_INITIALIZER;
577
578 UErrorCode openStatus = U_ZERO_ERROR;
579 UText* text = textOpenUTF16(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
580 if (U_FAILURE(openStatus)) {
581 WTF_LOG_ERROR("textOpenUTF16 failed with status %d", openStatus);
582 return 0;
583 }
584
585 UErrorCode setTextStatus = U_ZERO_ERROR;
586 iterator->setText(text, setTextStatus);
587 if (U_FAILURE(setTextStatus)) {
588 WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
589 return 0;
590 }
591
592 utext_close(text);
593
594 return iterator;
595 }
596
releaseLineBreakIterator(TextBreakIterator * iterator)597 void releaseLineBreakIterator(TextBreakIterator* iterator)
598 {
599 ASSERT_ARG(iterator, iterator);
600
601 LineBreakIteratorPool::sharedPool().put(iterator);
602 }
603
604 static TextBreakIterator* nonSharedCharacterBreakIterator;
605
compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator * expected,TextBreakIterator * newValue)606 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
607 {
608 DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ());
609 MutexLocker locker(nonSharedCharacterBreakIteratorMutex);
610 if (nonSharedCharacterBreakIterator != expected)
611 return false;
612 nonSharedCharacterBreakIterator = newValue;
613 return true;
614 }
615
NonSharedCharacterBreakIterator(const String & string)616 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const String& string)
617 : m_is8Bit(true)
618 , m_charaters8(0)
619 , m_offset(0)
620 , m_length(0)
621 , m_iterator(0)
622 {
623 if (string.isEmpty())
624 return;
625
626 m_is8Bit = string.is8Bit();
627
628 if (m_is8Bit) {
629 m_charaters8 = string.characters8();
630 m_offset = 0;
631 m_length = string.length();
632 return;
633 }
634
635 createIteratorForBuffer(string.characters16(), string.length());
636 }
637
NonSharedCharacterBreakIterator(const UChar * buffer,unsigned length)638 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, unsigned length)
639 : m_is8Bit(false)
640 , m_charaters8(0)
641 , m_offset(0)
642 , m_length(0)
643 , m_iterator(0)
644 {
645 createIteratorForBuffer(buffer, length);
646 }
647
createIteratorForBuffer(const UChar * buffer,unsigned length)648 void NonSharedCharacterBreakIterator::createIteratorForBuffer(const UChar* buffer, unsigned length)
649 {
650 m_iterator = nonSharedCharacterBreakIterator;
651 bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
652 if (!createdIterator) {
653 UErrorCode errorCode = U_ZERO_ERROR;
654 m_iterator = icu::BreakIterator::createCharacterInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
655 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
656 }
657
658 setText16(m_iterator, buffer, length);
659 }
660
~NonSharedCharacterBreakIterator()661 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
662 {
663 if (m_is8Bit)
664 return;
665 if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
666 delete m_iterator;
667 }
668
next()669 int NonSharedCharacterBreakIterator::next()
670 {
671 if (!m_is8Bit)
672 return m_iterator->next();
673
674 if (m_offset >= m_length)
675 return TextBreakDone;
676
677 m_offset += clusterLengthStartingAt(m_offset);
678 return m_offset;
679 }
680
current()681 int NonSharedCharacterBreakIterator::current()
682 {
683 if (!m_is8Bit)
684 return m_iterator->current();
685 return m_offset;
686 }
687
isBreak(int offset) const688 bool NonSharedCharacterBreakIterator::isBreak(int offset) const
689 {
690 if (!m_is8Bit)
691 return m_iterator->isBoundary(offset);
692 return !isLFAfterCR(offset);
693 }
694
preceding(int offset) const695 int NonSharedCharacterBreakIterator::preceding(int offset) const
696 {
697 if (!m_is8Bit)
698 return m_iterator->preceding(offset);
699 if (offset <= 0)
700 return TextBreakDone;
701 if (isLFAfterCR(offset))
702 return offset - 2;
703 return offset - 1;
704 }
705
following(int offset) const706 int NonSharedCharacterBreakIterator::following(int offset) const
707 {
708 if (!m_is8Bit)
709 return m_iterator->following(offset);
710 if (static_cast<unsigned>(offset) >= m_length)
711 return TextBreakDone;
712 return offset + clusterLengthStartingAt(offset);
713 }
714
sentenceBreakIterator(const UChar * string,int length)715 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
716 {
717 UErrorCode openStatus = U_ZERO_ERROR;
718 static TextBreakIterator* iterator = 0;
719 if (!iterator) {
720 iterator = icu::BreakIterator::createSentenceInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
721 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
722 if (!iterator)
723 return 0;
724 }
725
726 setText16(iterator, string, length);
727 return iterator;
728 }
729
isWordTextBreak(TextBreakIterator * iterator)730 bool isWordTextBreak(TextBreakIterator* iterator)
731 {
732 icu::RuleBasedBreakIterator* ruleBasedBreakIterator = static_cast<icu::RuleBasedBreakIterator*>(iterator);
733 int ruleStatus = ruleBasedBreakIterator->getRuleStatus();
734 return ruleStatus != UBRK_WORD_NONE;
735 }
736
setUpIteratorWithRules(const char * breakRules,const UChar * string,int length)737 static TextBreakIterator* setUpIteratorWithRules(const char* breakRules, const UChar* string, int length)
738 {
739 if (!string)
740 return 0;
741
742 static TextBreakIterator* iterator = 0;
743 if (!iterator) {
744 UParseError parseStatus;
745 UErrorCode openStatus = U_ZERO_ERROR;
746 Vector<UChar> rules;
747 String(breakRules).appendTo(rules);
748
749 iterator = new icu::RuleBasedBreakIterator(icu::UnicodeString(rules.data(), rules.size()), parseStatus, openStatus);
750 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
751 if (!iterator)
752 return 0;
753 }
754
755 setText16(iterator, string, length);
756 return iterator;
757 }
758
cursorMovementIterator(const UChar * string,int length)759 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
760 {
761 // This rule set is based on character-break iterator rules of ICU 4.0
762 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
763 // The major differences from the original ones are listed below:
764 // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
765 // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
766 // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
767 // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
768 // * Added rules for regional indicator symbols.
769 static const char* const kRules =
770 "$CR = [\\p{Grapheme_Cluster_Break = CR}];"
771 "$LF = [\\p{Grapheme_Cluster_Break = LF}];"
772 "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
773 "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks
774 "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
775 "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
776 "$L = [\\p{Grapheme_Cluster_Break = L}];"
777 "$V = [\\p{Grapheme_Cluster_Break = V}];"
778 "$T = [\\p{Grapheme_Cluster_Break = T}];"
779 "$LV = [\\p{Grapheme_Cluster_Break = LV}];"
780 "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];"
781 "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha
782 "$HinV = \\u094D;" // Devanagari Sign Virama
783 "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha
784 "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha
785 "$BenV = \\u09CD;" // Bengali Sign Virama
786 "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha
787 "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha
788 "$PanV = \\u0A4D;" // Gurmukhi Sign Virama
789 "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha
790 "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha
791 "$GujV = \\u0ACD;" // Gujarati Sign Virama
792 "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha
793 "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha
794 "$OriV = \\u0B4D;" // Oriya Sign Virama
795 "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha
796 "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha
797 "$TelV = \\u0C4D;" // Telugu Sign Virama
798 "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha
799 "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha
800 "$KanV = \\u0CCD;" // Kannada Sign Virama
801 "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter A,...,Ha
802 "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha
803 "$MalV = \\u0D4D;" // Malayalam Sign Virama
804 "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter A,...,Ha
805 "$RI = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
806 "!!chain;"
807 "!!forward;"
808 "$CR $LF;"
809 "$L ($L | $V | $LV | $LVT);"
810 "($LV | $V) ($V | $T);"
811 "($LVT | $T) $T;"
812 "[^$Control $CR $LF] $Extend;"
813 "[^$Control $CR $LF] $SpacingMark;"
814 "$RI $RI / $RI;"
815 "$RI $RI;"
816 "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward)
817 "$Ben0 $BenV $Ben1;" // Bengali Virama (forward)
818 "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward)
819 "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward)
820 "$Ori0 $OriV $Ori1;" // Oriya Virama (forward)
821 "$Tel0 $TelV $Tel1;" // Telugu Virama (forward)
822 "$Kan0 $KanV $Kan1;" // Kannada Virama (forward)
823 "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward)
824 "!!reverse;"
825 "$LF $CR;"
826 "($L | $V | $LV | $LVT) $L;"
827 "($V | $T) ($LV | $V);"
828 "$T ($LVT | $T);"
829 "$Extend [^$Control $CR $LF];"
830 "$SpacingMark [^$Control $CR $LF];"
831 "$RI $RI / $RI $RI;"
832 "$RI $RI;"
833 "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward)
834 "$Ben1 $BenV $Ben0;" // Bengali Virama (backward)
835 "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward)
836 "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward)
837 "$Ori1 $OriV $Ori0;" // Gujarati Virama (backward)
838 "$Tel1 $TelV $Tel0;" // Telugu Virama (backward)
839 "$Kan1 $KanV $Kan0;" // Kannada Virama (backward)
840 "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward)
841 "!!safe_reverse;"
842 "!!safe_forward;";
843
844 return setUpIteratorWithRules(kRules, string, length);
845 }
846
847 }
848