1 /*
2 * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
4 *
5 * This library is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU Library General Public
7 * License as published by the Free Software Foundation; either
8 * version 2 of the License, or (at your option) any later version.
9 *
10 * This library is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * Library General Public License for more details.
14 *
15 * You should have received a copy of the GNU Library General Public License
16 * along with this library; see the file COPYING.LIB. If not, write to
17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18 * Boston, MA 02110-1301, USA.
19 *
20 */
21
22 #include "config.h"
23 #include "platform/text/TextBreakIterator.h"
24
25 #include "platform/text/TextBreakIteratorInternalICU.h"
26 #include "wtf/Assertions.h"
27 #include "wtf/HashMap.h"
28 #include "wtf/PassOwnPtr.h"
29 #include "wtf/ThreadSpecific.h"
30 #include "wtf/ThreadingPrimitives.h"
31 #include "wtf/text/AtomicString.h"
32 #include "wtf/text/CString.h"
33 #include "wtf/text/WTFString.h"
34 #include <unicode/ubrk.h>
35
36 using namespace WTF;
37 using namespace std;
38
39 namespace WebCore {
40
41 class LineBreakIteratorPool {
42 WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool);
43 public:
sharedPool()44 static LineBreakIteratorPool& sharedPool()
45 {
46 static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = new WTF::ThreadSpecific<LineBreakIteratorPool>;
47 return **pool;
48 }
49
create()50 static PassOwnPtr<LineBreakIteratorPool> create() { return adoptPtr(new LineBreakIteratorPool); }
51
take(const AtomicString & locale)52 icu::BreakIterator* take(const AtomicString& locale)
53 {
54 icu::BreakIterator* iterator = 0;
55 for (size_t i = 0; i < m_pool.size(); ++i) {
56 if (m_pool[i].first == locale) {
57 iterator = m_pool[i].second;
58 m_pool.remove(i);
59 break;
60 }
61 }
62
63 if (!iterator) {
64 UErrorCode openStatus = U_ZERO_ERROR;
65 bool localeIsEmpty = locale.isEmpty();
66 iterator = icu::BreakIterator::createLineInstance(localeIsEmpty ? icu::Locale(currentTextBreakLocaleID()) : icu::Locale(locale.string().utf8().data()), openStatus);
67 // locale comes from a web page and it can be invalid, leading ICU
68 // to fail, in which case we fall back to the default locale.
69 if (!localeIsEmpty && U_FAILURE(openStatus)) {
70 openStatus = U_ZERO_ERROR;
71 iterator = icu::BreakIterator::createLineInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
72 }
73
74 if (U_FAILURE(openStatus)) {
75 WTF_LOG_ERROR("icu::BreakIterator construction failed with status %d", openStatus);
76 return 0;
77 }
78 }
79
80 ASSERT(!m_vendedIterators.contains(iterator));
81 m_vendedIterators.set(iterator, locale);
82 return iterator;
83 }
84
put(icu::BreakIterator * iterator)85 void put(icu::BreakIterator* iterator)
86 {
87 ASSERT_ARG(iterator, m_vendedIterators.contains(iterator));
88
89 if (m_pool.size() == capacity) {
90 delete(m_pool[0].second);
91 m_pool.remove(0);
92 }
93
94 m_pool.append(Entry(m_vendedIterators.take(iterator), iterator));
95 }
96
97 private:
LineBreakIteratorPool()98 LineBreakIteratorPool() { }
99
100 static const size_t capacity = 4;
101
102 typedef pair<AtomicString, icu::BreakIterator*> Entry;
103 typedef Vector<Entry, capacity> Pool;
104 Pool m_pool;
105 HashMap<icu::BreakIterator*, AtomicString> m_vendedIterators;
106
107 friend WTF::ThreadSpecific<LineBreakIteratorPool>::operator LineBreakIteratorPool*();
108 };
109
// Identifies which backing string a UText chunk currently exposes: none yet,
// the prior-context string, or the primary string.
enum TextContext { NoContext, PriorContext, PrimaryContext };

// Number of UChars in the inline scratch buffer of UTextWithBuffer.
const int textBufferCapacity = 16;

// A UText bundled with a small UChar buffer. Callers in this file point the
// UText's pExtra at |buffer| before opening a Latin-1 text.
typedef struct {
    UText text;
    UChar buffer[textBufferCapacity];
} UTextWithBuffer;
118
// Pins |index| in place: negative values become 0, otherwise values above
// |limit| become |limit|. Returns the pinned value. Note that a non-negative
// index is still lowered to |limit| when |limit| itself is negative.
static inline int64_t textPinIndex(int64_t& index, int64_t limit)
{
    if (index < 0) {
        index = 0;
        return index;
    }
    if (index > limit)
        index = limit;
    return index;
}
127
textNativeLength(UText * text)128 static inline int64_t textNativeLength(UText* text)
129 {
130 return text->a + text->b;
131 }
132
133 // Relocate pointer from source into destination as required.
textFixPointer(const UText * source,UText * destination,const void * & pointer)134 static void textFixPointer(const UText* source, UText* destination, const void*& pointer)
135 {
136 if (pointer >= source->pExtra && pointer < static_cast<char*>(source->pExtra) + source->extraSize) {
137 // Pointer references source extra buffer.
138 pointer = static_cast<char*>(destination->pExtra) + (static_cast<const char*>(pointer) - static_cast<const char*>(source->pExtra));
139 } else if (pointer >= source && pointer < reinterpret_cast<const char*>(source) + source->sizeOfStruct) {
140 // Pointer references source text structure, but not source extra buffer.
141 pointer = reinterpret_cast<char*>(destination) + (static_cast<const char*>(pointer) - reinterpret_cast<const char*>(source));
142 }
143 }
144
textClone(UText * destination,const UText * source,UBool deep,UErrorCode * status)145 static UText* textClone(UText* destination, const UText* source, UBool deep, UErrorCode* status)
146 {
147 ASSERT_UNUSED(deep, !deep);
148 if (U_FAILURE(*status))
149 return 0;
150 int32_t extraSize = source->extraSize;
151 destination = utext_setup(destination, extraSize, status);
152 if (U_FAILURE(*status))
153 return destination;
154 void* extraNew = destination->pExtra;
155 int32_t flags = destination->flags;
156 int sizeToCopy = min(source->sizeOfStruct, destination->sizeOfStruct);
157 memcpy(destination, source, sizeToCopy);
158 destination->pExtra = extraNew;
159 destination->flags = flags;
160 memcpy(destination->pExtra, source->pExtra, extraSize);
161 textFixPointer(source, destination, destination->context);
162 textFixPointer(source, destination, destination->p);
163 textFixPointer(source, destination, destination->q);
164 ASSERT(!destination->r);
165 const void * chunkContents = static_cast<const void*>(destination->chunkContents);
166 textFixPointer(source, destination, chunkContents);
167 destination->chunkContents = static_cast<const UChar*>(chunkContents);
168 return destination;
169 }
170
textExtract(UText *,int64_t,int64_t,UChar *,int32_t,UErrorCode * errorCode)171 static int32_t textExtract(UText*, int64_t, int64_t, UChar*, int32_t, UErrorCode* errorCode)
172 {
173 // In the present context, this text provider is used only with ICU functions
174 // that do not perform an extract operation.
175 ASSERT_NOT_REACHED();
176 *errorCode = U_UNSUPPORTED_ERROR;
177 return 0;
178 }
179
textClose(UText * text)180 static void textClose(UText* text)
181 {
182 text->context = 0;
183 }
184
textGetContext(const UText * text,int64_t nativeIndex,UBool forward)185 static inline TextContext textGetContext(const UText* text, int64_t nativeIndex, UBool forward)
186 {
187 if (!text->b || nativeIndex > text->b)
188 return PrimaryContext;
189 if (nativeIndex == text->b)
190 return forward ? PrimaryContext : PriorContext;
191 return PriorContext;
192 }
193
textLatin1GetCurrentContext(const UText * text)194 static inline TextContext textLatin1GetCurrentContext(const UText* text)
195 {
196 if (!text->chunkContents)
197 return NoContext;
198 return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext;
199 }
200
textLatin1MoveInPrimaryContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)201 static void textLatin1MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
202 {
203 ASSERT(text->chunkContents == text->pExtra);
204 if (forward) {
205 ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength);
206 text->chunkNativeStart = nativeIndex;
207 text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar);
208 if (text->chunkNativeLimit > nativeLength)
209 text->chunkNativeLimit = nativeLength;
210 } else {
211 ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength);
212 text->chunkNativeLimit = nativeIndex;
213 text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar);
214 if (text->chunkNativeStart < text->b)
215 text->chunkNativeStart = text->b;
216 }
217 int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
218 // Ensure chunk length is well defined if computed length exceeds int32_t range.
219 ASSERT(length <= numeric_limits<int32_t>::max());
220 text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
221 text->nativeIndexingLimit = text->chunkLength;
222 text->chunkOffset = forward ? 0 : text->chunkLength;
223 StringImpl::copyChars(const_cast<UChar*>(text->chunkContents), static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), static_cast<unsigned>(text->chunkLength));
224 }
225
textLatin1SwitchToPrimaryContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)226 static void textLatin1SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
227 {
228 ASSERT(!text->chunkContents || text->chunkContents == text->q);
229 text->chunkContents = static_cast<const UChar*>(text->pExtra);
230 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
231 }
232
textLatin1MoveInPriorContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)233 static void textLatin1MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
234 {
235 ASSERT(text->chunkContents == text->q);
236 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
237 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
238 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
239 text->chunkNativeStart = 0;
240 text->chunkNativeLimit = text->b;
241 text->chunkLength = text->b;
242 text->nativeIndexingLimit = text->chunkLength;
243 int64_t offset = nativeIndex - text->chunkNativeStart;
244 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
245 ASSERT(offset <= numeric_limits<int32_t>::max());
246 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
247 }
248
textLatin1SwitchToPriorContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)249 static void textLatin1SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
250 {
251 ASSERT(!text->chunkContents || text->chunkContents == text->pExtra);
252 text->chunkContents = static_cast<const UChar*>(text->q);
253 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
254 }
255
textInChunkOrOutOfRange(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward,UBool & isAccessible)256 static inline bool textInChunkOrOutOfRange(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward, UBool& isAccessible)
257 {
258 if (forward) {
259 if (nativeIndex >= text->chunkNativeStart && nativeIndex < text->chunkNativeLimit) {
260 int64_t offset = nativeIndex - text->chunkNativeStart;
261 // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
262 ASSERT(offset <= numeric_limits<int32_t>::max());
263 text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
264 isAccessible = TRUE;
265 return true;
266 }
267 if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) {
268 text->chunkOffset = text->chunkLength;
269 isAccessible = FALSE;
270 return true;
271 }
272 } else {
273 if (nativeIndex > text->chunkNativeStart && nativeIndex <= text->chunkNativeLimit) {
274 int64_t offset = nativeIndex - text->chunkNativeStart;
275 // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
276 ASSERT(offset <= numeric_limits<int32_t>::max());
277 text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
278 isAccessible = TRUE;
279 return true;
280 }
281 if (nativeIndex <= 0 && !text->chunkNativeStart) {
282 text->chunkOffset = 0;
283 isAccessible = FALSE;
284 return true;
285 }
286 }
287 return false;
288 }
289
textLatin1Access(UText * text,int64_t nativeIndex,UBool forward)290 static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward)
291 {
292 if (!text->context)
293 return FALSE;
294 int64_t nativeLength = textNativeLength(text);
295 UBool isAccessible;
296 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
297 return isAccessible;
298 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
299 TextContext currentContext = textLatin1GetCurrentContext(text);
300 TextContext newContext = textGetContext(text, nativeIndex, forward);
301 ASSERT(newContext != NoContext);
302 if (newContext == currentContext) {
303 if (currentContext == PrimaryContext) {
304 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
305 } else {
306 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
307 }
308 } else if (newContext == PrimaryContext) {
309 textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
310 } else {
311 ASSERT(newContext == PriorContext);
312 textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
313 }
314 return TRUE;
315 }
316
// UText provider callback table for Latin-1 text with an optional UTF-16
// prior context. Entries left as 0 are not supplied by this provider; the
// slot names in the trailing comments follow ICU's UTextFuncs layout.
static const struct UTextFuncs textLatin1Funcs = {
    sizeof(UTextFuncs), // tableSize
    0, 0, 0, // reserved
    textClone,
    textNativeLength,
    textLatin1Access,
    textExtract,
    0, 0, 0, 0, // replace, copy, mapOffsetToNative, mapNativeIndexToUTF16
    textClose,
    0, 0, 0, // spare
};
328
textInit(UText * text,const UTextFuncs * funcs,const void * string,unsigned length,const UChar * priorContext,int priorContextLength)329 static void textInit(UText* text, const UTextFuncs* funcs, const void* string, unsigned length, const UChar* priorContext, int priorContextLength)
330 {
331 text->pFuncs = funcs;
332 text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS;
333 text->context = string;
334 text->p = string;
335 text->a = length;
336 text->q = priorContext;
337 text->b = priorContextLength;
338 }
339
textOpenLatin1(UTextWithBuffer * utWithBuffer,const LChar * string,unsigned length,const UChar * priorContext,int priorContextLength,UErrorCode * status)340 static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, const LChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
341 {
342 if (U_FAILURE(*status))
343 return 0;
344
345 if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
346 *status = U_ILLEGAL_ARGUMENT_ERROR;
347 return 0;
348 }
349 UText* text = utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status);
350 if (U_FAILURE(*status)) {
351 ASSERT(!text);
352 return 0;
353 }
354 textInit(text, &textLatin1Funcs, string, length, priorContext, priorContextLength);
355 return text;
356 }
357
textUTF16GetCurrentContext(const UText * text)358 static inline TextContext textUTF16GetCurrentContext(const UText* text)
359 {
360 if (!text->chunkContents)
361 return NoContext;
362 return text->chunkContents == text->p ? PrimaryContext : PriorContext;
363 }
364
textUTF16MoveInPrimaryContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)365 static void textUTF16MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
366 {
367 ASSERT(text->chunkContents == text->p);
368 ASSERT_UNUSED(forward, forward ? nativeIndex >= text->b : nativeIndex > text->b);
369 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
370 text->chunkNativeStart = text->b;
371 text->chunkNativeLimit = nativeLength;
372 int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
373 // Ensure chunk length is well defined if computed length exceeds int32_t range.
374 ASSERT(length <= numeric_limits<int32_t>::max());
375 text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
376 text->nativeIndexingLimit = text->chunkLength;
377 int64_t offset = nativeIndex - text->chunkNativeStart;
378 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
379 ASSERT(offset <= numeric_limits<int32_t>::max());
380 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
381 }
382
textUTF16SwitchToPrimaryContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)383 static void textUTF16SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
384 {
385 ASSERT(!text->chunkContents || text->chunkContents == text->q);
386 text->chunkContents = static_cast<const UChar*>(text->p);
387 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
388 }
389
textUTF16MoveInPriorContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)390 static void textUTF16MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
391 {
392 ASSERT(text->chunkContents == text->q);
393 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
394 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
395 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
396 text->chunkNativeStart = 0;
397 text->chunkNativeLimit = text->b;
398 text->chunkLength = text->b;
399 text->nativeIndexingLimit = text->chunkLength;
400 int64_t offset = nativeIndex - text->chunkNativeStart;
401 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
402 ASSERT(offset <= numeric_limits<int32_t>::max());
403 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
404 }
405
textUTF16SwitchToPriorContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)406 static void textUTF16SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
407 {
408 ASSERT(!text->chunkContents || text->chunkContents == text->p);
409 text->chunkContents = static_cast<const UChar*>(text->q);
410 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
411 }
412
textUTF16Access(UText * text,int64_t nativeIndex,UBool forward)413 static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward)
414 {
415 if (!text->context)
416 return FALSE;
417 int64_t nativeLength = textNativeLength(text);
418 UBool isAccessible;
419 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
420 return isAccessible;
421 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
422 TextContext currentContext = textUTF16GetCurrentContext(text);
423 TextContext newContext = textGetContext(text, nativeIndex, forward);
424 ASSERT(newContext != NoContext);
425 if (newContext == currentContext) {
426 if (currentContext == PrimaryContext) {
427 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
428 } else {
429 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
430 }
431 } else if (newContext == PrimaryContext) {
432 textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
433 } else {
434 ASSERT(newContext == PriorContext);
435 textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
436 }
437 return TRUE;
438 }
439
// UText provider callback table for UTF-16 text with an optional UTF-16
// prior context. Entries left as 0 are not supplied by this provider; the
// slot names in the trailing comments follow ICU's UTextFuncs layout.
static const struct UTextFuncs textUTF16Funcs = {
    sizeof(UTextFuncs), // tableSize
    0, 0, 0, // reserved
    textClone,
    textNativeLength,
    textUTF16Access,
    textExtract,
    0, 0, 0, 0, // replace, copy, mapOffsetToNative, mapNativeIndexToUTF16
    textClose,
    0, 0, 0, // spare
};
451
textOpenUTF16(UText * text,const UChar * string,unsigned length,const UChar * priorContext,int priorContextLength,UErrorCode * status)452 static UText* textOpenUTF16(UText* text, const UChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
453 {
454 if (U_FAILURE(*status))
455 return 0;
456
457 if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
458 *status = U_ILLEGAL_ARGUMENT_ERROR;
459 return 0;
460 }
461
462 text = utext_setup(text, 0, status);
463 if (U_FAILURE(*status)) {
464 ASSERT(!text);
465 return 0;
466 }
467 textInit(text, &textUTF16Funcs, string, length, priorContext, priorContextLength);
468 return text;
469 }
470
// Pristine UText value; copied into a stack-allocated UTextWithBuffer::text
// before each Latin-1 open in this file.
static UText emptyText = UTEXT_INITIALIZER;
472
wordBreakIterator(const LChar * string,int length)473 static TextBreakIterator* wordBreakIterator(const LChar* string, int length)
474 {
475 UErrorCode errorCode = U_ZERO_ERROR;
476 static TextBreakIterator* breakIter = 0;
477 if (!breakIter) {
478 breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
479 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
480 if (!breakIter)
481 return 0;
482 }
483
484 UTextWithBuffer textLocal;
485 textLocal.text = emptyText;
486 textLocal.text.extraSize = sizeof(textLocal.buffer);
487 textLocal.text.pExtra = textLocal.buffer;
488
489 UErrorCode openStatus = U_ZERO_ERROR;
490 UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus);
491 if (U_FAILURE(openStatus)) {
492 WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
493 return 0;
494 }
495
496 UErrorCode setTextStatus = U_ZERO_ERROR;
497 breakIter->setText(text, setTextStatus);
498 if (U_FAILURE(setTextStatus))
499 WTF_LOG_ERROR("BreakIterator::seText failed with status %d", setTextStatus);
500
501 utext_close(text);
502
503 return breakIter;
504 }
505
setText16(TextBreakIterator * iter,const UChar * string,int length)506 static void setText16(TextBreakIterator* iter, const UChar* string, int length)
507 {
508 UErrorCode errorCode = U_ZERO_ERROR;
509 UText uText = UTEXT_INITIALIZER;
510 utext_openUChars(&uText, string, length, &errorCode);
511 if (U_FAILURE(errorCode))
512 return;
513 iter->setText(&uText, errorCode);
514 }
515
wordBreakIterator(const UChar * string,int length)516 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
517 {
518 UErrorCode errorCode = U_ZERO_ERROR;
519 static TextBreakIterator* breakIter = 0;
520 if (!breakIter) {
521 breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
522 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
523 if (!breakIter)
524 return 0;
525 }
526 setText16(breakIter, string, length);
527 return breakIter;
528 }
529
wordBreakIterator(const String & string,int start,int length)530 TextBreakIterator* wordBreakIterator(const String& string, int start, int length)
531 {
532 if (string.isEmpty())
533 return 0;
534 if (string.is8Bit())
535 return wordBreakIterator(string.characters8() + start, length);
536 return wordBreakIterator(string.characters16() + start, length);
537 }
538
acquireLineBreakIterator(const LChar * string,int length,const AtomicString & locale,const UChar * priorContext,unsigned priorContextLength)539 TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
540 {
541 TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
542 if (!iterator)
543 return 0;
544
545 UTextWithBuffer textLocal;
546 textLocal.text = emptyText;
547 textLocal.text.extraSize = sizeof(textLocal.buffer);
548 textLocal.text.pExtra = textLocal.buffer;
549
550 UErrorCode openStatus = U_ZERO_ERROR;
551 UText* text = textOpenLatin1(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
552 if (U_FAILURE(openStatus)) {
553 WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
554 return 0;
555 }
556
557 UErrorCode setTextStatus = U_ZERO_ERROR;
558 iterator->setText(text, setTextStatus);
559 if (U_FAILURE(setTextStatus)) {
560 WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
561 return 0;
562 }
563
564 utext_close(text);
565
566 return iterator;
567 }
568
acquireLineBreakIterator(const UChar * string,int length,const AtomicString & locale,const UChar * priorContext,unsigned priorContextLength)569 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
570 {
571 TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
572 if (!iterator)
573 return 0;
574
575 UText textLocal = UTEXT_INITIALIZER;
576
577 UErrorCode openStatus = U_ZERO_ERROR;
578 UText* text = textOpenUTF16(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
579 if (U_FAILURE(openStatus)) {
580 WTF_LOG_ERROR("textOpenUTF16 failed with status %d", openStatus);
581 return 0;
582 }
583
584 UErrorCode setTextStatus = U_ZERO_ERROR;
585 iterator->setText(text, setTextStatus);
586 if (U_FAILURE(setTextStatus)) {
587 WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
588 return 0;
589 }
590
591 utext_close(text);
592
593 return iterator;
594 }
595
releaseLineBreakIterator(TextBreakIterator * iterator)596 void releaseLineBreakIterator(TextBreakIterator* iterator)
597 {
598 ASSERT_ARG(iterator, iterator);
599
600 LineBreakIteratorPool::sharedPool().put(iterator);
601 }
602
// File-level slot holding at most one cached character-break iterator for
// reuse. It is written only inside
// compareAndSwapNonSharedCharacterBreakIterator() (under that function's
// mutex); createIteratorForBuffer() also reads it directly.
static TextBreakIterator* nonSharedCharacterBreakIterator;
604
compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator * expected,TextBreakIterator * newValue)605 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
606 {
607 DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ());
608 MutexLocker locker(nonSharedCharacterBreakIteratorMutex);
609 if (nonSharedCharacterBreakIterator != expected)
610 return false;
611 nonSharedCharacterBreakIterator = newValue;
612 return true;
613 }
614
NonSharedCharacterBreakIterator(const String & string)615 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const String& string)
616 : m_is8Bit(true)
617 , m_charaters8(0)
618 , m_offset(0)
619 , m_length(0)
620 , m_iterator(0)
621 {
622 if (string.isEmpty())
623 return;
624
625 m_is8Bit = string.is8Bit();
626
627 if (m_is8Bit) {
628 m_charaters8 = string.characters8();
629 m_offset = 0;
630 m_length = string.length();
631 return;
632 }
633
634 createIteratorForBuffer(string.characters16(), string.length());
635 }
636
NonSharedCharacterBreakIterator(const UChar * buffer,unsigned length)637 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, unsigned length)
638 : m_is8Bit(false)
639 , m_charaters8(0)
640 , m_offset(0)
641 , m_length(0)
642 , m_iterator(0)
643 {
644 createIteratorForBuffer(buffer, length);
645 }
646
createIteratorForBuffer(const UChar * buffer,unsigned length)647 void NonSharedCharacterBreakIterator::createIteratorForBuffer(const UChar* buffer, unsigned length)
648 {
649 m_iterator = nonSharedCharacterBreakIterator;
650 bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
651 if (!createdIterator) {
652 UErrorCode errorCode = U_ZERO_ERROR;
653 m_iterator = icu::BreakIterator::createCharacterInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
654 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
655 }
656
657 setText16(m_iterator, buffer, length);
658 }
659
~NonSharedCharacterBreakIterator()660 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
661 {
662 if (m_is8Bit)
663 return;
664 if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
665 delete m_iterator;
666 }
667
next()668 int NonSharedCharacterBreakIterator::next()
669 {
670 if (!m_is8Bit)
671 return m_iterator->next();
672
673 if (m_offset >= m_length)
674 return TextBreakDone;
675
676 m_offset += clusterLengthStartingAt(m_offset);
677 return m_offset;
678 }
679
current()680 int NonSharedCharacterBreakIterator::current()
681 {
682 if (!m_is8Bit)
683 return m_iterator->current();
684 return m_offset;
685 }
686
isBreak(int offset) const687 bool NonSharedCharacterBreakIterator::isBreak(int offset) const
688 {
689 if (!m_is8Bit)
690 return m_iterator->isBoundary(offset);
691 return !isLFAfterCR(offset);
692 }
693
preceding(int offset) const694 int NonSharedCharacterBreakIterator::preceding(int offset) const
695 {
696 if (!m_is8Bit)
697 return m_iterator->preceding(offset);
698 if (offset <= 0)
699 return TextBreakDone;
700 if (isLFAfterCR(offset))
701 return offset - 2;
702 return offset - 1;
703 }
704
following(int offset) const705 int NonSharedCharacterBreakIterator::following(int offset) const
706 {
707 if (!m_is8Bit)
708 return m_iterator->following(offset);
709 if (static_cast<unsigned>(offset) >= m_length)
710 return TextBreakDone;
711 return offset + clusterLengthStartingAt(offset);
712 }
713
sentenceBreakIterator(const UChar * string,int length)714 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
715 {
716 UErrorCode openStatus = U_ZERO_ERROR;
717 static TextBreakIterator* iterator = 0;
718 if (!iterator) {
719 iterator = icu::BreakIterator::createSentenceInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
720 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
721 if (!iterator)
722 return 0;
723 }
724
725 setText16(iterator, string, length);
726 return iterator;
727 }
728
isWordTextBreak(TextBreakIterator * iterator)729 bool isWordTextBreak(TextBreakIterator* iterator)
730 {
731 icu::RuleBasedBreakIterator* ruleBasedBreakIterator = static_cast<icu::RuleBasedBreakIterator*>(iterator);
732 int ruleStatus = ruleBasedBreakIterator->getRuleStatus();
733 return ruleStatus != UBRK_WORD_NONE;
734 }
735
setUpIteratorWithRules(const char * breakRules,const UChar * string,int length)736 static TextBreakIterator* setUpIteratorWithRules(const char* breakRules, const UChar* string, int length)
737 {
738 if (!string)
739 return 0;
740
741 static TextBreakIterator* iterator = 0;
742 if (!iterator) {
743 UParseError parseStatus;
744 UErrorCode openStatus = U_ZERO_ERROR;
745 Vector<UChar> rules;
746 String(breakRules).appendTo(rules);
747
748 iterator = new icu::RuleBasedBreakIterator(icu::UnicodeString(rules.data(), rules.size()), parseStatus, openStatus);
749 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
750 if (!iterator)
751 return 0;
752 }
753
754 setText16(iterator, string, length);
755 return iterator;
756 }
757
// Returns the break iterator used for cursor movement, set to iterate over
// the UTF-16 |string| (|length| code units). The iterator is built by
// setUpIteratorWithRules() from the customized rule set below.
TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
{
    // This rule set is based on character-break iterator rules of ICU 4.0
    // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
    // The major differences from the original ones are listed below:
    // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
    // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
    // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
    // * Added rules that prevent a cursor from moving before Japanese half-width katakana voiced marks.
    // * Added rules for regional indicator symbols.
    static const char* const kRules =
        "$CR = [\\p{Grapheme_Cluster_Break = CR}];"
        "$LF = [\\p{Grapheme_Cluster_Break = LF}];"
        "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
        "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks
        "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
        "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
        "$L = [\\p{Grapheme_Cluster_Break = L}];"
        "$V = [\\p{Grapheme_Cluster_Break = V}];"
        "$T = [\\p{Grapheme_Cluster_Break = T}];"
        "$LV = [\\p{Grapheme_Cluster_Break = LV}];"
        "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];"
        "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha
        "$HinV = \\u094D;" // Devanagari Sign Virama
        "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha
        "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha
        "$BenV = \\u09CD;" // Bengali Sign Virama
        "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha
        "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha
        "$PanV = \\u0A4D;" // Gurmukhi Sign Virama
        "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha
        "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha
        "$GujV = \\u0ACD;" // Gujarati Sign Virama
        "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha
        "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha
        "$OriV = \\u0B4D;" // Oriya Sign Virama
        "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha
        "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha
        "$TelV = \\u0C4D;" // Telugu Sign Virama
        "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha. NOTE(review): range starts at U+0C14, one below the xx15 start used by the other scripts - confirm this is intended.
        "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha
        "$KanV = \\u0CCD;" // Kannada Sign Virama
        "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter Ka,...,Ha
        "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha
        "$MalV = \\u0D4D;" // Malayalam Sign Virama
        "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter Ka,...,Ha
        "$RI = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
        "!!chain;"
        "!!forward;"
        "$CR $LF;"
        "$L ($L | $V | $LV | $LVT);"
        "($LV | $V) ($V | $T);"
        "($LVT | $T) $T;"
        "[^$Control $CR $LF] $Extend;"
        "[^$Control $CR $LF] $SpacingMark;"
        "$RI $RI / $RI;"
        "$RI $RI;"
        "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward)
        "$Ben0 $BenV $Ben1;" // Bengali Virama (forward)
        "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward)
        "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward)
        "$Ori0 $OriV $Ori1;" // Oriya Virama (forward)
        "$Tel0 $TelV $Tel1;" // Telugu Virama (forward)
        "$Kan0 $KanV $Kan1;" // Kannada Virama (forward)
        "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward)
        "!!reverse;"
        "$LF $CR;"
        "($L | $V | $LV | $LVT) $L;"
        "($V | $T) ($LV | $V);"
        "$T ($LVT | $T);"
        "$Extend [^$Control $CR $LF];"
        "$SpacingMark [^$Control $CR $LF];"
        "$RI $RI / $RI $RI;"
        "$RI $RI;"
        "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward)
        "$Ben1 $BenV $Ben0;" // Bengali Virama (backward)
        "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward)
        "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward)
        "$Ori1 $OriV $Ori0;" // Oriya Virama (backward)
        "$Tel1 $TelV $Tel0;" // Telugu Virama (backward)
        "$Kan1 $KanV $Kan0;" // Kannada Virama (backward)
        "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward)
        "!!safe_reverse;"
        "!!safe_forward;";

    return setUpIteratorWithRules(kRules, string, length);
}
845
846 }
847