• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3  * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either
8  * version 2 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public License
16  * along with this library; see the file COPYING.LIB.  If not, write to
17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  * Boston, MA 02110-1301, USA.
19  *
20  */
21 
22 #include "config.h"
23 #include "platform/text/TextBreakIterator.h"
24 
25 #include "platform/text/TextBreakIteratorInternalICU.h"
26 #include "wtf/Assertions.h"
27 #include "wtf/HashMap.h"
28 #include "wtf/PassOwnPtr.h"
29 #include "wtf/ThreadSpecific.h"
30 #include "wtf/ThreadingPrimitives.h"
31 #include "wtf/text/AtomicString.h"
32 #include "wtf/text/CString.h"
33 #include "wtf/text/WTFString.h"
34 #include <unicode/ubrk.h>
35 
36 using namespace WTF;
37 using namespace std;
38 
39 namespace WebCore {
40 
41 class LineBreakIteratorPool {
42     WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool);
43 public:
sharedPool()44     static LineBreakIteratorPool& sharedPool()
45     {
46         static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = new WTF::ThreadSpecific<LineBreakIteratorPool>;
47         return **pool;
48     }
49 
create()50     static PassOwnPtr<LineBreakIteratorPool> create() { return adoptPtr(new LineBreakIteratorPool); }
51 
take(const AtomicString & locale)52     icu::BreakIterator* take(const AtomicString& locale)
53     {
54         icu::BreakIterator* iterator = 0;
55         for (size_t i = 0; i < m_pool.size(); ++i) {
56             if (m_pool[i].first == locale) {
57                 iterator = m_pool[i].second;
58                 m_pool.remove(i);
59                 break;
60             }
61         }
62 
63         if (!iterator) {
64             UErrorCode openStatus = U_ZERO_ERROR;
65             bool localeIsEmpty = locale.isEmpty();
66             iterator = icu::BreakIterator::createLineInstance(localeIsEmpty ? icu::Locale(currentTextBreakLocaleID()) : icu::Locale(locale.string().utf8().data()), openStatus);
67             // locale comes from a web page and it can be invalid, leading ICU
68             // to fail, in which case we fall back to the default locale.
69             if (!localeIsEmpty && U_FAILURE(openStatus)) {
70                 openStatus = U_ZERO_ERROR;
71                 iterator = icu::BreakIterator::createLineInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
72             }
73 
74             if (U_FAILURE(openStatus)) {
75                 WTF_LOG_ERROR("icu::BreakIterator construction failed with status %d", openStatus);
76                 return 0;
77             }
78         }
79 
80         ASSERT(!m_vendedIterators.contains(iterator));
81         m_vendedIterators.set(iterator, locale);
82         return iterator;
83     }
84 
put(icu::BreakIterator * iterator)85     void put(icu::BreakIterator* iterator)
86     {
87         ASSERT_ARG(iterator, m_vendedIterators.contains(iterator));
88 
89         if (m_pool.size() == capacity) {
90             delete(m_pool[0].second);
91             m_pool.remove(0);
92         }
93 
94         m_pool.append(Entry(m_vendedIterators.take(iterator), iterator));
95     }
96 
97 private:
LineBreakIteratorPool()98     LineBreakIteratorPool() { }
99 
100     static const size_t capacity = 4;
101 
102     typedef pair<AtomicString, icu::BreakIterator*> Entry;
103     typedef Vector<Entry, capacity> Pool;
104     Pool m_pool;
105     HashMap<icu::BreakIterator*, AtomicString> m_vendedIterators;
106 
107     friend WTF::ThreadSpecific<LineBreakIteratorPool>::operator LineBreakIteratorPool*();
108 };
109 
110 enum TextContext { NoContext, PriorContext, PrimaryContext };
111 
112 const int textBufferCapacity = 16;
113 
114 typedef struct {
115     UText text;
116     UChar buffer[textBufferCapacity];
117 } UTextWithBuffer;
118 
// Pins |index| in place: negative values become 0, otherwise values above
// |limit| become |limit|. Returns the updated |index|.
static inline int64_t textPinIndex(int64_t& index, int64_t limit)
{
    // The negative check takes precedence, so a negative index yields 0 even
    // when |limit| itself is negative.
    index = index < 0 ? 0 : (index > limit ? limit : index);
    return index;
}
127 
textNativeLength(UText * text)128 static inline int64_t textNativeLength(UText* text)
129 {
130     return text->a + text->b;
131 }
132 
133 // Relocate pointer from source into destination as required.
textFixPointer(const UText * source,UText * destination,const void * & pointer)134 static void textFixPointer(const UText* source, UText* destination, const void*& pointer)
135 {
136     if (pointer >= source->pExtra && pointer < static_cast<char*>(source->pExtra) + source->extraSize) {
137         // Pointer references source extra buffer.
138         pointer = static_cast<char*>(destination->pExtra) + (static_cast<const char*>(pointer) - static_cast<const char*>(source->pExtra));
139     } else if (pointer >= source && pointer < reinterpret_cast<const char*>(source) + source->sizeOfStruct) {
140         // Pointer references source text structure, but not source extra buffer.
141         pointer = reinterpret_cast<char*>(destination) + (static_cast<const char*>(pointer) - reinterpret_cast<const char*>(source));
142     }
143 }
144 
textClone(UText * destination,const UText * source,UBool deep,UErrorCode * status)145 static UText* textClone(UText* destination, const UText* source, UBool deep, UErrorCode* status)
146 {
147     ASSERT_UNUSED(deep, !deep);
148     if (U_FAILURE(*status))
149         return 0;
150     int32_t extraSize = source->extraSize;
151     destination = utext_setup(destination, extraSize, status);
152     if (U_FAILURE(*status))
153         return destination;
154     void* extraNew = destination->pExtra;
155     int32_t flags = destination->flags;
156     int sizeToCopy = min(source->sizeOfStruct, destination->sizeOfStruct);
157     memcpy(destination, source, sizeToCopy);
158     destination->pExtra = extraNew;
159     destination->flags = flags;
160     memcpy(destination->pExtra, source->pExtra, extraSize);
161     textFixPointer(source, destination, destination->context);
162     textFixPointer(source, destination, destination->p);
163     textFixPointer(source, destination, destination->q);
164     ASSERT(!destination->r);
165     const void * chunkContents = static_cast<const void*>(destination->chunkContents);
166     textFixPointer(source, destination, chunkContents);
167     destination->chunkContents = static_cast<const UChar*>(chunkContents);
168     return destination;
169 }
170 
textExtract(UText *,int64_t,int64_t,UChar *,int32_t,UErrorCode * errorCode)171 static int32_t textExtract(UText*, int64_t, int64_t, UChar*, int32_t, UErrorCode* errorCode)
172 {
173     // In the present context, this text provider is used only with ICU functions
174     // that do not perform an extract operation.
175     ASSERT_NOT_REACHED();
176     *errorCode = U_UNSUPPORTED_ERROR;
177     return 0;
178 }
179 
textClose(UText * text)180 static void textClose(UText* text)
181 {
182     text->context = 0;
183 }
184 
textGetContext(const UText * text,int64_t nativeIndex,UBool forward)185 static inline TextContext textGetContext(const UText* text, int64_t nativeIndex, UBool forward)
186 {
187     if (!text->b || nativeIndex > text->b)
188         return PrimaryContext;
189     if (nativeIndex == text->b)
190         return forward ? PrimaryContext : PriorContext;
191     return PriorContext;
192 }
193 
textLatin1GetCurrentContext(const UText * text)194 static inline TextContext textLatin1GetCurrentContext(const UText* text)
195 {
196     if (!text->chunkContents)
197         return NoContext;
198     return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext;
199 }
200 
textLatin1MoveInPrimaryContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)201 static void textLatin1MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
202 {
203     ASSERT(text->chunkContents == text->pExtra);
204     if (forward) {
205         ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength);
206         text->chunkNativeStart = nativeIndex;
207         text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar);
208         if (text->chunkNativeLimit > nativeLength)
209             text->chunkNativeLimit = nativeLength;
210     } else {
211         ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength);
212         text->chunkNativeLimit = nativeIndex;
213         text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar);
214         if (text->chunkNativeStart < text->b)
215             text->chunkNativeStart = text->b;
216     }
217     int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
218     // Ensure chunk length is well defined if computed length exceeds int32_t range.
219     ASSERT(length <= numeric_limits<int32_t>::max());
220     text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
221     text->nativeIndexingLimit = text->chunkLength;
222     text->chunkOffset = forward ? 0 : text->chunkLength;
223     StringImpl::copyChars(const_cast<UChar*>(text->chunkContents), static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), static_cast<unsigned>(text->chunkLength));
224 }
225 
textLatin1SwitchToPrimaryContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)226 static void textLatin1SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
227 {
228     ASSERT(!text->chunkContents || text->chunkContents == text->q);
229     text->chunkContents = static_cast<const UChar*>(text->pExtra);
230     textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
231 }
232 
textLatin1MoveInPriorContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)233 static void textLatin1MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
234 {
235     ASSERT(text->chunkContents == text->q);
236     ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
237     ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
238     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
239     text->chunkNativeStart = 0;
240     text->chunkNativeLimit = text->b;
241     text->chunkLength = text->b;
242     text->nativeIndexingLimit = text->chunkLength;
243     int64_t offset = nativeIndex - text->chunkNativeStart;
244     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
245     ASSERT(offset <= numeric_limits<int32_t>::max());
246     text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
247 }
248 
textLatin1SwitchToPriorContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)249 static void textLatin1SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
250 {
251     ASSERT(!text->chunkContents || text->chunkContents == text->pExtra);
252     text->chunkContents = static_cast<const UChar*>(text->q);
253     textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
254 }
255 
textInChunkOrOutOfRange(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward,UBool & isAccessible)256 static inline bool textInChunkOrOutOfRange(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward, UBool& isAccessible)
257 {
258     if (forward) {
259         if (nativeIndex >= text->chunkNativeStart && nativeIndex < text->chunkNativeLimit) {
260             int64_t offset = nativeIndex - text->chunkNativeStart;
261             // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
262             ASSERT(offset <= numeric_limits<int32_t>::max());
263             text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
264             isAccessible = TRUE;
265             return true;
266         }
267         if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) {
268             text->chunkOffset = text->chunkLength;
269             isAccessible = FALSE;
270             return true;
271         }
272     } else {
273         if (nativeIndex > text->chunkNativeStart && nativeIndex <= text->chunkNativeLimit) {
274             int64_t offset = nativeIndex - text->chunkNativeStart;
275             // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
276             ASSERT(offset <= numeric_limits<int32_t>::max());
277             text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
278             isAccessible = TRUE;
279             return true;
280         }
281         if (nativeIndex <= 0 && !text->chunkNativeStart) {
282             text->chunkOffset = 0;
283             isAccessible = FALSE;
284             return true;
285         }
286     }
287     return false;
288 }
289 
textLatin1Access(UText * text,int64_t nativeIndex,UBool forward)290 static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward)
291 {
292     if (!text->context)
293         return FALSE;
294     int64_t nativeLength = textNativeLength(text);
295     UBool isAccessible;
296     if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
297         return isAccessible;
298     nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
299     TextContext currentContext = textLatin1GetCurrentContext(text);
300     TextContext newContext = textGetContext(text, nativeIndex, forward);
301     ASSERT(newContext != NoContext);
302     if (newContext == currentContext) {
303         if (currentContext == PrimaryContext) {
304             textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
305         } else {
306             textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
307         }
308     } else if (newContext == PrimaryContext) {
309         textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
310     } else {
311         ASSERT(newContext == PriorContext);
312         textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
313     }
314     return TRUE;
315 }
316 
317 static const struct UTextFuncs textLatin1Funcs = {
318     sizeof(UTextFuncs),
319     0, 0, 0,
320     textClone,
321     textNativeLength,
322     textLatin1Access,
323     textExtract,
324     0, 0, 0, 0,
325     textClose,
326     0, 0, 0,
327 };
328 
textInit(UText * text,const UTextFuncs * funcs,const void * string,unsigned length,const UChar * priorContext,int priorContextLength)329 static void textInit(UText* text, const UTextFuncs* funcs, const void* string, unsigned length, const UChar* priorContext, int priorContextLength)
330 {
331     text->pFuncs = funcs;
332     text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS;
333     text->context = string;
334     text->p = string;
335     text->a = length;
336     text->q = priorContext;
337     text->b = priorContextLength;
338 }
339 
textOpenLatin1(UTextWithBuffer * utWithBuffer,const LChar * string,unsigned length,const UChar * priorContext,int priorContextLength,UErrorCode * status)340 static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, const LChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
341 {
342     if (U_FAILURE(*status))
343         return 0;
344 
345     if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
346         *status = U_ILLEGAL_ARGUMENT_ERROR;
347         return 0;
348     }
349     UText* text = utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status);
350     if (U_FAILURE(*status)) {
351         ASSERT(!text);
352         return 0;
353     }
354     textInit(text, &textLatin1Funcs, string, length, priorContext, priorContextLength);
355     return text;
356 }
357 
textUTF16GetCurrentContext(const UText * text)358 static inline TextContext textUTF16GetCurrentContext(const UText* text)
359 {
360     if (!text->chunkContents)
361         return NoContext;
362     return text->chunkContents == text->p ? PrimaryContext : PriorContext;
363 }
364 
textUTF16MoveInPrimaryContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)365 static void textUTF16MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
366 {
367     ASSERT(text->chunkContents == text->p);
368     ASSERT_UNUSED(forward, forward ? nativeIndex >= text->b : nativeIndex > text->b);
369     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
370     text->chunkNativeStart = text->b;
371     text->chunkNativeLimit = nativeLength;
372     int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
373     // Ensure chunk length is well defined if computed length exceeds int32_t range.
374     ASSERT(length <= numeric_limits<int32_t>::max());
375     text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
376     text->nativeIndexingLimit = text->chunkLength;
377     int64_t offset = nativeIndex - text->chunkNativeStart;
378     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
379     ASSERT(offset <= numeric_limits<int32_t>::max());
380     text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
381 }
382 
textUTF16SwitchToPrimaryContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)383 static void textUTF16SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
384 {
385     ASSERT(!text->chunkContents || text->chunkContents == text->q);
386     text->chunkContents = static_cast<const UChar*>(text->p);
387     textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
388 }
389 
textUTF16MoveInPriorContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)390 static void textUTF16MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
391 {
392     ASSERT(text->chunkContents == text->q);
393     ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
394     ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
395     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
396     text->chunkNativeStart = 0;
397     text->chunkNativeLimit = text->b;
398     text->chunkLength = text->b;
399     text->nativeIndexingLimit = text->chunkLength;
400     int64_t offset = nativeIndex - text->chunkNativeStart;
401     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
402     ASSERT(offset <= numeric_limits<int32_t>::max());
403     text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
404 }
405 
textUTF16SwitchToPriorContext(UText * text,int64_t nativeIndex,int64_t nativeLength,UBool forward)406 static void textUTF16SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
407 {
408     ASSERT(!text->chunkContents || text->chunkContents == text->p);
409     text->chunkContents = static_cast<const UChar*>(text->q);
410     textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
411 }
412 
textUTF16Access(UText * text,int64_t nativeIndex,UBool forward)413 static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward)
414 {
415     if (!text->context)
416         return FALSE;
417     int64_t nativeLength = textNativeLength(text);
418     UBool isAccessible;
419     if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
420         return isAccessible;
421     nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
422     TextContext currentContext = textUTF16GetCurrentContext(text);
423     TextContext newContext = textGetContext(text, nativeIndex, forward);
424     ASSERT(newContext != NoContext);
425     if (newContext == currentContext) {
426         if (currentContext == PrimaryContext) {
427             textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
428         } else {
429             textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
430         }
431     } else if (newContext == PrimaryContext) {
432         textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
433     } else {
434         ASSERT(newContext == PriorContext);
435         textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
436     }
437     return TRUE;
438 }
439 
440 static const struct UTextFuncs textUTF16Funcs = {
441     sizeof(UTextFuncs),
442     0, 0, 0,
443     textClone,
444     textNativeLength,
445     textUTF16Access,
446     textExtract,
447     0, 0, 0, 0,
448     textClose,
449     0, 0, 0,
450 };
451 
textOpenUTF16(UText * text,const UChar * string,unsigned length,const UChar * priorContext,int priorContextLength,UErrorCode * status)452 static UText* textOpenUTF16(UText* text, const UChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
453 {
454     if (U_FAILURE(*status))
455         return 0;
456 
457     if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
458         *status = U_ILLEGAL_ARGUMENT_ERROR;
459         return 0;
460     }
461 
462     text = utext_setup(text, 0, status);
463     if (U_FAILURE(*status)) {
464         ASSERT(!text);
465         return 0;
466     }
467     textInit(text, &textUTF16Funcs, string, length, priorContext, priorContextLength);
468     return text;
469 }
470 
471 static UText emptyText = UTEXT_INITIALIZER;
472 
wordBreakIterator(const LChar * string,int length)473 static TextBreakIterator* wordBreakIterator(const LChar* string, int length)
474 {
475     UErrorCode errorCode = U_ZERO_ERROR;
476     static TextBreakIterator* breakIter = 0;
477     if (!breakIter) {
478         breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
479         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
480         if (!breakIter)
481             return 0;
482     }
483 
484     UTextWithBuffer textLocal;
485     textLocal.text = emptyText;
486     textLocal.text.extraSize = sizeof(textLocal.buffer);
487     textLocal.text.pExtra = textLocal.buffer;
488 
489     UErrorCode openStatus = U_ZERO_ERROR;
490     UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus);
491     if (U_FAILURE(openStatus)) {
492         WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
493         return 0;
494     }
495 
496     UErrorCode setTextStatus = U_ZERO_ERROR;
497     breakIter->setText(text, setTextStatus);
498     if (U_FAILURE(setTextStatus))
499         WTF_LOG_ERROR("BreakIterator::seText failed with status %d", setTextStatus);
500 
501     utext_close(text);
502 
503     return breakIter;
504 }
505 
setText16(TextBreakIterator * iter,const UChar * string,int length)506 static void setText16(TextBreakIterator* iter, const UChar* string, int length)
507 {
508     UErrorCode errorCode = U_ZERO_ERROR;
509     UText uText = UTEXT_INITIALIZER;
510     utext_openUChars(&uText, string, length, &errorCode);
511     if (U_FAILURE(errorCode))
512         return;
513     iter->setText(&uText, errorCode);
514 }
515 
wordBreakIterator(const UChar * string,int length)516 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
517 {
518     UErrorCode errorCode = U_ZERO_ERROR;
519     static TextBreakIterator* breakIter = 0;
520     if (!breakIter) {
521         breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
522         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
523         if (!breakIter)
524             return 0;
525     }
526     setText16(breakIter, string, length);
527     return breakIter;
528 }
529 
wordBreakIterator(const String & string,int start,int length)530 TextBreakIterator* wordBreakIterator(const String& string, int start, int length)
531 {
532     if (string.isEmpty())
533         return 0;
534     if (string.is8Bit())
535         return wordBreakIterator(string.characters8() + start, length);
536     return wordBreakIterator(string.characters16() + start, length);
537 }
538 
acquireLineBreakIterator(const LChar * string,int length,const AtomicString & locale,const UChar * priorContext,unsigned priorContextLength)539 TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
540 {
541     TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
542     if (!iterator)
543         return 0;
544 
545     UTextWithBuffer textLocal;
546     textLocal.text = emptyText;
547     textLocal.text.extraSize = sizeof(textLocal.buffer);
548     textLocal.text.pExtra = textLocal.buffer;
549 
550     UErrorCode openStatus = U_ZERO_ERROR;
551     UText* text = textOpenLatin1(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
552     if (U_FAILURE(openStatus)) {
553         WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
554         return 0;
555     }
556 
557     UErrorCode setTextStatus = U_ZERO_ERROR;
558     iterator->setText(text, setTextStatus);
559     if (U_FAILURE(setTextStatus)) {
560         WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
561         return 0;
562     }
563 
564     utext_close(text);
565 
566     return iterator;
567 }
568 
acquireLineBreakIterator(const UChar * string,int length,const AtomicString & locale,const UChar * priorContext,unsigned priorContextLength)569 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
570 {
571     TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
572     if (!iterator)
573         return 0;
574 
575     UText textLocal = UTEXT_INITIALIZER;
576 
577     UErrorCode openStatus = U_ZERO_ERROR;
578     UText* text = textOpenUTF16(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
579     if (U_FAILURE(openStatus)) {
580         WTF_LOG_ERROR("textOpenUTF16 failed with status %d", openStatus);
581         return 0;
582     }
583 
584     UErrorCode setTextStatus = U_ZERO_ERROR;
585     iterator->setText(text, setTextStatus);
586     if (U_FAILURE(setTextStatus)) {
587         WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
588         return 0;
589     }
590 
591     utext_close(text);
592 
593     return iterator;
594 }
595 
releaseLineBreakIterator(TextBreakIterator * iterator)596 void releaseLineBreakIterator(TextBreakIterator* iterator)
597 {
598     ASSERT_ARG(iterator, iterator);
599 
600     LineBreakIteratorPool::sharedPool().put(iterator);
601 }
602 
603 static TextBreakIterator* nonSharedCharacterBreakIterator;
604 
compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator * expected,TextBreakIterator * newValue)605 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
606 {
607     DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ());
608     MutexLocker locker(nonSharedCharacterBreakIteratorMutex);
609     if (nonSharedCharacterBreakIterator != expected)
610         return false;
611     nonSharedCharacterBreakIterator = newValue;
612     return true;
613 }
614 
NonSharedCharacterBreakIterator(const String & string)615 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const String& string)
616     : m_is8Bit(true)
617     , m_charaters8(0)
618     , m_offset(0)
619     , m_length(0)
620     , m_iterator(0)
621 {
622     if (string.isEmpty())
623         return;
624 
625     m_is8Bit = string.is8Bit();
626 
627     if (m_is8Bit) {
628         m_charaters8 = string.characters8();
629         m_offset = 0;
630         m_length = string.length();
631         return;
632     }
633 
634     createIteratorForBuffer(string.characters16(), string.length());
635 }
636 
NonSharedCharacterBreakIterator(const UChar * buffer,unsigned length)637 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, unsigned length)
638     : m_is8Bit(false)
639     , m_charaters8(0)
640     , m_offset(0)
641     , m_length(0)
642     , m_iterator(0)
643 {
644     createIteratorForBuffer(buffer, length);
645 }
646 
createIteratorForBuffer(const UChar * buffer,unsigned length)647 void NonSharedCharacterBreakIterator::createIteratorForBuffer(const UChar* buffer, unsigned length)
648 {
649     m_iterator = nonSharedCharacterBreakIterator;
650     bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
651     if (!createdIterator) {
652         UErrorCode errorCode = U_ZERO_ERROR;
653         m_iterator = icu::BreakIterator::createCharacterInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
654         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
655     }
656 
657     setText16(m_iterator, buffer, length);
658 }
659 
~NonSharedCharacterBreakIterator()660 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
661 {
662     if (m_is8Bit)
663         return;
664     if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
665         delete m_iterator;
666 }
667 
next()668 int NonSharedCharacterBreakIterator::next()
669 {
670     if (!m_is8Bit)
671         return m_iterator->next();
672 
673     if (m_offset >= m_length)
674         return TextBreakDone;
675 
676     m_offset += clusterLengthStartingAt(m_offset);
677     return m_offset;
678 }
679 
current()680 int NonSharedCharacterBreakIterator::current()
681 {
682     if (!m_is8Bit)
683         return m_iterator->current();
684     return m_offset;
685 }
686 
isBreak(int offset) const687 bool NonSharedCharacterBreakIterator::isBreak(int offset) const
688 {
689     if (!m_is8Bit)
690         return m_iterator->isBoundary(offset);
691     return !isLFAfterCR(offset);
692 }
693 
preceding(int offset) const694 int NonSharedCharacterBreakIterator::preceding(int offset) const
695 {
696     if (!m_is8Bit)
697         return m_iterator->preceding(offset);
698     if (offset <= 0)
699         return TextBreakDone;
700     if (isLFAfterCR(offset))
701         return offset - 2;
702     return offset - 1;
703 }
704 
following(int offset) const705 int NonSharedCharacterBreakIterator::following(int offset) const
706 {
707     if (!m_is8Bit)
708         return m_iterator->following(offset);
709     if (static_cast<unsigned>(offset) >= m_length)
710         return TextBreakDone;
711     return offset + clusterLengthStartingAt(offset);
712 }
713 
sentenceBreakIterator(const UChar * string,int length)714 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
715 {
716     UErrorCode openStatus = U_ZERO_ERROR;
717     static TextBreakIterator* iterator = 0;
718     if (!iterator) {
719         iterator =  icu::BreakIterator::createSentenceInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
720         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
721         if (!iterator)
722             return 0;
723     }
724 
725     setText16(iterator, string, length);
726     return iterator;
727 }
728 
isWordTextBreak(TextBreakIterator * iterator)729 bool isWordTextBreak(TextBreakIterator* iterator)
730 {
731     icu::RuleBasedBreakIterator* ruleBasedBreakIterator = static_cast<icu::RuleBasedBreakIterator*>(iterator);
732     int ruleStatus = ruleBasedBreakIterator->getRuleStatus();
733     return ruleStatus != UBRK_WORD_NONE;
734 }
735 
setUpIteratorWithRules(const char * breakRules,const UChar * string,int length)736 static TextBreakIterator* setUpIteratorWithRules(const char* breakRules, const UChar* string, int length)
737 {
738     if (!string)
739         return 0;
740 
741     static TextBreakIterator* iterator = 0;
742     if (!iterator) {
743         UParseError parseStatus;
744         UErrorCode openStatus = U_ZERO_ERROR;
745         Vector<UChar> rules;
746         String(breakRules).appendTo(rules);
747 
748         iterator = new icu::RuleBasedBreakIterator(icu::UnicodeString(rules.data(), rules.size()), parseStatus, openStatus);
749         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
750         if (!iterator)
751             return 0;
752     }
753 
754     setText16(iterator, string, length);
755     return iterator;
756 }
757 
cursorMovementIterator(const UChar * string,int length)758 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
759 {
760     // This rule set is based on character-break iterator rules of ICU 4.0
761     // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
762     // The major differences from the original ones are listed below:
763     // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
764     // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
765     // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
766     // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
767     // * Added rules for regional indicator symbols.
768     static const char* const kRules =
769         "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
770         "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
771         "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
772         "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
773         "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
774         "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
775         "$L       = [\\p{Grapheme_Cluster_Break = L}];"
776         "$V       = [\\p{Grapheme_Cluster_Break = V}];"
777         "$T       = [\\p{Grapheme_Cluster_Break = T}];"
778         "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
779         "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
780         "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
781         "$HinV    = \\u094D;"              // Devanagari Sign Virama
782         "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
783         "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
784         "$BenV    = \\u09CD;"              // Bengali Sign Virama
785         "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
786         "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
787         "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
788         "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
789         "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
790         "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
791         "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
792         "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
793         "$OriV    = \\u0B4D;"              // Oriya Sign Virama
794         "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
795         "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
796         "$TelV    = \\u0C4D;"              // Telugu Sign Virama
797         "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha
798         "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
799         "$KanV    = \\u0CCD;"              // Kannada Sign Virama
800         "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter A,...,Ha
801         "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
802         "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
803         "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter A,...,Ha
804         "$RI      = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
805         "!!chain;"
806         "!!forward;"
807         "$CR $LF;"
808         "$L ($L | $V | $LV | $LVT);"
809         "($LV | $V) ($V | $T);"
810         "($LVT | $T) $T;"
811         "[^$Control $CR $LF] $Extend;"
812         "[^$Control $CR $LF] $SpacingMark;"
813         "$RI $RI / $RI;"
814         "$RI $RI;"
815         "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
816         "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
817         "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
818         "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
819         "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
820         "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
821         "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
822         "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
823         "!!reverse;"
824         "$LF $CR;"
825         "($L | $V | $LV | $LVT) $L;"
826         "($V | $T) ($LV | $V);"
827         "$T ($LVT | $T);"
828         "$Extend      [^$Control $CR $LF];"
829         "$SpacingMark [^$Control $CR $LF];"
830         "$RI $RI / $RI $RI;"
831         "$RI $RI;"
832         "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
833         "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
834         "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
835         "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
836         "$Ori1 $OriV $Ori0;"               // Gujarati Virama (backward)
837         "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
838         "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
839         "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
840         "!!safe_reverse;"
841         "!!safe_forward;";
842 
843     return setUpIteratorWithRules(kRules, string, length);
844 }
845 
846 }
847