• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3  * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either
8  * version 2 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public License
16  * along with this library; see the file COPYING.LIB.  If not, write to
17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  * Boston, MA 02110-1301, USA.
19  *
20  */
21 
22 #ifndef TextBreakIterator_h
23 #define TextBreakIterator_h
24 
25 #include "platform/PlatformExport.h"
26 #include "wtf/text/AtomicString.h"
27 #include "wtf/unicode/Unicode.h"
28 
29 #include <unicode/brkiter.h>
30 
31 namespace blink {
32 
33 typedef icu::BreakIterator TextBreakIterator;
34 
35 // Note: The returned iterator is good only until you get another iterator, with the exception of acquireLineBreakIterator.
36 
37 // This is similar to character break iterator in most cases, but is subject to
38 // platform UI conventions. One notable example where this can be different
39 // from character break iterator is Thai prepend characters, see bug 24342.
40 // Use this for insertion point and selection manipulations.
41 PLATFORM_EXPORT TextBreakIterator* cursorMovementIterator(const UChar*, int length);
42 
43 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const String&, int start, int length);
44 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const UChar*, int length);
45 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const LChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength);
46 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const UChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength);
47 PLATFORM_EXPORT void releaseLineBreakIterator(TextBreakIterator*);
48 PLATFORM_EXPORT TextBreakIterator* sentenceBreakIterator(const UChar*, int length);
49 
50 PLATFORM_EXPORT bool isWordTextBreak(TextBreakIterator*);
51 
52 const int TextBreakDone = -1;
53 
54 class PLATFORM_EXPORT LazyLineBreakIterator {
55 public:
LazyLineBreakIterator()56     LazyLineBreakIterator()
57         : m_iterator(0)
58         , m_cachedPriorContext(0)
59         , m_cachedPriorContextLength(0)
60     {
61         resetPriorContext();
62     }
63 
64     LazyLineBreakIterator(String string, const AtomicString& locale = AtomicString())
m_string(string)65         : m_string(string)
66         , m_locale(locale)
67         , m_iterator(0)
68         , m_cachedPriorContext(0)
69         , m_cachedPriorContextLength(0)
70     {
71         resetPriorContext();
72     }
73 
~LazyLineBreakIterator()74     ~LazyLineBreakIterator()
75     {
76         if (m_iterator)
77             releaseLineBreakIterator(m_iterator);
78     }
79 
string()80     String string() const { return m_string; }
81 
lastCharacter()82     UChar lastCharacter() const
83     {
84         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
85         return m_priorContext[1];
86     }
87 
secondToLastCharacter()88     UChar secondToLastCharacter() const
89     {
90         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
91         return m_priorContext[0];
92     }
93 
setPriorContext(UChar last,UChar secondToLast)94     void setPriorContext(UChar last, UChar secondToLast)
95     {
96         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
97         m_priorContext[0] = secondToLast;
98         m_priorContext[1] = last;
99     }
100 
updatePriorContext(UChar last)101     void updatePriorContext(UChar last)
102     {
103         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
104         m_priorContext[0] = m_priorContext[1];
105         m_priorContext[1] = last;
106     }
107 
resetPriorContext()108     void resetPriorContext()
109     {
110         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
111         m_priorContext[0] = 0;
112         m_priorContext[1] = 0;
113     }
114 
priorContextLength()115     unsigned priorContextLength() const
116     {
117         unsigned priorContextLength = 0;
118         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
119         if (m_priorContext[1]) {
120             ++priorContextLength;
121             if (m_priorContext[0])
122                 ++priorContextLength;
123         }
124         return priorContextLength;
125     }
126 
127     // Obtain text break iterator, possibly previously cached, where this iterator is (or has been)
128     // initialized to use the previously stored string as the primary breaking context and using
129     // previously stored prior context if non-empty.
get(unsigned priorContextLength)130     TextBreakIterator* get(unsigned priorContextLength)
131     {
132         ASSERT(priorContextLength <= priorContextCapacity);
133         const UChar* priorContext = priorContextLength ? &m_priorContext[priorContextCapacity - priorContextLength] : 0;
134         if (!m_iterator) {
135             if (m_string.is8Bit())
136                 m_iterator = acquireLineBreakIterator(m_string.characters8(), m_string.length(), m_locale, priorContext, priorContextLength);
137             else
138                 m_iterator = acquireLineBreakIterator(m_string.characters16(), m_string.length(), m_locale, priorContext, priorContextLength);
139             m_cachedPriorContext = priorContext;
140             m_cachedPriorContextLength = priorContextLength;
141         } else if (priorContext != m_cachedPriorContext || priorContextLength != m_cachedPriorContextLength) {
142             this->resetStringAndReleaseIterator(m_string, m_locale);
143             return this->get(priorContextLength);
144         }
145         return m_iterator;
146     }
147 
resetStringAndReleaseIterator(String string,const AtomicString & locale)148     void resetStringAndReleaseIterator(String string, const AtomicString& locale)
149     {
150         if (m_iterator)
151             releaseLineBreakIterator(m_iterator);
152 
153         m_string = string;
154         m_locale = locale;
155         m_iterator = 0;
156         m_cachedPriorContext = 0;
157         m_cachedPriorContextLength = 0;
158     }
159 
160 private:
161     static const unsigned priorContextCapacity = 2;
162     String m_string;
163     AtomicString m_locale;
164     TextBreakIterator* m_iterator;
165     UChar m_priorContext[priorContextCapacity];
166     const UChar* m_cachedPriorContext;
167     unsigned m_cachedPriorContextLength;
168 };
169 
170 // Iterates over "extended grapheme clusters", as defined in UAX #29.
171 // Note that platform implementations may be less sophisticated - e.g. ICU prior to
172 // version 4.0 only supports "legacy grapheme clusters".
173 // Use this for general text processing, e.g. string truncation.
174 
175 class PLATFORM_EXPORT NonSharedCharacterBreakIterator {
176     WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator);
177 public:
178     explicit NonSharedCharacterBreakIterator(const String&);
179     NonSharedCharacterBreakIterator(const UChar*, unsigned length);
180     ~NonSharedCharacterBreakIterator();
181 
182     int next();
183     int current();
184 
185     bool isBreak(int offset) const;
186     int preceding(int offset) const;
187     int following(int offset) const;
188 
189     bool operator!() const
190     {
191         return !m_is8Bit && !m_iterator;
192     }
193 
194 private:
195     void createIteratorForBuffer(const UChar*, unsigned length);
196 
clusterLengthStartingAt(unsigned offset)197     unsigned clusterLengthStartingAt(unsigned offset) const
198     {
199         ASSERT(m_is8Bit);
200         // The only Latin-1 Extended Grapheme Cluster is CR LF
201         return isCRBeforeLF(offset) ? 2 : 1;
202     }
203 
isCRBeforeLF(unsigned offset)204     bool isCRBeforeLF(unsigned offset) const
205     {
206         ASSERT(m_is8Bit);
207         return m_charaters8[offset] == '\r' && offset + 1 < m_length && m_charaters8[offset + 1] == '\n';
208     }
209 
isLFAfterCR(unsigned offset)210     bool isLFAfterCR(unsigned offset) const
211     {
212         ASSERT(m_is8Bit);
213         return m_charaters8[offset] == '\n' && offset >= 1 && m_charaters8[offset - 1] == '\r';
214     }
215 
216     bool m_is8Bit;
217 
218     // For 8 bit strings, we implement the iterator ourselves.
219     const LChar* m_charaters8;
220     unsigned m_offset;
221     unsigned m_length;
222 
223     // For 16 bit strings, we use a TextBreakIterator.
224     TextBreakIterator* m_iterator;
225 };
226 
227 // Counts the number of grapheme clusters. A surrogate pair or a sequence
228 // of a non-combining character and following combining characters is
229 // counted as 1 grapheme cluster.
230 PLATFORM_EXPORT unsigned numGraphemeClusters(const String&);
231 // Returns the number of characters which will be less than or equal to
232 // the specified grapheme cluster length.
233 PLATFORM_EXPORT unsigned numCharactersInGraphemeClusters(const String&, unsigned);
234 
235 }
236 
237 #endif
238