• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2006 Lars Knoll <lars@trolltech.com>
3  * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Library General Public
7  * License as published by the Free Software Foundation; either
8  * version 2 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Library General Public License for more details.
14  *
15  * You should have received a copy of the GNU Library General Public License
16  * along with this library; see the file COPYING.LIB.  If not, write to
17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
18  * Boston, MA 02110-1301, USA.
19  *
20  */
21 
22 #ifndef TextBreakIterator_h
23 #define TextBreakIterator_h
24 
25 #include "platform/PlatformExport.h"
26 #include "wtf/text/AtomicString.h"
27 #include "wtf/unicode/Unicode.h"
28 
29 namespace WebCore {
30 
31 typedef icu::BreakIterator TextBreakIterator;
32 
33 // Note: The returned iterator is good only until you get another iterator, with the exception of acquireLineBreakIterator.
34 
35 // This is similar to character break iterator in most cases, but is subject to
36 // platform UI conventions. One notable example where this can be different
37 // from character break iterator is Thai prepend characters, see bug 24342.
38 // Use this for insertion point and selection manipulations.
39 PLATFORM_EXPORT TextBreakIterator* cursorMovementIterator(const UChar*, int length);
40 
41 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const String&, int start, int length);
42 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const UChar*, int length);
43 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const LChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength);
44 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const UChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength);
45 PLATFORM_EXPORT void releaseLineBreakIterator(TextBreakIterator*);
46 PLATFORM_EXPORT TextBreakIterator* sentenceBreakIterator(const UChar*, int length);
47 
48 PLATFORM_EXPORT bool isWordTextBreak(TextBreakIterator*);
49 
50 const int TextBreakDone = -1;
51 
52 class PLATFORM_EXPORT LazyLineBreakIterator {
53 public:
LazyLineBreakIterator()54     LazyLineBreakIterator()
55         : m_iterator(0)
56         , m_cachedPriorContext(0)
57         , m_cachedPriorContextLength(0)
58     {
59         resetPriorContext();
60     }
61 
62     LazyLineBreakIterator(String string, const AtomicString& locale = AtomicString())
m_string(string)63         : m_string(string)
64         , m_locale(locale)
65         , m_iterator(0)
66         , m_cachedPriorContext(0)
67         , m_cachedPriorContextLength(0)
68     {
69         resetPriorContext();
70     }
71 
~LazyLineBreakIterator()72     ~LazyLineBreakIterator()
73     {
74         if (m_iterator)
75             releaseLineBreakIterator(m_iterator);
76     }
77 
string()78     String string() const { return m_string; }
79 
lastCharacter()80     UChar lastCharacter() const
81     {
82         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
83         return m_priorContext[1];
84     }
85 
secondToLastCharacter()86     UChar secondToLastCharacter() const
87     {
88         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
89         return m_priorContext[0];
90     }
91 
setPriorContext(UChar last,UChar secondToLast)92     void setPriorContext(UChar last, UChar secondToLast)
93     {
94         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
95         m_priorContext[0] = secondToLast;
96         m_priorContext[1] = last;
97     }
98 
updatePriorContext(UChar last)99     void updatePriorContext(UChar last)
100     {
101         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
102         m_priorContext[0] = m_priorContext[1];
103         m_priorContext[1] = last;
104     }
105 
resetPriorContext()106     void resetPriorContext()
107     {
108         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
109         m_priorContext[0] = 0;
110         m_priorContext[1] = 0;
111     }
112 
priorContextLength()113     unsigned priorContextLength() const
114     {
115         unsigned priorContextLength = 0;
116         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
117         if (m_priorContext[1]) {
118             ++priorContextLength;
119             if (m_priorContext[0])
120                 ++priorContextLength;
121         }
122         return priorContextLength;
123     }
124 
125     // Obtain text break iterator, possibly previously cached, where this iterator is (or has been)
126     // initialized to use the previously stored string as the primary breaking context and using
127     // previously stored prior context if non-empty.
get(unsigned priorContextLength)128     TextBreakIterator* get(unsigned priorContextLength)
129     {
130         ASSERT(priorContextLength <= priorContextCapacity);
131         const UChar* priorContext = priorContextLength ? &m_priorContext[priorContextCapacity - priorContextLength] : 0;
132         if (!m_iterator) {
133             if (m_string.is8Bit())
134                 m_iterator = acquireLineBreakIterator(m_string.characters8(), m_string.length(), m_locale, priorContext, priorContextLength);
135             else
136                 m_iterator = acquireLineBreakIterator(m_string.characters16(), m_string.length(), m_locale, priorContext, priorContextLength);
137             m_cachedPriorContext = priorContext;
138             m_cachedPriorContextLength = priorContextLength;
139         } else if (priorContext != m_cachedPriorContext || priorContextLength != m_cachedPriorContextLength) {
140             this->resetStringAndReleaseIterator(m_string, m_locale);
141             return this->get(priorContextLength);
142         }
143         return m_iterator;
144     }
145 
resetStringAndReleaseIterator(String string,const AtomicString & locale)146     void resetStringAndReleaseIterator(String string, const AtomicString& locale)
147     {
148         if (m_iterator)
149             releaseLineBreakIterator(m_iterator);
150 
151         m_string = string;
152         m_locale = locale;
153         m_iterator = 0;
154         m_cachedPriorContext = 0;
155         m_cachedPriorContextLength = 0;
156     }
157 
158 private:
159     static const unsigned priorContextCapacity = 2;
160     String m_string;
161     AtomicString m_locale;
162     TextBreakIterator* m_iterator;
163     UChar m_priorContext[priorContextCapacity];
164     const UChar* m_cachedPriorContext;
165     unsigned m_cachedPriorContextLength;
166 };
167 
168 // Iterates over "extended grapheme clusters", as defined in UAX #29.
169 // Note that platform implementations may be less sophisticated - e.g. ICU prior to
170 // version 4.0 only supports "legacy grapheme clusters".
171 // Use this for general text processing, e.g. string truncation.
172 
173 class PLATFORM_EXPORT NonSharedCharacterBreakIterator {
174     WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator);
175 public:
176     explicit NonSharedCharacterBreakIterator(const String&);
177     NonSharedCharacterBreakIterator(const UChar*, unsigned length);
178     ~NonSharedCharacterBreakIterator();
179 
180     int next();
181     int current();
182 
183     bool isBreak(int offset) const;
184     int preceding(int offset) const;
185     int following(int offset) const;
186 
187     bool operator!() const
188     {
189         return !m_is8Bit && !m_iterator;
190     }
191 
192 private:
193     void createIteratorForBuffer(const UChar*, unsigned length);
194 
clusterLengthStartingAt(unsigned offset)195     unsigned clusterLengthStartingAt(unsigned offset) const
196     {
197         ASSERT(m_is8Bit);
198         // The only Latin-1 Extended Grapheme Cluster is CR LF
199         return isCRBeforeLF(offset) ? 2 : 1;
200     }
201 
isCRBeforeLF(unsigned offset)202     bool isCRBeforeLF(unsigned offset) const
203     {
204         ASSERT(m_is8Bit);
205         return m_charaters8[offset] == '\r' && offset + 1 < m_length && m_charaters8[offset + 1] == '\n';
206     }
207 
isLFAfterCR(unsigned offset)208     bool isLFAfterCR(unsigned offset) const
209     {
210         ASSERT(m_is8Bit);
211         return m_charaters8[offset] == '\n' && offset >= 1 && m_charaters8[offset - 1] == '\r';
212     }
213 
214     bool m_is8Bit;
215 
216     // For 8 bit strings, we implement the iterator ourselves.
217     const LChar* m_charaters8;
218     unsigned m_offset;
219     unsigned m_length;
220 
221     // For 16 bit strings, we use a TextBreakIterator.
222     TextBreakIterator* m_iterator;
223 };
224 
225 // Counts the number of grapheme clusters. A surrogate pair or a sequence
226 // of a non-combining character and following combining characters is
227 // counted as 1 grapheme cluster.
228 PLATFORM_EXPORT unsigned numGraphemeClusters(const String&);
229 // Returns the number of characters that make up at most the specified
230 // number of grapheme clusters of the string.
231 PLATFORM_EXPORT unsigned numCharactersInGraphemeClusters(const String&, unsigned);
232 
233 }
234 
235 #endif
236