• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
3  * Copyright (C) 2005 Alexey Proskuryakov.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "config.h"
28 #include "TextIterator.h"
29 
30 #include "Document.h"
31 #include "Frame.h"
32 #include "HTMLElement.h"
33 #include "HTMLNames.h"
34 #include "htmlediting.h"
35 #include "InlineTextBox.h"
36 #include "Range.h"
37 #include "RenderTableCell.h"
38 #include "RenderTableRow.h"
39 #include "RenderTextControl.h"
40 #include "RenderTextFragment.h"
41 #include "TextBoundaries.h"
42 #include "TextBreakIterator.h"
43 #include "VisiblePosition.h"
44 #include "visible_units.h"
45 #include <wtf/unicode/CharacterNames.h>
46 
47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
48 #include "TextBreakIteratorInternalICU.h"
49 #include <unicode/usearch.h>
50 #endif
51 
52 using namespace WTF::Unicode;
53 using namespace std;
54 
55 namespace WebCore {
56 
57 using namespace HTMLNames;
58 
59 // Buffer that knows how to compare with a search target.
60 // Keeps enough of the previous text to be able to search in the future, but no more.
61 // Non-breaking spaces are always equal to normal spaces.
62 // Case folding is also done if the CaseInsensitive option is specified.
63 // Matches are further filtered if the AtWordStarts option is specified, although some
64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
65 class SearchBuffer {
66     WTF_MAKE_NONCOPYABLE(SearchBuffer);
67 public:
68     SearchBuffer(const String& target, FindOptions);
69     ~SearchBuffer();
70 
71     // Returns number of characters appended; guaranteed to be in the range [1, length].
72     size_t append(const UChar*, size_t length);
73     bool needsMoreContext() const;
74     void prependContext(const UChar*, size_t length);
75     void reachedBreak();
76 
77     // Result is the size in characters of what was found.
78     // And <startOffset> is the number of characters back to the start of what was found.
79     size_t search(size_t& startOffset);
80     bool atBreak() const;
81 
82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
83 
84 private:
85     bool isBadMatch(const UChar*, size_t length) const;
86     bool isWordStartMatch(size_t start, size_t length) const;
87 
88     String m_target;
89     FindOptions m_options;
90 
91     Vector<UChar> m_buffer;
92     size_t m_overlap;
93     size_t m_prefixLength;
94     bool m_atBreak;
95     bool m_needsMoreContext;
96 
97     bool m_targetRequiresKanaWorkaround;
98     Vector<UChar> m_normalizedTarget;
99     mutable Vector<UChar> m_normalizedMatch;
100 
101 #else
102 
103 private:
104     void append(UChar, bool isCharacterStart);
105     size_t length() const;
106 
107     String m_target;
108     FindOptions m_options;
109 
110     Vector<UChar> m_buffer;
111     Vector<bool> m_isCharacterStartBuffer;
112     bool m_isBufferFull;
113     size_t m_cursor;
114 
115 #endif
116 };
117 
118 // --------
119 
120 static const unsigned bitsInWord = sizeof(unsigned) * 8;
121 static const unsigned bitInWordMask = bitsInWord - 1;
122 
BitStack()123 BitStack::BitStack()
124     : m_size(0)
125 {
126 }
127 
~BitStack()128 BitStack::~BitStack()
129 {
130 }
131 
push(bool bit)132 void BitStack::push(bool bit)
133 {
134     unsigned index = m_size / bitsInWord;
135     unsigned shift = m_size & bitInWordMask;
136     if (!shift && index == m_words.size()) {
137         m_words.grow(index + 1);
138         m_words[index] = 0;
139     }
140     unsigned& word = m_words[index];
141     unsigned mask = 1U << shift;
142     if (bit)
143         word |= mask;
144     else
145         word &= ~mask;
146     ++m_size;
147 }
148 
pop()149 void BitStack::pop()
150 {
151     if (m_size)
152         --m_size;
153 }
154 
top() const155 bool BitStack::top() const
156 {
157     if (!m_size)
158         return false;
159     unsigned shift = (m_size - 1) & bitInWordMask;
160     return m_words.last() & (1U << shift);
161 }
162 
size() const163 unsigned BitStack::size() const
164 {
165     return m_size;
166 }
167 
168 // --------
169 
170 #if !ASSERT_DISABLED
171 
depthCrossingShadowBoundaries(Node * node)172 static unsigned depthCrossingShadowBoundaries(Node* node)
173 {
174     unsigned depth = 0;
175     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
176         ++depth;
177     return depth;
178 }
179 
180 #endif
181 
182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
nextInPreOrderCrossingShadowBoundaries(Node * rangeEndContainer,int rangeEndOffset)183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
184 {
185     if (!rangeEndContainer)
186         return 0;
187     if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
188         if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
189             return next;
190     }
191     for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
192         if (Node* next = node->nextSibling())
193             return next;
194     }
195     return 0;
196 }
197 
198 // --------
199 
fullyClipsContents(Node * node)200 static inline bool fullyClipsContents(Node* node)
201 {
202     RenderObject* renderer = node->renderer();
203     if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
204         return false;
205     return toRenderBox(renderer)->size().isEmpty();
206 }
207 
ignoresContainerClip(Node * node)208 static inline bool ignoresContainerClip(Node* node)
209 {
210     RenderObject* renderer = node->renderer();
211     if (!renderer || renderer->isText())
212         return false;
213     EPosition position = renderer->style()->position();
214     return position == AbsolutePosition || position == FixedPosition;
215 }
216 
pushFullyClippedState(BitStack & stack,Node * node)217 static void pushFullyClippedState(BitStack& stack, Node* node)
218 {
219     ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
220 
221     // Push true if this node full clips its contents, or if a parent already has fully
222     // clipped and this is not a node that ignores its container's clip.
223     stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
224 }
225 
setUpFullyClippedStack(BitStack & stack,Node * node)226 static void setUpFullyClippedStack(BitStack& stack, Node* node)
227 {
228     // Put the nodes in a vector so we can iterate in reverse order.
229     Vector<Node*, 100> ancestry;
230     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
231         ancestry.append(parent);
232 
233     // Call pushFullyClippedState on each node starting with the earliest ancestor.
234     size_t size = ancestry.size();
235     for (size_t i = 0; i < size; ++i)
236         pushFullyClippedState(stack, ancestry[size - i - 1]);
237     pushFullyClippedState(stack, node);
238 
239     ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
240 }
241 
242 // --------
243 
TextIterator()244 TextIterator::TextIterator()
245     : m_startContainer(0)
246     , m_startOffset(0)
247     , m_endContainer(0)
248     , m_endOffset(0)
249     , m_positionNode(0)
250     , m_textCharacters(0)
251     , m_textLength(0)
252     , m_remainingTextBox(0)
253     , m_firstLetterText(0)
254     , m_lastCharacter(0)
255     , m_emitsCharactersBetweenAllVisiblePositions(false)
256     , m_entersTextControls(false)
257     , m_emitsTextWithoutTranscoding(false)
258     , m_handledFirstLetter(false)
259     , m_ignoresStyleVisibility(false)
260     , m_emitsObjectReplacementCharacters(false)
261 {
262 }
263 
TextIterator(const Range * r,TextIteratorBehavior behavior)264 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
265     : m_startContainer(0)
266     , m_startOffset(0)
267     , m_endContainer(0)
268     , m_endOffset(0)
269     , m_positionNode(0)
270     , m_textCharacters(0)
271     , m_textLength(0)
272     , m_remainingTextBox(0)
273     , m_firstLetterText(0)
274     , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
275     , m_entersTextControls(behavior & TextIteratorEntersTextControls)
276     , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
277     , m_handledFirstLetter(false)
278     , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
279     , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
280 {
281     if (!r)
282         return;
283 
284     // get and validate the range endpoints
285     Node* startContainer = r->startContainer();
286     if (!startContainer)
287         return;
288     int startOffset = r->startOffset();
289     Node* endContainer = r->endContainer();
290     int endOffset = r->endOffset();
291 
292     // Callers should be handing us well-formed ranges. If we discover that this isn't
293     // the case, we could consider changing this assertion to an early return.
294     ASSERT(r->boundaryPointsValid());
295 
296     // remember range - this does not change
297     m_startContainer = startContainer;
298     m_startOffset = startOffset;
299     m_endContainer = endContainer;
300     m_endOffset = endOffset;
301 
302     // set up the current node for processing
303     m_node = r->firstNode();
304     if (!m_node)
305         return;
306     setUpFullyClippedStack(m_fullyClippedStack, m_node);
307     m_offset = m_node == m_startContainer ? m_startOffset : 0;
308     m_handledNode = false;
309     m_handledChildren = false;
310 
311     // calculate first out of bounds node
312     m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
313 
314     // initialize node processing state
315     m_needsAnotherNewline = false;
316     m_textBox = 0;
317 
318     // initialize record of previous node processing
319     m_hasEmitted = false;
320     m_lastTextNode = 0;
321     m_lastTextNodeEndedWithCollapsedSpace = false;
322     m_lastCharacter = 0;
323 
324 #ifndef NDEBUG
325     // need this just because of the assert in advance()
326     m_positionNode = m_node;
327 #endif
328 
329     // identify the first run
330     advance();
331 }
332 
~TextIterator()333 TextIterator::~TextIterator()
334 {
335 }
336 
advance()337 void TextIterator::advance()
338 {
339     // reset the run information
340     m_positionNode = 0;
341     m_textLength = 0;
342 
343     // handle remembered node that needed a newline after the text node's newline
344     if (m_needsAnotherNewline) {
345         // Emit the extra newline, and position it *inside* m_node, after m_node's
346         // contents, in case it's a block, in the same way that we position the first
347         // newline.  The range for the emitted newline should start where the line
348         // break begins.
349         // FIXME: It would be cleaner if we emitted two newlines during the last
350         // iteration, instead of using m_needsAnotherNewline.
351         Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
352         emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
353         m_needsAnotherNewline = false;
354         return;
355     }
356 
357     if (!m_textBox && m_remainingTextBox) {
358         m_textBox = m_remainingTextBox;
359         m_remainingTextBox = 0;
360         m_firstLetterText = 0;
361         m_offset = 0;
362     }
363     // handle remembered text box
364     if (m_textBox) {
365         handleTextBox();
366         if (m_positionNode)
367             return;
368     }
369 
370     while (m_node && m_node != m_pastEndNode) {
371         // if the range ends at offset 0 of an element, represent the
372         // position, but not the content, of that element e.g. if the
373         // node is a blockflow element, emit a newline that
374         // precedes the element
375         if (m_node == m_endContainer && m_endOffset == 0) {
376             representNodeOffsetZero();
377             m_node = 0;
378             return;
379         }
380 
381         RenderObject* renderer = m_node->renderer();
382         if (!renderer) {
383             m_handledNode = true;
384             m_handledChildren = true;
385         } else {
386             // handle current node according to its type
387             if (!m_handledNode) {
388                 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
389                     m_handledNode = handleTextNode();
390                 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
391                          (renderer->node() && renderer->node()->isElementNode() &&
392                           static_cast<Element*>(renderer->node())->isFormControlElement())))
393                     m_handledNode = handleReplacedElement();
394                 else
395                     m_handledNode = handleNonTextNode();
396                 if (m_positionNode)
397                     return;
398             }
399         }
400 
401         // find a new current node to handle in depth-first manner,
402         // calling exitNode() as we come back thru a parent node
403         Node* next = m_handledChildren ? 0 : m_node->firstChild();
404         m_offset = 0;
405         if (!next) {
406             next = m_node->nextSibling();
407             if (!next) {
408                 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
409                 Node* parentNode = m_node->parentOrHostNode();
410                 while (!next && parentNode) {
411                     if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
412                         return;
413                     bool haveRenderer = m_node->renderer();
414                     m_node = parentNode;
415                     m_fullyClippedStack.pop();
416                     parentNode = m_node->parentOrHostNode();
417                     if (haveRenderer)
418                         exitNode();
419                     if (m_positionNode) {
420                         m_handledNode = true;
421                         m_handledChildren = true;
422                         return;
423                     }
424                     next = m_node->nextSibling();
425                 }
426             }
427             m_fullyClippedStack.pop();
428         }
429 
430         // set the new current node
431         m_node = next;
432         if (m_node)
433             pushFullyClippedState(m_fullyClippedStack, m_node);
434         m_handledNode = false;
435         m_handledChildren = false;
436         m_handledFirstLetter = false;
437         m_firstLetterText = 0;
438 
439         // how would this ever be?
440         if (m_positionNode)
441             return;
442     }
443 }
444 
handleTextNode()445 bool TextIterator::handleTextNode()
446 {
447     if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
448         return false;
449 
450     RenderText* renderer = toRenderText(m_node->renderer());
451 
452     m_lastTextNode = m_node;
453     String str = renderer->text();
454 
455     // handle pre-formatted text
456     if (!renderer->style()->collapseWhiteSpace()) {
457         int runStart = m_offset;
458         if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
459             emitCharacter(' ', m_node, 0, runStart, runStart);
460             return false;
461         }
462         if (!m_handledFirstLetter && renderer->isTextFragment()) {
463             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
464             if (m_firstLetterText) {
465                 String firstLetter = m_firstLetterText->text();
466                 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
467                 m_firstLetterText = 0;
468                 m_textBox = 0;
469                 return false;
470             }
471         }
472         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
473             return false;
474         int strLength = str.length();
475         int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
476         int runEnd = min(strLength, end);
477 
478         if (runStart >= runEnd)
479             return true;
480 
481         emitText(m_node, runStart, runEnd);
482         return true;
483     }
484 
485     if (!renderer->firstTextBox() && str.length() > 0) {
486         if (!m_handledFirstLetter && renderer->isTextFragment()) {
487             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
488             if (m_firstLetterText) {
489                 handleTextBox();
490                 return false;
491             }
492         }
493         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
494             return false;
495         m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
496         return true;
497     }
498 
499     // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
500     if (renderer->containsReversedText()) {
501         m_sortedTextBoxes.clear();
502         for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
503             m_sortedTextBoxes.append(textBox);
504         }
505         std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
506         m_sortedTextBoxesPosition = 0;
507     }
508 
509     m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
510     if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
511         handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
512     handleTextBox();
513     return true;
514 }
515 
handleTextBox()516 void TextIterator::handleTextBox()
517 {
518     RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
519     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
520         m_textBox = 0;
521         return;
522     }
523     String str = renderer->text();
524     unsigned start = m_offset;
525     unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
526     while (m_textBox) {
527         unsigned textBoxStart = m_textBox->start();
528         unsigned runStart = max(textBoxStart, start);
529 
530         // Check for collapsed space at the start of this run.
531         InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
532         bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
533             || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
534         if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
535             if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
536                 unsigned spaceRunStart = runStart - 1;
537                 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
538                     --spaceRunStart;
539                 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
540             } else
541                 emitCharacter(' ', m_node, 0, runStart, runStart);
542             return;
543         }
544         unsigned textBoxEnd = textBoxStart + m_textBox->len();
545         unsigned runEnd = min(textBoxEnd, end);
546 
547         // Determine what the next text box will be, but don't advance yet
548         InlineTextBox* nextTextBox = 0;
549         if (renderer->containsReversedText()) {
550             if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
551                 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
552         } else
553             nextTextBox = m_textBox->nextTextBox();
554 
555         if (runStart < runEnd) {
556             // Handle either a single newline character (which becomes a space),
557             // or a run of characters that does not include a newline.
558             // This effectively translates newlines to spaces without copying the text.
559             if (str[runStart] == '\n') {
560                 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
561                 m_offset = runStart + 1;
562             } else {
563                 size_t subrunEnd = str.find('\n', runStart);
564                 if (subrunEnd == notFound || subrunEnd > runEnd)
565                     subrunEnd = runEnd;
566 
567                 m_offset = subrunEnd;
568                 emitText(m_node, renderer, runStart, subrunEnd);
569             }
570 
571             // If we are doing a subrun that doesn't go to the end of the text box,
572             // come back again to finish handling this text box; don't advance to the next one.
573             if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
574                 return;
575 
576             // Advance and return
577             unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
578             if (nextRunStart > runEnd)
579                 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
580             m_textBox = nextTextBox;
581             if (renderer->containsReversedText())
582                 ++m_sortedTextBoxesPosition;
583             return;
584         }
585         // Advance and continue
586         m_textBox = nextTextBox;
587         if (renderer->containsReversedText())
588             ++m_sortedTextBoxesPosition;
589     }
590     if (!m_textBox && m_remainingTextBox) {
591         m_textBox = m_remainingTextBox;
592         m_remainingTextBox = 0;
593         m_firstLetterText = 0;
594         m_offset = 0;
595         handleTextBox();
596     }
597 }
598 
handleTextNodeFirstLetter(RenderTextFragment * renderer)599 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
600 {
601     if (renderer->firstLetter()) {
602         RenderObject* r = renderer->firstLetter();
603         if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
604             return;
605         for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) {
606             if (currChild->isText()) {
607                 RenderText* firstLetter = toRenderText(currChild);
608                 m_handledFirstLetter = true;
609                 m_remainingTextBox = m_textBox;
610                 m_textBox = firstLetter->firstTextBox();
611                 m_firstLetterText = firstLetter;
612                 return;
613             }
614         }
615     }
616     m_handledFirstLetter = true;
617 }
618 
handleReplacedElement()619 bool TextIterator::handleReplacedElement()
620 {
621     if (m_fullyClippedStack.top())
622         return false;
623 
624     RenderObject* renderer = m_node->renderer();
625     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
626         return false;
627 
628     if (m_lastTextNodeEndedWithCollapsedSpace) {
629         emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
630         return false;
631     }
632 
633     if (m_entersTextControls && renderer->isTextControl()) {
634         if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
635             m_node = innerTextElement->shadowTreeRootNode();
636             pushFullyClippedState(m_fullyClippedStack, m_node);
637             m_offset = 0;
638             return false;
639         }
640     }
641 
642     m_hasEmitted = true;
643 
644     if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
645         emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
646         return true;
647     }
648 
649     if (m_emitsCharactersBetweenAllVisiblePositions) {
650         // We want replaced elements to behave like punctuation for boundary
651         // finding, and to simply take up space for the selection preservation
652         // code in moveParagraphs, so we use a comma.
653         emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
654         return true;
655     }
656 
657     m_positionNode = m_node->parentNode();
658     m_positionOffsetBaseNode = m_node;
659     m_positionStartOffset = 0;
660     m_positionEndOffset = 1;
661 
662     m_textCharacters = 0;
663     m_textLength = 0;
664 
665     m_lastCharacter = 0;
666 
667     return true;
668 }
669 
hasVisibleTextNode(RenderText * renderer)670 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
671 {
672     if (renderer->style()->visibility() == VISIBLE)
673         return true;
674     if (renderer->isTextFragment()) {
675         RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
676         if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
677             return true;
678     }
679     return false;
680 }
681 
shouldEmitTabBeforeNode(Node * node)682 static bool shouldEmitTabBeforeNode(Node* node)
683 {
684     RenderObject* r = node->renderer();
685 
686     // Table cells are delimited by tabs.
687     if (!r || !isTableCell(node))
688         return false;
689 
690     // Want a tab before every cell other than the first one
691     RenderTableCell* rc = toRenderTableCell(r);
692     RenderTable* t = rc->table();
693     return t && (t->cellBefore(rc) || t->cellAbove(rc));
694 }
695 
shouldEmitNewlineForNode(Node * node)696 static bool shouldEmitNewlineForNode(Node* node)
697 {
698     // br elements are represented by a single newline.
699     RenderObject* r = node->renderer();
700     if (!r)
701         return node->hasTagName(brTag);
702 
703     return r->isBR();
704 }
705 
shouldEmitNewlinesBeforeAndAfterNode(Node * node)706 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
707 {
708     // Block flow (versus inline flow) is represented by having
709     // a newline both before and after the element.
710     RenderObject* r = node->renderer();
711     if (!r) {
712         return (node->hasTagName(blockquoteTag)
713                 || node->hasTagName(ddTag)
714                 || node->hasTagName(divTag)
715                 || node->hasTagName(dlTag)
716                 || node->hasTagName(dtTag)
717                 || node->hasTagName(h1Tag)
718                 || node->hasTagName(h2Tag)
719                 || node->hasTagName(h3Tag)
720                 || node->hasTagName(h4Tag)
721                 || node->hasTagName(h5Tag)
722                 || node->hasTagName(h6Tag)
723                 || node->hasTagName(hrTag)
724                 || node->hasTagName(liTag)
725                 || node->hasTagName(listingTag)
726                 || node->hasTagName(olTag)
727                 || node->hasTagName(pTag)
728                 || node->hasTagName(preTag)
729                 || node->hasTagName(trTag)
730                 || node->hasTagName(ulTag));
731     }
732 
733     // Need to make an exception for table cells, because they are blocks, but we
734     // want them tab-delimited rather than having newlines before and after.
735     if (isTableCell(node))
736         return false;
737 
738     // Need to make an exception for table row elements, because they are neither
739     // "inline" or "RenderBlock", but we want newlines for them.
740     if (r->isTableRow()) {
741         RenderTable* t = toRenderTableRow(r)->table();
742         if (t && !t->isInline())
743             return true;
744     }
745 
746     return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
747 }
748 
shouldEmitNewlineAfterNode(Node * node)749 static bool shouldEmitNewlineAfterNode(Node* node)
750 {
751     // FIXME: It should be better but slower to create a VisiblePosition here.
752     if (!shouldEmitNewlinesBeforeAndAfterNode(node))
753         return false;
754     // Check if this is the very last renderer in the document.
755     // If so, then we should not emit a newline.
756     while ((node = node->traverseNextSibling()))
757         if (node->renderer())
758             return true;
759     return false;
760 }
761 
shouldEmitNewlineBeforeNode(Node * node)762 static bool shouldEmitNewlineBeforeNode(Node* node)
763 {
764     return shouldEmitNewlinesBeforeAndAfterNode(node);
765 }
766 
shouldEmitExtraNewlineForNode(Node * node)767 static bool shouldEmitExtraNewlineForNode(Node* node)
768 {
769     // When there is a significant collapsed bottom margin, emit an extra
770     // newline for a more realistic result.  We end up getting the right
771     // result even without margin collapsing. For example: <div><p>text</p></div>
772     // will work right even if both the <div> and the <p> have bottom margins.
773     RenderObject* r = node->renderer();
774     if (!r || !r->isBox())
775         return false;
776 
777     // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
778     // not to do this at all
779     if (node->hasTagName(h1Tag)
780         || node->hasTagName(h2Tag)
781         || node->hasTagName(h3Tag)
782         || node->hasTagName(h4Tag)
783         || node->hasTagName(h5Tag)
784         || node->hasTagName(h6Tag)
785         || node->hasTagName(pTag)) {
786         RenderStyle* style = r->style();
787         if (style) {
788             int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
789             int fontSize = style->fontDescription().computedPixelSize();
790             if (bottomMargin * 2 >= fontSize)
791                 return true;
792         }
793     }
794 
795     return false;
796 }
797 
collapsedSpaceLength(RenderText * renderer,int textEnd)798 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
799 {
800     const UChar* characters = renderer->text()->characters();
801     int length = renderer->text()->length();
802     for (int i = textEnd; i < length; ++i) {
803         if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
804             return i - textEnd;
805     }
806 
807     return length - textEnd;
808 }
809 
maxOffsetIncludingCollapsedSpaces(Node * node)810 static int maxOffsetIncludingCollapsedSpaces(Node* node)
811 {
812     int offset = caretMaxOffset(node);
813 
814     if (node->renderer() && node->renderer()->isText())
815         offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
816 
817     return offset;
818 }
819 
820 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
shouldRepresentNodeOffsetZero()821 bool TextIterator::shouldRepresentNodeOffsetZero()
822 {
823     if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
824         return true;
825 
826     // Leave element positioned flush with start of a paragraph
827     // (e.g. do not insert tab before a table cell at the start of a paragraph)
828     if (m_lastCharacter == '\n')
829         return false;
830 
831     // Otherwise, show the position if we have emitted any characters
832     if (m_hasEmitted)
833         return true;
834 
835     // We've not emitted anything yet. Generally, there is no need for any positioning then.
836     // The only exception is when the element is visually not in the same line as
837     // the start of the range (e.g. the range starts at the end of the previous paragraph).
838     // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
839     // make quicker checks to possibly avoid that. Another check that we could make is
840     // is whether the inline vs block flow changed since the previous visible element.
841     // I think we're already in a special enough case that that won't be needed, tho.
842 
843     // No character needed if this is the first node in the range.
844     if (m_node == m_startContainer)
845         return false;
846 
847     // If we are outside the start container's subtree, assume we need to emit.
848     // FIXME: m_startContainer could be an inline block
849     if (!m_node->isDescendantOf(m_startContainer))
850         return true;
851 
852     // If we started as m_startContainer offset 0 and the current node is a descendant of
853     // the start container, we already had enough context to correctly decide whether to
854     // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
855     // so don't second guess that now.
856     // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
857     // immaterial since we likely would have already emitted something by now.
858     if (m_startOffset == 0)
859         return false;
860 
861     // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
862     // Additionally, if the range we are iterating over contains huge sections of unrendered content,
863     // we would create VisiblePositions on every call to this function without this check.
864     if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
865         return false;
866 
867     // The startPos.isNotNull() check is needed because the start could be before the body,
868     // and in that case we'll get null. We don't want to put in newlines at the start in that case.
869     // The currPos.isNotNull() check is needed because positions in non-HTML content
870     // (like SVG) do not have visible positions, and we don't want to emit for them either.
871     VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
872     VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
873     return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
874 }
875 
shouldEmitSpaceBeforeAndAfterNode(Node * node)876 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
877 {
878     return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
879 }
880 
representNodeOffsetZero()881 void TextIterator::representNodeOffsetZero()
882 {
883     // Emit a character to show the positioning of m_node.
884 
885     // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
886     // create VisiblePositions, which is expensive.  So, we perform the inexpensive checks
887     // on m_node to see if it necessitates emitting a character first and will early return
888     // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
889     if (shouldEmitTabBeforeNode(m_node)) {
890         if (shouldRepresentNodeOffsetZero())
891             emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
892     } else if (shouldEmitNewlineBeforeNode(m_node)) {
893         if (shouldRepresentNodeOffsetZero())
894             emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
895     } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
896         if (shouldRepresentNodeOffsetZero())
897             emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
898     }
899 }
900 
handleNonTextNode()901 bool TextIterator::handleNonTextNode()
902 {
903     if (shouldEmitNewlineForNode(m_node))
904         emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
905     else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
906         emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
907     else
908         representNodeOffsetZero();
909 
910     return true;
911 }
912 
exitNode()913 void TextIterator::exitNode()
914 {
915     // prevent emitting a newline when exiting a collapsed block at beginning of the range
916     // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
917     // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
918     // therefore look like a blank line.
919     if (!m_hasEmitted)
920         return;
921 
922     // Emit with a position *inside* m_node, after m_node's contents, in
923     // case it is a block, because the run should start where the
924     // emitted character is positioned visually.
925     Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
926     // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
927     // the logic in _web_attributedStringFromRange match.  We'll get that for free when we switch to use
928     // TextIterator in _web_attributedStringFromRange.
929     // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
930     if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
931         // use extra newline to represent margin bottom, as needed
932         bool addNewline = shouldEmitExtraNewlineForNode(m_node);
933 
934         // FIXME: We need to emit a '\n' as we leave an empty block(s) that
935         // contain a VisiblePosition when doing selection preservation.
936         if (m_lastCharacter != '\n') {
937             // insert a newline with a position following this block's contents.
938             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
939             // remember whether to later add a newline for the current node
940             ASSERT(!m_needsAnotherNewline);
941             m_needsAnotherNewline = addNewline;
942         } else if (addNewline)
943             // insert a newline with a position following this block's contents.
944             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
945     }
946 
947     // If nothing was emitted, see if we need to emit a space.
948     if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
949         emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
950 }
951 
emitCharacter(UChar c,Node * textNode,Node * offsetBaseNode,int textStartOffset,int textEndOffset)952 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
953 {
954     m_hasEmitted = true;
955 
956     // remember information with which to construct the TextIterator::range()
957     // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
958     m_positionNode = textNode;
959     m_positionOffsetBaseNode = offsetBaseNode;
960     m_positionStartOffset = textStartOffset;
961     m_positionEndOffset = textEndOffset;
962 
963     // remember information with which to construct the TextIterator::characters() and length()
964     m_singleCharacterBuffer = c;
965     m_textCharacters = &m_singleCharacterBuffer;
966     m_textLength = 1;
967 
968     // remember some iteration state
969     m_lastTextNodeEndedWithCollapsedSpace = false;
970     m_lastCharacter = c;
971 }
972 
emitText(Node * textNode,RenderObject * renderObject,int textStartOffset,int textEndOffset)973 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
974 {
975     RenderText* renderer = toRenderText(renderObject);
976     m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
977     ASSERT(m_text.characters());
978 
979     m_positionNode = textNode;
980     m_positionOffsetBaseNode = 0;
981     m_positionStartOffset = textStartOffset;
982     m_positionEndOffset = textEndOffset;
983     m_textCharacters = m_text.characters() + textStartOffset;
984     m_textLength = textEndOffset - textStartOffset;
985     m_lastCharacter = m_text[textEndOffset - 1];
986 
987     m_lastTextNodeEndedWithCollapsedSpace = false;
988     m_hasEmitted = true;
989 }
990 
emitText(Node * textNode,int textStartOffset,int textEndOffset)991 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
992 {
993     emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
994 }
995 
range() const996 PassRefPtr<Range> TextIterator::range() const
997 {
998     // use the current run information, if we have it
999     if (m_positionNode) {
1000         if (m_positionOffsetBaseNode) {
1001             int index = m_positionOffsetBaseNode->nodeIndex();
1002             m_positionStartOffset += index;
1003             m_positionEndOffset += index;
1004             m_positionOffsetBaseNode = 0;
1005         }
1006         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1007     }
1008 
1009     // otherwise, return the end of the overall range we were given
1010     if (m_endContainer)
1011         return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
1012 
1013     return 0;
1014 }
1015 
node() const1016 Node* TextIterator::node() const
1017 {
1018     RefPtr<Range> textRange = range();
1019     if (!textRange)
1020         return 0;
1021 
1022     Node* node = textRange->startContainer();
1023     if (!node)
1024         return 0;
1025     if (node->offsetInCharacters())
1026         return node;
1027 
1028     return node->childNode(textRange->startOffset());
1029 }
1030 
1031 // --------
1032 
SimplifiedBackwardsTextIterator()1033 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
1034     : m_behavior(TextIteratorDefaultBehavior)
1035     , m_node(0)
1036     , m_positionNode(0)
1037 {
1038 }
1039 
SimplifiedBackwardsTextIterator(const Range * r,TextIteratorBehavior behavior)1040 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
1041     : m_behavior(behavior)
1042     , m_node(0)
1043     , m_positionNode(0)
1044 {
1045     ASSERT(m_behavior == TextIteratorDefaultBehavior);
1046 
1047     if (!r)
1048         return;
1049 
1050     Node* startNode = r->startContainer();
1051     if (!startNode)
1052         return;
1053     Node* endNode = r->endContainer();
1054     int startOffset = r->startOffset();
1055     int endOffset = r->endOffset();
1056 
1057     if (!startNode->offsetInCharacters()) {
1058         if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
1059             startNode = startNode->childNode(startOffset);
1060             startOffset = 0;
1061         }
1062     }
1063     if (!endNode->offsetInCharacters()) {
1064         if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
1065             endNode = endNode->childNode(endOffset - 1);
1066             endOffset = lastOffsetInNode(endNode);
1067         }
1068     }
1069 
1070     m_node = endNode;
1071     setUpFullyClippedStack(m_fullyClippedStack, m_node);
1072     m_offset = endOffset;
1073     m_handledNode = false;
1074     m_handledChildren = endOffset == 0;
1075 
1076     m_startNode = startNode;
1077     m_startOffset = startOffset;
1078     m_endNode = endNode;
1079     m_endOffset = endOffset;
1080 
1081 #ifndef NDEBUG
1082     // Need this just because of the assert.
1083     m_positionNode = endNode;
1084 #endif
1085 
1086     m_lastTextNode = 0;
1087     m_lastCharacter = '\n';
1088 
1089     m_havePassedStartNode = false;
1090 
1091     advance();
1092 }
1093 
advance()1094 void SimplifiedBackwardsTextIterator::advance()
1095 {
1096     ASSERT(m_positionNode);
1097 
1098     m_positionNode = 0;
1099     m_textLength = 0;
1100 
1101     while (m_node && !m_havePassedStartNode) {
1102         // Don't handle node if we start iterating at [node, 0].
1103         if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
1104             RenderObject* renderer = m_node->renderer();
1105             if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
1106                 // FIXME: What about CDATA_SECTION_NODE?
1107                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1108                     m_handledNode = handleTextNode();
1109             } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1110                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1111                     m_handledNode = handleReplacedElement();
1112             } else
1113                 m_handledNode = handleNonTextNode();
1114             if (m_positionNode)
1115                 return;
1116         }
1117 
1118         if (!m_handledChildren && m_node->hasChildNodes()) {
1119             m_node = m_node->lastChild();
1120             pushFullyClippedState(m_fullyClippedStack, m_node);
1121         } else {
1122             // Exit empty containers as we pass over them or containers
1123             // where [container, 0] is where we started iterating.
1124             if (!m_handledNode
1125                     && canHaveChildrenForEditing(m_node)
1126                     && m_node->parentNode()
1127                     && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
1128                 exitNode();
1129                 if (m_positionNode) {
1130                     m_handledNode = true;
1131                     m_handledChildren = true;
1132                     return;
1133                 }
1134             }
1135 
1136             // Exit all other containers.
1137             while (!m_node->previousSibling()) {
1138                 if (!advanceRespectingRange(m_node->parentOrHostNode()))
1139                     break;
1140                 m_fullyClippedStack.pop();
1141                 exitNode();
1142                 if (m_positionNode) {
1143                     m_handledNode = true;
1144                     m_handledChildren = true;
1145                     return;
1146                 }
1147             }
1148 
1149             m_fullyClippedStack.pop();
1150             if (advanceRespectingRange(m_node->previousSibling()))
1151                 pushFullyClippedState(m_fullyClippedStack, m_node);
1152             else
1153                 m_node = 0;
1154         }
1155 
1156         // For the purpose of word boundary detection,
1157         // we should iterate all visible text and trailing (collapsed) whitespaces.
1158         m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
1159         m_handledNode = false;
1160         m_handledChildren = false;
1161 
1162         if (m_positionNode)
1163             return;
1164     }
1165 }
1166 
handleTextNode()1167 bool SimplifiedBackwardsTextIterator::handleTextNode()
1168 {
1169     m_lastTextNode = m_node;
1170 
1171     RenderText* renderer = toRenderText(m_node->renderer());
1172     String str = renderer->text();
1173 
1174     if (!renderer->firstTextBox() && str.length() > 0)
1175         return true;
1176 
1177     m_positionEndOffset = m_offset;
1178 
1179     m_offset = (m_node == m_startNode) ? m_startOffset : 0;
1180     m_positionNode = m_node;
1181     m_positionStartOffset = m_offset;
1182     m_textLength = m_positionEndOffset - m_positionStartOffset;
1183     m_textCharacters = str.characters() + m_positionStartOffset;
1184 
1185     m_lastCharacter = str[m_positionEndOffset - 1];
1186 
1187     return true;
1188 }
1189 
handleReplacedElement()1190 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1191 {
1192     unsigned index = m_node->nodeIndex();
1193     // We want replaced elements to behave like punctuation for boundary
1194     // finding, and to simply take up space for the selection preservation
1195     // code in moveParagraphs, so we use a comma.  Unconditionally emit
1196     // here because this iterator is only used for boundary finding.
1197     emitCharacter(',', m_node->parentNode(), index, index + 1);
1198     return true;
1199 }
1200 
handleNonTextNode()1201 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1202 {
1203     // We can use a linefeed in place of a tab because this simple iterator is only used to
1204     // find boundaries, not actual content.  A linefeed breaks words, sentences, and paragraphs.
1205     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1206         unsigned index = m_node->nodeIndex();
1207         // The start of this emitted range is wrong. Ensuring correctness would require
1208         // VisiblePositions and so would be slow. previousBoundary expects this.
1209         emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
1210     }
1211     return true;
1212 }
1213 
exitNode()1214 void SimplifiedBackwardsTextIterator::exitNode()
1215 {
1216     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1217         // The start of this emitted range is wrong. Ensuring correctness would require
1218         // VisiblePositions and so would be slow. previousBoundary expects this.
1219         emitCharacter('\n', m_node, 0, 0);
1220     }
1221 }
1222 
emitCharacter(UChar c,Node * node,int startOffset,int endOffset)1223 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
1224 {
1225     m_singleCharacterBuffer = c;
1226     m_positionNode = node;
1227     m_positionStartOffset = startOffset;
1228     m_positionEndOffset = endOffset;
1229     m_textCharacters = &m_singleCharacterBuffer;
1230     m_textLength = 1;
1231     m_lastCharacter = c;
1232 }
1233 
advanceRespectingRange(Node * next)1234 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1235 {
1236     if (!next)
1237         return false;
1238     m_havePassedStartNode |= m_node == m_startNode;
1239     if (m_havePassedStartNode)
1240         return false;
1241     m_node = next;
1242     return true;
1243 }
1244 
range() const1245 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
1246 {
1247     if (m_positionNode)
1248         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1249 
1250     return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
1251 }
1252 
1253 // --------
1254 
CharacterIterator()1255 CharacterIterator::CharacterIterator()
1256     : m_offset(0)
1257     , m_runOffset(0)
1258     , m_atBreak(true)
1259 {
1260 }
1261 
CharacterIterator(const Range * r,TextIteratorBehavior behavior)1262 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
1263     : m_offset(0)
1264     , m_runOffset(0)
1265     , m_atBreak(true)
1266     , m_textIterator(r, behavior)
1267 {
1268     while (!atEnd() && m_textIterator.length() == 0)
1269         m_textIterator.advance();
1270 }
1271 
range() const1272 PassRefPtr<Range> CharacterIterator::range() const
1273 {
1274     RefPtr<Range> r = m_textIterator.range();
1275     if (!m_textIterator.atEnd()) {
1276         if (m_textIterator.length() <= 1) {
1277             ASSERT(m_runOffset == 0);
1278         } else {
1279             Node* n = r->startContainer();
1280             ASSERT(n == r->endContainer());
1281             int offset = r->startOffset() + m_runOffset;
1282             ExceptionCode ec = 0;
1283             r->setStart(n, offset, ec);
1284             r->setEnd(n, offset + 1, ec);
1285             ASSERT(!ec);
1286         }
1287     }
1288     return r.release();
1289 }
1290 
advance(int count)1291 void CharacterIterator::advance(int count)
1292 {
1293     if (count <= 0) {
1294         ASSERT(count == 0);
1295         return;
1296     }
1297 
1298     m_atBreak = false;
1299 
1300     // easy if there is enough left in the current m_textIterator run
1301     int remaining = m_textIterator.length() - m_runOffset;
1302     if (count < remaining) {
1303         m_runOffset += count;
1304         m_offset += count;
1305         return;
1306     }
1307 
1308     // exhaust the current m_textIterator run
1309     count -= remaining;
1310     m_offset += remaining;
1311 
1312     // move to a subsequent m_textIterator run
1313     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1314         int runLength = m_textIterator.length();
1315         if (runLength == 0)
1316             m_atBreak = true;
1317         else {
1318             // see whether this is m_textIterator to use
1319             if (count < runLength) {
1320                 m_runOffset = count;
1321                 m_offset += count;
1322                 return;
1323             }
1324 
1325             // exhaust this m_textIterator run
1326             count -= runLength;
1327             m_offset += runLength;
1328         }
1329     }
1330 
1331     // ran to the end of the m_textIterator... no more runs left
1332     m_atBreak = true;
1333     m_runOffset = 0;
1334 }
1335 
string(int numChars)1336 String CharacterIterator::string(int numChars)
1337 {
1338     Vector<UChar> result;
1339     result.reserveInitialCapacity(numChars);
1340     while (numChars > 0 && !atEnd()) {
1341         int runSize = min(numChars, length());
1342         result.append(characters(), runSize);
1343         numChars -= runSize;
1344         advance(runSize);
1345     }
1346     return String::adopt(result);
1347 }
1348 
characterSubrange(CharacterIterator & it,int offset,int length)1349 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1350 {
1351     it.advance(offset);
1352     RefPtr<Range> start = it.range();
1353 
1354     if (length > 1)
1355         it.advance(length - 1);
1356     RefPtr<Range> end = it.range();
1357 
1358     return Range::create(start->startContainer()->document(),
1359         start->startContainer(), start->startOffset(),
1360         end->endContainer(), end->endOffset());
1361 }
1362 
BackwardsCharacterIterator()1363 BackwardsCharacterIterator::BackwardsCharacterIterator()
1364     : m_offset(0)
1365     , m_runOffset(0)
1366     , m_atBreak(true)
1367 {
1368 }
1369 
BackwardsCharacterIterator(const Range * range,TextIteratorBehavior behavior)1370 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
1371     : m_offset(0)
1372     , m_runOffset(0)
1373     , m_atBreak(true)
1374     , m_textIterator(range, behavior)
1375 {
1376     while (!atEnd() && !m_textIterator.length())
1377         m_textIterator.advance();
1378 }
1379 
range() const1380 PassRefPtr<Range> BackwardsCharacterIterator::range() const
1381 {
1382     RefPtr<Range> r = m_textIterator.range();
1383     if (!m_textIterator.atEnd()) {
1384         if (m_textIterator.length() <= 1)
1385             ASSERT(m_runOffset == 0);
1386         else {
1387             Node* n = r->startContainer();
1388             ASSERT(n == r->endContainer());
1389             int offset = r->endOffset() - m_runOffset;
1390             ExceptionCode ec = 0;
1391             r->setStart(n, offset - 1, ec);
1392             r->setEnd(n, offset, ec);
1393             ASSERT(!ec);
1394         }
1395     }
1396     return r.release();
1397 }
1398 
advance(int count)1399 void BackwardsCharacterIterator::advance(int count)
1400 {
1401     if (count <= 0) {
1402         ASSERT(!count);
1403         return;
1404     }
1405 
1406     m_atBreak = false;
1407 
1408     int remaining = m_textIterator.length() - m_runOffset;
1409     if (count < remaining) {
1410         m_runOffset += count;
1411         m_offset += count;
1412         return;
1413     }
1414 
1415     count -= remaining;
1416     m_offset += remaining;
1417 
1418     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1419         int runLength = m_textIterator.length();
1420         if (runLength == 0)
1421             m_atBreak = true;
1422         else {
1423             if (count < runLength) {
1424                 m_runOffset = count;
1425                 m_offset += count;
1426                 return;
1427             }
1428 
1429             count -= runLength;
1430             m_offset += runLength;
1431         }
1432     }
1433 
1434     m_atBreak = true;
1435     m_runOffset = 0;
1436 }
1437 
1438 // --------
1439 
WordAwareIterator()1440 WordAwareIterator::WordAwareIterator()
1441     : m_previousText(0)
1442     , m_didLookAhead(false)
1443 {
1444 }
1445 
WordAwareIterator(const Range * r)1446 WordAwareIterator::WordAwareIterator(const Range* r)
1447     : m_previousText(0)
1448     , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1449     , m_textIterator(r)
1450 {
1451     advance(); // get in position over the first chunk of text
1452 }
1453 
~WordAwareIterator()1454 WordAwareIterator::~WordAwareIterator()
1455 {
1456 }
1457 
1458 // We're always in one of these modes:
1459 // - The current chunk in the text iterator is our current chunk
1460 //      (typically its a piece of whitespace, or text that ended with whitespace)
1461 // - The previous chunk in the text iterator is our current chunk
1462 //      (we looked ahead to the next chunk and found a word boundary)
1463 // - We built up our own chunk of text from many chunks from the text iterator
1464 
1465 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1466 
advance()1467 void WordAwareIterator::advance()
1468 {
1469     m_previousText = 0;
1470     m_buffer.clear();      // toss any old buffer we built up
1471 
1472     // If last time we did a look-ahead, start with that looked-ahead chunk now
1473     if (!m_didLookAhead) {
1474         ASSERT(!m_textIterator.atEnd());
1475         m_textIterator.advance();
1476     }
1477     m_didLookAhead = false;
1478 
1479     // Go to next non-empty chunk
1480     while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1481         m_textIterator.advance();
1482     m_range = m_textIterator.range();
1483 
1484     if (m_textIterator.atEnd())
1485         return;
1486 
1487     while (1) {
1488         // If this chunk ends in whitespace we can just use it as our chunk.
1489         if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1490             return;
1491 
1492         // If this is the first chunk that failed, save it in previousText before look ahead
1493         if (m_buffer.isEmpty()) {
1494             m_previousText = m_textIterator.characters();
1495             m_previousLength = m_textIterator.length();
1496         }
1497 
1498         // Look ahead to next chunk.  If it is whitespace or a break, we can use the previous stuff
1499         m_textIterator.advance();
1500         if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1501             m_didLookAhead = true;
1502             return;
1503         }
1504 
1505         if (m_buffer.isEmpty()) {
1506             // Start gobbling chunks until we get to a suitable stopping point
1507             m_buffer.append(m_previousText, m_previousLength);
1508             m_previousText = 0;
1509         }
1510         m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1511         int exception = 0;
1512         m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1513     }
1514 }
1515 
length() const1516 int WordAwareIterator::length() const
1517 {
1518     if (!m_buffer.isEmpty())
1519         return m_buffer.size();
1520     if (m_previousText)
1521         return m_previousLength;
1522     return m_textIterator.length();
1523 }
1524 
characters() const1525 const UChar* WordAwareIterator::characters() const
1526 {
1527     if (!m_buffer.isEmpty())
1528         return m_buffer.data();
1529     if (m_previousText)
1530         return m_previousText;
1531     return m_textIterator.characters();
1532 }
1533 
1534 // --------
1535 
foldQuoteMarkOrSoftHyphen(UChar c)1536 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
1537 {
1538     switch (c) {
1539         case hebrewPunctuationGershayim:
1540         case leftDoubleQuotationMark:
1541         case rightDoubleQuotationMark:
1542             return '"';
1543         case hebrewPunctuationGeresh:
1544         case leftSingleQuotationMark:
1545         case rightSingleQuotationMark:
1546             return '\'';
1547         case softHyphen:
1548             // Replace soft hyphen with an ignorable character so that their presence or absence will
1549             // not affect string comparison.
1550             return 0;
1551         default:
1552             return c;
1553     }
1554 }
1555 
foldQuoteMarksAndSoftHyphens(String & s)1556 static inline void foldQuoteMarksAndSoftHyphens(String& s)
1557 {
1558     s.replace(hebrewPunctuationGeresh, '\'');
1559     s.replace(hebrewPunctuationGershayim, '"');
1560     s.replace(leftDoubleQuotationMark, '"');
1561     s.replace(leftSingleQuotationMark, '\'');
1562     s.replace(rightDoubleQuotationMark, '"');
1563     s.replace(rightSingleQuotationMark, '\'');
1564     // Replace soft hyphen with an ignorable character so that their presence or absence will
1565     // not affect string comparison.
1566     s.replace(softHyphen, 0);
1567 }
1568 
1569 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1570 
foldQuoteMarksAndSoftHyphens(UChar * data,size_t length)1571 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
1572 {
1573     for (size_t i = 0; i < length; ++i)
1574         data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
1575 }
1576 
1577 static const size_t minimumSearchBufferSize = 8192;
1578 
1579 #ifndef NDEBUG
1580 static bool searcherInUse;
1581 #endif
1582 
createSearcher()1583 static UStringSearch* createSearcher()
1584 {
1585     // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1586     // but it doesn't matter exactly what it is, since we don't perform any searches
1587     // without setting both the pattern and the text.
1588     UErrorCode status = U_ZERO_ERROR;
1589     UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
1590     ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1591     return searcher;
1592 }
1593 
searcher()1594 static UStringSearch* searcher()
1595 {
1596     static UStringSearch* searcher = createSearcher();
1597     return searcher;
1598 }
1599 
lockSearcher()1600 static inline void lockSearcher()
1601 {
1602 #ifndef NDEBUG
1603     ASSERT(!searcherInUse);
1604     searcherInUse = true;
1605 #endif
1606 }
1607 
unlockSearcher()1608 static inline void unlockSearcher()
1609 {
1610 #ifndef NDEBUG
1611     ASSERT(searcherInUse);
1612     searcherInUse = false;
1613 #endif
1614 }
1615 
1616 // ICU's search ignores the distinction between small kana letters and ones
1617 // that are not small, and also characters that differ only in the voicing
1618 // marks when considering only primary collation strength diffrences.
1619 // This is not helpful for end users, since these differences make words
1620 // distinct, so for our purposes we need these to be considered.
1621 // The Unicode folks do not think the collation algorithm should be
1622 // changed. To work around this, we would like to tailor the ICU searcher,
1623 // but we can't get that to work yet. So instead, we check for cases where
1624 // these differences occur, and skip those matches.
1625 
1626 // We refer to the above technique as the "kana workaround". The next few
1627 // functions are helper functinos for the kana workaround.
1628 
isKanaLetter(UChar character)1629 static inline bool isKanaLetter(UChar character)
1630 {
1631     // Hiragana letters.
1632     if (character >= 0x3041 && character <= 0x3096)
1633         return true;
1634 
1635     // Katakana letters.
1636     if (character >= 0x30A1 && character <= 0x30FA)
1637         return true;
1638     if (character >= 0x31F0 && character <= 0x31FF)
1639         return true;
1640 
1641     // Halfwidth katakana letters.
1642     if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1643         return true;
1644 
1645     return false;
1646 }
1647 
isSmallKanaLetter(UChar character)1648 static inline bool isSmallKanaLetter(UChar character)
1649 {
1650     ASSERT(isKanaLetter(character));
1651 
1652     switch (character) {
1653     case 0x3041: // HIRAGANA LETTER SMALL A
1654     case 0x3043: // HIRAGANA LETTER SMALL I
1655     case 0x3045: // HIRAGANA LETTER SMALL U
1656     case 0x3047: // HIRAGANA LETTER SMALL E
1657     case 0x3049: // HIRAGANA LETTER SMALL O
1658     case 0x3063: // HIRAGANA LETTER SMALL TU
1659     case 0x3083: // HIRAGANA LETTER SMALL YA
1660     case 0x3085: // HIRAGANA LETTER SMALL YU
1661     case 0x3087: // HIRAGANA LETTER SMALL YO
1662     case 0x308E: // HIRAGANA LETTER SMALL WA
1663     case 0x3095: // HIRAGANA LETTER SMALL KA
1664     case 0x3096: // HIRAGANA LETTER SMALL KE
1665     case 0x30A1: // KATAKANA LETTER SMALL A
1666     case 0x30A3: // KATAKANA LETTER SMALL I
1667     case 0x30A5: // KATAKANA LETTER SMALL U
1668     case 0x30A7: // KATAKANA LETTER SMALL E
1669     case 0x30A9: // KATAKANA LETTER SMALL O
1670     case 0x30C3: // KATAKANA LETTER SMALL TU
1671     case 0x30E3: // KATAKANA LETTER SMALL YA
1672     case 0x30E5: // KATAKANA LETTER SMALL YU
1673     case 0x30E7: // KATAKANA LETTER SMALL YO
1674     case 0x30EE: // KATAKANA LETTER SMALL WA
1675     case 0x30F5: // KATAKANA LETTER SMALL KA
1676     case 0x30F6: // KATAKANA LETTER SMALL KE
1677     case 0x31F0: // KATAKANA LETTER SMALL KU
1678     case 0x31F1: // KATAKANA LETTER SMALL SI
1679     case 0x31F2: // KATAKANA LETTER SMALL SU
1680     case 0x31F3: // KATAKANA LETTER SMALL TO
1681     case 0x31F4: // KATAKANA LETTER SMALL NU
1682     case 0x31F5: // KATAKANA LETTER SMALL HA
1683     case 0x31F6: // KATAKANA LETTER SMALL HI
1684     case 0x31F7: // KATAKANA LETTER SMALL HU
1685     case 0x31F8: // KATAKANA LETTER SMALL HE
1686     case 0x31F9: // KATAKANA LETTER SMALL HO
1687     case 0x31FA: // KATAKANA LETTER SMALL MU
1688     case 0x31FB: // KATAKANA LETTER SMALL RA
1689     case 0x31FC: // KATAKANA LETTER SMALL RI
1690     case 0x31FD: // KATAKANA LETTER SMALL RU
1691     case 0x31FE: // KATAKANA LETTER SMALL RE
1692     case 0x31FF: // KATAKANA LETTER SMALL RO
1693     case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1694     case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1695     case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1696     case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1697     case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1698     case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1699     case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1700     case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1701     case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1702         return true;
1703     }
1704     return false;
1705 }
1706 
1707 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1708 
composedVoicedSoundMark(UChar character)1709 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1710 {
1711     ASSERT(isKanaLetter(character));
1712 
1713     switch (character) {
1714     case 0x304C: // HIRAGANA LETTER GA
1715     case 0x304E: // HIRAGANA LETTER GI
1716     case 0x3050: // HIRAGANA LETTER GU
1717     case 0x3052: // HIRAGANA LETTER GE
1718     case 0x3054: // HIRAGANA LETTER GO
1719     case 0x3056: // HIRAGANA LETTER ZA
1720     case 0x3058: // HIRAGANA LETTER ZI
1721     case 0x305A: // HIRAGANA LETTER ZU
1722     case 0x305C: // HIRAGANA LETTER ZE
1723     case 0x305E: // HIRAGANA LETTER ZO
1724     case 0x3060: // HIRAGANA LETTER DA
1725     case 0x3062: // HIRAGANA LETTER DI
1726     case 0x3065: // HIRAGANA LETTER DU
1727     case 0x3067: // HIRAGANA LETTER DE
1728     case 0x3069: // HIRAGANA LETTER DO
1729     case 0x3070: // HIRAGANA LETTER BA
1730     case 0x3073: // HIRAGANA LETTER BI
1731     case 0x3076: // HIRAGANA LETTER BU
1732     case 0x3079: // HIRAGANA LETTER BE
1733     case 0x307C: // HIRAGANA LETTER BO
1734     case 0x3094: // HIRAGANA LETTER VU
1735     case 0x30AC: // KATAKANA LETTER GA
1736     case 0x30AE: // KATAKANA LETTER GI
1737     case 0x30B0: // KATAKANA LETTER GU
1738     case 0x30B2: // KATAKANA LETTER GE
1739     case 0x30B4: // KATAKANA LETTER GO
1740     case 0x30B6: // KATAKANA LETTER ZA
1741     case 0x30B8: // KATAKANA LETTER ZI
1742     case 0x30BA: // KATAKANA LETTER ZU
1743     case 0x30BC: // KATAKANA LETTER ZE
1744     case 0x30BE: // KATAKANA LETTER ZO
1745     case 0x30C0: // KATAKANA LETTER DA
1746     case 0x30C2: // KATAKANA LETTER DI
1747     case 0x30C5: // KATAKANA LETTER DU
1748     case 0x30C7: // KATAKANA LETTER DE
1749     case 0x30C9: // KATAKANA LETTER DO
1750     case 0x30D0: // KATAKANA LETTER BA
1751     case 0x30D3: // KATAKANA LETTER BI
1752     case 0x30D6: // KATAKANA LETTER BU
1753     case 0x30D9: // KATAKANA LETTER BE
1754     case 0x30DC: // KATAKANA LETTER BO
1755     case 0x30F4: // KATAKANA LETTER VU
1756     case 0x30F7: // KATAKANA LETTER VA
1757     case 0x30F8: // KATAKANA LETTER VI
1758     case 0x30F9: // KATAKANA LETTER VE
1759     case 0x30FA: // KATAKANA LETTER VO
1760         return VoicedSoundMark;
1761     case 0x3071: // HIRAGANA LETTER PA
1762     case 0x3074: // HIRAGANA LETTER PI
1763     case 0x3077: // HIRAGANA LETTER PU
1764     case 0x307A: // HIRAGANA LETTER PE
1765     case 0x307D: // HIRAGANA LETTER PO
1766     case 0x30D1: // KATAKANA LETTER PA
1767     case 0x30D4: // KATAKANA LETTER PI
1768     case 0x30D7: // KATAKANA LETTER PU
1769     case 0x30DA: // KATAKANA LETTER PE
1770     case 0x30DD: // KATAKANA LETTER PO
1771         return SemiVoicedSoundMark;
1772     }
1773     return NoVoicedSoundMark;
1774 }
1775 
isCombiningVoicedSoundMark(UChar character)1776 static inline bool isCombiningVoicedSoundMark(UChar character)
1777 {
1778     switch (character) {
1779     case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1780     case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1781         return true;
1782     }
1783     return false;
1784 }
1785 
containsKanaLetters(const String & pattern)1786 static inline bool containsKanaLetters(const String& pattern)
1787 {
1788     const UChar* characters = pattern.characters();
1789     unsigned length = pattern.length();
1790     for (unsigned i = 0; i < length; ++i) {
1791         if (isKanaLetter(characters[i]))
1792             return true;
1793     }
1794     return false;
1795 }
1796 
normalizeCharacters(const UChar * characters,unsigned length,Vector<UChar> & buffer)1797 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1798 {
1799     ASSERT(length);
1800 
1801     buffer.resize(length);
1802 
1803     UErrorCode status = U_ZERO_ERROR;
1804     size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1805     ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1806     ASSERT(bufferSize);
1807 
1808     buffer.resize(bufferSize);
1809 
1810     if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1811         return;
1812 
1813     status = U_ZERO_ERROR;
1814     unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1815     ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1816 }
1817 
isNonLatin1Separator(UChar32 character)1818 static bool isNonLatin1Separator(UChar32 character)
1819 {
1820     ASSERT_ARG(character, character >= 256);
1821 
1822     return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1823 }
1824 
isSeparator(UChar32 character)1825 static inline bool isSeparator(UChar32 character)
1826 {
1827     static const bool latin1SeparatorTable[256] = {
1828         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1829         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1830         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1831         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, //                         : ; < = > ?
1832         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   @
1833         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, //                         [ \ ] ^ _
1834         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   `
1835         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, //                           { | } ~
1836         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1837         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1838         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1839         1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1840         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1841         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1842         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1843         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1844     };
1845 
1846     if (character < 256)
1847         return latin1SeparatorTable[character];
1848 
1849     return isNonLatin1Separator(character);
1850 }
1851 
SearchBuffer(const String & target,FindOptions options)1852 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1853     : m_target(target)
1854     , m_options(options)
1855     , m_prefixLength(0)
1856     , m_atBreak(true)
1857     , m_needsMoreContext(options & AtWordStarts)
1858     , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1859 {
1860     ASSERT(!m_target.isEmpty());
1861 
1862     // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1863     // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1864     // to add tailoring on top of the locale-specific tailoring as of this writing.
1865     foldQuoteMarksAndSoftHyphens(m_target);
1866 
1867     size_t targetLength = m_target.length();
1868     m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
1869     m_overlap = m_buffer.capacity() / 4;
1870 
1871     if ((m_options & AtWordStarts) && targetLength) {
1872         UChar32 targetFirstCharacter;
1873         U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
1874         // Characters in the separator category never really occur at the beginning of a word,
1875         // so if the target begins with such a character, we just ignore the AtWordStart option.
1876         if (isSeparator(targetFirstCharacter)) {
1877             m_options &= ~AtWordStarts;
1878             m_needsMoreContext = false;
1879         }
1880     }
1881 
1882     // Grab the single global searcher.
1883     // If we ever have a reason to do more than once search buffer at once, we'll have
1884     // to move to multiple searchers.
1885     lockSearcher();
1886 
1887     UStringSearch* searcher = WebCore::searcher();
1888     UCollator* collator = usearch_getCollator(searcher);
1889 
1890     UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
1891     if (ucol_getStrength(collator) != strength) {
1892         ucol_setStrength(collator, strength);
1893         usearch_reset(searcher);
1894     }
1895 
1896     UErrorCode status = U_ZERO_ERROR;
1897     usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1898     ASSERT(status == U_ZERO_ERROR);
1899 
1900     // The kana workaround requires a normalized copy of the target string.
1901     if (m_targetRequiresKanaWorkaround)
1902         normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
1903 }
1904 
~SearchBuffer()1905 inline SearchBuffer::~SearchBuffer()
1906 {
1907     // Leave the static object pointing to a valid string.
1908     UErrorCode status = U_ZERO_ERROR;
1909     usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
1910     ASSERT(status == U_ZERO_ERROR);
1911 
1912     unlockSearcher();
1913 }
1914 
append(const UChar * characters,size_t length)1915 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
1916 {
1917     ASSERT(length);
1918 
1919     if (m_atBreak) {
1920         m_buffer.shrink(0);
1921         m_prefixLength = 0;
1922         m_atBreak = false;
1923     } else if (m_buffer.size() == m_buffer.capacity()) {
1924         memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
1925         m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
1926         m_buffer.shrink(m_overlap);
1927     }
1928 
1929     size_t oldLength = m_buffer.size();
1930     size_t usableLength = min(m_buffer.capacity() - oldLength, length);
1931     ASSERT(usableLength);
1932     m_buffer.append(characters, usableLength);
1933     foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
1934     return usableLength;
1935 }
1936 
needsMoreContext() const1937 inline bool SearchBuffer::needsMoreContext() const
1938 {
1939     return m_needsMoreContext;
1940 }
1941 
prependContext(const UChar * characters,size_t length)1942 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
1943 {
1944     ASSERT(m_needsMoreContext);
1945     ASSERT(m_prefixLength == m_buffer.size());
1946 
1947     if (!length)
1948         return;
1949 
1950     m_atBreak = false;
1951 
1952     size_t wordBoundaryContextStart = length;
1953     if (wordBoundaryContextStart) {
1954         U16_BACK_1(characters, 0, wordBoundaryContextStart);
1955         wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
1956     }
1957 
1958     size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
1959     m_buffer.prepend(characters + length - usableLength, usableLength);
1960     m_prefixLength += usableLength;
1961 
1962     if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
1963         m_needsMoreContext = false;
1964 }
1965 
atBreak() const1966 inline bool SearchBuffer::atBreak() const
1967 {
1968     return m_atBreak;
1969 }
1970 
reachedBreak()1971 inline void SearchBuffer::reachedBreak()
1972 {
1973     m_atBreak = true;
1974 }
1975 
isBadMatch(const UChar * match,size_t matchLength) const1976 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
1977 {
1978     // This function implements the kana workaround. If usearch treats
1979     // it as a match, but we do not want to, then it's a "bad match".
1980     if (!m_targetRequiresKanaWorkaround)
1981         return false;
1982 
1983     // Normalize into a match buffer. We reuse a single buffer rather than
1984     // creating a new one each time.
1985     normalizeCharacters(match, matchLength, m_normalizedMatch);
1986 
1987     const UChar* a = m_normalizedTarget.begin();
1988     const UChar* aEnd = m_normalizedTarget.end();
1989 
1990     const UChar* b = m_normalizedMatch.begin();
1991     const UChar* bEnd = m_normalizedMatch.end();
1992 
1993     while (true) {
1994         // Skip runs of non-kana-letter characters. This is necessary so we can
1995         // correctly handle strings where the target and match have different-length
1996         // runs of characters that match, while still double checking the correctness
1997         // of matches of kana letters with other kana letters.
1998         while (a != aEnd && !isKanaLetter(*a))
1999             ++a;
2000         while (b != bEnd && !isKanaLetter(*b))
2001             ++b;
2002 
2003         // If we reached the end of either the target or the match, we should have
2004         // reached the end of both; both should have the same number of kana letters.
2005         if (a == aEnd || b == bEnd) {
2006             ASSERT(a == aEnd);
2007             ASSERT(b == bEnd);
2008             return false;
2009         }
2010 
2011         // Check for differences in the kana letter character itself.
2012         if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2013             return true;
2014         if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2015             return true;
2016         ++a;
2017         ++b;
2018 
2019         // Check for differences in combining voiced sound marks found after the letter.
2020         while (1) {
2021             if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2022                 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2023                     return true;
2024                 break;
2025             }
2026             if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2027                 return true;
2028             if (*a != *b)
2029                 return true;
2030             ++a;
2031             ++b;
2032         }
2033     }
2034 }
2035 
isWordStartMatch(size_t start,size_t length) const2036 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2037 {
2038     ASSERT(m_options & AtWordStarts);
2039 
2040     if (!start)
2041         return true;
2042 
2043     if (m_options & TreatMedialCapitalAsWordStart) {
2044         int size = m_buffer.size();
2045         int offset = start;
2046         UChar32 firstCharacter;
2047         U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2048         UChar32 previousCharacter;
2049         U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2050 
2051         if (isSeparator(firstCharacter)) {
2052             // The start of a separator run is a word start (".org" in "webkit.org").
2053             if (!isSeparator(previousCharacter))
2054                 return true;
2055         } else if (isASCIIUpper(firstCharacter)) {
2056             // The start of an uppercase run is a word start ("Kit" in "WebKit").
2057             if (!isASCIIUpper(previousCharacter))
2058                 return true;
2059             // The last character of an uppercase run followed by a non-separator, non-digit
2060             // is a word start ("Request" in "XMLHTTPRequest").
2061             offset = start;
2062             U16_FWD_1(m_buffer.data(), offset, size);
2063             UChar32 nextCharacter = 0;
2064             if (offset < size)
2065                 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2066             if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2067                 return true;
2068         } else if (isASCIIDigit(firstCharacter)) {
2069             // The start of a digit run is a word start ("2" in "WebKit2").
2070             if (!isASCIIDigit(previousCharacter))
2071                 return true;
2072         } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2073             // The start of a non-separator, non-uppercase, non-digit run is a word start,
2074             // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2075             return true;
2076         }
2077     }
2078 
2079     size_t wordBreakSearchStart = start + length;
2080     while (wordBreakSearchStart > start)
2081         wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
2082     return wordBreakSearchStart == start;
2083 }
2084 
search(size_t & start)2085 inline size_t SearchBuffer::search(size_t& start)
2086 {
2087     size_t size = m_buffer.size();
2088     if (m_atBreak) {
2089         if (!size)
2090             return 0;
2091     } else {
2092         if (size != m_buffer.capacity())
2093             return 0;
2094     }
2095 
2096     UStringSearch* searcher = WebCore::searcher();
2097 
2098     UErrorCode status = U_ZERO_ERROR;
2099     usearch_setText(searcher, m_buffer.data(), size, &status);
2100     ASSERT(status == U_ZERO_ERROR);
2101 
2102     usearch_setOffset(searcher, m_prefixLength, &status);
2103     ASSERT(status == U_ZERO_ERROR);
2104 
2105     int matchStart = usearch_next(searcher, &status);
2106     ASSERT(status == U_ZERO_ERROR);
2107 
2108 nextMatch:
2109     if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2110         ASSERT(matchStart == USEARCH_DONE);
2111         return 0;
2112     }
2113 
2114     // Matches that start in the overlap area are only tentative.
2115     // The same match may appear later, matching more characters,
2116     // possibly including a combining character that's not yet in the buffer.
2117     if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2118         size_t overlap = m_overlap;
2119         if (m_options & AtWordStarts) {
2120             // Ensure that there is sufficient context before matchStart the next time around for
2121             // determining if it is at a word boundary.
2122             int wordBoundaryContextStart = matchStart;
2123             U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2124             wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
2125             overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
2126         }
2127         memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2128         m_prefixLength -= min(m_prefixLength, size - overlap);
2129         m_buffer.shrink(overlap);
2130         return 0;
2131     }
2132 
2133     size_t matchedLength = usearch_getMatchedLength(searcher);
2134     ASSERT(matchStart + matchedLength <= size);
2135 
2136     // If this match is "bad", move on to the next match.
2137     if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
2138         matchStart = usearch_next(searcher, &status);
2139         ASSERT(status == U_ZERO_ERROR);
2140         goto nextMatch;
2141     }
2142 
2143     size_t newSize = size - (matchStart + 1);
2144     memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2145     m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
2146     m_buffer.shrink(newSize);
2147 
2148     start = size - matchStart;
2149     return matchedLength;
2150 }
2151 
2152 #else // !ICU_UNICODE
2153 
SearchBuffer(const String & target,FindOptions options)2154 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2155     : m_target(options & CaseInsensitive ? target.foldCase() : target)
2156     , m_options(options)
2157     , m_buffer(m_target.length())
2158     , m_isCharacterStartBuffer(m_target.length())
2159     , m_isBufferFull(false)
2160     , m_cursor(0)
2161 {
2162     ASSERT(!m_target.isEmpty());
2163     m_target.replace(noBreakSpace, ' ');
2164     foldQuoteMarksAndSoftHyphens(m_target);
2165 }
2166 
~SearchBuffer()2167 inline SearchBuffer::~SearchBuffer()
2168 {
2169 }
2170 
reachedBreak()2171 inline void SearchBuffer::reachedBreak()
2172 {
2173     m_cursor = 0;
2174     m_isBufferFull = false;
2175 }
2176 
atBreak() const2177 inline bool SearchBuffer::atBreak() const
2178 {
2179     return !m_cursor && !m_isBufferFull;
2180 }
2181 
append(UChar c,bool isStart)2182 inline void SearchBuffer::append(UChar c, bool isStart)
2183 {
2184     m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
2185     m_isCharacterStartBuffer[m_cursor] = isStart;
2186     if (++m_cursor == m_target.length()) {
2187         m_cursor = 0;
2188         m_isBufferFull = true;
2189     }
2190 }
2191 
append(const UChar * characters,size_t length)2192 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2193 {
2194     ASSERT(length);
2195     if (!(m_options & CaseInsensitive)) {
2196         append(characters[0], true);
2197         return 1;
2198     }
2199     const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2200     UChar foldedCharacters[maxFoldedCharacters];
2201     bool error;
2202     int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
2203     ASSERT(!error);
2204     ASSERT(numFoldedCharacters);
2205     ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2206     if (!error && numFoldedCharacters) {
2207         numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
2208         append(foldedCharacters[0], true);
2209         for (int i = 1; i < numFoldedCharacters; ++i)
2210             append(foldedCharacters[i], false);
2211     }
2212     return 1;
2213 }
2214 
needsMoreContext() const2215 inline bool SearchBuffer::needsMoreContext() const
2216 {
2217     return false;
2218 }
2219 
prependContext(const UChar *,size_t)2220 void SearchBuffer::prependContext(const UChar*, size_t)
2221 {
2222     ASSERT_NOT_REACHED();
2223 }
2224 
search(size_t & start)2225 inline size_t SearchBuffer::search(size_t& start)
2226 {
2227     if (!m_isBufferFull)
2228         return 0;
2229     if (!m_isCharacterStartBuffer[m_cursor])
2230         return 0;
2231 
2232     size_t tailSpace = m_target.length() - m_cursor;
2233     if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2234         return 0;
2235     if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2236         return 0;
2237 
2238     start = length();
2239 
2240     // Now that we've found a match once, we don't want to find it again, because those
2241     // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2242     // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2243     // we want to get rid of that in the future we could track this with a separate boolean
2244     // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2245     m_isCharacterStartBuffer[m_cursor] = false;
2246 
2247     return start;
2248 }
2249 
2250 // Returns the number of characters that were appended to the buffer (what we are searching in).
2251 // That's not necessarily the same length as the passed-in target string, because case folding
2252 // can make two strings match even though they're not the same length.
length() const2253 size_t SearchBuffer::length() const
2254 {
2255     size_t bufferSize = m_target.length();
2256     size_t length = 0;
2257     for (size_t i = 0; i < bufferSize; ++i)
2258         length += m_isCharacterStartBuffer[i];
2259     return length;
2260 }
2261 
2262 #endif // !ICU_UNICODE
2263 
2264 // --------
2265 
rangeLength(const Range * r,bool forSelectionPreservation)2266 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
2267 {
2268     int length = 0;
2269     for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2270         length += it.length();
2271 
2272     return length;
2273 }
2274 
subrange(Range * entireRange,int characterOffset,int characterCount)2275 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2276 {
2277     CharacterIterator entireRangeIterator(entireRange);
2278     return characterSubrange(entireRangeIterator, characterOffset, characterCount);
2279 }
2280 
rangeFromLocationAndLength(Element * scope,int rangeLocation,int rangeLength,bool forSelectionPreservation)2281 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2282 {
2283     RefPtr<Range> resultRange = scope->document()->createRange();
2284 
2285     int docTextPosition = 0;
2286     int rangeEnd = rangeLocation + rangeLength;
2287     bool startRangeFound = false;
2288 
2289     RefPtr<Range> textRunRange;
2290 
2291     TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2292 
2293     // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2294     if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
2295         textRunRange = it.range();
2296 
2297         ExceptionCode ec = 0;
2298         resultRange->setStart(textRunRange->startContainer(), 0, ec);
2299         ASSERT(!ec);
2300         resultRange->setEnd(textRunRange->startContainer(), 0, ec);
2301         ASSERT(!ec);
2302 
2303         return resultRange.release();
2304     }
2305 
2306     for (; !it.atEnd(); it.advance()) {
2307         int len = it.length();
2308         textRunRange = it.range();
2309 
2310         bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
2311         bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
2312 
2313         // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
2314         // in those cases that textRunRange is used.
2315         if (foundEnd) {
2316             // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2317             // position for emitted '\n's.
2318             if (len == 1 && it.characters()[0] == '\n') {
2319                 scope->document()->updateLayoutIgnorePendingStylesheets();
2320                 it.advance();
2321                 if (!it.atEnd()) {
2322                     RefPtr<Range> range = it.range();
2323                     ExceptionCode ec = 0;
2324                     textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
2325                     ASSERT(!ec);
2326                 } else {
2327                     Position runStart = textRunRange->startPosition();
2328                     Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2329                     if (runEnd.isNotNull()) {
2330                         ExceptionCode ec = 0;
2331                         textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
2332                         ASSERT(!ec);
2333                     }
2334                 }
2335             }
2336         }
2337 
2338         if (foundStart) {
2339             startRangeFound = true;
2340             int exception = 0;
2341             if (textRunRange->startContainer()->isTextNode()) {
2342                 int offset = rangeLocation - docTextPosition;
2343                 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2344             } else {
2345                 if (rangeLocation == docTextPosition)
2346                     resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2347                 else
2348                     resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2349             }
2350         }
2351 
2352         if (foundEnd) {
2353             int exception = 0;
2354             if (textRunRange->startContainer()->isTextNode()) {
2355                 int offset = rangeEnd - docTextPosition;
2356                 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2357             } else {
2358                 if (rangeEnd == docTextPosition)
2359                     resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2360                 else
2361                     resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2362             }
2363             docTextPosition += len;
2364             break;
2365         }
2366         docTextPosition += len;
2367     }
2368 
2369     if (!startRangeFound)
2370         return 0;
2371 
2372     if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
2373         int exception = 0;
2374         resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2375     }
2376 
2377     return resultRange.release();
2378 }
2379 
locationAndLengthFromRange(const Range * range,size_t & location,size_t & length)2380 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
2381 {
2382     location = notFound;
2383     length = 0;
2384 
2385     if (!range->startContainer())
2386         return false;
2387 
2388     Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
2389     Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
2390 
2391     // The critical assumption is that this only gets called with ranges that
2392     // concentrate on a given area containing the selection root. This is done
2393     // because of text fields and textareas. The DOM for those is not
2394     // directly in the document DOM, so ensure that the range does not cross a
2395     // boundary of one of those.
2396     if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
2397         return false;
2398     if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
2399         return false;
2400 
2401     RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
2402     ASSERT(testRange->startContainer() == scope);
2403     location = TextIterator::rangeLength(testRange.get());
2404 
2405     ExceptionCode ec;
2406     testRange->setEnd(range->endContainer(), range->endOffset(), ec);
2407     ASSERT(testRange->startContainer() == scope);
2408     length = TextIterator::rangeLength(testRange.get()) - location;
2409     return true;
2410 }
2411 
2412 // --------
2413 
plainTextToMallocAllocatedBuffer(const Range * r,unsigned & bufferLength,bool isDisplayString,TextIteratorBehavior defaultBehavior)2414 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
2415 {
2416     UChar* result = 0;
2417 
2418     // Do this in pieces to avoid massive reallocations if there is a large amount of text.
2419     // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
2420     static const unsigned cMaxSegmentSize = 1 << 16;
2421     bufferLength = 0;
2422     typedef pair<UChar*, unsigned> TextSegment;
2423     OwnPtr<Vector<TextSegment> > textSegments;
2424     Vector<UChar> textBuffer;
2425     textBuffer.reserveInitialCapacity(cMaxSegmentSize);
2426     TextIteratorBehavior behavior = defaultBehavior;
2427     if (!isDisplayString)
2428         behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2429 
2430     for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2431         if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
2432             UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
2433             if (!newSegmentBuffer)
2434                 goto exit;
2435             memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2436             if (!textSegments)
2437                 textSegments = adoptPtr(new Vector<TextSegment>);
2438             textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
2439             textBuffer.clear();
2440         }
2441         textBuffer.append(it.characters(), it.length());
2442         bufferLength += it.length();
2443     }
2444 
2445     if (!bufferLength)
2446         return 0;
2447 
2448     // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
2449     result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
2450     if (!result)
2451         goto exit;
2452 
2453     {
2454         UChar* resultPos = result;
2455         if (textSegments) {
2456             unsigned size = textSegments->size();
2457             for (unsigned i = 0; i < size; ++i) {
2458                 const TextSegment& segment = textSegments->at(i);
2459                 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
2460                 resultPos += segment.second;
2461             }
2462         }
2463         memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2464     }
2465 
2466 exit:
2467     if (textSegments) {
2468         unsigned size = textSegments->size();
2469         for (unsigned i = 0; i < size; ++i)
2470             free(textSegments->at(i).first);
2471     }
2472 
2473     if (isDisplayString && r->ownerDocument())
2474         r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
2475 
2476     return result;
2477 }
2478 
plainText(const Range * r,TextIteratorBehavior defaultBehavior)2479 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
2480 {
2481     unsigned length;
2482     UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
2483     if (!buf)
2484         return "";
2485     String result(buf, length);
2486     free(buf);
2487     return result;
2488 }
2489 
isAllCollapsibleWhitespace(const String & string)2490 static inline bool isAllCollapsibleWhitespace(const String& string)
2491 {
2492     const UChar* characters = string.characters();
2493     unsigned length = string.length();
2494     for (unsigned i = 0; i < length; ++i) {
2495         if (!isCollapsibleWhitespace(characters[i]))
2496             return false;
2497     }
2498     return true;
2499 }
2500 
collapsedToBoundary(const Range * range,bool forward)2501 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
2502 {
2503     ExceptionCode ec = 0;
2504     RefPtr<Range> result = range->cloneRange(ec);
2505     ASSERT(!ec);
2506     result->collapse(!forward, ec);
2507     ASSERT(!ec);
2508     return result.release();
2509 }
2510 
findPlainText(CharacterIterator & it,const String & target,FindOptions options,size_t & matchStart)2511 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
2512 {
2513     matchStart = 0;
2514     size_t matchLength = 0;
2515 
2516     SearchBuffer buffer(target, options);
2517 
2518     if (buffer.needsMoreContext()) {
2519         RefPtr<Range> startRange = it.range();
2520         RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
2521         ExceptionCode ec = 0;
2522         beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
2523         for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2524             buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
2525             if (!buffer.needsMoreContext())
2526                 break;
2527         }
2528     }
2529 
2530     while (!it.atEnd()) {
2531         it.advance(buffer.append(it.characters(), it.length()));
2532 tryAgain:
2533         size_t matchStartOffset;
2534         if (size_t newMatchLength = buffer.search(matchStartOffset)) {
2535             // Note that we found a match, and where we found it.
2536             size_t lastCharacterInBufferOffset = it.characterOffset();
2537             ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2538             matchStart = lastCharacterInBufferOffset - matchStartOffset;
2539             matchLength = newMatchLength;
2540             // If searching forward, stop on the first match.
2541             // If searching backward, don't stop, so we end up with the last match.
2542             if (!(options & Backwards))
2543                 break;
2544             goto tryAgain;
2545         }
2546         if (it.atBreak() && !buffer.atBreak()) {
2547             buffer.reachedBreak();
2548             goto tryAgain;
2549         }
2550     }
2551 
2552     return matchLength;
2553 }
2554 
findPlainText(const Range * range,const String & target,FindOptions options)2555 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
2556 {
2557     // First, find the text.
2558     size_t matchStart;
2559     size_t matchLength;
2560     {
2561         CharacterIterator findIterator(range, TextIteratorEntersTextControls);
2562         matchLength = findPlainText(findIterator, target, options, matchStart);
2563         if (!matchLength)
2564             return collapsedToBoundary(range, !(options & Backwards));
2565     }
2566 
2567     // Then, find the document position of the start and the end of the text.
2568     CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
2569     return characterSubrange(computeRangeIterator, matchStart, matchLength);
2570 }
2571 
2572 }
2573