• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
3  * Copyright (C) 2005 Alexey Proskuryakov.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "config.h"
28 #include "TextIterator.h"
29 
30 #include "Document.h"
31 #include "Frame.h"
32 #include "HTMLElement.h"
33 #include "HTMLNames.h"
34 #include "htmlediting.h"
35 #include "InlineTextBox.h"
36 #include "Range.h"
37 #include "RenderTableCell.h"
38 #include "RenderTableRow.h"
39 #include "RenderTextControl.h"
40 #include "RenderTextFragment.h"
41 #include "TextBoundaries.h"
42 #include "TextBreakIterator.h"
43 #include "VisiblePosition.h"
44 #include "visible_units.h"
45 #include <wtf/unicode/CharacterNames.h>
46 
47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
48 #include "TextBreakIteratorInternalICU.h"
49 #include <unicode/usearch.h>
50 #endif
51 
52 using namespace WTF::Unicode;
53 using namespace std;
54 
55 namespace WebCore {
56 
57 using namespace HTMLNames;
58 
59 // Buffer that knows how to compare with a search target.
60 // Keeps enough of the previous text to be able to search in the future, but no more.
61 // Non-breaking spaces are always equal to normal spaces.
62 // Case folding is also done if the CaseInsensitive option is specified.
63 // Matches are further filtered if the AtWordStarts option is specified, although some
64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
65 class SearchBuffer {
66     WTF_MAKE_NONCOPYABLE(SearchBuffer);
67 public:
68     SearchBuffer(const String& target, FindOptions);
69     ~SearchBuffer();
70 
71     // Returns number of characters appended; guaranteed to be in the range [1, length].
72     size_t append(const UChar*, size_t length);
73     bool needsMoreContext() const;
74     void prependContext(const UChar*, size_t length);
75     void reachedBreak();
76 
77     // Result is the size in characters of what was found.
78     // And <startOffset> is the number of characters back to the start of what was found.
79     size_t search(size_t& startOffset);
80     bool atBreak() const;
81 
82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
83 
84 private:
85     bool isBadMatch(const UChar*, size_t length) const;
86     bool isWordStartMatch(size_t start, size_t length) const;
87 
88     String m_target;
89     FindOptions m_options;
90 
91     Vector<UChar> m_buffer;
92     size_t m_overlap;
93     size_t m_prefixLength;
94     bool m_atBreak;
95     bool m_needsMoreContext;
96 
97     bool m_targetRequiresKanaWorkaround;
98     Vector<UChar> m_normalizedTarget;
99     mutable Vector<UChar> m_normalizedMatch;
100 
101 #else
102 
103 private:
104     void append(UChar, bool isCharacterStart);
105     size_t length() const;
106 
107     String m_target;
108     FindOptions m_options;
109 
110     Vector<UChar> m_buffer;
111     Vector<bool> m_isCharacterStartBuffer;
112     bool m_isBufferFull;
113     size_t m_cursor;
114 
115 #endif
116 };
117 
118 // --------
119 
120 static const unsigned bitsInWord = sizeof(unsigned) * 8;
121 static const unsigned bitInWordMask = bitsInWord - 1;
122 
BitStack()123 BitStack::BitStack()
124     : m_size(0)
125 {
126 }
127 
~BitStack()128 BitStack::~BitStack()
129 {
130 }
131 
push(bool bit)132 void BitStack::push(bool bit)
133 {
134     unsigned index = m_size / bitsInWord;
135     unsigned shift = m_size & bitInWordMask;
136     if (!shift && index == m_words.size()) {
137         m_words.grow(index + 1);
138         m_words[index] = 0;
139     }
140     unsigned& word = m_words[index];
141     unsigned mask = 1U << shift;
142     if (bit)
143         word |= mask;
144     else
145         word &= ~mask;
146     ++m_size;
147 }
148 
pop()149 void BitStack::pop()
150 {
151     if (m_size)
152         --m_size;
153 }
154 
top() const155 bool BitStack::top() const
156 {
157     if (!m_size)
158         return false;
159     unsigned shift = (m_size - 1) & bitInWordMask;
160     return m_words.last() & (1U << shift);
161 }
162 
size() const163 unsigned BitStack::size() const
164 {
165     return m_size;
166 }
167 
168 // --------
169 
170 #if !ASSERT_DISABLED
171 
depthCrossingShadowBoundaries(Node * node)172 static unsigned depthCrossingShadowBoundaries(Node* node)
173 {
174     unsigned depth = 0;
175     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
176         ++depth;
177     return depth;
178 }
179 
180 #endif
181 
182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
nextInPreOrderCrossingShadowBoundaries(Node * rangeEndContainer,int rangeEndOffset)183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
184 {
185     if (!rangeEndContainer)
186         return 0;
187     if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
188         if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
189             return next;
190     }
191     for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
192         if (Node* next = node->nextSibling())
193             return next;
194     }
195     return 0;
196 }
197 
198 // --------
199 
fullyClipsContents(Node * node)200 static inline bool fullyClipsContents(Node* node)
201 {
202     RenderObject* renderer = node->renderer();
203     if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
204         return false;
205     return toRenderBox(renderer)->size().isEmpty();
206 }
207 
ignoresContainerClip(Node * node)208 static inline bool ignoresContainerClip(Node* node)
209 {
210     RenderObject* renderer = node->renderer();
211     if (!renderer || renderer->isText())
212         return false;
213     EPosition position = renderer->style()->position();
214     return position == AbsolutePosition || position == FixedPosition;
215 }
216 
pushFullyClippedState(BitStack & stack,Node * node)217 static void pushFullyClippedState(BitStack& stack, Node* node)
218 {
219     ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
220 
221     // Push true if this node full clips its contents, or if a parent already has fully
222     // clipped and this is not a node that ignores its container's clip.
223     stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
224 }
225 
setUpFullyClippedStack(BitStack & stack,Node * node)226 static void setUpFullyClippedStack(BitStack& stack, Node* node)
227 {
228     // Put the nodes in a vector so we can iterate in reverse order.
229     Vector<Node*, 100> ancestry;
230     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
231         ancestry.append(parent);
232 
233     // Call pushFullyClippedState on each node starting with the earliest ancestor.
234     size_t size = ancestry.size();
235     for (size_t i = 0; i < size; ++i)
236         pushFullyClippedState(stack, ancestry[size - i - 1]);
237     pushFullyClippedState(stack, node);
238 
239     ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
240 }
241 
242 #if OS(ANDROID)
checkFormControlElement(Node * startNode)243 static bool checkFormControlElement(Node* startNode)
244 {
245     Node* node = startNode;
246     while (node) {
247         if (node->isElementNode() && static_cast<Element*>(node)->isFormControlElement())
248             return true;
249         node = node->parentNode();
250     }
251     return false;
252 }
253 #endif
254 
255 
256 // --------
257 
TextIterator()258 TextIterator::TextIterator()
259     : m_startContainer(0)
260     , m_startOffset(0)
261     , m_endContainer(0)
262     , m_endOffset(0)
263     , m_positionNode(0)
264     , m_textCharacters(0)
265     , m_textLength(0)
266     , m_remainingTextBox(0)
267     , m_firstLetterText(0)
268     , m_lastCharacter(0)
269     , m_emitsCharactersBetweenAllVisiblePositions(false)
270     , m_entersTextControls(false)
271     , m_emitsTextWithoutTranscoding(false)
272     , m_handledFirstLetter(false)
273     , m_ignoresStyleVisibility(false)
274     , m_emitsObjectReplacementCharacters(false)
275 #if OS(ANDROID)
276     , m_stopsOnFormControls(false)
277     , m_shouldStop(false)
278 #endif
279 {
280 }
281 
TextIterator(const Range * r,TextIteratorBehavior behavior)282 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
283     : m_startContainer(0)
284     , m_startOffset(0)
285     , m_endContainer(0)
286     , m_endOffset(0)
287     , m_positionNode(0)
288     , m_textCharacters(0)
289     , m_textLength(0)
290     , m_remainingTextBox(0)
291     , m_firstLetterText(0)
292     , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
293     , m_entersTextControls(behavior & TextIteratorEntersTextControls)
294     , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
295     , m_handledFirstLetter(false)
296     , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
297     , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
298 #if OS(ANDROID)
299     , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
300     , m_shouldStop(false)
301 #endif
302 {
303     if (!r)
304         return;
305 
306     // get and validate the range endpoints
307     Node* startContainer = r->startContainer();
308     if (!startContainer)
309         return;
310     int startOffset = r->startOffset();
311     Node* endContainer = r->endContainer();
312     int endOffset = r->endOffset();
313 
314     // Callers should be handing us well-formed ranges. If we discover that this isn't
315     // the case, we could consider changing this assertion to an early return.
316     ASSERT(r->boundaryPointsValid());
317 
318     // remember range - this does not change
319     m_startContainer = startContainer;
320     m_startOffset = startOffset;
321     m_endContainer = endContainer;
322     m_endOffset = endOffset;
323 
324     // set up the current node for processing
325     m_node = r->firstNode();
326     if (!m_node)
327         return;
328     setUpFullyClippedStack(m_fullyClippedStack, m_node);
329     m_offset = m_node == m_startContainer ? m_startOffset : 0;
330     m_handledNode = false;
331     m_handledChildren = false;
332 
333     // calculate first out of bounds node
334     m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
335 
336     // initialize node processing state
337     m_needsAnotherNewline = false;
338     m_textBox = 0;
339 
340     // initialize record of previous node processing
341     m_hasEmitted = false;
342     m_lastTextNode = 0;
343     m_lastTextNodeEndedWithCollapsedSpace = false;
344     m_lastCharacter = 0;
345 
346 #ifndef NDEBUG
347     // need this just because of the assert in advance()
348     m_positionNode = m_node;
349 #endif
350 
351     // identify the first run
352     advance();
353 }
354 
~TextIterator()355 TextIterator::~TextIterator()
356 {
357 }
358 
atEnd() const359 bool TextIterator::atEnd() const
360 {
361 #if OS(ANDROID)
362     return !m_positionNode || m_shouldStop;
363 #else
364     return !m_positionNode;
365 #endif
366 }
367 
advance()368 void TextIterator::advance()
369 {
370 #if OS(ANDROID)
371     if (m_shouldStop)
372         return;
373 #endif
374     // reset the run information
375     m_positionNode = 0;
376     m_textLength = 0;
377 
378     // handle remembered node that needed a newline after the text node's newline
379     if (m_needsAnotherNewline) {
380         // Emit the extra newline, and position it *inside* m_node, after m_node's
381         // contents, in case it's a block, in the same way that we position the first
382         // newline.  The range for the emitted newline should start where the line
383         // break begins.
384         // FIXME: It would be cleaner if we emitted two newlines during the last
385         // iteration, instead of using m_needsAnotherNewline.
386         Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
387         emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
388         m_needsAnotherNewline = false;
389         return;
390     }
391 
392     if (!m_textBox && m_remainingTextBox) {
393         m_textBox = m_remainingTextBox;
394         m_remainingTextBox = 0;
395         m_firstLetterText = 0;
396         m_offset = 0;
397     }
398     // handle remembered text box
399     if (m_textBox) {
400         handleTextBox();
401         if (m_positionNode)
402             return;
403     }
404 
405     while (m_node && m_node != m_pastEndNode) {
406 #if OS(ANDROID)
407         if (!m_shouldStop && m_stopsOnFormControls && checkFormControlElement(m_node))
408             m_shouldStop = true;
409 #endif
410         // if the range ends at offset 0 of an element, represent the
411         // position, but not the content, of that element e.g. if the
412         // node is a blockflow element, emit a newline that
413         // precedes the element
414         if (m_node == m_endContainer && m_endOffset == 0) {
415             representNodeOffsetZero();
416             m_node = 0;
417             return;
418         }
419 
420         RenderObject* renderer = m_node->renderer();
421         if (!renderer) {
422             m_handledNode = true;
423             m_handledChildren = true;
424         } else {
425             // handle current node according to its type
426             if (!m_handledNode) {
427                 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
428                     m_handledNode = handleTextNode();
429                 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
430                          (renderer->node() && renderer->node()->isElementNode() &&
431                           static_cast<Element*>(renderer->node())->isFormControlElement())))
432                     m_handledNode = handleReplacedElement();
433                 else
434                     m_handledNode = handleNonTextNode();
435                 if (m_positionNode)
436                     return;
437             }
438         }
439 
440         // find a new current node to handle in depth-first manner,
441         // calling exitNode() as we come back thru a parent node
442         Node* next = m_handledChildren ? 0 : m_node->firstChild();
443         m_offset = 0;
444         if (!next) {
445             next = m_node->nextSibling();
446             if (!next) {
447                 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
448                 Node* parentNode = m_node->parentOrHostNode();
449                 while (!next && parentNode) {
450                     if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
451                         return;
452                     bool haveRenderer = m_node->renderer();
453                     m_node = parentNode;
454                     m_fullyClippedStack.pop();
455                     parentNode = m_node->parentOrHostNode();
456                     if (haveRenderer)
457                         exitNode();
458                     if (m_positionNode) {
459                         m_handledNode = true;
460                         m_handledChildren = true;
461                         return;
462                     }
463                     next = m_node->nextSibling();
464                 }
465             }
466             m_fullyClippedStack.pop();
467         }
468 
469         // set the new current node
470         m_node = next;
471         if (m_node)
472             pushFullyClippedState(m_fullyClippedStack, m_node);
473         m_handledNode = false;
474         m_handledChildren = false;
475         m_handledFirstLetter = false;
476         m_firstLetterText = 0;
477 
478         // how would this ever be?
479         if (m_positionNode)
480             return;
481     }
482 }
483 
handleTextNode()484 bool TextIterator::handleTextNode()
485 {
486     if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
487         return false;
488 
489     RenderText* renderer = toRenderText(m_node->renderer());
490 
491     m_lastTextNode = m_node;
492     String str = renderer->text();
493 
494     // handle pre-formatted text
495     if (!renderer->style()->collapseWhiteSpace()) {
496         int runStart = m_offset;
497         if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
498             emitCharacter(' ', m_node, 0, runStart, runStart);
499             return false;
500         }
501         if (!m_handledFirstLetter && renderer->isTextFragment()) {
502             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
503             if (m_firstLetterText) {
504                 String firstLetter = m_firstLetterText->text();
505                 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
506                 m_firstLetterText = 0;
507                 m_textBox = 0;
508                 return false;
509             }
510         }
511         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
512             return false;
513         int strLength = str.length();
514         int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
515         int runEnd = min(strLength, end);
516 
517         if (runStart >= runEnd)
518             return true;
519 
520         emitText(m_node, runStart, runEnd);
521         return true;
522     }
523 
524     if (!renderer->firstTextBox() && str.length() > 0) {
525         if (!m_handledFirstLetter && renderer->isTextFragment()) {
526             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
527             if (m_firstLetterText) {
528                 handleTextBox();
529                 return false;
530             }
531         }
532         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
533             return false;
534         m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
535         return true;
536     }
537 
538     // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
539     if (renderer->containsReversedText()) {
540         m_sortedTextBoxes.clear();
541         for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
542             m_sortedTextBoxes.append(textBox);
543         }
544         std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
545         m_sortedTextBoxesPosition = 0;
546     }
547 
548     m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
549     if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
550         handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
551     handleTextBox();
552     return true;
553 }
554 
handleTextBox()555 void TextIterator::handleTextBox()
556 {
557     RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
558     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
559         m_textBox = 0;
560         return;
561     }
562     String str = renderer->text();
563     unsigned start = m_offset;
564     unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
565     while (m_textBox) {
566         unsigned textBoxStart = m_textBox->start();
567         unsigned runStart = max(textBoxStart, start);
568 
569         // Check for collapsed space at the start of this run.
570         InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
571         bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
572             || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
573         if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
574             if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
575                 unsigned spaceRunStart = runStart - 1;
576                 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
577                     --spaceRunStart;
578                 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
579             } else
580                 emitCharacter(' ', m_node, 0, runStart, runStart);
581             return;
582         }
583         unsigned textBoxEnd = textBoxStart + m_textBox->len();
584         unsigned runEnd = min(textBoxEnd, end);
585 
586         // Determine what the next text box will be, but don't advance yet
587         InlineTextBox* nextTextBox = 0;
588         if (renderer->containsReversedText()) {
589             if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
590                 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
591         } else
592             nextTextBox = m_textBox->nextTextBox();
593 
594         if (runStart < runEnd) {
595             // Handle either a single newline character (which becomes a space),
596             // or a run of characters that does not include a newline.
597             // This effectively translates newlines to spaces without copying the text.
598             if (str[runStart] == '\n') {
599                 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
600                 m_offset = runStart + 1;
601             } else {
602                 size_t subrunEnd = str.find('\n', runStart);
603                 if (subrunEnd == notFound || subrunEnd > runEnd)
604                     subrunEnd = runEnd;
605 
606                 m_offset = subrunEnd;
607                 emitText(m_node, renderer, runStart, subrunEnd);
608             }
609 
610             // If we are doing a subrun that doesn't go to the end of the text box,
611             // come back again to finish handling this text box; don't advance to the next one.
612             if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
613                 return;
614 
615             // Advance and return
616             unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
617             if (nextRunStart > runEnd)
618                 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
619             m_textBox = nextTextBox;
620             if (renderer->containsReversedText())
621                 ++m_sortedTextBoxesPosition;
622             return;
623         }
624         // Advance and continue
625         m_textBox = nextTextBox;
626         if (renderer->containsReversedText())
627             ++m_sortedTextBoxesPosition;
628     }
629     if (!m_textBox && m_remainingTextBox) {
630         m_textBox = m_remainingTextBox;
631         m_remainingTextBox = 0;
632         m_firstLetterText = 0;
633         m_offset = 0;
634         handleTextBox();
635     }
636 }
637 
handleTextNodeFirstLetter(RenderTextFragment * renderer)638 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
639 {
640     if (renderer->firstLetter()) {
641         RenderObject* r = renderer->firstLetter();
642         if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
643             return;
644         for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) {
645             if (currChild->isText()) {
646                 RenderText* firstLetter = toRenderText(currChild);
647                 m_handledFirstLetter = true;
648                 m_remainingTextBox = m_textBox;
649                 m_textBox = firstLetter->firstTextBox();
650                 m_firstLetterText = firstLetter;
651                 return;
652             }
653         }
654     }
655     m_handledFirstLetter = true;
656 }
657 
handleReplacedElement()658 bool TextIterator::handleReplacedElement()
659 {
660     if (m_fullyClippedStack.top())
661         return false;
662 
663     RenderObject* renderer = m_node->renderer();
664     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
665         return false;
666 
667     if (m_lastTextNodeEndedWithCollapsedSpace) {
668         emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
669         return false;
670     }
671 
672     if (m_entersTextControls && renderer->isTextControl()) {
673         if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
674             m_node = innerTextElement->shadowTreeRootNode();
675             pushFullyClippedState(m_fullyClippedStack, m_node);
676             m_offset = 0;
677             return false;
678         }
679     }
680 
681     m_hasEmitted = true;
682 
683     if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
684         emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
685         return true;
686     }
687 
688     if (m_emitsCharactersBetweenAllVisiblePositions) {
689         // We want replaced elements to behave like punctuation for boundary
690         // finding, and to simply take up space for the selection preservation
691         // code in moveParagraphs, so we use a comma.
692         emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
693         return true;
694     }
695 
696     m_positionNode = m_node->parentNode();
697     m_positionOffsetBaseNode = m_node;
698     m_positionStartOffset = 0;
699     m_positionEndOffset = 1;
700 
701     m_textCharacters = 0;
702     m_textLength = 0;
703 
704     m_lastCharacter = 0;
705 
706     return true;
707 }
708 
hasVisibleTextNode(RenderText * renderer)709 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
710 {
711     if (renderer->style()->visibility() == VISIBLE)
712         return true;
713     if (renderer->isTextFragment()) {
714         RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
715         if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
716             return true;
717     }
718     return false;
719 }
720 
shouldEmitTabBeforeNode(Node * node)721 static bool shouldEmitTabBeforeNode(Node* node)
722 {
723     RenderObject* r = node->renderer();
724 
725     // Table cells are delimited by tabs.
726     if (!r || !isTableCell(node))
727         return false;
728 
729     // Want a tab before every cell other than the first one
730     RenderTableCell* rc = toRenderTableCell(r);
731     RenderTable* t = rc->table();
732     return t && (t->cellBefore(rc) || t->cellAbove(rc));
733 }
734 
shouldEmitNewlineForNode(Node * node)735 static bool shouldEmitNewlineForNode(Node* node)
736 {
737     // br elements are represented by a single newline.
738     RenderObject* r = node->renderer();
739     if (!r)
740         return node->hasTagName(brTag);
741 
742     return r->isBR();
743 }
744 
shouldEmitNewlinesBeforeAndAfterNode(Node * node)745 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
746 {
747     // Block flow (versus inline flow) is represented by having
748     // a newline both before and after the element.
749     RenderObject* r = node->renderer();
750     if (!r) {
751         return (node->hasTagName(blockquoteTag)
752                 || node->hasTagName(ddTag)
753                 || node->hasTagName(divTag)
754                 || node->hasTagName(dlTag)
755                 || node->hasTagName(dtTag)
756                 || node->hasTagName(h1Tag)
757                 || node->hasTagName(h2Tag)
758                 || node->hasTagName(h3Tag)
759                 || node->hasTagName(h4Tag)
760                 || node->hasTagName(h5Tag)
761                 || node->hasTagName(h6Tag)
762                 || node->hasTagName(hrTag)
763                 || node->hasTagName(liTag)
764                 || node->hasTagName(listingTag)
765                 || node->hasTagName(olTag)
766                 || node->hasTagName(pTag)
767                 || node->hasTagName(preTag)
768                 || node->hasTagName(trTag)
769                 || node->hasTagName(ulTag));
770     }
771 
772     // Need to make an exception for table cells, because they are blocks, but we
773     // want them tab-delimited rather than having newlines before and after.
774     if (isTableCell(node))
775         return false;
776 
777     // Need to make an exception for table row elements, because they are neither
778     // "inline" or "RenderBlock", but we want newlines for them.
779     if (r->isTableRow()) {
780         RenderTable* t = toRenderTableRow(r)->table();
781         if (t && !t->isInline())
782             return true;
783     }
784 
785     return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
786 }
787 
shouldEmitNewlineAfterNode(Node * node)788 static bool shouldEmitNewlineAfterNode(Node* node)
789 {
790     // FIXME: It should be better but slower to create a VisiblePosition here.
791     if (!shouldEmitNewlinesBeforeAndAfterNode(node))
792         return false;
793     // Check if this is the very last renderer in the document.
794     // If so, then we should not emit a newline.
795     while ((node = node->traverseNextSibling()))
796         if (node->renderer())
797             return true;
798     return false;
799 }
800 
shouldEmitNewlineBeforeNode(Node * node)801 static bool shouldEmitNewlineBeforeNode(Node* node)
802 {
803     return shouldEmitNewlinesBeforeAndAfterNode(node);
804 }
805 
shouldEmitExtraNewlineForNode(Node * node)806 static bool shouldEmitExtraNewlineForNode(Node* node)
807 {
808     // When there is a significant collapsed bottom margin, emit an extra
809     // newline for a more realistic result.  We end up getting the right
810     // result even without margin collapsing. For example: <div><p>text</p></div>
811     // will work right even if both the <div> and the <p> have bottom margins.
812     RenderObject* r = node->renderer();
813     if (!r || !r->isBox())
814         return false;
815 
816     // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
817     // not to do this at all
818     if (node->hasTagName(h1Tag)
819         || node->hasTagName(h2Tag)
820         || node->hasTagName(h3Tag)
821         || node->hasTagName(h4Tag)
822         || node->hasTagName(h5Tag)
823         || node->hasTagName(h6Tag)
824         || node->hasTagName(pTag)) {
825         RenderStyle* style = r->style();
826         if (style) {
827             int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
828             int fontSize = style->fontDescription().computedPixelSize();
829             if (bottomMargin * 2 >= fontSize)
830                 return true;
831         }
832     }
833 
834     return false;
835 }
836 
collapsedSpaceLength(RenderText * renderer,int textEnd)837 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
838 {
839     const UChar* characters = renderer->text()->characters();
840     int length = renderer->text()->length();
841     for (int i = textEnd; i < length; ++i) {
842         if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
843             return i - textEnd;
844     }
845 
846     return length - textEnd;
847 }
848 
maxOffsetIncludingCollapsedSpaces(Node * node)849 static int maxOffsetIncludingCollapsedSpaces(Node* node)
850 {
851     int offset = caretMaxOffset(node);
852 
853     if (node->renderer() && node->renderer()->isText())
854         offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
855 
856     return offset;
857 }
858 
859 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
shouldRepresentNodeOffsetZero()860 bool TextIterator::shouldRepresentNodeOffsetZero()
861 {
862     if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
863         return true;
864 
865     // Leave element positioned flush with start of a paragraph
866     // (e.g. do not insert tab before a table cell at the start of a paragraph)
867     if (m_lastCharacter == '\n')
868         return false;
869 
870     // Otherwise, show the position if we have emitted any characters
871     if (m_hasEmitted)
872         return true;
873 
874     // We've not emitted anything yet. Generally, there is no need for any positioning then.
875     // The only exception is when the element is visually not in the same line as
876     // the start of the range (e.g. the range starts at the end of the previous paragraph).
877     // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
878     // make quicker checks to possibly avoid that. Another check that we could make is
879     // is whether the inline vs block flow changed since the previous visible element.
880     // I think we're already in a special enough case that that won't be needed, tho.
881 
882     // No character needed if this is the first node in the range.
883     if (m_node == m_startContainer)
884         return false;
885 
886     // If we are outside the start container's subtree, assume we need to emit.
887     // FIXME: m_startContainer could be an inline block
888     if (!m_node->isDescendantOf(m_startContainer))
889         return true;
890 
891     // If we started as m_startContainer offset 0 and the current node is a descendant of
892     // the start container, we already had enough context to correctly decide whether to
893     // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
894     // so don't second guess that now.
895     // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
896     // immaterial since we likely would have already emitted something by now.
897     if (m_startOffset == 0)
898         return false;
899 
900     // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
901     // Additionally, if the range we are iterating over contains huge sections of unrendered content,
902     // we would create VisiblePositions on every call to this function without this check.
903     if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
904         return false;
905 
906     // The startPos.isNotNull() check is needed because the start could be before the body,
907     // and in that case we'll get null. We don't want to put in newlines at the start in that case.
908     // The currPos.isNotNull() check is needed because positions in non-HTML content
909     // (like SVG) do not have visible positions, and we don't want to emit for them either.
910     VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
911     VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
912     return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
913 }
914 
shouldEmitSpaceBeforeAndAfterNode(Node * node)915 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
916 {
917     return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
918 }
919 
representNodeOffsetZero()920 void TextIterator::representNodeOffsetZero()
921 {
922     // Emit a character to show the positioning of m_node.
923 
924     // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
925     // create VisiblePositions, which is expensive.  So, we perform the inexpensive checks
926     // on m_node to see if it necessitates emitting a character first and will early return
927     // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
928     if (shouldEmitTabBeforeNode(m_node)) {
929         if (shouldRepresentNodeOffsetZero())
930             emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
931     } else if (shouldEmitNewlineBeforeNode(m_node)) {
932         if (shouldRepresentNodeOffsetZero())
933             emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
934     } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
935         if (shouldRepresentNodeOffsetZero())
936             emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
937     }
938 }
939 
handleNonTextNode()940 bool TextIterator::handleNonTextNode()
941 {
942     if (shouldEmitNewlineForNode(m_node))
943         emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
944     else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
945         emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
946     else
947         representNodeOffsetZero();
948 
949     return true;
950 }
951 
exitNode()952 void TextIterator::exitNode()
953 {
954     // prevent emitting a newline when exiting a collapsed block at beginning of the range
955     // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
956     // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
957     // therefore look like a blank line.
958     if (!m_hasEmitted)
959         return;
960 
961     // Emit with a position *inside* m_node, after m_node's contents, in
962     // case it is a block, because the run should start where the
963     // emitted character is positioned visually.
964     Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
965     // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
966     // the logic in _web_attributedStringFromRange match.  We'll get that for free when we switch to use
967     // TextIterator in _web_attributedStringFromRange.
968     // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
969     if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
970         // use extra newline to represent margin bottom, as needed
971         bool addNewline = shouldEmitExtraNewlineForNode(m_node);
972 
973         // FIXME: We need to emit a '\n' as we leave an empty block(s) that
974         // contain a VisiblePosition when doing selection preservation.
975         if (m_lastCharacter != '\n') {
976             // insert a newline with a position following this block's contents.
977             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
978             // remember whether to later add a newline for the current node
979             ASSERT(!m_needsAnotherNewline);
980             m_needsAnotherNewline = addNewline;
981         } else if (addNewline)
982             // insert a newline with a position following this block's contents.
983             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
984     }
985 
986     // If nothing was emitted, see if we need to emit a space.
987     if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
988         emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
989 }
990 
emitCharacter(UChar c,Node * textNode,Node * offsetBaseNode,int textStartOffset,int textEndOffset)991 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
992 {
993     m_hasEmitted = true;
994 
995     // remember information with which to construct the TextIterator::range()
996     // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
997     m_positionNode = textNode;
998     m_positionOffsetBaseNode = offsetBaseNode;
999     m_positionStartOffset = textStartOffset;
1000     m_positionEndOffset = textEndOffset;
1001 
1002     // remember information with which to construct the TextIterator::characters() and length()
1003     m_singleCharacterBuffer = c;
1004     m_textCharacters = &m_singleCharacterBuffer;
1005     m_textLength = 1;
1006 
1007     // remember some iteration state
1008     m_lastTextNodeEndedWithCollapsedSpace = false;
1009     m_lastCharacter = c;
1010 }
1011 
emitText(Node * textNode,RenderObject * renderObject,int textStartOffset,int textEndOffset)1012 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
1013 {
1014     RenderText* renderer = toRenderText(renderObject);
1015     m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
1016     ASSERT(m_text.characters());
1017 
1018     m_positionNode = textNode;
1019     m_positionOffsetBaseNode = 0;
1020     m_positionStartOffset = textStartOffset;
1021     m_positionEndOffset = textEndOffset;
1022     m_textCharacters = m_text.characters() + textStartOffset;
1023     m_textLength = textEndOffset - textStartOffset;
1024     m_lastCharacter = m_text[textEndOffset - 1];
1025 
1026     m_lastTextNodeEndedWithCollapsedSpace = false;
1027     m_hasEmitted = true;
1028 }
1029 
emitText(Node * textNode,int textStartOffset,int textEndOffset)1030 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
1031 {
1032     emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
1033 }
1034 
range() const1035 PassRefPtr<Range> TextIterator::range() const
1036 {
1037     // use the current run information, if we have it
1038     if (m_positionNode) {
1039         if (m_positionOffsetBaseNode) {
1040             int index = m_positionOffsetBaseNode->nodeIndex();
1041             m_positionStartOffset += index;
1042             m_positionEndOffset += index;
1043             m_positionOffsetBaseNode = 0;
1044         }
1045         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1046     }
1047 
1048     // otherwise, return the end of the overall range we were given
1049     if (m_endContainer)
1050         return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
1051 
1052     return 0;
1053 }
1054 
node() const1055 Node* TextIterator::node() const
1056 {
1057     RefPtr<Range> textRange = range();
1058     if (!textRange)
1059         return 0;
1060 
1061     Node* node = textRange->startContainer();
1062     if (!node)
1063         return 0;
1064     if (node->offsetInCharacters())
1065         return node;
1066 
1067     return node->childNode(textRange->startOffset());
1068 }
1069 
1070 // --------
1071 
SimplifiedBackwardsTextIterator()1072 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
1073     : m_behavior(TextIteratorDefaultBehavior)
1074     , m_node(0)
1075     , m_positionNode(0)
1076 #if OS(ANDROID)
1077     , m_stopsOnFormControls(false)
1078     , m_shouldStop(false)
1079 #endif
1080 {
1081 }
1082 
SimplifiedBackwardsTextIterator(const Range * r,TextIteratorBehavior behavior)1083 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
1084     : m_behavior(behavior)
1085     , m_node(0)
1086     , m_positionNode(0)
1087 #if OS(ANDROID)
1088     , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
1089     , m_shouldStop(false)
1090 #endif
1091 {
1092 #if OS(ANDROID)
1093     ASSERT(m_behavior == TextIteratorDefaultBehavior || m_behavior == TextIteratorStopsOnFormControls);
1094 #else
1095     ASSERT(m_behavior == TextIteratorDefaultBehavior);
1096 #endif
1097 
1098     if (!r)
1099         return;
1100 
1101     Node* startNode = r->startContainer();
1102     if (!startNode)
1103         return;
1104     Node* endNode = r->endContainer();
1105     int startOffset = r->startOffset();
1106     int endOffset = r->endOffset();
1107 
1108     if (!startNode->offsetInCharacters()) {
1109         if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
1110             startNode = startNode->childNode(startOffset);
1111             startOffset = 0;
1112         }
1113     }
1114     if (!endNode->offsetInCharacters()) {
1115         if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
1116             endNode = endNode->childNode(endOffset - 1);
1117             endOffset = lastOffsetInNode(endNode);
1118         }
1119     }
1120 
1121     m_node = endNode;
1122     setUpFullyClippedStack(m_fullyClippedStack, m_node);
1123     m_offset = endOffset;
1124     m_handledNode = false;
1125     m_handledChildren = endOffset == 0;
1126 
1127     m_startNode = startNode;
1128     m_startOffset = startOffset;
1129     m_endNode = endNode;
1130     m_endOffset = endOffset;
1131 
1132 #ifndef NDEBUG
1133     // Need this just because of the assert.
1134     m_positionNode = endNode;
1135 #endif
1136 
1137     m_lastTextNode = 0;
1138     m_lastCharacter = '\n';
1139 
1140     m_havePassedStartNode = false;
1141 
1142     advance();
1143 }
1144 
atEnd() const1145 bool SimplifiedBackwardsTextIterator::atEnd() const
1146 {
1147 #if OS(ANDROID)
1148     return !m_positionNode || m_shouldStop;
1149 #else
1150     return !m_positionNode;
1151 #endif
1152 }
1153 
advance()1154 void SimplifiedBackwardsTextIterator::advance()
1155 {
1156     ASSERT(m_positionNode);
1157 
1158 #if OS(ANDROID)
1159     if (m_shouldStop)
1160         return;
1161 
1162     // Prevent changing the iterator position if a form control element was found and advance should stop on it.
1163     if (m_stopsOnFormControls && checkFormControlElement(m_node)) {
1164         m_shouldStop = true;
1165         return;
1166     }
1167 #endif
1168 
1169     m_positionNode = 0;
1170     m_textLength = 0;
1171 
1172     while (m_node && !m_havePassedStartNode) {
1173         // Don't handle node if we start iterating at [node, 0].
1174         if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
1175             RenderObject* renderer = m_node->renderer();
1176             if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
1177                 // FIXME: What about CDATA_SECTION_NODE?
1178                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1179                     m_handledNode = handleTextNode();
1180             } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1181                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1182                     m_handledNode = handleReplacedElement();
1183             } else
1184                 m_handledNode = handleNonTextNode();
1185             if (m_positionNode)
1186                 return;
1187         }
1188 
1189         if (!m_handledChildren && m_node->hasChildNodes()) {
1190             m_node = m_node->lastChild();
1191             pushFullyClippedState(m_fullyClippedStack, m_node);
1192         } else {
1193             // Exit empty containers as we pass over them or containers
1194             // where [container, 0] is where we started iterating.
1195             if (!m_handledNode
1196                     && canHaveChildrenForEditing(m_node)
1197                     && m_node->parentNode()
1198                     && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
1199                 exitNode();
1200                 if (m_positionNode) {
1201                     m_handledNode = true;
1202                     m_handledChildren = true;
1203                     return;
1204                 }
1205             }
1206 
1207             // Exit all other containers.
1208             while (!m_node->previousSibling()) {
1209                 if (!advanceRespectingRange(m_node->parentOrHostNode()))
1210                     break;
1211                 m_fullyClippedStack.pop();
1212                 exitNode();
1213                 if (m_positionNode) {
1214                     m_handledNode = true;
1215                     m_handledChildren = true;
1216                     return;
1217                 }
1218             }
1219 
1220             m_fullyClippedStack.pop();
1221             if (advanceRespectingRange(m_node->previousSibling()))
1222                 pushFullyClippedState(m_fullyClippedStack, m_node);
1223             else
1224                 m_node = 0;
1225         }
1226 
1227         // For the purpose of word boundary detection,
1228         // we should iterate all visible text and trailing (collapsed) whitespaces.
1229         m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
1230         m_handledNode = false;
1231         m_handledChildren = false;
1232 
1233         if (m_positionNode)
1234             return;
1235     }
1236 }
1237 
handleTextNode()1238 bool SimplifiedBackwardsTextIterator::handleTextNode()
1239 {
1240     m_lastTextNode = m_node;
1241 
1242     RenderText* renderer = toRenderText(m_node->renderer());
1243     String str = renderer->text();
1244 
1245     if (!renderer->firstTextBox() && str.length() > 0)
1246         return true;
1247 
1248     m_positionEndOffset = m_offset;
1249 
1250     m_offset = (m_node == m_startNode) ? m_startOffset : 0;
1251     m_positionNode = m_node;
1252     m_positionStartOffset = m_offset;
1253     m_textLength = m_positionEndOffset - m_positionStartOffset;
1254     m_textCharacters = str.characters() + m_positionStartOffset;
1255 
1256     m_lastCharacter = str[m_positionEndOffset - 1];
1257 
1258     return true;
1259 }
1260 
handleReplacedElement()1261 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1262 {
1263     unsigned index = m_node->nodeIndex();
1264     // We want replaced elements to behave like punctuation for boundary
1265     // finding, and to simply take up space for the selection preservation
1266     // code in moveParagraphs, so we use a comma.  Unconditionally emit
1267     // here because this iterator is only used for boundary finding.
1268     emitCharacter(',', m_node->parentNode(), index, index + 1);
1269     return true;
1270 }
1271 
handleNonTextNode()1272 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1273 {
1274     // We can use a linefeed in place of a tab because this simple iterator is only used to
1275     // find boundaries, not actual content.  A linefeed breaks words, sentences, and paragraphs.
1276     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1277         unsigned index = m_node->nodeIndex();
1278         // The start of this emitted range is wrong. Ensuring correctness would require
1279         // VisiblePositions and so would be slow. previousBoundary expects this.
1280         emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
1281     }
1282     return true;
1283 }
1284 
exitNode()1285 void SimplifiedBackwardsTextIterator::exitNode()
1286 {
1287     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1288         // The start of this emitted range is wrong. Ensuring correctness would require
1289         // VisiblePositions and so would be slow. previousBoundary expects this.
1290         emitCharacter('\n', m_node, 0, 0);
1291     }
1292 }
1293 
emitCharacter(UChar c,Node * node,int startOffset,int endOffset)1294 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
1295 {
1296     m_singleCharacterBuffer = c;
1297     m_positionNode = node;
1298     m_positionStartOffset = startOffset;
1299     m_positionEndOffset = endOffset;
1300     m_textCharacters = &m_singleCharacterBuffer;
1301     m_textLength = 1;
1302     m_lastCharacter = c;
1303 }
1304 
advanceRespectingRange(Node * next)1305 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1306 {
1307     if (!next)
1308         return false;
1309     m_havePassedStartNode |= m_node == m_startNode;
1310     if (m_havePassedStartNode)
1311         return false;
1312     m_node = next;
1313     return true;
1314 }
1315 
range() const1316 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
1317 {
1318     if (m_positionNode)
1319         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1320 
1321     return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
1322 }
1323 
1324 // --------
1325 
CharacterIterator()1326 CharacterIterator::CharacterIterator()
1327     : m_offset(0)
1328     , m_runOffset(0)
1329     , m_atBreak(true)
1330 {
1331 }
1332 
CharacterIterator(const Range * r,TextIteratorBehavior behavior)1333 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
1334     : m_offset(0)
1335     , m_runOffset(0)
1336     , m_atBreak(true)
1337     , m_textIterator(r, behavior)
1338 {
1339     while (!atEnd() && m_textIterator.length() == 0)
1340         m_textIterator.advance();
1341 }
1342 
range() const1343 PassRefPtr<Range> CharacterIterator::range() const
1344 {
1345     RefPtr<Range> r = m_textIterator.range();
1346     if (!m_textIterator.atEnd()) {
1347         if (m_textIterator.length() <= 1) {
1348             ASSERT(m_runOffset == 0);
1349         } else {
1350             Node* n = r->startContainer();
1351             ASSERT(n == r->endContainer());
1352             int offset = r->startOffset() + m_runOffset;
1353             ExceptionCode ec = 0;
1354             r->setStart(n, offset, ec);
1355             r->setEnd(n, offset + 1, ec);
1356             ASSERT(!ec);
1357         }
1358     }
1359     return r.release();
1360 }
1361 
advance(int count)1362 void CharacterIterator::advance(int count)
1363 {
1364     if (count <= 0) {
1365         ASSERT(count == 0);
1366         return;
1367     }
1368 
1369     m_atBreak = false;
1370 
1371     // easy if there is enough left in the current m_textIterator run
1372     int remaining = m_textIterator.length() - m_runOffset;
1373     if (count < remaining) {
1374         m_runOffset += count;
1375         m_offset += count;
1376         return;
1377     }
1378 
1379     // exhaust the current m_textIterator run
1380     count -= remaining;
1381     m_offset += remaining;
1382 
1383     // move to a subsequent m_textIterator run
1384     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1385         int runLength = m_textIterator.length();
1386         if (runLength == 0)
1387             m_atBreak = true;
1388         else {
1389             // see whether this is m_textIterator to use
1390             if (count < runLength) {
1391                 m_runOffset = count;
1392                 m_offset += count;
1393                 return;
1394             }
1395 
1396             // exhaust this m_textIterator run
1397             count -= runLength;
1398             m_offset += runLength;
1399         }
1400     }
1401 
1402     // ran to the end of the m_textIterator... no more runs left
1403     m_atBreak = true;
1404     m_runOffset = 0;
1405 }
1406 
string(int numChars)1407 String CharacterIterator::string(int numChars)
1408 {
1409     Vector<UChar> result;
1410     result.reserveInitialCapacity(numChars);
1411     while (numChars > 0 && !atEnd()) {
1412         int runSize = min(numChars, length());
1413         result.append(characters(), runSize);
1414         numChars -= runSize;
1415         advance(runSize);
1416     }
1417     return String::adopt(result);
1418 }
1419 
characterSubrange(CharacterIterator & it,int offset,int length)1420 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1421 {
1422     it.advance(offset);
1423     RefPtr<Range> start = it.range();
1424 
1425     if (length > 1)
1426         it.advance(length - 1);
1427     RefPtr<Range> end = it.range();
1428 
1429     return Range::create(start->startContainer()->document(),
1430         start->startContainer(), start->startOffset(),
1431         end->endContainer(), end->endOffset());
1432 }
1433 
BackwardsCharacterIterator()1434 BackwardsCharacterIterator::BackwardsCharacterIterator()
1435     : m_offset(0)
1436     , m_runOffset(0)
1437     , m_atBreak(true)
1438 {
1439 }
1440 
BackwardsCharacterIterator(const Range * range,TextIteratorBehavior behavior)1441 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
1442     : m_offset(0)
1443     , m_runOffset(0)
1444     , m_atBreak(true)
1445     , m_textIterator(range, behavior)
1446 {
1447     while (!atEnd() && !m_textIterator.length())
1448         m_textIterator.advance();
1449 }
1450 
range() const1451 PassRefPtr<Range> BackwardsCharacterIterator::range() const
1452 {
1453     RefPtr<Range> r = m_textIterator.range();
1454     if (!m_textIterator.atEnd()) {
1455         if (m_textIterator.length() <= 1)
1456             ASSERT(m_runOffset == 0);
1457         else {
1458             Node* n = r->startContainer();
1459             ASSERT(n == r->endContainer());
1460             int offset = r->endOffset() - m_runOffset;
1461             ExceptionCode ec = 0;
1462             r->setStart(n, offset - 1, ec);
1463             r->setEnd(n, offset, ec);
1464             ASSERT(!ec);
1465         }
1466     }
1467     return r.release();
1468 }
1469 
advance(int count)1470 void BackwardsCharacterIterator::advance(int count)
1471 {
1472     if (count <= 0) {
1473         ASSERT(!count);
1474         return;
1475     }
1476 
1477     m_atBreak = false;
1478 
1479     int remaining = m_textIterator.length() - m_runOffset;
1480     if (count < remaining) {
1481         m_runOffset += count;
1482         m_offset += count;
1483         return;
1484     }
1485 
1486     count -= remaining;
1487     m_offset += remaining;
1488 
1489     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1490         int runLength = m_textIterator.length();
1491         if (runLength == 0)
1492             m_atBreak = true;
1493         else {
1494             if (count < runLength) {
1495                 m_runOffset = count;
1496                 m_offset += count;
1497                 return;
1498             }
1499 
1500             count -= runLength;
1501             m_offset += runLength;
1502         }
1503     }
1504 
1505     m_atBreak = true;
1506     m_runOffset = 0;
1507 }
1508 
1509 // --------
1510 
WordAwareIterator()1511 WordAwareIterator::WordAwareIterator()
1512     : m_previousText(0)
1513     , m_didLookAhead(false)
1514 {
1515 }
1516 
WordAwareIterator(const Range * r)1517 WordAwareIterator::WordAwareIterator(const Range* r)
1518     : m_previousText(0)
1519     , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1520     , m_textIterator(r)
1521 {
1522     advance(); // get in position over the first chunk of text
1523 }
1524 
~WordAwareIterator()1525 WordAwareIterator::~WordAwareIterator()
1526 {
1527 }
1528 
1529 // We're always in one of these modes:
1530 // - The current chunk in the text iterator is our current chunk
1531 //      (typically its a piece of whitespace, or text that ended with whitespace)
1532 // - The previous chunk in the text iterator is our current chunk
1533 //      (we looked ahead to the next chunk and found a word boundary)
1534 // - We built up our own chunk of text from many chunks from the text iterator
1535 
1536 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1537 
advance()1538 void WordAwareIterator::advance()
1539 {
1540     m_previousText = 0;
1541     m_buffer.clear();      // toss any old buffer we built up
1542 
1543     // If last time we did a look-ahead, start with that looked-ahead chunk now
1544     if (!m_didLookAhead) {
1545         ASSERT(!m_textIterator.atEnd());
1546         m_textIterator.advance();
1547     }
1548     m_didLookAhead = false;
1549 
1550     // Go to next non-empty chunk
1551     while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1552         m_textIterator.advance();
1553     m_range = m_textIterator.range();
1554 
1555     if (m_textIterator.atEnd())
1556         return;
1557 
1558     while (1) {
1559         // If this chunk ends in whitespace we can just use it as our chunk.
1560         if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1561             return;
1562 
1563         // If this is the first chunk that failed, save it in previousText before look ahead
1564         if (m_buffer.isEmpty()) {
1565             m_previousText = m_textIterator.characters();
1566             m_previousLength = m_textIterator.length();
1567         }
1568 
1569         // Look ahead to next chunk.  If it is whitespace or a break, we can use the previous stuff
1570         m_textIterator.advance();
1571         if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1572             m_didLookAhead = true;
1573             return;
1574         }
1575 
1576         if (m_buffer.isEmpty()) {
1577             // Start gobbling chunks until we get to a suitable stopping point
1578             m_buffer.append(m_previousText, m_previousLength);
1579             m_previousText = 0;
1580         }
1581         m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1582         int exception = 0;
1583         m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1584     }
1585 }
1586 
length() const1587 int WordAwareIterator::length() const
1588 {
1589     if (!m_buffer.isEmpty())
1590         return m_buffer.size();
1591     if (m_previousText)
1592         return m_previousLength;
1593     return m_textIterator.length();
1594 }
1595 
characters() const1596 const UChar* WordAwareIterator::characters() const
1597 {
1598     if (!m_buffer.isEmpty())
1599         return m_buffer.data();
1600     if (m_previousText)
1601         return m_previousText;
1602     return m_textIterator.characters();
1603 }
1604 
1605 // --------
1606 
foldQuoteMarkOrSoftHyphen(UChar c)1607 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
1608 {
1609     switch (c) {
1610         case hebrewPunctuationGershayim:
1611         case leftDoubleQuotationMark:
1612         case rightDoubleQuotationMark:
1613             return '"';
1614         case hebrewPunctuationGeresh:
1615         case leftSingleQuotationMark:
1616         case rightSingleQuotationMark:
1617             return '\'';
1618         case softHyphen:
1619             // Replace soft hyphen with an ignorable character so that their presence or absence will
1620             // not affect string comparison.
1621             return 0;
1622         default:
1623             return c;
1624     }
1625 }
1626 
foldQuoteMarksAndSoftHyphens(String & s)1627 static inline void foldQuoteMarksAndSoftHyphens(String& s)
1628 {
1629     s.replace(hebrewPunctuationGeresh, '\'');
1630     s.replace(hebrewPunctuationGershayim, '"');
1631     s.replace(leftDoubleQuotationMark, '"');
1632     s.replace(leftSingleQuotationMark, '\'');
1633     s.replace(rightDoubleQuotationMark, '"');
1634     s.replace(rightSingleQuotationMark, '\'');
1635     // Replace soft hyphen with an ignorable character so that their presence or absence will
1636     // not affect string comparison.
1637     s.replace(softHyphen, 0);
1638 }
1639 
1640 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1641 
foldQuoteMarksAndSoftHyphens(UChar * data,size_t length)1642 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
1643 {
1644     for (size_t i = 0; i < length; ++i)
1645         data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
1646 }
1647 
1648 static const size_t minimumSearchBufferSize = 8192;
1649 
1650 #ifndef NDEBUG
1651 static bool searcherInUse;
1652 #endif
1653 
createSearcher()1654 static UStringSearch* createSearcher()
1655 {
1656     // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1657     // but it doesn't matter exactly what it is, since we don't perform any searches
1658     // without setting both the pattern and the text.
1659     UErrorCode status = U_ZERO_ERROR;
1660     UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
1661     ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1662     return searcher;
1663 }
1664 
searcher()1665 static UStringSearch* searcher()
1666 {
1667     static UStringSearch* searcher = createSearcher();
1668     return searcher;
1669 }
1670 
lockSearcher()1671 static inline void lockSearcher()
1672 {
1673 #ifndef NDEBUG
1674     ASSERT(!searcherInUse);
1675     searcherInUse = true;
1676 #endif
1677 }
1678 
unlockSearcher()1679 static inline void unlockSearcher()
1680 {
1681 #ifndef NDEBUG
1682     ASSERT(searcherInUse);
1683     searcherInUse = false;
1684 #endif
1685 }
1686 
1687 // ICU's search ignores the distinction between small kana letters and ones
1688 // that are not small, and also characters that differ only in the voicing
1689 // marks when considering only primary collation strength diffrences.
1690 // This is not helpful for end users, since these differences make words
1691 // distinct, so for our purposes we need these to be considered.
1692 // The Unicode folks do not think the collation algorithm should be
1693 // changed. To work around this, we would like to tailor the ICU searcher,
1694 // but we can't get that to work yet. So instead, we check for cases where
1695 // these differences occur, and skip those matches.
1696 
1697 // We refer to the above technique as the "kana workaround". The next few
1698 // functions are helper functinos for the kana workaround.
1699 
isKanaLetter(UChar character)1700 static inline bool isKanaLetter(UChar character)
1701 {
1702     // Hiragana letters.
1703     if (character >= 0x3041 && character <= 0x3096)
1704         return true;
1705 
1706     // Katakana letters.
1707     if (character >= 0x30A1 && character <= 0x30FA)
1708         return true;
1709     if (character >= 0x31F0 && character <= 0x31FF)
1710         return true;
1711 
1712     // Halfwidth katakana letters.
1713     if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1714         return true;
1715 
1716     return false;
1717 }
1718 
isSmallKanaLetter(UChar character)1719 static inline bool isSmallKanaLetter(UChar character)
1720 {
1721     ASSERT(isKanaLetter(character));
1722 
1723     switch (character) {
1724     case 0x3041: // HIRAGANA LETTER SMALL A
1725     case 0x3043: // HIRAGANA LETTER SMALL I
1726     case 0x3045: // HIRAGANA LETTER SMALL U
1727     case 0x3047: // HIRAGANA LETTER SMALL E
1728     case 0x3049: // HIRAGANA LETTER SMALL O
1729     case 0x3063: // HIRAGANA LETTER SMALL TU
1730     case 0x3083: // HIRAGANA LETTER SMALL YA
1731     case 0x3085: // HIRAGANA LETTER SMALL YU
1732     case 0x3087: // HIRAGANA LETTER SMALL YO
1733     case 0x308E: // HIRAGANA LETTER SMALL WA
1734     case 0x3095: // HIRAGANA LETTER SMALL KA
1735     case 0x3096: // HIRAGANA LETTER SMALL KE
1736     case 0x30A1: // KATAKANA LETTER SMALL A
1737     case 0x30A3: // KATAKANA LETTER SMALL I
1738     case 0x30A5: // KATAKANA LETTER SMALL U
1739     case 0x30A7: // KATAKANA LETTER SMALL E
1740     case 0x30A9: // KATAKANA LETTER SMALL O
1741     case 0x30C3: // KATAKANA LETTER SMALL TU
1742     case 0x30E3: // KATAKANA LETTER SMALL YA
1743     case 0x30E5: // KATAKANA LETTER SMALL YU
1744     case 0x30E7: // KATAKANA LETTER SMALL YO
1745     case 0x30EE: // KATAKANA LETTER SMALL WA
1746     case 0x30F5: // KATAKANA LETTER SMALL KA
1747     case 0x30F6: // KATAKANA LETTER SMALL KE
1748     case 0x31F0: // KATAKANA LETTER SMALL KU
1749     case 0x31F1: // KATAKANA LETTER SMALL SI
1750     case 0x31F2: // KATAKANA LETTER SMALL SU
1751     case 0x31F3: // KATAKANA LETTER SMALL TO
1752     case 0x31F4: // KATAKANA LETTER SMALL NU
1753     case 0x31F5: // KATAKANA LETTER SMALL HA
1754     case 0x31F6: // KATAKANA LETTER SMALL HI
1755     case 0x31F7: // KATAKANA LETTER SMALL HU
1756     case 0x31F8: // KATAKANA LETTER SMALL HE
1757     case 0x31F9: // KATAKANA LETTER SMALL HO
1758     case 0x31FA: // KATAKANA LETTER SMALL MU
1759     case 0x31FB: // KATAKANA LETTER SMALL RA
1760     case 0x31FC: // KATAKANA LETTER SMALL RI
1761     case 0x31FD: // KATAKANA LETTER SMALL RU
1762     case 0x31FE: // KATAKANA LETTER SMALL RE
1763     case 0x31FF: // KATAKANA LETTER SMALL RO
1764     case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1765     case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1766     case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1767     case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1768     case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1769     case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1770     case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1771     case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1772     case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1773         return true;
1774     }
1775     return false;
1776 }
1777 
1778 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1779 
composedVoicedSoundMark(UChar character)1780 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1781 {
1782     ASSERT(isKanaLetter(character));
1783 
1784     switch (character) {
1785     case 0x304C: // HIRAGANA LETTER GA
1786     case 0x304E: // HIRAGANA LETTER GI
1787     case 0x3050: // HIRAGANA LETTER GU
1788     case 0x3052: // HIRAGANA LETTER GE
1789     case 0x3054: // HIRAGANA LETTER GO
1790     case 0x3056: // HIRAGANA LETTER ZA
1791     case 0x3058: // HIRAGANA LETTER ZI
1792     case 0x305A: // HIRAGANA LETTER ZU
1793     case 0x305C: // HIRAGANA LETTER ZE
1794     case 0x305E: // HIRAGANA LETTER ZO
1795     case 0x3060: // HIRAGANA LETTER DA
1796     case 0x3062: // HIRAGANA LETTER DI
1797     case 0x3065: // HIRAGANA LETTER DU
1798     case 0x3067: // HIRAGANA LETTER DE
1799     case 0x3069: // HIRAGANA LETTER DO
1800     case 0x3070: // HIRAGANA LETTER BA
1801     case 0x3073: // HIRAGANA LETTER BI
1802     case 0x3076: // HIRAGANA LETTER BU
1803     case 0x3079: // HIRAGANA LETTER BE
1804     case 0x307C: // HIRAGANA LETTER BO
1805     case 0x3094: // HIRAGANA LETTER VU
1806     case 0x30AC: // KATAKANA LETTER GA
1807     case 0x30AE: // KATAKANA LETTER GI
1808     case 0x30B0: // KATAKANA LETTER GU
1809     case 0x30B2: // KATAKANA LETTER GE
1810     case 0x30B4: // KATAKANA LETTER GO
1811     case 0x30B6: // KATAKANA LETTER ZA
1812     case 0x30B8: // KATAKANA LETTER ZI
1813     case 0x30BA: // KATAKANA LETTER ZU
1814     case 0x30BC: // KATAKANA LETTER ZE
1815     case 0x30BE: // KATAKANA LETTER ZO
1816     case 0x30C0: // KATAKANA LETTER DA
1817     case 0x30C2: // KATAKANA LETTER DI
1818     case 0x30C5: // KATAKANA LETTER DU
1819     case 0x30C7: // KATAKANA LETTER DE
1820     case 0x30C9: // KATAKANA LETTER DO
1821     case 0x30D0: // KATAKANA LETTER BA
1822     case 0x30D3: // KATAKANA LETTER BI
1823     case 0x30D6: // KATAKANA LETTER BU
1824     case 0x30D9: // KATAKANA LETTER BE
1825     case 0x30DC: // KATAKANA LETTER BO
1826     case 0x30F4: // KATAKANA LETTER VU
1827     case 0x30F7: // KATAKANA LETTER VA
1828     case 0x30F8: // KATAKANA LETTER VI
1829     case 0x30F9: // KATAKANA LETTER VE
1830     case 0x30FA: // KATAKANA LETTER VO
1831         return VoicedSoundMark;
1832     case 0x3071: // HIRAGANA LETTER PA
1833     case 0x3074: // HIRAGANA LETTER PI
1834     case 0x3077: // HIRAGANA LETTER PU
1835     case 0x307A: // HIRAGANA LETTER PE
1836     case 0x307D: // HIRAGANA LETTER PO
1837     case 0x30D1: // KATAKANA LETTER PA
1838     case 0x30D4: // KATAKANA LETTER PI
1839     case 0x30D7: // KATAKANA LETTER PU
1840     case 0x30DA: // KATAKANA LETTER PE
1841     case 0x30DD: // KATAKANA LETTER PO
1842         return SemiVoicedSoundMark;
1843     }
1844     return NoVoicedSoundMark;
1845 }
1846 
isCombiningVoicedSoundMark(UChar character)1847 static inline bool isCombiningVoicedSoundMark(UChar character)
1848 {
1849     switch (character) {
1850     case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1851     case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1852         return true;
1853     }
1854     return false;
1855 }
1856 
containsKanaLetters(const String & pattern)1857 static inline bool containsKanaLetters(const String& pattern)
1858 {
1859     const UChar* characters = pattern.characters();
1860     unsigned length = pattern.length();
1861     for (unsigned i = 0; i < length; ++i) {
1862         if (isKanaLetter(characters[i]))
1863             return true;
1864     }
1865     return false;
1866 }
1867 
normalizeCharacters(const UChar * characters,unsigned length,Vector<UChar> & buffer)1868 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1869 {
1870     ASSERT(length);
1871 
1872     buffer.resize(length);
1873 
1874     UErrorCode status = U_ZERO_ERROR;
1875     size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1876     ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1877     ASSERT(bufferSize);
1878 
1879     buffer.resize(bufferSize);
1880 
1881     if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1882         return;
1883 
1884     status = U_ZERO_ERROR;
1885     unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1886     ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1887 }
1888 
isNonLatin1Separator(UChar32 character)1889 static bool isNonLatin1Separator(UChar32 character)
1890 {
1891     ASSERT_ARG(character, character >= 256);
1892 
1893     return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1894 }
1895 
isSeparator(UChar32 character)1896 static inline bool isSeparator(UChar32 character)
1897 {
1898     static const bool latin1SeparatorTable[256] = {
1899         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1900         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1901         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1902         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, //                         : ; < = > ?
1903         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   @
1904         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, //                         [ \ ] ^ _
1905         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   `
1906         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, //                           { | } ~
1907         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1908         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1909         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1910         1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1911         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1912         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1913         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1914         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1915     };
1916 
1917     if (character < 256)
1918         return latin1SeparatorTable[character];
1919 
1920     return isNonLatin1Separator(character);
1921 }
1922 
SearchBuffer(const String & target,FindOptions options)1923 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1924     : m_target(target)
1925     , m_options(options)
1926     , m_prefixLength(0)
1927     , m_atBreak(true)
1928     , m_needsMoreContext(options & AtWordStarts)
1929     , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1930 {
1931     ASSERT(!m_target.isEmpty());
1932 
1933     // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1934     // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1935     // to add tailoring on top of the locale-specific tailoring as of this writing.
1936     foldQuoteMarksAndSoftHyphens(m_target);
1937 
1938     size_t targetLength = m_target.length();
1939     m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
1940     m_overlap = m_buffer.capacity() / 4;
1941 
1942     if ((m_options & AtWordStarts) && targetLength) {
1943         UChar32 targetFirstCharacter;
1944         U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
1945         // Characters in the separator category never really occur at the beginning of a word,
1946         // so if the target begins with such a character, we just ignore the AtWordStart option.
1947         if (isSeparator(targetFirstCharacter)) {
1948             m_options &= ~AtWordStarts;
1949             m_needsMoreContext = false;
1950         }
1951     }
1952 
1953     // Grab the single global searcher.
1954     // If we ever have a reason to do more than once search buffer at once, we'll have
1955     // to move to multiple searchers.
1956     lockSearcher();
1957 
1958     UStringSearch* searcher = WebCore::searcher();
1959     UCollator* collator = usearch_getCollator(searcher);
1960 
1961     UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
1962     if (ucol_getStrength(collator) != strength) {
1963         ucol_setStrength(collator, strength);
1964         usearch_reset(searcher);
1965     }
1966 
1967     UErrorCode status = U_ZERO_ERROR;
1968     usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1969     ASSERT(status == U_ZERO_ERROR);
1970 
1971     // The kana workaround requires a normalized copy of the target string.
1972     if (m_targetRequiresKanaWorkaround)
1973         normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
1974 }
1975 
~SearchBuffer()1976 inline SearchBuffer::~SearchBuffer()
1977 {
1978     // Leave the static object pointing to a valid string.
1979     UErrorCode status = U_ZERO_ERROR;
1980     usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
1981     ASSERT(status == U_ZERO_ERROR);
1982 
1983     unlockSearcher();
1984 }
1985 
append(const UChar * characters,size_t length)1986 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
1987 {
1988     ASSERT(length);
1989 
1990     if (m_atBreak) {
1991         m_buffer.shrink(0);
1992         m_prefixLength = 0;
1993         m_atBreak = false;
1994     } else if (m_buffer.size() == m_buffer.capacity()) {
1995         memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
1996         m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
1997         m_buffer.shrink(m_overlap);
1998     }
1999 
2000     size_t oldLength = m_buffer.size();
2001     size_t usableLength = min(m_buffer.capacity() - oldLength, length);
2002     ASSERT(usableLength);
2003     m_buffer.append(characters, usableLength);
2004     foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
2005     return usableLength;
2006 }
2007 
needsMoreContext() const2008 inline bool SearchBuffer::needsMoreContext() const
2009 {
2010     return m_needsMoreContext;
2011 }
2012 
prependContext(const UChar * characters,size_t length)2013 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
2014 {
2015     ASSERT(m_needsMoreContext);
2016     ASSERT(m_prefixLength == m_buffer.size());
2017 
2018     if (!length)
2019         return;
2020 
2021     m_atBreak = false;
2022 
2023     size_t wordBoundaryContextStart = length;
2024     if (wordBoundaryContextStart) {
2025         U16_BACK_1(characters, 0, wordBoundaryContextStart);
2026         wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
2027     }
2028 
2029     size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
2030     m_buffer.prepend(characters + length - usableLength, usableLength);
2031     m_prefixLength += usableLength;
2032 
2033     if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
2034         m_needsMoreContext = false;
2035 }
2036 
atBreak() const2037 inline bool SearchBuffer::atBreak() const
2038 {
2039     return m_atBreak;
2040 }
2041 
reachedBreak()2042 inline void SearchBuffer::reachedBreak()
2043 {
2044     m_atBreak = true;
2045 }
2046 
isBadMatch(const UChar * match,size_t matchLength) const2047 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
2048 {
2049     // This function implements the kana workaround. If usearch treats
2050     // it as a match, but we do not want to, then it's a "bad match".
2051     if (!m_targetRequiresKanaWorkaround)
2052         return false;
2053 
2054     // Normalize into a match buffer. We reuse a single buffer rather than
2055     // creating a new one each time.
2056     normalizeCharacters(match, matchLength, m_normalizedMatch);
2057 
2058     const UChar* a = m_normalizedTarget.begin();
2059     const UChar* aEnd = m_normalizedTarget.end();
2060 
2061     const UChar* b = m_normalizedMatch.begin();
2062     const UChar* bEnd = m_normalizedMatch.end();
2063 
2064     while (true) {
2065         // Skip runs of non-kana-letter characters. This is necessary so we can
2066         // correctly handle strings where the target and match have different-length
2067         // runs of characters that match, while still double checking the correctness
2068         // of matches of kana letters with other kana letters.
2069         while (a != aEnd && !isKanaLetter(*a))
2070             ++a;
2071         while (b != bEnd && !isKanaLetter(*b))
2072             ++b;
2073 
2074         // If we reached the end of either the target or the match, we should have
2075         // reached the end of both; both should have the same number of kana letters.
2076         if (a == aEnd || b == bEnd) {
2077             ASSERT(a == aEnd);
2078             ASSERT(b == bEnd);
2079             return false;
2080         }
2081 
2082         // Check for differences in the kana letter character itself.
2083         if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2084             return true;
2085         if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2086             return true;
2087         ++a;
2088         ++b;
2089 
2090         // Check for differences in combining voiced sound marks found after the letter.
2091         while (1) {
2092             if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2093                 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2094                     return true;
2095                 break;
2096             }
2097             if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2098                 return true;
2099             if (*a != *b)
2100                 return true;
2101             ++a;
2102             ++b;
2103         }
2104     }
2105 }
2106 
isWordStartMatch(size_t start,size_t length) const2107 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2108 {
2109     ASSERT(m_options & AtWordStarts);
2110 
2111     if (!start)
2112         return true;
2113 
2114     if (m_options & TreatMedialCapitalAsWordStart) {
2115         int size = m_buffer.size();
2116         int offset = start;
2117         UChar32 firstCharacter;
2118         U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2119         UChar32 previousCharacter;
2120         U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2121 
2122         if (isSeparator(firstCharacter)) {
2123             // The start of a separator run is a word start (".org" in "webkit.org").
2124             if (!isSeparator(previousCharacter))
2125                 return true;
2126         } else if (isASCIIUpper(firstCharacter)) {
2127             // The start of an uppercase run is a word start ("Kit" in "WebKit").
2128             if (!isASCIIUpper(previousCharacter))
2129                 return true;
2130             // The last character of an uppercase run followed by a non-separator, non-digit
2131             // is a word start ("Request" in "XMLHTTPRequest").
2132             offset = start;
2133             U16_FWD_1(m_buffer.data(), offset, size);
2134             UChar32 nextCharacter = 0;
2135             if (offset < size)
2136                 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2137             if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2138                 return true;
2139         } else if (isASCIIDigit(firstCharacter)) {
2140             // The start of a digit run is a word start ("2" in "WebKit2").
2141             if (!isASCIIDigit(previousCharacter))
2142                 return true;
2143         } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2144             // The start of a non-separator, non-uppercase, non-digit run is a word start,
2145             // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2146             return true;
2147         }
2148     }
2149 
2150     size_t wordBreakSearchStart = start + length;
2151     while (wordBreakSearchStart > start)
2152         wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
2153     return wordBreakSearchStart == start;
2154 }
2155 
search(size_t & start)2156 inline size_t SearchBuffer::search(size_t& start)
2157 {
2158     size_t size = m_buffer.size();
2159     if (m_atBreak) {
2160         if (!size)
2161             return 0;
2162     } else {
2163         if (size != m_buffer.capacity())
2164             return 0;
2165     }
2166 
2167     UStringSearch* searcher = WebCore::searcher();
2168 
2169     UErrorCode status = U_ZERO_ERROR;
2170     usearch_setText(searcher, m_buffer.data(), size, &status);
2171     ASSERT(status == U_ZERO_ERROR);
2172 
2173     usearch_setOffset(searcher, m_prefixLength, &status);
2174     ASSERT(status == U_ZERO_ERROR);
2175 
2176     int matchStart = usearch_next(searcher, &status);
2177     ASSERT(status == U_ZERO_ERROR);
2178 
2179 nextMatch:
2180     if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2181         ASSERT(matchStart == USEARCH_DONE);
2182         return 0;
2183     }
2184 
2185     // Matches that start in the overlap area are only tentative.
2186     // The same match may appear later, matching more characters,
2187     // possibly including a combining character that's not yet in the buffer.
2188     if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2189         size_t overlap = m_overlap;
2190         if (m_options & AtWordStarts) {
2191             // Ensure that there is sufficient context before matchStart the next time around for
2192             // determining if it is at a word boundary.
2193             int wordBoundaryContextStart = matchStart;
2194             U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2195             wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
2196             overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
2197         }
2198         memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2199         m_prefixLength -= min(m_prefixLength, size - overlap);
2200         m_buffer.shrink(overlap);
2201         return 0;
2202     }
2203 
2204     size_t matchedLength = usearch_getMatchedLength(searcher);
2205     ASSERT(matchStart + matchedLength <= size);
2206 
2207     // If this match is "bad", move on to the next match.
2208     if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
2209         matchStart = usearch_next(searcher, &status);
2210         ASSERT(status == U_ZERO_ERROR);
2211         goto nextMatch;
2212     }
2213 
2214     size_t newSize = size - (matchStart + 1);
2215     memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2216     m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
2217     m_buffer.shrink(newSize);
2218 
2219     start = size - matchStart;
2220     return matchedLength;
2221 }
2222 
2223 #else // !ICU_UNICODE
2224 
SearchBuffer(const String & target,FindOptions options)2225 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2226     : m_target(options & CaseInsensitive ? target.foldCase() : target)
2227     , m_options(options)
2228     , m_buffer(m_target.length())
2229     , m_isCharacterStartBuffer(m_target.length())
2230     , m_isBufferFull(false)
2231     , m_cursor(0)
2232 {
2233     ASSERT(!m_target.isEmpty());
2234     m_target.replace(noBreakSpace, ' ');
2235     foldQuoteMarksAndSoftHyphens(m_target);
2236 }
2237 
~SearchBuffer()2238 inline SearchBuffer::~SearchBuffer()
2239 {
2240 }
2241 
reachedBreak()2242 inline void SearchBuffer::reachedBreak()
2243 {
2244     m_cursor = 0;
2245     m_isBufferFull = false;
2246 }
2247 
atBreak() const2248 inline bool SearchBuffer::atBreak() const
2249 {
2250     return !m_cursor && !m_isBufferFull;
2251 }
2252 
append(UChar c,bool isStart)2253 inline void SearchBuffer::append(UChar c, bool isStart)
2254 {
2255     m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
2256     m_isCharacterStartBuffer[m_cursor] = isStart;
2257     if (++m_cursor == m_target.length()) {
2258         m_cursor = 0;
2259         m_isBufferFull = true;
2260     }
2261 }
2262 
append(const UChar * characters,size_t length)2263 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2264 {
2265     ASSERT(length);
2266     if (!(m_options & CaseInsensitive)) {
2267         append(characters[0], true);
2268         return 1;
2269     }
2270     const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2271     UChar foldedCharacters[maxFoldedCharacters];
2272     bool error;
2273     int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
2274     ASSERT(!error);
2275     ASSERT(numFoldedCharacters);
2276     ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2277     if (!error && numFoldedCharacters) {
2278         numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
2279         append(foldedCharacters[0], true);
2280         for (int i = 1; i < numFoldedCharacters; ++i)
2281             append(foldedCharacters[i], false);
2282     }
2283     return 1;
2284 }
2285 
needsMoreContext() const2286 inline bool SearchBuffer::needsMoreContext() const
2287 {
2288     return false;
2289 }
2290 
prependContext(const UChar *,size_t)2291 void SearchBuffer::prependContext(const UChar*, size_t)
2292 {
2293     ASSERT_NOT_REACHED();
2294 }
2295 
search(size_t & start)2296 inline size_t SearchBuffer::search(size_t& start)
2297 {
2298     if (!m_isBufferFull)
2299         return 0;
2300     if (!m_isCharacterStartBuffer[m_cursor])
2301         return 0;
2302 
2303     size_t tailSpace = m_target.length() - m_cursor;
2304     if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2305         return 0;
2306     if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2307         return 0;
2308 
2309     start = length();
2310 
2311     // Now that we've found a match once, we don't want to find it again, because those
2312     // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2313     // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2314     // we want to get rid of that in the future we could track this with a separate boolean
2315     // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2316     m_isCharacterStartBuffer[m_cursor] = false;
2317 
2318     return start;
2319 }
2320 
2321 // Returns the number of characters that were appended to the buffer (what we are searching in).
2322 // That's not necessarily the same length as the passed-in target string, because case folding
2323 // can make two strings match even though they're not the same length.
length() const2324 size_t SearchBuffer::length() const
2325 {
2326     size_t bufferSize = m_target.length();
2327     size_t length = 0;
2328     for (size_t i = 0; i < bufferSize; ++i)
2329         length += m_isCharacterStartBuffer[i];
2330     return length;
2331 }
2332 
2333 #endif // !ICU_UNICODE
2334 
2335 // --------
2336 
rangeLength(const Range * r,bool forSelectionPreservation)2337 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
2338 {
2339     int length = 0;
2340     for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2341         length += it.length();
2342 
2343     return length;
2344 }
2345 
subrange(Range * entireRange,int characterOffset,int characterCount)2346 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2347 {
2348     CharacterIterator entireRangeIterator(entireRange);
2349     return characterSubrange(entireRangeIterator, characterOffset, characterCount);
2350 }
2351 
rangeFromLocationAndLength(Element * scope,int rangeLocation,int rangeLength,bool forSelectionPreservation)2352 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2353 {
2354     RefPtr<Range> resultRange = scope->document()->createRange();
2355 
2356     int docTextPosition = 0;
2357     int rangeEnd = rangeLocation + rangeLength;
2358     bool startRangeFound = false;
2359 
2360     RefPtr<Range> textRunRange;
2361 
2362     TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2363 
2364     // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2365     if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
2366         textRunRange = it.range();
2367 
2368         ExceptionCode ec = 0;
2369         resultRange->setStart(textRunRange->startContainer(), 0, ec);
2370         ASSERT(!ec);
2371         resultRange->setEnd(textRunRange->startContainer(), 0, ec);
2372         ASSERT(!ec);
2373 
2374         return resultRange.release();
2375     }
2376 
2377     for (; !it.atEnd(); it.advance()) {
2378         int len = it.length();
2379         textRunRange = it.range();
2380 
2381         bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
2382         bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
2383 
2384         // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
2385         // in those cases that textRunRange is used.
2386         if (foundEnd) {
2387             // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2388             // position for emitted '\n's.
2389             if (len == 1 && it.characters()[0] == '\n') {
2390                 scope->document()->updateLayoutIgnorePendingStylesheets();
2391                 it.advance();
2392                 if (!it.atEnd()) {
2393                     RefPtr<Range> range = it.range();
2394                     ExceptionCode ec = 0;
2395                     textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
2396                     ASSERT(!ec);
2397                 } else {
2398                     Position runStart = textRunRange->startPosition();
2399                     Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2400                     if (runEnd.isNotNull()) {
2401                         ExceptionCode ec = 0;
2402                         textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
2403                         ASSERT(!ec);
2404                     }
2405                 }
2406             }
2407         }
2408 
2409         if (foundStart) {
2410             startRangeFound = true;
2411             int exception = 0;
2412             if (textRunRange->startContainer()->isTextNode()) {
2413                 int offset = rangeLocation - docTextPosition;
2414                 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2415             } else {
2416                 if (rangeLocation == docTextPosition)
2417                     resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2418                 else
2419                     resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2420             }
2421         }
2422 
2423         if (foundEnd) {
2424             int exception = 0;
2425             if (textRunRange->startContainer()->isTextNode()) {
2426                 int offset = rangeEnd - docTextPosition;
2427                 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2428             } else {
2429                 if (rangeEnd == docTextPosition)
2430                     resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2431                 else
2432                     resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2433             }
2434             docTextPosition += len;
2435             break;
2436         }
2437         docTextPosition += len;
2438     }
2439 
2440     if (!startRangeFound)
2441         return 0;
2442 
2443     if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
2444         int exception = 0;
2445         resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2446     }
2447 
2448     return resultRange.release();
2449 }
2450 
locationAndLengthFromRange(const Range * range,size_t & location,size_t & length)2451 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
2452 {
2453     location = notFound;
2454     length = 0;
2455 
2456     if (!range->startContainer())
2457         return false;
2458 
2459     Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
2460     Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
2461 
2462     // The critical assumption is that this only gets called with ranges that
2463     // concentrate on a given area containing the selection root. This is done
2464     // because of text fields and textareas. The DOM for those is not
2465     // directly in the document DOM, so ensure that the range does not cross a
2466     // boundary of one of those.
2467     if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
2468         return false;
2469     if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
2470         return false;
2471 
2472     RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
2473     ASSERT(testRange->startContainer() == scope);
2474     location = TextIterator::rangeLength(testRange.get());
2475 
2476     ExceptionCode ec;
2477     testRange->setEnd(range->endContainer(), range->endOffset(), ec);
2478     ASSERT(testRange->startContainer() == scope);
2479     length = TextIterator::rangeLength(testRange.get()) - location;
2480     return true;
2481 }
2482 
2483 // --------
2484 
plainTextToMallocAllocatedBuffer(const Range * r,unsigned & bufferLength,bool isDisplayString,TextIteratorBehavior defaultBehavior)2485 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
2486 {
2487     UChar* result = 0;
2488 
2489     // Do this in pieces to avoid massive reallocations if there is a large amount of text.
2490     // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
2491     static const unsigned cMaxSegmentSize = 1 << 16;
2492     bufferLength = 0;
2493     typedef pair<UChar*, unsigned> TextSegment;
2494     OwnPtr<Vector<TextSegment> > textSegments;
2495     Vector<UChar> textBuffer;
2496     textBuffer.reserveInitialCapacity(cMaxSegmentSize);
2497     TextIteratorBehavior behavior = defaultBehavior;
2498     if (!isDisplayString)
2499         behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2500 
2501     for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2502         if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
2503             UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
2504             if (!newSegmentBuffer)
2505                 goto exit;
2506             memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2507             if (!textSegments)
2508                 textSegments = adoptPtr(new Vector<TextSegment>);
2509             textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
2510             textBuffer.clear();
2511         }
2512         textBuffer.append(it.characters(), it.length());
2513         bufferLength += it.length();
2514     }
2515 
2516     if (!bufferLength)
2517         return 0;
2518 
2519     // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
2520     result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
2521     if (!result)
2522         goto exit;
2523 
2524     {
2525         UChar* resultPos = result;
2526         if (textSegments) {
2527             unsigned size = textSegments->size();
2528             for (unsigned i = 0; i < size; ++i) {
2529                 const TextSegment& segment = textSegments->at(i);
2530                 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
2531                 resultPos += segment.second;
2532             }
2533         }
2534         memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2535     }
2536 
2537 exit:
2538     if (textSegments) {
2539         unsigned size = textSegments->size();
2540         for (unsigned i = 0; i < size; ++i)
2541             free(textSegments->at(i).first);
2542     }
2543 
2544     if (isDisplayString && r->ownerDocument())
2545         r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
2546 
2547     return result;
2548 }
2549 
plainText(const Range * r,TextIteratorBehavior defaultBehavior)2550 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
2551 {
2552     unsigned length;
2553     UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
2554     if (!buf)
2555         return "";
2556     String result(buf, length);
2557     free(buf);
2558     return result;
2559 }
2560 
isAllCollapsibleWhitespace(const String & string)2561 static inline bool isAllCollapsibleWhitespace(const String& string)
2562 {
2563     const UChar* characters = string.characters();
2564     unsigned length = string.length();
2565     for (unsigned i = 0; i < length; ++i) {
2566         if (!isCollapsibleWhitespace(characters[i]))
2567             return false;
2568     }
2569     return true;
2570 }
2571 
collapsedToBoundary(const Range * range,bool forward)2572 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
2573 {
2574     ExceptionCode ec = 0;
2575     RefPtr<Range> result = range->cloneRange(ec);
2576     ASSERT(!ec);
2577     result->collapse(!forward, ec);
2578     ASSERT(!ec);
2579     return result.release();
2580 }
2581 
findPlainText(CharacterIterator & it,const String & target,FindOptions options,size_t & matchStart)2582 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
2583 {
2584     matchStart = 0;
2585     size_t matchLength = 0;
2586 
2587     SearchBuffer buffer(target, options);
2588 
2589     if (buffer.needsMoreContext()) {
2590         RefPtr<Range> startRange = it.range();
2591         RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
2592         ExceptionCode ec = 0;
2593         beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
2594         for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2595             buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
2596             if (!buffer.needsMoreContext())
2597                 break;
2598         }
2599     }
2600 
2601     while (!it.atEnd()) {
2602         it.advance(buffer.append(it.characters(), it.length()));
2603 tryAgain:
2604         size_t matchStartOffset;
2605         if (size_t newMatchLength = buffer.search(matchStartOffset)) {
2606             // Note that we found a match, and where we found it.
2607             size_t lastCharacterInBufferOffset = it.characterOffset();
2608             ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2609             matchStart = lastCharacterInBufferOffset - matchStartOffset;
2610             matchLength = newMatchLength;
2611             // If searching forward, stop on the first match.
2612             // If searching backward, don't stop, so we end up with the last match.
2613             if (!(options & Backwards))
2614                 break;
2615             goto tryAgain;
2616         }
2617         if (it.atBreak() && !buffer.atBreak()) {
2618             buffer.reachedBreak();
2619             goto tryAgain;
2620         }
2621     }
2622 
2623     return matchLength;
2624 }
2625 
findPlainText(const Range * range,const String & target,FindOptions options)2626 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
2627 {
2628     // First, find the text.
2629     size_t matchStart;
2630     size_t matchLength;
2631     {
2632         CharacterIterator findIterator(range, TextIteratorEntersTextControls);
2633         matchLength = findPlainText(findIterator, target, options, matchStart);
2634         if (!matchLength)
2635             return collapsedToBoundary(range, !(options & Backwards));
2636     }
2637 
2638     // Then, find the document position of the start and the end of the text.
2639     CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
2640     return characterSubrange(computeRangeIterator, matchStart, matchLength);
2641 }
2642 
2643 }
2644