• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
3  * Copyright (C) 2005 Alexey Proskuryakov.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "config.h"
28 #include "TextIterator.h"
29 
30 #include "Document.h"
31 #include "Frame.h"
32 #include "HTMLElement.h"
33 #include "HTMLNames.h"
34 #include "htmlediting.h"
35 #include "InlineTextBox.h"
36 #include "Range.h"
37 #include "RenderTableCell.h"
38 #include "RenderTableRow.h"
39 #include "RenderTextControl.h"
40 #include "RenderTextFragment.h"
41 #include "TextBoundaries.h"
42 #include "TextBreakIterator.h"
43 #include "VisiblePosition.h"
44 #include "visible_units.h"
45 #include <wtf/unicode/CharacterNames.h>
46 
47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
48 #include "TextBreakIteratorInternalICU.h"
49 #include <unicode/usearch.h>
50 #endif
51 
52 using namespace WTF::Unicode;
53 using namespace std;
54 
55 namespace WebCore {
56 
57 using namespace HTMLNames;
58 
59 // Buffer that knows how to compare with a search target.
60 // Keeps enough of the previous text to be able to search in the future, but no more.
61 // Non-breaking spaces are always equal to normal spaces.
62 // Case folding is also done if the CaseInsensitive option is specified.
63 // Matches are further filtered if the AtWordStarts option is specified, although some
64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
65 class SearchBuffer {
66     WTF_MAKE_NONCOPYABLE(SearchBuffer);
67 public:
68     SearchBuffer(const String& target, FindOptions);
69     ~SearchBuffer();
70 
71     // Returns number of characters appended; guaranteed to be in the range [1, length].
72     size_t append(const UChar*, size_t length);
73     bool needsMoreContext() const;
74     void prependContext(const UChar*, size_t length);
75     void reachedBreak();
76 
77     // Result is the size in characters of what was found.
78     // And <startOffset> is the number of characters back to the start of what was found.
79     size_t search(size_t& startOffset);
80     bool atBreak() const;
81 
82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
83 
84 private:
85     bool isBadMatch(const UChar*, size_t length) const;
86     bool isWordStartMatch(size_t start, size_t length) const;
87 
88     String m_target;
89     FindOptions m_options;
90 
91     Vector<UChar> m_buffer;
92     size_t m_overlap;
93     size_t m_prefixLength;
94     bool m_atBreak;
95     bool m_needsMoreContext;
96 
97     bool m_targetRequiresKanaWorkaround;
98     Vector<UChar> m_normalizedTarget;
99     mutable Vector<UChar> m_normalizedMatch;
100 
101 #else
102 
103 private:
104     void append(UChar, bool isCharacterStart);
105     size_t length() const;
106 
107     String m_target;
108     FindOptions m_options;
109 
110     Vector<UChar> m_buffer;
111     Vector<bool> m_isCharacterStartBuffer;
112     bool m_isBufferFull;
113     size_t m_cursor;
114 
115 #endif
116 };
117 
118 // --------
119 
120 static const unsigned bitsInWord = sizeof(unsigned) * 8;
121 static const unsigned bitInWordMask = bitsInWord - 1;
122 
BitStack()123 BitStack::BitStack()
124     : m_size(0)
125 {
126 }
127 
~BitStack()128 BitStack::~BitStack()
129 {
130 }
131 
push(bool bit)132 void BitStack::push(bool bit)
133 {
134     unsigned index = m_size / bitsInWord;
135     unsigned shift = m_size & bitInWordMask;
136     if (!shift && index == m_words.size()) {
137         m_words.grow(index + 1);
138         m_words[index] = 0;
139     }
140     unsigned& word = m_words[index];
141     unsigned mask = 1U << shift;
142     if (bit)
143         word |= mask;
144     else
145         word &= ~mask;
146     ++m_size;
147 }
148 
pop()149 void BitStack::pop()
150 {
151     if (m_size)
152         --m_size;
153 }
154 
top() const155 bool BitStack::top() const
156 {
157     if (!m_size)
158         return false;
159     unsigned shift = (m_size - 1) & bitInWordMask;
160     return m_words.last() & (1U << shift);
161 }
162 
size() const163 unsigned BitStack::size() const
164 {
165     return m_size;
166 }
167 
168 // --------
169 
170 #if !ASSERT_DISABLED
171 
depthCrossingShadowBoundaries(Node * node)172 static unsigned depthCrossingShadowBoundaries(Node* node)
173 {
174     unsigned depth = 0;
175     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
176         ++depth;
177     return depth;
178 }
179 
180 #endif
181 
182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
nextInPreOrderCrossingShadowBoundaries(Node * rangeEndContainer,int rangeEndOffset)183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
184 {
185     if (!rangeEndContainer)
186         return 0;
187     if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
188         if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
189             return next;
190     }
191     for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
192         if (Node* next = node->nextSibling())
193             return next;
194     }
195     return 0;
196 }
197 
198 // --------
199 
fullyClipsContents(Node * node)200 static inline bool fullyClipsContents(Node* node)
201 {
202     RenderObject* renderer = node->renderer();
203     if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
204         return false;
205     return toRenderBox(renderer)->size().isEmpty();
206 }
207 
ignoresContainerClip(Node * node)208 static inline bool ignoresContainerClip(Node* node)
209 {
210     RenderObject* renderer = node->renderer();
211     if (!renderer || renderer->isText())
212         return false;
213     EPosition position = renderer->style()->position();
214     return position == AbsolutePosition || position == FixedPosition;
215 }
216 
pushFullyClippedState(BitStack & stack,Node * node)217 static void pushFullyClippedState(BitStack& stack, Node* node)
218 {
219     ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
220 
221     // Push true if this node full clips its contents, or if a parent already has fully
222     // clipped and this is not a node that ignores its container's clip.
223     stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
224 }
225 
setUpFullyClippedStack(BitStack & stack,Node * node)226 static void setUpFullyClippedStack(BitStack& stack, Node* node)
227 {
228     // Put the nodes in a vector so we can iterate in reverse order.
229     Vector<Node*, 100> ancestry;
230     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
231         ancestry.append(parent);
232 
233     // Call pushFullyClippedState on each node starting with the earliest ancestor.
234     size_t size = ancestry.size();
235     for (size_t i = 0; i < size; ++i)
236         pushFullyClippedState(stack, ancestry[size - i - 1]);
237     pushFullyClippedState(stack, node);
238 
239     ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
240 }
241 
242 #if OS(ANDROID)
checkFormControlElement(Node * startNode)243 static bool checkFormControlElement(Node* startNode)
244 {
245     Node* node = startNode;
246     while (node) {
247         if (node->isElementNode() && static_cast<Element*>(node)->isFormControlElement())
248             return true;
249         node = node->parentNode();
250     }
251     return false;
252 }
253 #endif
254 
255 
256 // --------
257 
TextIterator()258 TextIterator::TextIterator()
259     : m_startContainer(0)
260     , m_startOffset(0)
261     , m_endContainer(0)
262     , m_endOffset(0)
263     , m_positionNode(0)
264     , m_textCharacters(0)
265     , m_textLength(0)
266     , m_remainingTextBox(0)
267     , m_firstLetterText(0)
268     , m_lastCharacter(0)
269     , m_emitsCharactersBetweenAllVisiblePositions(false)
270     , m_entersTextControls(false)
271     , m_emitsTextWithoutTranscoding(false)
272     , m_handledFirstLetter(false)
273     , m_ignoresStyleVisibility(false)
274     , m_emitsObjectReplacementCharacters(false)
275 #if OS(ANDROID)
276     , m_stopsOnFormControls(false)
277     , m_shouldStop(false)
278 #endif
279 {
280 }
281 
TextIterator(const Range * r,TextIteratorBehavior behavior)282 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
283     : m_startContainer(0)
284     , m_startOffset(0)
285     , m_endContainer(0)
286     , m_endOffset(0)
287     , m_positionNode(0)
288     , m_textCharacters(0)
289     , m_textLength(0)
290     , m_remainingTextBox(0)
291     , m_firstLetterText(0)
292     , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
293     , m_entersTextControls(behavior & TextIteratorEntersTextControls)
294     , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
295     , m_handledFirstLetter(false)
296     , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
297     , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
298 #if OS(ANDROID)
299     , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
300     , m_shouldStop(false)
301     , m_needsAnotherNewline(false)
302 #endif
303 {
304     if (!r)
305         return;
306 
307     // get and validate the range endpoints
308     Node* startContainer = r->startContainer();
309     if (!startContainer)
310         return;
311     int startOffset = r->startOffset();
312     Node* endContainer = r->endContainer();
313     int endOffset = r->endOffset();
314 
315     // Callers should be handing us well-formed ranges. If we discover that this isn't
316     // the case, we could consider changing this assertion to an early return.
317     ASSERT(r->boundaryPointsValid());
318 
319     // remember range - this does not change
320     m_startContainer = startContainer;
321     m_startOffset = startOffset;
322     m_endContainer = endContainer;
323     m_endOffset = endOffset;
324 
325     // set up the current node for processing
326     m_node = r->firstNode();
327     if (!m_node)
328         return;
329     setUpFullyClippedStack(m_fullyClippedStack, m_node);
330     m_offset = m_node == m_startContainer ? m_startOffset : 0;
331     m_handledNode = false;
332     m_handledChildren = false;
333 
334     // calculate first out of bounds node
335     m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
336 
337     // initialize node processing state
338     m_needsAnotherNewline = false;
339     m_textBox = 0;
340 
341     // initialize record of previous node processing
342     m_hasEmitted = false;
343     m_lastTextNode = 0;
344     m_lastTextNodeEndedWithCollapsedSpace = false;
345     m_lastCharacter = 0;
346 
347 #ifndef NDEBUG
348     // need this just because of the assert in advance()
349     m_positionNode = m_node;
350 #endif
351 
352     // identify the first run
353     advance();
354 }
355 
~TextIterator()356 TextIterator::~TextIterator()
357 {
358 }
359 
atEnd() const360 bool TextIterator::atEnd() const
361 {
362 #if OS(ANDROID)
363     return !m_positionNode || m_shouldStop;
364 #else
365     return !m_positionNode;
366 #endif
367 }
368 
advance()369 void TextIterator::advance()
370 {
371 #if OS(ANDROID)
372     if (m_shouldStop)
373         return;
374 #endif
375     // reset the run information
376     m_positionNode = 0;
377     m_textLength = 0;
378 
379     // handle remembered node that needed a newline after the text node's newline
380     if (m_needsAnotherNewline) {
381         // Emit the extra newline, and position it *inside* m_node, after m_node's
382         // contents, in case it's a block, in the same way that we position the first
383         // newline.  The range for the emitted newline should start where the line
384         // break begins.
385         // FIXME: It would be cleaner if we emitted two newlines during the last
386         // iteration, instead of using m_needsAnotherNewline.
387         Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
388         emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
389         m_needsAnotherNewline = false;
390         return;
391     }
392 
393     if (!m_textBox && m_remainingTextBox) {
394         m_textBox = m_remainingTextBox;
395         m_remainingTextBox = 0;
396         m_firstLetterText = 0;
397         m_offset = 0;
398     }
399     // handle remembered text box
400     if (m_textBox) {
401         handleTextBox();
402         if (m_positionNode)
403             return;
404     }
405 
406     while (m_node && m_node != m_pastEndNode) {
407 #if OS(ANDROID)
408         if (!m_shouldStop && m_stopsOnFormControls && checkFormControlElement(m_node))
409             m_shouldStop = true;
410 #endif
411         // if the range ends at offset 0 of an element, represent the
412         // position, but not the content, of that element e.g. if the
413         // node is a blockflow element, emit a newline that
414         // precedes the element
415         if (m_node == m_endContainer && m_endOffset == 0) {
416             representNodeOffsetZero();
417             m_node = 0;
418             return;
419         }
420 
421         RenderObject* renderer = m_node->renderer();
422         if (!renderer) {
423             m_handledNode = true;
424             m_handledChildren = true;
425         } else {
426             // handle current node according to its type
427             if (!m_handledNode) {
428                 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
429                     m_handledNode = handleTextNode();
430                 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
431                          (renderer->node() && renderer->node()->isElementNode() &&
432                           static_cast<Element*>(renderer->node())->isFormControlElement())))
433                     m_handledNode = handleReplacedElement();
434                 else
435                     m_handledNode = handleNonTextNode();
436                 if (m_positionNode)
437                     return;
438             }
439         }
440 
441         // find a new current node to handle in depth-first manner,
442         // calling exitNode() as we come back thru a parent node
443         Node* next = m_handledChildren ? 0 : m_node->firstChild();
444         m_offset = 0;
445         if (!next) {
446             next = m_node->nextSibling();
447             if (!next) {
448                 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
449                 Node* parentNode = m_node->parentOrHostNode();
450                 while (!next && parentNode) {
451                     if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
452                         return;
453                     bool haveRenderer = m_node->renderer();
454                     m_node = parentNode;
455                     m_fullyClippedStack.pop();
456                     parentNode = m_node->parentOrHostNode();
457                     if (haveRenderer)
458                         exitNode();
459                     if (m_positionNode) {
460                         m_handledNode = true;
461                         m_handledChildren = true;
462                         return;
463                     }
464                     next = m_node->nextSibling();
465                 }
466             }
467             m_fullyClippedStack.pop();
468         }
469 
470         // set the new current node
471         m_node = next;
472         if (m_node)
473             pushFullyClippedState(m_fullyClippedStack, m_node);
474         m_handledNode = false;
475         m_handledChildren = false;
476         m_handledFirstLetter = false;
477         m_firstLetterText = 0;
478 
479         // how would this ever be?
480         if (m_positionNode)
481             return;
482     }
483 }
484 
handleTextNode()485 bool TextIterator::handleTextNode()
486 {
487     if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
488         return false;
489 
490     RenderText* renderer = toRenderText(m_node->renderer());
491 
492     m_lastTextNode = m_node;
493     String str = renderer->text();
494 
495     // handle pre-formatted text
496     if (!renderer->style()->collapseWhiteSpace()) {
497         int runStart = m_offset;
498         if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
499             emitCharacter(' ', m_node, 0, runStart, runStart);
500             return false;
501         }
502         if (!m_handledFirstLetter && renderer->isTextFragment()) {
503             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
504             if (m_firstLetterText) {
505                 String firstLetter = m_firstLetterText->text();
506                 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
507                 m_firstLetterText = 0;
508                 m_textBox = 0;
509                 return false;
510             }
511         }
512         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
513             return false;
514         int strLength = str.length();
515         int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
516         int runEnd = min(strLength, end);
517 
518         if (runStart >= runEnd)
519             return true;
520 
521         emitText(m_node, runStart, runEnd);
522         return true;
523     }
524 
525     if (!renderer->firstTextBox() && str.length() > 0) {
526         if (!m_handledFirstLetter && renderer->isTextFragment()) {
527             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
528             if (m_firstLetterText) {
529                 handleTextBox();
530                 return false;
531             }
532         }
533         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
534             return false;
535         m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
536         return true;
537     }
538 
539     // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
540     if (renderer->containsReversedText()) {
541         m_sortedTextBoxes.clear();
542         for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
543             m_sortedTextBoxes.append(textBox);
544         }
545         std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
546         m_sortedTextBoxesPosition = 0;
547     }
548 
549     m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
550     if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
551         handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
552     handleTextBox();
553     return true;
554 }
555 
handleTextBox()556 void TextIterator::handleTextBox()
557 {
558     RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
559     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
560         m_textBox = 0;
561         return;
562     }
563     String str = renderer->text();
564     unsigned start = m_offset;
565     unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
566     while (m_textBox) {
567         unsigned textBoxStart = m_textBox->start();
568         unsigned runStart = max(textBoxStart, start);
569 
570         // Check for collapsed space at the start of this run.
571         InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
572         bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
573             || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
574         if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
575             if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
576                 unsigned spaceRunStart = runStart - 1;
577                 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
578                     --spaceRunStart;
579                 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
580             } else
581                 emitCharacter(' ', m_node, 0, runStart, runStart);
582             return;
583         }
584         unsigned textBoxEnd = textBoxStart + m_textBox->len();
585         unsigned runEnd = min(textBoxEnd, end);
586 
587         // Determine what the next text box will be, but don't advance yet
588         InlineTextBox* nextTextBox = 0;
589         if (renderer->containsReversedText()) {
590             if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
591                 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
592         } else
593             nextTextBox = m_textBox->nextTextBox();
594 
595         if (runStart < runEnd) {
596             // Handle either a single newline character (which becomes a space),
597             // or a run of characters that does not include a newline.
598             // This effectively translates newlines to spaces without copying the text.
599             if (str[runStart] == '\n') {
600                 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
601                 m_offset = runStart + 1;
602             } else {
603                 size_t subrunEnd = str.find('\n', runStart);
604                 if (subrunEnd == notFound || subrunEnd > runEnd)
605                     subrunEnd = runEnd;
606 
607                 m_offset = subrunEnd;
608                 emitText(m_node, renderer, runStart, subrunEnd);
609             }
610 
611             // If we are doing a subrun that doesn't go to the end of the text box,
612             // come back again to finish handling this text box; don't advance to the next one.
613             if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
614                 return;
615 
616             // Advance and return
617             unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
618             if (nextRunStart > runEnd)
619                 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
620             m_textBox = nextTextBox;
621             if (renderer->containsReversedText())
622                 ++m_sortedTextBoxesPosition;
623             return;
624         }
625         // Advance and continue
626         m_textBox = nextTextBox;
627         if (renderer->containsReversedText())
628             ++m_sortedTextBoxesPosition;
629     }
630     if (!m_textBox && m_remainingTextBox) {
631         m_textBox = m_remainingTextBox;
632         m_remainingTextBox = 0;
633         m_firstLetterText = 0;
634         m_offset = 0;
635         handleTextBox();
636     }
637 }
638 
handleTextNodeFirstLetter(RenderTextFragment * renderer)639 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
640 {
641     if (renderer->firstLetter()) {
642         RenderObject* r = renderer->firstLetter();
643         if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
644             return;
645         for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) {
646             if (currChild->isText()) {
647                 RenderText* firstLetter = toRenderText(currChild);
648                 m_handledFirstLetter = true;
649                 m_remainingTextBox = m_textBox;
650                 m_textBox = firstLetter->firstTextBox();
651                 m_firstLetterText = firstLetter;
652                 return;
653             }
654         }
655     }
656     m_handledFirstLetter = true;
657 }
658 
handleReplacedElement()659 bool TextIterator::handleReplacedElement()
660 {
661     if (m_fullyClippedStack.top())
662         return false;
663 
664     RenderObject* renderer = m_node->renderer();
665     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
666         return false;
667 
668     if (m_lastTextNodeEndedWithCollapsedSpace) {
669         emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
670         return false;
671     }
672 
673     if (m_entersTextControls && renderer->isTextControl()) {
674         if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
675             m_node = innerTextElement->shadowTreeRootNode();
676             pushFullyClippedState(m_fullyClippedStack, m_node);
677             m_offset = 0;
678             return false;
679         }
680     }
681 
682     m_hasEmitted = true;
683 
684     if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
685         emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
686         return true;
687     }
688 
689     if (m_emitsCharactersBetweenAllVisiblePositions) {
690         // We want replaced elements to behave like punctuation for boundary
691         // finding, and to simply take up space for the selection preservation
692         // code in moveParagraphs, so we use a comma.
693         emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
694         return true;
695     }
696 
697     m_positionNode = m_node->parentNode();
698     m_positionOffsetBaseNode = m_node;
699     m_positionStartOffset = 0;
700     m_positionEndOffset = 1;
701 
702     m_textCharacters = 0;
703     m_textLength = 0;
704 
705     m_lastCharacter = 0;
706 
707     return true;
708 }
709 
hasVisibleTextNode(RenderText * renderer)710 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
711 {
712     if (renderer->style()->visibility() == VISIBLE)
713         return true;
714     if (renderer->isTextFragment()) {
715         RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
716         if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
717             return true;
718     }
719     return false;
720 }
721 
shouldEmitTabBeforeNode(Node * node)722 static bool shouldEmitTabBeforeNode(Node* node)
723 {
724     RenderObject* r = node->renderer();
725 
726     // Table cells are delimited by tabs.
727     if (!r || !isTableCell(node))
728         return false;
729 
730     // Want a tab before every cell other than the first one
731     RenderTableCell* rc = toRenderTableCell(r);
732     RenderTable* t = rc->table();
733     return t && (t->cellBefore(rc) || t->cellAbove(rc));
734 }
735 
shouldEmitNewlineForNode(Node * node)736 static bool shouldEmitNewlineForNode(Node* node)
737 {
738     // br elements are represented by a single newline.
739     RenderObject* r = node->renderer();
740     if (!r)
741         return node->hasTagName(brTag);
742 
743     return r->isBR();
744 }
745 
shouldEmitNewlinesBeforeAndAfterNode(Node * node)746 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
747 {
748     // Block flow (versus inline flow) is represented by having
749     // a newline both before and after the element.
750     RenderObject* r = node->renderer();
751     if (!r) {
752         return (node->hasTagName(blockquoteTag)
753                 || node->hasTagName(ddTag)
754                 || node->hasTagName(divTag)
755                 || node->hasTagName(dlTag)
756                 || node->hasTagName(dtTag)
757                 || node->hasTagName(h1Tag)
758                 || node->hasTagName(h2Tag)
759                 || node->hasTagName(h3Tag)
760                 || node->hasTagName(h4Tag)
761                 || node->hasTagName(h5Tag)
762                 || node->hasTagName(h6Tag)
763                 || node->hasTagName(hrTag)
764                 || node->hasTagName(liTag)
765                 || node->hasTagName(listingTag)
766                 || node->hasTagName(olTag)
767                 || node->hasTagName(pTag)
768                 || node->hasTagName(preTag)
769                 || node->hasTagName(trTag)
770                 || node->hasTagName(ulTag));
771     }
772 
773     // Need to make an exception for table cells, because they are blocks, but we
774     // want them tab-delimited rather than having newlines before and after.
775     if (isTableCell(node))
776         return false;
777 
778     // Need to make an exception for table row elements, because they are neither
779     // "inline" or "RenderBlock", but we want newlines for them.
780     if (r->isTableRow()) {
781         RenderTable* t = toRenderTableRow(r)->table();
782         if (t && !t->isInline())
783             return true;
784     }
785 
786     return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
787 }
788 
shouldEmitNewlineAfterNode(Node * node)789 static bool shouldEmitNewlineAfterNode(Node* node)
790 {
791     // FIXME: It should be better but slower to create a VisiblePosition here.
792     if (!shouldEmitNewlinesBeforeAndAfterNode(node))
793         return false;
794     // Check if this is the very last renderer in the document.
795     // If so, then we should not emit a newline.
796     while ((node = node->traverseNextSibling()))
797         if (node->renderer())
798             return true;
799     return false;
800 }
801 
shouldEmitNewlineBeforeNode(Node * node)802 static bool shouldEmitNewlineBeforeNode(Node* node)
803 {
804     return shouldEmitNewlinesBeforeAndAfterNode(node);
805 }
806 
shouldEmitExtraNewlineForNode(Node * node)807 static bool shouldEmitExtraNewlineForNode(Node* node)
808 {
809     // When there is a significant collapsed bottom margin, emit an extra
810     // newline for a more realistic result.  We end up getting the right
811     // result even without margin collapsing. For example: <div><p>text</p></div>
812     // will work right even if both the <div> and the <p> have bottom margins.
813     RenderObject* r = node->renderer();
814     if (!r || !r->isBox())
815         return false;
816 
817     // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
818     // not to do this at all
819     if (node->hasTagName(h1Tag)
820         || node->hasTagName(h2Tag)
821         || node->hasTagName(h3Tag)
822         || node->hasTagName(h4Tag)
823         || node->hasTagName(h5Tag)
824         || node->hasTagName(h6Tag)
825         || node->hasTagName(pTag)) {
826         RenderStyle* style = r->style();
827         if (style) {
828             int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
829             int fontSize = style->fontDescription().computedPixelSize();
830             if (bottomMargin * 2 >= fontSize)
831                 return true;
832         }
833     }
834 
835     return false;
836 }
837 
collapsedSpaceLength(RenderText * renderer,int textEnd)838 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
839 {
840     const UChar* characters = renderer->text()->characters();
841     int length = renderer->text()->length();
842     for (int i = textEnd; i < length; ++i) {
843         if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
844             return i - textEnd;
845     }
846 
847     return length - textEnd;
848 }
849 
maxOffsetIncludingCollapsedSpaces(Node * node)850 static int maxOffsetIncludingCollapsedSpaces(Node* node)
851 {
852     int offset = caretMaxOffset(node);
853 
854     if (node->renderer() && node->renderer()->isText())
855         offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
856 
857     return offset;
858 }
859 
860 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
shouldRepresentNodeOffsetZero()861 bool TextIterator::shouldRepresentNodeOffsetZero()
862 {
863     if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
864         return true;
865 
866     // Leave element positioned flush with start of a paragraph
867     // (e.g. do not insert tab before a table cell at the start of a paragraph)
868     if (m_lastCharacter == '\n')
869         return false;
870 
871     // Otherwise, show the position if we have emitted any characters
872     if (m_hasEmitted)
873         return true;
874 
875     // We've not emitted anything yet. Generally, there is no need for any positioning then.
876     // The only exception is when the element is visually not in the same line as
877     // the start of the range (e.g. the range starts at the end of the previous paragraph).
878     // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
879     // make quicker checks to possibly avoid that. Another check that we could make is
880     // is whether the inline vs block flow changed since the previous visible element.
881     // I think we're already in a special enough case that that won't be needed, tho.
882 
883     // No character needed if this is the first node in the range.
884     if (m_node == m_startContainer)
885         return false;
886 
887     // If we are outside the start container's subtree, assume we need to emit.
888     // FIXME: m_startContainer could be an inline block
889     if (!m_node->isDescendantOf(m_startContainer))
890         return true;
891 
892     // If we started as m_startContainer offset 0 and the current node is a descendant of
893     // the start container, we already had enough context to correctly decide whether to
894     // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
895     // so don't second guess that now.
896     // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
897     // immaterial since we likely would have already emitted something by now.
898     if (m_startOffset == 0)
899         return false;
900 
901     // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
902     // Additionally, if the range we are iterating over contains huge sections of unrendered content,
903     // we would create VisiblePositions on every call to this function without this check.
904     if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
905         return false;
906 
907     // The startPos.isNotNull() check is needed because the start could be before the body,
908     // and in that case we'll get null. We don't want to put in newlines at the start in that case.
909     // The currPos.isNotNull() check is needed because positions in non-HTML content
910     // (like SVG) do not have visible positions, and we don't want to emit for them either.
911     VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
912     VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
913     return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
914 }
915 
shouldEmitSpaceBeforeAndAfterNode(Node * node)916 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
917 {
918     return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
919 }
920 
representNodeOffsetZero()921 void TextIterator::representNodeOffsetZero()
922 {
923     // Emit a character to show the positioning of m_node.
924 
925     // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
926     // create VisiblePositions, which is expensive.  So, we perform the inexpensive checks
927     // on m_node to see if it necessitates emitting a character first and will early return
928     // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
929     if (shouldEmitTabBeforeNode(m_node)) {
930         if (shouldRepresentNodeOffsetZero())
931             emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
932     } else if (shouldEmitNewlineBeforeNode(m_node)) {
933         if (shouldRepresentNodeOffsetZero())
934             emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
935     } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
936         if (shouldRepresentNodeOffsetZero())
937             emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
938     }
939 }
940 
handleNonTextNode()941 bool TextIterator::handleNonTextNode()
942 {
943     if (shouldEmitNewlineForNode(m_node))
944         emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
945     else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
946         emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
947     else
948         representNodeOffsetZero();
949 
950     return true;
951 }
952 
exitNode()953 void TextIterator::exitNode()
954 {
955     // prevent emitting a newline when exiting a collapsed block at beginning of the range
956     // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
957     // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
958     // therefore look like a blank line.
959     if (!m_hasEmitted)
960         return;
961 
962     // Emit with a position *inside* m_node, after m_node's contents, in
963     // case it is a block, because the run should start where the
964     // emitted character is positioned visually.
965     Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
966     // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
967     // the logic in _web_attributedStringFromRange match.  We'll get that for free when we switch to use
968     // TextIterator in _web_attributedStringFromRange.
969     // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
970     if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
971         // use extra newline to represent margin bottom, as needed
972         bool addNewline = shouldEmitExtraNewlineForNode(m_node);
973 
974         // FIXME: We need to emit a '\n' as we leave an empty block(s) that
975         // contain a VisiblePosition when doing selection preservation.
976         if (m_lastCharacter != '\n') {
977             // insert a newline with a position following this block's contents.
978             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
979             // remember whether to later add a newline for the current node
980             ASSERT(!m_needsAnotherNewline);
981             m_needsAnotherNewline = addNewline;
982         } else if (addNewline)
983             // insert a newline with a position following this block's contents.
984             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
985     }
986 
987     // If nothing was emitted, see if we need to emit a space.
988     if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
989         emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
990 }
991 
emitCharacter(UChar c,Node * textNode,Node * offsetBaseNode,int textStartOffset,int textEndOffset)992 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
993 {
994     m_hasEmitted = true;
995 
996     // remember information with which to construct the TextIterator::range()
997     // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
998     m_positionNode = textNode;
999     m_positionOffsetBaseNode = offsetBaseNode;
1000     m_positionStartOffset = textStartOffset;
1001     m_positionEndOffset = textEndOffset;
1002 
1003     // remember information with which to construct the TextIterator::characters() and length()
1004     m_singleCharacterBuffer = c;
1005     m_textCharacters = &m_singleCharacterBuffer;
1006     m_textLength = 1;
1007 
1008     // remember some iteration state
1009     m_lastTextNodeEndedWithCollapsedSpace = false;
1010     m_lastCharacter = c;
1011 }
1012 
emitText(Node * textNode,RenderObject * renderObject,int textStartOffset,int textEndOffset)1013 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
1014 {
1015     RenderText* renderer = toRenderText(renderObject);
1016     m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
1017     ASSERT(m_text.characters());
1018 
1019     m_positionNode = textNode;
1020     m_positionOffsetBaseNode = 0;
1021     m_positionStartOffset = textStartOffset;
1022     m_positionEndOffset = textEndOffset;
1023     m_textCharacters = m_text.characters() + textStartOffset;
1024     m_textLength = textEndOffset - textStartOffset;
1025     m_lastCharacter = m_text[textEndOffset - 1];
1026 
1027     m_lastTextNodeEndedWithCollapsedSpace = false;
1028     m_hasEmitted = true;
1029 }
1030 
emitText(Node * textNode,int textStartOffset,int textEndOffset)1031 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
1032 {
1033     emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
1034 }
1035 
range() const1036 PassRefPtr<Range> TextIterator::range() const
1037 {
1038     // use the current run information, if we have it
1039     if (m_positionNode) {
1040         if (m_positionOffsetBaseNode) {
1041             int index = m_positionOffsetBaseNode->nodeIndex();
1042             m_positionStartOffset += index;
1043             m_positionEndOffset += index;
1044             m_positionOffsetBaseNode = 0;
1045         }
1046         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1047     }
1048 
1049     // otherwise, return the end of the overall range we were given
1050     if (m_endContainer)
1051         return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
1052 
1053     return 0;
1054 }
1055 
node() const1056 Node* TextIterator::node() const
1057 {
1058     RefPtr<Range> textRange = range();
1059     if (!textRange)
1060         return 0;
1061 
1062     Node* node = textRange->startContainer();
1063     if (!node)
1064         return 0;
1065     if (node->offsetInCharacters())
1066         return node;
1067 
1068     return node->childNode(textRange->startOffset());
1069 }
1070 
1071 // --------
1072 
SimplifiedBackwardsTextIterator()1073 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
1074     : m_behavior(TextIteratorDefaultBehavior)
1075     , m_node(0)
1076     , m_positionNode(0)
1077 #if OS(ANDROID)
1078     , m_stopsOnFormControls(false)
1079     , m_shouldStop(false)
1080 #endif
1081 {
1082 }
1083 
SimplifiedBackwardsTextIterator(const Range * r,TextIteratorBehavior behavior)1084 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
1085     : m_behavior(behavior)
1086     , m_node(0)
1087     , m_positionNode(0)
1088 #if OS(ANDROID)
1089     , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
1090     , m_shouldStop(false)
1091 #endif
1092 {
1093 #if OS(ANDROID)
1094     ASSERT(m_behavior == TextIteratorDefaultBehavior || m_behavior == TextIteratorStopsOnFormControls);
1095 #else
1096     ASSERT(m_behavior == TextIteratorDefaultBehavior);
1097 #endif
1098 
1099     if (!r)
1100         return;
1101 
1102     Node* startNode = r->startContainer();
1103     if (!startNode)
1104         return;
1105     Node* endNode = r->endContainer();
1106     int startOffset = r->startOffset();
1107     int endOffset = r->endOffset();
1108 
1109     if (!startNode->offsetInCharacters()) {
1110         if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
1111             startNode = startNode->childNode(startOffset);
1112             startOffset = 0;
1113         }
1114     }
1115     if (!endNode->offsetInCharacters()) {
1116         if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
1117             endNode = endNode->childNode(endOffset - 1);
1118             endOffset = lastOffsetInNode(endNode);
1119         }
1120     }
1121 
1122     m_node = endNode;
1123     setUpFullyClippedStack(m_fullyClippedStack, m_node);
1124     m_offset = endOffset;
1125     m_handledNode = false;
1126     m_handledChildren = endOffset == 0;
1127 
1128     m_startNode = startNode;
1129     m_startOffset = startOffset;
1130     m_endNode = endNode;
1131     m_endOffset = endOffset;
1132 
1133 #ifndef NDEBUG
1134     // Need this just because of the assert.
1135     m_positionNode = endNode;
1136 #endif
1137 
1138     m_lastTextNode = 0;
1139     m_lastCharacter = '\n';
1140 
1141     m_havePassedStartNode = false;
1142 
1143     advance();
1144 }
1145 
atEnd() const1146 bool SimplifiedBackwardsTextIterator::atEnd() const
1147 {
1148 #if OS(ANDROID)
1149     return !m_positionNode || m_shouldStop;
1150 #else
1151     return !m_positionNode;
1152 #endif
1153 }
1154 
advance()1155 void SimplifiedBackwardsTextIterator::advance()
1156 {
1157     ASSERT(m_positionNode);
1158 
1159 #if OS(ANDROID)
1160     if (m_shouldStop)
1161         return;
1162 
1163     // Prevent changing the iterator position if a form control element was found and advance should stop on it.
1164     if (m_stopsOnFormControls && checkFormControlElement(m_node)) {
1165         m_shouldStop = true;
1166         return;
1167     }
1168 #endif
1169 
1170     m_positionNode = 0;
1171     m_textLength = 0;
1172 
1173     while (m_node && !m_havePassedStartNode) {
1174         // Don't handle node if we start iterating at [node, 0].
1175         if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
1176             RenderObject* renderer = m_node->renderer();
1177             if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
1178                 // FIXME: What about CDATA_SECTION_NODE?
1179                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1180                     m_handledNode = handleTextNode();
1181             } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1182                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1183                     m_handledNode = handleReplacedElement();
1184             } else
1185                 m_handledNode = handleNonTextNode();
1186             if (m_positionNode)
1187                 return;
1188         }
1189 
1190         if (!m_handledChildren && m_node->hasChildNodes()) {
1191             m_node = m_node->lastChild();
1192             pushFullyClippedState(m_fullyClippedStack, m_node);
1193         } else {
1194             // Exit empty containers as we pass over them or containers
1195             // where [container, 0] is where we started iterating.
1196             if (!m_handledNode
1197                     && canHaveChildrenForEditing(m_node)
1198                     && m_node->parentNode()
1199                     && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
1200                 exitNode();
1201                 if (m_positionNode) {
1202                     m_handledNode = true;
1203                     m_handledChildren = true;
1204                     return;
1205                 }
1206             }
1207 
1208             // Exit all other containers.
1209             while (!m_node->previousSibling()) {
1210                 if (!advanceRespectingRange(m_node->parentOrHostNode()))
1211                     break;
1212                 m_fullyClippedStack.pop();
1213                 exitNode();
1214                 if (m_positionNode) {
1215                     m_handledNode = true;
1216                     m_handledChildren = true;
1217                     return;
1218                 }
1219             }
1220 
1221             m_fullyClippedStack.pop();
1222             if (advanceRespectingRange(m_node->previousSibling()))
1223                 pushFullyClippedState(m_fullyClippedStack, m_node);
1224             else
1225                 m_node = 0;
1226         }
1227 
1228         // For the purpose of word boundary detection,
1229         // we should iterate all visible text and trailing (collapsed) whitespaces.
1230         m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
1231         m_handledNode = false;
1232         m_handledChildren = false;
1233 
1234         if (m_positionNode)
1235             return;
1236     }
1237 }
1238 
handleTextNode()1239 bool SimplifiedBackwardsTextIterator::handleTextNode()
1240 {
1241     m_lastTextNode = m_node;
1242 
1243     RenderText* renderer = toRenderText(m_node->renderer());
1244     String str = renderer->text();
1245 
1246     if (!renderer->firstTextBox() && str.length() > 0)
1247         return true;
1248 
1249     m_positionEndOffset = m_offset;
1250 
1251     m_offset = (m_node == m_startNode) ? m_startOffset : 0;
1252     m_positionNode = m_node;
1253     m_positionStartOffset = m_offset;
1254     m_textLength = m_positionEndOffset - m_positionStartOffset;
1255     m_textCharacters = str.characters() + m_positionStartOffset;
1256 
1257     m_lastCharacter = str[m_positionEndOffset - 1];
1258 
1259     return true;
1260 }
1261 
handleReplacedElement()1262 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1263 {
1264     unsigned index = m_node->nodeIndex();
1265     // We want replaced elements to behave like punctuation for boundary
1266     // finding, and to simply take up space for the selection preservation
1267     // code in moveParagraphs, so we use a comma.  Unconditionally emit
1268     // here because this iterator is only used for boundary finding.
1269     emitCharacter(',', m_node->parentNode(), index, index + 1);
1270     return true;
1271 }
1272 
handleNonTextNode()1273 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1274 {
1275     // We can use a linefeed in place of a tab because this simple iterator is only used to
1276     // find boundaries, not actual content.  A linefeed breaks words, sentences, and paragraphs.
1277     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1278         unsigned index = m_node->nodeIndex();
1279         // The start of this emitted range is wrong. Ensuring correctness would require
1280         // VisiblePositions and so would be slow. previousBoundary expects this.
1281         emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
1282     }
1283     return true;
1284 }
1285 
exitNode()1286 void SimplifiedBackwardsTextIterator::exitNode()
1287 {
1288     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1289         // The start of this emitted range is wrong. Ensuring correctness would require
1290         // VisiblePositions and so would be slow. previousBoundary expects this.
1291         emitCharacter('\n', m_node, 0, 0);
1292     }
1293 }
1294 
emitCharacter(UChar c,Node * node,int startOffset,int endOffset)1295 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
1296 {
1297     m_singleCharacterBuffer = c;
1298     m_positionNode = node;
1299     m_positionStartOffset = startOffset;
1300     m_positionEndOffset = endOffset;
1301     m_textCharacters = &m_singleCharacterBuffer;
1302     m_textLength = 1;
1303     m_lastCharacter = c;
1304 }
1305 
advanceRespectingRange(Node * next)1306 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1307 {
1308     if (!next)
1309         return false;
1310     m_havePassedStartNode |= m_node == m_startNode;
1311     if (m_havePassedStartNode)
1312         return false;
1313     m_node = next;
1314     return true;
1315 }
1316 
range() const1317 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
1318 {
1319     if (m_positionNode)
1320         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1321 
1322     return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
1323 }
1324 
1325 // --------
1326 
CharacterIterator()1327 CharacterIterator::CharacterIterator()
1328     : m_offset(0)
1329     , m_runOffset(0)
1330     , m_atBreak(true)
1331 {
1332 }
1333 
CharacterIterator(const Range * r,TextIteratorBehavior behavior)1334 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
1335     : m_offset(0)
1336     , m_runOffset(0)
1337     , m_atBreak(true)
1338     , m_textIterator(r, behavior)
1339 {
1340     while (!atEnd() && m_textIterator.length() == 0)
1341         m_textIterator.advance();
1342 }
1343 
range() const1344 PassRefPtr<Range> CharacterIterator::range() const
1345 {
1346     RefPtr<Range> r = m_textIterator.range();
1347     if (!m_textIterator.atEnd()) {
1348         if (m_textIterator.length() <= 1) {
1349             ASSERT(m_runOffset == 0);
1350         } else {
1351             Node* n = r->startContainer();
1352             ASSERT(n == r->endContainer());
1353             int offset = r->startOffset() + m_runOffset;
1354             ExceptionCode ec = 0;
1355             r->setStart(n, offset, ec);
1356             r->setEnd(n, offset + 1, ec);
1357             ASSERT(!ec);
1358         }
1359     }
1360     return r.release();
1361 }
1362 
advance(int count)1363 void CharacterIterator::advance(int count)
1364 {
1365     if (count <= 0) {
1366         ASSERT(count == 0);
1367         return;
1368     }
1369 
1370     m_atBreak = false;
1371 
1372     // easy if there is enough left in the current m_textIterator run
1373     int remaining = m_textIterator.length() - m_runOffset;
1374     if (count < remaining) {
1375         m_runOffset += count;
1376         m_offset += count;
1377         return;
1378     }
1379 
1380     // exhaust the current m_textIterator run
1381     count -= remaining;
1382     m_offset += remaining;
1383 
1384     // move to a subsequent m_textIterator run
1385     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1386         int runLength = m_textIterator.length();
1387         if (runLength == 0)
1388             m_atBreak = true;
1389         else {
1390             // see whether this is m_textIterator to use
1391             if (count < runLength) {
1392                 m_runOffset = count;
1393                 m_offset += count;
1394                 return;
1395             }
1396 
1397             // exhaust this m_textIterator run
1398             count -= runLength;
1399             m_offset += runLength;
1400         }
1401     }
1402 
1403     // ran to the end of the m_textIterator... no more runs left
1404     m_atBreak = true;
1405     m_runOffset = 0;
1406 }
1407 
string(int numChars)1408 String CharacterIterator::string(int numChars)
1409 {
1410     Vector<UChar> result;
1411     result.reserveInitialCapacity(numChars);
1412     while (numChars > 0 && !atEnd()) {
1413         int runSize = min(numChars, length());
1414         result.append(characters(), runSize);
1415         numChars -= runSize;
1416         advance(runSize);
1417     }
1418     return String::adopt(result);
1419 }
1420 
characterSubrange(CharacterIterator & it,int offset,int length)1421 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1422 {
1423     it.advance(offset);
1424     RefPtr<Range> start = it.range();
1425 
1426     if (length > 1)
1427         it.advance(length - 1);
1428     RefPtr<Range> end = it.range();
1429 
1430     return Range::create(start->startContainer()->document(),
1431         start->startContainer(), start->startOffset(),
1432         end->endContainer(), end->endOffset());
1433 }
1434 
BackwardsCharacterIterator()1435 BackwardsCharacterIterator::BackwardsCharacterIterator()
1436     : m_offset(0)
1437     , m_runOffset(0)
1438     , m_atBreak(true)
1439 {
1440 }
1441 
BackwardsCharacterIterator(const Range * range,TextIteratorBehavior behavior)1442 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
1443     : m_offset(0)
1444     , m_runOffset(0)
1445     , m_atBreak(true)
1446     , m_textIterator(range, behavior)
1447 {
1448     while (!atEnd() && !m_textIterator.length())
1449         m_textIterator.advance();
1450 }
1451 
range() const1452 PassRefPtr<Range> BackwardsCharacterIterator::range() const
1453 {
1454     RefPtr<Range> r = m_textIterator.range();
1455     if (!m_textIterator.atEnd()) {
1456         if (m_textIterator.length() <= 1)
1457             ASSERT(m_runOffset == 0);
1458         else {
1459             Node* n = r->startContainer();
1460             ASSERT(n == r->endContainer());
1461             int offset = r->endOffset() - m_runOffset;
1462             ExceptionCode ec = 0;
1463             r->setStart(n, offset - 1, ec);
1464             r->setEnd(n, offset, ec);
1465             ASSERT(!ec);
1466         }
1467     }
1468     return r.release();
1469 }
1470 
advance(int count)1471 void BackwardsCharacterIterator::advance(int count)
1472 {
1473     if (count <= 0) {
1474         ASSERT(!count);
1475         return;
1476     }
1477 
1478     m_atBreak = false;
1479 
1480     int remaining = m_textIterator.length() - m_runOffset;
1481     if (count < remaining) {
1482         m_runOffset += count;
1483         m_offset += count;
1484         return;
1485     }
1486 
1487     count -= remaining;
1488     m_offset += remaining;
1489 
1490     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1491         int runLength = m_textIterator.length();
1492         if (runLength == 0)
1493             m_atBreak = true;
1494         else {
1495             if (count < runLength) {
1496                 m_runOffset = count;
1497                 m_offset += count;
1498                 return;
1499             }
1500 
1501             count -= runLength;
1502             m_offset += runLength;
1503         }
1504     }
1505 
1506     m_atBreak = true;
1507     m_runOffset = 0;
1508 }
1509 
1510 // --------
1511 
WordAwareIterator()1512 WordAwareIterator::WordAwareIterator()
1513     : m_previousText(0)
1514     , m_didLookAhead(false)
1515 {
1516 }
1517 
WordAwareIterator(const Range * r)1518 WordAwareIterator::WordAwareIterator(const Range* r)
1519     : m_previousText(0)
1520     , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1521     , m_textIterator(r)
1522 {
1523     advance(); // get in position over the first chunk of text
1524 }
1525 
~WordAwareIterator()1526 WordAwareIterator::~WordAwareIterator()
1527 {
1528 }
1529 
1530 // We're always in one of these modes:
1531 // - The current chunk in the text iterator is our current chunk
1532 //      (typically its a piece of whitespace, or text that ended with whitespace)
1533 // - The previous chunk in the text iterator is our current chunk
1534 //      (we looked ahead to the next chunk and found a word boundary)
1535 // - We built up our own chunk of text from many chunks from the text iterator
1536 
1537 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1538 
advance()1539 void WordAwareIterator::advance()
1540 {
1541     m_previousText = 0;
1542     m_buffer.clear();      // toss any old buffer we built up
1543 
1544     // If last time we did a look-ahead, start with that looked-ahead chunk now
1545     if (!m_didLookAhead) {
1546         ASSERT(!m_textIterator.atEnd());
1547         m_textIterator.advance();
1548     }
1549     m_didLookAhead = false;
1550 
1551     // Go to next non-empty chunk
1552     while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1553         m_textIterator.advance();
1554     m_range = m_textIterator.range();
1555 
1556     if (m_textIterator.atEnd())
1557         return;
1558 
1559     while (1) {
1560         // If this chunk ends in whitespace we can just use it as our chunk.
1561         if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1562             return;
1563 
1564         // If this is the first chunk that failed, save it in previousText before look ahead
1565         if (m_buffer.isEmpty()) {
1566             m_previousText = m_textIterator.characters();
1567             m_previousLength = m_textIterator.length();
1568         }
1569 
1570         // Look ahead to next chunk.  If it is whitespace or a break, we can use the previous stuff
1571         m_textIterator.advance();
1572         if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1573             m_didLookAhead = true;
1574             return;
1575         }
1576 
1577         if (m_buffer.isEmpty()) {
1578             // Start gobbling chunks until we get to a suitable stopping point
1579             m_buffer.append(m_previousText, m_previousLength);
1580             m_previousText = 0;
1581         }
1582         m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1583         int exception = 0;
1584         m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1585     }
1586 }
1587 
length() const1588 int WordAwareIterator::length() const
1589 {
1590     if (!m_buffer.isEmpty())
1591         return m_buffer.size();
1592     if (m_previousText)
1593         return m_previousLength;
1594     return m_textIterator.length();
1595 }
1596 
characters() const1597 const UChar* WordAwareIterator::characters() const
1598 {
1599     if (!m_buffer.isEmpty())
1600         return m_buffer.data();
1601     if (m_previousText)
1602         return m_previousText;
1603     return m_textIterator.characters();
1604 }
1605 
1606 // --------
1607 
foldQuoteMarkOrSoftHyphen(UChar c)1608 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
1609 {
1610     switch (c) {
1611         case hebrewPunctuationGershayim:
1612         case leftDoubleQuotationMark:
1613         case rightDoubleQuotationMark:
1614             return '"';
1615         case hebrewPunctuationGeresh:
1616         case leftSingleQuotationMark:
1617         case rightSingleQuotationMark:
1618             return '\'';
1619         case softHyphen:
1620             // Replace soft hyphen with an ignorable character so that their presence or absence will
1621             // not affect string comparison.
1622             return 0;
1623         default:
1624             return c;
1625     }
1626 }
1627 
foldQuoteMarksAndSoftHyphens(String & s)1628 static inline void foldQuoteMarksAndSoftHyphens(String& s)
1629 {
1630     s.replace(hebrewPunctuationGeresh, '\'');
1631     s.replace(hebrewPunctuationGershayim, '"');
1632     s.replace(leftDoubleQuotationMark, '"');
1633     s.replace(leftSingleQuotationMark, '\'');
1634     s.replace(rightDoubleQuotationMark, '"');
1635     s.replace(rightSingleQuotationMark, '\'');
1636     // Replace soft hyphen with an ignorable character so that their presence or absence will
1637     // not affect string comparison.
1638     s.replace(softHyphen, 0);
1639 }
1640 
1641 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1642 
foldQuoteMarksAndSoftHyphens(UChar * data,size_t length)1643 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
1644 {
1645     for (size_t i = 0; i < length; ++i)
1646         data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
1647 }
1648 
1649 static const size_t minimumSearchBufferSize = 8192;
1650 
1651 #ifndef NDEBUG
1652 static bool searcherInUse;
1653 #endif
1654 
createSearcher()1655 static UStringSearch* createSearcher()
1656 {
1657     // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1658     // but it doesn't matter exactly what it is, since we don't perform any searches
1659     // without setting both the pattern and the text.
1660     UErrorCode status = U_ZERO_ERROR;
1661     UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
1662     ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1663     return searcher;
1664 }
1665 
searcher()1666 static UStringSearch* searcher()
1667 {
1668     static UStringSearch* searcher = createSearcher();
1669     return searcher;
1670 }
1671 
lockSearcher()1672 static inline void lockSearcher()
1673 {
1674 #ifndef NDEBUG
1675     ASSERT(!searcherInUse);
1676     searcherInUse = true;
1677 #endif
1678 }
1679 
unlockSearcher()1680 static inline void unlockSearcher()
1681 {
1682 #ifndef NDEBUG
1683     ASSERT(searcherInUse);
1684     searcherInUse = false;
1685 #endif
1686 }
1687 
1688 // ICU's search ignores the distinction between small kana letters and ones
1689 // that are not small, and also characters that differ only in the voicing
1690 // marks when considering only primary collation strength diffrences.
1691 // This is not helpful for end users, since these differences make words
1692 // distinct, so for our purposes we need these to be considered.
1693 // The Unicode folks do not think the collation algorithm should be
1694 // changed. To work around this, we would like to tailor the ICU searcher,
1695 // but we can't get that to work yet. So instead, we check for cases where
1696 // these differences occur, and skip those matches.
1697 
1698 // We refer to the above technique as the "kana workaround". The next few
1699 // functions are helper functinos for the kana workaround.
1700 
isKanaLetter(UChar character)1701 static inline bool isKanaLetter(UChar character)
1702 {
1703     // Hiragana letters.
1704     if (character >= 0x3041 && character <= 0x3096)
1705         return true;
1706 
1707     // Katakana letters.
1708     if (character >= 0x30A1 && character <= 0x30FA)
1709         return true;
1710     if (character >= 0x31F0 && character <= 0x31FF)
1711         return true;
1712 
1713     // Halfwidth katakana letters.
1714     if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1715         return true;
1716 
1717     return false;
1718 }
1719 
isSmallKanaLetter(UChar character)1720 static inline bool isSmallKanaLetter(UChar character)
1721 {
1722     ASSERT(isKanaLetter(character));
1723 
1724     switch (character) {
1725     case 0x3041: // HIRAGANA LETTER SMALL A
1726     case 0x3043: // HIRAGANA LETTER SMALL I
1727     case 0x3045: // HIRAGANA LETTER SMALL U
1728     case 0x3047: // HIRAGANA LETTER SMALL E
1729     case 0x3049: // HIRAGANA LETTER SMALL O
1730     case 0x3063: // HIRAGANA LETTER SMALL TU
1731     case 0x3083: // HIRAGANA LETTER SMALL YA
1732     case 0x3085: // HIRAGANA LETTER SMALL YU
1733     case 0x3087: // HIRAGANA LETTER SMALL YO
1734     case 0x308E: // HIRAGANA LETTER SMALL WA
1735     case 0x3095: // HIRAGANA LETTER SMALL KA
1736     case 0x3096: // HIRAGANA LETTER SMALL KE
1737     case 0x30A1: // KATAKANA LETTER SMALL A
1738     case 0x30A3: // KATAKANA LETTER SMALL I
1739     case 0x30A5: // KATAKANA LETTER SMALL U
1740     case 0x30A7: // KATAKANA LETTER SMALL E
1741     case 0x30A9: // KATAKANA LETTER SMALL O
1742     case 0x30C3: // KATAKANA LETTER SMALL TU
1743     case 0x30E3: // KATAKANA LETTER SMALL YA
1744     case 0x30E5: // KATAKANA LETTER SMALL YU
1745     case 0x30E7: // KATAKANA LETTER SMALL YO
1746     case 0x30EE: // KATAKANA LETTER SMALL WA
1747     case 0x30F5: // KATAKANA LETTER SMALL KA
1748     case 0x30F6: // KATAKANA LETTER SMALL KE
1749     case 0x31F0: // KATAKANA LETTER SMALL KU
1750     case 0x31F1: // KATAKANA LETTER SMALL SI
1751     case 0x31F2: // KATAKANA LETTER SMALL SU
1752     case 0x31F3: // KATAKANA LETTER SMALL TO
1753     case 0x31F4: // KATAKANA LETTER SMALL NU
1754     case 0x31F5: // KATAKANA LETTER SMALL HA
1755     case 0x31F6: // KATAKANA LETTER SMALL HI
1756     case 0x31F7: // KATAKANA LETTER SMALL HU
1757     case 0x31F8: // KATAKANA LETTER SMALL HE
1758     case 0x31F9: // KATAKANA LETTER SMALL HO
1759     case 0x31FA: // KATAKANA LETTER SMALL MU
1760     case 0x31FB: // KATAKANA LETTER SMALL RA
1761     case 0x31FC: // KATAKANA LETTER SMALL RI
1762     case 0x31FD: // KATAKANA LETTER SMALL RU
1763     case 0x31FE: // KATAKANA LETTER SMALL RE
1764     case 0x31FF: // KATAKANA LETTER SMALL RO
1765     case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1766     case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1767     case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1768     case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1769     case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1770     case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1771     case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1772     case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1773     case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1774         return true;
1775     }
1776     return false;
1777 }
1778 
1779 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1780 
composedVoicedSoundMark(UChar character)1781 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1782 {
1783     ASSERT(isKanaLetter(character));
1784 
1785     switch (character) {
1786     case 0x304C: // HIRAGANA LETTER GA
1787     case 0x304E: // HIRAGANA LETTER GI
1788     case 0x3050: // HIRAGANA LETTER GU
1789     case 0x3052: // HIRAGANA LETTER GE
1790     case 0x3054: // HIRAGANA LETTER GO
1791     case 0x3056: // HIRAGANA LETTER ZA
1792     case 0x3058: // HIRAGANA LETTER ZI
1793     case 0x305A: // HIRAGANA LETTER ZU
1794     case 0x305C: // HIRAGANA LETTER ZE
1795     case 0x305E: // HIRAGANA LETTER ZO
1796     case 0x3060: // HIRAGANA LETTER DA
1797     case 0x3062: // HIRAGANA LETTER DI
1798     case 0x3065: // HIRAGANA LETTER DU
1799     case 0x3067: // HIRAGANA LETTER DE
1800     case 0x3069: // HIRAGANA LETTER DO
1801     case 0x3070: // HIRAGANA LETTER BA
1802     case 0x3073: // HIRAGANA LETTER BI
1803     case 0x3076: // HIRAGANA LETTER BU
1804     case 0x3079: // HIRAGANA LETTER BE
1805     case 0x307C: // HIRAGANA LETTER BO
1806     case 0x3094: // HIRAGANA LETTER VU
1807     case 0x30AC: // KATAKANA LETTER GA
1808     case 0x30AE: // KATAKANA LETTER GI
1809     case 0x30B0: // KATAKANA LETTER GU
1810     case 0x30B2: // KATAKANA LETTER GE
1811     case 0x30B4: // KATAKANA LETTER GO
1812     case 0x30B6: // KATAKANA LETTER ZA
1813     case 0x30B8: // KATAKANA LETTER ZI
1814     case 0x30BA: // KATAKANA LETTER ZU
1815     case 0x30BC: // KATAKANA LETTER ZE
1816     case 0x30BE: // KATAKANA LETTER ZO
1817     case 0x30C0: // KATAKANA LETTER DA
1818     case 0x30C2: // KATAKANA LETTER DI
1819     case 0x30C5: // KATAKANA LETTER DU
1820     case 0x30C7: // KATAKANA LETTER DE
1821     case 0x30C9: // KATAKANA LETTER DO
1822     case 0x30D0: // KATAKANA LETTER BA
1823     case 0x30D3: // KATAKANA LETTER BI
1824     case 0x30D6: // KATAKANA LETTER BU
1825     case 0x30D9: // KATAKANA LETTER BE
1826     case 0x30DC: // KATAKANA LETTER BO
1827     case 0x30F4: // KATAKANA LETTER VU
1828     case 0x30F7: // KATAKANA LETTER VA
1829     case 0x30F8: // KATAKANA LETTER VI
1830     case 0x30F9: // KATAKANA LETTER VE
1831     case 0x30FA: // KATAKANA LETTER VO
1832         return VoicedSoundMark;
1833     case 0x3071: // HIRAGANA LETTER PA
1834     case 0x3074: // HIRAGANA LETTER PI
1835     case 0x3077: // HIRAGANA LETTER PU
1836     case 0x307A: // HIRAGANA LETTER PE
1837     case 0x307D: // HIRAGANA LETTER PO
1838     case 0x30D1: // KATAKANA LETTER PA
1839     case 0x30D4: // KATAKANA LETTER PI
1840     case 0x30D7: // KATAKANA LETTER PU
1841     case 0x30DA: // KATAKANA LETTER PE
1842     case 0x30DD: // KATAKANA LETTER PO
1843         return SemiVoicedSoundMark;
1844     }
1845     return NoVoicedSoundMark;
1846 }
1847 
isCombiningVoicedSoundMark(UChar character)1848 static inline bool isCombiningVoicedSoundMark(UChar character)
1849 {
1850     switch (character) {
1851     case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1852     case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1853         return true;
1854     }
1855     return false;
1856 }
1857 
containsKanaLetters(const String & pattern)1858 static inline bool containsKanaLetters(const String& pattern)
1859 {
1860     const UChar* characters = pattern.characters();
1861     unsigned length = pattern.length();
1862     for (unsigned i = 0; i < length; ++i) {
1863         if (isKanaLetter(characters[i]))
1864             return true;
1865     }
1866     return false;
1867 }
1868 
normalizeCharacters(const UChar * characters,unsigned length,Vector<UChar> & buffer)1869 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1870 {
1871     ASSERT(length);
1872 
1873     buffer.resize(length);
1874 
1875     UErrorCode status = U_ZERO_ERROR;
1876     size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1877     ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1878     ASSERT(bufferSize);
1879 
1880     buffer.resize(bufferSize);
1881 
1882     if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1883         return;
1884 
1885     status = U_ZERO_ERROR;
1886     unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1887     ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1888 }
1889 
isNonLatin1Separator(UChar32 character)1890 static bool isNonLatin1Separator(UChar32 character)
1891 {
1892     ASSERT_ARG(character, character >= 256);
1893 
1894     return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1895 }
1896 
isSeparator(UChar32 character)1897 static inline bool isSeparator(UChar32 character)
1898 {
1899     static const bool latin1SeparatorTable[256] = {
1900         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1901         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1902         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1903         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, //                         : ; < = > ?
1904         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   @
1905         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, //                         [ \ ] ^ _
1906         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   `
1907         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, //                           { | } ~
1908         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1909         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1910         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1911         1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1912         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1913         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1914         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1915         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1916     };
1917 
1918     if (character < 256)
1919         return latin1SeparatorTable[character];
1920 
1921     return isNonLatin1Separator(character);
1922 }
1923 
SearchBuffer(const String & target,FindOptions options)1924 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1925     : m_target(target)
1926     , m_options(options)
1927     , m_prefixLength(0)
1928     , m_atBreak(true)
1929     , m_needsMoreContext(options & AtWordStarts)
1930     , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1931 {
1932     ASSERT(!m_target.isEmpty());
1933 
1934     // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1935     // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1936     // to add tailoring on top of the locale-specific tailoring as of this writing.
1937     foldQuoteMarksAndSoftHyphens(m_target);
1938 
1939     size_t targetLength = m_target.length();
1940     m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
1941     m_overlap = m_buffer.capacity() / 4;
1942 
1943     if ((m_options & AtWordStarts) && targetLength) {
1944         UChar32 targetFirstCharacter;
1945         U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
1946         // Characters in the separator category never really occur at the beginning of a word,
1947         // so if the target begins with such a character, we just ignore the AtWordStart option.
1948         if (isSeparator(targetFirstCharacter)) {
1949             m_options &= ~AtWordStarts;
1950             m_needsMoreContext = false;
1951         }
1952     }
1953 
1954     // Grab the single global searcher.
1955     // If we ever have a reason to do more than once search buffer at once, we'll have
1956     // to move to multiple searchers.
1957     lockSearcher();
1958 
1959     UStringSearch* searcher = WebCore::searcher();
1960     UCollator* collator = usearch_getCollator(searcher);
1961 
1962     UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
1963     if (ucol_getStrength(collator) != strength) {
1964         ucol_setStrength(collator, strength);
1965         usearch_reset(searcher);
1966     }
1967 
1968     UErrorCode status = U_ZERO_ERROR;
1969     usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1970     ASSERT(status == U_ZERO_ERROR);
1971 
1972     // The kana workaround requires a normalized copy of the target string.
1973     if (m_targetRequiresKanaWorkaround)
1974         normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
1975 }
1976 
~SearchBuffer()1977 inline SearchBuffer::~SearchBuffer()
1978 {
1979     // Leave the static object pointing to a valid string.
1980     UErrorCode status = U_ZERO_ERROR;
1981     usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
1982     ASSERT(status == U_ZERO_ERROR);
1983 
1984     unlockSearcher();
1985 }
1986 
append(const UChar * characters,size_t length)1987 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
1988 {
1989     ASSERT(length);
1990 
1991     if (m_atBreak) {
1992         m_buffer.shrink(0);
1993         m_prefixLength = 0;
1994         m_atBreak = false;
1995     } else if (m_buffer.size() == m_buffer.capacity()) {
1996         memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
1997         m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
1998         m_buffer.shrink(m_overlap);
1999     }
2000 
2001     size_t oldLength = m_buffer.size();
2002     size_t usableLength = min(m_buffer.capacity() - oldLength, length);
2003     ASSERT(usableLength);
2004     m_buffer.append(characters, usableLength);
2005     foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
2006     return usableLength;
2007 }
2008 
needsMoreContext() const2009 inline bool SearchBuffer::needsMoreContext() const
2010 {
2011     return m_needsMoreContext;
2012 }
2013 
prependContext(const UChar * characters,size_t length)2014 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
2015 {
2016     ASSERT(m_needsMoreContext);
2017     ASSERT(m_prefixLength == m_buffer.size());
2018 
2019     if (!length)
2020         return;
2021 
2022     m_atBreak = false;
2023 
2024     size_t wordBoundaryContextStart = length;
2025     if (wordBoundaryContextStart) {
2026         U16_BACK_1(characters, 0, wordBoundaryContextStart);
2027         wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
2028     }
2029 
2030     size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
2031     m_buffer.prepend(characters + length - usableLength, usableLength);
2032     m_prefixLength += usableLength;
2033 
2034     if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
2035         m_needsMoreContext = false;
2036 }
2037 
atBreak() const2038 inline bool SearchBuffer::atBreak() const
2039 {
2040     return m_atBreak;
2041 }
2042 
reachedBreak()2043 inline void SearchBuffer::reachedBreak()
2044 {
2045     m_atBreak = true;
2046 }
2047 
isBadMatch(const UChar * match,size_t matchLength) const2048 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
2049 {
2050     // This function implements the kana workaround. If usearch treats
2051     // it as a match, but we do not want to, then it's a "bad match".
2052     if (!m_targetRequiresKanaWorkaround)
2053         return false;
2054 
2055     // Normalize into a match buffer. We reuse a single buffer rather than
2056     // creating a new one each time.
2057     normalizeCharacters(match, matchLength, m_normalizedMatch);
2058 
2059     const UChar* a = m_normalizedTarget.begin();
2060     const UChar* aEnd = m_normalizedTarget.end();
2061 
2062     const UChar* b = m_normalizedMatch.begin();
2063     const UChar* bEnd = m_normalizedMatch.end();
2064 
2065     while (true) {
2066         // Skip runs of non-kana-letter characters. This is necessary so we can
2067         // correctly handle strings where the target and match have different-length
2068         // runs of characters that match, while still double checking the correctness
2069         // of matches of kana letters with other kana letters.
2070         while (a != aEnd && !isKanaLetter(*a))
2071             ++a;
2072         while (b != bEnd && !isKanaLetter(*b))
2073             ++b;
2074 
2075         // If we reached the end of either the target or the match, we should have
2076         // reached the end of both; both should have the same number of kana letters.
2077         if (a == aEnd || b == bEnd) {
2078             ASSERT(a == aEnd);
2079             ASSERT(b == bEnd);
2080             return false;
2081         }
2082 
2083         // Check for differences in the kana letter character itself.
2084         if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2085             return true;
2086         if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2087             return true;
2088         ++a;
2089         ++b;
2090 
2091         // Check for differences in combining voiced sound marks found after the letter.
2092         while (1) {
2093             if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2094                 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2095                     return true;
2096                 break;
2097             }
2098             if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2099                 return true;
2100             if (*a != *b)
2101                 return true;
2102             ++a;
2103             ++b;
2104         }
2105     }
2106 }
2107 
isWordStartMatch(size_t start,size_t length) const2108 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2109 {
2110     ASSERT(m_options & AtWordStarts);
2111 
2112     if (!start)
2113         return true;
2114 
2115     if (m_options & TreatMedialCapitalAsWordStart) {
2116         int size = m_buffer.size();
2117         int offset = start;
2118         UChar32 firstCharacter;
2119         U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2120         UChar32 previousCharacter;
2121         U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2122 
2123         if (isSeparator(firstCharacter)) {
2124             // The start of a separator run is a word start (".org" in "webkit.org").
2125             if (!isSeparator(previousCharacter))
2126                 return true;
2127         } else if (isASCIIUpper(firstCharacter)) {
2128             // The start of an uppercase run is a word start ("Kit" in "WebKit").
2129             if (!isASCIIUpper(previousCharacter))
2130                 return true;
2131             // The last character of an uppercase run followed by a non-separator, non-digit
2132             // is a word start ("Request" in "XMLHTTPRequest").
2133             offset = start;
2134             U16_FWD_1(m_buffer.data(), offset, size);
2135             UChar32 nextCharacter = 0;
2136             if (offset < size)
2137                 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2138             if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2139                 return true;
2140         } else if (isASCIIDigit(firstCharacter)) {
2141             // The start of a digit run is a word start ("2" in "WebKit2").
2142             if (!isASCIIDigit(previousCharacter))
2143                 return true;
2144         } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2145             // The start of a non-separator, non-uppercase, non-digit run is a word start,
2146             // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2147             return true;
2148         }
2149     }
2150 
2151     size_t wordBreakSearchStart = start + length;
2152     while (wordBreakSearchStart > start)
2153         wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
2154     return wordBreakSearchStart == start;
2155 }
2156 
search(size_t & start)2157 inline size_t SearchBuffer::search(size_t& start)
2158 {
2159     size_t size = m_buffer.size();
2160     if (m_atBreak) {
2161         if (!size)
2162             return 0;
2163     } else {
2164         if (size != m_buffer.capacity())
2165             return 0;
2166     }
2167 
2168     UStringSearch* searcher = WebCore::searcher();
2169 
2170     UErrorCode status = U_ZERO_ERROR;
2171     usearch_setText(searcher, m_buffer.data(), size, &status);
2172     ASSERT(status == U_ZERO_ERROR);
2173 
2174     usearch_setOffset(searcher, m_prefixLength, &status);
2175     ASSERT(status == U_ZERO_ERROR);
2176 
2177     int matchStart = usearch_next(searcher, &status);
2178     ASSERT(status == U_ZERO_ERROR);
2179 
2180 nextMatch:
2181     if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2182         ASSERT(matchStart == USEARCH_DONE);
2183         return 0;
2184     }
2185 
2186     // Matches that start in the overlap area are only tentative.
2187     // The same match may appear later, matching more characters,
2188     // possibly including a combining character that's not yet in the buffer.
2189     if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2190         size_t overlap = m_overlap;
2191         if (m_options & AtWordStarts) {
2192             // Ensure that there is sufficient context before matchStart the next time around for
2193             // determining if it is at a word boundary.
2194             int wordBoundaryContextStart = matchStart;
2195             U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2196             wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
2197             overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
2198         }
2199         memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2200         m_prefixLength -= min(m_prefixLength, size - overlap);
2201         m_buffer.shrink(overlap);
2202         return 0;
2203     }
2204 
2205     size_t matchedLength = usearch_getMatchedLength(searcher);
2206     ASSERT(matchStart + matchedLength <= size);
2207 
2208     // If this match is "bad", move on to the next match.
2209     if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
2210         matchStart = usearch_next(searcher, &status);
2211         ASSERT(status == U_ZERO_ERROR);
2212         goto nextMatch;
2213     }
2214 
2215     size_t newSize = size - (matchStart + 1);
2216     memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2217     m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
2218     m_buffer.shrink(newSize);
2219 
2220     start = size - matchStart;
2221     return matchedLength;
2222 }
2223 
2224 #else // !ICU_UNICODE
2225 
SearchBuffer(const String & target,FindOptions options)2226 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2227     : m_target(options & CaseInsensitive ? target.foldCase() : target)
2228     , m_options(options)
2229     , m_buffer(m_target.length())
2230     , m_isCharacterStartBuffer(m_target.length())
2231     , m_isBufferFull(false)
2232     , m_cursor(0)
2233 {
2234     ASSERT(!m_target.isEmpty());
2235     m_target.replace(noBreakSpace, ' ');
2236     foldQuoteMarksAndSoftHyphens(m_target);
2237 }
2238 
~SearchBuffer()2239 inline SearchBuffer::~SearchBuffer()
2240 {
2241 }
2242 
reachedBreak()2243 inline void SearchBuffer::reachedBreak()
2244 {
2245     m_cursor = 0;
2246     m_isBufferFull = false;
2247 }
2248 
atBreak() const2249 inline bool SearchBuffer::atBreak() const
2250 {
2251     return !m_cursor && !m_isBufferFull;
2252 }
2253 
append(UChar c,bool isStart)2254 inline void SearchBuffer::append(UChar c, bool isStart)
2255 {
2256     m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
2257     m_isCharacterStartBuffer[m_cursor] = isStart;
2258     if (++m_cursor == m_target.length()) {
2259         m_cursor = 0;
2260         m_isBufferFull = true;
2261     }
2262 }
2263 
append(const UChar * characters,size_t length)2264 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2265 {
2266     ASSERT(length);
2267     if (!(m_options & CaseInsensitive)) {
2268         append(characters[0], true);
2269         return 1;
2270     }
2271     const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2272     UChar foldedCharacters[maxFoldedCharacters];
2273     bool error;
2274     int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
2275     ASSERT(!error);
2276     ASSERT(numFoldedCharacters);
2277     ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2278     if (!error && numFoldedCharacters) {
2279         numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
2280         append(foldedCharacters[0], true);
2281         for (int i = 1; i < numFoldedCharacters; ++i)
2282             append(foldedCharacters[i], false);
2283     }
2284     return 1;
2285 }
2286 
needsMoreContext() const2287 inline bool SearchBuffer::needsMoreContext() const
2288 {
2289     return false;
2290 }
2291 
prependContext(const UChar *,size_t)2292 void SearchBuffer::prependContext(const UChar*, size_t)
2293 {
2294     ASSERT_NOT_REACHED();
2295 }
2296 
search(size_t & start)2297 inline size_t SearchBuffer::search(size_t& start)
2298 {
2299     if (!m_isBufferFull)
2300         return 0;
2301     if (!m_isCharacterStartBuffer[m_cursor])
2302         return 0;
2303 
2304     size_t tailSpace = m_target.length() - m_cursor;
2305     if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2306         return 0;
2307     if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2308         return 0;
2309 
2310     start = length();
2311 
2312     // Now that we've found a match once, we don't want to find it again, because those
2313     // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2314     // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2315     // we want to get rid of that in the future we could track this with a separate boolean
2316     // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2317     m_isCharacterStartBuffer[m_cursor] = false;
2318 
2319     return start;
2320 }
2321 
2322 // Returns the number of characters that were appended to the buffer (what we are searching in).
2323 // That's not necessarily the same length as the passed-in target string, because case folding
2324 // can make two strings match even though they're not the same length.
length() const2325 size_t SearchBuffer::length() const
2326 {
2327     size_t bufferSize = m_target.length();
2328     size_t length = 0;
2329     for (size_t i = 0; i < bufferSize; ++i)
2330         length += m_isCharacterStartBuffer[i];
2331     return length;
2332 }
2333 
2334 #endif // !ICU_UNICODE
2335 
2336 // --------
2337 
rangeLength(const Range * r,bool forSelectionPreservation)2338 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
2339 {
2340     int length = 0;
2341     for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2342         length += it.length();
2343 
2344     return length;
2345 }
2346 
subrange(Range * entireRange,int characterOffset,int characterCount)2347 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2348 {
2349     CharacterIterator entireRangeIterator(entireRange);
2350     return characterSubrange(entireRangeIterator, characterOffset, characterCount);
2351 }
2352 
rangeFromLocationAndLength(Element * scope,int rangeLocation,int rangeLength,bool forSelectionPreservation)2353 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2354 {
2355     RefPtr<Range> resultRange = scope->document()->createRange();
2356 
2357     int docTextPosition = 0;
2358     int rangeEnd = rangeLocation + rangeLength;
2359     bool startRangeFound = false;
2360 
2361     RefPtr<Range> textRunRange;
2362 
2363     TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2364 
2365     // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2366     if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
2367         textRunRange = it.range();
2368 
2369         ExceptionCode ec = 0;
2370         resultRange->setStart(textRunRange->startContainer(), 0, ec);
2371         ASSERT(!ec);
2372         resultRange->setEnd(textRunRange->startContainer(), 0, ec);
2373         ASSERT(!ec);
2374 
2375         return resultRange.release();
2376     }
2377 
2378     for (; !it.atEnd(); it.advance()) {
2379         int len = it.length();
2380         textRunRange = it.range();
2381 
2382         bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
2383         bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
2384 
2385         // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
2386         // in those cases that textRunRange is used.
2387         if (foundEnd) {
2388             // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2389             // position for emitted '\n's.
2390             if (len == 1 && it.characters()[0] == '\n') {
2391                 scope->document()->updateLayoutIgnorePendingStylesheets();
2392                 it.advance();
2393                 if (!it.atEnd()) {
2394                     RefPtr<Range> range = it.range();
2395                     ExceptionCode ec = 0;
2396                     textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
2397                     ASSERT(!ec);
2398                 } else {
2399                     Position runStart = textRunRange->startPosition();
2400                     Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2401                     if (runEnd.isNotNull()) {
2402                         ExceptionCode ec = 0;
2403                         textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
2404                         ASSERT(!ec);
2405                     }
2406                 }
2407             }
2408         }
2409 
2410         if (foundStart) {
2411             startRangeFound = true;
2412             int exception = 0;
2413             if (textRunRange->startContainer()->isTextNode()) {
2414                 int offset = rangeLocation - docTextPosition;
2415                 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2416             } else {
2417                 if (rangeLocation == docTextPosition)
2418                     resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2419                 else
2420                     resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2421             }
2422         }
2423 
2424         if (foundEnd) {
2425             int exception = 0;
2426             if (textRunRange->startContainer()->isTextNode()) {
2427                 int offset = rangeEnd - docTextPosition;
2428                 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2429             } else {
2430                 if (rangeEnd == docTextPosition)
2431                     resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2432                 else
2433                     resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2434             }
2435             docTextPosition += len;
2436             break;
2437         }
2438         docTextPosition += len;
2439     }
2440 
2441     if (!startRangeFound)
2442         return 0;
2443 
2444     if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
2445         int exception = 0;
2446         resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2447     }
2448 
2449     return resultRange.release();
2450 }
2451 
locationAndLengthFromRange(const Range * range,size_t & location,size_t & length)2452 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
2453 {
2454     location = notFound;
2455     length = 0;
2456 
2457     if (!range->startContainer())
2458         return false;
2459 
2460     Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
2461     Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
2462 
2463     // The critical assumption is that this only gets called with ranges that
2464     // concentrate on a given area containing the selection root. This is done
2465     // because of text fields and textareas. The DOM for those is not
2466     // directly in the document DOM, so ensure that the range does not cross a
2467     // boundary of one of those.
2468     if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
2469         return false;
2470     if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
2471         return false;
2472 
2473     RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
2474     ASSERT(testRange->startContainer() == scope);
2475     location = TextIterator::rangeLength(testRange.get());
2476 
2477     ExceptionCode ec;
2478     testRange->setEnd(range->endContainer(), range->endOffset(), ec);
2479     ASSERT(testRange->startContainer() == scope);
2480     length = TextIterator::rangeLength(testRange.get()) - location;
2481     return true;
2482 }
2483 
2484 // --------
2485 
plainTextToMallocAllocatedBuffer(const Range * r,unsigned & bufferLength,bool isDisplayString,TextIteratorBehavior defaultBehavior)2486 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
2487 {
2488     UChar* result = 0;
2489 
2490     // Do this in pieces to avoid massive reallocations if there is a large amount of text.
2491     // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
2492     static const unsigned cMaxSegmentSize = 1 << 16;
2493     bufferLength = 0;
2494     typedef pair<UChar*, unsigned> TextSegment;
2495     OwnPtr<Vector<TextSegment> > textSegments;
2496     Vector<UChar> textBuffer;
2497     textBuffer.reserveInitialCapacity(cMaxSegmentSize);
2498     TextIteratorBehavior behavior = defaultBehavior;
2499     if (!isDisplayString)
2500         behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2501 
2502     for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2503         if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
2504             UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
2505             if (!newSegmentBuffer)
2506                 goto exit;
2507             memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2508             if (!textSegments)
2509                 textSegments = adoptPtr(new Vector<TextSegment>);
2510             textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
2511             textBuffer.clear();
2512         }
2513         textBuffer.append(it.characters(), it.length());
2514         bufferLength += it.length();
2515     }
2516 
2517     if (!bufferLength)
2518         return 0;
2519 
2520     // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
2521     result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
2522     if (!result)
2523         goto exit;
2524 
2525     {
2526         UChar* resultPos = result;
2527         if (textSegments) {
2528             unsigned size = textSegments->size();
2529             for (unsigned i = 0; i < size; ++i) {
2530                 const TextSegment& segment = textSegments->at(i);
2531                 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
2532                 resultPos += segment.second;
2533             }
2534         }
2535         memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2536     }
2537 
2538 exit:
2539     if (textSegments) {
2540         unsigned size = textSegments->size();
2541         for (unsigned i = 0; i < size; ++i)
2542             free(textSegments->at(i).first);
2543     }
2544 
2545     if (isDisplayString && r->ownerDocument())
2546         r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
2547 
2548     return result;
2549 }
2550 
plainText(const Range * r,TextIteratorBehavior defaultBehavior)2551 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
2552 {
2553     unsigned length;
2554     UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
2555     if (!buf)
2556         return "";
2557     String result(buf, length);
2558     free(buf);
2559     return result;
2560 }
2561 
isAllCollapsibleWhitespace(const String & string)2562 static inline bool isAllCollapsibleWhitespace(const String& string)
2563 {
2564     const UChar* characters = string.characters();
2565     unsigned length = string.length();
2566     for (unsigned i = 0; i < length; ++i) {
2567         if (!isCollapsibleWhitespace(characters[i]))
2568             return false;
2569     }
2570     return true;
2571 }
2572 
collapsedToBoundary(const Range * range,bool forward)2573 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
2574 {
2575     ExceptionCode ec = 0;
2576     RefPtr<Range> result = range->cloneRange(ec);
2577     ASSERT(!ec);
2578     result->collapse(!forward, ec);
2579     ASSERT(!ec);
2580     return result.release();
2581 }
2582 
findPlainText(CharacterIterator & it,const String & target,FindOptions options,size_t & matchStart)2583 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
2584 {
2585     matchStart = 0;
2586     size_t matchLength = 0;
2587 
2588     SearchBuffer buffer(target, options);
2589 
2590     if (buffer.needsMoreContext()) {
2591         RefPtr<Range> startRange = it.range();
2592         RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
2593         ExceptionCode ec = 0;
2594         beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
2595         for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2596             buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
2597             if (!buffer.needsMoreContext())
2598                 break;
2599         }
2600     }
2601 
2602     while (!it.atEnd()) {
2603         it.advance(buffer.append(it.characters(), it.length()));
2604 tryAgain:
2605         size_t matchStartOffset;
2606         if (size_t newMatchLength = buffer.search(matchStartOffset)) {
2607             // Note that we found a match, and where we found it.
2608             size_t lastCharacterInBufferOffset = it.characterOffset();
2609             ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2610             matchStart = lastCharacterInBufferOffset - matchStartOffset;
2611             matchLength = newMatchLength;
2612             // If searching forward, stop on the first match.
2613             // If searching backward, don't stop, so we end up with the last match.
2614             if (!(options & Backwards))
2615                 break;
2616             goto tryAgain;
2617         }
2618         if (it.atBreak() && !buffer.atBreak()) {
2619             buffer.reachedBreak();
2620             goto tryAgain;
2621         }
2622     }
2623 
2624     return matchLength;
2625 }
2626 
findPlainText(const Range * range,const String & target,FindOptions options)2627 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
2628 {
2629     // First, find the text.
2630     size_t matchStart;
2631     size_t matchLength;
2632     {
2633         CharacterIterator findIterator(range, TextIteratorEntersTextControls);
2634         matchLength = findPlainText(findIterator, target, options, matchStart);
2635         if (!matchLength)
2636             return collapsedToBoundary(range, !(options & Backwards));
2637     }
2638 
2639     // Then, find the document position of the start and the end of the text.
2640     CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
2641     return characterSubrange(computeRangeIterator, matchStart, matchLength);
2642 }
2643 
2644 }
2645