• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
3  * Copyright (C) 2005 Alexey Proskuryakov.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "config.h"
28 #include "TextIterator.h"
29 
30 #include "Document.h"
31 #include "Frame.h"
32 #include "HTMLElement.h"
33 #include "HTMLNames.h"
34 #include "htmlediting.h"
35 #include "InlineTextBox.h"
36 #include "Range.h"
37 #include "RenderTableCell.h"
38 #include "RenderTableRow.h"
39 #include "RenderTextControl.h"
40 #include "RenderTextFragment.h"
41 #include "TextBoundaries.h"
42 #include "TextBreakIterator.h"
43 #include "VisiblePosition.h"
44 #include "visible_units.h"
45 #include <wtf/unicode/CharacterNames.h>
46 
47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
48 #include "TextBreakIteratorInternalICU.h"
49 #include <unicode/usearch.h>
50 #endif
51 
52 using namespace WTF::Unicode;
53 using namespace std;
54 
55 namespace WebCore {
56 
57 using namespace HTMLNames;
58 
59 // Buffer that knows how to compare with a search target.
60 // Keeps enough of the previous text to be able to search in the future, but no more.
61 // Non-breaking spaces are always equal to normal spaces.
62 // Case folding is also done if the CaseInsensitive option is specified.
63 // Matches are further filtered if the AtWordStarts option is specified, although some
64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
65 class SearchBuffer {
66     WTF_MAKE_NONCOPYABLE(SearchBuffer);
67 public:
68     SearchBuffer(const String& target, FindOptions);
69     ~SearchBuffer();
70 
71     // Returns number of characters appended; guaranteed to be in the range [1, length].
72     size_t append(const UChar*, size_t length);
73     bool needsMoreContext() const;
74     void prependContext(const UChar*, size_t length);
75     void reachedBreak();
76 
77     // Result is the size in characters of what was found.
78     // And <startOffset> is the number of characters back to the start of what was found.
79     size_t search(size_t& startOffset);
80     bool atBreak() const;
81 
82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
83 
84 private:
85     bool isBadMatch(const UChar*, size_t length) const;
86     bool isWordStartMatch(size_t start, size_t length) const;
87 
88     String m_target;
89     FindOptions m_options;
90 
91     Vector<UChar> m_buffer;
92     size_t m_overlap;
93     size_t m_prefixLength;
94     bool m_atBreak;
95     bool m_needsMoreContext;
96 
97     bool m_targetRequiresKanaWorkaround;
98     Vector<UChar> m_normalizedTarget;
99     mutable Vector<UChar> m_normalizedMatch;
100 
101 #else
102 
103 private:
104     void append(UChar, bool isCharacterStart);
105     size_t length() const;
106 
107     String m_target;
108     FindOptions m_options;
109 
110     Vector<UChar> m_buffer;
111     Vector<bool> m_isCharacterStartBuffer;
112     bool m_isBufferFull;
113     size_t m_cursor;
114 
115 #endif
116 };
117 
118 // --------
119 
120 static const unsigned bitsInWord = sizeof(unsigned) * 8;
121 static const unsigned bitInWordMask = bitsInWord - 1;
122 
BitStack()123 BitStack::BitStack()
124     : m_size(0)
125 {
126 }
127 
~BitStack()128 BitStack::~BitStack()
129 {
130 }
131 
push(bool bit)132 void BitStack::push(bool bit)
133 {
134     unsigned index = m_size / bitsInWord;
135     unsigned shift = m_size & bitInWordMask;
136     if (!shift && index == m_words.size()) {
137         m_words.grow(index + 1);
138         m_words[index] = 0;
139     }
140     unsigned& word = m_words[index];
141     unsigned mask = 1U << shift;
142     if (bit)
143         word |= mask;
144     else
145         word &= ~mask;
146     ++m_size;
147 }
148 
pop()149 void BitStack::pop()
150 {
151     if (m_size)
152         --m_size;
153 }
154 
top() const155 bool BitStack::top() const
156 {
157     if (!m_size)
158         return false;
159     unsigned shift = (m_size - 1) & bitInWordMask;
160     return m_words.last() & (1U << shift);
161 }
162 
size() const163 unsigned BitStack::size() const
164 {
165     return m_size;
166 }
167 
168 // --------
169 
170 #if !ASSERT_DISABLED
171 
depthCrossingShadowBoundaries(Node * node)172 static unsigned depthCrossingShadowBoundaries(Node* node)
173 {
174     unsigned depth = 0;
175     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
176         ++depth;
177     return depth;
178 }
179 
180 #endif
181 
182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
nextInPreOrderCrossingShadowBoundaries(Node * rangeEndContainer,int rangeEndOffset)183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
184 {
185     if (!rangeEndContainer)
186         return 0;
187     if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
188         if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
189             return next;
190     }
191     for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
192         if (Node* next = node->nextSibling())
193             return next;
194     }
195     return 0;
196 }
197 
198 // --------
199 
fullyClipsContents(Node * node)200 static inline bool fullyClipsContents(Node* node)
201 {
202     RenderObject* renderer = node->renderer();
203     if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
204         return false;
205     return toRenderBox(renderer)->size().isEmpty();
206 }
207 
ignoresContainerClip(Node * node)208 static inline bool ignoresContainerClip(Node* node)
209 {
210     RenderObject* renderer = node->renderer();
211     if (!renderer || renderer->isText())
212         return false;
213     EPosition position = renderer->style()->position();
214     return position == AbsolutePosition || position == FixedPosition;
215 }
216 
pushFullyClippedState(BitStack & stack,Node * node)217 static void pushFullyClippedState(BitStack& stack, Node* node)
218 {
219     ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
220 
221     // Push true if this node full clips its contents, or if a parent already has fully
222     // clipped and this is not a node that ignores its container's clip.
223     stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
224 }
225 
setUpFullyClippedStack(BitStack & stack,Node * node)226 static void setUpFullyClippedStack(BitStack& stack, Node* node)
227 {
228     // Put the nodes in a vector so we can iterate in reverse order.
229     Vector<Node*, 100> ancestry;
230     for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
231         ancestry.append(parent);
232 
233     // Call pushFullyClippedState on each node starting with the earliest ancestor.
234     size_t size = ancestry.size();
235     for (size_t i = 0; i < size; ++i)
236         pushFullyClippedState(stack, ancestry[size - i - 1]);
237     pushFullyClippedState(stack, node);
238 
239     ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
240 }
241 
242 #if OS(ANDROID)
checkFormControlElement(Node * startNode)243 static bool checkFormControlElement(Node* startNode)
244 {
245     Node* node = startNode;
246     while (node) {
247         if (node->isElementNode() && static_cast<Element*>(node)->isFormControlElement())
248             return true;
249         node = node->parentNode();
250     }
251     return false;
252 }
253 #endif
254 
255 
256 // --------
257 
TextIterator()258 TextIterator::TextIterator()
259     : m_startContainer(0)
260     , m_startOffset(0)
261     , m_endContainer(0)
262     , m_endOffset(0)
263     , m_positionNode(0)
264     , m_textCharacters(0)
265     , m_textLength(0)
266     , m_remainingTextBox(0)
267     , m_firstLetterText(0)
268     , m_lastCharacter(0)
269     , m_emitsCharactersBetweenAllVisiblePositions(false)
270     , m_entersTextControls(false)
271     , m_emitsTextWithoutTranscoding(false)
272     , m_handledFirstLetter(false)
273     , m_ignoresStyleVisibility(false)
274     , m_emitsObjectReplacementCharacters(false)
275 #if OS(ANDROID)
276     , m_stopsOnFormControls(false)
277     , m_shouldStop(false)
278 #endif
279 {
280 }
281 
TextIterator(const Range * r,TextIteratorBehavior behavior)282 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
283     : m_startContainer(0)
284     , m_startOffset(0)
285     , m_endContainer(0)
286     , m_endOffset(0)
287     , m_positionNode(0)
288     , m_textCharacters(0)
289     , m_textLength(0)
290 #if OS(ANDROID)
291     , m_needsAnotherNewline(false)
292 #endif
293     , m_remainingTextBox(0)
294     , m_firstLetterText(0)
295     , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
296     , m_entersTextControls(behavior & TextIteratorEntersTextControls)
297     , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
298     , m_handledFirstLetter(false)
299     , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
300     , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
301 #if OS(ANDROID)
302     , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
303     , m_shouldStop(false)
304 #endif
305 {
306     if (!r)
307         return;
308 
309     // get and validate the range endpoints
310     Node* startContainer = r->startContainer();
311     if (!startContainer)
312         return;
313     int startOffset = r->startOffset();
314     Node* endContainer = r->endContainer();
315     int endOffset = r->endOffset();
316 
317     // Callers should be handing us well-formed ranges. If we discover that this isn't
318     // the case, we could consider changing this assertion to an early return.
319     ASSERT(r->boundaryPointsValid());
320 
321     // remember range - this does not change
322     m_startContainer = startContainer;
323     m_startOffset = startOffset;
324     m_endContainer = endContainer;
325     m_endOffset = endOffset;
326 
327     // set up the current node for processing
328     m_node = r->firstNode();
329     if (!m_node)
330         return;
331     setUpFullyClippedStack(m_fullyClippedStack, m_node);
332     m_offset = m_node == m_startContainer ? m_startOffset : 0;
333     m_handledNode = false;
334     m_handledChildren = false;
335 
336     // calculate first out of bounds node
337     m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
338 
339     // initialize node processing state
340     m_needsAnotherNewline = false;
341     m_textBox = 0;
342 
343     // initialize record of previous node processing
344     m_hasEmitted = false;
345     m_lastTextNode = 0;
346     m_lastTextNodeEndedWithCollapsedSpace = false;
347     m_lastCharacter = 0;
348 
349 #ifndef NDEBUG
350     // need this just because of the assert in advance()
351     m_positionNode = m_node;
352 #endif
353 
354     // identify the first run
355     advance();
356 }
357 
~TextIterator()358 TextIterator::~TextIterator()
359 {
360 }
361 
atEnd() const362 bool TextIterator::atEnd() const
363 {
364 #if OS(ANDROID)
365     return !m_positionNode || m_shouldStop;
366 #else
367     return !m_positionNode;
368 #endif
369 }
370 
advance()371 void TextIterator::advance()
372 {
373 #if OS(ANDROID)
374     if (m_shouldStop)
375         return;
376 #endif
377     // reset the run information
378     m_positionNode = 0;
379     m_textLength = 0;
380 
381     // handle remembered node that needed a newline after the text node's newline
382     if (m_needsAnotherNewline) {
383         // Emit the extra newline, and position it *inside* m_node, after m_node's
384         // contents, in case it's a block, in the same way that we position the first
385         // newline.  The range for the emitted newline should start where the line
386         // break begins.
387         // FIXME: It would be cleaner if we emitted two newlines during the last
388         // iteration, instead of using m_needsAnotherNewline.
389         Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
390         emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
391         m_needsAnotherNewline = false;
392         return;
393     }
394 
395     if (!m_textBox && m_remainingTextBox) {
396         m_textBox = m_remainingTextBox;
397         m_remainingTextBox = 0;
398         m_firstLetterText = 0;
399         m_offset = 0;
400     }
401     // handle remembered text box
402     if (m_textBox) {
403         handleTextBox();
404         if (m_positionNode)
405             return;
406     }
407 
408     while (m_node && m_node != m_pastEndNode) {
409 #if OS(ANDROID)
410         if (!m_shouldStop && m_stopsOnFormControls && checkFormControlElement(m_node))
411             m_shouldStop = true;
412 #endif
413         // if the range ends at offset 0 of an element, represent the
414         // position, but not the content, of that element e.g. if the
415         // node is a blockflow element, emit a newline that
416         // precedes the element
417         if (m_node == m_endContainer && m_endOffset == 0) {
418             representNodeOffsetZero();
419             m_node = 0;
420             return;
421         }
422 
423         RenderObject* renderer = m_node->renderer();
424         if (!renderer) {
425             m_handledNode = true;
426             m_handledChildren = true;
427         } else {
428             // handle current node according to its type
429             if (!m_handledNode) {
430                 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
431                     m_handledNode = handleTextNode();
432                 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
433                          (renderer->node() && renderer->node()->isElementNode() &&
434                           static_cast<Element*>(renderer->node())->isFormControlElement())))
435                     m_handledNode = handleReplacedElement();
436                 else
437                     m_handledNode = handleNonTextNode();
438                 if (m_positionNode)
439                     return;
440             }
441         }
442 
443         // find a new current node to handle in depth-first manner,
444         // calling exitNode() as we come back thru a parent node
445         Node* next = m_handledChildren ? 0 : m_node->firstChild();
446         m_offset = 0;
447         if (!next) {
448             next = m_node->nextSibling();
449             if (!next) {
450                 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
451                 Node* parentNode = m_node->parentOrHostNode();
452                 while (!next && parentNode) {
453                     if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
454                         return;
455                     bool haveRenderer = m_node->renderer();
456                     m_node = parentNode;
457                     m_fullyClippedStack.pop();
458                     parentNode = m_node->parentOrHostNode();
459                     if (haveRenderer)
460                         exitNode();
461                     if (m_positionNode) {
462                         m_handledNode = true;
463                         m_handledChildren = true;
464                         return;
465                     }
466                     next = m_node->nextSibling();
467                 }
468             }
469             m_fullyClippedStack.pop();
470         }
471 
472         // set the new current node
473         m_node = next;
474         if (m_node)
475             pushFullyClippedState(m_fullyClippedStack, m_node);
476         m_handledNode = false;
477         m_handledChildren = false;
478         m_handledFirstLetter = false;
479         m_firstLetterText = 0;
480 
481         // how would this ever be?
482         if (m_positionNode)
483             return;
484     }
485 }
486 
handleTextNode()487 bool TextIterator::handleTextNode()
488 {
489     if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
490         return false;
491 
492     RenderText* renderer = toRenderText(m_node->renderer());
493 
494     m_lastTextNode = m_node;
495     String str = renderer->text();
496 
497     // handle pre-formatted text
498     if (!renderer->style()->collapseWhiteSpace()) {
499         int runStart = m_offset;
500         if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
501             emitCharacter(' ', m_node, 0, runStart, runStart);
502             return false;
503         }
504         if (!m_handledFirstLetter && renderer->isTextFragment()) {
505             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
506             if (m_firstLetterText) {
507                 String firstLetter = m_firstLetterText->text();
508                 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
509                 m_firstLetterText = 0;
510                 m_textBox = 0;
511                 return false;
512             }
513         }
514         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
515             return false;
516         int strLength = str.length();
517         int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
518         int runEnd = min(strLength, end);
519 
520         if (runStart >= runEnd)
521             return true;
522 
523         emitText(m_node, runStart, runEnd);
524         return true;
525     }
526 
527     if (!renderer->firstTextBox() && str.length() > 0) {
528         if (!m_handledFirstLetter && renderer->isTextFragment()) {
529             handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
530             if (m_firstLetterText) {
531                 handleTextBox();
532                 return false;
533             }
534         }
535         if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
536             return false;
537         m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
538         return true;
539     }
540 
541     // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
542     if (renderer->containsReversedText()) {
543         m_sortedTextBoxes.clear();
544         for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
545             m_sortedTextBoxes.append(textBox);
546         }
547         std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
548         m_sortedTextBoxesPosition = 0;
549     }
550 
551     m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
552     if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
553         handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
554     handleTextBox();
555     return true;
556 }
557 
handleTextBox()558 void TextIterator::handleTextBox()
559 {
560     RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
561     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
562         m_textBox = 0;
563         return;
564     }
565     String str = renderer->text();
566     unsigned start = m_offset;
567     unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
568     while (m_textBox) {
569         unsigned textBoxStart = m_textBox->start();
570         unsigned runStart = max(textBoxStart, start);
571 
572         // Check for collapsed space at the start of this run.
573         InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
574         bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
575             || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
576         if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
577             if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
578                 unsigned spaceRunStart = runStart - 1;
579                 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
580                     --spaceRunStart;
581                 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
582             } else
583                 emitCharacter(' ', m_node, 0, runStart, runStart);
584             return;
585         }
586         unsigned textBoxEnd = textBoxStart + m_textBox->len();
587         unsigned runEnd = min(textBoxEnd, end);
588 
589         // Determine what the next text box will be, but don't advance yet
590         InlineTextBox* nextTextBox = 0;
591         if (renderer->containsReversedText()) {
592             if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
593                 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
594         } else
595             nextTextBox = m_textBox->nextTextBox();
596 
597         if (runStart < runEnd) {
598             // Handle either a single newline character (which becomes a space),
599             // or a run of characters that does not include a newline.
600             // This effectively translates newlines to spaces without copying the text.
601             if (str[runStart] == '\n') {
602                 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
603                 m_offset = runStart + 1;
604             } else {
605                 size_t subrunEnd = str.find('\n', runStart);
606                 if (subrunEnd == notFound || subrunEnd > runEnd)
607                     subrunEnd = runEnd;
608 
609                 m_offset = subrunEnd;
610                 emitText(m_node, renderer, runStart, subrunEnd);
611             }
612 
613             // If we are doing a subrun that doesn't go to the end of the text box,
614             // come back again to finish handling this text box; don't advance to the next one.
615             if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
616                 return;
617 
618             // Advance and return
619             unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
620             if (nextRunStart > runEnd)
621                 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
622             m_textBox = nextTextBox;
623             if (renderer->containsReversedText())
624                 ++m_sortedTextBoxesPosition;
625             return;
626         }
627         // Advance and continue
628         m_textBox = nextTextBox;
629         if (renderer->containsReversedText())
630             ++m_sortedTextBoxesPosition;
631     }
632     if (!m_textBox && m_remainingTextBox) {
633         m_textBox = m_remainingTextBox;
634         m_remainingTextBox = 0;
635         m_firstLetterText = 0;
636         m_offset = 0;
637         handleTextBox();
638     }
639 }
640 
handleTextNodeFirstLetter(RenderTextFragment * renderer)641 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
642 {
643     if (renderer->firstLetter()) {
644         RenderObject* r = renderer->firstLetter();
645         if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
646             return;
647         for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) {
648             if (currChild->isText()) {
649                 RenderText* firstLetter = toRenderText(currChild);
650                 m_handledFirstLetter = true;
651                 m_remainingTextBox = m_textBox;
652                 m_textBox = firstLetter->firstTextBox();
653                 m_firstLetterText = firstLetter;
654                 return;
655             }
656         }
657     }
658     m_handledFirstLetter = true;
659 }
660 
handleReplacedElement()661 bool TextIterator::handleReplacedElement()
662 {
663     if (m_fullyClippedStack.top())
664         return false;
665 
666     RenderObject* renderer = m_node->renderer();
667     if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
668         return false;
669 
670     if (m_lastTextNodeEndedWithCollapsedSpace) {
671         emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
672         return false;
673     }
674 
675     if (m_entersTextControls && renderer->isTextControl()) {
676         if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
677             m_node = innerTextElement->shadowTreeRootNode();
678             pushFullyClippedState(m_fullyClippedStack, m_node);
679             m_offset = 0;
680             return false;
681         }
682     }
683 
684     m_hasEmitted = true;
685 
686     if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
687         emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
688         return true;
689     }
690 
691     if (m_emitsCharactersBetweenAllVisiblePositions) {
692         // We want replaced elements to behave like punctuation for boundary
693         // finding, and to simply take up space for the selection preservation
694         // code in moveParagraphs, so we use a comma.
695         emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
696         return true;
697     }
698 
699     m_positionNode = m_node->parentNode();
700     m_positionOffsetBaseNode = m_node;
701     m_positionStartOffset = 0;
702     m_positionEndOffset = 1;
703 
704     m_textCharacters = 0;
705     m_textLength = 0;
706 
707     m_lastCharacter = 0;
708 
709     return true;
710 }
711 
hasVisibleTextNode(RenderText * renderer)712 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
713 {
714     if (renderer->style()->visibility() == VISIBLE)
715         return true;
716     if (renderer->isTextFragment()) {
717         RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
718         if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
719             return true;
720     }
721     return false;
722 }
723 
shouldEmitTabBeforeNode(Node * node)724 static bool shouldEmitTabBeforeNode(Node* node)
725 {
726     RenderObject* r = node->renderer();
727 
728     // Table cells are delimited by tabs.
729     if (!r || !isTableCell(node))
730         return false;
731 
732     // Want a tab before every cell other than the first one
733     RenderTableCell* rc = toRenderTableCell(r);
734     RenderTable* t = rc->table();
735     return t && (t->cellBefore(rc) || t->cellAbove(rc));
736 }
737 
shouldEmitNewlineForNode(Node * node)738 static bool shouldEmitNewlineForNode(Node* node)
739 {
740     // br elements are represented by a single newline.
741     RenderObject* r = node->renderer();
742     if (!r)
743         return node->hasTagName(brTag);
744 
745     return r->isBR();
746 }
747 
shouldEmitNewlinesBeforeAndAfterNode(Node * node)748 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
749 {
750     // Block flow (versus inline flow) is represented by having
751     // a newline both before and after the element.
752     RenderObject* r = node->renderer();
753     if (!r) {
754         return (node->hasTagName(blockquoteTag)
755                 || node->hasTagName(ddTag)
756                 || node->hasTagName(divTag)
757                 || node->hasTagName(dlTag)
758                 || node->hasTagName(dtTag)
759                 || node->hasTagName(h1Tag)
760                 || node->hasTagName(h2Tag)
761                 || node->hasTagName(h3Tag)
762                 || node->hasTagName(h4Tag)
763                 || node->hasTagName(h5Tag)
764                 || node->hasTagName(h6Tag)
765                 || node->hasTagName(hrTag)
766                 || node->hasTagName(liTag)
767                 || node->hasTagName(listingTag)
768                 || node->hasTagName(olTag)
769                 || node->hasTagName(pTag)
770                 || node->hasTagName(preTag)
771                 || node->hasTagName(trTag)
772                 || node->hasTagName(ulTag));
773     }
774 
775     // Need to make an exception for table cells, because they are blocks, but we
776     // want them tab-delimited rather than having newlines before and after.
777     if (isTableCell(node))
778         return false;
779 
780     // Need to make an exception for table row elements, because they are neither
781     // "inline" or "RenderBlock", but we want newlines for them.
782     if (r->isTableRow()) {
783         RenderTable* t = toRenderTableRow(r)->table();
784         if (t && !t->isInline())
785             return true;
786     }
787 
788     return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
789 }
790 
shouldEmitNewlineAfterNode(Node * node)791 static bool shouldEmitNewlineAfterNode(Node* node)
792 {
793     // FIXME: It should be better but slower to create a VisiblePosition here.
794     if (!shouldEmitNewlinesBeforeAndAfterNode(node))
795         return false;
796     // Check if this is the very last renderer in the document.
797     // If so, then we should not emit a newline.
798     while ((node = node->traverseNextSibling()))
799         if (node->renderer())
800             return true;
801     return false;
802 }
803 
shouldEmitNewlineBeforeNode(Node * node)804 static bool shouldEmitNewlineBeforeNode(Node* node)
805 {
806     return shouldEmitNewlinesBeforeAndAfterNode(node);
807 }
808 
shouldEmitExtraNewlineForNode(Node * node)809 static bool shouldEmitExtraNewlineForNode(Node* node)
810 {
811     // When there is a significant collapsed bottom margin, emit an extra
812     // newline for a more realistic result.  We end up getting the right
813     // result even without margin collapsing. For example: <div><p>text</p></div>
814     // will work right even if both the <div> and the <p> have bottom margins.
815     RenderObject* r = node->renderer();
816     if (!r || !r->isBox())
817         return false;
818 
819     // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
820     // not to do this at all
821     if (node->hasTagName(h1Tag)
822         || node->hasTagName(h2Tag)
823         || node->hasTagName(h3Tag)
824         || node->hasTagName(h4Tag)
825         || node->hasTagName(h5Tag)
826         || node->hasTagName(h6Tag)
827         || node->hasTagName(pTag)) {
828         RenderStyle* style = r->style();
829         if (style) {
830             int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
831             int fontSize = style->fontDescription().computedPixelSize();
832             if (bottomMargin * 2 >= fontSize)
833                 return true;
834         }
835     }
836 
837     return false;
838 }
839 
collapsedSpaceLength(RenderText * renderer,int textEnd)840 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
841 {
842     const UChar* characters = renderer->text()->characters();
843     int length = renderer->text()->length();
844     for (int i = textEnd; i < length; ++i) {
845         if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
846             return i - textEnd;
847     }
848 
849     return length - textEnd;
850 }
851 
maxOffsetIncludingCollapsedSpaces(Node * node)852 static int maxOffsetIncludingCollapsedSpaces(Node* node)
853 {
854     int offset = caretMaxOffset(node);
855 
856     if (node->renderer() && node->renderer()->isText())
857         offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
858 
859     return offset;
860 }
861 
862 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
shouldRepresentNodeOffsetZero()863 bool TextIterator::shouldRepresentNodeOffsetZero()
864 {
865     if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
866         return true;
867 
868     // Leave element positioned flush with start of a paragraph
869     // (e.g. do not insert tab before a table cell at the start of a paragraph)
870     if (m_lastCharacter == '\n')
871         return false;
872 
873     // Otherwise, show the position if we have emitted any characters
874     if (m_hasEmitted)
875         return true;
876 
877     // We've not emitted anything yet. Generally, there is no need for any positioning then.
878     // The only exception is when the element is visually not in the same line as
879     // the start of the range (e.g. the range starts at the end of the previous paragraph).
880     // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
881     // make quicker checks to possibly avoid that. Another check that we could make is
882     // is whether the inline vs block flow changed since the previous visible element.
883     // I think we're already in a special enough case that that won't be needed, tho.
884 
885     // No character needed if this is the first node in the range.
886     if (m_node == m_startContainer)
887         return false;
888 
889     // If we are outside the start container's subtree, assume we need to emit.
890     // FIXME: m_startContainer could be an inline block
891     if (!m_node->isDescendantOf(m_startContainer))
892         return true;
893 
894     // If we started as m_startContainer offset 0 and the current node is a descendant of
895     // the start container, we already had enough context to correctly decide whether to
896     // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
897     // so don't second guess that now.
898     // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
899     // immaterial since we likely would have already emitted something by now.
900     if (m_startOffset == 0)
901         return false;
902 
903     // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
904     // Additionally, if the range we are iterating over contains huge sections of unrendered content,
905     // we would create VisiblePositions on every call to this function without this check.
906     if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
907         return false;
908 
909     // The startPos.isNotNull() check is needed because the start could be before the body,
910     // and in that case we'll get null. We don't want to put in newlines at the start in that case.
911     // The currPos.isNotNull() check is needed because positions in non-HTML content
912     // (like SVG) do not have visible positions, and we don't want to emit for them either.
913     VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
914     VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
915     return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
916 }
917 
shouldEmitSpaceBeforeAndAfterNode(Node * node)918 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
919 {
920     return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
921 }
922 
representNodeOffsetZero()923 void TextIterator::representNodeOffsetZero()
924 {
925     // Emit a character to show the positioning of m_node.
926 
927     // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
928     // create VisiblePositions, which is expensive.  So, we perform the inexpensive checks
929     // on m_node to see if it necessitates emitting a character first and will early return
930     // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
931     if (shouldEmitTabBeforeNode(m_node)) {
932         if (shouldRepresentNodeOffsetZero())
933             emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
934     } else if (shouldEmitNewlineBeforeNode(m_node)) {
935         if (shouldRepresentNodeOffsetZero())
936             emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
937     } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
938         if (shouldRepresentNodeOffsetZero())
939             emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
940     }
941 }
942 
handleNonTextNode()943 bool TextIterator::handleNonTextNode()
944 {
945     if (shouldEmitNewlineForNode(m_node))
946         emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
947     else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
948         emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
949     else
950         representNodeOffsetZero();
951 
952     return true;
953 }
954 
exitNode()955 void TextIterator::exitNode()
956 {
957     // prevent emitting a newline when exiting a collapsed block at beginning of the range
958     // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
959     // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
960     // therefore look like a blank line.
961     if (!m_hasEmitted)
962         return;
963 
964     // Emit with a position *inside* m_node, after m_node's contents, in
965     // case it is a block, because the run should start where the
966     // emitted character is positioned visually.
967     Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
968     // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
969     // the logic in _web_attributedStringFromRange match.  We'll get that for free when we switch to use
970     // TextIterator in _web_attributedStringFromRange.
971     // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
972     if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
973         // use extra newline to represent margin bottom, as needed
974         bool addNewline = shouldEmitExtraNewlineForNode(m_node);
975 
976         // FIXME: We need to emit a '\n' as we leave an empty block(s) that
977         // contain a VisiblePosition when doing selection preservation.
978         if (m_lastCharacter != '\n') {
979             // insert a newline with a position following this block's contents.
980             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
981             // remember whether to later add a newline for the current node
982             ASSERT(!m_needsAnotherNewline);
983             m_needsAnotherNewline = addNewline;
984         } else if (addNewline)
985             // insert a newline with a position following this block's contents.
986             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
987     }
988 
989     // If nothing was emitted, see if we need to emit a space.
990     if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
991         emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
992 }
993 
emitCharacter(UChar c,Node * textNode,Node * offsetBaseNode,int textStartOffset,int textEndOffset)994 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
995 {
996     m_hasEmitted = true;
997 
998     // remember information with which to construct the TextIterator::range()
999     // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
1000     m_positionNode = textNode;
1001     m_positionOffsetBaseNode = offsetBaseNode;
1002     m_positionStartOffset = textStartOffset;
1003     m_positionEndOffset = textEndOffset;
1004 
1005     // remember information with which to construct the TextIterator::characters() and length()
1006     m_singleCharacterBuffer = c;
1007     m_textCharacters = &m_singleCharacterBuffer;
1008     m_textLength = 1;
1009 
1010     // remember some iteration state
1011     m_lastTextNodeEndedWithCollapsedSpace = false;
1012     m_lastCharacter = c;
1013 }
1014 
emitText(Node * textNode,RenderObject * renderObject,int textStartOffset,int textEndOffset)1015 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
1016 {
1017     RenderText* renderer = toRenderText(renderObject);
1018     m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
1019     ASSERT(m_text.characters());
1020 
1021     m_positionNode = textNode;
1022     m_positionOffsetBaseNode = 0;
1023     m_positionStartOffset = textStartOffset;
1024     m_positionEndOffset = textEndOffset;
1025     m_textCharacters = m_text.characters() + textStartOffset;
1026     m_textLength = textEndOffset - textStartOffset;
1027     m_lastCharacter = m_text[textEndOffset - 1];
1028 
1029     m_lastTextNodeEndedWithCollapsedSpace = false;
1030     m_hasEmitted = true;
1031 }
1032 
emitText(Node * textNode,int textStartOffset,int textEndOffset)1033 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
1034 {
1035     emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
1036 }
1037 
range() const1038 PassRefPtr<Range> TextIterator::range() const
1039 {
1040     // use the current run information, if we have it
1041     if (m_positionNode) {
1042         if (m_positionOffsetBaseNode) {
1043             int index = m_positionOffsetBaseNode->nodeIndex();
1044             m_positionStartOffset += index;
1045             m_positionEndOffset += index;
1046             m_positionOffsetBaseNode = 0;
1047         }
1048         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1049     }
1050 
1051     // otherwise, return the end of the overall range we were given
1052     if (m_endContainer)
1053         return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
1054 
1055     return 0;
1056 }
1057 
node() const1058 Node* TextIterator::node() const
1059 {
1060     RefPtr<Range> textRange = range();
1061     if (!textRange)
1062         return 0;
1063 
1064     Node* node = textRange->startContainer();
1065     if (!node)
1066         return 0;
1067     if (node->offsetInCharacters())
1068         return node;
1069 
1070     return node->childNode(textRange->startOffset());
1071 }
1072 
1073 // --------
1074 
SimplifiedBackwardsTextIterator()1075 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
1076     : m_behavior(TextIteratorDefaultBehavior)
1077     , m_node(0)
1078     , m_positionNode(0)
1079 #if OS(ANDROID)
1080     , m_stopsOnFormControls(false)
1081     , m_shouldStop(false)
1082 #endif
1083 {
1084 }
1085 
SimplifiedBackwardsTextIterator(const Range * r,TextIteratorBehavior behavior)1086 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
1087     : m_behavior(behavior)
1088     , m_node(0)
1089     , m_positionNode(0)
1090 #if OS(ANDROID)
1091     , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
1092     , m_shouldStop(false)
1093 #endif
1094 {
1095 #if OS(ANDROID)
1096     ASSERT(m_behavior == TextIteratorDefaultBehavior || m_behavior == TextIteratorStopsOnFormControls);
1097 #else
1098     ASSERT(m_behavior == TextIteratorDefaultBehavior);
1099 #endif
1100 
1101     if (!r)
1102         return;
1103 
1104     Node* startNode = r->startContainer();
1105     if (!startNode)
1106         return;
1107     Node* endNode = r->endContainer();
1108     int startOffset = r->startOffset();
1109     int endOffset = r->endOffset();
1110 
1111     if (!startNode->offsetInCharacters()) {
1112         if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
1113             startNode = startNode->childNode(startOffset);
1114             startOffset = 0;
1115         }
1116     }
1117     if (!endNode->offsetInCharacters()) {
1118         if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
1119             endNode = endNode->childNode(endOffset - 1);
1120             endOffset = lastOffsetInNode(endNode);
1121         }
1122     }
1123 
1124     m_node = endNode;
1125     setUpFullyClippedStack(m_fullyClippedStack, m_node);
1126     m_offset = endOffset;
1127     m_handledNode = false;
1128     m_handledChildren = endOffset == 0;
1129 
1130     m_startNode = startNode;
1131     m_startOffset = startOffset;
1132     m_endNode = endNode;
1133     m_endOffset = endOffset;
1134 
1135 #ifndef NDEBUG
1136     // Need this just because of the assert.
1137     m_positionNode = endNode;
1138 #endif
1139 
1140     m_lastTextNode = 0;
1141     m_lastCharacter = '\n';
1142 
1143     m_havePassedStartNode = false;
1144 
1145     advance();
1146 }
1147 
atEnd() const1148 bool SimplifiedBackwardsTextIterator::atEnd() const
1149 {
1150 #if OS(ANDROID)
1151     return !m_positionNode || m_shouldStop;
1152 #else
1153     return !m_positionNode;
1154 #endif
1155 }
1156 
advance()1157 void SimplifiedBackwardsTextIterator::advance()
1158 {
1159     ASSERT(m_positionNode);
1160 
1161 #if OS(ANDROID)
1162     if (m_shouldStop)
1163         return;
1164 
1165     // Prevent changing the iterator position if a form control element was found and advance should stop on it.
1166     if (m_stopsOnFormControls && checkFormControlElement(m_node)) {
1167         m_shouldStop = true;
1168         return;
1169     }
1170 #endif
1171 
1172     m_positionNode = 0;
1173     m_textLength = 0;
1174 
1175     while (m_node && !m_havePassedStartNode) {
1176         // Don't handle node if we start iterating at [node, 0].
1177         if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
1178             RenderObject* renderer = m_node->renderer();
1179             if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
1180                 // FIXME: What about CDATA_SECTION_NODE?
1181                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1182                     m_handledNode = handleTextNode();
1183             } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1184                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1185                     m_handledNode = handleReplacedElement();
1186             } else
1187                 m_handledNode = handleNonTextNode();
1188             if (m_positionNode)
1189                 return;
1190         }
1191 
1192         if (!m_handledChildren && m_node->hasChildNodes()) {
1193             m_node = m_node->lastChild();
1194             pushFullyClippedState(m_fullyClippedStack, m_node);
1195         } else {
1196             // Exit empty containers as we pass over them or containers
1197             // where [container, 0] is where we started iterating.
1198             if (!m_handledNode
1199                     && canHaveChildrenForEditing(m_node)
1200                     && m_node->parentNode()
1201                     && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
1202                 exitNode();
1203                 if (m_positionNode) {
1204                     m_handledNode = true;
1205                     m_handledChildren = true;
1206                     return;
1207                 }
1208             }
1209 
1210             // Exit all other containers.
1211             while (!m_node->previousSibling()) {
1212                 if (!advanceRespectingRange(m_node->parentOrHostNode()))
1213                     break;
1214                 m_fullyClippedStack.pop();
1215                 exitNode();
1216                 if (m_positionNode) {
1217                     m_handledNode = true;
1218                     m_handledChildren = true;
1219                     return;
1220                 }
1221             }
1222 
1223             m_fullyClippedStack.pop();
1224             if (advanceRespectingRange(m_node->previousSibling()))
1225                 pushFullyClippedState(m_fullyClippedStack, m_node);
1226             else
1227                 m_node = 0;
1228         }
1229 
1230         // For the purpose of word boundary detection,
1231         // we should iterate all visible text and trailing (collapsed) whitespaces.
1232         m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
1233         m_handledNode = false;
1234         m_handledChildren = false;
1235 
1236         if (m_positionNode)
1237             return;
1238     }
1239 }
1240 
handleTextNode()1241 bool SimplifiedBackwardsTextIterator::handleTextNode()
1242 {
1243     m_lastTextNode = m_node;
1244 
1245     RenderText* renderer = toRenderText(m_node->renderer());
1246     String str = renderer->text();
1247 
1248     if (!renderer->firstTextBox() && str.length() > 0)
1249         return true;
1250 
1251     m_positionEndOffset = m_offset;
1252 
1253     m_offset = (m_node == m_startNode) ? m_startOffset : 0;
1254     m_positionNode = m_node;
1255     m_positionStartOffset = m_offset;
1256     m_textLength = m_positionEndOffset - m_positionStartOffset;
1257     m_textCharacters = str.characters() + m_positionStartOffset;
1258 
1259     m_lastCharacter = str[m_positionEndOffset - 1];
1260 
1261     return true;
1262 }
1263 
handleReplacedElement()1264 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1265 {
1266     unsigned index = m_node->nodeIndex();
1267     // We want replaced elements to behave like punctuation for boundary
1268     // finding, and to simply take up space for the selection preservation
1269     // code in moveParagraphs, so we use a comma.  Unconditionally emit
1270     // here because this iterator is only used for boundary finding.
1271     emitCharacter(',', m_node->parentNode(), index, index + 1);
1272     return true;
1273 }
1274 
handleNonTextNode()1275 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1276 {
1277     // We can use a linefeed in place of a tab because this simple iterator is only used to
1278     // find boundaries, not actual content.  A linefeed breaks words, sentences, and paragraphs.
1279     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1280         unsigned index = m_node->nodeIndex();
1281         // The start of this emitted range is wrong. Ensuring correctness would require
1282         // VisiblePositions and so would be slow. previousBoundary expects this.
1283         emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
1284     }
1285     return true;
1286 }
1287 
exitNode()1288 void SimplifiedBackwardsTextIterator::exitNode()
1289 {
1290     if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1291         // The start of this emitted range is wrong. Ensuring correctness would require
1292         // VisiblePositions and so would be slow. previousBoundary expects this.
1293         emitCharacter('\n', m_node, 0, 0);
1294     }
1295 }
1296 
emitCharacter(UChar c,Node * node,int startOffset,int endOffset)1297 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
1298 {
1299     m_singleCharacterBuffer = c;
1300     m_positionNode = node;
1301     m_positionStartOffset = startOffset;
1302     m_positionEndOffset = endOffset;
1303     m_textCharacters = &m_singleCharacterBuffer;
1304     m_textLength = 1;
1305     m_lastCharacter = c;
1306 }
1307 
advanceRespectingRange(Node * next)1308 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1309 {
1310     if (!next)
1311         return false;
1312     m_havePassedStartNode |= m_node == m_startNode;
1313     if (m_havePassedStartNode)
1314         return false;
1315     m_node = next;
1316     return true;
1317 }
1318 
range() const1319 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
1320 {
1321     if (m_positionNode)
1322         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1323 
1324     return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
1325 }
1326 
1327 // --------
1328 
CharacterIterator()1329 CharacterIterator::CharacterIterator()
1330     : m_offset(0)
1331     , m_runOffset(0)
1332     , m_atBreak(true)
1333 {
1334 }
1335 
CharacterIterator(const Range * r,TextIteratorBehavior behavior)1336 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
1337     : m_offset(0)
1338     , m_runOffset(0)
1339     , m_atBreak(true)
1340     , m_textIterator(r, behavior)
1341 {
1342     while (!atEnd() && m_textIterator.length() == 0)
1343         m_textIterator.advance();
1344 }
1345 
range() const1346 PassRefPtr<Range> CharacterIterator::range() const
1347 {
1348     RefPtr<Range> r = m_textIterator.range();
1349     if (!m_textIterator.atEnd()) {
1350         if (m_textIterator.length() <= 1) {
1351             ASSERT(m_runOffset == 0);
1352         } else {
1353             Node* n = r->startContainer();
1354             ASSERT(n == r->endContainer());
1355             int offset = r->startOffset() + m_runOffset;
1356             ExceptionCode ec = 0;
1357             r->setStart(n, offset, ec);
1358             r->setEnd(n, offset + 1, ec);
1359             ASSERT(!ec);
1360         }
1361     }
1362     return r.release();
1363 }
1364 
advance(int count)1365 void CharacterIterator::advance(int count)
1366 {
1367     if (count <= 0) {
1368         ASSERT(count == 0);
1369         return;
1370     }
1371 
1372     m_atBreak = false;
1373 
1374     // easy if there is enough left in the current m_textIterator run
1375     int remaining = m_textIterator.length() - m_runOffset;
1376     if (count < remaining) {
1377         m_runOffset += count;
1378         m_offset += count;
1379         return;
1380     }
1381 
1382     // exhaust the current m_textIterator run
1383     count -= remaining;
1384     m_offset += remaining;
1385 
1386     // move to a subsequent m_textIterator run
1387     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1388         int runLength = m_textIterator.length();
1389         if (runLength == 0)
1390             m_atBreak = true;
1391         else {
1392             // see whether this is m_textIterator to use
1393             if (count < runLength) {
1394                 m_runOffset = count;
1395                 m_offset += count;
1396                 return;
1397             }
1398 
1399             // exhaust this m_textIterator run
1400             count -= runLength;
1401             m_offset += runLength;
1402         }
1403     }
1404 
1405     // ran to the end of the m_textIterator... no more runs left
1406     m_atBreak = true;
1407     m_runOffset = 0;
1408 }
1409 
string(int numChars)1410 String CharacterIterator::string(int numChars)
1411 {
1412     Vector<UChar> result;
1413     result.reserveInitialCapacity(numChars);
1414     while (numChars > 0 && !atEnd()) {
1415         int runSize = min(numChars, length());
1416         result.append(characters(), runSize);
1417         numChars -= runSize;
1418         advance(runSize);
1419     }
1420     return String::adopt(result);
1421 }
1422 
characterSubrange(CharacterIterator & it,int offset,int length)1423 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1424 {
1425     it.advance(offset);
1426     RefPtr<Range> start = it.range();
1427 
1428     if (length > 1)
1429         it.advance(length - 1);
1430     RefPtr<Range> end = it.range();
1431 
1432     return Range::create(start->startContainer()->document(),
1433         start->startContainer(), start->startOffset(),
1434         end->endContainer(), end->endOffset());
1435 }
1436 
BackwardsCharacterIterator()1437 BackwardsCharacterIterator::BackwardsCharacterIterator()
1438     : m_offset(0)
1439     , m_runOffset(0)
1440     , m_atBreak(true)
1441 {
1442 }
1443 
BackwardsCharacterIterator(const Range * range,TextIteratorBehavior behavior)1444 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
1445     : m_offset(0)
1446     , m_runOffset(0)
1447     , m_atBreak(true)
1448     , m_textIterator(range, behavior)
1449 {
1450     while (!atEnd() && !m_textIterator.length())
1451         m_textIterator.advance();
1452 }
1453 
range() const1454 PassRefPtr<Range> BackwardsCharacterIterator::range() const
1455 {
1456     RefPtr<Range> r = m_textIterator.range();
1457     if (!m_textIterator.atEnd()) {
1458         if (m_textIterator.length() <= 1)
1459             ASSERT(m_runOffset == 0);
1460         else {
1461             Node* n = r->startContainer();
1462             ASSERT(n == r->endContainer());
1463             int offset = r->endOffset() - m_runOffset;
1464             ExceptionCode ec = 0;
1465             r->setStart(n, offset - 1, ec);
1466             r->setEnd(n, offset, ec);
1467             ASSERT(!ec);
1468         }
1469     }
1470     return r.release();
1471 }
1472 
advance(int count)1473 void BackwardsCharacterIterator::advance(int count)
1474 {
1475     if (count <= 0) {
1476         ASSERT(!count);
1477         return;
1478     }
1479 
1480     m_atBreak = false;
1481 
1482     int remaining = m_textIterator.length() - m_runOffset;
1483     if (count < remaining) {
1484         m_runOffset += count;
1485         m_offset += count;
1486         return;
1487     }
1488 
1489     count -= remaining;
1490     m_offset += remaining;
1491 
1492     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1493         int runLength = m_textIterator.length();
1494         if (runLength == 0)
1495             m_atBreak = true;
1496         else {
1497             if (count < runLength) {
1498                 m_runOffset = count;
1499                 m_offset += count;
1500                 return;
1501             }
1502 
1503             count -= runLength;
1504             m_offset += runLength;
1505         }
1506     }
1507 
1508     m_atBreak = true;
1509     m_runOffset = 0;
1510 }
1511 
1512 // --------
1513 
WordAwareIterator()1514 WordAwareIterator::WordAwareIterator()
1515     : m_previousText(0)
1516     , m_didLookAhead(false)
1517 {
1518 }
1519 
WordAwareIterator(const Range * r)1520 WordAwareIterator::WordAwareIterator(const Range* r)
1521     : m_previousText(0)
1522     , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1523     , m_textIterator(r)
1524 {
1525     advance(); // get in position over the first chunk of text
1526 }
1527 
~WordAwareIterator()1528 WordAwareIterator::~WordAwareIterator()
1529 {
1530 }
1531 
1532 // We're always in one of these modes:
1533 // - The current chunk in the text iterator is our current chunk
1534 //      (typically its a piece of whitespace, or text that ended with whitespace)
1535 // - The previous chunk in the text iterator is our current chunk
1536 //      (we looked ahead to the next chunk and found a word boundary)
1537 // - We built up our own chunk of text from many chunks from the text iterator
1538 
1539 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1540 
advance()1541 void WordAwareIterator::advance()
1542 {
1543     m_previousText = 0;
1544     m_buffer.clear();      // toss any old buffer we built up
1545 
1546     // If last time we did a look-ahead, start with that looked-ahead chunk now
1547     if (!m_didLookAhead) {
1548         ASSERT(!m_textIterator.atEnd());
1549         m_textIterator.advance();
1550     }
1551     m_didLookAhead = false;
1552 
1553     // Go to next non-empty chunk
1554     while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1555         m_textIterator.advance();
1556     m_range = m_textIterator.range();
1557 
1558     if (m_textIterator.atEnd())
1559         return;
1560 
1561     while (1) {
1562         // If this chunk ends in whitespace we can just use it as our chunk.
1563         if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1564             return;
1565 
1566         // If this is the first chunk that failed, save it in previousText before look ahead
1567         if (m_buffer.isEmpty()) {
1568             m_previousText = m_textIterator.characters();
1569             m_previousLength = m_textIterator.length();
1570         }
1571 
1572         // Look ahead to next chunk.  If it is whitespace or a break, we can use the previous stuff
1573         m_textIterator.advance();
1574         if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1575             m_didLookAhead = true;
1576             return;
1577         }
1578 
1579         if (m_buffer.isEmpty()) {
1580             // Start gobbling chunks until we get to a suitable stopping point
1581             m_buffer.append(m_previousText, m_previousLength);
1582             m_previousText = 0;
1583         }
1584         m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1585         int exception = 0;
1586         m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1587     }
1588 }
1589 
length() const1590 int WordAwareIterator::length() const
1591 {
1592     if (!m_buffer.isEmpty())
1593         return m_buffer.size();
1594     if (m_previousText)
1595         return m_previousLength;
1596     return m_textIterator.length();
1597 }
1598 
characters() const1599 const UChar* WordAwareIterator::characters() const
1600 {
1601     if (!m_buffer.isEmpty())
1602         return m_buffer.data();
1603     if (m_previousText)
1604         return m_previousText;
1605     return m_textIterator.characters();
1606 }
1607 
1608 // --------
1609 
foldQuoteMarkOrSoftHyphen(UChar c)1610 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
1611 {
1612     switch (c) {
1613         case hebrewPunctuationGershayim:
1614         case leftDoubleQuotationMark:
1615         case rightDoubleQuotationMark:
1616             return '"';
1617         case hebrewPunctuationGeresh:
1618         case leftSingleQuotationMark:
1619         case rightSingleQuotationMark:
1620             return '\'';
1621         case softHyphen:
1622             // Replace soft hyphen with an ignorable character so that their presence or absence will
1623             // not affect string comparison.
1624             return 0;
1625         default:
1626             return c;
1627     }
1628 }
1629 
foldQuoteMarksAndSoftHyphens(String & s)1630 static inline void foldQuoteMarksAndSoftHyphens(String& s)
1631 {
1632     s.replace(hebrewPunctuationGeresh, '\'');
1633     s.replace(hebrewPunctuationGershayim, '"');
1634     s.replace(leftDoubleQuotationMark, '"');
1635     s.replace(leftSingleQuotationMark, '\'');
1636     s.replace(rightDoubleQuotationMark, '"');
1637     s.replace(rightSingleQuotationMark, '\'');
1638     // Replace soft hyphen with an ignorable character so that their presence or absence will
1639     // not affect string comparison.
1640     s.replace(softHyphen, 0);
1641 }
1642 
1643 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1644 
foldQuoteMarksAndSoftHyphens(UChar * data,size_t length)1645 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
1646 {
1647     for (size_t i = 0; i < length; ++i)
1648         data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
1649 }
1650 
1651 static const size_t minimumSearchBufferSize = 8192;
1652 
1653 #ifndef NDEBUG
1654 static bool searcherInUse;
1655 #endif
1656 
createSearcher()1657 static UStringSearch* createSearcher()
1658 {
1659     // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1660     // but it doesn't matter exactly what it is, since we don't perform any searches
1661     // without setting both the pattern and the text.
1662     UErrorCode status = U_ZERO_ERROR;
1663     UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
1664     ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1665     return searcher;
1666 }
1667 
searcher()1668 static UStringSearch* searcher()
1669 {
1670     static UStringSearch* searcher = createSearcher();
1671     return searcher;
1672 }
1673 
lockSearcher()1674 static inline void lockSearcher()
1675 {
1676 #ifndef NDEBUG
1677     ASSERT(!searcherInUse);
1678     searcherInUse = true;
1679 #endif
1680 }
1681 
unlockSearcher()1682 static inline void unlockSearcher()
1683 {
1684 #ifndef NDEBUG
1685     ASSERT(searcherInUse);
1686     searcherInUse = false;
1687 #endif
1688 }
1689 
1690 // ICU's search ignores the distinction between small kana letters and ones
1691 // that are not small, and also characters that differ only in the voicing
1692 // marks when considering only primary collation strength diffrences.
1693 // This is not helpful for end users, since these differences make words
1694 // distinct, so for our purposes we need these to be considered.
1695 // The Unicode folks do not think the collation algorithm should be
1696 // changed. To work around this, we would like to tailor the ICU searcher,
1697 // but we can't get that to work yet. So instead, we check for cases where
1698 // these differences occur, and skip those matches.
1699 
1700 // We refer to the above technique as the "kana workaround". The next few
1701 // functions are helper functinos for the kana workaround.
1702 
isKanaLetter(UChar character)1703 static inline bool isKanaLetter(UChar character)
1704 {
1705     // Hiragana letters.
1706     if (character >= 0x3041 && character <= 0x3096)
1707         return true;
1708 
1709     // Katakana letters.
1710     if (character >= 0x30A1 && character <= 0x30FA)
1711         return true;
1712     if (character >= 0x31F0 && character <= 0x31FF)
1713         return true;
1714 
1715     // Halfwidth katakana letters.
1716     if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1717         return true;
1718 
1719     return false;
1720 }
1721 
isSmallKanaLetter(UChar character)1722 static inline bool isSmallKanaLetter(UChar character)
1723 {
1724     ASSERT(isKanaLetter(character));
1725 
1726     switch (character) {
1727     case 0x3041: // HIRAGANA LETTER SMALL A
1728     case 0x3043: // HIRAGANA LETTER SMALL I
1729     case 0x3045: // HIRAGANA LETTER SMALL U
1730     case 0x3047: // HIRAGANA LETTER SMALL E
1731     case 0x3049: // HIRAGANA LETTER SMALL O
1732     case 0x3063: // HIRAGANA LETTER SMALL TU
1733     case 0x3083: // HIRAGANA LETTER SMALL YA
1734     case 0x3085: // HIRAGANA LETTER SMALL YU
1735     case 0x3087: // HIRAGANA LETTER SMALL YO
1736     case 0x308E: // HIRAGANA LETTER SMALL WA
1737     case 0x3095: // HIRAGANA LETTER SMALL KA
1738     case 0x3096: // HIRAGANA LETTER SMALL KE
1739     case 0x30A1: // KATAKANA LETTER SMALL A
1740     case 0x30A3: // KATAKANA LETTER SMALL I
1741     case 0x30A5: // KATAKANA LETTER SMALL U
1742     case 0x30A7: // KATAKANA LETTER SMALL E
1743     case 0x30A9: // KATAKANA LETTER SMALL O
1744     case 0x30C3: // KATAKANA LETTER SMALL TU
1745     case 0x30E3: // KATAKANA LETTER SMALL YA
1746     case 0x30E5: // KATAKANA LETTER SMALL YU
1747     case 0x30E7: // KATAKANA LETTER SMALL YO
1748     case 0x30EE: // KATAKANA LETTER SMALL WA
1749     case 0x30F5: // KATAKANA LETTER SMALL KA
1750     case 0x30F6: // KATAKANA LETTER SMALL KE
1751     case 0x31F0: // KATAKANA LETTER SMALL KU
1752     case 0x31F1: // KATAKANA LETTER SMALL SI
1753     case 0x31F2: // KATAKANA LETTER SMALL SU
1754     case 0x31F3: // KATAKANA LETTER SMALL TO
1755     case 0x31F4: // KATAKANA LETTER SMALL NU
1756     case 0x31F5: // KATAKANA LETTER SMALL HA
1757     case 0x31F6: // KATAKANA LETTER SMALL HI
1758     case 0x31F7: // KATAKANA LETTER SMALL HU
1759     case 0x31F8: // KATAKANA LETTER SMALL HE
1760     case 0x31F9: // KATAKANA LETTER SMALL HO
1761     case 0x31FA: // KATAKANA LETTER SMALL MU
1762     case 0x31FB: // KATAKANA LETTER SMALL RA
1763     case 0x31FC: // KATAKANA LETTER SMALL RI
1764     case 0x31FD: // KATAKANA LETTER SMALL RU
1765     case 0x31FE: // KATAKANA LETTER SMALL RE
1766     case 0x31FF: // KATAKANA LETTER SMALL RO
1767     case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1768     case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1769     case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1770     case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1771     case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1772     case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1773     case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1774     case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1775     case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1776         return true;
1777     }
1778     return false;
1779 }
1780 
1781 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1782 
composedVoicedSoundMark(UChar character)1783 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1784 {
1785     ASSERT(isKanaLetter(character));
1786 
1787     switch (character) {
1788     case 0x304C: // HIRAGANA LETTER GA
1789     case 0x304E: // HIRAGANA LETTER GI
1790     case 0x3050: // HIRAGANA LETTER GU
1791     case 0x3052: // HIRAGANA LETTER GE
1792     case 0x3054: // HIRAGANA LETTER GO
1793     case 0x3056: // HIRAGANA LETTER ZA
1794     case 0x3058: // HIRAGANA LETTER ZI
1795     case 0x305A: // HIRAGANA LETTER ZU
1796     case 0x305C: // HIRAGANA LETTER ZE
1797     case 0x305E: // HIRAGANA LETTER ZO
1798     case 0x3060: // HIRAGANA LETTER DA
1799     case 0x3062: // HIRAGANA LETTER DI
1800     case 0x3065: // HIRAGANA LETTER DU
1801     case 0x3067: // HIRAGANA LETTER DE
1802     case 0x3069: // HIRAGANA LETTER DO
1803     case 0x3070: // HIRAGANA LETTER BA
1804     case 0x3073: // HIRAGANA LETTER BI
1805     case 0x3076: // HIRAGANA LETTER BU
1806     case 0x3079: // HIRAGANA LETTER BE
1807     case 0x307C: // HIRAGANA LETTER BO
1808     case 0x3094: // HIRAGANA LETTER VU
1809     case 0x30AC: // KATAKANA LETTER GA
1810     case 0x30AE: // KATAKANA LETTER GI
1811     case 0x30B0: // KATAKANA LETTER GU
1812     case 0x30B2: // KATAKANA LETTER GE
1813     case 0x30B4: // KATAKANA LETTER GO
1814     case 0x30B6: // KATAKANA LETTER ZA
1815     case 0x30B8: // KATAKANA LETTER ZI
1816     case 0x30BA: // KATAKANA LETTER ZU
1817     case 0x30BC: // KATAKANA LETTER ZE
1818     case 0x30BE: // KATAKANA LETTER ZO
1819     case 0x30C0: // KATAKANA LETTER DA
1820     case 0x30C2: // KATAKANA LETTER DI
1821     case 0x30C5: // KATAKANA LETTER DU
1822     case 0x30C7: // KATAKANA LETTER DE
1823     case 0x30C9: // KATAKANA LETTER DO
1824     case 0x30D0: // KATAKANA LETTER BA
1825     case 0x30D3: // KATAKANA LETTER BI
1826     case 0x30D6: // KATAKANA LETTER BU
1827     case 0x30D9: // KATAKANA LETTER BE
1828     case 0x30DC: // KATAKANA LETTER BO
1829     case 0x30F4: // KATAKANA LETTER VU
1830     case 0x30F7: // KATAKANA LETTER VA
1831     case 0x30F8: // KATAKANA LETTER VI
1832     case 0x30F9: // KATAKANA LETTER VE
1833     case 0x30FA: // KATAKANA LETTER VO
1834         return VoicedSoundMark;
1835     case 0x3071: // HIRAGANA LETTER PA
1836     case 0x3074: // HIRAGANA LETTER PI
1837     case 0x3077: // HIRAGANA LETTER PU
1838     case 0x307A: // HIRAGANA LETTER PE
1839     case 0x307D: // HIRAGANA LETTER PO
1840     case 0x30D1: // KATAKANA LETTER PA
1841     case 0x30D4: // KATAKANA LETTER PI
1842     case 0x30D7: // KATAKANA LETTER PU
1843     case 0x30DA: // KATAKANA LETTER PE
1844     case 0x30DD: // KATAKANA LETTER PO
1845         return SemiVoicedSoundMark;
1846     }
1847     return NoVoicedSoundMark;
1848 }
1849 
isCombiningVoicedSoundMark(UChar character)1850 static inline bool isCombiningVoicedSoundMark(UChar character)
1851 {
1852     switch (character) {
1853     case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1854     case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1855         return true;
1856     }
1857     return false;
1858 }
1859 
containsKanaLetters(const String & pattern)1860 static inline bool containsKanaLetters(const String& pattern)
1861 {
1862     const UChar* characters = pattern.characters();
1863     unsigned length = pattern.length();
1864     for (unsigned i = 0; i < length; ++i) {
1865         if (isKanaLetter(characters[i]))
1866             return true;
1867     }
1868     return false;
1869 }
1870 
normalizeCharacters(const UChar * characters,unsigned length,Vector<UChar> & buffer)1871 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1872 {
1873     ASSERT(length);
1874 
1875     buffer.resize(length);
1876 
1877     UErrorCode status = U_ZERO_ERROR;
1878     size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1879     ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1880     ASSERT(bufferSize);
1881 
1882     buffer.resize(bufferSize);
1883 
1884     if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1885         return;
1886 
1887     status = U_ZERO_ERROR;
1888     unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1889     ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1890 }
1891 
isNonLatin1Separator(UChar32 character)1892 static bool isNonLatin1Separator(UChar32 character)
1893 {
1894     ASSERT_ARG(character, character >= 256);
1895 
1896     return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1897 }
1898 
isSeparator(UChar32 character)1899 static inline bool isSeparator(UChar32 character)
1900 {
1901     static const bool latin1SeparatorTable[256] = {
1902         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1903         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1904         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1905         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, //                         : ; < = > ?
1906         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   @
1907         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, //                         [ \ ] ^ _
1908         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //   `
1909         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, //                           { | } ~
1910         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1911         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1912         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1913         1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1914         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1915         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1916         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1917         0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1918     };
1919 
1920     if (character < 256)
1921         return latin1SeparatorTable[character];
1922 
1923     return isNonLatin1Separator(character);
1924 }
1925 
SearchBuffer(const String & target,FindOptions options)1926 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1927     : m_target(target)
1928     , m_options(options)
1929     , m_prefixLength(0)
1930     , m_atBreak(true)
1931     , m_needsMoreContext(options & AtWordStarts)
1932     , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1933 {
1934     ASSERT(!m_target.isEmpty());
1935 
1936     // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1937     // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1938     // to add tailoring on top of the locale-specific tailoring as of this writing.
1939     foldQuoteMarksAndSoftHyphens(m_target);
1940 
1941     size_t targetLength = m_target.length();
1942     m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
1943     m_overlap = m_buffer.capacity() / 4;
1944 
1945     if ((m_options & AtWordStarts) && targetLength) {
1946         UChar32 targetFirstCharacter;
1947         U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
1948         // Characters in the separator category never really occur at the beginning of a word,
1949         // so if the target begins with such a character, we just ignore the AtWordStart option.
1950         if (isSeparator(targetFirstCharacter)) {
1951             m_options &= ~AtWordStarts;
1952             m_needsMoreContext = false;
1953         }
1954     }
1955 
1956     // Grab the single global searcher.
1957     // If we ever have a reason to do more than once search buffer at once, we'll have
1958     // to move to multiple searchers.
1959     lockSearcher();
1960 
1961     UStringSearch* searcher = WebCore::searcher();
1962     UCollator* collator = usearch_getCollator(searcher);
1963 
1964     UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
1965     if (ucol_getStrength(collator) != strength) {
1966         ucol_setStrength(collator, strength);
1967         usearch_reset(searcher);
1968     }
1969 
1970     UErrorCode status = U_ZERO_ERROR;
1971     usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1972     ASSERT(status == U_ZERO_ERROR);
1973 
1974     // The kana workaround requires a normalized copy of the target string.
1975     if (m_targetRequiresKanaWorkaround)
1976         normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
1977 }
1978 
~SearchBuffer()1979 inline SearchBuffer::~SearchBuffer()
1980 {
1981     // Leave the static object pointing to a valid string.
1982     UErrorCode status = U_ZERO_ERROR;
1983     usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
1984     ASSERT(status == U_ZERO_ERROR);
1985 
1986     unlockSearcher();
1987 }
1988 
append(const UChar * characters,size_t length)1989 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
1990 {
1991     ASSERT(length);
1992 
1993     if (m_atBreak) {
1994         m_buffer.shrink(0);
1995         m_prefixLength = 0;
1996         m_atBreak = false;
1997     } else if (m_buffer.size() == m_buffer.capacity()) {
1998         memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
1999         m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
2000         m_buffer.shrink(m_overlap);
2001     }
2002 
2003     size_t oldLength = m_buffer.size();
2004     size_t usableLength = min(m_buffer.capacity() - oldLength, length);
2005     ASSERT(usableLength);
2006     m_buffer.append(characters, usableLength);
2007     foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
2008     return usableLength;
2009 }
2010 
needsMoreContext() const2011 inline bool SearchBuffer::needsMoreContext() const
2012 {
2013     return m_needsMoreContext;
2014 }
2015 
prependContext(const UChar * characters,size_t length)2016 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
2017 {
2018     ASSERT(m_needsMoreContext);
2019     ASSERT(m_prefixLength == m_buffer.size());
2020 
2021     if (!length)
2022         return;
2023 
2024     m_atBreak = false;
2025 
2026     size_t wordBoundaryContextStart = length;
2027     if (wordBoundaryContextStart) {
2028         U16_BACK_1(characters, 0, wordBoundaryContextStart);
2029         wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
2030     }
2031 
2032     size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
2033     m_buffer.prepend(characters + length - usableLength, usableLength);
2034     m_prefixLength += usableLength;
2035 
2036     if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
2037         m_needsMoreContext = false;
2038 }
2039 
atBreak() const2040 inline bool SearchBuffer::atBreak() const
2041 {
2042     return m_atBreak;
2043 }
2044 
reachedBreak()2045 inline void SearchBuffer::reachedBreak()
2046 {
2047     m_atBreak = true;
2048 }
2049 
isBadMatch(const UChar * match,size_t matchLength) const2050 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
2051 {
2052     // This function implements the kana workaround. If usearch treats
2053     // it as a match, but we do not want to, then it's a "bad match".
2054     if (!m_targetRequiresKanaWorkaround)
2055         return false;
2056 
2057     // Normalize into a match buffer. We reuse a single buffer rather than
2058     // creating a new one each time.
2059     normalizeCharacters(match, matchLength, m_normalizedMatch);
2060 
2061     const UChar* a = m_normalizedTarget.begin();
2062     const UChar* aEnd = m_normalizedTarget.end();
2063 
2064     const UChar* b = m_normalizedMatch.begin();
2065     const UChar* bEnd = m_normalizedMatch.end();
2066 
2067     while (true) {
2068         // Skip runs of non-kana-letter characters. This is necessary so we can
2069         // correctly handle strings where the target and match have different-length
2070         // runs of characters that match, while still double checking the correctness
2071         // of matches of kana letters with other kana letters.
2072         while (a != aEnd && !isKanaLetter(*a))
2073             ++a;
2074         while (b != bEnd && !isKanaLetter(*b))
2075             ++b;
2076 
2077         // If we reached the end of either the target or the match, we should have
2078         // reached the end of both; both should have the same number of kana letters.
2079         if (a == aEnd || b == bEnd) {
2080             ASSERT(a == aEnd);
2081             ASSERT(b == bEnd);
2082             return false;
2083         }
2084 
2085         // Check for differences in the kana letter character itself.
2086         if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2087             return true;
2088         if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2089             return true;
2090         ++a;
2091         ++b;
2092 
2093         // Check for differences in combining voiced sound marks found after the letter.
2094         while (1) {
2095             if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2096                 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2097                     return true;
2098                 break;
2099             }
2100             if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2101                 return true;
2102             if (*a != *b)
2103                 return true;
2104             ++a;
2105             ++b;
2106         }
2107     }
2108 }
2109 
isWordStartMatch(size_t start,size_t length) const2110 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2111 {
2112     ASSERT(m_options & AtWordStarts);
2113 
2114     if (!start)
2115         return true;
2116 
2117     if (m_options & TreatMedialCapitalAsWordStart) {
2118         int size = m_buffer.size();
2119         int offset = start;
2120         UChar32 firstCharacter;
2121         U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2122         UChar32 previousCharacter;
2123         U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2124 
2125         if (isSeparator(firstCharacter)) {
2126             // The start of a separator run is a word start (".org" in "webkit.org").
2127             if (!isSeparator(previousCharacter))
2128                 return true;
2129         } else if (isASCIIUpper(firstCharacter)) {
2130             // The start of an uppercase run is a word start ("Kit" in "WebKit").
2131             if (!isASCIIUpper(previousCharacter))
2132                 return true;
2133             // The last character of an uppercase run followed by a non-separator, non-digit
2134             // is a word start ("Request" in "XMLHTTPRequest").
2135             offset = start;
2136             U16_FWD_1(m_buffer.data(), offset, size);
2137             UChar32 nextCharacter = 0;
2138             if (offset < size)
2139                 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2140             if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2141                 return true;
2142         } else if (isASCIIDigit(firstCharacter)) {
2143             // The start of a digit run is a word start ("2" in "WebKit2").
2144             if (!isASCIIDigit(previousCharacter))
2145                 return true;
2146         } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2147             // The start of a non-separator, non-uppercase, non-digit run is a word start,
2148             // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2149             return true;
2150         }
2151     }
2152 
2153     size_t wordBreakSearchStart = start + length;
2154     while (wordBreakSearchStart > start)
2155         wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
2156     return wordBreakSearchStart == start;
2157 }
2158 
search(size_t & start)2159 inline size_t SearchBuffer::search(size_t& start)
2160 {
2161     size_t size = m_buffer.size();
2162     if (m_atBreak) {
2163         if (!size)
2164             return 0;
2165     } else {
2166         if (size != m_buffer.capacity())
2167             return 0;
2168     }
2169 
2170     UStringSearch* searcher = WebCore::searcher();
2171 
2172     UErrorCode status = U_ZERO_ERROR;
2173     usearch_setText(searcher, m_buffer.data(), size, &status);
2174     ASSERT(status == U_ZERO_ERROR);
2175 
2176     usearch_setOffset(searcher, m_prefixLength, &status);
2177     ASSERT(status == U_ZERO_ERROR);
2178 
2179     int matchStart = usearch_next(searcher, &status);
2180     ASSERT(status == U_ZERO_ERROR);
2181 
2182 nextMatch:
2183     if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2184         ASSERT(matchStart == USEARCH_DONE);
2185         return 0;
2186     }
2187 
2188     // Matches that start in the overlap area are only tentative.
2189     // The same match may appear later, matching more characters,
2190     // possibly including a combining character that's not yet in the buffer.
2191     if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2192         size_t overlap = m_overlap;
2193         if (m_options & AtWordStarts) {
2194             // Ensure that there is sufficient context before matchStart the next time around for
2195             // determining if it is at a word boundary.
2196             int wordBoundaryContextStart = matchStart;
2197             U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2198             wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
2199             overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
2200         }
2201         memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2202         m_prefixLength -= min(m_prefixLength, size - overlap);
2203         m_buffer.shrink(overlap);
2204         return 0;
2205     }
2206 
2207     size_t matchedLength = usearch_getMatchedLength(searcher);
2208     ASSERT(matchStart + matchedLength <= size);
2209 
2210     // If this match is "bad", move on to the next match.
2211     if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
2212         matchStart = usearch_next(searcher, &status);
2213         ASSERT(status == U_ZERO_ERROR);
2214         goto nextMatch;
2215     }
2216 
2217     size_t newSize = size - (matchStart + 1);
2218     memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2219     m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
2220     m_buffer.shrink(newSize);
2221 
2222     start = size - matchStart;
2223     return matchedLength;
2224 }
2225 
2226 #else // !ICU_UNICODE
2227 
SearchBuffer(const String & target,FindOptions options)2228 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2229     : m_target(options & CaseInsensitive ? target.foldCase() : target)
2230     , m_options(options)
2231     , m_buffer(m_target.length())
2232     , m_isCharacterStartBuffer(m_target.length())
2233     , m_isBufferFull(false)
2234     , m_cursor(0)
2235 {
2236     ASSERT(!m_target.isEmpty());
2237     m_target.replace(noBreakSpace, ' ');
2238     foldQuoteMarksAndSoftHyphens(m_target);
2239 }
2240 
~SearchBuffer()2241 inline SearchBuffer::~SearchBuffer()
2242 {
2243 }
2244 
reachedBreak()2245 inline void SearchBuffer::reachedBreak()
2246 {
2247     m_cursor = 0;
2248     m_isBufferFull = false;
2249 }
2250 
atBreak() const2251 inline bool SearchBuffer::atBreak() const
2252 {
2253     return !m_cursor && !m_isBufferFull;
2254 }
2255 
append(UChar c,bool isStart)2256 inline void SearchBuffer::append(UChar c, bool isStart)
2257 {
2258     m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
2259     m_isCharacterStartBuffer[m_cursor] = isStart;
2260     if (++m_cursor == m_target.length()) {
2261         m_cursor = 0;
2262         m_isBufferFull = true;
2263     }
2264 }
2265 
append(const UChar * characters,size_t length)2266 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2267 {
2268     ASSERT(length);
2269     if (!(m_options & CaseInsensitive)) {
2270         append(characters[0], true);
2271         return 1;
2272     }
2273     const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2274     UChar foldedCharacters[maxFoldedCharacters];
2275     bool error;
2276     int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
2277     ASSERT(!error);
2278     ASSERT(numFoldedCharacters);
2279     ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2280     if (!error && numFoldedCharacters) {
2281         numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
2282         append(foldedCharacters[0], true);
2283         for (int i = 1; i < numFoldedCharacters; ++i)
2284             append(foldedCharacters[i], false);
2285     }
2286     return 1;
2287 }
2288 
needsMoreContext() const2289 inline bool SearchBuffer::needsMoreContext() const
2290 {
2291     return false;
2292 }
2293 
prependContext(const UChar *,size_t)2294 void SearchBuffer::prependContext(const UChar*, size_t)
2295 {
2296     ASSERT_NOT_REACHED();
2297 }
2298 
search(size_t & start)2299 inline size_t SearchBuffer::search(size_t& start)
2300 {
2301     if (!m_isBufferFull)
2302         return 0;
2303     if (!m_isCharacterStartBuffer[m_cursor])
2304         return 0;
2305 
2306     size_t tailSpace = m_target.length() - m_cursor;
2307     if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2308         return 0;
2309     if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2310         return 0;
2311 
2312     start = length();
2313 
2314     // Now that we've found a match once, we don't want to find it again, because those
2315     // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2316     // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2317     // we want to get rid of that in the future we could track this with a separate boolean
2318     // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2319     m_isCharacterStartBuffer[m_cursor] = false;
2320 
2321     return start;
2322 }
2323 
2324 // Returns the number of characters that were appended to the buffer (what we are searching in).
2325 // That's not necessarily the same length as the passed-in target string, because case folding
2326 // can make two strings match even though they're not the same length.
length() const2327 size_t SearchBuffer::length() const
2328 {
2329     size_t bufferSize = m_target.length();
2330     size_t length = 0;
2331     for (size_t i = 0; i < bufferSize; ++i)
2332         length += m_isCharacterStartBuffer[i];
2333     return length;
2334 }
2335 
2336 #endif // !ICU_UNICODE
2337 
2338 // --------
2339 
rangeLength(const Range * r,bool forSelectionPreservation)2340 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
2341 {
2342     int length = 0;
2343     for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2344         length += it.length();
2345 
2346     return length;
2347 }
2348 
subrange(Range * entireRange,int characterOffset,int characterCount)2349 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2350 {
2351     CharacterIterator entireRangeIterator(entireRange);
2352     return characterSubrange(entireRangeIterator, characterOffset, characterCount);
2353 }
2354 
rangeFromLocationAndLength(Element * scope,int rangeLocation,int rangeLength,bool forSelectionPreservation)2355 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2356 {
2357     RefPtr<Range> resultRange = scope->document()->createRange();
2358 
2359     int docTextPosition = 0;
2360     int rangeEnd = rangeLocation + rangeLength;
2361     bool startRangeFound = false;
2362 
2363     RefPtr<Range> textRunRange;
2364 
2365     TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2366 
2367     // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2368     if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
2369         textRunRange = it.range();
2370 
2371         ExceptionCode ec = 0;
2372         resultRange->setStart(textRunRange->startContainer(), 0, ec);
2373         ASSERT(!ec);
2374         resultRange->setEnd(textRunRange->startContainer(), 0, ec);
2375         ASSERT(!ec);
2376 
2377         return resultRange.release();
2378     }
2379 
2380     for (; !it.atEnd(); it.advance()) {
2381         int len = it.length();
2382         textRunRange = it.range();
2383 
2384         bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
2385         bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
2386 
2387         // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
2388         // in those cases that textRunRange is used.
2389         if (foundEnd) {
2390             // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2391             // position for emitted '\n's.
2392             if (len == 1 && it.characters()[0] == '\n') {
2393                 scope->document()->updateLayoutIgnorePendingStylesheets();
2394                 it.advance();
2395                 if (!it.atEnd()) {
2396                     RefPtr<Range> range = it.range();
2397                     ExceptionCode ec = 0;
2398                     textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
2399                     ASSERT(!ec);
2400                 } else {
2401                     Position runStart = textRunRange->startPosition();
2402                     Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2403                     if (runEnd.isNotNull()) {
2404                         ExceptionCode ec = 0;
2405                         textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
2406                         ASSERT(!ec);
2407                     }
2408                 }
2409             }
2410         }
2411 
2412         if (foundStart) {
2413             startRangeFound = true;
2414             int exception = 0;
2415             if (textRunRange->startContainer()->isTextNode()) {
2416                 int offset = rangeLocation - docTextPosition;
2417                 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2418             } else {
2419                 if (rangeLocation == docTextPosition)
2420                     resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2421                 else
2422                     resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2423             }
2424         }
2425 
2426         if (foundEnd) {
2427             int exception = 0;
2428             if (textRunRange->startContainer()->isTextNode()) {
2429                 int offset = rangeEnd - docTextPosition;
2430                 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2431             } else {
2432                 if (rangeEnd == docTextPosition)
2433                     resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2434                 else
2435                     resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2436             }
2437             docTextPosition += len;
2438             break;
2439         }
2440         docTextPosition += len;
2441     }
2442 
2443     if (!startRangeFound)
2444         return 0;
2445 
2446     if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
2447         int exception = 0;
2448         resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2449     }
2450 
2451     return resultRange.release();
2452 }
2453 
locationAndLengthFromRange(const Range * range,size_t & location,size_t & length)2454 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
2455 {
2456     location = notFound;
2457     length = 0;
2458 
2459     if (!range->startContainer())
2460         return false;
2461 
2462     Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
2463     Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
2464 
2465     // The critical assumption is that this only gets called with ranges that
2466     // concentrate on a given area containing the selection root. This is done
2467     // because of text fields and textareas. The DOM for those is not
2468     // directly in the document DOM, so ensure that the range does not cross a
2469     // boundary of one of those.
2470     if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
2471         return false;
2472     if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
2473         return false;
2474 
2475     RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
2476     ASSERT(testRange->startContainer() == scope);
2477     location = TextIterator::rangeLength(testRange.get());
2478 
2479     ExceptionCode ec;
2480     testRange->setEnd(range->endContainer(), range->endOffset(), ec);
2481     ASSERT(testRange->startContainer() == scope);
2482     length = TextIterator::rangeLength(testRange.get()) - location;
2483     return true;
2484 }
2485 
2486 // --------
2487 
plainTextToMallocAllocatedBuffer(const Range * r,unsigned & bufferLength,bool isDisplayString,TextIteratorBehavior defaultBehavior)2488 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
2489 {
2490     UChar* result = 0;
2491 
2492     // Do this in pieces to avoid massive reallocations if there is a large amount of text.
2493     // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
2494     static const unsigned cMaxSegmentSize = 1 << 16;
2495     bufferLength = 0;
2496     typedef pair<UChar*, unsigned> TextSegment;
2497     OwnPtr<Vector<TextSegment> > textSegments;
2498     Vector<UChar> textBuffer;
2499     textBuffer.reserveInitialCapacity(cMaxSegmentSize);
2500     TextIteratorBehavior behavior = defaultBehavior;
2501     if (!isDisplayString)
2502         behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2503 
2504     for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2505         if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
2506             UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
2507             if (!newSegmentBuffer)
2508                 goto exit;
2509             memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2510             if (!textSegments)
2511                 textSegments = adoptPtr(new Vector<TextSegment>);
2512             textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
2513             textBuffer.clear();
2514         }
2515         textBuffer.append(it.characters(), it.length());
2516         bufferLength += it.length();
2517     }
2518 
2519     if (!bufferLength)
2520         return 0;
2521 
2522     // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
2523     result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
2524     if (!result)
2525         goto exit;
2526 
2527     {
2528         UChar* resultPos = result;
2529         if (textSegments) {
2530             unsigned size = textSegments->size();
2531             for (unsigned i = 0; i < size; ++i) {
2532                 const TextSegment& segment = textSegments->at(i);
2533                 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
2534                 resultPos += segment.second;
2535             }
2536         }
2537         memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2538     }
2539 
2540 exit:
2541     if (textSegments) {
2542         unsigned size = textSegments->size();
2543         for (unsigned i = 0; i < size; ++i)
2544             free(textSegments->at(i).first);
2545     }
2546 
2547     if (isDisplayString && r->ownerDocument())
2548         r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
2549 
2550     return result;
2551 }
2552 
plainText(const Range * r,TextIteratorBehavior defaultBehavior)2553 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
2554 {
2555     unsigned length;
2556     UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
2557     if (!buf)
2558         return "";
2559     String result(buf, length);
2560     free(buf);
2561     return result;
2562 }
2563 
isAllCollapsibleWhitespace(const String & string)2564 static inline bool isAllCollapsibleWhitespace(const String& string)
2565 {
2566     const UChar* characters = string.characters();
2567     unsigned length = string.length();
2568     for (unsigned i = 0; i < length; ++i) {
2569         if (!isCollapsibleWhitespace(characters[i]))
2570             return false;
2571     }
2572     return true;
2573 }
2574 
collapsedToBoundary(const Range * range,bool forward)2575 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
2576 {
2577     ExceptionCode ec = 0;
2578     RefPtr<Range> result = range->cloneRange(ec);
2579     ASSERT(!ec);
2580     result->collapse(!forward, ec);
2581     ASSERT(!ec);
2582     return result.release();
2583 }
2584 
findPlainText(CharacterIterator & it,const String & target,FindOptions options,size_t & matchStart)2585 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
2586 {
2587     matchStart = 0;
2588     size_t matchLength = 0;
2589 
2590     SearchBuffer buffer(target, options);
2591 
2592     if (buffer.needsMoreContext()) {
2593         RefPtr<Range> startRange = it.range();
2594         RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
2595         ExceptionCode ec = 0;
2596         beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
2597         for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2598             buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
2599             if (!buffer.needsMoreContext())
2600                 break;
2601         }
2602     }
2603 
2604     while (!it.atEnd()) {
2605         it.advance(buffer.append(it.characters(), it.length()));
2606 tryAgain:
2607         size_t matchStartOffset;
2608         if (size_t newMatchLength = buffer.search(matchStartOffset)) {
2609             // Note that we found a match, and where we found it.
2610             size_t lastCharacterInBufferOffset = it.characterOffset();
2611             ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2612             matchStart = lastCharacterInBufferOffset - matchStartOffset;
2613             matchLength = newMatchLength;
2614             // If searching forward, stop on the first match.
2615             // If searching backward, don't stop, so we end up with the last match.
2616             if (!(options & Backwards))
2617                 break;
2618             goto tryAgain;
2619         }
2620         if (it.atBreak() && !buffer.atBreak()) {
2621             buffer.reachedBreak();
2622             goto tryAgain;
2623         }
2624     }
2625 
2626     return matchLength;
2627 }
2628 
findPlainText(const Range * range,const String & target,FindOptions options)2629 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
2630 {
2631     // First, find the text.
2632     size_t matchStart;
2633     size_t matchLength;
2634     {
2635         CharacterIterator findIterator(range, TextIteratorEntersTextControls);
2636         matchLength = findPlainText(findIterator, target, options, matchStart);
2637         if (!matchLength)
2638             return collapsedToBoundary(range, !(options & Backwards));
2639     }
2640 
2641     // Then, find the document position of the start and the end of the text.
2642     CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
2643     return characterSubrange(computeRangeIterator, matchStart, matchLength);
2644 }
2645 
2646 }
2647