• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3  * Copyright (C) 2005 Alexey Proskuryakov.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  */
26 
27 #include "config.h"
28 #include "TextIterator.h"
29 
30 #include "CharacterNames.h"
31 #include "Document.h"
32 #include "Element.h"
33 #include "HTMLNames.h"
34 #include "htmlediting.h"
35 #include "InlineTextBox.h"
36 #include "Position.h"
37 #include "Range.h"
38 #include "RenderTableCell.h"
39 #include "RenderTableRow.h"
40 #include "RenderTextControl.h"
41 #include "VisiblePosition.h"
42 #include "visible_units.h"
43 
44 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
45 #include <unicode/usearch.h>
46 #endif
47 
48 using namespace WTF::Unicode;
49 using namespace std;
50 
51 namespace WebCore {
52 
53 using namespace HTMLNames;
54 
55 // Buffer that knows how to compare with a search target.
56 // Keeps enough of the previous text to be able to search in the future, but no more.
57 // Non-breaking spaces are always equal to normal spaces.
58 // Case folding is also done if <isCaseSensitive> is false.
59 class SearchBuffer : Noncopyable {
60 public:
61     SearchBuffer(const String& target, bool isCaseSensitive);
62     ~SearchBuffer();
63 
64     // Returns number of characters appended; guaranteed to be in the range [1, length].
65     size_t append(const UChar*, size_t length);
66     void reachedBreak();
67 
68     // Result is the size in characters of what was found.
69     // And <startOffset> is the number of characters back to the start of what was found.
70     size_t search(size_t& startOffset);
71     bool atBreak() const;
72 
73 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
74 
75 private:
76     String m_target;
77     Vector<UChar> m_buffer;
78     size_t m_overlap;
79     bool m_atBreak;
80 
81 #else
82 
83 private:
84     void append(UChar, bool isCharacterStart);
85     size_t length() const;
86 
87     String m_target;
88     bool m_isCaseSensitive;
89 
90     Vector<UChar> m_buffer;
91     Vector<bool> m_isCharacterStartBuffer;
92     bool m_isBufferFull;
93     size_t m_cursor;
94 
95 #endif
96 };
97 
98 // --------
99 
TextIterator()100 TextIterator::TextIterator()
101     : m_startContainer(0)
102     , m_startOffset(0)
103     , m_endContainer(0)
104     , m_endOffset(0)
105     , m_positionNode(0)
106     , m_lastCharacter(0)
107     , m_emitCharactersBetweenAllVisiblePositions(false)
108     , m_enterTextControls(false)
109 {
110 }
111 
TextIterator(const Range * r,bool emitCharactersBetweenAllVisiblePositions,bool enterTextControls)112 TextIterator::TextIterator(const Range* r, bool emitCharactersBetweenAllVisiblePositions, bool enterTextControls)
113     : m_inShadowContent(false)
114     , m_startContainer(0)
115     , m_startOffset(0)
116     , m_endContainer(0)
117     , m_endOffset(0)
118     , m_positionNode(0)
119     , m_emitCharactersBetweenAllVisiblePositions(emitCharactersBetweenAllVisiblePositions)
120     , m_enterTextControls(enterTextControls)
121 {
122     if (!r)
123         return;
124 
125     // get and validate the range endpoints
126     Node* startContainer = r->startContainer();
127     if (!startContainer)
128         return;
129     int startOffset = r->startOffset();
130     Node* endContainer = r->endContainer();
131     int endOffset = r->endOffset();
132 
133     // Callers should be handing us well-formed ranges. If we discover that this isn't
134     // the case, we could consider changing this assertion to an early return.
135     ASSERT(r->boundaryPointsValid());
136 
137     // remember range - this does not change
138     m_startContainer = startContainer;
139     m_startOffset = startOffset;
140     m_endContainer = endContainer;
141     m_endOffset = endOffset;
142 
143     for (Node* n = startContainer; n; n = n->parentNode()) {
144         if (n->isShadowNode()) {
145             m_inShadowContent = true;
146             break;
147         }
148     }
149 
150     // set up the current node for processing
151     m_node = r->firstNode();
152     if (m_node == 0)
153         return;
154     m_offset = m_node == m_startContainer ? m_startOffset : 0;
155     m_handledNode = false;
156     m_handledChildren = false;
157 
158     // calculate first out of bounds node
159     m_pastEndNode = r->pastLastNode();
160 
161     // initialize node processing state
162     m_needAnotherNewline = false;
163     m_textBox = 0;
164 
165     // initialize record of previous node processing
166     m_haveEmitted = false;
167     m_lastTextNode = 0;
168     m_lastTextNodeEndedWithCollapsedSpace = false;
169     m_lastCharacter = 0;
170 
171 #ifndef NDEBUG
172     // need this just because of the assert in advance()
173     m_positionNode = m_node;
174 #endif
175 
176     // identify the first run
177     advance();
178 }
179 
advance()180 void TextIterator::advance()
181 {
182     // reset the run information
183     m_positionNode = 0;
184     m_textLength = 0;
185 
186     // handle remembered node that needed a newline after the text node's newline
187     if (m_needAnotherNewline) {
188         // Emit the extra newline, and position it *inside* m_node, after m_node's
189         // contents, in case it's a block, in the same way that we position the first
190         // newline.  The range for the emitted newline should start where the line
191         // break begins.
192         // FIXME: It would be cleaner if we emitted two newlines during the last
193         // iteration, instead of using m_needAnotherNewline.
194         Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
195         emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
196         m_needAnotherNewline = false;
197         return;
198     }
199 
200     // handle remembered text box
201     if (m_textBox) {
202         handleTextBox();
203         if (m_positionNode)
204             return;
205     }
206 
207     while (m_node && m_node != m_pastEndNode) {
208         // if the range ends at offset 0 of an element, represent the
209         // position, but not the content, of that element e.g. if the
210         // node is a blockflow element, emit a newline that
211         // precedes the element
212         if (m_node == m_endContainer && m_endOffset == 0) {
213             representNodeOffsetZero();
214             m_node = 0;
215             return;
216         }
217 
218         RenderObject *renderer = m_node->renderer();
219         if (!renderer) {
220             m_handledNode = true;
221             m_handledChildren = true;
222         } else {
223             // handle current node according to its type
224             if (!m_handledNode) {
225                 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
226                     m_handledNode = handleTextNode();
227                 else if (renderer && (renderer->isImage() || renderer->isWidget() || (renderer->element() && renderer->element()->isControl())))
228                     m_handledNode = handleReplacedElement();
229                 else
230                     m_handledNode = handleNonTextNode();
231                 if (m_positionNode)
232                     return;
233             }
234         }
235 
236         // find a new current node to handle in depth-first manner,
237         // calling exitNode() as we come back thru a parent node
238         Node *next = m_handledChildren ? 0 : m_node->firstChild();
239         m_offset = 0;
240         if (!next) {
241             next = m_node->nextSibling();
242             if (!next) {
243                 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
244                 Node* parentNode = m_node->parentNode();
245                 if (!parentNode && m_inShadowContent) {
246                     m_inShadowContent = false;
247                     parentNode = m_node->shadowParentNode();
248                 }
249                 while (!next && parentNode) {
250                     if (pastEnd && parentNode == m_endContainer || m_endContainer->isDescendantOf(parentNode))
251                         return;
252                     bool haveRenderer = m_node->renderer();
253                     m_node = parentNode;
254                     parentNode = m_node->parentNode();
255                     if (!parentNode && m_inShadowContent) {
256                         m_inShadowContent = false;
257                         parentNode = m_node->shadowParentNode();
258                     }
259                     if (haveRenderer)
260                         exitNode();
261                     if (m_positionNode) {
262                         m_handledNode = true;
263                         m_handledChildren = true;
264                         return;
265                     }
266                     next = m_node->nextSibling();
267                 }
268             }
269         }
270 
271         // set the new current node
272         m_node = next;
273         m_handledNode = false;
274         m_handledChildren = false;
275 
276         // how would this ever be?
277         if (m_positionNode)
278             return;
279     }
280 }
281 
compareBoxStart(const InlineTextBox * first,const InlineTextBox * second)282 static inline bool compareBoxStart(const InlineTextBox *first, const InlineTextBox *second)
283 {
284     return first->start() < second->start();
285 }
286 
handleTextNode()287 bool TextIterator::handleTextNode()
288 {
289     RenderText* renderer = toRenderText(m_node->renderer());
290     if (renderer->style()->visibility() != VISIBLE)
291         return false;
292 
293     m_lastTextNode = m_node;
294     String str = renderer->text();
295 
296     // handle pre-formatted text
297     if (!renderer->style()->collapseWhiteSpace()) {
298         int runStart = m_offset;
299         if (m_lastTextNodeEndedWithCollapsedSpace) {
300             emitCharacter(' ', m_node, 0, runStart, runStart);
301             return false;
302         }
303         int strLength = str.length();
304         int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
305         int runEnd = min(strLength, end);
306 
307         if (runStart >= runEnd)
308             return true;
309 
310         emitText(m_node, runStart, runEnd);
311         return true;
312     }
313 
314     if (!renderer->firstTextBox() && str.length() > 0) {
315         m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
316         return true;
317     }
318 
319     // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
320     if (renderer->containsReversedText()) {
321         m_sortedTextBoxes.clear();
322         for (InlineTextBox * textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
323             m_sortedTextBoxes.append(textBox);
324         }
325         std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), compareBoxStart);
326         m_sortedTextBoxesPosition = 0;
327     }
328 
329     m_textBox = renderer->containsReversedText() ? m_sortedTextBoxes[0] : renderer->firstTextBox();
330     handleTextBox();
331     return true;
332 }
333 
handleTextBox()334 void TextIterator::handleTextBox()
335 {
336     RenderText *renderer = toRenderText(m_node->renderer());
337     String str = renderer->text();
338     int start = m_offset;
339     int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
340     while (m_textBox) {
341         int textBoxStart = m_textBox->start();
342         int runStart = max(textBoxStart, start);
343 
344         // Check for collapsed space at the start of this run.
345         InlineTextBox *firstTextBox = renderer->containsReversedText() ? m_sortedTextBoxes[0] : renderer->firstTextBox();
346         bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
347             || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
348         if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
349             if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
350                 unsigned spaceRunStart = runStart - 1;
351                 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
352                     --spaceRunStart;
353                 emitText(m_node, spaceRunStart, spaceRunStart + 1);
354             } else
355                 emitCharacter(' ', m_node, 0, runStart, runStart);
356             return;
357         }
358         int textBoxEnd = textBoxStart + m_textBox->len();
359         int runEnd = min(textBoxEnd, end);
360 
361         // Determine what the next text box will be, but don't advance yet
362         InlineTextBox *nextTextBox = 0;
363         if (renderer->containsReversedText()) {
364             if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
365                 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
366         } else
367             nextTextBox = m_textBox->nextTextBox();
368 
369         if (runStart < runEnd) {
370             // Handle either a single newline character (which becomes a space),
371             // or a run of characters that does not include a newline.
372             // This effectively translates newlines to spaces without copying the text.
373             if (str[runStart] == '\n') {
374                 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
375                 m_offset = runStart + 1;
376             } else {
377                 int subrunEnd = str.find('\n', runStart);
378                 if (subrunEnd == -1 || subrunEnd > runEnd)
379                     subrunEnd = runEnd;
380 
381                 m_offset = subrunEnd;
382                 emitText(m_node, runStart, subrunEnd);
383             }
384 
385             // If we are doing a subrun that doesn't go to the end of the text box,
386             // come back again to finish handling this text box; don't advance to the next one.
387             if (m_positionEndOffset < textBoxEnd)
388                 return;
389 
390             // Advance and return
391             int nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
392             if (nextRunStart > runEnd)
393                 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
394             m_textBox = nextTextBox;
395             if (renderer->containsReversedText())
396                 ++m_sortedTextBoxesPosition;
397             return;
398         }
399         // Advance and continue
400         m_textBox = nextTextBox;
401         if (renderer->containsReversedText())
402             ++m_sortedTextBoxesPosition;
403     }
404 }
405 
handleReplacedElement()406 bool TextIterator::handleReplacedElement()
407 {
408     RenderObject* renderer = m_node->renderer();
409     if (renderer->style()->visibility() != VISIBLE)
410         return false;
411 
412     if (m_lastTextNodeEndedWithCollapsedSpace) {
413         emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
414         return false;
415     }
416 
417     if (m_enterTextControls && (renderer->isTextArea() || renderer->isTextField())) {
418         m_node = static_cast<RenderTextControl*>(renderer)->innerTextElement();
419         m_offset = 0;
420         m_inShadowContent = true;
421         return false;
422     }
423 
424     m_haveEmitted = true;
425 
426     if (m_emitCharactersBetweenAllVisiblePositions) {
427         // We want replaced elements to behave like punctuation for boundary
428         // finding, and to simply take up space for the selection preservation
429         // code in moveParagraphs, so we use a comma.
430         emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
431         return true;
432     }
433 
434     m_positionNode = m_node->parentNode();
435     m_positionOffsetBaseNode = m_node;
436     m_positionStartOffset = 0;
437     m_positionEndOffset = 1;
438 
439     m_textCharacters = 0;
440     m_textLength = 0;
441 
442     m_lastCharacter = 0;
443 
444     return true;
445 }
446 
shouldEmitTabBeforeNode(Node * node)447 static bool shouldEmitTabBeforeNode(Node* node)
448 {
449     RenderObject* r = node->renderer();
450 
451     // Table cells are delimited by tabs.
452     if (!r || !isTableCell(node))
453         return false;
454 
455     // Want a tab before every cell other than the first one
456     RenderTableCell* rc = static_cast<RenderTableCell*>(r);
457     RenderTable* t = rc->table();
458     return t && (t->cellBefore(rc) || t->cellAbove(rc));
459 }
460 
shouldEmitNewlineForNode(Node * node)461 static bool shouldEmitNewlineForNode(Node* node)
462 {
463     // br elements are represented by a single newline.
464     RenderObject* r = node->renderer();
465     if (!r)
466         return node->hasTagName(brTag);
467 
468     return r->isBR();
469 }
470 
shouldEmitNewlinesBeforeAndAfterNode(Node * node)471 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
472 {
473     // Block flow (versus inline flow) is represented by having
474     // a newline both before and after the element.
475     RenderObject* r = node->renderer();
476     if (!r) {
477         return (node->hasTagName(blockquoteTag)
478                 || node->hasTagName(ddTag)
479                 || node->hasTagName(divTag)
480                 || node->hasTagName(dlTag)
481                 || node->hasTagName(dtTag)
482                 || node->hasTagName(h1Tag)
483                 || node->hasTagName(h2Tag)
484                 || node->hasTagName(h3Tag)
485                 || node->hasTagName(h4Tag)
486                 || node->hasTagName(h5Tag)
487                 || node->hasTagName(h6Tag)
488                 || node->hasTagName(hrTag)
489                 || node->hasTagName(liTag)
490                 || node->hasTagName(listingTag)
491                 || node->hasTagName(olTag)
492                 || node->hasTagName(pTag)
493                 || node->hasTagName(preTag)
494                 || node->hasTagName(trTag)
495                 || node->hasTagName(ulTag));
496     }
497 
498     // Need to make an exception for table cells, because they are blocks, but we
499     // want them tab-delimited rather than having newlines before and after.
500     if (isTableCell(node))
501         return false;
502 
503     // Need to make an exception for table row elements, because they are neither
504     // "inline" or "RenderBlock", but we want newlines for them.
505     if (r->isTableRow()) {
506         RenderTable* t = static_cast<RenderTableRow*>(r)->table();
507         if (t && !t->isInline())
508             return true;
509     }
510 
511     return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
512 }
513 
shouldEmitNewlineAfterNode(Node * node)514 static bool shouldEmitNewlineAfterNode(Node* node)
515 {
516     // FIXME: It should be better but slower to create a VisiblePosition here.
517     if (!shouldEmitNewlinesBeforeAndAfterNode(node))
518         return false;
519     // Check if this is the very last renderer in the document.
520     // If so, then we should not emit a newline.
521     while ((node = node->traverseNextSibling()))
522         if (node->renderer())
523             return true;
524     return false;
525 }
526 
shouldEmitNewlineBeforeNode(Node * node)527 static bool shouldEmitNewlineBeforeNode(Node* node)
528 {
529     return shouldEmitNewlinesBeforeAndAfterNode(node);
530 }
531 
shouldEmitExtraNewlineForNode(Node * node)532 static bool shouldEmitExtraNewlineForNode(Node* node)
533 {
534     // When there is a significant collapsed bottom margin, emit an extra
535     // newline for a more realistic result.  We end up getting the right
536     // result even without margin collapsing. For example: <div><p>text</p></div>
537     // will work right even if both the <div> and the <p> have bottom margins.
538     RenderObject* r = node->renderer();
539     if (!r || !r->isBox())
540         return false;
541 
542     // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
543     // not to do this at all
544     if (node->hasTagName(h1Tag)
545         || node->hasTagName(h2Tag)
546         || node->hasTagName(h3Tag)
547         || node->hasTagName(h4Tag)
548         || node->hasTagName(h5Tag)
549         || node->hasTagName(h6Tag)
550         || node->hasTagName(pTag)) {
551         RenderStyle* style = r->style();
552         if (style) {
553             int bottomMargin = toRenderBox(r)->collapsedMarginBottom();
554             int fontSize = style->fontDescription().computedPixelSize();
555             if (bottomMargin * 2 >= fontSize)
556                 return true;
557         }
558     }
559 
560     return false;
561 }
562 
563 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
shouldRepresentNodeOffsetZero()564 bool TextIterator::shouldRepresentNodeOffsetZero()
565 {
566     if (m_emitCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
567         return true;
568 
569     // Leave element positioned flush with start of a paragraph
570     // (e.g. do not insert tab before a table cell at the start of a paragraph)
571     if (m_lastCharacter == '\n')
572         return false;
573 
574     // Otherwise, show the position if we have emitted any characters
575     if (m_haveEmitted)
576         return true;
577 
578     // We've not emitted anything yet. Generally, there is no need for any positioning then.
579     // The only exception is when the element is visually not in the same line as
580     // the start of the range (e.g. the range starts at the end of the previous paragraph).
581     // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
582     // make quicker checks to possibly avoid that. Another check that we could make is
583     // is whether the inline vs block flow changed since the previous visible element.
584     // I think we're already in a special enough case that that won't be needed, tho.
585 
586     // No character needed if this is the first node in the range.
587     if (m_node == m_startContainer)
588         return false;
589 
590     // If we are outside the start container's subtree, assume we need to emit.
591     // FIXME: m_startContainer could be an inline block
592     if (!m_node->isDescendantOf(m_startContainer))
593         return true;
594 
595     // If we started as m_startContainer offset 0 and the current node is a descendant of
596     // the start container, we already had enough context to correctly decide whether to
597     // emit after a preceding block. We chose not to emit (m_haveEmitted is false),
598     // so don't second guess that now.
599     // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
600     // immaterial since we likely would have already emitted something by now.
601     if (m_startOffset == 0)
602         return false;
603 
604     // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
605     // Additionally, if the range we are iterating over contains huge sections of unrendered content,
606     // we would create VisiblePositions on every call to this function without this check.
607     if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
608         return false;
609 
610     // The currPos.isNotNull() check is needed because positions in non-html content
611     // (like svg) do not have visible positions, and we don't want to emit for them either.
612     VisiblePosition startPos = VisiblePosition(m_startContainer, m_startOffset, DOWNSTREAM);
613     VisiblePosition currPos = VisiblePosition(m_node, 0, DOWNSTREAM);
614     return currPos.isNotNull() && !inSameLine(startPos, currPos);
615 }
616 
shouldEmitSpaceBeforeAndAfterNode(Node * node)617 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
618 {
619     return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitCharactersBetweenAllVisiblePositions);
620 }
621 
representNodeOffsetZero()622 void TextIterator::representNodeOffsetZero()
623 {
624     // Emit a character to show the positioning of m_node.
625 
626     // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
627     // create VisiblePositions, which is expensive.  So, we perform the inexpensive checks
628     // on m_node to see if it necessitates emitting a character first and will early return
629     // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
630     if (shouldEmitTabBeforeNode(m_node)) {
631         if (shouldRepresentNodeOffsetZero())
632             emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
633     } else if (shouldEmitNewlineBeforeNode(m_node)) {
634         if (shouldRepresentNodeOffsetZero())
635             emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
636     } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
637         if (shouldRepresentNodeOffsetZero())
638             emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
639     }
640 }
641 
handleNonTextNode()642 bool TextIterator::handleNonTextNode()
643 {
644     if (shouldEmitNewlineForNode(m_node))
645         emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
646     else if (m_emitCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
647         emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
648     else
649         representNodeOffsetZero();
650 
651     return true;
652 }
653 
exitNode()654 void TextIterator::exitNode()
655 {
656     // prevent emitting a newline when exiting a collapsed block at beginning of the range
657     // FIXME: !m_haveEmitted does not necessarily mean there was a collapsed block... it could
658     // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
659     // therefore look like a blank line.
660     if (!m_haveEmitted)
661         return;
662 
663     // Emit with a position *inside* m_node, after m_node's contents, in
664     // case it is a block, because the run should start where the
665     // emitted character is positioned visually.
666     Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
667     // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
668     // the logic in _web_attributedStringFromRange match.  We'll get that for free when we switch to use
669     // TextIterator in _web_attributedStringFromRange.
670     // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
671     if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
672         // use extra newline to represent margin bottom, as needed
673         bool addNewline = shouldEmitExtraNewlineForNode(m_node);
674 
675         // FIXME: We need to emit a '\n' as we leave an empty block(s) that
676         // contain a VisiblePosition when doing selection preservation.
677         if (m_lastCharacter != '\n') {
678             // insert a newline with a position following this block's contents.
679             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
680             // remember whether to later add a newline for the current node
681             ASSERT(!m_needAnotherNewline);
682             m_needAnotherNewline = addNewline;
683         } else if (addNewline)
684             // insert a newline with a position following this block's contents.
685             emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
686     }
687 
688     // If nothing was emitted, see if we need to emit a space.
689     if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
690         emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
691 }
692 
emitCharacter(UChar c,Node * textNode,Node * offsetBaseNode,int textStartOffset,int textEndOffset)693 void TextIterator::emitCharacter(UChar c, Node *textNode, Node *offsetBaseNode, int textStartOffset, int textEndOffset)
694 {
695     m_haveEmitted = true;
696 
697     // remember information with which to construct the TextIterator::range()
698     // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
699     m_positionNode = textNode;
700     m_positionOffsetBaseNode = offsetBaseNode;
701     m_positionStartOffset = textStartOffset;
702     m_positionEndOffset = textEndOffset;
703 
704     // remember information with which to construct the TextIterator::characters() and length()
705     m_singleCharacterBuffer = c;
706     m_textCharacters = &m_singleCharacterBuffer;
707     m_textLength = 1;
708 
709     // remember some iteration state
710     m_lastTextNodeEndedWithCollapsedSpace = false;
711     m_lastCharacter = c;
712 }
713 
emitText(Node * textNode,int textStartOffset,int textEndOffset)714 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
715 {
716     RenderText* renderer = toRenderText(m_node->renderer());
717     String str = renderer->text();
718     ASSERT(str.characters());
719 
720     m_positionNode = textNode;
721     m_positionOffsetBaseNode = 0;
722     m_positionStartOffset = textStartOffset;
723     m_positionEndOffset = textEndOffset;
724     m_textCharacters = str.characters() + textStartOffset;
725     m_textLength = textEndOffset - textStartOffset;
726     m_lastCharacter = str[textEndOffset - 1];
727 
728     m_lastTextNodeEndedWithCollapsedSpace = false;
729     m_haveEmitted = true;
730 }
731 
range() const732 PassRefPtr<Range> TextIterator::range() const
733 {
734     // use the current run information, if we have it
735     if (m_positionNode) {
736         if (m_positionOffsetBaseNode) {
737             int index = m_positionOffsetBaseNode->nodeIndex();
738             m_positionStartOffset += index;
739             m_positionEndOffset += index;
740             m_positionOffsetBaseNode = 0;
741         }
742         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
743     }
744 
745     // otherwise, return the end of the overall range we were given
746     if (m_endContainer)
747         return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
748 
749     return 0;
750 }
751 
node() const752 Node* TextIterator::node() const
753 {
754     RefPtr<Range> textRange = range();
755     if (!textRange)
756         return 0;
757 
758     Node* node = textRange->startContainer();
759     if (!node)
760         return 0;
761     if (node->offsetInCharacters())
762         return node;
763 
764     return node->childNode(textRange->startOffset());
765 }
766 
767 // --------
768 
SimplifiedBackwardsTextIterator()769 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator() : m_positionNode(0)
770 {
771 }
772 
SimplifiedBackwardsTextIterator(const Range * r)773 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range *r)
774 {
775     m_positionNode = 0;
776 
777     if (!r)
778         return;
779 
780     Node* startNode = r->startContainer();
781     if (!startNode)
782         return;
783     Node* endNode = r->endContainer();
784     int startOffset = r->startOffset();
785     int endOffset = r->endOffset();
786 
787     if (!startNode->offsetInCharacters()) {
788         if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
789             startNode = startNode->childNode(startOffset);
790             startOffset = 0;
791         }
792     }
793     if (!endNode->offsetInCharacters()) {
794         if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
795             endNode = endNode->childNode(endOffset - 1);
796             endOffset = endNode->offsetInCharacters() ? endNode->maxCharacterOffset() : endNode->childNodeCount();
797         }
798     }
799 
800     m_node = endNode;
801     m_offset = endOffset;
802     m_handledNode = false;
803     m_handledChildren = endOffset == 0;
804 
805     m_startNode = startNode;
806     m_startOffset = startOffset;
807     m_endNode = endNode;
808     m_endOffset = endOffset;
809 
810 #ifndef NDEBUG
811     // Need this just because of the assert.
812     m_positionNode = endNode;
813 #endif
814 
815     m_lastTextNode = 0;
816     m_lastCharacter = '\n';
817 
818     if (startOffset == 0 || !startNode->firstChild()) {
819         m_pastStartNode = startNode->previousSibling();
820         while (!m_pastStartNode && startNode->parentNode()) {
821             startNode = startNode->parentNode();
822             m_pastStartNode = startNode->previousSibling();
823         }
824     } else
825         m_pastStartNode = startNode->childNode(startOffset - 1);
826 
827     advance();
828 }
829 
advance()830 void SimplifiedBackwardsTextIterator::advance()
831 {
832     ASSERT(m_positionNode);
833 
834     m_positionNode = 0;
835     m_textLength = 0;
836 
837     while (m_node && m_node != m_pastStartNode) {
838         // Don't handle node if we start iterating at [node, 0].
839         if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
840             RenderObject *renderer = m_node->renderer();
841             if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
842                 // FIXME: What about CDATA_SECTION_NODE?
843                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
844                     m_handledNode = handleTextNode();
845             } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
846                 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
847                     m_handledNode = handleReplacedElement();
848             } else
849                 m_handledNode = handleNonTextNode();
850             if (m_positionNode)
851                 return;
852         }
853 
854         Node* next = m_handledChildren ? 0 : m_node->lastChild();
855         if (!next) {
856             // Exit empty containers as we pass over them or containers
857             // where [container, 0] is where we started iterating.
858             if (!m_handledNode &&
859                 canHaveChildrenForEditing(m_node) &&
860                 m_node->parentNode() &&
861                 (!m_node->lastChild() || m_node == m_endNode && m_endOffset == 0)) {
862                 exitNode();
863                 if (m_positionNode) {
864                     m_handledNode = true;
865                     m_handledChildren = true;
866                     return;
867                 }
868             }
869             // Exit all other containers.
870             next = m_node->previousSibling();
871             while (!next) {
872                 if (!m_node->parentNode())
873                     break;
874                 m_node = m_node->parentNode();
875                 exitNode();
876                 if (m_positionNode) {
877                     m_handledNode = true;
878                     m_handledChildren = true;
879                     return;
880                 }
881                 next = m_node->previousSibling();
882             }
883         }
884 
885         m_node = next;
886         m_offset = m_node ? caretMaxOffset(m_node) : 0;
887         m_handledNode = false;
888         m_handledChildren = false;
889 
890         if (m_positionNode)
891             return;
892     }
893 }
894 
handleTextNode()895 bool SimplifiedBackwardsTextIterator::handleTextNode()
896 {
897     m_lastTextNode = m_node;
898 
899     RenderText *renderer = toRenderText(m_node->renderer());
900     String str = renderer->text();
901 
902     if (!renderer->firstTextBox() && str.length() > 0)
903         return true;
904 
905     m_positionEndOffset = m_offset;
906 
907     m_offset = (m_node == m_startNode) ? m_startOffset : 0;
908     m_positionNode = m_node;
909     m_positionStartOffset = m_offset;
910     m_textLength = m_positionEndOffset - m_positionStartOffset;
911     m_textCharacters = str.characters() + m_positionStartOffset;
912 
913     m_lastCharacter = str[m_positionEndOffset - 1];
914 
915     return true;
916 }
917 
handleReplacedElement()918 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
919 {
920     unsigned index = m_node->nodeIndex();
921     // We want replaced elements to behave like punctuation for boundary
922     // finding, and to simply take up space for the selection preservation
923     // code in moveParagraphs, so we use a comma.  Unconditionally emit
924     // here because this iterator is only used for boundary finding.
925     emitCharacter(',', m_node->parentNode(), index, index + 1);
926     return true;
927 }
928 
handleNonTextNode()929 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
930 {
931     // We can use a linefeed in place of a tab because this simple iterator is only used to
932     // find boundaries, not actual content.  A linefeed breaks words, sentences, and paragraphs.
933     if (shouldEmitNewlineForNode(m_node) ||
934         shouldEmitNewlineAfterNode(m_node) ||
935         shouldEmitTabBeforeNode(m_node)) {
936         unsigned index = m_node->nodeIndex();
937         // The start of this emitted range is wrong, ensuring correctness would require
938         // VisiblePositions and so would be slow.  previousBoundary expects this.
939         emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
940     }
941 
942     return true;
943 }
944 
exitNode()945 void SimplifiedBackwardsTextIterator::exitNode()
946 {
947     if (shouldEmitNewlineForNode(m_node) ||
948         shouldEmitNewlineBeforeNode(m_node) ||
949         shouldEmitTabBeforeNode(m_node))
950         // The start of this emitted range is wrong, ensuring correctness would require
951         // VisiblePositions and so would be slow.  previousBoundary expects this.
952         emitCharacter('\n', m_node, 0, 0);
953 }
954 
emitCharacter(UChar c,Node * node,int startOffset,int endOffset)955 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node *node, int startOffset, int endOffset)
956 {
957     m_singleCharacterBuffer = c;
958     m_positionNode = node;
959     m_positionStartOffset = startOffset;
960     m_positionEndOffset = endOffset;
961     m_textCharacters = &m_singleCharacterBuffer;
962     m_textLength = 1;
963     m_lastCharacter = c;
964 }
965 
range() const966 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
967 {
968     if (m_positionNode)
969         return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
970 
971     return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
972 }
973 
974 // --------
975 
CharacterIterator()976 CharacterIterator::CharacterIterator()
977     : m_offset(0)
978     , m_runOffset(0)
979     , m_atBreak(true)
980 {
981 }
982 
CharacterIterator(const Range * r,bool emitCharactersBetweenAllVisiblePositions,bool enterTextControls)983 CharacterIterator::CharacterIterator(const Range *r, bool emitCharactersBetweenAllVisiblePositions, bool enterTextControls)
984     : m_offset(0)
985     , m_runOffset(0)
986     , m_atBreak(true)
987     , m_textIterator(r, emitCharactersBetweenAllVisiblePositions, enterTextControls)
988 {
989     while (!atEnd() && m_textIterator.length() == 0)
990         m_textIterator.advance();
991 }
992 
range() const993 PassRefPtr<Range> CharacterIterator::range() const
994 {
995     RefPtr<Range> r = m_textIterator.range();
996     if (!m_textIterator.atEnd()) {
997         if (m_textIterator.length() <= 1) {
998             ASSERT(m_runOffset == 0);
999         } else {
1000             Node* n = r->startContainer();
1001             ASSERT(n == r->endContainer());
1002             int offset = r->startOffset() + m_runOffset;
1003             ExceptionCode ec = 0;
1004             r->setStart(n, offset, ec);
1005             r->setEnd(n, offset + 1, ec);
1006             ASSERT(!ec);
1007         }
1008     }
1009     return r.release();
1010 }
1011 
advance(int count)1012 void CharacterIterator::advance(int count)
1013 {
1014     if (count <= 0) {
1015         ASSERT(count == 0);
1016         return;
1017     }
1018 
1019     m_atBreak = false;
1020 
1021     // easy if there is enough left in the current m_textIterator run
1022     int remaining = m_textIterator.length() - m_runOffset;
1023     if (count < remaining) {
1024         m_runOffset += count;
1025         m_offset += count;
1026         return;
1027     }
1028 
1029     // exhaust the current m_textIterator run
1030     count -= remaining;
1031     m_offset += remaining;
1032 
1033     // move to a subsequent m_textIterator run
1034     for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1035         int runLength = m_textIterator.length();
1036         if (runLength == 0)
1037             m_atBreak = true;
1038         else {
1039             // see whether this is m_textIterator to use
1040             if (count < runLength) {
1041                 m_runOffset = count;
1042                 m_offset += count;
1043                 return;
1044             }
1045 
1046             // exhaust this m_textIterator run
1047             count -= runLength;
1048             m_offset += runLength;
1049         }
1050     }
1051 
1052     // ran to the end of the m_textIterator... no more runs left
1053     m_atBreak = true;
1054     m_runOffset = 0;
1055 }
1056 
string(int numChars)1057 String CharacterIterator::string(int numChars)
1058 {
1059     Vector<UChar> result;
1060     result.reserveCapacity(numChars);
1061     while (numChars > 0 && !atEnd()) {
1062         int runSize = min(numChars, length());
1063         result.append(characters(), runSize);
1064         numChars -= runSize;
1065         advance(runSize);
1066     }
1067     return String::adopt(result);
1068 }
1069 
characterSubrange(CharacterIterator & it,int offset,int length)1070 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1071 {
1072     it.advance(offset);
1073     RefPtr<Range> start = it.range();
1074 
1075     if (length > 1)
1076         it.advance(length - 1);
1077     RefPtr<Range> end = it.range();
1078 
1079     return Range::create(start->startContainer()->document(),
1080         start->startContainer(), start->startOffset(),
1081         end->endContainer(), end->endOffset());
1082 }
1083 
1084 // --------
1085 
WordAwareIterator()1086 WordAwareIterator::WordAwareIterator()
1087 : m_previousText(0), m_didLookAhead(false)
1088 {
1089 }
1090 
WordAwareIterator(const Range * r)1091 WordAwareIterator::WordAwareIterator(const Range *r)
1092 : m_previousText(0), m_didLookAhead(false), m_textIterator(r)
1093 {
1094     m_didLookAhead = true;  // so we consider the first chunk from the text iterator
1095     advance();              // get in position over the first chunk of text
1096 }
1097 
1098 // We're always in one of these modes:
1099 // - The current chunk in the text iterator is our current chunk
1100 //      (typically its a piece of whitespace, or text that ended with whitespace)
1101 // - The previous chunk in the text iterator is our current chunk
1102 //      (we looked ahead to the next chunk and found a word boundary)
1103 // - We built up our own chunk of text from many chunks from the text iterator
1104 
1105 // FIXME: Perf could be bad for huge spans next to each other that don't fall on word boundaries
1106 
advance()1107 void WordAwareIterator::advance()
1108 {
1109     m_previousText = 0;
1110     m_buffer.clear();      // toss any old buffer we built up
1111 
1112     // If last time we did a look-ahead, start with that looked-ahead chunk now
1113     if (!m_didLookAhead) {
1114         ASSERT(!m_textIterator.atEnd());
1115         m_textIterator.advance();
1116     }
1117     m_didLookAhead = false;
1118 
1119     // Go to next non-empty chunk
1120     while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1121         m_textIterator.advance();
1122     m_range = m_textIterator.range();
1123 
1124     if (m_textIterator.atEnd())
1125         return;
1126 
1127     while (1) {
1128         // If this chunk ends in whitespace we can just use it as our chunk.
1129         if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1130             return;
1131 
1132         // If this is the first chunk that failed, save it in previousText before look ahead
1133         if (m_buffer.isEmpty()) {
1134             m_previousText = m_textIterator.characters();
1135             m_previousLength = m_textIterator.length();
1136         }
1137 
1138         // Look ahead to next chunk.  If it is whitespace or a break, we can use the previous stuff
1139         m_textIterator.advance();
1140         if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1141             m_didLookAhead = true;
1142             return;
1143         }
1144 
1145         if (m_buffer.isEmpty()) {
1146             // Start gobbling chunks until we get to a suitable stopping point
1147             m_buffer.append(m_previousText, m_previousLength);
1148             m_previousText = 0;
1149         }
1150         m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1151         int exception = 0;
1152         m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1153     }
1154 }
1155 
length() const1156 int WordAwareIterator::length() const
1157 {
1158     if (!m_buffer.isEmpty())
1159         return m_buffer.size();
1160     if (m_previousText)
1161         return m_previousLength;
1162     return m_textIterator.length();
1163 }
1164 
characters() const1165 const UChar* WordAwareIterator::characters() const
1166 {
1167     if (!m_buffer.isEmpty())
1168         return m_buffer.data();
1169     if (m_previousText)
1170         return m_previousText;
1171     return m_textIterator.characters();
1172 }
1173 
1174 // --------
1175 
1176 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1177 
1178 static const size_t minimumSearchBufferSize = 8192;
1179 
1180 #ifndef NDEBUG
1181 static bool searcherInUse;
1182 #endif
1183 
createSearcher()1184 static UStringSearch* createSearcher()
1185 {
1186     // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1187     // but it doesn't matter exactly what it is, since we don't perform any searches
1188     // without setting both the pattern and the text.
1189 
1190     // Pass empty string for the locale for now to get the Unicode Collation Algorithm,
1191     // rather than something locale-specific.
1192 
1193     UErrorCode status = U_ZERO_ERROR;
1194     UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, "", 0, &status);
1195     ASSERT(status == U_ZERO_ERROR);
1196     return searcher;
1197 }
1198 
searcher()1199 static UStringSearch* searcher()
1200 {
1201     static UStringSearch* searcher = createSearcher();
1202     return searcher;
1203 }
1204 
lockSearcher()1205 static inline void lockSearcher()
1206 {
1207 #ifndef NDEBUG
1208     ASSERT(!searcherInUse);
1209     searcherInUse = true;
1210 #endif
1211 }
1212 
unlockSearcher()1213 static inline void unlockSearcher()
1214 {
1215 #ifndef NDEBUG
1216     ASSERT(searcherInUse);
1217     searcherInUse = false;
1218 #endif
1219 }
1220 
SearchBuffer(const String & target,bool isCaseSensitive)1221 inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive)
1222     : m_target(target)
1223     , m_atBreak(true)
1224 {
1225     ASSERT(!m_target.isEmpty());
1226 
1227     size_t targetLength = target.length();
1228     m_buffer.reserveCapacity(max(targetLength * 8, minimumSearchBufferSize));
1229     m_overlap = m_buffer.capacity() / 4;
1230 
1231     // Grab the single global searcher.
1232     // If we ever have a reason to do more than once search buffer at once, we'll have
1233     // to move to multiple searchers.
1234     lockSearcher();
1235 
1236     UStringSearch* searcher = WebCore::searcher();
1237     UCollator* collator = usearch_getCollator(searcher);
1238 
1239     UCollationStrength strength = isCaseSensitive ? UCOL_TERTIARY : UCOL_PRIMARY;
1240     if (ucol_getStrength(collator) != strength) {
1241         ucol_setStrength(collator, strength);
1242         usearch_reset(searcher);
1243     }
1244 
1245     UErrorCode status = U_ZERO_ERROR;
1246     usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1247     ASSERT(status == U_ZERO_ERROR);
1248 }
1249 
~SearchBuffer()1250 inline SearchBuffer::~SearchBuffer()
1251 {
1252     unlockSearcher();
1253 }
1254 
append(const UChar * characters,size_t length)1255 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
1256 {
1257     ASSERT(length);
1258 
1259     if (m_atBreak) {
1260         m_buffer.shrink(0);
1261         m_atBreak = false;
1262     } else if (m_buffer.size() == m_buffer.capacity()) {
1263         memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
1264         m_buffer.shrink(m_overlap);
1265     }
1266 
1267     size_t usableLength = min(m_buffer.capacity() - m_buffer.size(), length);
1268     ASSERT(usableLength);
1269     m_buffer.append(characters, usableLength);
1270     return usableLength;
1271 }
1272 
atBreak() const1273 inline bool SearchBuffer::atBreak() const
1274 {
1275     return m_atBreak;
1276 }
1277 
reachedBreak()1278 inline void SearchBuffer::reachedBreak()
1279 {
1280     m_atBreak = true;
1281 }
1282 
search(size_t & start)1283 inline size_t SearchBuffer::search(size_t& start)
1284 {
1285     size_t size = m_buffer.size();
1286     if (m_atBreak) {
1287         if (!size)
1288             return 0;
1289     } else {
1290         if (size != m_buffer.capacity())
1291             return 0;
1292     }
1293 
1294     UStringSearch* searcher = WebCore::searcher();
1295 
1296     UErrorCode status = U_ZERO_ERROR;
1297     usearch_setText(searcher, m_buffer.data(), size, &status);
1298     ASSERT(status == U_ZERO_ERROR);
1299 
1300     int matchStart = usearch_first(searcher, &status);
1301     ASSERT(status == U_ZERO_ERROR);
1302     if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
1303         ASSERT(matchStart == USEARCH_DONE);
1304         return 0;
1305     }
1306 
1307     // Matches that start in the overlap area are only tentative.
1308     // The same match may appear later, matching more characters,
1309     // possibly including a combining character that's not yet in the buffer.
1310     if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
1311         memcpy(m_buffer.data(), m_buffer.data() + size - m_overlap, m_overlap * sizeof(UChar));
1312         m_buffer.shrink(m_overlap);
1313         return 0;
1314     }
1315 
1316     size_t newSize = size - (matchStart + 1);
1317     memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
1318     m_buffer.shrink(newSize);
1319 
1320     start = size - matchStart;
1321     return usearch_getMatchedLength(searcher);
1322 }
1323 
1324 #else // !ICU_UNICODE
1325 
SearchBuffer(const String & target,bool isCaseSensitive)1326 inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive)
1327     : m_target(isCaseSensitive ? target : target.foldCase())
1328     , m_isCaseSensitive(isCaseSensitive)
1329     , m_buffer(m_target.length())
1330     , m_isCharacterStartBuffer(m_target.length())
1331     , m_isBufferFull(false)
1332     , m_cursor(0)
1333 {
1334     ASSERT(!m_target.isEmpty());
1335     m_target.replace(noBreakSpace, ' ');
1336 }
1337 
~SearchBuffer()1338 inline SearchBuffer::~SearchBuffer()
1339 {
1340 }
1341 
reachedBreak()1342 inline void SearchBuffer::reachedBreak()
1343 {
1344     m_cursor = 0;
1345     m_isBufferFull = false;
1346 }
1347 
atBreak() const1348 inline bool SearchBuffer::atBreak() const
1349 {
1350     return !m_cursor && !m_isBufferFull;
1351 }
1352 
append(UChar c,bool isStart)1353 inline void SearchBuffer::append(UChar c, bool isStart)
1354 {
1355     m_buffer[m_cursor] = c == noBreakSpace ? ' ' : c;
1356     m_isCharacterStartBuffer[m_cursor] = isStart;
1357     if (++m_cursor == m_target.length()) {
1358         m_cursor = 0;
1359         m_isBufferFull = true;
1360     }
1361 }
1362 
append(const UChar * characters,size_t length)1363 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
1364 {
1365     ASSERT(length);
1366     if (m_isCaseSensitive) {
1367         append(characters[0], true);
1368         return 1;
1369     }
1370     const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
1371     UChar foldedCharacters[maxFoldedCharacters];
1372     bool error;
1373     int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
1374     ASSERT(!error);
1375     ASSERT(numFoldedCharacters);
1376     ASSERT(numFoldedCharacters <= maxFoldedCharacters);
1377     if (!error && numFoldedCharacters) {
1378         numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
1379         append(foldedCharacters[0], true);
1380         for (int i = 1; i < numFoldedCharacters; ++i)
1381             append(foldedCharacters[i], false);
1382     }
1383     return 1;
1384 }
1385 
search(size_t & start)1386 inline size_t SearchBuffer::search(size_t& start)
1387 {
1388     if (!m_isBufferFull)
1389         return 0;
1390     if (!m_isCharacterStartBuffer[m_cursor])
1391         return 0;
1392 
1393     size_t tailSpace = m_target.length() - m_cursor;
1394     if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
1395         return 0;
1396     if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
1397         return 0;
1398 
1399     start = length();
1400 
1401     // Now that we've found a match once, we don't want to find it again, because those
1402     // are the SearchBuffer semantics, allowing for a buffer where you append more than one
1403     // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
1404     // we want to get rid of that in the future we could track this with a separate boolean
1405     // or even move the characters to the start of the buffer and set m_isBufferFull to false.
1406     m_isCharacterStartBuffer[m_cursor] = false;
1407 
1408     return start;
1409 }
1410 
1411 // Returns the number of characters that were appended to the buffer (what we are searching in).
1412 // That's not necessarily the same length as the passed-in target string, because case folding
1413 // can make two strings match even though they're not the same length.
length() const1414 size_t SearchBuffer::length() const
1415 {
1416     size_t bufferSize = m_target.length();
1417     size_t length = 0;
1418     for (size_t i = 0; i < bufferSize; ++i)
1419         length += m_isCharacterStartBuffer[i];
1420     return length;
1421 }
1422 
1423 #endif // !ICU_UNICODE
1424 
1425 // --------
1426 
rangeLength(const Range * r,bool forSelectionPreservation)1427 int TextIterator::rangeLength(const Range *r, bool forSelectionPreservation)
1428 {
1429     int length = 0;
1430     for (TextIterator it(r, forSelectionPreservation); !it.atEnd(); it.advance())
1431         length += it.length();
1432 
1433     return length;
1434 }
1435 
subrange(Range * entireRange,int characterOffset,int characterCount)1436 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
1437 {
1438     CharacterIterator entireRangeIterator(entireRange);
1439     return characterSubrange(entireRangeIterator, characterOffset, characterCount);
1440 }
1441 
rangeFromLocationAndLength(Element * scope,int rangeLocation,int rangeLength,bool forSelectionPreservation)1442 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element *scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
1443 {
1444     RefPtr<Range> resultRange = scope->document()->createRange();
1445 
1446     int docTextPosition = 0;
1447     int rangeEnd = rangeLocation + rangeLength;
1448     bool startRangeFound = false;
1449 
1450     RefPtr<Range> textRunRange;
1451 
1452     TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation);
1453 
1454     // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
1455     if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
1456         textRunRange = it.range();
1457 
1458         ExceptionCode ec = 0;
1459         resultRange->setStart(textRunRange->startContainer(), 0, ec);
1460         ASSERT(!ec);
1461         resultRange->setEnd(textRunRange->startContainer(), 0, ec);
1462         ASSERT(!ec);
1463 
1464         return resultRange.release();
1465     }
1466 
1467     for (; !it.atEnd(); it.advance()) {
1468         int len = it.length();
1469         textRunRange = it.range();
1470 
1471         bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
1472         bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
1473 
1474         // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
1475         // in those cases that textRunRange is used.
1476         if (foundStart || foundEnd) {
1477             // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
1478             // position for emitted '\n's.
1479             if (len == 1 && it.characters()[0] == '\n') {
1480                 Position runStart = textRunRange->startPosition();
1481                 Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
1482                 if (runEnd.isNotNull()) {
1483                     ExceptionCode ec = 0;
1484                     textRunRange->setEnd(runEnd.node(), runEnd.offset(), ec);
1485                     ASSERT(!ec);
1486                 }
1487             }
1488         }
1489 
1490         if (foundStart) {
1491             startRangeFound = true;
1492             int exception = 0;
1493             if (textRunRange->startContainer()->isTextNode()) {
1494                 int offset = rangeLocation - docTextPosition;
1495                 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
1496             } else {
1497                 if (rangeLocation == docTextPosition)
1498                     resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
1499                 else
1500                     resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
1501             }
1502         }
1503 
1504         if (foundEnd) {
1505             int exception = 0;
1506             if (textRunRange->startContainer()->isTextNode()) {
1507                 int offset = rangeEnd - docTextPosition;
1508                 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
1509             } else {
1510                 if (rangeEnd == docTextPosition)
1511                     resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
1512                 else
1513                     resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
1514             }
1515             docTextPosition += len;
1516             break;
1517         }
1518         docTextPosition += len;
1519     }
1520 
1521     if (!startRangeFound)
1522         return 0;
1523 
1524     if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
1525         int exception = 0;
1526         resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
1527     }
1528 
1529     return resultRange.release();
1530 }
1531 
1532 // --------
1533 
plainTextToMallocAllocatedBuffer(const Range * r,unsigned & bufferLength,bool isDisplayString)1534 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString)
1535 {
1536     UChar* result = 0;
1537 
1538     // Do this in pieces to avoid massive reallocations if there is a large amount of text.
1539     // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
1540     static const unsigned cMaxSegmentSize = 1 << 16;
1541     bufferLength = 0;
1542     typedef pair<UChar*, unsigned> TextSegment;
1543     Vector<TextSegment>* textSegments = 0;
1544     Vector<UChar> textBuffer;
1545     textBuffer.reserveCapacity(cMaxSegmentSize);
1546     for (TextIterator it(r); !it.atEnd(); it.advance()) {
1547         if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
1548             UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
1549             if (!newSegmentBuffer)
1550                 goto exit;
1551             memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
1552             if (!textSegments)
1553                 textSegments = new Vector<TextSegment>;
1554             textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
1555             textBuffer.clear();
1556         }
1557         textBuffer.append(it.characters(), it.length());
1558         bufferLength += it.length();
1559     }
1560 
1561     if (!bufferLength)
1562         return 0;
1563 
1564     // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
1565     result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
1566     if (!result)
1567         goto exit;
1568 
1569     {
1570         UChar* resultPos = result;
1571         if (textSegments) {
1572             unsigned size = textSegments->size();
1573             for (unsigned i = 0; i < size; ++i) {
1574                 const TextSegment& segment = textSegments->at(i);
1575                 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
1576                 resultPos += segment.second;
1577             }
1578         }
1579         memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
1580     }
1581 
1582 exit:
1583     if (textSegments) {
1584         unsigned size = textSegments->size();
1585         for (unsigned i = 0; i < size; ++i)
1586             free(textSegments->at(i).first);
1587         delete textSegments;
1588     }
1589 
1590     if (isDisplayString && r->ownerDocument())
1591         r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
1592 
1593     return result;
1594 }
1595 
plainText(const Range * r)1596 String plainText(const Range* r)
1597 {
1598     unsigned length;
1599     UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false);
1600     if (!buf)
1601         return "";
1602     String result(buf, length);
1603     free(buf);
1604     return result;
1605 }
1606 
isAllCollapsibleWhitespace(const String & string)1607 static inline bool isAllCollapsibleWhitespace(const String& string)
1608 {
1609     const UChar* characters = string.characters();
1610     unsigned length = string.length();
1611     for (unsigned i = 0; i < length; ++i) {
1612         if (!isCollapsibleWhitespace(characters[i]))
1613             return false;
1614     }
1615     return true;
1616 }
1617 
collapsedToBoundary(const Range * range,bool forward)1618 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
1619 {
1620     ExceptionCode ec = 0;
1621     RefPtr<Range> result = range->cloneRange(ec);
1622     ASSERT(!ec);
1623     result->collapse(!forward, ec);
1624     ASSERT(!ec);
1625     return result.release();
1626 }
1627 
findPlainText(CharacterIterator & it,const String & target,bool forward,bool caseSensitive,size_t & matchStart)1628 static size_t findPlainText(CharacterIterator& it, const String& target, bool forward, bool caseSensitive, size_t& matchStart)
1629 {
1630     matchStart = 0;
1631     size_t matchLength = 0;
1632 
1633     SearchBuffer buffer(target, caseSensitive);
1634 
1635     while (!it.atEnd()) {
1636         it.advance(buffer.append(it.characters(), it.length()));
1637 tryAgain:
1638         size_t matchStartOffset;
1639         if (size_t newMatchLength = buffer.search(matchStartOffset)) {
1640             // Note that we found a match, and where we found it.
1641             size_t lastCharacterInBufferOffset = it.characterOffset();
1642             ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
1643             matchStart = lastCharacterInBufferOffset - matchStartOffset;
1644             matchLength = newMatchLength;
1645             // If searching forward, stop on the first match.
1646             // If searching backward, don't stop, so we end up with the last match.
1647             if (forward)
1648                 break;
1649             goto tryAgain;
1650         }
1651         if (it.atBreak() && !buffer.atBreak()) {
1652             buffer.reachedBreak();
1653             goto tryAgain;
1654         }
1655     }
1656 
1657     return matchLength;
1658 }
1659 
findPlainText(const Range * range,const String & target,bool forward,bool caseSensitive)1660 PassRefPtr<Range> findPlainText(const Range* range, const String& target, bool forward, bool caseSensitive)
1661 {
1662     // We can't search effectively for a string that's entirely made of collapsible
1663     // whitespace, so we won't even try. This also takes care of the empty string case.
1664     if (isAllCollapsibleWhitespace(target))
1665         return collapsedToBoundary(range, forward);
1666 
1667     // First, find the text.
1668     size_t matchStart;
1669     size_t matchLength;
1670     {
1671         CharacterIterator findIterator(range, false, true);
1672         matchLength = findPlainText(findIterator, target, forward, caseSensitive, matchStart);
1673         if (!matchLength)
1674             return collapsedToBoundary(range, forward);
1675     }
1676 
1677     // Then, find the document position of the start and the end of the text.
1678     CharacterIterator computeRangeIterator(range, false, true);
1679     return characterSubrange(computeRangeIterator, matchStart, matchLength);
1680 }
1681 
1682 }
1683