1 /*
2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
3 * Copyright (C) 2005 Alexey Proskuryakov.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "TextIterator.h"
29
30 #include "Document.h"
31 #include "Frame.h"
32 #include "HTMLElement.h"
33 #include "HTMLNames.h"
34 #include "htmlediting.h"
35 #include "InlineTextBox.h"
36 #include "Range.h"
37 #include "RenderTableCell.h"
38 #include "RenderTableRow.h"
39 #include "RenderTextControl.h"
40 #include "RenderTextFragment.h"
41 #include "TextBoundaries.h"
42 #include "TextBreakIterator.h"
43 #include "VisiblePosition.h"
44 #include "visible_units.h"
45 #include <wtf/unicode/CharacterNames.h>
46
47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
48 #include "TextBreakIteratorInternalICU.h"
49 #include <unicode/usearch.h>
50 #endif
51
52 using namespace WTF::Unicode;
53 using namespace std;
54
55 namespace WebCore {
56
57 using namespace HTMLNames;
58
59 // Buffer that knows how to compare with a search target.
60 // Keeps enough of the previous text to be able to search in the future, but no more.
61 // Non-breaking spaces are always equal to normal spaces.
62 // Case folding is also done if the CaseInsensitive option is specified.
63 // Matches are further filtered if the AtWordStarts option is specified, although some
64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
65 class SearchBuffer {
66 WTF_MAKE_NONCOPYABLE(SearchBuffer);
67 public:
68 SearchBuffer(const String& target, FindOptions);
69 ~SearchBuffer();
70
71 // Returns number of characters appended; guaranteed to be in the range [1, length].
72 size_t append(const UChar*, size_t length);
73 bool needsMoreContext() const;
74 void prependContext(const UChar*, size_t length);
75 void reachedBreak();
76
77 // Result is the size in characters of what was found.
78 // And <startOffset> is the number of characters back to the start of what was found.
79 size_t search(size_t& startOffset);
80 bool atBreak() const;
81
82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
83
84 private:
85 bool isBadMatch(const UChar*, size_t length) const;
86 bool isWordStartMatch(size_t start, size_t length) const;
87
88 String m_target;
89 FindOptions m_options;
90
91 Vector<UChar> m_buffer;
92 size_t m_overlap;
93 size_t m_prefixLength;
94 bool m_atBreak;
95 bool m_needsMoreContext;
96
97 bool m_targetRequiresKanaWorkaround;
98 Vector<UChar> m_normalizedTarget;
99 mutable Vector<UChar> m_normalizedMatch;
100
101 #else
102
103 private:
104 void append(UChar, bool isCharacterStart);
105 size_t length() const;
106
107 String m_target;
108 FindOptions m_options;
109
110 Vector<UChar> m_buffer;
111 Vector<bool> m_isCharacterStartBuffer;
112 bool m_isBufferFull;
113 size_t m_cursor;
114
115 #endif
116 };
117
118 // --------
119
120 static const unsigned bitsInWord = sizeof(unsigned) * 8;
121 static const unsigned bitInWordMask = bitsInWord - 1;
122
BitStack()123 BitStack::BitStack()
124 : m_size(0)
125 {
126 }
127
~BitStack()128 BitStack::~BitStack()
129 {
130 }
131
push(bool bit)132 void BitStack::push(bool bit)
133 {
134 unsigned index = m_size / bitsInWord;
135 unsigned shift = m_size & bitInWordMask;
136 if (!shift && index == m_words.size()) {
137 m_words.grow(index + 1);
138 m_words[index] = 0;
139 }
140 unsigned& word = m_words[index];
141 unsigned mask = 1U << shift;
142 if (bit)
143 word |= mask;
144 else
145 word &= ~mask;
146 ++m_size;
147 }
148
pop()149 void BitStack::pop()
150 {
151 if (m_size)
152 --m_size;
153 }
154
top() const155 bool BitStack::top() const
156 {
157 if (!m_size)
158 return false;
159 unsigned shift = (m_size - 1) & bitInWordMask;
160 return m_words.last() & (1U << shift);
161 }
162
size() const163 unsigned BitStack::size() const
164 {
165 return m_size;
166 }
167
168 // --------
169
170 #if !ASSERT_DISABLED
171
depthCrossingShadowBoundaries(Node * node)172 static unsigned depthCrossingShadowBoundaries(Node* node)
173 {
174 unsigned depth = 0;
175 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
176 ++depth;
177 return depth;
178 }
179
180 #endif
181
182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
nextInPreOrderCrossingShadowBoundaries(Node * rangeEndContainer,int rangeEndOffset)183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
184 {
185 if (!rangeEndContainer)
186 return 0;
187 if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
188 if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
189 return next;
190 }
191 for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
192 if (Node* next = node->nextSibling())
193 return next;
194 }
195 return 0;
196 }
197
198 // --------
199
fullyClipsContents(Node * node)200 static inline bool fullyClipsContents(Node* node)
201 {
202 RenderObject* renderer = node->renderer();
203 if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
204 return false;
205 return toRenderBox(renderer)->size().isEmpty();
206 }
207
ignoresContainerClip(Node * node)208 static inline bool ignoresContainerClip(Node* node)
209 {
210 RenderObject* renderer = node->renderer();
211 if (!renderer || renderer->isText())
212 return false;
213 EPosition position = renderer->style()->position();
214 return position == AbsolutePosition || position == FixedPosition;
215 }
216
pushFullyClippedState(BitStack & stack,Node * node)217 static void pushFullyClippedState(BitStack& stack, Node* node)
218 {
219 ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
220
221 // Push true if this node full clips its contents, or if a parent already has fully
222 // clipped and this is not a node that ignores its container's clip.
223 stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
224 }
225
setUpFullyClippedStack(BitStack & stack,Node * node)226 static void setUpFullyClippedStack(BitStack& stack, Node* node)
227 {
228 // Put the nodes in a vector so we can iterate in reverse order.
229 Vector<Node*, 100> ancestry;
230 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
231 ancestry.append(parent);
232
233 // Call pushFullyClippedState on each node starting with the earliest ancestor.
234 size_t size = ancestry.size();
235 for (size_t i = 0; i < size; ++i)
236 pushFullyClippedState(stack, ancestry[size - i - 1]);
237 pushFullyClippedState(stack, node);
238
239 ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
240 }
241
242 #if OS(ANDROID)
checkFormControlElement(Node * startNode)243 static bool checkFormControlElement(Node* startNode)
244 {
245 Node* node = startNode;
246 while (node) {
247 if (node->isElementNode() && static_cast<Element*>(node)->isFormControlElement())
248 return true;
249 node = node->parentNode();
250 }
251 return false;
252 }
253 #endif
254
255
256 // --------
257
TextIterator()258 TextIterator::TextIterator()
259 : m_startContainer(0)
260 , m_startOffset(0)
261 , m_endContainer(0)
262 , m_endOffset(0)
263 , m_positionNode(0)
264 , m_textCharacters(0)
265 , m_textLength(0)
266 , m_remainingTextBox(0)
267 , m_firstLetterText(0)
268 , m_lastCharacter(0)
269 , m_emitsCharactersBetweenAllVisiblePositions(false)
270 , m_entersTextControls(false)
271 , m_emitsTextWithoutTranscoding(false)
272 , m_handledFirstLetter(false)
273 , m_ignoresStyleVisibility(false)
274 , m_emitsObjectReplacementCharacters(false)
275 #if OS(ANDROID)
276 , m_stopsOnFormControls(false)
277 , m_shouldStop(false)
278 #endif
279 {
280 }
281
TextIterator(const Range * r,TextIteratorBehavior behavior)282 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
283 : m_startContainer(0)
284 , m_startOffset(0)
285 , m_endContainer(0)
286 , m_endOffset(0)
287 , m_positionNode(0)
288 , m_textCharacters(0)
289 , m_textLength(0)
290 #if OS(ANDROID)
291 , m_needsAnotherNewline(false)
292 #endif
293 , m_remainingTextBox(0)
294 , m_firstLetterText(0)
295 , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
296 , m_entersTextControls(behavior & TextIteratorEntersTextControls)
297 , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
298 , m_handledFirstLetter(false)
299 , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
300 , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
301 #if OS(ANDROID)
302 , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
303 , m_shouldStop(false)
304 #endif
305 {
306 if (!r)
307 return;
308
309 // get and validate the range endpoints
310 Node* startContainer = r->startContainer();
311 if (!startContainer)
312 return;
313 int startOffset = r->startOffset();
314 Node* endContainer = r->endContainer();
315 int endOffset = r->endOffset();
316
317 // Callers should be handing us well-formed ranges. If we discover that this isn't
318 // the case, we could consider changing this assertion to an early return.
319 ASSERT(r->boundaryPointsValid());
320
321 // remember range - this does not change
322 m_startContainer = startContainer;
323 m_startOffset = startOffset;
324 m_endContainer = endContainer;
325 m_endOffset = endOffset;
326
327 // set up the current node for processing
328 m_node = r->firstNode();
329 if (!m_node)
330 return;
331 setUpFullyClippedStack(m_fullyClippedStack, m_node);
332 m_offset = m_node == m_startContainer ? m_startOffset : 0;
333 m_handledNode = false;
334 m_handledChildren = false;
335
336 // calculate first out of bounds node
337 m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
338
339 // initialize node processing state
340 m_needsAnotherNewline = false;
341 m_textBox = 0;
342
343 // initialize record of previous node processing
344 m_hasEmitted = false;
345 m_lastTextNode = 0;
346 m_lastTextNodeEndedWithCollapsedSpace = false;
347 m_lastCharacter = 0;
348
349 #ifndef NDEBUG
350 // need this just because of the assert in advance()
351 m_positionNode = m_node;
352 #endif
353
354 // identify the first run
355 advance();
356 }
357
~TextIterator()358 TextIterator::~TextIterator()
359 {
360 }
361
atEnd() const362 bool TextIterator::atEnd() const
363 {
364 #if OS(ANDROID)
365 return !m_positionNode || m_shouldStop;
366 #else
367 return !m_positionNode;
368 #endif
369 }
370
advance()371 void TextIterator::advance()
372 {
373 #if OS(ANDROID)
374 if (m_shouldStop)
375 return;
376 #endif
377 // reset the run information
378 m_positionNode = 0;
379 m_textLength = 0;
380
381 // handle remembered node that needed a newline after the text node's newline
382 if (m_needsAnotherNewline) {
383 // Emit the extra newline, and position it *inside* m_node, after m_node's
384 // contents, in case it's a block, in the same way that we position the first
385 // newline. The range for the emitted newline should start where the line
386 // break begins.
387 // FIXME: It would be cleaner if we emitted two newlines during the last
388 // iteration, instead of using m_needsAnotherNewline.
389 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
390 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
391 m_needsAnotherNewline = false;
392 return;
393 }
394
395 if (!m_textBox && m_remainingTextBox) {
396 m_textBox = m_remainingTextBox;
397 m_remainingTextBox = 0;
398 m_firstLetterText = 0;
399 m_offset = 0;
400 }
401 // handle remembered text box
402 if (m_textBox) {
403 handleTextBox();
404 if (m_positionNode)
405 return;
406 }
407
408 while (m_node && m_node != m_pastEndNode) {
409 #if OS(ANDROID)
410 if (!m_shouldStop && m_stopsOnFormControls && checkFormControlElement(m_node))
411 m_shouldStop = true;
412 #endif
413 // if the range ends at offset 0 of an element, represent the
414 // position, but not the content, of that element e.g. if the
415 // node is a blockflow element, emit a newline that
416 // precedes the element
417 if (m_node == m_endContainer && m_endOffset == 0) {
418 representNodeOffsetZero();
419 m_node = 0;
420 return;
421 }
422
423 RenderObject* renderer = m_node->renderer();
424 if (!renderer) {
425 m_handledNode = true;
426 m_handledChildren = true;
427 } else {
428 // handle current node according to its type
429 if (!m_handledNode) {
430 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
431 m_handledNode = handleTextNode();
432 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
433 (renderer->node() && renderer->node()->isElementNode() &&
434 static_cast<Element*>(renderer->node())->isFormControlElement())))
435 m_handledNode = handleReplacedElement();
436 else
437 m_handledNode = handleNonTextNode();
438 if (m_positionNode)
439 return;
440 }
441 }
442
443 // find a new current node to handle in depth-first manner,
444 // calling exitNode() as we come back thru a parent node
445 Node* next = m_handledChildren ? 0 : m_node->firstChild();
446 m_offset = 0;
447 if (!next) {
448 next = m_node->nextSibling();
449 if (!next) {
450 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
451 Node* parentNode = m_node->parentOrHostNode();
452 while (!next && parentNode) {
453 if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
454 return;
455 bool haveRenderer = m_node->renderer();
456 m_node = parentNode;
457 m_fullyClippedStack.pop();
458 parentNode = m_node->parentOrHostNode();
459 if (haveRenderer)
460 exitNode();
461 if (m_positionNode) {
462 m_handledNode = true;
463 m_handledChildren = true;
464 return;
465 }
466 next = m_node->nextSibling();
467 }
468 }
469 m_fullyClippedStack.pop();
470 }
471
472 // set the new current node
473 m_node = next;
474 if (m_node)
475 pushFullyClippedState(m_fullyClippedStack, m_node);
476 m_handledNode = false;
477 m_handledChildren = false;
478 m_handledFirstLetter = false;
479 m_firstLetterText = 0;
480
481 // how would this ever be?
482 if (m_positionNode)
483 return;
484 }
485 }
486
handleTextNode()487 bool TextIterator::handleTextNode()
488 {
489 if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
490 return false;
491
492 RenderText* renderer = toRenderText(m_node->renderer());
493
494 m_lastTextNode = m_node;
495 String str = renderer->text();
496
497 // handle pre-formatted text
498 if (!renderer->style()->collapseWhiteSpace()) {
499 int runStart = m_offset;
500 if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
501 emitCharacter(' ', m_node, 0, runStart, runStart);
502 return false;
503 }
504 if (!m_handledFirstLetter && renderer->isTextFragment()) {
505 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
506 if (m_firstLetterText) {
507 String firstLetter = m_firstLetterText->text();
508 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
509 m_firstLetterText = 0;
510 m_textBox = 0;
511 return false;
512 }
513 }
514 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
515 return false;
516 int strLength = str.length();
517 int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
518 int runEnd = min(strLength, end);
519
520 if (runStart >= runEnd)
521 return true;
522
523 emitText(m_node, runStart, runEnd);
524 return true;
525 }
526
527 if (!renderer->firstTextBox() && str.length() > 0) {
528 if (!m_handledFirstLetter && renderer->isTextFragment()) {
529 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
530 if (m_firstLetterText) {
531 handleTextBox();
532 return false;
533 }
534 }
535 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
536 return false;
537 m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
538 return true;
539 }
540
541 // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
542 if (renderer->containsReversedText()) {
543 m_sortedTextBoxes.clear();
544 for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
545 m_sortedTextBoxes.append(textBox);
546 }
547 std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
548 m_sortedTextBoxesPosition = 0;
549 }
550
551 m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
552 if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
553 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
554 handleTextBox();
555 return true;
556 }
557
handleTextBox()558 void TextIterator::handleTextBox()
559 {
560 RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
561 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
562 m_textBox = 0;
563 return;
564 }
565 String str = renderer->text();
566 unsigned start = m_offset;
567 unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
568 while (m_textBox) {
569 unsigned textBoxStart = m_textBox->start();
570 unsigned runStart = max(textBoxStart, start);
571
572 // Check for collapsed space at the start of this run.
573 InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
574 bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
575 || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
576 if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
577 if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
578 unsigned spaceRunStart = runStart - 1;
579 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
580 --spaceRunStart;
581 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
582 } else
583 emitCharacter(' ', m_node, 0, runStart, runStart);
584 return;
585 }
586 unsigned textBoxEnd = textBoxStart + m_textBox->len();
587 unsigned runEnd = min(textBoxEnd, end);
588
589 // Determine what the next text box will be, but don't advance yet
590 InlineTextBox* nextTextBox = 0;
591 if (renderer->containsReversedText()) {
592 if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
593 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
594 } else
595 nextTextBox = m_textBox->nextTextBox();
596
597 if (runStart < runEnd) {
598 // Handle either a single newline character (which becomes a space),
599 // or a run of characters that does not include a newline.
600 // This effectively translates newlines to spaces without copying the text.
601 if (str[runStart] == '\n') {
602 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
603 m_offset = runStart + 1;
604 } else {
605 size_t subrunEnd = str.find('\n', runStart);
606 if (subrunEnd == notFound || subrunEnd > runEnd)
607 subrunEnd = runEnd;
608
609 m_offset = subrunEnd;
610 emitText(m_node, renderer, runStart, subrunEnd);
611 }
612
613 // If we are doing a subrun that doesn't go to the end of the text box,
614 // come back again to finish handling this text box; don't advance to the next one.
615 if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
616 return;
617
618 // Advance and return
619 unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
620 if (nextRunStart > runEnd)
621 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
622 m_textBox = nextTextBox;
623 if (renderer->containsReversedText())
624 ++m_sortedTextBoxesPosition;
625 return;
626 }
627 // Advance and continue
628 m_textBox = nextTextBox;
629 if (renderer->containsReversedText())
630 ++m_sortedTextBoxesPosition;
631 }
632 if (!m_textBox && m_remainingTextBox) {
633 m_textBox = m_remainingTextBox;
634 m_remainingTextBox = 0;
635 m_firstLetterText = 0;
636 m_offset = 0;
637 handleTextBox();
638 }
639 }
640
handleTextNodeFirstLetter(RenderTextFragment * renderer)641 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
642 {
643 if (renderer->firstLetter()) {
644 RenderObject* r = renderer->firstLetter();
645 if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
646 return;
647 for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) {
648 if (currChild->isText()) {
649 RenderText* firstLetter = toRenderText(currChild);
650 m_handledFirstLetter = true;
651 m_remainingTextBox = m_textBox;
652 m_textBox = firstLetter->firstTextBox();
653 m_firstLetterText = firstLetter;
654 return;
655 }
656 }
657 }
658 m_handledFirstLetter = true;
659 }
660
handleReplacedElement()661 bool TextIterator::handleReplacedElement()
662 {
663 if (m_fullyClippedStack.top())
664 return false;
665
666 RenderObject* renderer = m_node->renderer();
667 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
668 return false;
669
670 if (m_lastTextNodeEndedWithCollapsedSpace) {
671 emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
672 return false;
673 }
674
675 if (m_entersTextControls && renderer->isTextControl()) {
676 if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
677 m_node = innerTextElement->shadowTreeRootNode();
678 pushFullyClippedState(m_fullyClippedStack, m_node);
679 m_offset = 0;
680 return false;
681 }
682 }
683
684 m_hasEmitted = true;
685
686 if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
687 emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
688 return true;
689 }
690
691 if (m_emitsCharactersBetweenAllVisiblePositions) {
692 // We want replaced elements to behave like punctuation for boundary
693 // finding, and to simply take up space for the selection preservation
694 // code in moveParagraphs, so we use a comma.
695 emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
696 return true;
697 }
698
699 m_positionNode = m_node->parentNode();
700 m_positionOffsetBaseNode = m_node;
701 m_positionStartOffset = 0;
702 m_positionEndOffset = 1;
703
704 m_textCharacters = 0;
705 m_textLength = 0;
706
707 m_lastCharacter = 0;
708
709 return true;
710 }
711
hasVisibleTextNode(RenderText * renderer)712 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
713 {
714 if (renderer->style()->visibility() == VISIBLE)
715 return true;
716 if (renderer->isTextFragment()) {
717 RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
718 if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
719 return true;
720 }
721 return false;
722 }
723
shouldEmitTabBeforeNode(Node * node)724 static bool shouldEmitTabBeforeNode(Node* node)
725 {
726 RenderObject* r = node->renderer();
727
728 // Table cells are delimited by tabs.
729 if (!r || !isTableCell(node))
730 return false;
731
732 // Want a tab before every cell other than the first one
733 RenderTableCell* rc = toRenderTableCell(r);
734 RenderTable* t = rc->table();
735 return t && (t->cellBefore(rc) || t->cellAbove(rc));
736 }
737
shouldEmitNewlineForNode(Node * node)738 static bool shouldEmitNewlineForNode(Node* node)
739 {
740 // br elements are represented by a single newline.
741 RenderObject* r = node->renderer();
742 if (!r)
743 return node->hasTagName(brTag);
744
745 return r->isBR();
746 }
747
shouldEmitNewlinesBeforeAndAfterNode(Node * node)748 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
749 {
750 // Block flow (versus inline flow) is represented by having
751 // a newline both before and after the element.
752 RenderObject* r = node->renderer();
753 if (!r) {
754 return (node->hasTagName(blockquoteTag)
755 || node->hasTagName(ddTag)
756 || node->hasTagName(divTag)
757 || node->hasTagName(dlTag)
758 || node->hasTagName(dtTag)
759 || node->hasTagName(h1Tag)
760 || node->hasTagName(h2Tag)
761 || node->hasTagName(h3Tag)
762 || node->hasTagName(h4Tag)
763 || node->hasTagName(h5Tag)
764 || node->hasTagName(h6Tag)
765 || node->hasTagName(hrTag)
766 || node->hasTagName(liTag)
767 || node->hasTagName(listingTag)
768 || node->hasTagName(olTag)
769 || node->hasTagName(pTag)
770 || node->hasTagName(preTag)
771 || node->hasTagName(trTag)
772 || node->hasTagName(ulTag));
773 }
774
775 // Need to make an exception for table cells, because they are blocks, but we
776 // want them tab-delimited rather than having newlines before and after.
777 if (isTableCell(node))
778 return false;
779
780 // Need to make an exception for table row elements, because they are neither
781 // "inline" or "RenderBlock", but we want newlines for them.
782 if (r->isTableRow()) {
783 RenderTable* t = toRenderTableRow(r)->table();
784 if (t && !t->isInline())
785 return true;
786 }
787
788 return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
789 }
790
shouldEmitNewlineAfterNode(Node * node)791 static bool shouldEmitNewlineAfterNode(Node* node)
792 {
793 // FIXME: It should be better but slower to create a VisiblePosition here.
794 if (!shouldEmitNewlinesBeforeAndAfterNode(node))
795 return false;
796 // Check if this is the very last renderer in the document.
797 // If so, then we should not emit a newline.
798 while ((node = node->traverseNextSibling()))
799 if (node->renderer())
800 return true;
801 return false;
802 }
803
shouldEmitNewlineBeforeNode(Node * node)804 static bool shouldEmitNewlineBeforeNode(Node* node)
805 {
806 return shouldEmitNewlinesBeforeAndAfterNode(node);
807 }
808
shouldEmitExtraNewlineForNode(Node * node)809 static bool shouldEmitExtraNewlineForNode(Node* node)
810 {
811 // When there is a significant collapsed bottom margin, emit an extra
812 // newline for a more realistic result. We end up getting the right
813 // result even without margin collapsing. For example: <div><p>text</p></div>
814 // will work right even if both the <div> and the <p> have bottom margins.
815 RenderObject* r = node->renderer();
816 if (!r || !r->isBox())
817 return false;
818
819 // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
820 // not to do this at all
821 if (node->hasTagName(h1Tag)
822 || node->hasTagName(h2Tag)
823 || node->hasTagName(h3Tag)
824 || node->hasTagName(h4Tag)
825 || node->hasTagName(h5Tag)
826 || node->hasTagName(h6Tag)
827 || node->hasTagName(pTag)) {
828 RenderStyle* style = r->style();
829 if (style) {
830 int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
831 int fontSize = style->fontDescription().computedPixelSize();
832 if (bottomMargin * 2 >= fontSize)
833 return true;
834 }
835 }
836
837 return false;
838 }
839
collapsedSpaceLength(RenderText * renderer,int textEnd)840 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
841 {
842 const UChar* characters = renderer->text()->characters();
843 int length = renderer->text()->length();
844 for (int i = textEnd; i < length; ++i) {
845 if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
846 return i - textEnd;
847 }
848
849 return length - textEnd;
850 }
851
maxOffsetIncludingCollapsedSpaces(Node * node)852 static int maxOffsetIncludingCollapsedSpaces(Node* node)
853 {
854 int offset = caretMaxOffset(node);
855
856 if (node->renderer() && node->renderer()->isText())
857 offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
858
859 return offset;
860 }
861
862 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
shouldRepresentNodeOffsetZero()863 bool TextIterator::shouldRepresentNodeOffsetZero()
864 {
865 if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
866 return true;
867
868 // Leave element positioned flush with start of a paragraph
869 // (e.g. do not insert tab before a table cell at the start of a paragraph)
870 if (m_lastCharacter == '\n')
871 return false;
872
873 // Otherwise, show the position if we have emitted any characters
874 if (m_hasEmitted)
875 return true;
876
877 // We've not emitted anything yet. Generally, there is no need for any positioning then.
878 // The only exception is when the element is visually not in the same line as
879 // the start of the range (e.g. the range starts at the end of the previous paragraph).
880 // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
881 // make quicker checks to possibly avoid that. Another check that we could make is
882 // is whether the inline vs block flow changed since the previous visible element.
883 // I think we're already in a special enough case that that won't be needed, tho.
884
885 // No character needed if this is the first node in the range.
886 if (m_node == m_startContainer)
887 return false;
888
889 // If we are outside the start container's subtree, assume we need to emit.
890 // FIXME: m_startContainer could be an inline block
891 if (!m_node->isDescendantOf(m_startContainer))
892 return true;
893
894 // If we started as m_startContainer offset 0 and the current node is a descendant of
895 // the start container, we already had enough context to correctly decide whether to
896 // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
897 // so don't second guess that now.
898 // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
899 // immaterial since we likely would have already emitted something by now.
900 if (m_startOffset == 0)
901 return false;
902
903 // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
904 // Additionally, if the range we are iterating over contains huge sections of unrendered content,
905 // we would create VisiblePositions on every call to this function without this check.
906 if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
907 return false;
908
909 // The startPos.isNotNull() check is needed because the start could be before the body,
910 // and in that case we'll get null. We don't want to put in newlines at the start in that case.
911 // The currPos.isNotNull() check is needed because positions in non-HTML content
912 // (like SVG) do not have visible positions, and we don't want to emit for them either.
913 VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
914 VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
915 return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
916 }
917
shouldEmitSpaceBeforeAndAfterNode(Node * node)918 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
919 {
920 return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
921 }
922
representNodeOffsetZero()923 void TextIterator::representNodeOffsetZero()
924 {
925 // Emit a character to show the positioning of m_node.
926
927 // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
928 // create VisiblePositions, which is expensive. So, we perform the inexpensive checks
929 // on m_node to see if it necessitates emitting a character first and will early return
930 // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
931 if (shouldEmitTabBeforeNode(m_node)) {
932 if (shouldRepresentNodeOffsetZero())
933 emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
934 } else if (shouldEmitNewlineBeforeNode(m_node)) {
935 if (shouldRepresentNodeOffsetZero())
936 emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
937 } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
938 if (shouldRepresentNodeOffsetZero())
939 emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
940 }
941 }
942
handleNonTextNode()943 bool TextIterator::handleNonTextNode()
944 {
945 if (shouldEmitNewlineForNode(m_node))
946 emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
947 else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
948 emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
949 else
950 representNodeOffsetZero();
951
952 return true;
953 }
954
exitNode()955 void TextIterator::exitNode()
956 {
957 // prevent emitting a newline when exiting a collapsed block at beginning of the range
958 // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
959 // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
960 // therefore look like a blank line.
961 if (!m_hasEmitted)
962 return;
963
964 // Emit with a position *inside* m_node, after m_node's contents, in
965 // case it is a block, because the run should start where the
966 // emitted character is positioned visually.
967 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
968 // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
969 // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use
970 // TextIterator in _web_attributedStringFromRange.
971 // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
972 if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
973 // use extra newline to represent margin bottom, as needed
974 bool addNewline = shouldEmitExtraNewlineForNode(m_node);
975
976 // FIXME: We need to emit a '\n' as we leave an empty block(s) that
977 // contain a VisiblePosition when doing selection preservation.
978 if (m_lastCharacter != '\n') {
979 // insert a newline with a position following this block's contents.
980 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
981 // remember whether to later add a newline for the current node
982 ASSERT(!m_needsAnotherNewline);
983 m_needsAnotherNewline = addNewline;
984 } else if (addNewline)
985 // insert a newline with a position following this block's contents.
986 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
987 }
988
989 // If nothing was emitted, see if we need to emit a space.
990 if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
991 emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
992 }
993
emitCharacter(UChar c,Node * textNode,Node * offsetBaseNode,int textStartOffset,int textEndOffset)994 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
995 {
996 m_hasEmitted = true;
997
998 // remember information with which to construct the TextIterator::range()
999 // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
1000 m_positionNode = textNode;
1001 m_positionOffsetBaseNode = offsetBaseNode;
1002 m_positionStartOffset = textStartOffset;
1003 m_positionEndOffset = textEndOffset;
1004
1005 // remember information with which to construct the TextIterator::characters() and length()
1006 m_singleCharacterBuffer = c;
1007 m_textCharacters = &m_singleCharacterBuffer;
1008 m_textLength = 1;
1009
1010 // remember some iteration state
1011 m_lastTextNodeEndedWithCollapsedSpace = false;
1012 m_lastCharacter = c;
1013 }
1014
emitText(Node * textNode,RenderObject * renderObject,int textStartOffset,int textEndOffset)1015 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
1016 {
1017 RenderText* renderer = toRenderText(renderObject);
1018 m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
1019 ASSERT(m_text.characters());
1020
1021 m_positionNode = textNode;
1022 m_positionOffsetBaseNode = 0;
1023 m_positionStartOffset = textStartOffset;
1024 m_positionEndOffset = textEndOffset;
1025 m_textCharacters = m_text.characters() + textStartOffset;
1026 m_textLength = textEndOffset - textStartOffset;
1027 m_lastCharacter = m_text[textEndOffset - 1];
1028
1029 m_lastTextNodeEndedWithCollapsedSpace = false;
1030 m_hasEmitted = true;
1031 }
1032
emitText(Node * textNode,int textStartOffset,int textEndOffset)1033 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
1034 {
1035 emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
1036 }
1037
range() const1038 PassRefPtr<Range> TextIterator::range() const
1039 {
1040 // use the current run information, if we have it
1041 if (m_positionNode) {
1042 if (m_positionOffsetBaseNode) {
1043 int index = m_positionOffsetBaseNode->nodeIndex();
1044 m_positionStartOffset += index;
1045 m_positionEndOffset += index;
1046 m_positionOffsetBaseNode = 0;
1047 }
1048 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1049 }
1050
1051 // otherwise, return the end of the overall range we were given
1052 if (m_endContainer)
1053 return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
1054
1055 return 0;
1056 }
1057
node() const1058 Node* TextIterator::node() const
1059 {
1060 RefPtr<Range> textRange = range();
1061 if (!textRange)
1062 return 0;
1063
1064 Node* node = textRange->startContainer();
1065 if (!node)
1066 return 0;
1067 if (node->offsetInCharacters())
1068 return node;
1069
1070 return node->childNode(textRange->startOffset());
1071 }
1072
1073 // --------
1074
SimplifiedBackwardsTextIterator()1075 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
1076 : m_behavior(TextIteratorDefaultBehavior)
1077 , m_node(0)
1078 , m_positionNode(0)
1079 #if OS(ANDROID)
1080 , m_stopsOnFormControls(false)
1081 , m_shouldStop(false)
1082 #endif
1083 {
1084 }
1085
SimplifiedBackwardsTextIterator(const Range * r,TextIteratorBehavior behavior)1086 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
1087 : m_behavior(behavior)
1088 , m_node(0)
1089 , m_positionNode(0)
1090 #if OS(ANDROID)
1091 , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
1092 , m_shouldStop(false)
1093 #endif
1094 {
1095 #if OS(ANDROID)
1096 ASSERT(m_behavior == TextIteratorDefaultBehavior || m_behavior == TextIteratorStopsOnFormControls);
1097 #else
1098 ASSERT(m_behavior == TextIteratorDefaultBehavior);
1099 #endif
1100
1101 if (!r)
1102 return;
1103
1104 Node* startNode = r->startContainer();
1105 if (!startNode)
1106 return;
1107 Node* endNode = r->endContainer();
1108 int startOffset = r->startOffset();
1109 int endOffset = r->endOffset();
1110
1111 if (!startNode->offsetInCharacters()) {
1112 if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
1113 startNode = startNode->childNode(startOffset);
1114 startOffset = 0;
1115 }
1116 }
1117 if (!endNode->offsetInCharacters()) {
1118 if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
1119 endNode = endNode->childNode(endOffset - 1);
1120 endOffset = lastOffsetInNode(endNode);
1121 }
1122 }
1123
1124 m_node = endNode;
1125 setUpFullyClippedStack(m_fullyClippedStack, m_node);
1126 m_offset = endOffset;
1127 m_handledNode = false;
1128 m_handledChildren = endOffset == 0;
1129
1130 m_startNode = startNode;
1131 m_startOffset = startOffset;
1132 m_endNode = endNode;
1133 m_endOffset = endOffset;
1134
1135 #ifndef NDEBUG
1136 // Need this just because of the assert.
1137 m_positionNode = endNode;
1138 #endif
1139
1140 m_lastTextNode = 0;
1141 m_lastCharacter = '\n';
1142
1143 m_havePassedStartNode = false;
1144
1145 advance();
1146 }
1147
atEnd() const1148 bool SimplifiedBackwardsTextIterator::atEnd() const
1149 {
1150 #if OS(ANDROID)
1151 return !m_positionNode || m_shouldStop;
1152 #else
1153 return !m_positionNode;
1154 #endif
1155 }
1156
advance()1157 void SimplifiedBackwardsTextIterator::advance()
1158 {
1159 ASSERT(m_positionNode);
1160
1161 #if OS(ANDROID)
1162 if (m_shouldStop)
1163 return;
1164
1165 // Prevent changing the iterator position if a form control element was found and advance should stop on it.
1166 if (m_stopsOnFormControls && checkFormControlElement(m_node)) {
1167 m_shouldStop = true;
1168 return;
1169 }
1170 #endif
1171
1172 m_positionNode = 0;
1173 m_textLength = 0;
1174
1175 while (m_node && !m_havePassedStartNode) {
1176 // Don't handle node if we start iterating at [node, 0].
1177 if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
1178 RenderObject* renderer = m_node->renderer();
1179 if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
1180 // FIXME: What about CDATA_SECTION_NODE?
1181 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1182 m_handledNode = handleTextNode();
1183 } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1184 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1185 m_handledNode = handleReplacedElement();
1186 } else
1187 m_handledNode = handleNonTextNode();
1188 if (m_positionNode)
1189 return;
1190 }
1191
1192 if (!m_handledChildren && m_node->hasChildNodes()) {
1193 m_node = m_node->lastChild();
1194 pushFullyClippedState(m_fullyClippedStack, m_node);
1195 } else {
1196 // Exit empty containers as we pass over them or containers
1197 // where [container, 0] is where we started iterating.
1198 if (!m_handledNode
1199 && canHaveChildrenForEditing(m_node)
1200 && m_node->parentNode()
1201 && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
1202 exitNode();
1203 if (m_positionNode) {
1204 m_handledNode = true;
1205 m_handledChildren = true;
1206 return;
1207 }
1208 }
1209
1210 // Exit all other containers.
1211 while (!m_node->previousSibling()) {
1212 if (!advanceRespectingRange(m_node->parentOrHostNode()))
1213 break;
1214 m_fullyClippedStack.pop();
1215 exitNode();
1216 if (m_positionNode) {
1217 m_handledNode = true;
1218 m_handledChildren = true;
1219 return;
1220 }
1221 }
1222
1223 m_fullyClippedStack.pop();
1224 if (advanceRespectingRange(m_node->previousSibling()))
1225 pushFullyClippedState(m_fullyClippedStack, m_node);
1226 else
1227 m_node = 0;
1228 }
1229
1230 // For the purpose of word boundary detection,
1231 // we should iterate all visible text and trailing (collapsed) whitespaces.
1232 m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
1233 m_handledNode = false;
1234 m_handledChildren = false;
1235
1236 if (m_positionNode)
1237 return;
1238 }
1239 }
1240
handleTextNode()1241 bool SimplifiedBackwardsTextIterator::handleTextNode()
1242 {
1243 m_lastTextNode = m_node;
1244
1245 RenderText* renderer = toRenderText(m_node->renderer());
1246 String str = renderer->text();
1247
1248 if (!renderer->firstTextBox() && str.length() > 0)
1249 return true;
1250
1251 m_positionEndOffset = m_offset;
1252
1253 m_offset = (m_node == m_startNode) ? m_startOffset : 0;
1254 m_positionNode = m_node;
1255 m_positionStartOffset = m_offset;
1256 m_textLength = m_positionEndOffset - m_positionStartOffset;
1257 m_textCharacters = str.characters() + m_positionStartOffset;
1258
1259 m_lastCharacter = str[m_positionEndOffset - 1];
1260
1261 return true;
1262 }
1263
handleReplacedElement()1264 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1265 {
1266 unsigned index = m_node->nodeIndex();
1267 // We want replaced elements to behave like punctuation for boundary
1268 // finding, and to simply take up space for the selection preservation
1269 // code in moveParagraphs, so we use a comma. Unconditionally emit
1270 // here because this iterator is only used for boundary finding.
1271 emitCharacter(',', m_node->parentNode(), index, index + 1);
1272 return true;
1273 }
1274
handleNonTextNode()1275 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1276 {
1277 // We can use a linefeed in place of a tab because this simple iterator is only used to
1278 // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs.
1279 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1280 unsigned index = m_node->nodeIndex();
1281 // The start of this emitted range is wrong. Ensuring correctness would require
1282 // VisiblePositions and so would be slow. previousBoundary expects this.
1283 emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
1284 }
1285 return true;
1286 }
1287
exitNode()1288 void SimplifiedBackwardsTextIterator::exitNode()
1289 {
1290 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1291 // The start of this emitted range is wrong. Ensuring correctness would require
1292 // VisiblePositions and so would be slow. previousBoundary expects this.
1293 emitCharacter('\n', m_node, 0, 0);
1294 }
1295 }
1296
emitCharacter(UChar c,Node * node,int startOffset,int endOffset)1297 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
1298 {
1299 m_singleCharacterBuffer = c;
1300 m_positionNode = node;
1301 m_positionStartOffset = startOffset;
1302 m_positionEndOffset = endOffset;
1303 m_textCharacters = &m_singleCharacterBuffer;
1304 m_textLength = 1;
1305 m_lastCharacter = c;
1306 }
1307
advanceRespectingRange(Node * next)1308 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1309 {
1310 if (!next)
1311 return false;
1312 m_havePassedStartNode |= m_node == m_startNode;
1313 if (m_havePassedStartNode)
1314 return false;
1315 m_node = next;
1316 return true;
1317 }
1318
range() const1319 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
1320 {
1321 if (m_positionNode)
1322 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1323
1324 return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
1325 }
1326
1327 // --------
1328
CharacterIterator()1329 CharacterIterator::CharacterIterator()
1330 : m_offset(0)
1331 , m_runOffset(0)
1332 , m_atBreak(true)
1333 {
1334 }
1335
CharacterIterator(const Range * r,TextIteratorBehavior behavior)1336 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
1337 : m_offset(0)
1338 , m_runOffset(0)
1339 , m_atBreak(true)
1340 , m_textIterator(r, behavior)
1341 {
1342 while (!atEnd() && m_textIterator.length() == 0)
1343 m_textIterator.advance();
1344 }
1345
range() const1346 PassRefPtr<Range> CharacterIterator::range() const
1347 {
1348 RefPtr<Range> r = m_textIterator.range();
1349 if (!m_textIterator.atEnd()) {
1350 if (m_textIterator.length() <= 1) {
1351 ASSERT(m_runOffset == 0);
1352 } else {
1353 Node* n = r->startContainer();
1354 ASSERT(n == r->endContainer());
1355 int offset = r->startOffset() + m_runOffset;
1356 ExceptionCode ec = 0;
1357 r->setStart(n, offset, ec);
1358 r->setEnd(n, offset + 1, ec);
1359 ASSERT(!ec);
1360 }
1361 }
1362 return r.release();
1363 }
1364
advance(int count)1365 void CharacterIterator::advance(int count)
1366 {
1367 if (count <= 0) {
1368 ASSERT(count == 0);
1369 return;
1370 }
1371
1372 m_atBreak = false;
1373
1374 // easy if there is enough left in the current m_textIterator run
1375 int remaining = m_textIterator.length() - m_runOffset;
1376 if (count < remaining) {
1377 m_runOffset += count;
1378 m_offset += count;
1379 return;
1380 }
1381
1382 // exhaust the current m_textIterator run
1383 count -= remaining;
1384 m_offset += remaining;
1385
1386 // move to a subsequent m_textIterator run
1387 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1388 int runLength = m_textIterator.length();
1389 if (runLength == 0)
1390 m_atBreak = true;
1391 else {
1392 // see whether this is m_textIterator to use
1393 if (count < runLength) {
1394 m_runOffset = count;
1395 m_offset += count;
1396 return;
1397 }
1398
1399 // exhaust this m_textIterator run
1400 count -= runLength;
1401 m_offset += runLength;
1402 }
1403 }
1404
1405 // ran to the end of the m_textIterator... no more runs left
1406 m_atBreak = true;
1407 m_runOffset = 0;
1408 }
1409
string(int numChars)1410 String CharacterIterator::string(int numChars)
1411 {
1412 Vector<UChar> result;
1413 result.reserveInitialCapacity(numChars);
1414 while (numChars > 0 && !atEnd()) {
1415 int runSize = min(numChars, length());
1416 result.append(characters(), runSize);
1417 numChars -= runSize;
1418 advance(runSize);
1419 }
1420 return String::adopt(result);
1421 }
1422
characterSubrange(CharacterIterator & it,int offset,int length)1423 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1424 {
1425 it.advance(offset);
1426 RefPtr<Range> start = it.range();
1427
1428 if (length > 1)
1429 it.advance(length - 1);
1430 RefPtr<Range> end = it.range();
1431
1432 return Range::create(start->startContainer()->document(),
1433 start->startContainer(), start->startOffset(),
1434 end->endContainer(), end->endOffset());
1435 }
1436
BackwardsCharacterIterator()1437 BackwardsCharacterIterator::BackwardsCharacterIterator()
1438 : m_offset(0)
1439 , m_runOffset(0)
1440 , m_atBreak(true)
1441 {
1442 }
1443
BackwardsCharacterIterator(const Range * range,TextIteratorBehavior behavior)1444 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
1445 : m_offset(0)
1446 , m_runOffset(0)
1447 , m_atBreak(true)
1448 , m_textIterator(range, behavior)
1449 {
1450 while (!atEnd() && !m_textIterator.length())
1451 m_textIterator.advance();
1452 }
1453
range() const1454 PassRefPtr<Range> BackwardsCharacterIterator::range() const
1455 {
1456 RefPtr<Range> r = m_textIterator.range();
1457 if (!m_textIterator.atEnd()) {
1458 if (m_textIterator.length() <= 1)
1459 ASSERT(m_runOffset == 0);
1460 else {
1461 Node* n = r->startContainer();
1462 ASSERT(n == r->endContainer());
1463 int offset = r->endOffset() - m_runOffset;
1464 ExceptionCode ec = 0;
1465 r->setStart(n, offset - 1, ec);
1466 r->setEnd(n, offset, ec);
1467 ASSERT(!ec);
1468 }
1469 }
1470 return r.release();
1471 }
1472
advance(int count)1473 void BackwardsCharacterIterator::advance(int count)
1474 {
1475 if (count <= 0) {
1476 ASSERT(!count);
1477 return;
1478 }
1479
1480 m_atBreak = false;
1481
1482 int remaining = m_textIterator.length() - m_runOffset;
1483 if (count < remaining) {
1484 m_runOffset += count;
1485 m_offset += count;
1486 return;
1487 }
1488
1489 count -= remaining;
1490 m_offset += remaining;
1491
1492 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1493 int runLength = m_textIterator.length();
1494 if (runLength == 0)
1495 m_atBreak = true;
1496 else {
1497 if (count < runLength) {
1498 m_runOffset = count;
1499 m_offset += count;
1500 return;
1501 }
1502
1503 count -= runLength;
1504 m_offset += runLength;
1505 }
1506 }
1507
1508 m_atBreak = true;
1509 m_runOffset = 0;
1510 }
1511
1512 // --------
1513
WordAwareIterator()1514 WordAwareIterator::WordAwareIterator()
1515 : m_previousText(0)
1516 , m_didLookAhead(false)
1517 {
1518 }
1519
WordAwareIterator(const Range * r)1520 WordAwareIterator::WordAwareIterator(const Range* r)
1521 : m_previousText(0)
1522 , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1523 , m_textIterator(r)
1524 {
1525 advance(); // get in position over the first chunk of text
1526 }
1527
~WordAwareIterator()1528 WordAwareIterator::~WordAwareIterator()
1529 {
1530 }
1531
1532 // We're always in one of these modes:
1533 // - The current chunk in the text iterator is our current chunk
1534 // (typically its a piece of whitespace, or text that ended with whitespace)
1535 // - The previous chunk in the text iterator is our current chunk
1536 // (we looked ahead to the next chunk and found a word boundary)
1537 // - We built up our own chunk of text from many chunks from the text iterator
1538
1539 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1540
advance()1541 void WordAwareIterator::advance()
1542 {
1543 m_previousText = 0;
1544 m_buffer.clear(); // toss any old buffer we built up
1545
1546 // If last time we did a look-ahead, start with that looked-ahead chunk now
1547 if (!m_didLookAhead) {
1548 ASSERT(!m_textIterator.atEnd());
1549 m_textIterator.advance();
1550 }
1551 m_didLookAhead = false;
1552
1553 // Go to next non-empty chunk
1554 while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1555 m_textIterator.advance();
1556 m_range = m_textIterator.range();
1557
1558 if (m_textIterator.atEnd())
1559 return;
1560
1561 while (1) {
1562 // If this chunk ends in whitespace we can just use it as our chunk.
1563 if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1564 return;
1565
1566 // If this is the first chunk that failed, save it in previousText before look ahead
1567 if (m_buffer.isEmpty()) {
1568 m_previousText = m_textIterator.characters();
1569 m_previousLength = m_textIterator.length();
1570 }
1571
1572 // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff
1573 m_textIterator.advance();
1574 if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1575 m_didLookAhead = true;
1576 return;
1577 }
1578
1579 if (m_buffer.isEmpty()) {
1580 // Start gobbling chunks until we get to a suitable stopping point
1581 m_buffer.append(m_previousText, m_previousLength);
1582 m_previousText = 0;
1583 }
1584 m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1585 int exception = 0;
1586 m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1587 }
1588 }
1589
length() const1590 int WordAwareIterator::length() const
1591 {
1592 if (!m_buffer.isEmpty())
1593 return m_buffer.size();
1594 if (m_previousText)
1595 return m_previousLength;
1596 return m_textIterator.length();
1597 }
1598
characters() const1599 const UChar* WordAwareIterator::characters() const
1600 {
1601 if (!m_buffer.isEmpty())
1602 return m_buffer.data();
1603 if (m_previousText)
1604 return m_previousText;
1605 return m_textIterator.characters();
1606 }
1607
1608 // --------
1609
foldQuoteMarkOrSoftHyphen(UChar c)1610 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
1611 {
1612 switch (c) {
1613 case hebrewPunctuationGershayim:
1614 case leftDoubleQuotationMark:
1615 case rightDoubleQuotationMark:
1616 return '"';
1617 case hebrewPunctuationGeresh:
1618 case leftSingleQuotationMark:
1619 case rightSingleQuotationMark:
1620 return '\'';
1621 case softHyphen:
1622 // Replace soft hyphen with an ignorable character so that their presence or absence will
1623 // not affect string comparison.
1624 return 0;
1625 default:
1626 return c;
1627 }
1628 }
1629
foldQuoteMarksAndSoftHyphens(String & s)1630 static inline void foldQuoteMarksAndSoftHyphens(String& s)
1631 {
1632 s.replace(hebrewPunctuationGeresh, '\'');
1633 s.replace(hebrewPunctuationGershayim, '"');
1634 s.replace(leftDoubleQuotationMark, '"');
1635 s.replace(leftSingleQuotationMark, '\'');
1636 s.replace(rightDoubleQuotationMark, '"');
1637 s.replace(rightSingleQuotationMark, '\'');
1638 // Replace soft hyphen with an ignorable character so that their presence or absence will
1639 // not affect string comparison.
1640 s.replace(softHyphen, 0);
1641 }
1642
1643 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1644
foldQuoteMarksAndSoftHyphens(UChar * data,size_t length)1645 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
1646 {
1647 for (size_t i = 0; i < length; ++i)
1648 data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
1649 }
1650
1651 static const size_t minimumSearchBufferSize = 8192;
1652
1653 #ifndef NDEBUG
1654 static bool searcherInUse;
1655 #endif
1656
createSearcher()1657 static UStringSearch* createSearcher()
1658 {
1659 // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1660 // but it doesn't matter exactly what it is, since we don't perform any searches
1661 // without setting both the pattern and the text.
1662 UErrorCode status = U_ZERO_ERROR;
1663 UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
1664 ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1665 return searcher;
1666 }
1667
searcher()1668 static UStringSearch* searcher()
1669 {
1670 static UStringSearch* searcher = createSearcher();
1671 return searcher;
1672 }
1673
lockSearcher()1674 static inline void lockSearcher()
1675 {
1676 #ifndef NDEBUG
1677 ASSERT(!searcherInUse);
1678 searcherInUse = true;
1679 #endif
1680 }
1681
unlockSearcher()1682 static inline void unlockSearcher()
1683 {
1684 #ifndef NDEBUG
1685 ASSERT(searcherInUse);
1686 searcherInUse = false;
1687 #endif
1688 }
1689
1690 // ICU's search ignores the distinction between small kana letters and ones
1691 // that are not small, and also characters that differ only in the voicing
1692 // marks when considering only primary collation strength diffrences.
1693 // This is not helpful for end users, since these differences make words
1694 // distinct, so for our purposes we need these to be considered.
1695 // The Unicode folks do not think the collation algorithm should be
1696 // changed. To work around this, we would like to tailor the ICU searcher,
1697 // but we can't get that to work yet. So instead, we check for cases where
1698 // these differences occur, and skip those matches.
1699
1700 // We refer to the above technique as the "kana workaround". The next few
1701 // functions are helper functinos for the kana workaround.
1702
isKanaLetter(UChar character)1703 static inline bool isKanaLetter(UChar character)
1704 {
1705 // Hiragana letters.
1706 if (character >= 0x3041 && character <= 0x3096)
1707 return true;
1708
1709 // Katakana letters.
1710 if (character >= 0x30A1 && character <= 0x30FA)
1711 return true;
1712 if (character >= 0x31F0 && character <= 0x31FF)
1713 return true;
1714
1715 // Halfwidth katakana letters.
1716 if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1717 return true;
1718
1719 return false;
1720 }
1721
isSmallKanaLetter(UChar character)1722 static inline bool isSmallKanaLetter(UChar character)
1723 {
1724 ASSERT(isKanaLetter(character));
1725
1726 switch (character) {
1727 case 0x3041: // HIRAGANA LETTER SMALL A
1728 case 0x3043: // HIRAGANA LETTER SMALL I
1729 case 0x3045: // HIRAGANA LETTER SMALL U
1730 case 0x3047: // HIRAGANA LETTER SMALL E
1731 case 0x3049: // HIRAGANA LETTER SMALL O
1732 case 0x3063: // HIRAGANA LETTER SMALL TU
1733 case 0x3083: // HIRAGANA LETTER SMALL YA
1734 case 0x3085: // HIRAGANA LETTER SMALL YU
1735 case 0x3087: // HIRAGANA LETTER SMALL YO
1736 case 0x308E: // HIRAGANA LETTER SMALL WA
1737 case 0x3095: // HIRAGANA LETTER SMALL KA
1738 case 0x3096: // HIRAGANA LETTER SMALL KE
1739 case 0x30A1: // KATAKANA LETTER SMALL A
1740 case 0x30A3: // KATAKANA LETTER SMALL I
1741 case 0x30A5: // KATAKANA LETTER SMALL U
1742 case 0x30A7: // KATAKANA LETTER SMALL E
1743 case 0x30A9: // KATAKANA LETTER SMALL O
1744 case 0x30C3: // KATAKANA LETTER SMALL TU
1745 case 0x30E3: // KATAKANA LETTER SMALL YA
1746 case 0x30E5: // KATAKANA LETTER SMALL YU
1747 case 0x30E7: // KATAKANA LETTER SMALL YO
1748 case 0x30EE: // KATAKANA LETTER SMALL WA
1749 case 0x30F5: // KATAKANA LETTER SMALL KA
1750 case 0x30F6: // KATAKANA LETTER SMALL KE
1751 case 0x31F0: // KATAKANA LETTER SMALL KU
1752 case 0x31F1: // KATAKANA LETTER SMALL SI
1753 case 0x31F2: // KATAKANA LETTER SMALL SU
1754 case 0x31F3: // KATAKANA LETTER SMALL TO
1755 case 0x31F4: // KATAKANA LETTER SMALL NU
1756 case 0x31F5: // KATAKANA LETTER SMALL HA
1757 case 0x31F6: // KATAKANA LETTER SMALL HI
1758 case 0x31F7: // KATAKANA LETTER SMALL HU
1759 case 0x31F8: // KATAKANA LETTER SMALL HE
1760 case 0x31F9: // KATAKANA LETTER SMALL HO
1761 case 0x31FA: // KATAKANA LETTER SMALL MU
1762 case 0x31FB: // KATAKANA LETTER SMALL RA
1763 case 0x31FC: // KATAKANA LETTER SMALL RI
1764 case 0x31FD: // KATAKANA LETTER SMALL RU
1765 case 0x31FE: // KATAKANA LETTER SMALL RE
1766 case 0x31FF: // KATAKANA LETTER SMALL RO
1767 case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1768 case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1769 case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1770 case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1771 case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1772 case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1773 case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1774 case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1775 case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1776 return true;
1777 }
1778 return false;
1779 }
1780
1781 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1782
composedVoicedSoundMark(UChar character)1783 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1784 {
1785 ASSERT(isKanaLetter(character));
1786
1787 switch (character) {
1788 case 0x304C: // HIRAGANA LETTER GA
1789 case 0x304E: // HIRAGANA LETTER GI
1790 case 0x3050: // HIRAGANA LETTER GU
1791 case 0x3052: // HIRAGANA LETTER GE
1792 case 0x3054: // HIRAGANA LETTER GO
1793 case 0x3056: // HIRAGANA LETTER ZA
1794 case 0x3058: // HIRAGANA LETTER ZI
1795 case 0x305A: // HIRAGANA LETTER ZU
1796 case 0x305C: // HIRAGANA LETTER ZE
1797 case 0x305E: // HIRAGANA LETTER ZO
1798 case 0x3060: // HIRAGANA LETTER DA
1799 case 0x3062: // HIRAGANA LETTER DI
1800 case 0x3065: // HIRAGANA LETTER DU
1801 case 0x3067: // HIRAGANA LETTER DE
1802 case 0x3069: // HIRAGANA LETTER DO
1803 case 0x3070: // HIRAGANA LETTER BA
1804 case 0x3073: // HIRAGANA LETTER BI
1805 case 0x3076: // HIRAGANA LETTER BU
1806 case 0x3079: // HIRAGANA LETTER BE
1807 case 0x307C: // HIRAGANA LETTER BO
1808 case 0x3094: // HIRAGANA LETTER VU
1809 case 0x30AC: // KATAKANA LETTER GA
1810 case 0x30AE: // KATAKANA LETTER GI
1811 case 0x30B0: // KATAKANA LETTER GU
1812 case 0x30B2: // KATAKANA LETTER GE
1813 case 0x30B4: // KATAKANA LETTER GO
1814 case 0x30B6: // KATAKANA LETTER ZA
1815 case 0x30B8: // KATAKANA LETTER ZI
1816 case 0x30BA: // KATAKANA LETTER ZU
1817 case 0x30BC: // KATAKANA LETTER ZE
1818 case 0x30BE: // KATAKANA LETTER ZO
1819 case 0x30C0: // KATAKANA LETTER DA
1820 case 0x30C2: // KATAKANA LETTER DI
1821 case 0x30C5: // KATAKANA LETTER DU
1822 case 0x30C7: // KATAKANA LETTER DE
1823 case 0x30C9: // KATAKANA LETTER DO
1824 case 0x30D0: // KATAKANA LETTER BA
1825 case 0x30D3: // KATAKANA LETTER BI
1826 case 0x30D6: // KATAKANA LETTER BU
1827 case 0x30D9: // KATAKANA LETTER BE
1828 case 0x30DC: // KATAKANA LETTER BO
1829 case 0x30F4: // KATAKANA LETTER VU
1830 case 0x30F7: // KATAKANA LETTER VA
1831 case 0x30F8: // KATAKANA LETTER VI
1832 case 0x30F9: // KATAKANA LETTER VE
1833 case 0x30FA: // KATAKANA LETTER VO
1834 return VoicedSoundMark;
1835 case 0x3071: // HIRAGANA LETTER PA
1836 case 0x3074: // HIRAGANA LETTER PI
1837 case 0x3077: // HIRAGANA LETTER PU
1838 case 0x307A: // HIRAGANA LETTER PE
1839 case 0x307D: // HIRAGANA LETTER PO
1840 case 0x30D1: // KATAKANA LETTER PA
1841 case 0x30D4: // KATAKANA LETTER PI
1842 case 0x30D7: // KATAKANA LETTER PU
1843 case 0x30DA: // KATAKANA LETTER PE
1844 case 0x30DD: // KATAKANA LETTER PO
1845 return SemiVoicedSoundMark;
1846 }
1847 return NoVoicedSoundMark;
1848 }
1849
isCombiningVoicedSoundMark(UChar character)1850 static inline bool isCombiningVoicedSoundMark(UChar character)
1851 {
1852 switch (character) {
1853 case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1854 case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1855 return true;
1856 }
1857 return false;
1858 }
1859
containsKanaLetters(const String & pattern)1860 static inline bool containsKanaLetters(const String& pattern)
1861 {
1862 const UChar* characters = pattern.characters();
1863 unsigned length = pattern.length();
1864 for (unsigned i = 0; i < length; ++i) {
1865 if (isKanaLetter(characters[i]))
1866 return true;
1867 }
1868 return false;
1869 }
1870
normalizeCharacters(const UChar * characters,unsigned length,Vector<UChar> & buffer)1871 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1872 {
1873 ASSERT(length);
1874
1875 buffer.resize(length);
1876
1877 UErrorCode status = U_ZERO_ERROR;
1878 size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1879 ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1880 ASSERT(bufferSize);
1881
1882 buffer.resize(bufferSize);
1883
1884 if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1885 return;
1886
1887 status = U_ZERO_ERROR;
1888 unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1889 ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1890 }
1891
isNonLatin1Separator(UChar32 character)1892 static bool isNonLatin1Separator(UChar32 character)
1893 {
1894 ASSERT_ARG(character, character >= 256);
1895
1896 return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1897 }
1898
isSeparator(UChar32 character)1899 static inline bool isSeparator(UChar32 character)
1900 {
1901 static const bool latin1SeparatorTable[256] = {
1902 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1903 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1904 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1905 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ?
1906 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @
1907 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _
1908 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // `
1909 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~
1910 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1911 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1912 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1913 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1914 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1915 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1916 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1917 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1918 };
1919
1920 if (character < 256)
1921 return latin1SeparatorTable[character];
1922
1923 return isNonLatin1Separator(character);
1924 }
1925
SearchBuffer(const String & target,FindOptions options)1926 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1927 : m_target(target)
1928 , m_options(options)
1929 , m_prefixLength(0)
1930 , m_atBreak(true)
1931 , m_needsMoreContext(options & AtWordStarts)
1932 , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1933 {
1934 ASSERT(!m_target.isEmpty());
1935
1936 // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1937 // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1938 // to add tailoring on top of the locale-specific tailoring as of this writing.
1939 foldQuoteMarksAndSoftHyphens(m_target);
1940
1941 size_t targetLength = m_target.length();
1942 m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
1943 m_overlap = m_buffer.capacity() / 4;
1944
1945 if ((m_options & AtWordStarts) && targetLength) {
1946 UChar32 targetFirstCharacter;
1947 U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
1948 // Characters in the separator category never really occur at the beginning of a word,
1949 // so if the target begins with such a character, we just ignore the AtWordStart option.
1950 if (isSeparator(targetFirstCharacter)) {
1951 m_options &= ~AtWordStarts;
1952 m_needsMoreContext = false;
1953 }
1954 }
1955
1956 // Grab the single global searcher.
1957 // If we ever have a reason to do more than once search buffer at once, we'll have
1958 // to move to multiple searchers.
1959 lockSearcher();
1960
1961 UStringSearch* searcher = WebCore::searcher();
1962 UCollator* collator = usearch_getCollator(searcher);
1963
1964 UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
1965 if (ucol_getStrength(collator) != strength) {
1966 ucol_setStrength(collator, strength);
1967 usearch_reset(searcher);
1968 }
1969
1970 UErrorCode status = U_ZERO_ERROR;
1971 usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1972 ASSERT(status == U_ZERO_ERROR);
1973
1974 // The kana workaround requires a normalized copy of the target string.
1975 if (m_targetRequiresKanaWorkaround)
1976 normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
1977 }
1978
~SearchBuffer()1979 inline SearchBuffer::~SearchBuffer()
1980 {
1981 // Leave the static object pointing to a valid string.
1982 UErrorCode status = U_ZERO_ERROR;
1983 usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
1984 ASSERT(status == U_ZERO_ERROR);
1985
1986 unlockSearcher();
1987 }
1988
append(const UChar * characters,size_t length)1989 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
1990 {
1991 ASSERT(length);
1992
1993 if (m_atBreak) {
1994 m_buffer.shrink(0);
1995 m_prefixLength = 0;
1996 m_atBreak = false;
1997 } else if (m_buffer.size() == m_buffer.capacity()) {
1998 memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
1999 m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
2000 m_buffer.shrink(m_overlap);
2001 }
2002
2003 size_t oldLength = m_buffer.size();
2004 size_t usableLength = min(m_buffer.capacity() - oldLength, length);
2005 ASSERT(usableLength);
2006 m_buffer.append(characters, usableLength);
2007 foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
2008 return usableLength;
2009 }
2010
needsMoreContext() const2011 inline bool SearchBuffer::needsMoreContext() const
2012 {
2013 return m_needsMoreContext;
2014 }
2015
prependContext(const UChar * characters,size_t length)2016 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
2017 {
2018 ASSERT(m_needsMoreContext);
2019 ASSERT(m_prefixLength == m_buffer.size());
2020
2021 if (!length)
2022 return;
2023
2024 m_atBreak = false;
2025
2026 size_t wordBoundaryContextStart = length;
2027 if (wordBoundaryContextStart) {
2028 U16_BACK_1(characters, 0, wordBoundaryContextStart);
2029 wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
2030 }
2031
2032 size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
2033 m_buffer.prepend(characters + length - usableLength, usableLength);
2034 m_prefixLength += usableLength;
2035
2036 if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
2037 m_needsMoreContext = false;
2038 }
2039
atBreak() const2040 inline bool SearchBuffer::atBreak() const
2041 {
2042 return m_atBreak;
2043 }
2044
reachedBreak()2045 inline void SearchBuffer::reachedBreak()
2046 {
2047 m_atBreak = true;
2048 }
2049
isBadMatch(const UChar * match,size_t matchLength) const2050 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
2051 {
2052 // This function implements the kana workaround. If usearch treats
2053 // it as a match, but we do not want to, then it's a "bad match".
2054 if (!m_targetRequiresKanaWorkaround)
2055 return false;
2056
2057 // Normalize into a match buffer. We reuse a single buffer rather than
2058 // creating a new one each time.
2059 normalizeCharacters(match, matchLength, m_normalizedMatch);
2060
2061 const UChar* a = m_normalizedTarget.begin();
2062 const UChar* aEnd = m_normalizedTarget.end();
2063
2064 const UChar* b = m_normalizedMatch.begin();
2065 const UChar* bEnd = m_normalizedMatch.end();
2066
2067 while (true) {
2068 // Skip runs of non-kana-letter characters. This is necessary so we can
2069 // correctly handle strings where the target and match have different-length
2070 // runs of characters that match, while still double checking the correctness
2071 // of matches of kana letters with other kana letters.
2072 while (a != aEnd && !isKanaLetter(*a))
2073 ++a;
2074 while (b != bEnd && !isKanaLetter(*b))
2075 ++b;
2076
2077 // If we reached the end of either the target or the match, we should have
2078 // reached the end of both; both should have the same number of kana letters.
2079 if (a == aEnd || b == bEnd) {
2080 ASSERT(a == aEnd);
2081 ASSERT(b == bEnd);
2082 return false;
2083 }
2084
2085 // Check for differences in the kana letter character itself.
2086 if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2087 return true;
2088 if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2089 return true;
2090 ++a;
2091 ++b;
2092
2093 // Check for differences in combining voiced sound marks found after the letter.
2094 while (1) {
2095 if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2096 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2097 return true;
2098 break;
2099 }
2100 if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2101 return true;
2102 if (*a != *b)
2103 return true;
2104 ++a;
2105 ++b;
2106 }
2107 }
2108 }
2109
isWordStartMatch(size_t start,size_t length) const2110 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2111 {
2112 ASSERT(m_options & AtWordStarts);
2113
2114 if (!start)
2115 return true;
2116
2117 if (m_options & TreatMedialCapitalAsWordStart) {
2118 int size = m_buffer.size();
2119 int offset = start;
2120 UChar32 firstCharacter;
2121 U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2122 UChar32 previousCharacter;
2123 U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2124
2125 if (isSeparator(firstCharacter)) {
2126 // The start of a separator run is a word start (".org" in "webkit.org").
2127 if (!isSeparator(previousCharacter))
2128 return true;
2129 } else if (isASCIIUpper(firstCharacter)) {
2130 // The start of an uppercase run is a word start ("Kit" in "WebKit").
2131 if (!isASCIIUpper(previousCharacter))
2132 return true;
2133 // The last character of an uppercase run followed by a non-separator, non-digit
2134 // is a word start ("Request" in "XMLHTTPRequest").
2135 offset = start;
2136 U16_FWD_1(m_buffer.data(), offset, size);
2137 UChar32 nextCharacter = 0;
2138 if (offset < size)
2139 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2140 if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2141 return true;
2142 } else if (isASCIIDigit(firstCharacter)) {
2143 // The start of a digit run is a word start ("2" in "WebKit2").
2144 if (!isASCIIDigit(previousCharacter))
2145 return true;
2146 } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2147 // The start of a non-separator, non-uppercase, non-digit run is a word start,
2148 // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2149 return true;
2150 }
2151 }
2152
2153 size_t wordBreakSearchStart = start + length;
2154 while (wordBreakSearchStart > start)
2155 wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
2156 return wordBreakSearchStart == start;
2157 }
2158
search(size_t & start)2159 inline size_t SearchBuffer::search(size_t& start)
2160 {
2161 size_t size = m_buffer.size();
2162 if (m_atBreak) {
2163 if (!size)
2164 return 0;
2165 } else {
2166 if (size != m_buffer.capacity())
2167 return 0;
2168 }
2169
2170 UStringSearch* searcher = WebCore::searcher();
2171
2172 UErrorCode status = U_ZERO_ERROR;
2173 usearch_setText(searcher, m_buffer.data(), size, &status);
2174 ASSERT(status == U_ZERO_ERROR);
2175
2176 usearch_setOffset(searcher, m_prefixLength, &status);
2177 ASSERT(status == U_ZERO_ERROR);
2178
2179 int matchStart = usearch_next(searcher, &status);
2180 ASSERT(status == U_ZERO_ERROR);
2181
2182 nextMatch:
2183 if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2184 ASSERT(matchStart == USEARCH_DONE);
2185 return 0;
2186 }
2187
2188 // Matches that start in the overlap area are only tentative.
2189 // The same match may appear later, matching more characters,
2190 // possibly including a combining character that's not yet in the buffer.
2191 if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2192 size_t overlap = m_overlap;
2193 if (m_options & AtWordStarts) {
2194 // Ensure that there is sufficient context before matchStart the next time around for
2195 // determining if it is at a word boundary.
2196 int wordBoundaryContextStart = matchStart;
2197 U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2198 wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
2199 overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
2200 }
2201 memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2202 m_prefixLength -= min(m_prefixLength, size - overlap);
2203 m_buffer.shrink(overlap);
2204 return 0;
2205 }
2206
2207 size_t matchedLength = usearch_getMatchedLength(searcher);
2208 ASSERT(matchStart + matchedLength <= size);
2209
2210 // If this match is "bad", move on to the next match.
2211 if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
2212 matchStart = usearch_next(searcher, &status);
2213 ASSERT(status == U_ZERO_ERROR);
2214 goto nextMatch;
2215 }
2216
2217 size_t newSize = size - (matchStart + 1);
2218 memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2219 m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
2220 m_buffer.shrink(newSize);
2221
2222 start = size - matchStart;
2223 return matchedLength;
2224 }
2225
2226 #else // !ICU_UNICODE
2227
SearchBuffer(const String & target,FindOptions options)2228 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2229 : m_target(options & CaseInsensitive ? target.foldCase() : target)
2230 , m_options(options)
2231 , m_buffer(m_target.length())
2232 , m_isCharacterStartBuffer(m_target.length())
2233 , m_isBufferFull(false)
2234 , m_cursor(0)
2235 {
2236 ASSERT(!m_target.isEmpty());
2237 m_target.replace(noBreakSpace, ' ');
2238 foldQuoteMarksAndSoftHyphens(m_target);
2239 }
2240
~SearchBuffer()2241 inline SearchBuffer::~SearchBuffer()
2242 {
2243 }
2244
reachedBreak()2245 inline void SearchBuffer::reachedBreak()
2246 {
2247 m_cursor = 0;
2248 m_isBufferFull = false;
2249 }
2250
atBreak() const2251 inline bool SearchBuffer::atBreak() const
2252 {
2253 return !m_cursor && !m_isBufferFull;
2254 }
2255
append(UChar c,bool isStart)2256 inline void SearchBuffer::append(UChar c, bool isStart)
2257 {
2258 m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
2259 m_isCharacterStartBuffer[m_cursor] = isStart;
2260 if (++m_cursor == m_target.length()) {
2261 m_cursor = 0;
2262 m_isBufferFull = true;
2263 }
2264 }
2265
append(const UChar * characters,size_t length)2266 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2267 {
2268 ASSERT(length);
2269 if (!(m_options & CaseInsensitive)) {
2270 append(characters[0], true);
2271 return 1;
2272 }
2273 const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2274 UChar foldedCharacters[maxFoldedCharacters];
2275 bool error;
2276 int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
2277 ASSERT(!error);
2278 ASSERT(numFoldedCharacters);
2279 ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2280 if (!error && numFoldedCharacters) {
2281 numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
2282 append(foldedCharacters[0], true);
2283 for (int i = 1; i < numFoldedCharacters; ++i)
2284 append(foldedCharacters[i], false);
2285 }
2286 return 1;
2287 }
2288
needsMoreContext() const2289 inline bool SearchBuffer::needsMoreContext() const
2290 {
2291 return false;
2292 }
2293
prependContext(const UChar *,size_t)2294 void SearchBuffer::prependContext(const UChar*, size_t)
2295 {
2296 ASSERT_NOT_REACHED();
2297 }
2298
search(size_t & start)2299 inline size_t SearchBuffer::search(size_t& start)
2300 {
2301 if (!m_isBufferFull)
2302 return 0;
2303 if (!m_isCharacterStartBuffer[m_cursor])
2304 return 0;
2305
2306 size_t tailSpace = m_target.length() - m_cursor;
2307 if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2308 return 0;
2309 if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2310 return 0;
2311
2312 start = length();
2313
2314 // Now that we've found a match once, we don't want to find it again, because those
2315 // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2316 // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2317 // we want to get rid of that in the future we could track this with a separate boolean
2318 // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2319 m_isCharacterStartBuffer[m_cursor] = false;
2320
2321 return start;
2322 }
2323
2324 // Returns the number of characters that were appended to the buffer (what we are searching in).
2325 // That's not necessarily the same length as the passed-in target string, because case folding
2326 // can make two strings match even though they're not the same length.
length() const2327 size_t SearchBuffer::length() const
2328 {
2329 size_t bufferSize = m_target.length();
2330 size_t length = 0;
2331 for (size_t i = 0; i < bufferSize; ++i)
2332 length += m_isCharacterStartBuffer[i];
2333 return length;
2334 }
2335
2336 #endif // !ICU_UNICODE
2337
2338 // --------
2339
rangeLength(const Range * r,bool forSelectionPreservation)2340 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
2341 {
2342 int length = 0;
2343 for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2344 length += it.length();
2345
2346 return length;
2347 }
2348
subrange(Range * entireRange,int characterOffset,int characterCount)2349 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2350 {
2351 CharacterIterator entireRangeIterator(entireRange);
2352 return characterSubrange(entireRangeIterator, characterOffset, characterCount);
2353 }
2354
rangeFromLocationAndLength(Element * scope,int rangeLocation,int rangeLength,bool forSelectionPreservation)2355 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2356 {
2357 RefPtr<Range> resultRange = scope->document()->createRange();
2358
2359 int docTextPosition = 0;
2360 int rangeEnd = rangeLocation + rangeLength;
2361 bool startRangeFound = false;
2362
2363 RefPtr<Range> textRunRange;
2364
2365 TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2366
2367 // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2368 if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
2369 textRunRange = it.range();
2370
2371 ExceptionCode ec = 0;
2372 resultRange->setStart(textRunRange->startContainer(), 0, ec);
2373 ASSERT(!ec);
2374 resultRange->setEnd(textRunRange->startContainer(), 0, ec);
2375 ASSERT(!ec);
2376
2377 return resultRange.release();
2378 }
2379
2380 for (; !it.atEnd(); it.advance()) {
2381 int len = it.length();
2382 textRunRange = it.range();
2383
2384 bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
2385 bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
2386
2387 // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
2388 // in those cases that textRunRange is used.
2389 if (foundEnd) {
2390 // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2391 // position for emitted '\n's.
2392 if (len == 1 && it.characters()[0] == '\n') {
2393 scope->document()->updateLayoutIgnorePendingStylesheets();
2394 it.advance();
2395 if (!it.atEnd()) {
2396 RefPtr<Range> range = it.range();
2397 ExceptionCode ec = 0;
2398 textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
2399 ASSERT(!ec);
2400 } else {
2401 Position runStart = textRunRange->startPosition();
2402 Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2403 if (runEnd.isNotNull()) {
2404 ExceptionCode ec = 0;
2405 textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
2406 ASSERT(!ec);
2407 }
2408 }
2409 }
2410 }
2411
2412 if (foundStart) {
2413 startRangeFound = true;
2414 int exception = 0;
2415 if (textRunRange->startContainer()->isTextNode()) {
2416 int offset = rangeLocation - docTextPosition;
2417 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2418 } else {
2419 if (rangeLocation == docTextPosition)
2420 resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2421 else
2422 resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2423 }
2424 }
2425
2426 if (foundEnd) {
2427 int exception = 0;
2428 if (textRunRange->startContainer()->isTextNode()) {
2429 int offset = rangeEnd - docTextPosition;
2430 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2431 } else {
2432 if (rangeEnd == docTextPosition)
2433 resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2434 else
2435 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2436 }
2437 docTextPosition += len;
2438 break;
2439 }
2440 docTextPosition += len;
2441 }
2442
2443 if (!startRangeFound)
2444 return 0;
2445
2446 if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
2447 int exception = 0;
2448 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2449 }
2450
2451 return resultRange.release();
2452 }
2453
locationAndLengthFromRange(const Range * range,size_t & location,size_t & length)2454 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
2455 {
2456 location = notFound;
2457 length = 0;
2458
2459 if (!range->startContainer())
2460 return false;
2461
2462 Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
2463 Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
2464
2465 // The critical assumption is that this only gets called with ranges that
2466 // concentrate on a given area containing the selection root. This is done
2467 // because of text fields and textareas. The DOM for those is not
2468 // directly in the document DOM, so ensure that the range does not cross a
2469 // boundary of one of those.
2470 if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
2471 return false;
2472 if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
2473 return false;
2474
2475 RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
2476 ASSERT(testRange->startContainer() == scope);
2477 location = TextIterator::rangeLength(testRange.get());
2478
2479 ExceptionCode ec;
2480 testRange->setEnd(range->endContainer(), range->endOffset(), ec);
2481 ASSERT(testRange->startContainer() == scope);
2482 length = TextIterator::rangeLength(testRange.get()) - location;
2483 return true;
2484 }
2485
2486 // --------
2487
plainTextToMallocAllocatedBuffer(const Range * r,unsigned & bufferLength,bool isDisplayString,TextIteratorBehavior defaultBehavior)2488 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
2489 {
2490 UChar* result = 0;
2491
2492 // Do this in pieces to avoid massive reallocations if there is a large amount of text.
2493 // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
2494 static const unsigned cMaxSegmentSize = 1 << 16;
2495 bufferLength = 0;
2496 typedef pair<UChar*, unsigned> TextSegment;
2497 OwnPtr<Vector<TextSegment> > textSegments;
2498 Vector<UChar> textBuffer;
2499 textBuffer.reserveInitialCapacity(cMaxSegmentSize);
2500 TextIteratorBehavior behavior = defaultBehavior;
2501 if (!isDisplayString)
2502 behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2503
2504 for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2505 if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
2506 UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
2507 if (!newSegmentBuffer)
2508 goto exit;
2509 memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2510 if (!textSegments)
2511 textSegments = adoptPtr(new Vector<TextSegment>);
2512 textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
2513 textBuffer.clear();
2514 }
2515 textBuffer.append(it.characters(), it.length());
2516 bufferLength += it.length();
2517 }
2518
2519 if (!bufferLength)
2520 return 0;
2521
2522 // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
2523 result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
2524 if (!result)
2525 goto exit;
2526
2527 {
2528 UChar* resultPos = result;
2529 if (textSegments) {
2530 unsigned size = textSegments->size();
2531 for (unsigned i = 0; i < size; ++i) {
2532 const TextSegment& segment = textSegments->at(i);
2533 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
2534 resultPos += segment.second;
2535 }
2536 }
2537 memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2538 }
2539
2540 exit:
2541 if (textSegments) {
2542 unsigned size = textSegments->size();
2543 for (unsigned i = 0; i < size; ++i)
2544 free(textSegments->at(i).first);
2545 }
2546
2547 if (isDisplayString && r->ownerDocument())
2548 r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
2549
2550 return result;
2551 }
2552
plainText(const Range * r,TextIteratorBehavior defaultBehavior)2553 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
2554 {
2555 unsigned length;
2556 UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
2557 if (!buf)
2558 return "";
2559 String result(buf, length);
2560 free(buf);
2561 return result;
2562 }
2563
isAllCollapsibleWhitespace(const String & string)2564 static inline bool isAllCollapsibleWhitespace(const String& string)
2565 {
2566 const UChar* characters = string.characters();
2567 unsigned length = string.length();
2568 for (unsigned i = 0; i < length; ++i) {
2569 if (!isCollapsibleWhitespace(characters[i]))
2570 return false;
2571 }
2572 return true;
2573 }
2574
collapsedToBoundary(const Range * range,bool forward)2575 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
2576 {
2577 ExceptionCode ec = 0;
2578 RefPtr<Range> result = range->cloneRange(ec);
2579 ASSERT(!ec);
2580 result->collapse(!forward, ec);
2581 ASSERT(!ec);
2582 return result.release();
2583 }
2584
findPlainText(CharacterIterator & it,const String & target,FindOptions options,size_t & matchStart)2585 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
2586 {
2587 matchStart = 0;
2588 size_t matchLength = 0;
2589
2590 SearchBuffer buffer(target, options);
2591
2592 if (buffer.needsMoreContext()) {
2593 RefPtr<Range> startRange = it.range();
2594 RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
2595 ExceptionCode ec = 0;
2596 beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
2597 for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2598 buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
2599 if (!buffer.needsMoreContext())
2600 break;
2601 }
2602 }
2603
2604 while (!it.atEnd()) {
2605 it.advance(buffer.append(it.characters(), it.length()));
2606 tryAgain:
2607 size_t matchStartOffset;
2608 if (size_t newMatchLength = buffer.search(matchStartOffset)) {
2609 // Note that we found a match, and where we found it.
2610 size_t lastCharacterInBufferOffset = it.characterOffset();
2611 ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2612 matchStart = lastCharacterInBufferOffset - matchStartOffset;
2613 matchLength = newMatchLength;
2614 // If searching forward, stop on the first match.
2615 // If searching backward, don't stop, so we end up with the last match.
2616 if (!(options & Backwards))
2617 break;
2618 goto tryAgain;
2619 }
2620 if (it.atBreak() && !buffer.atBreak()) {
2621 buffer.reachedBreak();
2622 goto tryAgain;
2623 }
2624 }
2625
2626 return matchLength;
2627 }
2628
findPlainText(const Range * range,const String & target,FindOptions options)2629 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
2630 {
2631 // First, find the text.
2632 size_t matchStart;
2633 size_t matchLength;
2634 {
2635 CharacterIterator findIterator(range, TextIteratorEntersTextControls);
2636 matchLength = findPlainText(findIterator, target, options, matchStart);
2637 if (!matchLength)
2638 return collapsedToBoundary(range, !(options & Backwards));
2639 }
2640
2641 // Then, find the document position of the start and the end of the text.
2642 CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
2643 return characterSubrange(computeRangeIterator, matchStart, matchLength);
2644 }
2645
2646 }
2647