1 /*
2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
3 * Copyright (C) 2005 Alexey Proskuryakov.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "TextIterator.h"
29
30 #include "Document.h"
31 #include "Frame.h"
32 #include "HTMLElement.h"
33 #include "HTMLNames.h"
34 #include "htmlediting.h"
35 #include "InlineTextBox.h"
36 #include "Range.h"
37 #include "RenderTableCell.h"
38 #include "RenderTableRow.h"
39 #include "RenderTextControl.h"
40 #include "RenderTextFragment.h"
41 #include "TextBoundaries.h"
42 #include "TextBreakIterator.h"
43 #include "VisiblePosition.h"
44 #include "visible_units.h"
45 #include <wtf/unicode/CharacterNames.h>
46
47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
48 #include "TextBreakIteratorInternalICU.h"
49 #include <unicode/usearch.h>
50 #endif
51
52 using namespace WTF::Unicode;
53 using namespace std;
54
55 namespace WebCore {
56
57 using namespace HTMLNames;
58
59 // Buffer that knows how to compare with a search target.
60 // Keeps enough of the previous text to be able to search in the future, but no more.
61 // Non-breaking spaces are always equal to normal spaces.
62 // Case folding is also done if the CaseInsensitive option is specified.
63 // Matches are further filtered if the AtWordStarts option is specified, although some
64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
65 class SearchBuffer {
66 WTF_MAKE_NONCOPYABLE(SearchBuffer);
67 public:
68 SearchBuffer(const String& target, FindOptions);
69 ~SearchBuffer();
70
71 // Returns number of characters appended; guaranteed to be in the range [1, length].
72 size_t append(const UChar*, size_t length);
73 bool needsMoreContext() const;
74 void prependContext(const UChar*, size_t length);
75 void reachedBreak();
76
77 // Result is the size in characters of what was found.
78 // And <startOffset> is the number of characters back to the start of what was found.
79 size_t search(size_t& startOffset);
80 bool atBreak() const;
81
82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
83
84 private:
85 bool isBadMatch(const UChar*, size_t length) const;
86 bool isWordStartMatch(size_t start, size_t length) const;
87
88 String m_target;
89 FindOptions m_options;
90
91 Vector<UChar> m_buffer;
92 size_t m_overlap;
93 size_t m_prefixLength;
94 bool m_atBreak;
95 bool m_needsMoreContext;
96
97 bool m_targetRequiresKanaWorkaround;
98 Vector<UChar> m_normalizedTarget;
99 mutable Vector<UChar> m_normalizedMatch;
100
101 #else
102
103 private:
104 void append(UChar, bool isCharacterStart);
105 size_t length() const;
106
107 String m_target;
108 FindOptions m_options;
109
110 Vector<UChar> m_buffer;
111 Vector<bool> m_isCharacterStartBuffer;
112 bool m_isBufferFull;
113 size_t m_cursor;
114
115 #endif
116 };
117
118 // --------
119
120 static const unsigned bitsInWord = sizeof(unsigned) * 8;
121 static const unsigned bitInWordMask = bitsInWord - 1;
122
BitStack()123 BitStack::BitStack()
124 : m_size(0)
125 {
126 }
127
~BitStack()128 BitStack::~BitStack()
129 {
130 }
131
push(bool bit)132 void BitStack::push(bool bit)
133 {
134 unsigned index = m_size / bitsInWord;
135 unsigned shift = m_size & bitInWordMask;
136 if (!shift && index == m_words.size()) {
137 m_words.grow(index + 1);
138 m_words[index] = 0;
139 }
140 unsigned& word = m_words[index];
141 unsigned mask = 1U << shift;
142 if (bit)
143 word |= mask;
144 else
145 word &= ~mask;
146 ++m_size;
147 }
148
pop()149 void BitStack::pop()
150 {
151 if (m_size)
152 --m_size;
153 }
154
top() const155 bool BitStack::top() const
156 {
157 if (!m_size)
158 return false;
159 unsigned shift = (m_size - 1) & bitInWordMask;
160 return m_words.last() & (1U << shift);
161 }
162
size() const163 unsigned BitStack::size() const
164 {
165 return m_size;
166 }
167
168 // --------
169
170 #if !ASSERT_DISABLED
171
depthCrossingShadowBoundaries(Node * node)172 static unsigned depthCrossingShadowBoundaries(Node* node)
173 {
174 unsigned depth = 0;
175 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
176 ++depth;
177 return depth;
178 }
179
180 #endif
181
182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
nextInPreOrderCrossingShadowBoundaries(Node * rangeEndContainer,int rangeEndOffset)183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
184 {
185 if (!rangeEndContainer)
186 return 0;
187 if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
188 if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
189 return next;
190 }
191 for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
192 if (Node* next = node->nextSibling())
193 return next;
194 }
195 return 0;
196 }
197
198 // --------
199
fullyClipsContents(Node * node)200 static inline bool fullyClipsContents(Node* node)
201 {
202 RenderObject* renderer = node->renderer();
203 if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
204 return false;
205 return toRenderBox(renderer)->size().isEmpty();
206 }
207
ignoresContainerClip(Node * node)208 static inline bool ignoresContainerClip(Node* node)
209 {
210 RenderObject* renderer = node->renderer();
211 if (!renderer || renderer->isText())
212 return false;
213 EPosition position = renderer->style()->position();
214 return position == AbsolutePosition || position == FixedPosition;
215 }
216
pushFullyClippedState(BitStack & stack,Node * node)217 static void pushFullyClippedState(BitStack& stack, Node* node)
218 {
219 ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
220
221 // Push true if this node full clips its contents, or if a parent already has fully
222 // clipped and this is not a node that ignores its container's clip.
223 stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
224 }
225
setUpFullyClippedStack(BitStack & stack,Node * node)226 static void setUpFullyClippedStack(BitStack& stack, Node* node)
227 {
228 // Put the nodes in a vector so we can iterate in reverse order.
229 Vector<Node*, 100> ancestry;
230 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
231 ancestry.append(parent);
232
233 // Call pushFullyClippedState on each node starting with the earliest ancestor.
234 size_t size = ancestry.size();
235 for (size_t i = 0; i < size; ++i)
236 pushFullyClippedState(stack, ancestry[size - i - 1]);
237 pushFullyClippedState(stack, node);
238
239 ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
240 }
241
242 #if OS(ANDROID)
checkFormControlElement(Node * startNode)243 static bool checkFormControlElement(Node* startNode)
244 {
245 Node* node = startNode;
246 while (node) {
247 if (node->isElementNode() && static_cast<Element*>(node)->isFormControlElement())
248 return true;
249 node = node->parentNode();
250 }
251 return false;
252 }
253 #endif
254
255
256 // --------
257
TextIterator()258 TextIterator::TextIterator()
259 : m_startContainer(0)
260 , m_startOffset(0)
261 , m_endContainer(0)
262 , m_endOffset(0)
263 , m_positionNode(0)
264 , m_textCharacters(0)
265 , m_textLength(0)
266 , m_remainingTextBox(0)
267 , m_firstLetterText(0)
268 , m_lastCharacter(0)
269 , m_emitsCharactersBetweenAllVisiblePositions(false)
270 , m_entersTextControls(false)
271 , m_emitsTextWithoutTranscoding(false)
272 , m_handledFirstLetter(false)
273 , m_ignoresStyleVisibility(false)
274 , m_emitsObjectReplacementCharacters(false)
275 #if OS(ANDROID)
276 , m_stopsOnFormControls(false)
277 , m_shouldStop(false)
278 #endif
279 {
280 }
281
TextIterator(const Range * r,TextIteratorBehavior behavior)282 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
283 : m_startContainer(0)
284 , m_startOffset(0)
285 , m_endContainer(0)
286 , m_endOffset(0)
287 , m_positionNode(0)
288 , m_textCharacters(0)
289 , m_textLength(0)
290 , m_remainingTextBox(0)
291 , m_firstLetterText(0)
292 , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
293 , m_entersTextControls(behavior & TextIteratorEntersTextControls)
294 , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
295 , m_handledFirstLetter(false)
296 , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
297 , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
298 #if OS(ANDROID)
299 , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
300 , m_shouldStop(false)
301 #endif
302 {
303 if (!r)
304 return;
305
306 // get and validate the range endpoints
307 Node* startContainer = r->startContainer();
308 if (!startContainer)
309 return;
310 int startOffset = r->startOffset();
311 Node* endContainer = r->endContainer();
312 int endOffset = r->endOffset();
313
314 // Callers should be handing us well-formed ranges. If we discover that this isn't
315 // the case, we could consider changing this assertion to an early return.
316 ASSERT(r->boundaryPointsValid());
317
318 // remember range - this does not change
319 m_startContainer = startContainer;
320 m_startOffset = startOffset;
321 m_endContainer = endContainer;
322 m_endOffset = endOffset;
323
324 // set up the current node for processing
325 m_node = r->firstNode();
326 if (!m_node)
327 return;
328 setUpFullyClippedStack(m_fullyClippedStack, m_node);
329 m_offset = m_node == m_startContainer ? m_startOffset : 0;
330 m_handledNode = false;
331 m_handledChildren = false;
332
333 // calculate first out of bounds node
334 m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
335
336 // initialize node processing state
337 m_needsAnotherNewline = false;
338 m_textBox = 0;
339
340 // initialize record of previous node processing
341 m_hasEmitted = false;
342 m_lastTextNode = 0;
343 m_lastTextNodeEndedWithCollapsedSpace = false;
344 m_lastCharacter = 0;
345
346 #ifndef NDEBUG
347 // need this just because of the assert in advance()
348 m_positionNode = m_node;
349 #endif
350
351 // identify the first run
352 advance();
353 }
354
~TextIterator()355 TextIterator::~TextIterator()
356 {
357 }
358
atEnd() const359 bool TextIterator::atEnd() const
360 {
361 #if OS(ANDROID)
362 return !m_positionNode || m_shouldStop;
363 #else
364 return !m_positionNode;
365 #endif
366 }
367
advance()368 void TextIterator::advance()
369 {
370 #if OS(ANDROID)
371 if (m_shouldStop)
372 return;
373 #endif
374 // reset the run information
375 m_positionNode = 0;
376 m_textLength = 0;
377
378 // handle remembered node that needed a newline after the text node's newline
379 if (m_needsAnotherNewline) {
380 // Emit the extra newline, and position it *inside* m_node, after m_node's
381 // contents, in case it's a block, in the same way that we position the first
382 // newline. The range for the emitted newline should start where the line
383 // break begins.
384 // FIXME: It would be cleaner if we emitted two newlines during the last
385 // iteration, instead of using m_needsAnotherNewline.
386 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
387 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
388 m_needsAnotherNewline = false;
389 return;
390 }
391
392 if (!m_textBox && m_remainingTextBox) {
393 m_textBox = m_remainingTextBox;
394 m_remainingTextBox = 0;
395 m_firstLetterText = 0;
396 m_offset = 0;
397 }
398 // handle remembered text box
399 if (m_textBox) {
400 handleTextBox();
401 if (m_positionNode)
402 return;
403 }
404
405 while (m_node && m_node != m_pastEndNode) {
406 #if OS(ANDROID)
407 if (!m_shouldStop && m_stopsOnFormControls && checkFormControlElement(m_node))
408 m_shouldStop = true;
409 #endif
410 // if the range ends at offset 0 of an element, represent the
411 // position, but not the content, of that element e.g. if the
412 // node is a blockflow element, emit a newline that
413 // precedes the element
414 if (m_node == m_endContainer && m_endOffset == 0) {
415 representNodeOffsetZero();
416 m_node = 0;
417 return;
418 }
419
420 RenderObject* renderer = m_node->renderer();
421 if (!renderer) {
422 m_handledNode = true;
423 m_handledChildren = true;
424 } else {
425 // handle current node according to its type
426 if (!m_handledNode) {
427 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
428 m_handledNode = handleTextNode();
429 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
430 (renderer->node() && renderer->node()->isElementNode() &&
431 static_cast<Element*>(renderer->node())->isFormControlElement())))
432 m_handledNode = handleReplacedElement();
433 else
434 m_handledNode = handleNonTextNode();
435 if (m_positionNode)
436 return;
437 }
438 }
439
440 // find a new current node to handle in depth-first manner,
441 // calling exitNode() as we come back thru a parent node
442 Node* next = m_handledChildren ? 0 : m_node->firstChild();
443 m_offset = 0;
444 if (!next) {
445 next = m_node->nextSibling();
446 if (!next) {
447 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
448 Node* parentNode = m_node->parentOrHostNode();
449 while (!next && parentNode) {
450 if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
451 return;
452 bool haveRenderer = m_node->renderer();
453 m_node = parentNode;
454 m_fullyClippedStack.pop();
455 parentNode = m_node->parentOrHostNode();
456 if (haveRenderer)
457 exitNode();
458 if (m_positionNode) {
459 m_handledNode = true;
460 m_handledChildren = true;
461 return;
462 }
463 next = m_node->nextSibling();
464 }
465 }
466 m_fullyClippedStack.pop();
467 }
468
469 // set the new current node
470 m_node = next;
471 if (m_node)
472 pushFullyClippedState(m_fullyClippedStack, m_node);
473 m_handledNode = false;
474 m_handledChildren = false;
475 m_handledFirstLetter = false;
476 m_firstLetterText = 0;
477
478 // how would this ever be?
479 if (m_positionNode)
480 return;
481 }
482 }
483
handleTextNode()484 bool TextIterator::handleTextNode()
485 {
486 if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
487 return false;
488
489 RenderText* renderer = toRenderText(m_node->renderer());
490
491 m_lastTextNode = m_node;
492 String str = renderer->text();
493
494 // handle pre-formatted text
495 if (!renderer->style()->collapseWhiteSpace()) {
496 int runStart = m_offset;
497 if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
498 emitCharacter(' ', m_node, 0, runStart, runStart);
499 return false;
500 }
501 if (!m_handledFirstLetter && renderer->isTextFragment()) {
502 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
503 if (m_firstLetterText) {
504 String firstLetter = m_firstLetterText->text();
505 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
506 m_firstLetterText = 0;
507 m_textBox = 0;
508 return false;
509 }
510 }
511 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
512 return false;
513 int strLength = str.length();
514 int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
515 int runEnd = min(strLength, end);
516
517 if (runStart >= runEnd)
518 return true;
519
520 emitText(m_node, runStart, runEnd);
521 return true;
522 }
523
524 if (!renderer->firstTextBox() && str.length() > 0) {
525 if (!m_handledFirstLetter && renderer->isTextFragment()) {
526 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
527 if (m_firstLetterText) {
528 handleTextBox();
529 return false;
530 }
531 }
532 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
533 return false;
534 m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
535 return true;
536 }
537
538 // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
539 if (renderer->containsReversedText()) {
540 m_sortedTextBoxes.clear();
541 for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
542 m_sortedTextBoxes.append(textBox);
543 }
544 std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
545 m_sortedTextBoxesPosition = 0;
546 }
547
548 m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
549 if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
550 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
551 handleTextBox();
552 return true;
553 }
554
handleTextBox()555 void TextIterator::handleTextBox()
556 {
557 RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
558 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
559 m_textBox = 0;
560 return;
561 }
562 String str = renderer->text();
563 unsigned start = m_offset;
564 unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
565 while (m_textBox) {
566 unsigned textBoxStart = m_textBox->start();
567 unsigned runStart = max(textBoxStart, start);
568
569 // Check for collapsed space at the start of this run.
570 InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
571 bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
572 || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
573 if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
574 if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
575 unsigned spaceRunStart = runStart - 1;
576 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
577 --spaceRunStart;
578 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
579 } else
580 emitCharacter(' ', m_node, 0, runStart, runStart);
581 return;
582 }
583 unsigned textBoxEnd = textBoxStart + m_textBox->len();
584 unsigned runEnd = min(textBoxEnd, end);
585
586 // Determine what the next text box will be, but don't advance yet
587 InlineTextBox* nextTextBox = 0;
588 if (renderer->containsReversedText()) {
589 if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
590 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
591 } else
592 nextTextBox = m_textBox->nextTextBox();
593
594 if (runStart < runEnd) {
595 // Handle either a single newline character (which becomes a space),
596 // or a run of characters that does not include a newline.
597 // This effectively translates newlines to spaces without copying the text.
598 if (str[runStart] == '\n') {
599 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
600 m_offset = runStart + 1;
601 } else {
602 size_t subrunEnd = str.find('\n', runStart);
603 if (subrunEnd == notFound || subrunEnd > runEnd)
604 subrunEnd = runEnd;
605
606 m_offset = subrunEnd;
607 emitText(m_node, renderer, runStart, subrunEnd);
608 }
609
610 // If we are doing a subrun that doesn't go to the end of the text box,
611 // come back again to finish handling this text box; don't advance to the next one.
612 if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
613 return;
614
615 // Advance and return
616 unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
617 if (nextRunStart > runEnd)
618 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
619 m_textBox = nextTextBox;
620 if (renderer->containsReversedText())
621 ++m_sortedTextBoxesPosition;
622 return;
623 }
624 // Advance and continue
625 m_textBox = nextTextBox;
626 if (renderer->containsReversedText())
627 ++m_sortedTextBoxesPosition;
628 }
629 if (!m_textBox && m_remainingTextBox) {
630 m_textBox = m_remainingTextBox;
631 m_remainingTextBox = 0;
632 m_firstLetterText = 0;
633 m_offset = 0;
634 handleTextBox();
635 }
636 }
637
handleTextNodeFirstLetter(RenderTextFragment * renderer)638 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
639 {
640 if (renderer->firstLetter()) {
641 RenderObject* r = renderer->firstLetter();
642 if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
643 return;
644 for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) {
645 if (currChild->isText()) {
646 RenderText* firstLetter = toRenderText(currChild);
647 m_handledFirstLetter = true;
648 m_remainingTextBox = m_textBox;
649 m_textBox = firstLetter->firstTextBox();
650 m_firstLetterText = firstLetter;
651 return;
652 }
653 }
654 }
655 m_handledFirstLetter = true;
656 }
657
handleReplacedElement()658 bool TextIterator::handleReplacedElement()
659 {
660 if (m_fullyClippedStack.top())
661 return false;
662
663 RenderObject* renderer = m_node->renderer();
664 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
665 return false;
666
667 if (m_lastTextNodeEndedWithCollapsedSpace) {
668 emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
669 return false;
670 }
671
672 if (m_entersTextControls && renderer->isTextControl()) {
673 if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
674 m_node = innerTextElement->shadowTreeRootNode();
675 pushFullyClippedState(m_fullyClippedStack, m_node);
676 m_offset = 0;
677 return false;
678 }
679 }
680
681 m_hasEmitted = true;
682
683 if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
684 emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
685 return true;
686 }
687
688 if (m_emitsCharactersBetweenAllVisiblePositions) {
689 // We want replaced elements to behave like punctuation for boundary
690 // finding, and to simply take up space for the selection preservation
691 // code in moveParagraphs, so we use a comma.
692 emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
693 return true;
694 }
695
696 m_positionNode = m_node->parentNode();
697 m_positionOffsetBaseNode = m_node;
698 m_positionStartOffset = 0;
699 m_positionEndOffset = 1;
700
701 m_textCharacters = 0;
702 m_textLength = 0;
703
704 m_lastCharacter = 0;
705
706 return true;
707 }
708
hasVisibleTextNode(RenderText * renderer)709 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
710 {
711 if (renderer->style()->visibility() == VISIBLE)
712 return true;
713 if (renderer->isTextFragment()) {
714 RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
715 if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
716 return true;
717 }
718 return false;
719 }
720
shouldEmitTabBeforeNode(Node * node)721 static bool shouldEmitTabBeforeNode(Node* node)
722 {
723 RenderObject* r = node->renderer();
724
725 // Table cells are delimited by tabs.
726 if (!r || !isTableCell(node))
727 return false;
728
729 // Want a tab before every cell other than the first one
730 RenderTableCell* rc = toRenderTableCell(r);
731 RenderTable* t = rc->table();
732 return t && (t->cellBefore(rc) || t->cellAbove(rc));
733 }
734
shouldEmitNewlineForNode(Node * node)735 static bool shouldEmitNewlineForNode(Node* node)
736 {
737 // br elements are represented by a single newline.
738 RenderObject* r = node->renderer();
739 if (!r)
740 return node->hasTagName(brTag);
741
742 return r->isBR();
743 }
744
shouldEmitNewlinesBeforeAndAfterNode(Node * node)745 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
746 {
747 // Block flow (versus inline flow) is represented by having
748 // a newline both before and after the element.
749 RenderObject* r = node->renderer();
750 if (!r) {
751 return (node->hasTagName(blockquoteTag)
752 || node->hasTagName(ddTag)
753 || node->hasTagName(divTag)
754 || node->hasTagName(dlTag)
755 || node->hasTagName(dtTag)
756 || node->hasTagName(h1Tag)
757 || node->hasTagName(h2Tag)
758 || node->hasTagName(h3Tag)
759 || node->hasTagName(h4Tag)
760 || node->hasTagName(h5Tag)
761 || node->hasTagName(h6Tag)
762 || node->hasTagName(hrTag)
763 || node->hasTagName(liTag)
764 || node->hasTagName(listingTag)
765 || node->hasTagName(olTag)
766 || node->hasTagName(pTag)
767 || node->hasTagName(preTag)
768 || node->hasTagName(trTag)
769 || node->hasTagName(ulTag));
770 }
771
772 // Need to make an exception for table cells, because they are blocks, but we
773 // want them tab-delimited rather than having newlines before and after.
774 if (isTableCell(node))
775 return false;
776
777 // Need to make an exception for table row elements, because they are neither
778 // "inline" or "RenderBlock", but we want newlines for them.
779 if (r->isTableRow()) {
780 RenderTable* t = toRenderTableRow(r)->table();
781 if (t && !t->isInline())
782 return true;
783 }
784
785 return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
786 }
787
shouldEmitNewlineAfterNode(Node * node)788 static bool shouldEmitNewlineAfterNode(Node* node)
789 {
790 // FIXME: It should be better but slower to create a VisiblePosition here.
791 if (!shouldEmitNewlinesBeforeAndAfterNode(node))
792 return false;
793 // Check if this is the very last renderer in the document.
794 // If so, then we should not emit a newline.
795 while ((node = node->traverseNextSibling()))
796 if (node->renderer())
797 return true;
798 return false;
799 }
800
shouldEmitNewlineBeforeNode(Node * node)801 static bool shouldEmitNewlineBeforeNode(Node* node)
802 {
803 return shouldEmitNewlinesBeforeAndAfterNode(node);
804 }
805
shouldEmitExtraNewlineForNode(Node * node)806 static bool shouldEmitExtraNewlineForNode(Node* node)
807 {
808 // When there is a significant collapsed bottom margin, emit an extra
809 // newline for a more realistic result. We end up getting the right
810 // result even without margin collapsing. For example: <div><p>text</p></div>
811 // will work right even if both the <div> and the <p> have bottom margins.
812 RenderObject* r = node->renderer();
813 if (!r || !r->isBox())
814 return false;
815
816 // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
817 // not to do this at all
818 if (node->hasTagName(h1Tag)
819 || node->hasTagName(h2Tag)
820 || node->hasTagName(h3Tag)
821 || node->hasTagName(h4Tag)
822 || node->hasTagName(h5Tag)
823 || node->hasTagName(h6Tag)
824 || node->hasTagName(pTag)) {
825 RenderStyle* style = r->style();
826 if (style) {
827 int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
828 int fontSize = style->fontDescription().computedPixelSize();
829 if (bottomMargin * 2 >= fontSize)
830 return true;
831 }
832 }
833
834 return false;
835 }
836
collapsedSpaceLength(RenderText * renderer,int textEnd)837 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
838 {
839 const UChar* characters = renderer->text()->characters();
840 int length = renderer->text()->length();
841 for (int i = textEnd; i < length; ++i) {
842 if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
843 return i - textEnd;
844 }
845
846 return length - textEnd;
847 }
848
maxOffsetIncludingCollapsedSpaces(Node * node)849 static int maxOffsetIncludingCollapsedSpaces(Node* node)
850 {
851 int offset = caretMaxOffset(node);
852
853 if (node->renderer() && node->renderer()->isText())
854 offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
855
856 return offset;
857 }
858
859 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
shouldRepresentNodeOffsetZero()860 bool TextIterator::shouldRepresentNodeOffsetZero()
861 {
862 if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
863 return true;
864
865 // Leave element positioned flush with start of a paragraph
866 // (e.g. do not insert tab before a table cell at the start of a paragraph)
867 if (m_lastCharacter == '\n')
868 return false;
869
870 // Otherwise, show the position if we have emitted any characters
871 if (m_hasEmitted)
872 return true;
873
874 // We've not emitted anything yet. Generally, there is no need for any positioning then.
875 // The only exception is when the element is visually not in the same line as
876 // the start of the range (e.g. the range starts at the end of the previous paragraph).
877 // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
878 // make quicker checks to possibly avoid that. Another check that we could make is
879 // is whether the inline vs block flow changed since the previous visible element.
880 // I think we're already in a special enough case that that won't be needed, tho.
881
882 // No character needed if this is the first node in the range.
883 if (m_node == m_startContainer)
884 return false;
885
886 // If we are outside the start container's subtree, assume we need to emit.
887 // FIXME: m_startContainer could be an inline block
888 if (!m_node->isDescendantOf(m_startContainer))
889 return true;
890
891 // If we started as m_startContainer offset 0 and the current node is a descendant of
892 // the start container, we already had enough context to correctly decide whether to
893 // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
894 // so don't second guess that now.
895 // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
896 // immaterial since we likely would have already emitted something by now.
897 if (m_startOffset == 0)
898 return false;
899
900 // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
901 // Additionally, if the range we are iterating over contains huge sections of unrendered content,
902 // we would create VisiblePositions on every call to this function without this check.
903 if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
904 return false;
905
906 // The startPos.isNotNull() check is needed because the start could be before the body,
907 // and in that case we'll get null. We don't want to put in newlines at the start in that case.
908 // The currPos.isNotNull() check is needed because positions in non-HTML content
909 // (like SVG) do not have visible positions, and we don't want to emit for them either.
910 VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
911 VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
912 return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
913 }
914
shouldEmitSpaceBeforeAndAfterNode(Node * node)915 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
916 {
917 return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
918 }
919
representNodeOffsetZero()920 void TextIterator::representNodeOffsetZero()
921 {
922 // Emit a character to show the positioning of m_node.
923
924 // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
925 // create VisiblePositions, which is expensive. So, we perform the inexpensive checks
926 // on m_node to see if it necessitates emitting a character first and will early return
927 // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
928 if (shouldEmitTabBeforeNode(m_node)) {
929 if (shouldRepresentNodeOffsetZero())
930 emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
931 } else if (shouldEmitNewlineBeforeNode(m_node)) {
932 if (shouldRepresentNodeOffsetZero())
933 emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
934 } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
935 if (shouldRepresentNodeOffsetZero())
936 emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
937 }
938 }
939
handleNonTextNode()940 bool TextIterator::handleNonTextNode()
941 {
942 if (shouldEmitNewlineForNode(m_node))
943 emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
944 else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
945 emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
946 else
947 representNodeOffsetZero();
948
949 return true;
950 }
951
exitNode()952 void TextIterator::exitNode()
953 {
954 // prevent emitting a newline when exiting a collapsed block at beginning of the range
955 // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
956 // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
957 // therefore look like a blank line.
958 if (!m_hasEmitted)
959 return;
960
961 // Emit with a position *inside* m_node, after m_node's contents, in
962 // case it is a block, because the run should start where the
963 // emitted character is positioned visually.
964 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
965 // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
966 // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use
967 // TextIterator in _web_attributedStringFromRange.
968 // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
969 if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
970 // use extra newline to represent margin bottom, as needed
971 bool addNewline = shouldEmitExtraNewlineForNode(m_node);
972
973 // FIXME: We need to emit a '\n' as we leave an empty block(s) that
974 // contain a VisiblePosition when doing selection preservation.
975 if (m_lastCharacter != '\n') {
976 // insert a newline with a position following this block's contents.
977 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
978 // remember whether to later add a newline for the current node
979 ASSERT(!m_needsAnotherNewline);
980 m_needsAnotherNewline = addNewline;
981 } else if (addNewline)
982 // insert a newline with a position following this block's contents.
983 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
984 }
985
986 // If nothing was emitted, see if we need to emit a space.
987 if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
988 emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
989 }
990
emitCharacter(UChar c,Node * textNode,Node * offsetBaseNode,int textStartOffset,int textEndOffset)991 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
992 {
993 m_hasEmitted = true;
994
995 // remember information with which to construct the TextIterator::range()
996 // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
997 m_positionNode = textNode;
998 m_positionOffsetBaseNode = offsetBaseNode;
999 m_positionStartOffset = textStartOffset;
1000 m_positionEndOffset = textEndOffset;
1001
1002 // remember information with which to construct the TextIterator::characters() and length()
1003 m_singleCharacterBuffer = c;
1004 m_textCharacters = &m_singleCharacterBuffer;
1005 m_textLength = 1;
1006
1007 // remember some iteration state
1008 m_lastTextNodeEndedWithCollapsedSpace = false;
1009 m_lastCharacter = c;
1010 }
1011
emitText(Node * textNode,RenderObject * renderObject,int textStartOffset,int textEndOffset)1012 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
1013 {
1014 RenderText* renderer = toRenderText(renderObject);
1015 m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
1016 ASSERT(m_text.characters());
1017
1018 m_positionNode = textNode;
1019 m_positionOffsetBaseNode = 0;
1020 m_positionStartOffset = textStartOffset;
1021 m_positionEndOffset = textEndOffset;
1022 m_textCharacters = m_text.characters() + textStartOffset;
1023 m_textLength = textEndOffset - textStartOffset;
1024 m_lastCharacter = m_text[textEndOffset - 1];
1025
1026 m_lastTextNodeEndedWithCollapsedSpace = false;
1027 m_hasEmitted = true;
1028 }
1029
emitText(Node * textNode,int textStartOffset,int textEndOffset)1030 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
1031 {
1032 emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
1033 }
1034
range() const1035 PassRefPtr<Range> TextIterator::range() const
1036 {
1037 // use the current run information, if we have it
1038 if (m_positionNode) {
1039 if (m_positionOffsetBaseNode) {
1040 int index = m_positionOffsetBaseNode->nodeIndex();
1041 m_positionStartOffset += index;
1042 m_positionEndOffset += index;
1043 m_positionOffsetBaseNode = 0;
1044 }
1045 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1046 }
1047
1048 // otherwise, return the end of the overall range we were given
1049 if (m_endContainer)
1050 return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
1051
1052 return 0;
1053 }
1054
node() const1055 Node* TextIterator::node() const
1056 {
1057 RefPtr<Range> textRange = range();
1058 if (!textRange)
1059 return 0;
1060
1061 Node* node = textRange->startContainer();
1062 if (!node)
1063 return 0;
1064 if (node->offsetInCharacters())
1065 return node;
1066
1067 return node->childNode(textRange->startOffset());
1068 }
1069
1070 // --------
1071
SimplifiedBackwardsTextIterator()1072 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
1073 : m_behavior(TextIteratorDefaultBehavior)
1074 , m_node(0)
1075 , m_positionNode(0)
1076 #if OS(ANDROID)
1077 , m_stopsOnFormControls(false)
1078 , m_shouldStop(false)
1079 #endif
1080 {
1081 }
1082
SimplifiedBackwardsTextIterator(const Range * r,TextIteratorBehavior behavior)1083 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
1084 : m_behavior(behavior)
1085 , m_node(0)
1086 , m_positionNode(0)
1087 #if OS(ANDROID)
1088 , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
1089 , m_shouldStop(false)
1090 #endif
1091 {
1092 #if OS(ANDROID)
1093 ASSERT(m_behavior == TextIteratorDefaultBehavior || m_behavior == TextIteratorStopsOnFormControls);
1094 #else
1095 ASSERT(m_behavior == TextIteratorDefaultBehavior);
1096 #endif
1097
1098 if (!r)
1099 return;
1100
1101 Node* startNode = r->startContainer();
1102 if (!startNode)
1103 return;
1104 Node* endNode = r->endContainer();
1105 int startOffset = r->startOffset();
1106 int endOffset = r->endOffset();
1107
1108 if (!startNode->offsetInCharacters()) {
1109 if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
1110 startNode = startNode->childNode(startOffset);
1111 startOffset = 0;
1112 }
1113 }
1114 if (!endNode->offsetInCharacters()) {
1115 if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
1116 endNode = endNode->childNode(endOffset - 1);
1117 endOffset = lastOffsetInNode(endNode);
1118 }
1119 }
1120
1121 m_node = endNode;
1122 setUpFullyClippedStack(m_fullyClippedStack, m_node);
1123 m_offset = endOffset;
1124 m_handledNode = false;
1125 m_handledChildren = endOffset == 0;
1126
1127 m_startNode = startNode;
1128 m_startOffset = startOffset;
1129 m_endNode = endNode;
1130 m_endOffset = endOffset;
1131
1132 #ifndef NDEBUG
1133 // Need this just because of the assert.
1134 m_positionNode = endNode;
1135 #endif
1136
1137 m_lastTextNode = 0;
1138 m_lastCharacter = '\n';
1139
1140 m_havePassedStartNode = false;
1141
1142 advance();
1143 }
1144
atEnd() const1145 bool SimplifiedBackwardsTextIterator::atEnd() const
1146 {
1147 #if OS(ANDROID)
1148 return !m_positionNode || m_shouldStop;
1149 #else
1150 return !m_positionNode;
1151 #endif
1152 }
1153
advance()1154 void SimplifiedBackwardsTextIterator::advance()
1155 {
1156 ASSERT(m_positionNode);
1157
1158 #if OS(ANDROID)
1159 if (m_shouldStop)
1160 return;
1161
1162 // Prevent changing the iterator position if a form control element was found and advance should stop on it.
1163 if (m_stopsOnFormControls && checkFormControlElement(m_node)) {
1164 m_shouldStop = true;
1165 return;
1166 }
1167 #endif
1168
1169 m_positionNode = 0;
1170 m_textLength = 0;
1171
1172 while (m_node && !m_havePassedStartNode) {
1173 // Don't handle node if we start iterating at [node, 0].
1174 if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
1175 RenderObject* renderer = m_node->renderer();
1176 if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
1177 // FIXME: What about CDATA_SECTION_NODE?
1178 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1179 m_handledNode = handleTextNode();
1180 } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1181 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1182 m_handledNode = handleReplacedElement();
1183 } else
1184 m_handledNode = handleNonTextNode();
1185 if (m_positionNode)
1186 return;
1187 }
1188
1189 if (!m_handledChildren && m_node->hasChildNodes()) {
1190 m_node = m_node->lastChild();
1191 pushFullyClippedState(m_fullyClippedStack, m_node);
1192 } else {
1193 // Exit empty containers as we pass over them or containers
1194 // where [container, 0] is where we started iterating.
1195 if (!m_handledNode
1196 && canHaveChildrenForEditing(m_node)
1197 && m_node->parentNode()
1198 && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
1199 exitNode();
1200 if (m_positionNode) {
1201 m_handledNode = true;
1202 m_handledChildren = true;
1203 return;
1204 }
1205 }
1206
1207 // Exit all other containers.
1208 while (!m_node->previousSibling()) {
1209 if (!advanceRespectingRange(m_node->parentOrHostNode()))
1210 break;
1211 m_fullyClippedStack.pop();
1212 exitNode();
1213 if (m_positionNode) {
1214 m_handledNode = true;
1215 m_handledChildren = true;
1216 return;
1217 }
1218 }
1219
1220 m_fullyClippedStack.pop();
1221 if (advanceRespectingRange(m_node->previousSibling()))
1222 pushFullyClippedState(m_fullyClippedStack, m_node);
1223 else
1224 m_node = 0;
1225 }
1226
1227 // For the purpose of word boundary detection,
1228 // we should iterate all visible text and trailing (collapsed) whitespaces.
1229 m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
1230 m_handledNode = false;
1231 m_handledChildren = false;
1232
1233 if (m_positionNode)
1234 return;
1235 }
1236 }
1237
handleTextNode()1238 bool SimplifiedBackwardsTextIterator::handleTextNode()
1239 {
1240 m_lastTextNode = m_node;
1241
1242 RenderText* renderer = toRenderText(m_node->renderer());
1243 String str = renderer->text();
1244
1245 if (!renderer->firstTextBox() && str.length() > 0)
1246 return true;
1247
1248 m_positionEndOffset = m_offset;
1249
1250 m_offset = (m_node == m_startNode) ? m_startOffset : 0;
1251 m_positionNode = m_node;
1252 m_positionStartOffset = m_offset;
1253 m_textLength = m_positionEndOffset - m_positionStartOffset;
1254 m_textCharacters = str.characters() + m_positionStartOffset;
1255
1256 m_lastCharacter = str[m_positionEndOffset - 1];
1257
1258 return true;
1259 }
1260
handleReplacedElement()1261 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1262 {
1263 unsigned index = m_node->nodeIndex();
1264 // We want replaced elements to behave like punctuation for boundary
1265 // finding, and to simply take up space for the selection preservation
1266 // code in moveParagraphs, so we use a comma. Unconditionally emit
1267 // here because this iterator is only used for boundary finding.
1268 emitCharacter(',', m_node->parentNode(), index, index + 1);
1269 return true;
1270 }
1271
handleNonTextNode()1272 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1273 {
1274 // We can use a linefeed in place of a tab because this simple iterator is only used to
1275 // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs.
1276 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1277 unsigned index = m_node->nodeIndex();
1278 // The start of this emitted range is wrong. Ensuring correctness would require
1279 // VisiblePositions and so would be slow. previousBoundary expects this.
1280 emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
1281 }
1282 return true;
1283 }
1284
exitNode()1285 void SimplifiedBackwardsTextIterator::exitNode()
1286 {
1287 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1288 // The start of this emitted range is wrong. Ensuring correctness would require
1289 // VisiblePositions and so would be slow. previousBoundary expects this.
1290 emitCharacter('\n', m_node, 0, 0);
1291 }
1292 }
1293
emitCharacter(UChar c,Node * node,int startOffset,int endOffset)1294 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
1295 {
1296 m_singleCharacterBuffer = c;
1297 m_positionNode = node;
1298 m_positionStartOffset = startOffset;
1299 m_positionEndOffset = endOffset;
1300 m_textCharacters = &m_singleCharacterBuffer;
1301 m_textLength = 1;
1302 m_lastCharacter = c;
1303 }
1304
advanceRespectingRange(Node * next)1305 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1306 {
1307 if (!next)
1308 return false;
1309 m_havePassedStartNode |= m_node == m_startNode;
1310 if (m_havePassedStartNode)
1311 return false;
1312 m_node = next;
1313 return true;
1314 }
1315
range() const1316 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
1317 {
1318 if (m_positionNode)
1319 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1320
1321 return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
1322 }
1323
1324 // --------
1325
CharacterIterator()1326 CharacterIterator::CharacterIterator()
1327 : m_offset(0)
1328 , m_runOffset(0)
1329 , m_atBreak(true)
1330 {
1331 }
1332
CharacterIterator(const Range * r,TextIteratorBehavior behavior)1333 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
1334 : m_offset(0)
1335 , m_runOffset(0)
1336 , m_atBreak(true)
1337 , m_textIterator(r, behavior)
1338 {
1339 while (!atEnd() && m_textIterator.length() == 0)
1340 m_textIterator.advance();
1341 }
1342
range() const1343 PassRefPtr<Range> CharacterIterator::range() const
1344 {
1345 RefPtr<Range> r = m_textIterator.range();
1346 if (!m_textIterator.atEnd()) {
1347 if (m_textIterator.length() <= 1) {
1348 ASSERT(m_runOffset == 0);
1349 } else {
1350 Node* n = r->startContainer();
1351 ASSERT(n == r->endContainer());
1352 int offset = r->startOffset() + m_runOffset;
1353 ExceptionCode ec = 0;
1354 r->setStart(n, offset, ec);
1355 r->setEnd(n, offset + 1, ec);
1356 ASSERT(!ec);
1357 }
1358 }
1359 return r.release();
1360 }
1361
advance(int count)1362 void CharacterIterator::advance(int count)
1363 {
1364 if (count <= 0) {
1365 ASSERT(count == 0);
1366 return;
1367 }
1368
1369 m_atBreak = false;
1370
1371 // easy if there is enough left in the current m_textIterator run
1372 int remaining = m_textIterator.length() - m_runOffset;
1373 if (count < remaining) {
1374 m_runOffset += count;
1375 m_offset += count;
1376 return;
1377 }
1378
1379 // exhaust the current m_textIterator run
1380 count -= remaining;
1381 m_offset += remaining;
1382
1383 // move to a subsequent m_textIterator run
1384 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1385 int runLength = m_textIterator.length();
1386 if (runLength == 0)
1387 m_atBreak = true;
1388 else {
1389 // see whether this is m_textIterator to use
1390 if (count < runLength) {
1391 m_runOffset = count;
1392 m_offset += count;
1393 return;
1394 }
1395
1396 // exhaust this m_textIterator run
1397 count -= runLength;
1398 m_offset += runLength;
1399 }
1400 }
1401
1402 // ran to the end of the m_textIterator... no more runs left
1403 m_atBreak = true;
1404 m_runOffset = 0;
1405 }
1406
string(int numChars)1407 String CharacterIterator::string(int numChars)
1408 {
1409 Vector<UChar> result;
1410 result.reserveInitialCapacity(numChars);
1411 while (numChars > 0 && !atEnd()) {
1412 int runSize = min(numChars, length());
1413 result.append(characters(), runSize);
1414 numChars -= runSize;
1415 advance(runSize);
1416 }
1417 return String::adopt(result);
1418 }
1419
characterSubrange(CharacterIterator & it,int offset,int length)1420 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1421 {
1422 it.advance(offset);
1423 RefPtr<Range> start = it.range();
1424
1425 if (length > 1)
1426 it.advance(length - 1);
1427 RefPtr<Range> end = it.range();
1428
1429 return Range::create(start->startContainer()->document(),
1430 start->startContainer(), start->startOffset(),
1431 end->endContainer(), end->endOffset());
1432 }
1433
BackwardsCharacterIterator()1434 BackwardsCharacterIterator::BackwardsCharacterIterator()
1435 : m_offset(0)
1436 , m_runOffset(0)
1437 , m_atBreak(true)
1438 {
1439 }
1440
BackwardsCharacterIterator(const Range * range,TextIteratorBehavior behavior)1441 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
1442 : m_offset(0)
1443 , m_runOffset(0)
1444 , m_atBreak(true)
1445 , m_textIterator(range, behavior)
1446 {
1447 while (!atEnd() && !m_textIterator.length())
1448 m_textIterator.advance();
1449 }
1450
range() const1451 PassRefPtr<Range> BackwardsCharacterIterator::range() const
1452 {
1453 RefPtr<Range> r = m_textIterator.range();
1454 if (!m_textIterator.atEnd()) {
1455 if (m_textIterator.length() <= 1)
1456 ASSERT(m_runOffset == 0);
1457 else {
1458 Node* n = r->startContainer();
1459 ASSERT(n == r->endContainer());
1460 int offset = r->endOffset() - m_runOffset;
1461 ExceptionCode ec = 0;
1462 r->setStart(n, offset - 1, ec);
1463 r->setEnd(n, offset, ec);
1464 ASSERT(!ec);
1465 }
1466 }
1467 return r.release();
1468 }
1469
advance(int count)1470 void BackwardsCharacterIterator::advance(int count)
1471 {
1472 if (count <= 0) {
1473 ASSERT(!count);
1474 return;
1475 }
1476
1477 m_atBreak = false;
1478
1479 int remaining = m_textIterator.length() - m_runOffset;
1480 if (count < remaining) {
1481 m_runOffset += count;
1482 m_offset += count;
1483 return;
1484 }
1485
1486 count -= remaining;
1487 m_offset += remaining;
1488
1489 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1490 int runLength = m_textIterator.length();
1491 if (runLength == 0)
1492 m_atBreak = true;
1493 else {
1494 if (count < runLength) {
1495 m_runOffset = count;
1496 m_offset += count;
1497 return;
1498 }
1499
1500 count -= runLength;
1501 m_offset += runLength;
1502 }
1503 }
1504
1505 m_atBreak = true;
1506 m_runOffset = 0;
1507 }
1508
1509 // --------
1510
WordAwareIterator()1511 WordAwareIterator::WordAwareIterator()
1512 : m_previousText(0)
1513 , m_didLookAhead(false)
1514 {
1515 }
1516
WordAwareIterator(const Range * r)1517 WordAwareIterator::WordAwareIterator(const Range* r)
1518 : m_previousText(0)
1519 , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1520 , m_textIterator(r)
1521 {
1522 advance(); // get in position over the first chunk of text
1523 }
1524
~WordAwareIterator()1525 WordAwareIterator::~WordAwareIterator()
1526 {
1527 }
1528
1529 // We're always in one of these modes:
1530 // - The current chunk in the text iterator is our current chunk
1531 // (typically its a piece of whitespace, or text that ended with whitespace)
1532 // - The previous chunk in the text iterator is our current chunk
1533 // (we looked ahead to the next chunk and found a word boundary)
1534 // - We built up our own chunk of text from many chunks from the text iterator
1535
1536 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1537
advance()1538 void WordAwareIterator::advance()
1539 {
1540 m_previousText = 0;
1541 m_buffer.clear(); // toss any old buffer we built up
1542
1543 // If last time we did a look-ahead, start with that looked-ahead chunk now
1544 if (!m_didLookAhead) {
1545 ASSERT(!m_textIterator.atEnd());
1546 m_textIterator.advance();
1547 }
1548 m_didLookAhead = false;
1549
1550 // Go to next non-empty chunk
1551 while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1552 m_textIterator.advance();
1553 m_range = m_textIterator.range();
1554
1555 if (m_textIterator.atEnd())
1556 return;
1557
1558 while (1) {
1559 // If this chunk ends in whitespace we can just use it as our chunk.
1560 if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1561 return;
1562
1563 // If this is the first chunk that failed, save it in previousText before look ahead
1564 if (m_buffer.isEmpty()) {
1565 m_previousText = m_textIterator.characters();
1566 m_previousLength = m_textIterator.length();
1567 }
1568
1569 // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff
1570 m_textIterator.advance();
1571 if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1572 m_didLookAhead = true;
1573 return;
1574 }
1575
1576 if (m_buffer.isEmpty()) {
1577 // Start gobbling chunks until we get to a suitable stopping point
1578 m_buffer.append(m_previousText, m_previousLength);
1579 m_previousText = 0;
1580 }
1581 m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1582 int exception = 0;
1583 m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1584 }
1585 }
1586
length() const1587 int WordAwareIterator::length() const
1588 {
1589 if (!m_buffer.isEmpty())
1590 return m_buffer.size();
1591 if (m_previousText)
1592 return m_previousLength;
1593 return m_textIterator.length();
1594 }
1595
characters() const1596 const UChar* WordAwareIterator::characters() const
1597 {
1598 if (!m_buffer.isEmpty())
1599 return m_buffer.data();
1600 if (m_previousText)
1601 return m_previousText;
1602 return m_textIterator.characters();
1603 }
1604
1605 // --------
1606
foldQuoteMarkOrSoftHyphen(UChar c)1607 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
1608 {
1609 switch (c) {
1610 case hebrewPunctuationGershayim:
1611 case leftDoubleQuotationMark:
1612 case rightDoubleQuotationMark:
1613 return '"';
1614 case hebrewPunctuationGeresh:
1615 case leftSingleQuotationMark:
1616 case rightSingleQuotationMark:
1617 return '\'';
1618 case softHyphen:
1619 // Replace soft hyphen with an ignorable character so that their presence or absence will
1620 // not affect string comparison.
1621 return 0;
1622 default:
1623 return c;
1624 }
1625 }
1626
foldQuoteMarksAndSoftHyphens(String & s)1627 static inline void foldQuoteMarksAndSoftHyphens(String& s)
1628 {
1629 s.replace(hebrewPunctuationGeresh, '\'');
1630 s.replace(hebrewPunctuationGershayim, '"');
1631 s.replace(leftDoubleQuotationMark, '"');
1632 s.replace(leftSingleQuotationMark, '\'');
1633 s.replace(rightDoubleQuotationMark, '"');
1634 s.replace(rightSingleQuotationMark, '\'');
1635 // Replace soft hyphen with an ignorable character so that their presence or absence will
1636 // not affect string comparison.
1637 s.replace(softHyphen, 0);
1638 }
1639
1640 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1641
foldQuoteMarksAndSoftHyphens(UChar * data,size_t length)1642 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
1643 {
1644 for (size_t i = 0; i < length; ++i)
1645 data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
1646 }
1647
1648 static const size_t minimumSearchBufferSize = 8192;
1649
1650 #ifndef NDEBUG
1651 static bool searcherInUse;
1652 #endif
1653
createSearcher()1654 static UStringSearch* createSearcher()
1655 {
1656 // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1657 // but it doesn't matter exactly what it is, since we don't perform any searches
1658 // without setting both the pattern and the text.
1659 UErrorCode status = U_ZERO_ERROR;
1660 UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
1661 ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1662 return searcher;
1663 }
1664
searcher()1665 static UStringSearch* searcher()
1666 {
1667 static UStringSearch* searcher = createSearcher();
1668 return searcher;
1669 }
1670
lockSearcher()1671 static inline void lockSearcher()
1672 {
1673 #ifndef NDEBUG
1674 ASSERT(!searcherInUse);
1675 searcherInUse = true;
1676 #endif
1677 }
1678
unlockSearcher()1679 static inline void unlockSearcher()
1680 {
1681 #ifndef NDEBUG
1682 ASSERT(searcherInUse);
1683 searcherInUse = false;
1684 #endif
1685 }
1686
1687 // ICU's search ignores the distinction between small kana letters and ones
1688 // that are not small, and also characters that differ only in the voicing
1689 // marks when considering only primary collation strength diffrences.
1690 // This is not helpful for end users, since these differences make words
1691 // distinct, so for our purposes we need these to be considered.
1692 // The Unicode folks do not think the collation algorithm should be
1693 // changed. To work around this, we would like to tailor the ICU searcher,
1694 // but we can't get that to work yet. So instead, we check for cases where
1695 // these differences occur, and skip those matches.
1696
1697 // We refer to the above technique as the "kana workaround". The next few
1698 // functions are helper functinos for the kana workaround.
1699
isKanaLetter(UChar character)1700 static inline bool isKanaLetter(UChar character)
1701 {
1702 // Hiragana letters.
1703 if (character >= 0x3041 && character <= 0x3096)
1704 return true;
1705
1706 // Katakana letters.
1707 if (character >= 0x30A1 && character <= 0x30FA)
1708 return true;
1709 if (character >= 0x31F0 && character <= 0x31FF)
1710 return true;
1711
1712 // Halfwidth katakana letters.
1713 if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1714 return true;
1715
1716 return false;
1717 }
1718
isSmallKanaLetter(UChar character)1719 static inline bool isSmallKanaLetter(UChar character)
1720 {
1721 ASSERT(isKanaLetter(character));
1722
1723 switch (character) {
1724 case 0x3041: // HIRAGANA LETTER SMALL A
1725 case 0x3043: // HIRAGANA LETTER SMALL I
1726 case 0x3045: // HIRAGANA LETTER SMALL U
1727 case 0x3047: // HIRAGANA LETTER SMALL E
1728 case 0x3049: // HIRAGANA LETTER SMALL O
1729 case 0x3063: // HIRAGANA LETTER SMALL TU
1730 case 0x3083: // HIRAGANA LETTER SMALL YA
1731 case 0x3085: // HIRAGANA LETTER SMALL YU
1732 case 0x3087: // HIRAGANA LETTER SMALL YO
1733 case 0x308E: // HIRAGANA LETTER SMALL WA
1734 case 0x3095: // HIRAGANA LETTER SMALL KA
1735 case 0x3096: // HIRAGANA LETTER SMALL KE
1736 case 0x30A1: // KATAKANA LETTER SMALL A
1737 case 0x30A3: // KATAKANA LETTER SMALL I
1738 case 0x30A5: // KATAKANA LETTER SMALL U
1739 case 0x30A7: // KATAKANA LETTER SMALL E
1740 case 0x30A9: // KATAKANA LETTER SMALL O
1741 case 0x30C3: // KATAKANA LETTER SMALL TU
1742 case 0x30E3: // KATAKANA LETTER SMALL YA
1743 case 0x30E5: // KATAKANA LETTER SMALL YU
1744 case 0x30E7: // KATAKANA LETTER SMALL YO
1745 case 0x30EE: // KATAKANA LETTER SMALL WA
1746 case 0x30F5: // KATAKANA LETTER SMALL KA
1747 case 0x30F6: // KATAKANA LETTER SMALL KE
1748 case 0x31F0: // KATAKANA LETTER SMALL KU
1749 case 0x31F1: // KATAKANA LETTER SMALL SI
1750 case 0x31F2: // KATAKANA LETTER SMALL SU
1751 case 0x31F3: // KATAKANA LETTER SMALL TO
1752 case 0x31F4: // KATAKANA LETTER SMALL NU
1753 case 0x31F5: // KATAKANA LETTER SMALL HA
1754 case 0x31F6: // KATAKANA LETTER SMALL HI
1755 case 0x31F7: // KATAKANA LETTER SMALL HU
1756 case 0x31F8: // KATAKANA LETTER SMALL HE
1757 case 0x31F9: // KATAKANA LETTER SMALL HO
1758 case 0x31FA: // KATAKANA LETTER SMALL MU
1759 case 0x31FB: // KATAKANA LETTER SMALL RA
1760 case 0x31FC: // KATAKANA LETTER SMALL RI
1761 case 0x31FD: // KATAKANA LETTER SMALL RU
1762 case 0x31FE: // KATAKANA LETTER SMALL RE
1763 case 0x31FF: // KATAKANA LETTER SMALL RO
1764 case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1765 case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1766 case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1767 case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1768 case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1769 case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1770 case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1771 case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1772 case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1773 return true;
1774 }
1775 return false;
1776 }
1777
1778 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1779
composedVoicedSoundMark(UChar character)1780 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1781 {
1782 ASSERT(isKanaLetter(character));
1783
1784 switch (character) {
1785 case 0x304C: // HIRAGANA LETTER GA
1786 case 0x304E: // HIRAGANA LETTER GI
1787 case 0x3050: // HIRAGANA LETTER GU
1788 case 0x3052: // HIRAGANA LETTER GE
1789 case 0x3054: // HIRAGANA LETTER GO
1790 case 0x3056: // HIRAGANA LETTER ZA
1791 case 0x3058: // HIRAGANA LETTER ZI
1792 case 0x305A: // HIRAGANA LETTER ZU
1793 case 0x305C: // HIRAGANA LETTER ZE
1794 case 0x305E: // HIRAGANA LETTER ZO
1795 case 0x3060: // HIRAGANA LETTER DA
1796 case 0x3062: // HIRAGANA LETTER DI
1797 case 0x3065: // HIRAGANA LETTER DU
1798 case 0x3067: // HIRAGANA LETTER DE
1799 case 0x3069: // HIRAGANA LETTER DO
1800 case 0x3070: // HIRAGANA LETTER BA
1801 case 0x3073: // HIRAGANA LETTER BI
1802 case 0x3076: // HIRAGANA LETTER BU
1803 case 0x3079: // HIRAGANA LETTER BE
1804 case 0x307C: // HIRAGANA LETTER BO
1805 case 0x3094: // HIRAGANA LETTER VU
1806 case 0x30AC: // KATAKANA LETTER GA
1807 case 0x30AE: // KATAKANA LETTER GI
1808 case 0x30B0: // KATAKANA LETTER GU
1809 case 0x30B2: // KATAKANA LETTER GE
1810 case 0x30B4: // KATAKANA LETTER GO
1811 case 0x30B6: // KATAKANA LETTER ZA
1812 case 0x30B8: // KATAKANA LETTER ZI
1813 case 0x30BA: // KATAKANA LETTER ZU
1814 case 0x30BC: // KATAKANA LETTER ZE
1815 case 0x30BE: // KATAKANA LETTER ZO
1816 case 0x30C0: // KATAKANA LETTER DA
1817 case 0x30C2: // KATAKANA LETTER DI
1818 case 0x30C5: // KATAKANA LETTER DU
1819 case 0x30C7: // KATAKANA LETTER DE
1820 case 0x30C9: // KATAKANA LETTER DO
1821 case 0x30D0: // KATAKANA LETTER BA
1822 case 0x30D3: // KATAKANA LETTER BI
1823 case 0x30D6: // KATAKANA LETTER BU
1824 case 0x30D9: // KATAKANA LETTER BE
1825 case 0x30DC: // KATAKANA LETTER BO
1826 case 0x30F4: // KATAKANA LETTER VU
1827 case 0x30F7: // KATAKANA LETTER VA
1828 case 0x30F8: // KATAKANA LETTER VI
1829 case 0x30F9: // KATAKANA LETTER VE
1830 case 0x30FA: // KATAKANA LETTER VO
1831 return VoicedSoundMark;
1832 case 0x3071: // HIRAGANA LETTER PA
1833 case 0x3074: // HIRAGANA LETTER PI
1834 case 0x3077: // HIRAGANA LETTER PU
1835 case 0x307A: // HIRAGANA LETTER PE
1836 case 0x307D: // HIRAGANA LETTER PO
1837 case 0x30D1: // KATAKANA LETTER PA
1838 case 0x30D4: // KATAKANA LETTER PI
1839 case 0x30D7: // KATAKANA LETTER PU
1840 case 0x30DA: // KATAKANA LETTER PE
1841 case 0x30DD: // KATAKANA LETTER PO
1842 return SemiVoicedSoundMark;
1843 }
1844 return NoVoicedSoundMark;
1845 }
1846
isCombiningVoicedSoundMark(UChar character)1847 static inline bool isCombiningVoicedSoundMark(UChar character)
1848 {
1849 switch (character) {
1850 case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1851 case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1852 return true;
1853 }
1854 return false;
1855 }
1856
containsKanaLetters(const String & pattern)1857 static inline bool containsKanaLetters(const String& pattern)
1858 {
1859 const UChar* characters = pattern.characters();
1860 unsigned length = pattern.length();
1861 for (unsigned i = 0; i < length; ++i) {
1862 if (isKanaLetter(characters[i]))
1863 return true;
1864 }
1865 return false;
1866 }
1867
normalizeCharacters(const UChar * characters,unsigned length,Vector<UChar> & buffer)1868 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1869 {
1870 ASSERT(length);
1871
1872 buffer.resize(length);
1873
1874 UErrorCode status = U_ZERO_ERROR;
1875 size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1876 ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1877 ASSERT(bufferSize);
1878
1879 buffer.resize(bufferSize);
1880
1881 if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1882 return;
1883
1884 status = U_ZERO_ERROR;
1885 unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1886 ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1887 }
1888
isNonLatin1Separator(UChar32 character)1889 static bool isNonLatin1Separator(UChar32 character)
1890 {
1891 ASSERT_ARG(character, character >= 256);
1892
1893 return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1894 }
1895
isSeparator(UChar32 character)1896 static inline bool isSeparator(UChar32 character)
1897 {
1898 static const bool latin1SeparatorTable[256] = {
1899 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1900 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1901 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1902 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ?
1903 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @
1904 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _
1905 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // `
1906 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~
1907 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1908 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1909 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1910 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1911 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1912 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1913 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1914 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1915 };
1916
1917 if (character < 256)
1918 return latin1SeparatorTable[character];
1919
1920 return isNonLatin1Separator(character);
1921 }
1922
SearchBuffer(const String & target,FindOptions options)1923 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1924 : m_target(target)
1925 , m_options(options)
1926 , m_prefixLength(0)
1927 , m_atBreak(true)
1928 , m_needsMoreContext(options & AtWordStarts)
1929 , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1930 {
1931 ASSERT(!m_target.isEmpty());
1932
1933 // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1934 // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1935 // to add tailoring on top of the locale-specific tailoring as of this writing.
1936 foldQuoteMarksAndSoftHyphens(m_target);
1937
1938 size_t targetLength = m_target.length();
1939 m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
1940 m_overlap = m_buffer.capacity() / 4;
1941
1942 if ((m_options & AtWordStarts) && targetLength) {
1943 UChar32 targetFirstCharacter;
1944 U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
1945 // Characters in the separator category never really occur at the beginning of a word,
1946 // so if the target begins with such a character, we just ignore the AtWordStart option.
1947 if (isSeparator(targetFirstCharacter)) {
1948 m_options &= ~AtWordStarts;
1949 m_needsMoreContext = false;
1950 }
1951 }
1952
1953 // Grab the single global searcher.
1954 // If we ever have a reason to do more than once search buffer at once, we'll have
1955 // to move to multiple searchers.
1956 lockSearcher();
1957
1958 UStringSearch* searcher = WebCore::searcher();
1959 UCollator* collator = usearch_getCollator(searcher);
1960
1961 UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
1962 if (ucol_getStrength(collator) != strength) {
1963 ucol_setStrength(collator, strength);
1964 usearch_reset(searcher);
1965 }
1966
1967 UErrorCode status = U_ZERO_ERROR;
1968 usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1969 ASSERT(status == U_ZERO_ERROR);
1970
1971 // The kana workaround requires a normalized copy of the target string.
1972 if (m_targetRequiresKanaWorkaround)
1973 normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
1974 }
1975
~SearchBuffer()1976 inline SearchBuffer::~SearchBuffer()
1977 {
1978 // Leave the static object pointing to a valid string.
1979 UErrorCode status = U_ZERO_ERROR;
1980 usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
1981 ASSERT(status == U_ZERO_ERROR);
1982
1983 unlockSearcher();
1984 }
1985
append(const UChar * characters,size_t length)1986 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
1987 {
1988 ASSERT(length);
1989
1990 if (m_atBreak) {
1991 m_buffer.shrink(0);
1992 m_prefixLength = 0;
1993 m_atBreak = false;
1994 } else if (m_buffer.size() == m_buffer.capacity()) {
1995 memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
1996 m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
1997 m_buffer.shrink(m_overlap);
1998 }
1999
2000 size_t oldLength = m_buffer.size();
2001 size_t usableLength = min(m_buffer.capacity() - oldLength, length);
2002 ASSERT(usableLength);
2003 m_buffer.append(characters, usableLength);
2004 foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
2005 return usableLength;
2006 }
2007
needsMoreContext() const2008 inline bool SearchBuffer::needsMoreContext() const
2009 {
2010 return m_needsMoreContext;
2011 }
2012
prependContext(const UChar * characters,size_t length)2013 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
2014 {
2015 ASSERT(m_needsMoreContext);
2016 ASSERT(m_prefixLength == m_buffer.size());
2017
2018 if (!length)
2019 return;
2020
2021 m_atBreak = false;
2022
2023 size_t wordBoundaryContextStart = length;
2024 if (wordBoundaryContextStart) {
2025 U16_BACK_1(characters, 0, wordBoundaryContextStart);
2026 wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
2027 }
2028
2029 size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
2030 m_buffer.prepend(characters + length - usableLength, usableLength);
2031 m_prefixLength += usableLength;
2032
2033 if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
2034 m_needsMoreContext = false;
2035 }
2036
atBreak() const2037 inline bool SearchBuffer::atBreak() const
2038 {
2039 return m_atBreak;
2040 }
2041
reachedBreak()2042 inline void SearchBuffer::reachedBreak()
2043 {
2044 m_atBreak = true;
2045 }
2046
isBadMatch(const UChar * match,size_t matchLength) const2047 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
2048 {
2049 // This function implements the kana workaround. If usearch treats
2050 // it as a match, but we do not want to, then it's a "bad match".
2051 if (!m_targetRequiresKanaWorkaround)
2052 return false;
2053
2054 // Normalize into a match buffer. We reuse a single buffer rather than
2055 // creating a new one each time.
2056 normalizeCharacters(match, matchLength, m_normalizedMatch);
2057
2058 const UChar* a = m_normalizedTarget.begin();
2059 const UChar* aEnd = m_normalizedTarget.end();
2060
2061 const UChar* b = m_normalizedMatch.begin();
2062 const UChar* bEnd = m_normalizedMatch.end();
2063
2064 while (true) {
2065 // Skip runs of non-kana-letter characters. This is necessary so we can
2066 // correctly handle strings where the target and match have different-length
2067 // runs of characters that match, while still double checking the correctness
2068 // of matches of kana letters with other kana letters.
2069 while (a != aEnd && !isKanaLetter(*a))
2070 ++a;
2071 while (b != bEnd && !isKanaLetter(*b))
2072 ++b;
2073
2074 // If we reached the end of either the target or the match, we should have
2075 // reached the end of both; both should have the same number of kana letters.
2076 if (a == aEnd || b == bEnd) {
2077 ASSERT(a == aEnd);
2078 ASSERT(b == bEnd);
2079 return false;
2080 }
2081
2082 // Check for differences in the kana letter character itself.
2083 if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2084 return true;
2085 if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2086 return true;
2087 ++a;
2088 ++b;
2089
2090 // Check for differences in combining voiced sound marks found after the letter.
2091 while (1) {
2092 if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2093 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2094 return true;
2095 break;
2096 }
2097 if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2098 return true;
2099 if (*a != *b)
2100 return true;
2101 ++a;
2102 ++b;
2103 }
2104 }
2105 }
2106
isWordStartMatch(size_t start,size_t length) const2107 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2108 {
2109 ASSERT(m_options & AtWordStarts);
2110
2111 if (!start)
2112 return true;
2113
2114 if (m_options & TreatMedialCapitalAsWordStart) {
2115 int size = m_buffer.size();
2116 int offset = start;
2117 UChar32 firstCharacter;
2118 U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2119 UChar32 previousCharacter;
2120 U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2121
2122 if (isSeparator(firstCharacter)) {
2123 // The start of a separator run is a word start (".org" in "webkit.org").
2124 if (!isSeparator(previousCharacter))
2125 return true;
2126 } else if (isASCIIUpper(firstCharacter)) {
2127 // The start of an uppercase run is a word start ("Kit" in "WebKit").
2128 if (!isASCIIUpper(previousCharacter))
2129 return true;
2130 // The last character of an uppercase run followed by a non-separator, non-digit
2131 // is a word start ("Request" in "XMLHTTPRequest").
2132 offset = start;
2133 U16_FWD_1(m_buffer.data(), offset, size);
2134 UChar32 nextCharacter = 0;
2135 if (offset < size)
2136 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2137 if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2138 return true;
2139 } else if (isASCIIDigit(firstCharacter)) {
2140 // The start of a digit run is a word start ("2" in "WebKit2").
2141 if (!isASCIIDigit(previousCharacter))
2142 return true;
2143 } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2144 // The start of a non-separator, non-uppercase, non-digit run is a word start,
2145 // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2146 return true;
2147 }
2148 }
2149
2150 size_t wordBreakSearchStart = start + length;
2151 while (wordBreakSearchStart > start)
2152 wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
2153 return wordBreakSearchStart == start;
2154 }
2155
search(size_t & start)2156 inline size_t SearchBuffer::search(size_t& start)
2157 {
2158 size_t size = m_buffer.size();
2159 if (m_atBreak) {
2160 if (!size)
2161 return 0;
2162 } else {
2163 if (size != m_buffer.capacity())
2164 return 0;
2165 }
2166
2167 UStringSearch* searcher = WebCore::searcher();
2168
2169 UErrorCode status = U_ZERO_ERROR;
2170 usearch_setText(searcher, m_buffer.data(), size, &status);
2171 ASSERT(status == U_ZERO_ERROR);
2172
2173 usearch_setOffset(searcher, m_prefixLength, &status);
2174 ASSERT(status == U_ZERO_ERROR);
2175
2176 int matchStart = usearch_next(searcher, &status);
2177 ASSERT(status == U_ZERO_ERROR);
2178
2179 nextMatch:
2180 if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2181 ASSERT(matchStart == USEARCH_DONE);
2182 return 0;
2183 }
2184
2185 // Matches that start in the overlap area are only tentative.
2186 // The same match may appear later, matching more characters,
2187 // possibly including a combining character that's not yet in the buffer.
2188 if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2189 size_t overlap = m_overlap;
2190 if (m_options & AtWordStarts) {
2191 // Ensure that there is sufficient context before matchStart the next time around for
2192 // determining if it is at a word boundary.
2193 int wordBoundaryContextStart = matchStart;
2194 U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2195 wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
2196 overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
2197 }
2198 memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2199 m_prefixLength -= min(m_prefixLength, size - overlap);
2200 m_buffer.shrink(overlap);
2201 return 0;
2202 }
2203
2204 size_t matchedLength = usearch_getMatchedLength(searcher);
2205 ASSERT(matchStart + matchedLength <= size);
2206
2207 // If this match is "bad", move on to the next match.
2208 if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
2209 matchStart = usearch_next(searcher, &status);
2210 ASSERT(status == U_ZERO_ERROR);
2211 goto nextMatch;
2212 }
2213
2214 size_t newSize = size - (matchStart + 1);
2215 memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2216 m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
2217 m_buffer.shrink(newSize);
2218
2219 start = size - matchStart;
2220 return matchedLength;
2221 }
2222
2223 #else // !ICU_UNICODE
2224
SearchBuffer(const String & target,FindOptions options)2225 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2226 : m_target(options & CaseInsensitive ? target.foldCase() : target)
2227 , m_options(options)
2228 , m_buffer(m_target.length())
2229 , m_isCharacterStartBuffer(m_target.length())
2230 , m_isBufferFull(false)
2231 , m_cursor(0)
2232 {
2233 ASSERT(!m_target.isEmpty());
2234 m_target.replace(noBreakSpace, ' ');
2235 foldQuoteMarksAndSoftHyphens(m_target);
2236 }
2237
~SearchBuffer()2238 inline SearchBuffer::~SearchBuffer()
2239 {
2240 }
2241
reachedBreak()2242 inline void SearchBuffer::reachedBreak()
2243 {
2244 m_cursor = 0;
2245 m_isBufferFull = false;
2246 }
2247
atBreak() const2248 inline bool SearchBuffer::atBreak() const
2249 {
2250 return !m_cursor && !m_isBufferFull;
2251 }
2252
append(UChar c,bool isStart)2253 inline void SearchBuffer::append(UChar c, bool isStart)
2254 {
2255 m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
2256 m_isCharacterStartBuffer[m_cursor] = isStart;
2257 if (++m_cursor == m_target.length()) {
2258 m_cursor = 0;
2259 m_isBufferFull = true;
2260 }
2261 }
2262
append(const UChar * characters,size_t length)2263 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2264 {
2265 ASSERT(length);
2266 if (!(m_options & CaseInsensitive)) {
2267 append(characters[0], true);
2268 return 1;
2269 }
2270 const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2271 UChar foldedCharacters[maxFoldedCharacters];
2272 bool error;
2273 int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
2274 ASSERT(!error);
2275 ASSERT(numFoldedCharacters);
2276 ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2277 if (!error && numFoldedCharacters) {
2278 numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
2279 append(foldedCharacters[0], true);
2280 for (int i = 1; i < numFoldedCharacters; ++i)
2281 append(foldedCharacters[i], false);
2282 }
2283 return 1;
2284 }
2285
needsMoreContext() const2286 inline bool SearchBuffer::needsMoreContext() const
2287 {
2288 return false;
2289 }
2290
prependContext(const UChar *,size_t)2291 void SearchBuffer::prependContext(const UChar*, size_t)
2292 {
2293 ASSERT_NOT_REACHED();
2294 }
2295
search(size_t & start)2296 inline size_t SearchBuffer::search(size_t& start)
2297 {
2298 if (!m_isBufferFull)
2299 return 0;
2300 if (!m_isCharacterStartBuffer[m_cursor])
2301 return 0;
2302
2303 size_t tailSpace = m_target.length() - m_cursor;
2304 if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2305 return 0;
2306 if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2307 return 0;
2308
2309 start = length();
2310
2311 // Now that we've found a match once, we don't want to find it again, because those
2312 // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2313 // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2314 // we want to get rid of that in the future we could track this with a separate boolean
2315 // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2316 m_isCharacterStartBuffer[m_cursor] = false;
2317
2318 return start;
2319 }
2320
2321 // Returns the number of characters that were appended to the buffer (what we are searching in).
2322 // That's not necessarily the same length as the passed-in target string, because case folding
2323 // can make two strings match even though they're not the same length.
length() const2324 size_t SearchBuffer::length() const
2325 {
2326 size_t bufferSize = m_target.length();
2327 size_t length = 0;
2328 for (size_t i = 0; i < bufferSize; ++i)
2329 length += m_isCharacterStartBuffer[i];
2330 return length;
2331 }
2332
2333 #endif // !ICU_UNICODE
2334
2335 // --------
2336
rangeLength(const Range * r,bool forSelectionPreservation)2337 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
2338 {
2339 int length = 0;
2340 for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2341 length += it.length();
2342
2343 return length;
2344 }
2345
subrange(Range * entireRange,int characterOffset,int characterCount)2346 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2347 {
2348 CharacterIterator entireRangeIterator(entireRange);
2349 return characterSubrange(entireRangeIterator, characterOffset, characterCount);
2350 }
2351
rangeFromLocationAndLength(Element * scope,int rangeLocation,int rangeLength,bool forSelectionPreservation)2352 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2353 {
2354 RefPtr<Range> resultRange = scope->document()->createRange();
2355
2356 int docTextPosition = 0;
2357 int rangeEnd = rangeLocation + rangeLength;
2358 bool startRangeFound = false;
2359
2360 RefPtr<Range> textRunRange;
2361
2362 TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2363
2364 // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2365 if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
2366 textRunRange = it.range();
2367
2368 ExceptionCode ec = 0;
2369 resultRange->setStart(textRunRange->startContainer(), 0, ec);
2370 ASSERT(!ec);
2371 resultRange->setEnd(textRunRange->startContainer(), 0, ec);
2372 ASSERT(!ec);
2373
2374 return resultRange.release();
2375 }
2376
2377 for (; !it.atEnd(); it.advance()) {
2378 int len = it.length();
2379 textRunRange = it.range();
2380
2381 bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
2382 bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
2383
2384 // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
2385 // in those cases that textRunRange is used.
2386 if (foundEnd) {
2387 // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2388 // position for emitted '\n's.
2389 if (len == 1 && it.characters()[0] == '\n') {
2390 scope->document()->updateLayoutIgnorePendingStylesheets();
2391 it.advance();
2392 if (!it.atEnd()) {
2393 RefPtr<Range> range = it.range();
2394 ExceptionCode ec = 0;
2395 textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
2396 ASSERT(!ec);
2397 } else {
2398 Position runStart = textRunRange->startPosition();
2399 Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2400 if (runEnd.isNotNull()) {
2401 ExceptionCode ec = 0;
2402 textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
2403 ASSERT(!ec);
2404 }
2405 }
2406 }
2407 }
2408
2409 if (foundStart) {
2410 startRangeFound = true;
2411 int exception = 0;
2412 if (textRunRange->startContainer()->isTextNode()) {
2413 int offset = rangeLocation - docTextPosition;
2414 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2415 } else {
2416 if (rangeLocation == docTextPosition)
2417 resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2418 else
2419 resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2420 }
2421 }
2422
2423 if (foundEnd) {
2424 int exception = 0;
2425 if (textRunRange->startContainer()->isTextNode()) {
2426 int offset = rangeEnd - docTextPosition;
2427 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2428 } else {
2429 if (rangeEnd == docTextPosition)
2430 resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2431 else
2432 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2433 }
2434 docTextPosition += len;
2435 break;
2436 }
2437 docTextPosition += len;
2438 }
2439
2440 if (!startRangeFound)
2441 return 0;
2442
2443 if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
2444 int exception = 0;
2445 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2446 }
2447
2448 return resultRange.release();
2449 }
2450
locationAndLengthFromRange(const Range * range,size_t & location,size_t & length)2451 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
2452 {
2453 location = notFound;
2454 length = 0;
2455
2456 if (!range->startContainer())
2457 return false;
2458
2459 Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
2460 Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
2461
2462 // The critical assumption is that this only gets called with ranges that
2463 // concentrate on a given area containing the selection root. This is done
2464 // because of text fields and textareas. The DOM for those is not
2465 // directly in the document DOM, so ensure that the range does not cross a
2466 // boundary of one of those.
2467 if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
2468 return false;
2469 if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
2470 return false;
2471
2472 RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
2473 ASSERT(testRange->startContainer() == scope);
2474 location = TextIterator::rangeLength(testRange.get());
2475
2476 ExceptionCode ec;
2477 testRange->setEnd(range->endContainer(), range->endOffset(), ec);
2478 ASSERT(testRange->startContainer() == scope);
2479 length = TextIterator::rangeLength(testRange.get()) - location;
2480 return true;
2481 }
2482
2483 // --------
2484
plainTextToMallocAllocatedBuffer(const Range * r,unsigned & bufferLength,bool isDisplayString,TextIteratorBehavior defaultBehavior)2485 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
2486 {
2487 UChar* result = 0;
2488
2489 // Do this in pieces to avoid massive reallocations if there is a large amount of text.
2490 // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
2491 static const unsigned cMaxSegmentSize = 1 << 16;
2492 bufferLength = 0;
2493 typedef pair<UChar*, unsigned> TextSegment;
2494 OwnPtr<Vector<TextSegment> > textSegments;
2495 Vector<UChar> textBuffer;
2496 textBuffer.reserveInitialCapacity(cMaxSegmentSize);
2497 TextIteratorBehavior behavior = defaultBehavior;
2498 if (!isDisplayString)
2499 behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2500
2501 for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2502 if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
2503 UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
2504 if (!newSegmentBuffer)
2505 goto exit;
2506 memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2507 if (!textSegments)
2508 textSegments = adoptPtr(new Vector<TextSegment>);
2509 textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
2510 textBuffer.clear();
2511 }
2512 textBuffer.append(it.characters(), it.length());
2513 bufferLength += it.length();
2514 }
2515
2516 if (!bufferLength)
2517 return 0;
2518
2519 // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
2520 result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
2521 if (!result)
2522 goto exit;
2523
2524 {
2525 UChar* resultPos = result;
2526 if (textSegments) {
2527 unsigned size = textSegments->size();
2528 for (unsigned i = 0; i < size; ++i) {
2529 const TextSegment& segment = textSegments->at(i);
2530 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
2531 resultPos += segment.second;
2532 }
2533 }
2534 memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2535 }
2536
2537 exit:
2538 if (textSegments) {
2539 unsigned size = textSegments->size();
2540 for (unsigned i = 0; i < size; ++i)
2541 free(textSegments->at(i).first);
2542 }
2543
2544 if (isDisplayString && r->ownerDocument())
2545 r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
2546
2547 return result;
2548 }
2549
plainText(const Range * r,TextIteratorBehavior defaultBehavior)2550 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
2551 {
2552 unsigned length;
2553 UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
2554 if (!buf)
2555 return "";
2556 String result(buf, length);
2557 free(buf);
2558 return result;
2559 }
2560
isAllCollapsibleWhitespace(const String & string)2561 static inline bool isAllCollapsibleWhitespace(const String& string)
2562 {
2563 const UChar* characters = string.characters();
2564 unsigned length = string.length();
2565 for (unsigned i = 0; i < length; ++i) {
2566 if (!isCollapsibleWhitespace(characters[i]))
2567 return false;
2568 }
2569 return true;
2570 }
2571
collapsedToBoundary(const Range * range,bool forward)2572 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
2573 {
2574 ExceptionCode ec = 0;
2575 RefPtr<Range> result = range->cloneRange(ec);
2576 ASSERT(!ec);
2577 result->collapse(!forward, ec);
2578 ASSERT(!ec);
2579 return result.release();
2580 }
2581
findPlainText(CharacterIterator & it,const String & target,FindOptions options,size_t & matchStart)2582 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
2583 {
2584 matchStart = 0;
2585 size_t matchLength = 0;
2586
2587 SearchBuffer buffer(target, options);
2588
2589 if (buffer.needsMoreContext()) {
2590 RefPtr<Range> startRange = it.range();
2591 RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
2592 ExceptionCode ec = 0;
2593 beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
2594 for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2595 buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
2596 if (!buffer.needsMoreContext())
2597 break;
2598 }
2599 }
2600
2601 while (!it.atEnd()) {
2602 it.advance(buffer.append(it.characters(), it.length()));
2603 tryAgain:
2604 size_t matchStartOffset;
2605 if (size_t newMatchLength = buffer.search(matchStartOffset)) {
2606 // Note that we found a match, and where we found it.
2607 size_t lastCharacterInBufferOffset = it.characterOffset();
2608 ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2609 matchStart = lastCharacterInBufferOffset - matchStartOffset;
2610 matchLength = newMatchLength;
2611 // If searching forward, stop on the first match.
2612 // If searching backward, don't stop, so we end up with the last match.
2613 if (!(options & Backwards))
2614 break;
2615 goto tryAgain;
2616 }
2617 if (it.atBreak() && !buffer.atBreak()) {
2618 buffer.reachedBreak();
2619 goto tryAgain;
2620 }
2621 }
2622
2623 return matchLength;
2624 }
2625
findPlainText(const Range * range,const String & target,FindOptions options)2626 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
2627 {
2628 // First, find the text.
2629 size_t matchStart;
2630 size_t matchLength;
2631 {
2632 CharacterIterator findIterator(range, TextIteratorEntersTextControls);
2633 matchLength = findPlainText(findIterator, target, options, matchStart);
2634 if (!matchLength)
2635 return collapsedToBoundary(range, !(options & Backwards));
2636 }
2637
2638 // Then, find the document position of the start and the end of the text.
2639 CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
2640 return characterSubrange(computeRangeIterator, matchStart, matchLength);
2641 }
2642
2643 }
2644