1 /*
2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
3 * Copyright (C) 2005 Alexey Proskuryakov.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "TextIterator.h"
29
30 #include "Document.h"
31 #include "Frame.h"
32 #include "HTMLElement.h"
33 #include "HTMLNames.h"
34 #include "htmlediting.h"
35 #include "InlineTextBox.h"
36 #include "Range.h"
37 #include "RenderTableCell.h"
38 #include "RenderTableRow.h"
39 #include "RenderTextControl.h"
40 #include "RenderTextFragment.h"
41 #include "TextBoundaries.h"
42 #include "TextBreakIterator.h"
43 #include "VisiblePosition.h"
44 #include "visible_units.h"
45 #include <wtf/unicode/CharacterNames.h>
46
47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
48 #include "TextBreakIteratorInternalICU.h"
49 #include <unicode/usearch.h>
50 #endif
51
52 using namespace WTF::Unicode;
53 using namespace std;
54
55 namespace WebCore {
56
57 using namespace HTMLNames;
58
59 // Buffer that knows how to compare with a search target.
60 // Keeps enough of the previous text to be able to search in the future, but no more.
61 // Non-breaking spaces are always equal to normal spaces.
62 // Case folding is also done if the CaseInsensitive option is specified.
63 // Matches are further filtered if the AtWordStarts option is specified, although some
64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
65 class SearchBuffer {
66 WTF_MAKE_NONCOPYABLE(SearchBuffer);
67 public:
68 SearchBuffer(const String& target, FindOptions);
69 ~SearchBuffer();
70
71 // Returns number of characters appended; guaranteed to be in the range [1, length].
72 size_t append(const UChar*, size_t length);
73 bool needsMoreContext() const;
74 void prependContext(const UChar*, size_t length);
75 void reachedBreak();
76
77 // Result is the size in characters of what was found.
78 // And <startOffset> is the number of characters back to the start of what was found.
79 size_t search(size_t& startOffset);
80 bool atBreak() const;
81
82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
83
84 private:
85 bool isBadMatch(const UChar*, size_t length) const;
86 bool isWordStartMatch(size_t start, size_t length) const;
87
88 String m_target;
89 FindOptions m_options;
90
91 Vector<UChar> m_buffer;
92 size_t m_overlap;
93 size_t m_prefixLength;
94 bool m_atBreak;
95 bool m_needsMoreContext;
96
97 bool m_targetRequiresKanaWorkaround;
98 Vector<UChar> m_normalizedTarget;
99 mutable Vector<UChar> m_normalizedMatch;
100
101 #else
102
103 private:
104 void append(UChar, bool isCharacterStart);
105 size_t length() const;
106
107 String m_target;
108 FindOptions m_options;
109
110 Vector<UChar> m_buffer;
111 Vector<bool> m_isCharacterStartBuffer;
112 bool m_isBufferFull;
113 size_t m_cursor;
114
115 #endif
116 };
117
118 // --------
119
120 static const unsigned bitsInWord = sizeof(unsigned) * 8;
121 static const unsigned bitInWordMask = bitsInWord - 1;
122
BitStack()123 BitStack::BitStack()
124 : m_size(0)
125 {
126 }
127
~BitStack()128 BitStack::~BitStack()
129 {
130 }
131
push(bool bit)132 void BitStack::push(bool bit)
133 {
134 unsigned index = m_size / bitsInWord;
135 unsigned shift = m_size & bitInWordMask;
136 if (!shift && index == m_words.size()) {
137 m_words.grow(index + 1);
138 m_words[index] = 0;
139 }
140 unsigned& word = m_words[index];
141 unsigned mask = 1U << shift;
142 if (bit)
143 word |= mask;
144 else
145 word &= ~mask;
146 ++m_size;
147 }
148
pop()149 void BitStack::pop()
150 {
151 if (m_size)
152 --m_size;
153 }
154
top() const155 bool BitStack::top() const
156 {
157 if (!m_size)
158 return false;
159 unsigned shift = (m_size - 1) & bitInWordMask;
160 return m_words.last() & (1U << shift);
161 }
162
size() const163 unsigned BitStack::size() const
164 {
165 return m_size;
166 }
167
168 // --------
169
170 #if !ASSERT_DISABLED
171
depthCrossingShadowBoundaries(Node * node)172 static unsigned depthCrossingShadowBoundaries(Node* node)
173 {
174 unsigned depth = 0;
175 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
176 ++depth;
177 return depth;
178 }
179
180 #endif
181
182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
nextInPreOrderCrossingShadowBoundaries(Node * rangeEndContainer,int rangeEndOffset)183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
184 {
185 if (!rangeEndContainer)
186 return 0;
187 if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
188 if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
189 return next;
190 }
191 for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
192 if (Node* next = node->nextSibling())
193 return next;
194 }
195 return 0;
196 }
197
198 // --------
199
fullyClipsContents(Node * node)200 static inline bool fullyClipsContents(Node* node)
201 {
202 RenderObject* renderer = node->renderer();
203 if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
204 return false;
205 return toRenderBox(renderer)->size().isEmpty();
206 }
207
ignoresContainerClip(Node * node)208 static inline bool ignoresContainerClip(Node* node)
209 {
210 RenderObject* renderer = node->renderer();
211 if (!renderer || renderer->isText())
212 return false;
213 EPosition position = renderer->style()->position();
214 return position == AbsolutePosition || position == FixedPosition;
215 }
216
pushFullyClippedState(BitStack & stack,Node * node)217 static void pushFullyClippedState(BitStack& stack, Node* node)
218 {
219 ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
220
221 // Push true if this node full clips its contents, or if a parent already has fully
222 // clipped and this is not a node that ignores its container's clip.
223 stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
224 }
225
setUpFullyClippedStack(BitStack & stack,Node * node)226 static void setUpFullyClippedStack(BitStack& stack, Node* node)
227 {
228 // Put the nodes in a vector so we can iterate in reverse order.
229 Vector<Node*, 100> ancestry;
230 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
231 ancestry.append(parent);
232
233 // Call pushFullyClippedState on each node starting with the earliest ancestor.
234 size_t size = ancestry.size();
235 for (size_t i = 0; i < size; ++i)
236 pushFullyClippedState(stack, ancestry[size - i - 1]);
237 pushFullyClippedState(stack, node);
238
239 ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
240 }
241
242 #if OS(ANDROID)
checkFormControlElement(Node * startNode)243 static bool checkFormControlElement(Node* startNode)
244 {
245 Node* node = startNode;
246 while (node) {
247 if (node->isElementNode() && static_cast<Element*>(node)->isFormControlElement())
248 return true;
249 node = node->parentNode();
250 }
251 return false;
252 }
253 #endif
254
255
256 // --------
257
TextIterator()258 TextIterator::TextIterator()
259 : m_startContainer(0)
260 , m_startOffset(0)
261 , m_endContainer(0)
262 , m_endOffset(0)
263 , m_positionNode(0)
264 , m_textCharacters(0)
265 , m_textLength(0)
266 , m_remainingTextBox(0)
267 , m_firstLetterText(0)
268 , m_lastCharacter(0)
269 , m_emitsCharactersBetweenAllVisiblePositions(false)
270 , m_entersTextControls(false)
271 , m_emitsTextWithoutTranscoding(false)
272 , m_handledFirstLetter(false)
273 , m_ignoresStyleVisibility(false)
274 , m_emitsObjectReplacementCharacters(false)
275 #if OS(ANDROID)
276 , m_stopsOnFormControls(false)
277 , m_shouldStop(false)
278 #endif
279 {
280 }
281
TextIterator(const Range * r,TextIteratorBehavior behavior)282 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
283 : m_startContainer(0)
284 , m_startOffset(0)
285 , m_endContainer(0)
286 , m_endOffset(0)
287 , m_positionNode(0)
288 , m_textCharacters(0)
289 , m_textLength(0)
290 , m_remainingTextBox(0)
291 , m_firstLetterText(0)
292 , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
293 , m_entersTextControls(behavior & TextIteratorEntersTextControls)
294 , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
295 , m_handledFirstLetter(false)
296 , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
297 , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
298 #if OS(ANDROID)
299 , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
300 , m_shouldStop(false)
301 , m_needsAnotherNewline(false)
302 #endif
303 {
304 if (!r)
305 return;
306
307 // get and validate the range endpoints
308 Node* startContainer = r->startContainer();
309 if (!startContainer)
310 return;
311 int startOffset = r->startOffset();
312 Node* endContainer = r->endContainer();
313 int endOffset = r->endOffset();
314
315 // Callers should be handing us well-formed ranges. If we discover that this isn't
316 // the case, we could consider changing this assertion to an early return.
317 ASSERT(r->boundaryPointsValid());
318
319 // remember range - this does not change
320 m_startContainer = startContainer;
321 m_startOffset = startOffset;
322 m_endContainer = endContainer;
323 m_endOffset = endOffset;
324
325 // set up the current node for processing
326 m_node = r->firstNode();
327 if (!m_node)
328 return;
329 setUpFullyClippedStack(m_fullyClippedStack, m_node);
330 m_offset = m_node == m_startContainer ? m_startOffset : 0;
331 m_handledNode = false;
332 m_handledChildren = false;
333
334 // calculate first out of bounds node
335 m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
336
337 // initialize node processing state
338 m_needsAnotherNewline = false;
339 m_textBox = 0;
340
341 // initialize record of previous node processing
342 m_hasEmitted = false;
343 m_lastTextNode = 0;
344 m_lastTextNodeEndedWithCollapsedSpace = false;
345 m_lastCharacter = 0;
346
347 #ifndef NDEBUG
348 // need this just because of the assert in advance()
349 m_positionNode = m_node;
350 #endif
351
352 // identify the first run
353 advance();
354 }
355
~TextIterator()356 TextIterator::~TextIterator()
357 {
358 }
359
atEnd() const360 bool TextIterator::atEnd() const
361 {
362 #if OS(ANDROID)
363 return !m_positionNode || m_shouldStop;
364 #else
365 return !m_positionNode;
366 #endif
367 }
368
advance()369 void TextIterator::advance()
370 {
371 #if OS(ANDROID)
372 if (m_shouldStop)
373 return;
374 #endif
375 // reset the run information
376 m_positionNode = 0;
377 m_textLength = 0;
378
379 // handle remembered node that needed a newline after the text node's newline
380 if (m_needsAnotherNewline) {
381 // Emit the extra newline, and position it *inside* m_node, after m_node's
382 // contents, in case it's a block, in the same way that we position the first
383 // newline. The range for the emitted newline should start where the line
384 // break begins.
385 // FIXME: It would be cleaner if we emitted two newlines during the last
386 // iteration, instead of using m_needsAnotherNewline.
387 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
388 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
389 m_needsAnotherNewline = false;
390 return;
391 }
392
393 if (!m_textBox && m_remainingTextBox) {
394 m_textBox = m_remainingTextBox;
395 m_remainingTextBox = 0;
396 m_firstLetterText = 0;
397 m_offset = 0;
398 }
399 // handle remembered text box
400 if (m_textBox) {
401 handleTextBox();
402 if (m_positionNode)
403 return;
404 }
405
406 while (m_node && m_node != m_pastEndNode) {
407 #if OS(ANDROID)
408 if (!m_shouldStop && m_stopsOnFormControls && checkFormControlElement(m_node))
409 m_shouldStop = true;
410 #endif
411 // if the range ends at offset 0 of an element, represent the
412 // position, but not the content, of that element e.g. if the
413 // node is a blockflow element, emit a newline that
414 // precedes the element
415 if (m_node == m_endContainer && m_endOffset == 0) {
416 representNodeOffsetZero();
417 m_node = 0;
418 return;
419 }
420
421 RenderObject* renderer = m_node->renderer();
422 if (!renderer) {
423 m_handledNode = true;
424 m_handledChildren = true;
425 } else {
426 // handle current node according to its type
427 if (!m_handledNode) {
428 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
429 m_handledNode = handleTextNode();
430 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
431 (renderer->node() && renderer->node()->isElementNode() &&
432 static_cast<Element*>(renderer->node())->isFormControlElement())))
433 m_handledNode = handleReplacedElement();
434 else
435 m_handledNode = handleNonTextNode();
436 if (m_positionNode)
437 return;
438 }
439 }
440
441 // find a new current node to handle in depth-first manner,
442 // calling exitNode() as we come back thru a parent node
443 Node* next = m_handledChildren ? 0 : m_node->firstChild();
444 m_offset = 0;
445 if (!next) {
446 next = m_node->nextSibling();
447 if (!next) {
448 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
449 Node* parentNode = m_node->parentOrHostNode();
450 while (!next && parentNode) {
451 if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
452 return;
453 bool haveRenderer = m_node->renderer();
454 m_node = parentNode;
455 m_fullyClippedStack.pop();
456 parentNode = m_node->parentOrHostNode();
457 if (haveRenderer)
458 exitNode();
459 if (m_positionNode) {
460 m_handledNode = true;
461 m_handledChildren = true;
462 return;
463 }
464 next = m_node->nextSibling();
465 }
466 }
467 m_fullyClippedStack.pop();
468 }
469
470 // set the new current node
471 m_node = next;
472 if (m_node)
473 pushFullyClippedState(m_fullyClippedStack, m_node);
474 m_handledNode = false;
475 m_handledChildren = false;
476 m_handledFirstLetter = false;
477 m_firstLetterText = 0;
478
479 // how would this ever be?
480 if (m_positionNode)
481 return;
482 }
483 }
484
handleTextNode()485 bool TextIterator::handleTextNode()
486 {
487 if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
488 return false;
489
490 RenderText* renderer = toRenderText(m_node->renderer());
491
492 m_lastTextNode = m_node;
493 String str = renderer->text();
494
495 // handle pre-formatted text
496 if (!renderer->style()->collapseWhiteSpace()) {
497 int runStart = m_offset;
498 if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
499 emitCharacter(' ', m_node, 0, runStart, runStart);
500 return false;
501 }
502 if (!m_handledFirstLetter && renderer->isTextFragment()) {
503 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
504 if (m_firstLetterText) {
505 String firstLetter = m_firstLetterText->text();
506 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
507 m_firstLetterText = 0;
508 m_textBox = 0;
509 return false;
510 }
511 }
512 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
513 return false;
514 int strLength = str.length();
515 int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
516 int runEnd = min(strLength, end);
517
518 if (runStart >= runEnd)
519 return true;
520
521 emitText(m_node, runStart, runEnd);
522 return true;
523 }
524
525 if (!renderer->firstTextBox() && str.length() > 0) {
526 if (!m_handledFirstLetter && renderer->isTextFragment()) {
527 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
528 if (m_firstLetterText) {
529 handleTextBox();
530 return false;
531 }
532 }
533 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
534 return false;
535 m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
536 return true;
537 }
538
539 // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
540 if (renderer->containsReversedText()) {
541 m_sortedTextBoxes.clear();
542 for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
543 m_sortedTextBoxes.append(textBox);
544 }
545 std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
546 m_sortedTextBoxesPosition = 0;
547 }
548
549 m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
550 if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
551 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
552 handleTextBox();
553 return true;
554 }
555
handleTextBox()556 void TextIterator::handleTextBox()
557 {
558 RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
559 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
560 m_textBox = 0;
561 return;
562 }
563 String str = renderer->text();
564 unsigned start = m_offset;
565 unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
566 while (m_textBox) {
567 unsigned textBoxStart = m_textBox->start();
568 unsigned runStart = max(textBoxStart, start);
569
570 // Check for collapsed space at the start of this run.
571 InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
572 bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
573 || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
574 if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
575 if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
576 unsigned spaceRunStart = runStart - 1;
577 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
578 --spaceRunStart;
579 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
580 } else
581 emitCharacter(' ', m_node, 0, runStart, runStart);
582 return;
583 }
584 unsigned textBoxEnd = textBoxStart + m_textBox->len();
585 unsigned runEnd = min(textBoxEnd, end);
586
587 // Determine what the next text box will be, but don't advance yet
588 InlineTextBox* nextTextBox = 0;
589 if (renderer->containsReversedText()) {
590 if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
591 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
592 } else
593 nextTextBox = m_textBox->nextTextBox();
594
595 if (runStart < runEnd) {
596 // Handle either a single newline character (which becomes a space),
597 // or a run of characters that does not include a newline.
598 // This effectively translates newlines to spaces without copying the text.
599 if (str[runStart] == '\n') {
600 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
601 m_offset = runStart + 1;
602 } else {
603 size_t subrunEnd = str.find('\n', runStart);
604 if (subrunEnd == notFound || subrunEnd > runEnd)
605 subrunEnd = runEnd;
606
607 m_offset = subrunEnd;
608 emitText(m_node, renderer, runStart, subrunEnd);
609 }
610
611 // If we are doing a subrun that doesn't go to the end of the text box,
612 // come back again to finish handling this text box; don't advance to the next one.
613 if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
614 return;
615
616 // Advance and return
617 unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
618 if (nextRunStart > runEnd)
619 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
620 m_textBox = nextTextBox;
621 if (renderer->containsReversedText())
622 ++m_sortedTextBoxesPosition;
623 return;
624 }
625 // Advance and continue
626 m_textBox = nextTextBox;
627 if (renderer->containsReversedText())
628 ++m_sortedTextBoxesPosition;
629 }
630 if (!m_textBox && m_remainingTextBox) {
631 m_textBox = m_remainingTextBox;
632 m_remainingTextBox = 0;
633 m_firstLetterText = 0;
634 m_offset = 0;
635 handleTextBox();
636 }
637 }
638
handleTextNodeFirstLetter(RenderTextFragment * renderer)639 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
640 {
641 if (renderer->firstLetter()) {
642 RenderObject* r = renderer->firstLetter();
643 if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
644 return;
645 for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) {
646 if (currChild->isText()) {
647 RenderText* firstLetter = toRenderText(currChild);
648 m_handledFirstLetter = true;
649 m_remainingTextBox = m_textBox;
650 m_textBox = firstLetter->firstTextBox();
651 m_firstLetterText = firstLetter;
652 return;
653 }
654 }
655 }
656 m_handledFirstLetter = true;
657 }
658
handleReplacedElement()659 bool TextIterator::handleReplacedElement()
660 {
661 if (m_fullyClippedStack.top())
662 return false;
663
664 RenderObject* renderer = m_node->renderer();
665 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
666 return false;
667
668 if (m_lastTextNodeEndedWithCollapsedSpace) {
669 emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
670 return false;
671 }
672
673 if (m_entersTextControls && renderer->isTextControl()) {
674 if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
675 m_node = innerTextElement->shadowTreeRootNode();
676 pushFullyClippedState(m_fullyClippedStack, m_node);
677 m_offset = 0;
678 return false;
679 }
680 }
681
682 m_hasEmitted = true;
683
684 if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
685 emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
686 return true;
687 }
688
689 if (m_emitsCharactersBetweenAllVisiblePositions) {
690 // We want replaced elements to behave like punctuation for boundary
691 // finding, and to simply take up space for the selection preservation
692 // code in moveParagraphs, so we use a comma.
693 emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
694 return true;
695 }
696
697 m_positionNode = m_node->parentNode();
698 m_positionOffsetBaseNode = m_node;
699 m_positionStartOffset = 0;
700 m_positionEndOffset = 1;
701
702 m_textCharacters = 0;
703 m_textLength = 0;
704
705 m_lastCharacter = 0;
706
707 return true;
708 }
709
hasVisibleTextNode(RenderText * renderer)710 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
711 {
712 if (renderer->style()->visibility() == VISIBLE)
713 return true;
714 if (renderer->isTextFragment()) {
715 RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
716 if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
717 return true;
718 }
719 return false;
720 }
721
shouldEmitTabBeforeNode(Node * node)722 static bool shouldEmitTabBeforeNode(Node* node)
723 {
724 RenderObject* r = node->renderer();
725
726 // Table cells are delimited by tabs.
727 if (!r || !isTableCell(node))
728 return false;
729
730 // Want a tab before every cell other than the first one
731 RenderTableCell* rc = toRenderTableCell(r);
732 RenderTable* t = rc->table();
733 return t && (t->cellBefore(rc) || t->cellAbove(rc));
734 }
735
shouldEmitNewlineForNode(Node * node)736 static bool shouldEmitNewlineForNode(Node* node)
737 {
738 // br elements are represented by a single newline.
739 RenderObject* r = node->renderer();
740 if (!r)
741 return node->hasTagName(brTag);
742
743 return r->isBR();
744 }
745
shouldEmitNewlinesBeforeAndAfterNode(Node * node)746 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
747 {
748 // Block flow (versus inline flow) is represented by having
749 // a newline both before and after the element.
750 RenderObject* r = node->renderer();
751 if (!r) {
752 return (node->hasTagName(blockquoteTag)
753 || node->hasTagName(ddTag)
754 || node->hasTagName(divTag)
755 || node->hasTagName(dlTag)
756 || node->hasTagName(dtTag)
757 || node->hasTagName(h1Tag)
758 || node->hasTagName(h2Tag)
759 || node->hasTagName(h3Tag)
760 || node->hasTagName(h4Tag)
761 || node->hasTagName(h5Tag)
762 || node->hasTagName(h6Tag)
763 || node->hasTagName(hrTag)
764 || node->hasTagName(liTag)
765 || node->hasTagName(listingTag)
766 || node->hasTagName(olTag)
767 || node->hasTagName(pTag)
768 || node->hasTagName(preTag)
769 || node->hasTagName(trTag)
770 || node->hasTagName(ulTag));
771 }
772
773 // Need to make an exception for table cells, because they are blocks, but we
774 // want them tab-delimited rather than having newlines before and after.
775 if (isTableCell(node))
776 return false;
777
778 // Need to make an exception for table row elements, because they are neither
779 // "inline" or "RenderBlock", but we want newlines for them.
780 if (r->isTableRow()) {
781 RenderTable* t = toRenderTableRow(r)->table();
782 if (t && !t->isInline())
783 return true;
784 }
785
786 return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
787 }
788
shouldEmitNewlineAfterNode(Node * node)789 static bool shouldEmitNewlineAfterNode(Node* node)
790 {
791 // FIXME: It should be better but slower to create a VisiblePosition here.
792 if (!shouldEmitNewlinesBeforeAndAfterNode(node))
793 return false;
794 // Check if this is the very last renderer in the document.
795 // If so, then we should not emit a newline.
796 while ((node = node->traverseNextSibling()))
797 if (node->renderer())
798 return true;
799 return false;
800 }
801
shouldEmitNewlineBeforeNode(Node * node)802 static bool shouldEmitNewlineBeforeNode(Node* node)
803 {
804 return shouldEmitNewlinesBeforeAndAfterNode(node);
805 }
806
shouldEmitExtraNewlineForNode(Node * node)807 static bool shouldEmitExtraNewlineForNode(Node* node)
808 {
809 // When there is a significant collapsed bottom margin, emit an extra
810 // newline for a more realistic result. We end up getting the right
811 // result even without margin collapsing. For example: <div><p>text</p></div>
812 // will work right even if both the <div> and the <p> have bottom margins.
813 RenderObject* r = node->renderer();
814 if (!r || !r->isBox())
815 return false;
816
817 // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
818 // not to do this at all
819 if (node->hasTagName(h1Tag)
820 || node->hasTagName(h2Tag)
821 || node->hasTagName(h3Tag)
822 || node->hasTagName(h4Tag)
823 || node->hasTagName(h5Tag)
824 || node->hasTagName(h6Tag)
825 || node->hasTagName(pTag)) {
826 RenderStyle* style = r->style();
827 if (style) {
828 int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
829 int fontSize = style->fontDescription().computedPixelSize();
830 if (bottomMargin * 2 >= fontSize)
831 return true;
832 }
833 }
834
835 return false;
836 }
837
collapsedSpaceLength(RenderText * renderer,int textEnd)838 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
839 {
840 const UChar* characters = renderer->text()->characters();
841 int length = renderer->text()->length();
842 for (int i = textEnd; i < length; ++i) {
843 if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
844 return i - textEnd;
845 }
846
847 return length - textEnd;
848 }
849
maxOffsetIncludingCollapsedSpaces(Node * node)850 static int maxOffsetIncludingCollapsedSpaces(Node* node)
851 {
852 int offset = caretMaxOffset(node);
853
854 if (node->renderer() && node->renderer()->isText())
855 offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
856
857 return offset;
858 }
859
860 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
shouldRepresentNodeOffsetZero()861 bool TextIterator::shouldRepresentNodeOffsetZero()
862 {
863 if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
864 return true;
865
866 // Leave element positioned flush with start of a paragraph
867 // (e.g. do not insert tab before a table cell at the start of a paragraph)
868 if (m_lastCharacter == '\n')
869 return false;
870
871 // Otherwise, show the position if we have emitted any characters
872 if (m_hasEmitted)
873 return true;
874
875 // We've not emitted anything yet. Generally, there is no need for any positioning then.
876 // The only exception is when the element is visually not in the same line as
877 // the start of the range (e.g. the range starts at the end of the previous paragraph).
878 // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
879 // make quicker checks to possibly avoid that. Another check that we could make is
880 // is whether the inline vs block flow changed since the previous visible element.
881 // I think we're already in a special enough case that that won't be needed, tho.
882
883 // No character needed if this is the first node in the range.
884 if (m_node == m_startContainer)
885 return false;
886
887 // If we are outside the start container's subtree, assume we need to emit.
888 // FIXME: m_startContainer could be an inline block
889 if (!m_node->isDescendantOf(m_startContainer))
890 return true;
891
892 // If we started as m_startContainer offset 0 and the current node is a descendant of
893 // the start container, we already had enough context to correctly decide whether to
894 // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
895 // so don't second guess that now.
896 // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
897 // immaterial since we likely would have already emitted something by now.
898 if (m_startOffset == 0)
899 return false;
900
901 // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
902 // Additionally, if the range we are iterating over contains huge sections of unrendered content,
903 // we would create VisiblePositions on every call to this function without this check.
904 if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
905 return false;
906
907 // The startPos.isNotNull() check is needed because the start could be before the body,
908 // and in that case we'll get null. We don't want to put in newlines at the start in that case.
909 // The currPos.isNotNull() check is needed because positions in non-HTML content
910 // (like SVG) do not have visible positions, and we don't want to emit for them either.
911 VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
912 VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
913 return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
914 }
915
shouldEmitSpaceBeforeAndAfterNode(Node * node)916 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
917 {
918 return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
919 }
920
representNodeOffsetZero()921 void TextIterator::representNodeOffsetZero()
922 {
923 // Emit a character to show the positioning of m_node.
924
925 // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
926 // create VisiblePositions, which is expensive. So, we perform the inexpensive checks
927 // on m_node to see if it necessitates emitting a character first and will early return
928 // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
929 if (shouldEmitTabBeforeNode(m_node)) {
930 if (shouldRepresentNodeOffsetZero())
931 emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
932 } else if (shouldEmitNewlineBeforeNode(m_node)) {
933 if (shouldRepresentNodeOffsetZero())
934 emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
935 } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
936 if (shouldRepresentNodeOffsetZero())
937 emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
938 }
939 }
940
handleNonTextNode()941 bool TextIterator::handleNonTextNode()
942 {
943 if (shouldEmitNewlineForNode(m_node))
944 emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
945 else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
946 emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
947 else
948 representNodeOffsetZero();
949
950 return true;
951 }
952
exitNode()953 void TextIterator::exitNode()
954 {
955 // prevent emitting a newline when exiting a collapsed block at beginning of the range
956 // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
957 // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
958 // therefore look like a blank line.
959 if (!m_hasEmitted)
960 return;
961
962 // Emit with a position *inside* m_node, after m_node's contents, in
963 // case it is a block, because the run should start where the
964 // emitted character is positioned visually.
965 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
966 // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
967 // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use
968 // TextIterator in _web_attributedStringFromRange.
969 // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
970 if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
971 // use extra newline to represent margin bottom, as needed
972 bool addNewline = shouldEmitExtraNewlineForNode(m_node);
973
974 // FIXME: We need to emit a '\n' as we leave an empty block(s) that
975 // contain a VisiblePosition when doing selection preservation.
976 if (m_lastCharacter != '\n') {
977 // insert a newline with a position following this block's contents.
978 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
979 // remember whether to later add a newline for the current node
980 ASSERT(!m_needsAnotherNewline);
981 m_needsAnotherNewline = addNewline;
982 } else if (addNewline)
983 // insert a newline with a position following this block's contents.
984 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
985 }
986
987 // If nothing was emitted, see if we need to emit a space.
988 if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
989 emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
990 }
991
emitCharacter(UChar c,Node * textNode,Node * offsetBaseNode,int textStartOffset,int textEndOffset)992 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
993 {
994 m_hasEmitted = true;
995
996 // remember information with which to construct the TextIterator::range()
997 // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
998 m_positionNode = textNode;
999 m_positionOffsetBaseNode = offsetBaseNode;
1000 m_positionStartOffset = textStartOffset;
1001 m_positionEndOffset = textEndOffset;
1002
1003 // remember information with which to construct the TextIterator::characters() and length()
1004 m_singleCharacterBuffer = c;
1005 m_textCharacters = &m_singleCharacterBuffer;
1006 m_textLength = 1;
1007
1008 // remember some iteration state
1009 m_lastTextNodeEndedWithCollapsedSpace = false;
1010 m_lastCharacter = c;
1011 }
1012
emitText(Node * textNode,RenderObject * renderObject,int textStartOffset,int textEndOffset)1013 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
1014 {
1015 RenderText* renderer = toRenderText(renderObject);
1016 m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
1017 ASSERT(m_text.characters());
1018
1019 m_positionNode = textNode;
1020 m_positionOffsetBaseNode = 0;
1021 m_positionStartOffset = textStartOffset;
1022 m_positionEndOffset = textEndOffset;
1023 m_textCharacters = m_text.characters() + textStartOffset;
1024 m_textLength = textEndOffset - textStartOffset;
1025 m_lastCharacter = m_text[textEndOffset - 1];
1026
1027 m_lastTextNodeEndedWithCollapsedSpace = false;
1028 m_hasEmitted = true;
1029 }
1030
emitText(Node * textNode,int textStartOffset,int textEndOffset)1031 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
1032 {
1033 emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
1034 }
1035
range() const1036 PassRefPtr<Range> TextIterator::range() const
1037 {
1038 // use the current run information, if we have it
1039 if (m_positionNode) {
1040 if (m_positionOffsetBaseNode) {
1041 int index = m_positionOffsetBaseNode->nodeIndex();
1042 m_positionStartOffset += index;
1043 m_positionEndOffset += index;
1044 m_positionOffsetBaseNode = 0;
1045 }
1046 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1047 }
1048
1049 // otherwise, return the end of the overall range we were given
1050 if (m_endContainer)
1051 return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
1052
1053 return 0;
1054 }
1055
node() const1056 Node* TextIterator::node() const
1057 {
1058 RefPtr<Range> textRange = range();
1059 if (!textRange)
1060 return 0;
1061
1062 Node* node = textRange->startContainer();
1063 if (!node)
1064 return 0;
1065 if (node->offsetInCharacters())
1066 return node;
1067
1068 return node->childNode(textRange->startOffset());
1069 }
1070
1071 // --------
1072
SimplifiedBackwardsTextIterator()1073 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
1074 : m_behavior(TextIteratorDefaultBehavior)
1075 , m_node(0)
1076 , m_positionNode(0)
1077 #if OS(ANDROID)
1078 , m_stopsOnFormControls(false)
1079 , m_shouldStop(false)
1080 #endif
1081 {
1082 }
1083
SimplifiedBackwardsTextIterator(const Range * r,TextIteratorBehavior behavior)1084 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
1085 : m_behavior(behavior)
1086 , m_node(0)
1087 , m_positionNode(0)
1088 #if OS(ANDROID)
1089 , m_stopsOnFormControls(behavior & TextIteratorStopsOnFormControls)
1090 , m_shouldStop(false)
1091 #endif
1092 {
1093 #if OS(ANDROID)
1094 ASSERT(m_behavior == TextIteratorDefaultBehavior || m_behavior == TextIteratorStopsOnFormControls);
1095 #else
1096 ASSERT(m_behavior == TextIteratorDefaultBehavior);
1097 #endif
1098
1099 if (!r)
1100 return;
1101
1102 Node* startNode = r->startContainer();
1103 if (!startNode)
1104 return;
1105 Node* endNode = r->endContainer();
1106 int startOffset = r->startOffset();
1107 int endOffset = r->endOffset();
1108
1109 if (!startNode->offsetInCharacters()) {
1110 if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
1111 startNode = startNode->childNode(startOffset);
1112 startOffset = 0;
1113 }
1114 }
1115 if (!endNode->offsetInCharacters()) {
1116 if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
1117 endNode = endNode->childNode(endOffset - 1);
1118 endOffset = lastOffsetInNode(endNode);
1119 }
1120 }
1121
1122 m_node = endNode;
1123 setUpFullyClippedStack(m_fullyClippedStack, m_node);
1124 m_offset = endOffset;
1125 m_handledNode = false;
1126 m_handledChildren = endOffset == 0;
1127
1128 m_startNode = startNode;
1129 m_startOffset = startOffset;
1130 m_endNode = endNode;
1131 m_endOffset = endOffset;
1132
1133 #ifndef NDEBUG
1134 // Need this just because of the assert.
1135 m_positionNode = endNode;
1136 #endif
1137
1138 m_lastTextNode = 0;
1139 m_lastCharacter = '\n';
1140
1141 m_havePassedStartNode = false;
1142
1143 advance();
1144 }
1145
atEnd() const1146 bool SimplifiedBackwardsTextIterator::atEnd() const
1147 {
1148 #if OS(ANDROID)
1149 return !m_positionNode || m_shouldStop;
1150 #else
1151 return !m_positionNode;
1152 #endif
1153 }
1154
advance()1155 void SimplifiedBackwardsTextIterator::advance()
1156 {
1157 ASSERT(m_positionNode);
1158
1159 #if OS(ANDROID)
1160 if (m_shouldStop)
1161 return;
1162
1163 // Prevent changing the iterator position if a form control element was found and advance should stop on it.
1164 if (m_stopsOnFormControls && checkFormControlElement(m_node)) {
1165 m_shouldStop = true;
1166 return;
1167 }
1168 #endif
1169
1170 m_positionNode = 0;
1171 m_textLength = 0;
1172
1173 while (m_node && !m_havePassedStartNode) {
1174 // Don't handle node if we start iterating at [node, 0].
1175 if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
1176 RenderObject* renderer = m_node->renderer();
1177 if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
1178 // FIXME: What about CDATA_SECTION_NODE?
1179 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1180 m_handledNode = handleTextNode();
1181 } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1182 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1183 m_handledNode = handleReplacedElement();
1184 } else
1185 m_handledNode = handleNonTextNode();
1186 if (m_positionNode)
1187 return;
1188 }
1189
1190 if (!m_handledChildren && m_node->hasChildNodes()) {
1191 m_node = m_node->lastChild();
1192 pushFullyClippedState(m_fullyClippedStack, m_node);
1193 } else {
1194 // Exit empty containers as we pass over them or containers
1195 // where [container, 0] is where we started iterating.
1196 if (!m_handledNode
1197 && canHaveChildrenForEditing(m_node)
1198 && m_node->parentNode()
1199 && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
1200 exitNode();
1201 if (m_positionNode) {
1202 m_handledNode = true;
1203 m_handledChildren = true;
1204 return;
1205 }
1206 }
1207
1208 // Exit all other containers.
1209 while (!m_node->previousSibling()) {
1210 if (!advanceRespectingRange(m_node->parentOrHostNode()))
1211 break;
1212 m_fullyClippedStack.pop();
1213 exitNode();
1214 if (m_positionNode) {
1215 m_handledNode = true;
1216 m_handledChildren = true;
1217 return;
1218 }
1219 }
1220
1221 m_fullyClippedStack.pop();
1222 if (advanceRespectingRange(m_node->previousSibling()))
1223 pushFullyClippedState(m_fullyClippedStack, m_node);
1224 else
1225 m_node = 0;
1226 }
1227
1228 // For the purpose of word boundary detection,
1229 // we should iterate all visible text and trailing (collapsed) whitespaces.
1230 m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
1231 m_handledNode = false;
1232 m_handledChildren = false;
1233
1234 if (m_positionNode)
1235 return;
1236 }
1237 }
1238
handleTextNode()1239 bool SimplifiedBackwardsTextIterator::handleTextNode()
1240 {
1241 m_lastTextNode = m_node;
1242
1243 RenderText* renderer = toRenderText(m_node->renderer());
1244 String str = renderer->text();
1245
1246 if (!renderer->firstTextBox() && str.length() > 0)
1247 return true;
1248
1249 m_positionEndOffset = m_offset;
1250
1251 m_offset = (m_node == m_startNode) ? m_startOffset : 0;
1252 m_positionNode = m_node;
1253 m_positionStartOffset = m_offset;
1254 m_textLength = m_positionEndOffset - m_positionStartOffset;
1255 m_textCharacters = str.characters() + m_positionStartOffset;
1256
1257 m_lastCharacter = str[m_positionEndOffset - 1];
1258
1259 return true;
1260 }
1261
handleReplacedElement()1262 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1263 {
1264 unsigned index = m_node->nodeIndex();
1265 // We want replaced elements to behave like punctuation for boundary
1266 // finding, and to simply take up space for the selection preservation
1267 // code in moveParagraphs, so we use a comma. Unconditionally emit
1268 // here because this iterator is only used for boundary finding.
1269 emitCharacter(',', m_node->parentNode(), index, index + 1);
1270 return true;
1271 }
1272
handleNonTextNode()1273 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1274 {
1275 // We can use a linefeed in place of a tab because this simple iterator is only used to
1276 // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs.
1277 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1278 unsigned index = m_node->nodeIndex();
1279 // The start of this emitted range is wrong. Ensuring correctness would require
1280 // VisiblePositions and so would be slow. previousBoundary expects this.
1281 emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
1282 }
1283 return true;
1284 }
1285
exitNode()1286 void SimplifiedBackwardsTextIterator::exitNode()
1287 {
1288 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1289 // The start of this emitted range is wrong. Ensuring correctness would require
1290 // VisiblePositions and so would be slow. previousBoundary expects this.
1291 emitCharacter('\n', m_node, 0, 0);
1292 }
1293 }
1294
emitCharacter(UChar c,Node * node,int startOffset,int endOffset)1295 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
1296 {
1297 m_singleCharacterBuffer = c;
1298 m_positionNode = node;
1299 m_positionStartOffset = startOffset;
1300 m_positionEndOffset = endOffset;
1301 m_textCharacters = &m_singleCharacterBuffer;
1302 m_textLength = 1;
1303 m_lastCharacter = c;
1304 }
1305
advanceRespectingRange(Node * next)1306 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1307 {
1308 if (!next)
1309 return false;
1310 m_havePassedStartNode |= m_node == m_startNode;
1311 if (m_havePassedStartNode)
1312 return false;
1313 m_node = next;
1314 return true;
1315 }
1316
range() const1317 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
1318 {
1319 if (m_positionNode)
1320 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1321
1322 return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
1323 }
1324
1325 // --------
1326
CharacterIterator()1327 CharacterIterator::CharacterIterator()
1328 : m_offset(0)
1329 , m_runOffset(0)
1330 , m_atBreak(true)
1331 {
1332 }
1333
CharacterIterator(const Range * r,TextIteratorBehavior behavior)1334 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
1335 : m_offset(0)
1336 , m_runOffset(0)
1337 , m_atBreak(true)
1338 , m_textIterator(r, behavior)
1339 {
1340 while (!atEnd() && m_textIterator.length() == 0)
1341 m_textIterator.advance();
1342 }
1343
range() const1344 PassRefPtr<Range> CharacterIterator::range() const
1345 {
1346 RefPtr<Range> r = m_textIterator.range();
1347 if (!m_textIterator.atEnd()) {
1348 if (m_textIterator.length() <= 1) {
1349 ASSERT(m_runOffset == 0);
1350 } else {
1351 Node* n = r->startContainer();
1352 ASSERT(n == r->endContainer());
1353 int offset = r->startOffset() + m_runOffset;
1354 ExceptionCode ec = 0;
1355 r->setStart(n, offset, ec);
1356 r->setEnd(n, offset + 1, ec);
1357 ASSERT(!ec);
1358 }
1359 }
1360 return r.release();
1361 }
1362
advance(int count)1363 void CharacterIterator::advance(int count)
1364 {
1365 if (count <= 0) {
1366 ASSERT(count == 0);
1367 return;
1368 }
1369
1370 m_atBreak = false;
1371
1372 // easy if there is enough left in the current m_textIterator run
1373 int remaining = m_textIterator.length() - m_runOffset;
1374 if (count < remaining) {
1375 m_runOffset += count;
1376 m_offset += count;
1377 return;
1378 }
1379
1380 // exhaust the current m_textIterator run
1381 count -= remaining;
1382 m_offset += remaining;
1383
1384 // move to a subsequent m_textIterator run
1385 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1386 int runLength = m_textIterator.length();
1387 if (runLength == 0)
1388 m_atBreak = true;
1389 else {
1390 // see whether this is m_textIterator to use
1391 if (count < runLength) {
1392 m_runOffset = count;
1393 m_offset += count;
1394 return;
1395 }
1396
1397 // exhaust this m_textIterator run
1398 count -= runLength;
1399 m_offset += runLength;
1400 }
1401 }
1402
1403 // ran to the end of the m_textIterator... no more runs left
1404 m_atBreak = true;
1405 m_runOffset = 0;
1406 }
1407
string(int numChars)1408 String CharacterIterator::string(int numChars)
1409 {
1410 Vector<UChar> result;
1411 result.reserveInitialCapacity(numChars);
1412 while (numChars > 0 && !atEnd()) {
1413 int runSize = min(numChars, length());
1414 result.append(characters(), runSize);
1415 numChars -= runSize;
1416 advance(runSize);
1417 }
1418 return String::adopt(result);
1419 }
1420
characterSubrange(CharacterIterator & it,int offset,int length)1421 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1422 {
1423 it.advance(offset);
1424 RefPtr<Range> start = it.range();
1425
1426 if (length > 1)
1427 it.advance(length - 1);
1428 RefPtr<Range> end = it.range();
1429
1430 return Range::create(start->startContainer()->document(),
1431 start->startContainer(), start->startOffset(),
1432 end->endContainer(), end->endOffset());
1433 }
1434
BackwardsCharacterIterator()1435 BackwardsCharacterIterator::BackwardsCharacterIterator()
1436 : m_offset(0)
1437 , m_runOffset(0)
1438 , m_atBreak(true)
1439 {
1440 }
1441
BackwardsCharacterIterator(const Range * range,TextIteratorBehavior behavior)1442 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
1443 : m_offset(0)
1444 , m_runOffset(0)
1445 , m_atBreak(true)
1446 , m_textIterator(range, behavior)
1447 {
1448 while (!atEnd() && !m_textIterator.length())
1449 m_textIterator.advance();
1450 }
1451
range() const1452 PassRefPtr<Range> BackwardsCharacterIterator::range() const
1453 {
1454 RefPtr<Range> r = m_textIterator.range();
1455 if (!m_textIterator.atEnd()) {
1456 if (m_textIterator.length() <= 1)
1457 ASSERT(m_runOffset == 0);
1458 else {
1459 Node* n = r->startContainer();
1460 ASSERT(n == r->endContainer());
1461 int offset = r->endOffset() - m_runOffset;
1462 ExceptionCode ec = 0;
1463 r->setStart(n, offset - 1, ec);
1464 r->setEnd(n, offset, ec);
1465 ASSERT(!ec);
1466 }
1467 }
1468 return r.release();
1469 }
1470
advance(int count)1471 void BackwardsCharacterIterator::advance(int count)
1472 {
1473 if (count <= 0) {
1474 ASSERT(!count);
1475 return;
1476 }
1477
1478 m_atBreak = false;
1479
1480 int remaining = m_textIterator.length() - m_runOffset;
1481 if (count < remaining) {
1482 m_runOffset += count;
1483 m_offset += count;
1484 return;
1485 }
1486
1487 count -= remaining;
1488 m_offset += remaining;
1489
1490 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1491 int runLength = m_textIterator.length();
1492 if (runLength == 0)
1493 m_atBreak = true;
1494 else {
1495 if (count < runLength) {
1496 m_runOffset = count;
1497 m_offset += count;
1498 return;
1499 }
1500
1501 count -= runLength;
1502 m_offset += runLength;
1503 }
1504 }
1505
1506 m_atBreak = true;
1507 m_runOffset = 0;
1508 }
1509
1510 // --------
1511
WordAwareIterator()1512 WordAwareIterator::WordAwareIterator()
1513 : m_previousText(0)
1514 , m_didLookAhead(false)
1515 {
1516 }
1517
WordAwareIterator(const Range * r)1518 WordAwareIterator::WordAwareIterator(const Range* r)
1519 : m_previousText(0)
1520 , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1521 , m_textIterator(r)
1522 {
1523 advance(); // get in position over the first chunk of text
1524 }
1525
~WordAwareIterator()1526 WordAwareIterator::~WordAwareIterator()
1527 {
1528 }
1529
1530 // We're always in one of these modes:
1531 // - The current chunk in the text iterator is our current chunk
1532 // (typically its a piece of whitespace, or text that ended with whitespace)
1533 // - The previous chunk in the text iterator is our current chunk
1534 // (we looked ahead to the next chunk and found a word boundary)
1535 // - We built up our own chunk of text from many chunks from the text iterator
1536
1537 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1538
advance()1539 void WordAwareIterator::advance()
1540 {
1541 m_previousText = 0;
1542 m_buffer.clear(); // toss any old buffer we built up
1543
1544 // If last time we did a look-ahead, start with that looked-ahead chunk now
1545 if (!m_didLookAhead) {
1546 ASSERT(!m_textIterator.atEnd());
1547 m_textIterator.advance();
1548 }
1549 m_didLookAhead = false;
1550
1551 // Go to next non-empty chunk
1552 while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1553 m_textIterator.advance();
1554 m_range = m_textIterator.range();
1555
1556 if (m_textIterator.atEnd())
1557 return;
1558
1559 while (1) {
1560 // If this chunk ends in whitespace we can just use it as our chunk.
1561 if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1562 return;
1563
1564 // If this is the first chunk that failed, save it in previousText before look ahead
1565 if (m_buffer.isEmpty()) {
1566 m_previousText = m_textIterator.characters();
1567 m_previousLength = m_textIterator.length();
1568 }
1569
1570 // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff
1571 m_textIterator.advance();
1572 if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1573 m_didLookAhead = true;
1574 return;
1575 }
1576
1577 if (m_buffer.isEmpty()) {
1578 // Start gobbling chunks until we get to a suitable stopping point
1579 m_buffer.append(m_previousText, m_previousLength);
1580 m_previousText = 0;
1581 }
1582 m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1583 int exception = 0;
1584 m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1585 }
1586 }
1587
length() const1588 int WordAwareIterator::length() const
1589 {
1590 if (!m_buffer.isEmpty())
1591 return m_buffer.size();
1592 if (m_previousText)
1593 return m_previousLength;
1594 return m_textIterator.length();
1595 }
1596
characters() const1597 const UChar* WordAwareIterator::characters() const
1598 {
1599 if (!m_buffer.isEmpty())
1600 return m_buffer.data();
1601 if (m_previousText)
1602 return m_previousText;
1603 return m_textIterator.characters();
1604 }
1605
1606 // --------
1607
foldQuoteMarkOrSoftHyphen(UChar c)1608 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
1609 {
1610 switch (c) {
1611 case hebrewPunctuationGershayim:
1612 case leftDoubleQuotationMark:
1613 case rightDoubleQuotationMark:
1614 return '"';
1615 case hebrewPunctuationGeresh:
1616 case leftSingleQuotationMark:
1617 case rightSingleQuotationMark:
1618 return '\'';
1619 case softHyphen:
1620 // Replace soft hyphen with an ignorable character so that their presence or absence will
1621 // not affect string comparison.
1622 return 0;
1623 default:
1624 return c;
1625 }
1626 }
1627
foldQuoteMarksAndSoftHyphens(String & s)1628 static inline void foldQuoteMarksAndSoftHyphens(String& s)
1629 {
1630 s.replace(hebrewPunctuationGeresh, '\'');
1631 s.replace(hebrewPunctuationGershayim, '"');
1632 s.replace(leftDoubleQuotationMark, '"');
1633 s.replace(leftSingleQuotationMark, '\'');
1634 s.replace(rightDoubleQuotationMark, '"');
1635 s.replace(rightSingleQuotationMark, '\'');
1636 // Replace soft hyphen with an ignorable character so that their presence or absence will
1637 // not affect string comparison.
1638 s.replace(softHyphen, 0);
1639 }
1640
1641 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1642
foldQuoteMarksAndSoftHyphens(UChar * data,size_t length)1643 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
1644 {
1645 for (size_t i = 0; i < length; ++i)
1646 data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
1647 }
1648
1649 static const size_t minimumSearchBufferSize = 8192;
1650
1651 #ifndef NDEBUG
1652 static bool searcherInUse;
1653 #endif
1654
createSearcher()1655 static UStringSearch* createSearcher()
1656 {
1657 // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1658 // but it doesn't matter exactly what it is, since we don't perform any searches
1659 // without setting both the pattern and the text.
1660 UErrorCode status = U_ZERO_ERROR;
1661 UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
1662 ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1663 return searcher;
1664 }
1665
searcher()1666 static UStringSearch* searcher()
1667 {
1668 static UStringSearch* searcher = createSearcher();
1669 return searcher;
1670 }
1671
lockSearcher()1672 static inline void lockSearcher()
1673 {
1674 #ifndef NDEBUG
1675 ASSERT(!searcherInUse);
1676 searcherInUse = true;
1677 #endif
1678 }
1679
unlockSearcher()1680 static inline void unlockSearcher()
1681 {
1682 #ifndef NDEBUG
1683 ASSERT(searcherInUse);
1684 searcherInUse = false;
1685 #endif
1686 }
1687
1688 // ICU's search ignores the distinction between small kana letters and ones
1689 // that are not small, and also characters that differ only in the voicing
1690 // marks when considering only primary collation strength diffrences.
1691 // This is not helpful for end users, since these differences make words
1692 // distinct, so for our purposes we need these to be considered.
1693 // The Unicode folks do not think the collation algorithm should be
1694 // changed. To work around this, we would like to tailor the ICU searcher,
1695 // but we can't get that to work yet. So instead, we check for cases where
1696 // these differences occur, and skip those matches.
1697
1698 // We refer to the above technique as the "kana workaround". The next few
1699 // functions are helper functinos for the kana workaround.
1700
isKanaLetter(UChar character)1701 static inline bool isKanaLetter(UChar character)
1702 {
1703 // Hiragana letters.
1704 if (character >= 0x3041 && character <= 0x3096)
1705 return true;
1706
1707 // Katakana letters.
1708 if (character >= 0x30A1 && character <= 0x30FA)
1709 return true;
1710 if (character >= 0x31F0 && character <= 0x31FF)
1711 return true;
1712
1713 // Halfwidth katakana letters.
1714 if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1715 return true;
1716
1717 return false;
1718 }
1719
isSmallKanaLetter(UChar character)1720 static inline bool isSmallKanaLetter(UChar character)
1721 {
1722 ASSERT(isKanaLetter(character));
1723
1724 switch (character) {
1725 case 0x3041: // HIRAGANA LETTER SMALL A
1726 case 0x3043: // HIRAGANA LETTER SMALL I
1727 case 0x3045: // HIRAGANA LETTER SMALL U
1728 case 0x3047: // HIRAGANA LETTER SMALL E
1729 case 0x3049: // HIRAGANA LETTER SMALL O
1730 case 0x3063: // HIRAGANA LETTER SMALL TU
1731 case 0x3083: // HIRAGANA LETTER SMALL YA
1732 case 0x3085: // HIRAGANA LETTER SMALL YU
1733 case 0x3087: // HIRAGANA LETTER SMALL YO
1734 case 0x308E: // HIRAGANA LETTER SMALL WA
1735 case 0x3095: // HIRAGANA LETTER SMALL KA
1736 case 0x3096: // HIRAGANA LETTER SMALL KE
1737 case 0x30A1: // KATAKANA LETTER SMALL A
1738 case 0x30A3: // KATAKANA LETTER SMALL I
1739 case 0x30A5: // KATAKANA LETTER SMALL U
1740 case 0x30A7: // KATAKANA LETTER SMALL E
1741 case 0x30A9: // KATAKANA LETTER SMALL O
1742 case 0x30C3: // KATAKANA LETTER SMALL TU
1743 case 0x30E3: // KATAKANA LETTER SMALL YA
1744 case 0x30E5: // KATAKANA LETTER SMALL YU
1745 case 0x30E7: // KATAKANA LETTER SMALL YO
1746 case 0x30EE: // KATAKANA LETTER SMALL WA
1747 case 0x30F5: // KATAKANA LETTER SMALL KA
1748 case 0x30F6: // KATAKANA LETTER SMALL KE
1749 case 0x31F0: // KATAKANA LETTER SMALL KU
1750 case 0x31F1: // KATAKANA LETTER SMALL SI
1751 case 0x31F2: // KATAKANA LETTER SMALL SU
1752 case 0x31F3: // KATAKANA LETTER SMALL TO
1753 case 0x31F4: // KATAKANA LETTER SMALL NU
1754 case 0x31F5: // KATAKANA LETTER SMALL HA
1755 case 0x31F6: // KATAKANA LETTER SMALL HI
1756 case 0x31F7: // KATAKANA LETTER SMALL HU
1757 case 0x31F8: // KATAKANA LETTER SMALL HE
1758 case 0x31F9: // KATAKANA LETTER SMALL HO
1759 case 0x31FA: // KATAKANA LETTER SMALL MU
1760 case 0x31FB: // KATAKANA LETTER SMALL RA
1761 case 0x31FC: // KATAKANA LETTER SMALL RI
1762 case 0x31FD: // KATAKANA LETTER SMALL RU
1763 case 0x31FE: // KATAKANA LETTER SMALL RE
1764 case 0x31FF: // KATAKANA LETTER SMALL RO
1765 case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1766 case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1767 case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1768 case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1769 case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1770 case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1771 case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1772 case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1773 case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1774 return true;
1775 }
1776 return false;
1777 }
1778
1779 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1780
composedVoicedSoundMark(UChar character)1781 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1782 {
1783 ASSERT(isKanaLetter(character));
1784
1785 switch (character) {
1786 case 0x304C: // HIRAGANA LETTER GA
1787 case 0x304E: // HIRAGANA LETTER GI
1788 case 0x3050: // HIRAGANA LETTER GU
1789 case 0x3052: // HIRAGANA LETTER GE
1790 case 0x3054: // HIRAGANA LETTER GO
1791 case 0x3056: // HIRAGANA LETTER ZA
1792 case 0x3058: // HIRAGANA LETTER ZI
1793 case 0x305A: // HIRAGANA LETTER ZU
1794 case 0x305C: // HIRAGANA LETTER ZE
1795 case 0x305E: // HIRAGANA LETTER ZO
1796 case 0x3060: // HIRAGANA LETTER DA
1797 case 0x3062: // HIRAGANA LETTER DI
1798 case 0x3065: // HIRAGANA LETTER DU
1799 case 0x3067: // HIRAGANA LETTER DE
1800 case 0x3069: // HIRAGANA LETTER DO
1801 case 0x3070: // HIRAGANA LETTER BA
1802 case 0x3073: // HIRAGANA LETTER BI
1803 case 0x3076: // HIRAGANA LETTER BU
1804 case 0x3079: // HIRAGANA LETTER BE
1805 case 0x307C: // HIRAGANA LETTER BO
1806 case 0x3094: // HIRAGANA LETTER VU
1807 case 0x30AC: // KATAKANA LETTER GA
1808 case 0x30AE: // KATAKANA LETTER GI
1809 case 0x30B0: // KATAKANA LETTER GU
1810 case 0x30B2: // KATAKANA LETTER GE
1811 case 0x30B4: // KATAKANA LETTER GO
1812 case 0x30B6: // KATAKANA LETTER ZA
1813 case 0x30B8: // KATAKANA LETTER ZI
1814 case 0x30BA: // KATAKANA LETTER ZU
1815 case 0x30BC: // KATAKANA LETTER ZE
1816 case 0x30BE: // KATAKANA LETTER ZO
1817 case 0x30C0: // KATAKANA LETTER DA
1818 case 0x30C2: // KATAKANA LETTER DI
1819 case 0x30C5: // KATAKANA LETTER DU
1820 case 0x30C7: // KATAKANA LETTER DE
1821 case 0x30C9: // KATAKANA LETTER DO
1822 case 0x30D0: // KATAKANA LETTER BA
1823 case 0x30D3: // KATAKANA LETTER BI
1824 case 0x30D6: // KATAKANA LETTER BU
1825 case 0x30D9: // KATAKANA LETTER BE
1826 case 0x30DC: // KATAKANA LETTER BO
1827 case 0x30F4: // KATAKANA LETTER VU
1828 case 0x30F7: // KATAKANA LETTER VA
1829 case 0x30F8: // KATAKANA LETTER VI
1830 case 0x30F9: // KATAKANA LETTER VE
1831 case 0x30FA: // KATAKANA LETTER VO
1832 return VoicedSoundMark;
1833 case 0x3071: // HIRAGANA LETTER PA
1834 case 0x3074: // HIRAGANA LETTER PI
1835 case 0x3077: // HIRAGANA LETTER PU
1836 case 0x307A: // HIRAGANA LETTER PE
1837 case 0x307D: // HIRAGANA LETTER PO
1838 case 0x30D1: // KATAKANA LETTER PA
1839 case 0x30D4: // KATAKANA LETTER PI
1840 case 0x30D7: // KATAKANA LETTER PU
1841 case 0x30DA: // KATAKANA LETTER PE
1842 case 0x30DD: // KATAKANA LETTER PO
1843 return SemiVoicedSoundMark;
1844 }
1845 return NoVoicedSoundMark;
1846 }
1847
isCombiningVoicedSoundMark(UChar character)1848 static inline bool isCombiningVoicedSoundMark(UChar character)
1849 {
1850 switch (character) {
1851 case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1852 case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1853 return true;
1854 }
1855 return false;
1856 }
1857
containsKanaLetters(const String & pattern)1858 static inline bool containsKanaLetters(const String& pattern)
1859 {
1860 const UChar* characters = pattern.characters();
1861 unsigned length = pattern.length();
1862 for (unsigned i = 0; i < length; ++i) {
1863 if (isKanaLetter(characters[i]))
1864 return true;
1865 }
1866 return false;
1867 }
1868
normalizeCharacters(const UChar * characters,unsigned length,Vector<UChar> & buffer)1869 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1870 {
1871 ASSERT(length);
1872
1873 buffer.resize(length);
1874
1875 UErrorCode status = U_ZERO_ERROR;
1876 size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1877 ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1878 ASSERT(bufferSize);
1879
1880 buffer.resize(bufferSize);
1881
1882 if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1883 return;
1884
1885 status = U_ZERO_ERROR;
1886 unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1887 ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1888 }
1889
isNonLatin1Separator(UChar32 character)1890 static bool isNonLatin1Separator(UChar32 character)
1891 {
1892 ASSERT_ARG(character, character >= 256);
1893
1894 return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1895 }
1896
isSeparator(UChar32 character)1897 static inline bool isSeparator(UChar32 character)
1898 {
1899 static const bool latin1SeparatorTable[256] = {
1900 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1901 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1902 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1903 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ?
1904 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @
1905 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _
1906 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // `
1907 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~
1908 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1909 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1910 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1911 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1912 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1913 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1914 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1915 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1916 };
1917
1918 if (character < 256)
1919 return latin1SeparatorTable[character];
1920
1921 return isNonLatin1Separator(character);
1922 }
1923
SearchBuffer(const String & target,FindOptions options)1924 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1925 : m_target(target)
1926 , m_options(options)
1927 , m_prefixLength(0)
1928 , m_atBreak(true)
1929 , m_needsMoreContext(options & AtWordStarts)
1930 , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1931 {
1932 ASSERT(!m_target.isEmpty());
1933
1934 // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1935 // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1936 // to add tailoring on top of the locale-specific tailoring as of this writing.
1937 foldQuoteMarksAndSoftHyphens(m_target);
1938
1939 size_t targetLength = m_target.length();
1940 m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
1941 m_overlap = m_buffer.capacity() / 4;
1942
1943 if ((m_options & AtWordStarts) && targetLength) {
1944 UChar32 targetFirstCharacter;
1945 U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
1946 // Characters in the separator category never really occur at the beginning of a word,
1947 // so if the target begins with such a character, we just ignore the AtWordStart option.
1948 if (isSeparator(targetFirstCharacter)) {
1949 m_options &= ~AtWordStarts;
1950 m_needsMoreContext = false;
1951 }
1952 }
1953
1954 // Grab the single global searcher.
1955 // If we ever have a reason to do more than once search buffer at once, we'll have
1956 // to move to multiple searchers.
1957 lockSearcher();
1958
1959 UStringSearch* searcher = WebCore::searcher();
1960 UCollator* collator = usearch_getCollator(searcher);
1961
1962 UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
1963 if (ucol_getStrength(collator) != strength) {
1964 ucol_setStrength(collator, strength);
1965 usearch_reset(searcher);
1966 }
1967
1968 UErrorCode status = U_ZERO_ERROR;
1969 usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1970 ASSERT(status == U_ZERO_ERROR);
1971
1972 // The kana workaround requires a normalized copy of the target string.
1973 if (m_targetRequiresKanaWorkaround)
1974 normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
1975 }
1976
~SearchBuffer()1977 inline SearchBuffer::~SearchBuffer()
1978 {
1979 // Leave the static object pointing to a valid string.
1980 UErrorCode status = U_ZERO_ERROR;
1981 usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
1982 ASSERT(status == U_ZERO_ERROR);
1983
1984 unlockSearcher();
1985 }
1986
append(const UChar * characters,size_t length)1987 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
1988 {
1989 ASSERT(length);
1990
1991 if (m_atBreak) {
1992 m_buffer.shrink(0);
1993 m_prefixLength = 0;
1994 m_atBreak = false;
1995 } else if (m_buffer.size() == m_buffer.capacity()) {
1996 memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
1997 m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
1998 m_buffer.shrink(m_overlap);
1999 }
2000
2001 size_t oldLength = m_buffer.size();
2002 size_t usableLength = min(m_buffer.capacity() - oldLength, length);
2003 ASSERT(usableLength);
2004 m_buffer.append(characters, usableLength);
2005 foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
2006 return usableLength;
2007 }
2008
needsMoreContext() const2009 inline bool SearchBuffer::needsMoreContext() const
2010 {
2011 return m_needsMoreContext;
2012 }
2013
prependContext(const UChar * characters,size_t length)2014 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
2015 {
2016 ASSERT(m_needsMoreContext);
2017 ASSERT(m_prefixLength == m_buffer.size());
2018
2019 if (!length)
2020 return;
2021
2022 m_atBreak = false;
2023
2024 size_t wordBoundaryContextStart = length;
2025 if (wordBoundaryContextStart) {
2026 U16_BACK_1(characters, 0, wordBoundaryContextStart);
2027 wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
2028 }
2029
2030 size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
2031 m_buffer.prepend(characters + length - usableLength, usableLength);
2032 m_prefixLength += usableLength;
2033
2034 if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
2035 m_needsMoreContext = false;
2036 }
2037
atBreak() const2038 inline bool SearchBuffer::atBreak() const
2039 {
2040 return m_atBreak;
2041 }
2042
reachedBreak()2043 inline void SearchBuffer::reachedBreak()
2044 {
2045 m_atBreak = true;
2046 }
2047
isBadMatch(const UChar * match,size_t matchLength) const2048 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
2049 {
2050 // This function implements the kana workaround. If usearch treats
2051 // it as a match, but we do not want to, then it's a "bad match".
2052 if (!m_targetRequiresKanaWorkaround)
2053 return false;
2054
2055 // Normalize into a match buffer. We reuse a single buffer rather than
2056 // creating a new one each time.
2057 normalizeCharacters(match, matchLength, m_normalizedMatch);
2058
2059 const UChar* a = m_normalizedTarget.begin();
2060 const UChar* aEnd = m_normalizedTarget.end();
2061
2062 const UChar* b = m_normalizedMatch.begin();
2063 const UChar* bEnd = m_normalizedMatch.end();
2064
2065 while (true) {
2066 // Skip runs of non-kana-letter characters. This is necessary so we can
2067 // correctly handle strings where the target and match have different-length
2068 // runs of characters that match, while still double checking the correctness
2069 // of matches of kana letters with other kana letters.
2070 while (a != aEnd && !isKanaLetter(*a))
2071 ++a;
2072 while (b != bEnd && !isKanaLetter(*b))
2073 ++b;
2074
2075 // If we reached the end of either the target or the match, we should have
2076 // reached the end of both; both should have the same number of kana letters.
2077 if (a == aEnd || b == bEnd) {
2078 ASSERT(a == aEnd);
2079 ASSERT(b == bEnd);
2080 return false;
2081 }
2082
2083 // Check for differences in the kana letter character itself.
2084 if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2085 return true;
2086 if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2087 return true;
2088 ++a;
2089 ++b;
2090
2091 // Check for differences in combining voiced sound marks found after the letter.
2092 while (1) {
2093 if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2094 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2095 return true;
2096 break;
2097 }
2098 if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2099 return true;
2100 if (*a != *b)
2101 return true;
2102 ++a;
2103 ++b;
2104 }
2105 }
2106 }
2107
isWordStartMatch(size_t start,size_t length) const2108 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2109 {
2110 ASSERT(m_options & AtWordStarts);
2111
2112 if (!start)
2113 return true;
2114
2115 if (m_options & TreatMedialCapitalAsWordStart) {
2116 int size = m_buffer.size();
2117 int offset = start;
2118 UChar32 firstCharacter;
2119 U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2120 UChar32 previousCharacter;
2121 U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2122
2123 if (isSeparator(firstCharacter)) {
2124 // The start of a separator run is a word start (".org" in "webkit.org").
2125 if (!isSeparator(previousCharacter))
2126 return true;
2127 } else if (isASCIIUpper(firstCharacter)) {
2128 // The start of an uppercase run is a word start ("Kit" in "WebKit").
2129 if (!isASCIIUpper(previousCharacter))
2130 return true;
2131 // The last character of an uppercase run followed by a non-separator, non-digit
2132 // is a word start ("Request" in "XMLHTTPRequest").
2133 offset = start;
2134 U16_FWD_1(m_buffer.data(), offset, size);
2135 UChar32 nextCharacter = 0;
2136 if (offset < size)
2137 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2138 if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2139 return true;
2140 } else if (isASCIIDigit(firstCharacter)) {
2141 // The start of a digit run is a word start ("2" in "WebKit2").
2142 if (!isASCIIDigit(previousCharacter))
2143 return true;
2144 } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2145 // The start of a non-separator, non-uppercase, non-digit run is a word start,
2146 // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2147 return true;
2148 }
2149 }
2150
2151 size_t wordBreakSearchStart = start + length;
2152 while (wordBreakSearchStart > start)
2153 wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
2154 return wordBreakSearchStart == start;
2155 }
2156
search(size_t & start)2157 inline size_t SearchBuffer::search(size_t& start)
2158 {
2159 size_t size = m_buffer.size();
2160 if (m_atBreak) {
2161 if (!size)
2162 return 0;
2163 } else {
2164 if (size != m_buffer.capacity())
2165 return 0;
2166 }
2167
2168 UStringSearch* searcher = WebCore::searcher();
2169
2170 UErrorCode status = U_ZERO_ERROR;
2171 usearch_setText(searcher, m_buffer.data(), size, &status);
2172 ASSERT(status == U_ZERO_ERROR);
2173
2174 usearch_setOffset(searcher, m_prefixLength, &status);
2175 ASSERT(status == U_ZERO_ERROR);
2176
2177 int matchStart = usearch_next(searcher, &status);
2178 ASSERT(status == U_ZERO_ERROR);
2179
2180 nextMatch:
2181 if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2182 ASSERT(matchStart == USEARCH_DONE);
2183 return 0;
2184 }
2185
2186 // Matches that start in the overlap area are only tentative.
2187 // The same match may appear later, matching more characters,
2188 // possibly including a combining character that's not yet in the buffer.
2189 if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2190 size_t overlap = m_overlap;
2191 if (m_options & AtWordStarts) {
2192 // Ensure that there is sufficient context before matchStart the next time around for
2193 // determining if it is at a word boundary.
2194 int wordBoundaryContextStart = matchStart;
2195 U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2196 wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
2197 overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
2198 }
2199 memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2200 m_prefixLength -= min(m_prefixLength, size - overlap);
2201 m_buffer.shrink(overlap);
2202 return 0;
2203 }
2204
2205 size_t matchedLength = usearch_getMatchedLength(searcher);
2206 ASSERT(matchStart + matchedLength <= size);
2207
2208 // If this match is "bad", move on to the next match.
2209 if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
2210 matchStart = usearch_next(searcher, &status);
2211 ASSERT(status == U_ZERO_ERROR);
2212 goto nextMatch;
2213 }
2214
2215 size_t newSize = size - (matchStart + 1);
2216 memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2217 m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
2218 m_buffer.shrink(newSize);
2219
2220 start = size - matchStart;
2221 return matchedLength;
2222 }
2223
2224 #else // !ICU_UNICODE
2225
SearchBuffer(const String & target,FindOptions options)2226 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2227 : m_target(options & CaseInsensitive ? target.foldCase() : target)
2228 , m_options(options)
2229 , m_buffer(m_target.length())
2230 , m_isCharacterStartBuffer(m_target.length())
2231 , m_isBufferFull(false)
2232 , m_cursor(0)
2233 {
2234 ASSERT(!m_target.isEmpty());
2235 m_target.replace(noBreakSpace, ' ');
2236 foldQuoteMarksAndSoftHyphens(m_target);
2237 }
2238
~SearchBuffer()2239 inline SearchBuffer::~SearchBuffer()
2240 {
2241 }
2242
reachedBreak()2243 inline void SearchBuffer::reachedBreak()
2244 {
2245 m_cursor = 0;
2246 m_isBufferFull = false;
2247 }
2248
atBreak() const2249 inline bool SearchBuffer::atBreak() const
2250 {
2251 return !m_cursor && !m_isBufferFull;
2252 }
2253
append(UChar c,bool isStart)2254 inline void SearchBuffer::append(UChar c, bool isStart)
2255 {
2256 m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
2257 m_isCharacterStartBuffer[m_cursor] = isStart;
2258 if (++m_cursor == m_target.length()) {
2259 m_cursor = 0;
2260 m_isBufferFull = true;
2261 }
2262 }
2263
append(const UChar * characters,size_t length)2264 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2265 {
2266 ASSERT(length);
2267 if (!(m_options & CaseInsensitive)) {
2268 append(characters[0], true);
2269 return 1;
2270 }
2271 const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2272 UChar foldedCharacters[maxFoldedCharacters];
2273 bool error;
2274 int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
2275 ASSERT(!error);
2276 ASSERT(numFoldedCharacters);
2277 ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2278 if (!error && numFoldedCharacters) {
2279 numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
2280 append(foldedCharacters[0], true);
2281 for (int i = 1; i < numFoldedCharacters; ++i)
2282 append(foldedCharacters[i], false);
2283 }
2284 return 1;
2285 }
2286
needsMoreContext() const2287 inline bool SearchBuffer::needsMoreContext() const
2288 {
2289 return false;
2290 }
2291
prependContext(const UChar *,size_t)2292 void SearchBuffer::prependContext(const UChar*, size_t)
2293 {
2294 ASSERT_NOT_REACHED();
2295 }
2296
search(size_t & start)2297 inline size_t SearchBuffer::search(size_t& start)
2298 {
2299 if (!m_isBufferFull)
2300 return 0;
2301 if (!m_isCharacterStartBuffer[m_cursor])
2302 return 0;
2303
2304 size_t tailSpace = m_target.length() - m_cursor;
2305 if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2306 return 0;
2307 if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2308 return 0;
2309
2310 start = length();
2311
2312 // Now that we've found a match once, we don't want to find it again, because those
2313 // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2314 // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2315 // we want to get rid of that in the future we could track this with a separate boolean
2316 // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2317 m_isCharacterStartBuffer[m_cursor] = false;
2318
2319 return start;
2320 }
2321
2322 // Returns the number of characters that were appended to the buffer (what we are searching in).
2323 // That's not necessarily the same length as the passed-in target string, because case folding
2324 // can make two strings match even though they're not the same length.
length() const2325 size_t SearchBuffer::length() const
2326 {
2327 size_t bufferSize = m_target.length();
2328 size_t length = 0;
2329 for (size_t i = 0; i < bufferSize; ++i)
2330 length += m_isCharacterStartBuffer[i];
2331 return length;
2332 }
2333
2334 #endif // !ICU_UNICODE
2335
2336 // --------
2337
rangeLength(const Range * r,bool forSelectionPreservation)2338 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
2339 {
2340 int length = 0;
2341 for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2342 length += it.length();
2343
2344 return length;
2345 }
2346
subrange(Range * entireRange,int characterOffset,int characterCount)2347 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2348 {
2349 CharacterIterator entireRangeIterator(entireRange);
2350 return characterSubrange(entireRangeIterator, characterOffset, characterCount);
2351 }
2352
rangeFromLocationAndLength(Element * scope,int rangeLocation,int rangeLength,bool forSelectionPreservation)2353 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2354 {
2355 RefPtr<Range> resultRange = scope->document()->createRange();
2356
2357 int docTextPosition = 0;
2358 int rangeEnd = rangeLocation + rangeLength;
2359 bool startRangeFound = false;
2360
2361 RefPtr<Range> textRunRange;
2362
2363 TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2364
2365 // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2366 if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
2367 textRunRange = it.range();
2368
2369 ExceptionCode ec = 0;
2370 resultRange->setStart(textRunRange->startContainer(), 0, ec);
2371 ASSERT(!ec);
2372 resultRange->setEnd(textRunRange->startContainer(), 0, ec);
2373 ASSERT(!ec);
2374
2375 return resultRange.release();
2376 }
2377
2378 for (; !it.atEnd(); it.advance()) {
2379 int len = it.length();
2380 textRunRange = it.range();
2381
2382 bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
2383 bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
2384
2385 // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
2386 // in those cases that textRunRange is used.
2387 if (foundEnd) {
2388 // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2389 // position for emitted '\n's.
2390 if (len == 1 && it.characters()[0] == '\n') {
2391 scope->document()->updateLayoutIgnorePendingStylesheets();
2392 it.advance();
2393 if (!it.atEnd()) {
2394 RefPtr<Range> range = it.range();
2395 ExceptionCode ec = 0;
2396 textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
2397 ASSERT(!ec);
2398 } else {
2399 Position runStart = textRunRange->startPosition();
2400 Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2401 if (runEnd.isNotNull()) {
2402 ExceptionCode ec = 0;
2403 textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
2404 ASSERT(!ec);
2405 }
2406 }
2407 }
2408 }
2409
2410 if (foundStart) {
2411 startRangeFound = true;
2412 int exception = 0;
2413 if (textRunRange->startContainer()->isTextNode()) {
2414 int offset = rangeLocation - docTextPosition;
2415 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2416 } else {
2417 if (rangeLocation == docTextPosition)
2418 resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2419 else
2420 resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2421 }
2422 }
2423
2424 if (foundEnd) {
2425 int exception = 0;
2426 if (textRunRange->startContainer()->isTextNode()) {
2427 int offset = rangeEnd - docTextPosition;
2428 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2429 } else {
2430 if (rangeEnd == docTextPosition)
2431 resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2432 else
2433 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2434 }
2435 docTextPosition += len;
2436 break;
2437 }
2438 docTextPosition += len;
2439 }
2440
2441 if (!startRangeFound)
2442 return 0;
2443
2444 if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
2445 int exception = 0;
2446 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2447 }
2448
2449 return resultRange.release();
2450 }
2451
locationAndLengthFromRange(const Range * range,size_t & location,size_t & length)2452 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
2453 {
2454 location = notFound;
2455 length = 0;
2456
2457 if (!range->startContainer())
2458 return false;
2459
2460 Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
2461 Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
2462
2463 // The critical assumption is that this only gets called with ranges that
2464 // concentrate on a given area containing the selection root. This is done
2465 // because of text fields and textareas. The DOM for those is not
2466 // directly in the document DOM, so ensure that the range does not cross a
2467 // boundary of one of those.
2468 if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
2469 return false;
2470 if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
2471 return false;
2472
2473 RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
2474 ASSERT(testRange->startContainer() == scope);
2475 location = TextIterator::rangeLength(testRange.get());
2476
2477 ExceptionCode ec;
2478 testRange->setEnd(range->endContainer(), range->endOffset(), ec);
2479 ASSERT(testRange->startContainer() == scope);
2480 length = TextIterator::rangeLength(testRange.get()) - location;
2481 return true;
2482 }
2483
2484 // --------
2485
plainTextToMallocAllocatedBuffer(const Range * r,unsigned & bufferLength,bool isDisplayString,TextIteratorBehavior defaultBehavior)2486 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
2487 {
2488 UChar* result = 0;
2489
2490 // Do this in pieces to avoid massive reallocations if there is a large amount of text.
2491 // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
2492 static const unsigned cMaxSegmentSize = 1 << 16;
2493 bufferLength = 0;
2494 typedef pair<UChar*, unsigned> TextSegment;
2495 OwnPtr<Vector<TextSegment> > textSegments;
2496 Vector<UChar> textBuffer;
2497 textBuffer.reserveInitialCapacity(cMaxSegmentSize);
2498 TextIteratorBehavior behavior = defaultBehavior;
2499 if (!isDisplayString)
2500 behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2501
2502 for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2503 if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
2504 UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
2505 if (!newSegmentBuffer)
2506 goto exit;
2507 memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2508 if (!textSegments)
2509 textSegments = adoptPtr(new Vector<TextSegment>);
2510 textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
2511 textBuffer.clear();
2512 }
2513 textBuffer.append(it.characters(), it.length());
2514 bufferLength += it.length();
2515 }
2516
2517 if (!bufferLength)
2518 return 0;
2519
2520 // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
2521 result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
2522 if (!result)
2523 goto exit;
2524
2525 {
2526 UChar* resultPos = result;
2527 if (textSegments) {
2528 unsigned size = textSegments->size();
2529 for (unsigned i = 0; i < size; ++i) {
2530 const TextSegment& segment = textSegments->at(i);
2531 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
2532 resultPos += segment.second;
2533 }
2534 }
2535 memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2536 }
2537
2538 exit:
2539 if (textSegments) {
2540 unsigned size = textSegments->size();
2541 for (unsigned i = 0; i < size; ++i)
2542 free(textSegments->at(i).first);
2543 }
2544
2545 if (isDisplayString && r->ownerDocument())
2546 r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
2547
2548 return result;
2549 }
2550
plainText(const Range * r,TextIteratorBehavior defaultBehavior)2551 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
2552 {
2553 unsigned length;
2554 UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
2555 if (!buf)
2556 return "";
2557 String result(buf, length);
2558 free(buf);
2559 return result;
2560 }
2561
isAllCollapsibleWhitespace(const String & string)2562 static inline bool isAllCollapsibleWhitespace(const String& string)
2563 {
2564 const UChar* characters = string.characters();
2565 unsigned length = string.length();
2566 for (unsigned i = 0; i < length; ++i) {
2567 if (!isCollapsibleWhitespace(characters[i]))
2568 return false;
2569 }
2570 return true;
2571 }
2572
collapsedToBoundary(const Range * range,bool forward)2573 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
2574 {
2575 ExceptionCode ec = 0;
2576 RefPtr<Range> result = range->cloneRange(ec);
2577 ASSERT(!ec);
2578 result->collapse(!forward, ec);
2579 ASSERT(!ec);
2580 return result.release();
2581 }
2582
findPlainText(CharacterIterator & it,const String & target,FindOptions options,size_t & matchStart)2583 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
2584 {
2585 matchStart = 0;
2586 size_t matchLength = 0;
2587
2588 SearchBuffer buffer(target, options);
2589
2590 if (buffer.needsMoreContext()) {
2591 RefPtr<Range> startRange = it.range();
2592 RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
2593 ExceptionCode ec = 0;
2594 beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
2595 for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2596 buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
2597 if (!buffer.needsMoreContext())
2598 break;
2599 }
2600 }
2601
2602 while (!it.atEnd()) {
2603 it.advance(buffer.append(it.characters(), it.length()));
2604 tryAgain:
2605 size_t matchStartOffset;
2606 if (size_t newMatchLength = buffer.search(matchStartOffset)) {
2607 // Note that we found a match, and where we found it.
2608 size_t lastCharacterInBufferOffset = it.characterOffset();
2609 ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2610 matchStart = lastCharacterInBufferOffset - matchStartOffset;
2611 matchLength = newMatchLength;
2612 // If searching forward, stop on the first match.
2613 // If searching backward, don't stop, so we end up with the last match.
2614 if (!(options & Backwards))
2615 break;
2616 goto tryAgain;
2617 }
2618 if (it.atBreak() && !buffer.atBreak()) {
2619 buffer.reachedBreak();
2620 goto tryAgain;
2621 }
2622 }
2623
2624 return matchLength;
2625 }
2626
findPlainText(const Range * range,const String & target,FindOptions options)2627 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
2628 {
2629 // First, find the text.
2630 size_t matchStart;
2631 size_t matchLength;
2632 {
2633 CharacterIterator findIterator(range, TextIteratorEntersTextControls);
2634 matchLength = findPlainText(findIterator, target, options, matchStart);
2635 if (!matchLength)
2636 return collapsedToBoundary(range, !(options & Backwards));
2637 }
2638
2639 // Then, find the document position of the start and the end of the text.
2640 CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
2641 return characterSubrange(computeRangeIterator, matchStart, matchLength);
2642 }
2643
2644 }
2645