1 /*
2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010 Apple Inc. All rights reserved.
3 * Copyright (C) 2005 Alexey Proskuryakov.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "TextIterator.h"
29
30 #include "Document.h"
31 #include "Frame.h"
32 #include "HTMLElement.h"
33 #include "HTMLNames.h"
34 #include "htmlediting.h"
35 #include "InlineTextBox.h"
36 #include "Range.h"
37 #include "RenderTableCell.h"
38 #include "RenderTableRow.h"
39 #include "RenderTextControl.h"
40 #include "RenderTextFragment.h"
41 #include "TextBoundaries.h"
42 #include "TextBreakIterator.h"
43 #include "VisiblePosition.h"
44 #include "visible_units.h"
45 #include <wtf/unicode/CharacterNames.h>
46
47 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
48 #include "TextBreakIteratorInternalICU.h"
49 #include <unicode/usearch.h>
50 #endif
51
52 using namespace WTF::Unicode;
53 using namespace std;
54
55 namespace WebCore {
56
57 using namespace HTMLNames;
58
59 // Buffer that knows how to compare with a search target.
60 // Keeps enough of the previous text to be able to search in the future, but no more.
61 // Non-breaking spaces are always equal to normal spaces.
62 // Case folding is also done if the CaseInsensitive option is specified.
63 // Matches are further filtered if the AtWordStarts option is specified, although some
64 // matches inside a word are permitted if TreatMedialCapitalAsWordStart is specified as well.
65 class SearchBuffer {
66 WTF_MAKE_NONCOPYABLE(SearchBuffer);
67 public:
68 SearchBuffer(const String& target, FindOptions);
69 ~SearchBuffer();
70
71 // Returns number of characters appended; guaranteed to be in the range [1, length].
72 size_t append(const UChar*, size_t length);
73 bool needsMoreContext() const;
74 void prependContext(const UChar*, size_t length);
75 void reachedBreak();
76
77 // Result is the size in characters of what was found.
78 // And <startOffset> is the number of characters back to the start of what was found.
79 size_t search(size_t& startOffset);
80 bool atBreak() const;
81
82 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
83
84 private:
85 bool isBadMatch(const UChar*, size_t length) const;
86 bool isWordStartMatch(size_t start, size_t length) const;
87
88 String m_target;
89 FindOptions m_options;
90
91 Vector<UChar> m_buffer;
92 size_t m_overlap;
93 size_t m_prefixLength;
94 bool m_atBreak;
95 bool m_needsMoreContext;
96
97 bool m_targetRequiresKanaWorkaround;
98 Vector<UChar> m_normalizedTarget;
99 mutable Vector<UChar> m_normalizedMatch;
100
101 #else
102
103 private:
104 void append(UChar, bool isCharacterStart);
105 size_t length() const;
106
107 String m_target;
108 FindOptions m_options;
109
110 Vector<UChar> m_buffer;
111 Vector<bool> m_isCharacterStartBuffer;
112 bool m_isBufferFull;
113 size_t m_cursor;
114
115 #endif
116 };
117
118 // --------
119
120 static const unsigned bitsInWord = sizeof(unsigned) * 8;
121 static const unsigned bitInWordMask = bitsInWord - 1;
122
BitStack()123 BitStack::BitStack()
124 : m_size(0)
125 {
126 }
127
~BitStack()128 BitStack::~BitStack()
129 {
130 }
131
push(bool bit)132 void BitStack::push(bool bit)
133 {
134 unsigned index = m_size / bitsInWord;
135 unsigned shift = m_size & bitInWordMask;
136 if (!shift && index == m_words.size()) {
137 m_words.grow(index + 1);
138 m_words[index] = 0;
139 }
140 unsigned& word = m_words[index];
141 unsigned mask = 1U << shift;
142 if (bit)
143 word |= mask;
144 else
145 word &= ~mask;
146 ++m_size;
147 }
148
pop()149 void BitStack::pop()
150 {
151 if (m_size)
152 --m_size;
153 }
154
top() const155 bool BitStack::top() const
156 {
157 if (!m_size)
158 return false;
159 unsigned shift = (m_size - 1) & bitInWordMask;
160 return m_words.last() & (1U << shift);
161 }
162
size() const163 unsigned BitStack::size() const
164 {
165 return m_size;
166 }
167
168 // --------
169
170 #if !ASSERT_DISABLED
171
depthCrossingShadowBoundaries(Node * node)172 static unsigned depthCrossingShadowBoundaries(Node* node)
173 {
174 unsigned depth = 0;
175 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
176 ++depth;
177 return depth;
178 }
179
180 #endif
181
182 // This function is like Range::pastLastNode, except for the fact that it can climb up out of shadow trees.
nextInPreOrderCrossingShadowBoundaries(Node * rangeEndContainer,int rangeEndOffset)183 static Node* nextInPreOrderCrossingShadowBoundaries(Node* rangeEndContainer, int rangeEndOffset)
184 {
185 if (!rangeEndContainer)
186 return 0;
187 if (rangeEndOffset >= 0 && !rangeEndContainer->offsetInCharacters()) {
188 if (Node* next = rangeEndContainer->childNode(rangeEndOffset))
189 return next;
190 }
191 for (Node* node = rangeEndContainer; node; node = node->parentOrHostNode()) {
192 if (Node* next = node->nextSibling())
193 return next;
194 }
195 return 0;
196 }
197
198 // --------
199
fullyClipsContents(Node * node)200 static inline bool fullyClipsContents(Node* node)
201 {
202 RenderObject* renderer = node->renderer();
203 if (!renderer || !renderer->isBox() || !renderer->hasOverflowClip())
204 return false;
205 return toRenderBox(renderer)->size().isEmpty();
206 }
207
ignoresContainerClip(Node * node)208 static inline bool ignoresContainerClip(Node* node)
209 {
210 RenderObject* renderer = node->renderer();
211 if (!renderer || renderer->isText())
212 return false;
213 EPosition position = renderer->style()->position();
214 return position == AbsolutePosition || position == FixedPosition;
215 }
216
pushFullyClippedState(BitStack & stack,Node * node)217 static void pushFullyClippedState(BitStack& stack, Node* node)
218 {
219 ASSERT(stack.size() == depthCrossingShadowBoundaries(node));
220
221 // Push true if this node full clips its contents, or if a parent already has fully
222 // clipped and this is not a node that ignores its container's clip.
223 stack.push(fullyClipsContents(node) || (stack.top() && !ignoresContainerClip(node)));
224 }
225
setUpFullyClippedStack(BitStack & stack,Node * node)226 static void setUpFullyClippedStack(BitStack& stack, Node* node)
227 {
228 // Put the nodes in a vector so we can iterate in reverse order.
229 Vector<Node*, 100> ancestry;
230 for (Node* parent = node->parentOrHostNode(); parent; parent = parent->parentOrHostNode())
231 ancestry.append(parent);
232
233 // Call pushFullyClippedState on each node starting with the earliest ancestor.
234 size_t size = ancestry.size();
235 for (size_t i = 0; i < size; ++i)
236 pushFullyClippedState(stack, ancestry[size - i - 1]);
237 pushFullyClippedState(stack, node);
238
239 ASSERT(stack.size() == 1 + depthCrossingShadowBoundaries(node));
240 }
241
242 // --------
243
TextIterator()244 TextIterator::TextIterator()
245 : m_startContainer(0)
246 , m_startOffset(0)
247 , m_endContainer(0)
248 , m_endOffset(0)
249 , m_positionNode(0)
250 , m_textCharacters(0)
251 , m_textLength(0)
252 , m_remainingTextBox(0)
253 , m_firstLetterText(0)
254 , m_lastCharacter(0)
255 , m_emitsCharactersBetweenAllVisiblePositions(false)
256 , m_entersTextControls(false)
257 , m_emitsTextWithoutTranscoding(false)
258 , m_handledFirstLetter(false)
259 , m_ignoresStyleVisibility(false)
260 , m_emitsObjectReplacementCharacters(false)
261 {
262 }
263
TextIterator(const Range * r,TextIteratorBehavior behavior)264 TextIterator::TextIterator(const Range* r, TextIteratorBehavior behavior)
265 : m_startContainer(0)
266 , m_startOffset(0)
267 , m_endContainer(0)
268 , m_endOffset(0)
269 , m_positionNode(0)
270 , m_textCharacters(0)
271 , m_textLength(0)
272 , m_remainingTextBox(0)
273 , m_firstLetterText(0)
274 , m_emitsCharactersBetweenAllVisiblePositions(behavior & TextIteratorEmitsCharactersBetweenAllVisiblePositions)
275 , m_entersTextControls(behavior & TextIteratorEntersTextControls)
276 , m_emitsTextWithoutTranscoding(behavior & TextIteratorEmitsTextsWithoutTranscoding)
277 , m_handledFirstLetter(false)
278 , m_ignoresStyleVisibility(behavior & TextIteratorIgnoresStyleVisibility)
279 , m_emitsObjectReplacementCharacters(behavior & TextIteratorEmitsObjectReplacementCharacters)
280 {
281 if (!r)
282 return;
283
284 // get and validate the range endpoints
285 Node* startContainer = r->startContainer();
286 if (!startContainer)
287 return;
288 int startOffset = r->startOffset();
289 Node* endContainer = r->endContainer();
290 int endOffset = r->endOffset();
291
292 // Callers should be handing us well-formed ranges. If we discover that this isn't
293 // the case, we could consider changing this assertion to an early return.
294 ASSERT(r->boundaryPointsValid());
295
296 // remember range - this does not change
297 m_startContainer = startContainer;
298 m_startOffset = startOffset;
299 m_endContainer = endContainer;
300 m_endOffset = endOffset;
301
302 // set up the current node for processing
303 m_node = r->firstNode();
304 if (!m_node)
305 return;
306 setUpFullyClippedStack(m_fullyClippedStack, m_node);
307 m_offset = m_node == m_startContainer ? m_startOffset : 0;
308 m_handledNode = false;
309 m_handledChildren = false;
310
311 // calculate first out of bounds node
312 m_pastEndNode = nextInPreOrderCrossingShadowBoundaries(endContainer, endOffset);
313
314 // initialize node processing state
315 m_needsAnotherNewline = false;
316 m_textBox = 0;
317
318 // initialize record of previous node processing
319 m_hasEmitted = false;
320 m_lastTextNode = 0;
321 m_lastTextNodeEndedWithCollapsedSpace = false;
322 m_lastCharacter = 0;
323
324 #ifndef NDEBUG
325 // need this just because of the assert in advance()
326 m_positionNode = m_node;
327 #endif
328
329 // identify the first run
330 advance();
331 }
332
~TextIterator()333 TextIterator::~TextIterator()
334 {
335 }
336
advance()337 void TextIterator::advance()
338 {
339 // reset the run information
340 m_positionNode = 0;
341 m_textLength = 0;
342
343 // handle remembered node that needed a newline after the text node's newline
344 if (m_needsAnotherNewline) {
345 // Emit the extra newline, and position it *inside* m_node, after m_node's
346 // contents, in case it's a block, in the same way that we position the first
347 // newline. The range for the emitted newline should start where the line
348 // break begins.
349 // FIXME: It would be cleaner if we emitted two newlines during the last
350 // iteration, instead of using m_needsAnotherNewline.
351 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
352 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
353 m_needsAnotherNewline = false;
354 return;
355 }
356
357 if (!m_textBox && m_remainingTextBox) {
358 m_textBox = m_remainingTextBox;
359 m_remainingTextBox = 0;
360 m_firstLetterText = 0;
361 m_offset = 0;
362 }
363 // handle remembered text box
364 if (m_textBox) {
365 handleTextBox();
366 if (m_positionNode)
367 return;
368 }
369
370 while (m_node && m_node != m_pastEndNode) {
371 // if the range ends at offset 0 of an element, represent the
372 // position, but not the content, of that element e.g. if the
373 // node is a blockflow element, emit a newline that
374 // precedes the element
375 if (m_node == m_endContainer && m_endOffset == 0) {
376 representNodeOffsetZero();
377 m_node = 0;
378 return;
379 }
380
381 RenderObject* renderer = m_node->renderer();
382 if (!renderer) {
383 m_handledNode = true;
384 m_handledChildren = true;
385 } else {
386 // handle current node according to its type
387 if (!m_handledNode) {
388 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
389 m_handledNode = handleTextNode();
390 else if (renderer && (renderer->isImage() || renderer->isWidget() ||
391 (renderer->node() && renderer->node()->isElementNode() &&
392 static_cast<Element*>(renderer->node())->isFormControlElement())))
393 m_handledNode = handleReplacedElement();
394 else
395 m_handledNode = handleNonTextNode();
396 if (m_positionNode)
397 return;
398 }
399 }
400
401 // find a new current node to handle in depth-first manner,
402 // calling exitNode() as we come back thru a parent node
403 Node* next = m_handledChildren ? 0 : m_node->firstChild();
404 m_offset = 0;
405 if (!next) {
406 next = m_node->nextSibling();
407 if (!next) {
408 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
409 Node* parentNode = m_node->parentOrHostNode();
410 while (!next && parentNode) {
411 if ((pastEnd && parentNode == m_endContainer) || m_endContainer->isDescendantOf(parentNode))
412 return;
413 bool haveRenderer = m_node->renderer();
414 m_node = parentNode;
415 m_fullyClippedStack.pop();
416 parentNode = m_node->parentOrHostNode();
417 if (haveRenderer)
418 exitNode();
419 if (m_positionNode) {
420 m_handledNode = true;
421 m_handledChildren = true;
422 return;
423 }
424 next = m_node->nextSibling();
425 }
426 }
427 m_fullyClippedStack.pop();
428 }
429
430 // set the new current node
431 m_node = next;
432 if (m_node)
433 pushFullyClippedState(m_fullyClippedStack, m_node);
434 m_handledNode = false;
435 m_handledChildren = false;
436 m_handledFirstLetter = false;
437 m_firstLetterText = 0;
438
439 // how would this ever be?
440 if (m_positionNode)
441 return;
442 }
443 }
444
handleTextNode()445 bool TextIterator::handleTextNode()
446 {
447 if (m_fullyClippedStack.top() && !m_ignoresStyleVisibility)
448 return false;
449
450 RenderText* renderer = toRenderText(m_node->renderer());
451
452 m_lastTextNode = m_node;
453 String str = renderer->text();
454
455 // handle pre-formatted text
456 if (!renderer->style()->collapseWhiteSpace()) {
457 int runStart = m_offset;
458 if (m_lastTextNodeEndedWithCollapsedSpace && hasVisibleTextNode(renderer)) {
459 emitCharacter(' ', m_node, 0, runStart, runStart);
460 return false;
461 }
462 if (!m_handledFirstLetter && renderer->isTextFragment()) {
463 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
464 if (m_firstLetterText) {
465 String firstLetter = m_firstLetterText->text();
466 emitText(m_node, m_firstLetterText, m_offset, m_offset + firstLetter.length());
467 m_firstLetterText = 0;
468 m_textBox = 0;
469 return false;
470 }
471 }
472 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
473 return false;
474 int strLength = str.length();
475 int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
476 int runEnd = min(strLength, end);
477
478 if (runStart >= runEnd)
479 return true;
480
481 emitText(m_node, runStart, runEnd);
482 return true;
483 }
484
485 if (!renderer->firstTextBox() && str.length() > 0) {
486 if (!m_handledFirstLetter && renderer->isTextFragment()) {
487 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
488 if (m_firstLetterText) {
489 handleTextBox();
490 return false;
491 }
492 }
493 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
494 return false;
495 m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
496 return true;
497 }
498
499 // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
500 if (renderer->containsReversedText()) {
501 m_sortedTextBoxes.clear();
502 for (InlineTextBox* textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
503 m_sortedTextBoxes.append(textBox);
504 }
505 std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), InlineTextBox::compareByStart);
506 m_sortedTextBoxesPosition = 0;
507 }
508
509 m_textBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
510 if (!m_handledFirstLetter && renderer->isTextFragment() && !m_offset)
511 handleTextNodeFirstLetter(static_cast<RenderTextFragment*>(renderer));
512 handleTextBox();
513 return true;
514 }
515
handleTextBox()516 void TextIterator::handleTextBox()
517 {
518 RenderText* renderer = m_firstLetterText ? m_firstLetterText : toRenderText(m_node->renderer());
519 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility) {
520 m_textBox = 0;
521 return;
522 }
523 String str = renderer->text();
524 unsigned start = m_offset;
525 unsigned end = (m_node == m_endContainer) ? static_cast<unsigned>(m_endOffset) : UINT_MAX;
526 while (m_textBox) {
527 unsigned textBoxStart = m_textBox->start();
528 unsigned runStart = max(textBoxStart, start);
529
530 // Check for collapsed space at the start of this run.
531 InlineTextBox* firstTextBox = renderer->containsReversedText() ? (m_sortedTextBoxes.isEmpty() ? 0 : m_sortedTextBoxes[0]) : renderer->firstTextBox();
532 bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
533 || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
534 if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
535 if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
536 unsigned spaceRunStart = runStart - 1;
537 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
538 --spaceRunStart;
539 emitText(m_node, renderer, spaceRunStart, spaceRunStart + 1);
540 } else
541 emitCharacter(' ', m_node, 0, runStart, runStart);
542 return;
543 }
544 unsigned textBoxEnd = textBoxStart + m_textBox->len();
545 unsigned runEnd = min(textBoxEnd, end);
546
547 // Determine what the next text box will be, but don't advance yet
548 InlineTextBox* nextTextBox = 0;
549 if (renderer->containsReversedText()) {
550 if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
551 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
552 } else
553 nextTextBox = m_textBox->nextTextBox();
554
555 if (runStart < runEnd) {
556 // Handle either a single newline character (which becomes a space),
557 // or a run of characters that does not include a newline.
558 // This effectively translates newlines to spaces without copying the text.
559 if (str[runStart] == '\n') {
560 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
561 m_offset = runStart + 1;
562 } else {
563 size_t subrunEnd = str.find('\n', runStart);
564 if (subrunEnd == notFound || subrunEnd > runEnd)
565 subrunEnd = runEnd;
566
567 m_offset = subrunEnd;
568 emitText(m_node, renderer, runStart, subrunEnd);
569 }
570
571 // If we are doing a subrun that doesn't go to the end of the text box,
572 // come back again to finish handling this text box; don't advance to the next one.
573 if (static_cast<unsigned>(m_positionEndOffset) < textBoxEnd)
574 return;
575
576 // Advance and return
577 unsigned nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
578 if (nextRunStart > runEnd)
579 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
580 m_textBox = nextTextBox;
581 if (renderer->containsReversedText())
582 ++m_sortedTextBoxesPosition;
583 return;
584 }
585 // Advance and continue
586 m_textBox = nextTextBox;
587 if (renderer->containsReversedText())
588 ++m_sortedTextBoxesPosition;
589 }
590 if (!m_textBox && m_remainingTextBox) {
591 m_textBox = m_remainingTextBox;
592 m_remainingTextBox = 0;
593 m_firstLetterText = 0;
594 m_offset = 0;
595 handleTextBox();
596 }
597 }
598
handleTextNodeFirstLetter(RenderTextFragment * renderer)599 void TextIterator::handleTextNodeFirstLetter(RenderTextFragment* renderer)
600 {
601 if (renderer->firstLetter()) {
602 RenderObject* r = renderer->firstLetter();
603 if (r->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
604 return;
605 for (RenderObject *currChild = r->firstChild(); currChild; currChild->nextSibling()) {
606 if (currChild->isText()) {
607 RenderText* firstLetter = toRenderText(currChild);
608 m_handledFirstLetter = true;
609 m_remainingTextBox = m_textBox;
610 m_textBox = firstLetter->firstTextBox();
611 m_firstLetterText = firstLetter;
612 return;
613 }
614 }
615 }
616 m_handledFirstLetter = true;
617 }
618
handleReplacedElement()619 bool TextIterator::handleReplacedElement()
620 {
621 if (m_fullyClippedStack.top())
622 return false;
623
624 RenderObject* renderer = m_node->renderer();
625 if (renderer->style()->visibility() != VISIBLE && !m_ignoresStyleVisibility)
626 return false;
627
628 if (m_lastTextNodeEndedWithCollapsedSpace) {
629 emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
630 return false;
631 }
632
633 if (m_entersTextControls && renderer->isTextControl()) {
634 if (HTMLElement* innerTextElement = toRenderTextControl(renderer)->innerTextElement()) {
635 m_node = innerTextElement->shadowTreeRootNode();
636 pushFullyClippedState(m_fullyClippedStack, m_node);
637 m_offset = 0;
638 return false;
639 }
640 }
641
642 m_hasEmitted = true;
643
644 if (m_emitsObjectReplacementCharacters && renderer && renderer->isReplaced()) {
645 emitCharacter(objectReplacementCharacter, m_node->parentNode(), m_node, 0, 1);
646 return true;
647 }
648
649 if (m_emitsCharactersBetweenAllVisiblePositions) {
650 // We want replaced elements to behave like punctuation for boundary
651 // finding, and to simply take up space for the selection preservation
652 // code in moveParagraphs, so we use a comma.
653 emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
654 return true;
655 }
656
657 m_positionNode = m_node->parentNode();
658 m_positionOffsetBaseNode = m_node;
659 m_positionStartOffset = 0;
660 m_positionEndOffset = 1;
661
662 m_textCharacters = 0;
663 m_textLength = 0;
664
665 m_lastCharacter = 0;
666
667 return true;
668 }
669
hasVisibleTextNode(RenderText * renderer)670 bool TextIterator::hasVisibleTextNode(RenderText* renderer)
671 {
672 if (renderer->style()->visibility() == VISIBLE)
673 return true;
674 if (renderer->isTextFragment()) {
675 RenderTextFragment* fragment = static_cast<RenderTextFragment*>(renderer);
676 if (fragment->firstLetter() && fragment->firstLetter()->style()->visibility() == VISIBLE)
677 return true;
678 }
679 return false;
680 }
681
shouldEmitTabBeforeNode(Node * node)682 static bool shouldEmitTabBeforeNode(Node* node)
683 {
684 RenderObject* r = node->renderer();
685
686 // Table cells are delimited by tabs.
687 if (!r || !isTableCell(node))
688 return false;
689
690 // Want a tab before every cell other than the first one
691 RenderTableCell* rc = toRenderTableCell(r);
692 RenderTable* t = rc->table();
693 return t && (t->cellBefore(rc) || t->cellAbove(rc));
694 }
695
shouldEmitNewlineForNode(Node * node)696 static bool shouldEmitNewlineForNode(Node* node)
697 {
698 // br elements are represented by a single newline.
699 RenderObject* r = node->renderer();
700 if (!r)
701 return node->hasTagName(brTag);
702
703 return r->isBR();
704 }
705
shouldEmitNewlinesBeforeAndAfterNode(Node * node)706 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
707 {
708 // Block flow (versus inline flow) is represented by having
709 // a newline both before and after the element.
710 RenderObject* r = node->renderer();
711 if (!r) {
712 return (node->hasTagName(blockquoteTag)
713 || node->hasTagName(ddTag)
714 || node->hasTagName(divTag)
715 || node->hasTagName(dlTag)
716 || node->hasTagName(dtTag)
717 || node->hasTagName(h1Tag)
718 || node->hasTagName(h2Tag)
719 || node->hasTagName(h3Tag)
720 || node->hasTagName(h4Tag)
721 || node->hasTagName(h5Tag)
722 || node->hasTagName(h6Tag)
723 || node->hasTagName(hrTag)
724 || node->hasTagName(liTag)
725 || node->hasTagName(listingTag)
726 || node->hasTagName(olTag)
727 || node->hasTagName(pTag)
728 || node->hasTagName(preTag)
729 || node->hasTagName(trTag)
730 || node->hasTagName(ulTag));
731 }
732
733 // Need to make an exception for table cells, because they are blocks, but we
734 // want them tab-delimited rather than having newlines before and after.
735 if (isTableCell(node))
736 return false;
737
738 // Need to make an exception for table row elements, because they are neither
739 // "inline" or "RenderBlock", but we want newlines for them.
740 if (r->isTableRow()) {
741 RenderTable* t = toRenderTableRow(r)->table();
742 if (t && !t->isInline())
743 return true;
744 }
745
746 return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
747 }
748
shouldEmitNewlineAfterNode(Node * node)749 static bool shouldEmitNewlineAfterNode(Node* node)
750 {
751 // FIXME: It should be better but slower to create a VisiblePosition here.
752 if (!shouldEmitNewlinesBeforeAndAfterNode(node))
753 return false;
754 // Check if this is the very last renderer in the document.
755 // If so, then we should not emit a newline.
756 while ((node = node->traverseNextSibling()))
757 if (node->renderer())
758 return true;
759 return false;
760 }
761
shouldEmitNewlineBeforeNode(Node * node)762 static bool shouldEmitNewlineBeforeNode(Node* node)
763 {
764 return shouldEmitNewlinesBeforeAndAfterNode(node);
765 }
766
shouldEmitExtraNewlineForNode(Node * node)767 static bool shouldEmitExtraNewlineForNode(Node* node)
768 {
769 // When there is a significant collapsed bottom margin, emit an extra
770 // newline for a more realistic result. We end up getting the right
771 // result even without margin collapsing. For example: <div><p>text</p></div>
772 // will work right even if both the <div> and the <p> have bottom margins.
773 RenderObject* r = node->renderer();
774 if (!r || !r->isBox())
775 return false;
776
777 // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
778 // not to do this at all
779 if (node->hasTagName(h1Tag)
780 || node->hasTagName(h2Tag)
781 || node->hasTagName(h3Tag)
782 || node->hasTagName(h4Tag)
783 || node->hasTagName(h5Tag)
784 || node->hasTagName(h6Tag)
785 || node->hasTagName(pTag)) {
786 RenderStyle* style = r->style();
787 if (style) {
788 int bottomMargin = toRenderBox(r)->collapsedMarginAfter();
789 int fontSize = style->fontDescription().computedPixelSize();
790 if (bottomMargin * 2 >= fontSize)
791 return true;
792 }
793 }
794
795 return false;
796 }
797
collapsedSpaceLength(RenderText * renderer,int textEnd)798 static int collapsedSpaceLength(RenderText* renderer, int textEnd)
799 {
800 const UChar* characters = renderer->text()->characters();
801 int length = renderer->text()->length();
802 for (int i = textEnd; i < length; ++i) {
803 if (!renderer->style()->isCollapsibleWhiteSpace(characters[i]))
804 return i - textEnd;
805 }
806
807 return length - textEnd;
808 }
809
maxOffsetIncludingCollapsedSpaces(Node * node)810 static int maxOffsetIncludingCollapsedSpaces(Node* node)
811 {
812 int offset = caretMaxOffset(node);
813
814 if (node->renderer() && node->renderer()->isText())
815 offset += collapsedSpaceLength(toRenderText(node->renderer()), offset);
816
817 return offset;
818 }
819
820 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
shouldRepresentNodeOffsetZero()821 bool TextIterator::shouldRepresentNodeOffsetZero()
822 {
823 if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
824 return true;
825
826 // Leave element positioned flush with start of a paragraph
827 // (e.g. do not insert tab before a table cell at the start of a paragraph)
828 if (m_lastCharacter == '\n')
829 return false;
830
831 // Otherwise, show the position if we have emitted any characters
832 if (m_hasEmitted)
833 return true;
834
835 // We've not emitted anything yet. Generally, there is no need for any positioning then.
836 // The only exception is when the element is visually not in the same line as
837 // the start of the range (e.g. the range starts at the end of the previous paragraph).
838 // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
839 // make quicker checks to possibly avoid that. Another check that we could make is
840 // is whether the inline vs block flow changed since the previous visible element.
841 // I think we're already in a special enough case that that won't be needed, tho.
842
843 // No character needed if this is the first node in the range.
844 if (m_node == m_startContainer)
845 return false;
846
847 // If we are outside the start container's subtree, assume we need to emit.
848 // FIXME: m_startContainer could be an inline block
849 if (!m_node->isDescendantOf(m_startContainer))
850 return true;
851
852 // If we started as m_startContainer offset 0 and the current node is a descendant of
853 // the start container, we already had enough context to correctly decide whether to
854 // emit after a preceding block. We chose not to emit (m_hasEmitted is false),
855 // so don't second guess that now.
856 // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
857 // immaterial since we likely would have already emitted something by now.
858 if (m_startOffset == 0)
859 return false;
860
861 // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
862 // Additionally, if the range we are iterating over contains huge sections of unrendered content,
863 // we would create VisiblePositions on every call to this function without this check.
864 if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
865 return false;
866
867 // The startPos.isNotNull() check is needed because the start could be before the body,
868 // and in that case we'll get null. We don't want to put in newlines at the start in that case.
869 // The currPos.isNotNull() check is needed because positions in non-HTML content
870 // (like SVG) do not have visible positions, and we don't want to emit for them either.
871 VisiblePosition startPos = VisiblePosition(Position(m_startContainer, m_startOffset, Position::PositionIsOffsetInAnchor), DOWNSTREAM);
872 VisiblePosition currPos = VisiblePosition(positionBeforeNode(m_node), DOWNSTREAM);
873 return startPos.isNotNull() && currPos.isNotNull() && !inSameLine(startPos, currPos);
874 }
875
shouldEmitSpaceBeforeAndAfterNode(Node * node)876 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
877 {
878 return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitsCharactersBetweenAllVisiblePositions);
879 }
880
representNodeOffsetZero()881 void TextIterator::representNodeOffsetZero()
882 {
883 // Emit a character to show the positioning of m_node.
884
885 // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
886 // create VisiblePositions, which is expensive. So, we perform the inexpensive checks
887 // on m_node to see if it necessitates emitting a character first and will early return
888 // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
889 if (shouldEmitTabBeforeNode(m_node)) {
890 if (shouldRepresentNodeOffsetZero())
891 emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
892 } else if (shouldEmitNewlineBeforeNode(m_node)) {
893 if (shouldRepresentNodeOffsetZero())
894 emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
895 } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
896 if (shouldRepresentNodeOffsetZero())
897 emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
898 }
899 }
900
handleNonTextNode()901 bool TextIterator::handleNonTextNode()
902 {
903 if (shouldEmitNewlineForNode(m_node))
904 emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
905 else if (m_emitsCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
906 emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
907 else
908 representNodeOffsetZero();
909
910 return true;
911 }
912
exitNode()913 void TextIterator::exitNode()
914 {
915 // prevent emitting a newline when exiting a collapsed block at beginning of the range
916 // FIXME: !m_hasEmitted does not necessarily mean there was a collapsed block... it could
917 // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
918 // therefore look like a blank line.
919 if (!m_hasEmitted)
920 return;
921
922 // Emit with a position *inside* m_node, after m_node's contents, in
923 // case it is a block, because the run should start where the
924 // emitted character is positioned visually.
925 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
926 // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
927 // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use
928 // TextIterator in _web_attributedStringFromRange.
929 // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
930 if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
931 // use extra newline to represent margin bottom, as needed
932 bool addNewline = shouldEmitExtraNewlineForNode(m_node);
933
934 // FIXME: We need to emit a '\n' as we leave an empty block(s) that
935 // contain a VisiblePosition when doing selection preservation.
936 if (m_lastCharacter != '\n') {
937 // insert a newline with a position following this block's contents.
938 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
939 // remember whether to later add a newline for the current node
940 ASSERT(!m_needsAnotherNewline);
941 m_needsAnotherNewline = addNewline;
942 } else if (addNewline)
943 // insert a newline with a position following this block's contents.
944 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
945 }
946
947 // If nothing was emitted, see if we need to emit a space.
948 if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
949 emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
950 }
951
emitCharacter(UChar c,Node * textNode,Node * offsetBaseNode,int textStartOffset,int textEndOffset)952 void TextIterator::emitCharacter(UChar c, Node* textNode, Node* offsetBaseNode, int textStartOffset, int textEndOffset)
953 {
954 m_hasEmitted = true;
955
956 // remember information with which to construct the TextIterator::range()
957 // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
958 m_positionNode = textNode;
959 m_positionOffsetBaseNode = offsetBaseNode;
960 m_positionStartOffset = textStartOffset;
961 m_positionEndOffset = textEndOffset;
962
963 // remember information with which to construct the TextIterator::characters() and length()
964 m_singleCharacterBuffer = c;
965 m_textCharacters = &m_singleCharacterBuffer;
966 m_textLength = 1;
967
968 // remember some iteration state
969 m_lastTextNodeEndedWithCollapsedSpace = false;
970 m_lastCharacter = c;
971 }
972
emitText(Node * textNode,RenderObject * renderObject,int textStartOffset,int textEndOffset)973 void TextIterator::emitText(Node* textNode, RenderObject* renderObject, int textStartOffset, int textEndOffset)
974 {
975 RenderText* renderer = toRenderText(renderObject);
976 m_text = m_emitsTextWithoutTranscoding ? renderer->textWithoutTranscoding() : renderer->text();
977 ASSERT(m_text.characters());
978
979 m_positionNode = textNode;
980 m_positionOffsetBaseNode = 0;
981 m_positionStartOffset = textStartOffset;
982 m_positionEndOffset = textEndOffset;
983 m_textCharacters = m_text.characters() + textStartOffset;
984 m_textLength = textEndOffset - textStartOffset;
985 m_lastCharacter = m_text[textEndOffset - 1];
986
987 m_lastTextNodeEndedWithCollapsedSpace = false;
988 m_hasEmitted = true;
989 }
990
emitText(Node * textNode,int textStartOffset,int textEndOffset)991 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
992 {
993 emitText(textNode, m_node->renderer(), textStartOffset, textEndOffset);
994 }
995
range() const996 PassRefPtr<Range> TextIterator::range() const
997 {
998 // use the current run information, if we have it
999 if (m_positionNode) {
1000 if (m_positionOffsetBaseNode) {
1001 int index = m_positionOffsetBaseNode->nodeIndex();
1002 m_positionStartOffset += index;
1003 m_positionEndOffset += index;
1004 m_positionOffsetBaseNode = 0;
1005 }
1006 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1007 }
1008
1009 // otherwise, return the end of the overall range we were given
1010 if (m_endContainer)
1011 return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
1012
1013 return 0;
1014 }
1015
node() const1016 Node* TextIterator::node() const
1017 {
1018 RefPtr<Range> textRange = range();
1019 if (!textRange)
1020 return 0;
1021
1022 Node* node = textRange->startContainer();
1023 if (!node)
1024 return 0;
1025 if (node->offsetInCharacters())
1026 return node;
1027
1028 return node->childNode(textRange->startOffset());
1029 }
1030
1031 // --------
1032
SimplifiedBackwardsTextIterator()1033 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator()
1034 : m_behavior(TextIteratorDefaultBehavior)
1035 , m_node(0)
1036 , m_positionNode(0)
1037 {
1038 }
1039
SimplifiedBackwardsTextIterator(const Range * r,TextIteratorBehavior behavior)1040 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range* r, TextIteratorBehavior behavior)
1041 : m_behavior(behavior)
1042 , m_node(0)
1043 , m_positionNode(0)
1044 {
1045 ASSERT(m_behavior == TextIteratorDefaultBehavior);
1046
1047 if (!r)
1048 return;
1049
1050 Node* startNode = r->startContainer();
1051 if (!startNode)
1052 return;
1053 Node* endNode = r->endContainer();
1054 int startOffset = r->startOffset();
1055 int endOffset = r->endOffset();
1056
1057 if (!startNode->offsetInCharacters()) {
1058 if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
1059 startNode = startNode->childNode(startOffset);
1060 startOffset = 0;
1061 }
1062 }
1063 if (!endNode->offsetInCharacters()) {
1064 if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
1065 endNode = endNode->childNode(endOffset - 1);
1066 endOffset = lastOffsetInNode(endNode);
1067 }
1068 }
1069
1070 m_node = endNode;
1071 setUpFullyClippedStack(m_fullyClippedStack, m_node);
1072 m_offset = endOffset;
1073 m_handledNode = false;
1074 m_handledChildren = endOffset == 0;
1075
1076 m_startNode = startNode;
1077 m_startOffset = startOffset;
1078 m_endNode = endNode;
1079 m_endOffset = endOffset;
1080
1081 #ifndef NDEBUG
1082 // Need this just because of the assert.
1083 m_positionNode = endNode;
1084 #endif
1085
1086 m_lastTextNode = 0;
1087 m_lastCharacter = '\n';
1088
1089 m_havePassedStartNode = false;
1090
1091 advance();
1092 }
1093
advance()1094 void SimplifiedBackwardsTextIterator::advance()
1095 {
1096 ASSERT(m_positionNode);
1097
1098 m_positionNode = 0;
1099 m_textLength = 0;
1100
1101 while (m_node && !m_havePassedStartNode) {
1102 // Don't handle node if we start iterating at [node, 0].
1103 if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
1104 RenderObject* renderer = m_node->renderer();
1105 if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
1106 // FIXME: What about CDATA_SECTION_NODE?
1107 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1108 m_handledNode = handleTextNode();
1109 } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
1110 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
1111 m_handledNode = handleReplacedElement();
1112 } else
1113 m_handledNode = handleNonTextNode();
1114 if (m_positionNode)
1115 return;
1116 }
1117
1118 if (!m_handledChildren && m_node->hasChildNodes()) {
1119 m_node = m_node->lastChild();
1120 pushFullyClippedState(m_fullyClippedStack, m_node);
1121 } else {
1122 // Exit empty containers as we pass over them or containers
1123 // where [container, 0] is where we started iterating.
1124 if (!m_handledNode
1125 && canHaveChildrenForEditing(m_node)
1126 && m_node->parentNode()
1127 && (!m_node->lastChild() || (m_node == m_endNode && !m_endOffset))) {
1128 exitNode();
1129 if (m_positionNode) {
1130 m_handledNode = true;
1131 m_handledChildren = true;
1132 return;
1133 }
1134 }
1135
1136 // Exit all other containers.
1137 while (!m_node->previousSibling()) {
1138 if (!advanceRespectingRange(m_node->parentOrHostNode()))
1139 break;
1140 m_fullyClippedStack.pop();
1141 exitNode();
1142 if (m_positionNode) {
1143 m_handledNode = true;
1144 m_handledChildren = true;
1145 return;
1146 }
1147 }
1148
1149 m_fullyClippedStack.pop();
1150 if (advanceRespectingRange(m_node->previousSibling()))
1151 pushFullyClippedState(m_fullyClippedStack, m_node);
1152 else
1153 m_node = 0;
1154 }
1155
1156 // For the purpose of word boundary detection,
1157 // we should iterate all visible text and trailing (collapsed) whitespaces.
1158 m_offset = m_node ? maxOffsetIncludingCollapsedSpaces(m_node) : 0;
1159 m_handledNode = false;
1160 m_handledChildren = false;
1161
1162 if (m_positionNode)
1163 return;
1164 }
1165 }
1166
handleTextNode()1167 bool SimplifiedBackwardsTextIterator::handleTextNode()
1168 {
1169 m_lastTextNode = m_node;
1170
1171 RenderText* renderer = toRenderText(m_node->renderer());
1172 String str = renderer->text();
1173
1174 if (!renderer->firstTextBox() && str.length() > 0)
1175 return true;
1176
1177 m_positionEndOffset = m_offset;
1178
1179 m_offset = (m_node == m_startNode) ? m_startOffset : 0;
1180 m_positionNode = m_node;
1181 m_positionStartOffset = m_offset;
1182 m_textLength = m_positionEndOffset - m_positionStartOffset;
1183 m_textCharacters = str.characters() + m_positionStartOffset;
1184
1185 m_lastCharacter = str[m_positionEndOffset - 1];
1186
1187 return true;
1188 }
1189
handleReplacedElement()1190 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
1191 {
1192 unsigned index = m_node->nodeIndex();
1193 // We want replaced elements to behave like punctuation for boundary
1194 // finding, and to simply take up space for the selection preservation
1195 // code in moveParagraphs, so we use a comma. Unconditionally emit
1196 // here because this iterator is only used for boundary finding.
1197 emitCharacter(',', m_node->parentNode(), index, index + 1);
1198 return true;
1199 }
1200
handleNonTextNode()1201 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
1202 {
1203 // We can use a linefeed in place of a tab because this simple iterator is only used to
1204 // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs.
1205 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineAfterNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1206 unsigned index = m_node->nodeIndex();
1207 // The start of this emitted range is wrong. Ensuring correctness would require
1208 // VisiblePositions and so would be slow. previousBoundary expects this.
1209 emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
1210 }
1211 return true;
1212 }
1213
exitNode()1214 void SimplifiedBackwardsTextIterator::exitNode()
1215 {
1216 if (shouldEmitNewlineForNode(m_node) || shouldEmitNewlineBeforeNode(m_node) || shouldEmitTabBeforeNode(m_node)) {
1217 // The start of this emitted range is wrong. Ensuring correctness would require
1218 // VisiblePositions and so would be slow. previousBoundary expects this.
1219 emitCharacter('\n', m_node, 0, 0);
1220 }
1221 }
1222
emitCharacter(UChar c,Node * node,int startOffset,int endOffset)1223 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node* node, int startOffset, int endOffset)
1224 {
1225 m_singleCharacterBuffer = c;
1226 m_positionNode = node;
1227 m_positionStartOffset = startOffset;
1228 m_positionEndOffset = endOffset;
1229 m_textCharacters = &m_singleCharacterBuffer;
1230 m_textLength = 1;
1231 m_lastCharacter = c;
1232 }
1233
advanceRespectingRange(Node * next)1234 bool SimplifiedBackwardsTextIterator::advanceRespectingRange(Node* next)
1235 {
1236 if (!next)
1237 return false;
1238 m_havePassedStartNode |= m_node == m_startNode;
1239 if (m_havePassedStartNode)
1240 return false;
1241 m_node = next;
1242 return true;
1243 }
1244
range() const1245 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
1246 {
1247 if (m_positionNode)
1248 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
1249
1250 return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
1251 }
1252
1253 // --------
1254
CharacterIterator()1255 CharacterIterator::CharacterIterator()
1256 : m_offset(0)
1257 , m_runOffset(0)
1258 , m_atBreak(true)
1259 {
1260 }
1261
CharacterIterator(const Range * r,TextIteratorBehavior behavior)1262 CharacterIterator::CharacterIterator(const Range* r, TextIteratorBehavior behavior)
1263 : m_offset(0)
1264 , m_runOffset(0)
1265 , m_atBreak(true)
1266 , m_textIterator(r, behavior)
1267 {
1268 while (!atEnd() && m_textIterator.length() == 0)
1269 m_textIterator.advance();
1270 }
1271
range() const1272 PassRefPtr<Range> CharacterIterator::range() const
1273 {
1274 RefPtr<Range> r = m_textIterator.range();
1275 if (!m_textIterator.atEnd()) {
1276 if (m_textIterator.length() <= 1) {
1277 ASSERT(m_runOffset == 0);
1278 } else {
1279 Node* n = r->startContainer();
1280 ASSERT(n == r->endContainer());
1281 int offset = r->startOffset() + m_runOffset;
1282 ExceptionCode ec = 0;
1283 r->setStart(n, offset, ec);
1284 r->setEnd(n, offset + 1, ec);
1285 ASSERT(!ec);
1286 }
1287 }
1288 return r.release();
1289 }
1290
advance(int count)1291 void CharacterIterator::advance(int count)
1292 {
1293 if (count <= 0) {
1294 ASSERT(count == 0);
1295 return;
1296 }
1297
1298 m_atBreak = false;
1299
1300 // easy if there is enough left in the current m_textIterator run
1301 int remaining = m_textIterator.length() - m_runOffset;
1302 if (count < remaining) {
1303 m_runOffset += count;
1304 m_offset += count;
1305 return;
1306 }
1307
1308 // exhaust the current m_textIterator run
1309 count -= remaining;
1310 m_offset += remaining;
1311
1312 // move to a subsequent m_textIterator run
1313 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1314 int runLength = m_textIterator.length();
1315 if (runLength == 0)
1316 m_atBreak = true;
1317 else {
1318 // see whether this is m_textIterator to use
1319 if (count < runLength) {
1320 m_runOffset = count;
1321 m_offset += count;
1322 return;
1323 }
1324
1325 // exhaust this m_textIterator run
1326 count -= runLength;
1327 m_offset += runLength;
1328 }
1329 }
1330
1331 // ran to the end of the m_textIterator... no more runs left
1332 m_atBreak = true;
1333 m_runOffset = 0;
1334 }
1335
string(int numChars)1336 String CharacterIterator::string(int numChars)
1337 {
1338 Vector<UChar> result;
1339 result.reserveInitialCapacity(numChars);
1340 while (numChars > 0 && !atEnd()) {
1341 int runSize = min(numChars, length());
1342 result.append(characters(), runSize);
1343 numChars -= runSize;
1344 advance(runSize);
1345 }
1346 return String::adopt(result);
1347 }
1348
characterSubrange(CharacterIterator & it,int offset,int length)1349 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1350 {
1351 it.advance(offset);
1352 RefPtr<Range> start = it.range();
1353
1354 if (length > 1)
1355 it.advance(length - 1);
1356 RefPtr<Range> end = it.range();
1357
1358 return Range::create(start->startContainer()->document(),
1359 start->startContainer(), start->startOffset(),
1360 end->endContainer(), end->endOffset());
1361 }
1362
BackwardsCharacterIterator()1363 BackwardsCharacterIterator::BackwardsCharacterIterator()
1364 : m_offset(0)
1365 , m_runOffset(0)
1366 , m_atBreak(true)
1367 {
1368 }
1369
BackwardsCharacterIterator(const Range * range,TextIteratorBehavior behavior)1370 BackwardsCharacterIterator::BackwardsCharacterIterator(const Range* range, TextIteratorBehavior behavior)
1371 : m_offset(0)
1372 , m_runOffset(0)
1373 , m_atBreak(true)
1374 , m_textIterator(range, behavior)
1375 {
1376 while (!atEnd() && !m_textIterator.length())
1377 m_textIterator.advance();
1378 }
1379
range() const1380 PassRefPtr<Range> BackwardsCharacterIterator::range() const
1381 {
1382 RefPtr<Range> r = m_textIterator.range();
1383 if (!m_textIterator.atEnd()) {
1384 if (m_textIterator.length() <= 1)
1385 ASSERT(m_runOffset == 0);
1386 else {
1387 Node* n = r->startContainer();
1388 ASSERT(n == r->endContainer());
1389 int offset = r->endOffset() - m_runOffset;
1390 ExceptionCode ec = 0;
1391 r->setStart(n, offset - 1, ec);
1392 r->setEnd(n, offset, ec);
1393 ASSERT(!ec);
1394 }
1395 }
1396 return r.release();
1397 }
1398
advance(int count)1399 void BackwardsCharacterIterator::advance(int count)
1400 {
1401 if (count <= 0) {
1402 ASSERT(!count);
1403 return;
1404 }
1405
1406 m_atBreak = false;
1407
1408 int remaining = m_textIterator.length() - m_runOffset;
1409 if (count < remaining) {
1410 m_runOffset += count;
1411 m_offset += count;
1412 return;
1413 }
1414
1415 count -= remaining;
1416 m_offset += remaining;
1417
1418 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1419 int runLength = m_textIterator.length();
1420 if (runLength == 0)
1421 m_atBreak = true;
1422 else {
1423 if (count < runLength) {
1424 m_runOffset = count;
1425 m_offset += count;
1426 return;
1427 }
1428
1429 count -= runLength;
1430 m_offset += runLength;
1431 }
1432 }
1433
1434 m_atBreak = true;
1435 m_runOffset = 0;
1436 }
1437
1438 // --------
1439
WordAwareIterator()1440 WordAwareIterator::WordAwareIterator()
1441 : m_previousText(0)
1442 , m_didLookAhead(false)
1443 {
1444 }
1445
WordAwareIterator(const Range * r)1446 WordAwareIterator::WordAwareIterator(const Range* r)
1447 : m_previousText(0)
1448 , m_didLookAhead(true) // so we consider the first chunk from the text iterator
1449 , m_textIterator(r)
1450 {
1451 advance(); // get in position over the first chunk of text
1452 }
1453
~WordAwareIterator()1454 WordAwareIterator::~WordAwareIterator()
1455 {
1456 }
1457
1458 // We're always in one of these modes:
1459 // - The current chunk in the text iterator is our current chunk
1460 // (typically its a piece of whitespace, or text that ended with whitespace)
1461 // - The previous chunk in the text iterator is our current chunk
1462 // (we looked ahead to the next chunk and found a word boundary)
1463 // - We built up our own chunk of text from many chunks from the text iterator
1464
1465 // FIXME: Performance could be bad for huge spans next to each other that don't fall on word boundaries.
1466
advance()1467 void WordAwareIterator::advance()
1468 {
1469 m_previousText = 0;
1470 m_buffer.clear(); // toss any old buffer we built up
1471
1472 // If last time we did a look-ahead, start with that looked-ahead chunk now
1473 if (!m_didLookAhead) {
1474 ASSERT(!m_textIterator.atEnd());
1475 m_textIterator.advance();
1476 }
1477 m_didLookAhead = false;
1478
1479 // Go to next non-empty chunk
1480 while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1481 m_textIterator.advance();
1482 m_range = m_textIterator.range();
1483
1484 if (m_textIterator.atEnd())
1485 return;
1486
1487 while (1) {
1488 // If this chunk ends in whitespace we can just use it as our chunk.
1489 if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1490 return;
1491
1492 // If this is the first chunk that failed, save it in previousText before look ahead
1493 if (m_buffer.isEmpty()) {
1494 m_previousText = m_textIterator.characters();
1495 m_previousLength = m_textIterator.length();
1496 }
1497
1498 // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff
1499 m_textIterator.advance();
1500 if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1501 m_didLookAhead = true;
1502 return;
1503 }
1504
1505 if (m_buffer.isEmpty()) {
1506 // Start gobbling chunks until we get to a suitable stopping point
1507 m_buffer.append(m_previousText, m_previousLength);
1508 m_previousText = 0;
1509 }
1510 m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1511 int exception = 0;
1512 m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1513 }
1514 }
1515
length() const1516 int WordAwareIterator::length() const
1517 {
1518 if (!m_buffer.isEmpty())
1519 return m_buffer.size();
1520 if (m_previousText)
1521 return m_previousLength;
1522 return m_textIterator.length();
1523 }
1524
characters() const1525 const UChar* WordAwareIterator::characters() const
1526 {
1527 if (!m_buffer.isEmpty())
1528 return m_buffer.data();
1529 if (m_previousText)
1530 return m_previousText;
1531 return m_textIterator.characters();
1532 }
1533
1534 // --------
1535
foldQuoteMarkOrSoftHyphen(UChar c)1536 static inline UChar foldQuoteMarkOrSoftHyphen(UChar c)
1537 {
1538 switch (c) {
1539 case hebrewPunctuationGershayim:
1540 case leftDoubleQuotationMark:
1541 case rightDoubleQuotationMark:
1542 return '"';
1543 case hebrewPunctuationGeresh:
1544 case leftSingleQuotationMark:
1545 case rightSingleQuotationMark:
1546 return '\'';
1547 case softHyphen:
1548 // Replace soft hyphen with an ignorable character so that their presence or absence will
1549 // not affect string comparison.
1550 return 0;
1551 default:
1552 return c;
1553 }
1554 }
1555
foldQuoteMarksAndSoftHyphens(String & s)1556 static inline void foldQuoteMarksAndSoftHyphens(String& s)
1557 {
1558 s.replace(hebrewPunctuationGeresh, '\'');
1559 s.replace(hebrewPunctuationGershayim, '"');
1560 s.replace(leftDoubleQuotationMark, '"');
1561 s.replace(leftSingleQuotationMark, '\'');
1562 s.replace(rightDoubleQuotationMark, '"');
1563 s.replace(rightSingleQuotationMark, '\'');
1564 // Replace soft hyphen with an ignorable character so that their presence or absence will
1565 // not affect string comparison.
1566 s.replace(softHyphen, 0);
1567 }
1568
1569 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1570
foldQuoteMarksAndSoftHyphens(UChar * data,size_t length)1571 static inline void foldQuoteMarksAndSoftHyphens(UChar* data, size_t length)
1572 {
1573 for (size_t i = 0; i < length; ++i)
1574 data[i] = foldQuoteMarkOrSoftHyphen(data[i]);
1575 }
1576
1577 static const size_t minimumSearchBufferSize = 8192;
1578
1579 #ifndef NDEBUG
1580 static bool searcherInUse;
1581 #endif
1582
createSearcher()1583 static UStringSearch* createSearcher()
1584 {
1585 // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1586 // but it doesn't matter exactly what it is, since we don't perform any searches
1587 // without setting both the pattern and the text.
1588 UErrorCode status = U_ZERO_ERROR;
1589 UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, currentSearchLocaleID(), 0, &status);
1590 ASSERT(status == U_ZERO_ERROR || status == U_USING_FALLBACK_WARNING || status == U_USING_DEFAULT_WARNING);
1591 return searcher;
1592 }
1593
searcher()1594 static UStringSearch* searcher()
1595 {
1596 static UStringSearch* searcher = createSearcher();
1597 return searcher;
1598 }
1599
lockSearcher()1600 static inline void lockSearcher()
1601 {
1602 #ifndef NDEBUG
1603 ASSERT(!searcherInUse);
1604 searcherInUse = true;
1605 #endif
1606 }
1607
unlockSearcher()1608 static inline void unlockSearcher()
1609 {
1610 #ifndef NDEBUG
1611 ASSERT(searcherInUse);
1612 searcherInUse = false;
1613 #endif
1614 }
1615
1616 // ICU's search ignores the distinction between small kana letters and ones
1617 // that are not small, and also characters that differ only in the voicing
1618 // marks when considering only primary collation strength diffrences.
1619 // This is not helpful for end users, since these differences make words
1620 // distinct, so for our purposes we need these to be considered.
1621 // The Unicode folks do not think the collation algorithm should be
1622 // changed. To work around this, we would like to tailor the ICU searcher,
1623 // but we can't get that to work yet. So instead, we check for cases where
1624 // these differences occur, and skip those matches.
1625
1626 // We refer to the above technique as the "kana workaround". The next few
1627 // functions are helper functinos for the kana workaround.
1628
isKanaLetter(UChar character)1629 static inline bool isKanaLetter(UChar character)
1630 {
1631 // Hiragana letters.
1632 if (character >= 0x3041 && character <= 0x3096)
1633 return true;
1634
1635 // Katakana letters.
1636 if (character >= 0x30A1 && character <= 0x30FA)
1637 return true;
1638 if (character >= 0x31F0 && character <= 0x31FF)
1639 return true;
1640
1641 // Halfwidth katakana letters.
1642 if (character >= 0xFF66 && character <= 0xFF9D && character != 0xFF70)
1643 return true;
1644
1645 return false;
1646 }
1647
isSmallKanaLetter(UChar character)1648 static inline bool isSmallKanaLetter(UChar character)
1649 {
1650 ASSERT(isKanaLetter(character));
1651
1652 switch (character) {
1653 case 0x3041: // HIRAGANA LETTER SMALL A
1654 case 0x3043: // HIRAGANA LETTER SMALL I
1655 case 0x3045: // HIRAGANA LETTER SMALL U
1656 case 0x3047: // HIRAGANA LETTER SMALL E
1657 case 0x3049: // HIRAGANA LETTER SMALL O
1658 case 0x3063: // HIRAGANA LETTER SMALL TU
1659 case 0x3083: // HIRAGANA LETTER SMALL YA
1660 case 0x3085: // HIRAGANA LETTER SMALL YU
1661 case 0x3087: // HIRAGANA LETTER SMALL YO
1662 case 0x308E: // HIRAGANA LETTER SMALL WA
1663 case 0x3095: // HIRAGANA LETTER SMALL KA
1664 case 0x3096: // HIRAGANA LETTER SMALL KE
1665 case 0x30A1: // KATAKANA LETTER SMALL A
1666 case 0x30A3: // KATAKANA LETTER SMALL I
1667 case 0x30A5: // KATAKANA LETTER SMALL U
1668 case 0x30A7: // KATAKANA LETTER SMALL E
1669 case 0x30A9: // KATAKANA LETTER SMALL O
1670 case 0x30C3: // KATAKANA LETTER SMALL TU
1671 case 0x30E3: // KATAKANA LETTER SMALL YA
1672 case 0x30E5: // KATAKANA LETTER SMALL YU
1673 case 0x30E7: // KATAKANA LETTER SMALL YO
1674 case 0x30EE: // KATAKANA LETTER SMALL WA
1675 case 0x30F5: // KATAKANA LETTER SMALL KA
1676 case 0x30F6: // KATAKANA LETTER SMALL KE
1677 case 0x31F0: // KATAKANA LETTER SMALL KU
1678 case 0x31F1: // KATAKANA LETTER SMALL SI
1679 case 0x31F2: // KATAKANA LETTER SMALL SU
1680 case 0x31F3: // KATAKANA LETTER SMALL TO
1681 case 0x31F4: // KATAKANA LETTER SMALL NU
1682 case 0x31F5: // KATAKANA LETTER SMALL HA
1683 case 0x31F6: // KATAKANA LETTER SMALL HI
1684 case 0x31F7: // KATAKANA LETTER SMALL HU
1685 case 0x31F8: // KATAKANA LETTER SMALL HE
1686 case 0x31F9: // KATAKANA LETTER SMALL HO
1687 case 0x31FA: // KATAKANA LETTER SMALL MU
1688 case 0x31FB: // KATAKANA LETTER SMALL RA
1689 case 0x31FC: // KATAKANA LETTER SMALL RI
1690 case 0x31FD: // KATAKANA LETTER SMALL RU
1691 case 0x31FE: // KATAKANA LETTER SMALL RE
1692 case 0x31FF: // KATAKANA LETTER SMALL RO
1693 case 0xFF67: // HALFWIDTH KATAKANA LETTER SMALL A
1694 case 0xFF68: // HALFWIDTH KATAKANA LETTER SMALL I
1695 case 0xFF69: // HALFWIDTH KATAKANA LETTER SMALL U
1696 case 0xFF6A: // HALFWIDTH KATAKANA LETTER SMALL E
1697 case 0xFF6B: // HALFWIDTH KATAKANA LETTER SMALL O
1698 case 0xFF6C: // HALFWIDTH KATAKANA LETTER SMALL YA
1699 case 0xFF6D: // HALFWIDTH KATAKANA LETTER SMALL YU
1700 case 0xFF6E: // HALFWIDTH KATAKANA LETTER SMALL YO
1701 case 0xFF6F: // HALFWIDTH KATAKANA LETTER SMALL TU
1702 return true;
1703 }
1704 return false;
1705 }
1706
1707 enum VoicedSoundMarkType { NoVoicedSoundMark, VoicedSoundMark, SemiVoicedSoundMark };
1708
composedVoicedSoundMark(UChar character)1709 static inline VoicedSoundMarkType composedVoicedSoundMark(UChar character)
1710 {
1711 ASSERT(isKanaLetter(character));
1712
1713 switch (character) {
1714 case 0x304C: // HIRAGANA LETTER GA
1715 case 0x304E: // HIRAGANA LETTER GI
1716 case 0x3050: // HIRAGANA LETTER GU
1717 case 0x3052: // HIRAGANA LETTER GE
1718 case 0x3054: // HIRAGANA LETTER GO
1719 case 0x3056: // HIRAGANA LETTER ZA
1720 case 0x3058: // HIRAGANA LETTER ZI
1721 case 0x305A: // HIRAGANA LETTER ZU
1722 case 0x305C: // HIRAGANA LETTER ZE
1723 case 0x305E: // HIRAGANA LETTER ZO
1724 case 0x3060: // HIRAGANA LETTER DA
1725 case 0x3062: // HIRAGANA LETTER DI
1726 case 0x3065: // HIRAGANA LETTER DU
1727 case 0x3067: // HIRAGANA LETTER DE
1728 case 0x3069: // HIRAGANA LETTER DO
1729 case 0x3070: // HIRAGANA LETTER BA
1730 case 0x3073: // HIRAGANA LETTER BI
1731 case 0x3076: // HIRAGANA LETTER BU
1732 case 0x3079: // HIRAGANA LETTER BE
1733 case 0x307C: // HIRAGANA LETTER BO
1734 case 0x3094: // HIRAGANA LETTER VU
1735 case 0x30AC: // KATAKANA LETTER GA
1736 case 0x30AE: // KATAKANA LETTER GI
1737 case 0x30B0: // KATAKANA LETTER GU
1738 case 0x30B2: // KATAKANA LETTER GE
1739 case 0x30B4: // KATAKANA LETTER GO
1740 case 0x30B6: // KATAKANA LETTER ZA
1741 case 0x30B8: // KATAKANA LETTER ZI
1742 case 0x30BA: // KATAKANA LETTER ZU
1743 case 0x30BC: // KATAKANA LETTER ZE
1744 case 0x30BE: // KATAKANA LETTER ZO
1745 case 0x30C0: // KATAKANA LETTER DA
1746 case 0x30C2: // KATAKANA LETTER DI
1747 case 0x30C5: // KATAKANA LETTER DU
1748 case 0x30C7: // KATAKANA LETTER DE
1749 case 0x30C9: // KATAKANA LETTER DO
1750 case 0x30D0: // KATAKANA LETTER BA
1751 case 0x30D3: // KATAKANA LETTER BI
1752 case 0x30D6: // KATAKANA LETTER BU
1753 case 0x30D9: // KATAKANA LETTER BE
1754 case 0x30DC: // KATAKANA LETTER BO
1755 case 0x30F4: // KATAKANA LETTER VU
1756 case 0x30F7: // KATAKANA LETTER VA
1757 case 0x30F8: // KATAKANA LETTER VI
1758 case 0x30F9: // KATAKANA LETTER VE
1759 case 0x30FA: // KATAKANA LETTER VO
1760 return VoicedSoundMark;
1761 case 0x3071: // HIRAGANA LETTER PA
1762 case 0x3074: // HIRAGANA LETTER PI
1763 case 0x3077: // HIRAGANA LETTER PU
1764 case 0x307A: // HIRAGANA LETTER PE
1765 case 0x307D: // HIRAGANA LETTER PO
1766 case 0x30D1: // KATAKANA LETTER PA
1767 case 0x30D4: // KATAKANA LETTER PI
1768 case 0x30D7: // KATAKANA LETTER PU
1769 case 0x30DA: // KATAKANA LETTER PE
1770 case 0x30DD: // KATAKANA LETTER PO
1771 return SemiVoicedSoundMark;
1772 }
1773 return NoVoicedSoundMark;
1774 }
1775
isCombiningVoicedSoundMark(UChar character)1776 static inline bool isCombiningVoicedSoundMark(UChar character)
1777 {
1778 switch (character) {
1779 case 0x3099: // COMBINING KATAKANA-HIRAGANA VOICED SOUND MARK
1780 case 0x309A: // COMBINING KATAKANA-HIRAGANA SEMI-VOICED SOUND MARK
1781 return true;
1782 }
1783 return false;
1784 }
1785
containsKanaLetters(const String & pattern)1786 static inline bool containsKanaLetters(const String& pattern)
1787 {
1788 const UChar* characters = pattern.characters();
1789 unsigned length = pattern.length();
1790 for (unsigned i = 0; i < length; ++i) {
1791 if (isKanaLetter(characters[i]))
1792 return true;
1793 }
1794 return false;
1795 }
1796
normalizeCharacters(const UChar * characters,unsigned length,Vector<UChar> & buffer)1797 static void normalizeCharacters(const UChar* characters, unsigned length, Vector<UChar>& buffer)
1798 {
1799 ASSERT(length);
1800
1801 buffer.resize(length);
1802
1803 UErrorCode status = U_ZERO_ERROR;
1804 size_t bufferSize = unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), length, &status);
1805 ASSERT(status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING || status == U_BUFFER_OVERFLOW_ERROR);
1806 ASSERT(bufferSize);
1807
1808 buffer.resize(bufferSize);
1809
1810 if (status == U_ZERO_ERROR || status == U_STRING_NOT_TERMINATED_WARNING)
1811 return;
1812
1813 status = U_ZERO_ERROR;
1814 unorm_normalize(characters, length, UNORM_NFC, 0, buffer.data(), bufferSize, &status);
1815 ASSERT(status == U_STRING_NOT_TERMINATED_WARNING);
1816 }
1817
isNonLatin1Separator(UChar32 character)1818 static bool isNonLatin1Separator(UChar32 character)
1819 {
1820 ASSERT_ARG(character, character >= 256);
1821
1822 return U_GET_GC_MASK(character) & (U_GC_S_MASK | U_GC_P_MASK | U_GC_Z_MASK | U_GC_CF_MASK);
1823 }
1824
isSeparator(UChar32 character)1825 static inline bool isSeparator(UChar32 character)
1826 {
1827 static const bool latin1SeparatorTable[256] = {
1828 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1829 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1830 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // space ! " # $ % & ' ( ) * + , - . /
1831 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // : ; < = > ?
1832 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // @
1833 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, // [ \ ] ^ _
1834 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // `
1835 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // { | } ~
1836 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1837 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1838 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1839 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
1840 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1841 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
1842 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1843 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0
1844 };
1845
1846 if (character < 256)
1847 return latin1SeparatorTable[character];
1848
1849 return isNonLatin1Separator(character);
1850 }
1851
SearchBuffer(const String & target,FindOptions options)1852 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
1853 : m_target(target)
1854 , m_options(options)
1855 , m_prefixLength(0)
1856 , m_atBreak(true)
1857 , m_needsMoreContext(options & AtWordStarts)
1858 , m_targetRequiresKanaWorkaround(containsKanaLetters(m_target))
1859 {
1860 ASSERT(!m_target.isEmpty());
1861
1862 // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
1863 // of doing it in a separate replacement pass here, but ICU doesn't offer a way
1864 // to add tailoring on top of the locale-specific tailoring as of this writing.
1865 foldQuoteMarksAndSoftHyphens(m_target);
1866
1867 size_t targetLength = m_target.length();
1868 m_buffer.reserveInitialCapacity(max(targetLength * 8, minimumSearchBufferSize));
1869 m_overlap = m_buffer.capacity() / 4;
1870
1871 if ((m_options & AtWordStarts) && targetLength) {
1872 UChar32 targetFirstCharacter;
1873 U16_GET(m_target.characters(), 0, 0, targetLength, targetFirstCharacter);
1874 // Characters in the separator category never really occur at the beginning of a word,
1875 // so if the target begins with such a character, we just ignore the AtWordStart option.
1876 if (isSeparator(targetFirstCharacter)) {
1877 m_options &= ~AtWordStarts;
1878 m_needsMoreContext = false;
1879 }
1880 }
1881
1882 // Grab the single global searcher.
1883 // If we ever have a reason to do more than once search buffer at once, we'll have
1884 // to move to multiple searchers.
1885 lockSearcher();
1886
1887 UStringSearch* searcher = WebCore::searcher();
1888 UCollator* collator = usearch_getCollator(searcher);
1889
1890 UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
1891 if (ucol_getStrength(collator) != strength) {
1892 ucol_setStrength(collator, strength);
1893 usearch_reset(searcher);
1894 }
1895
1896 UErrorCode status = U_ZERO_ERROR;
1897 usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1898 ASSERT(status == U_ZERO_ERROR);
1899
1900 // The kana workaround requires a normalized copy of the target string.
1901 if (m_targetRequiresKanaWorkaround)
1902 normalizeCharacters(m_target.characters(), m_target.length(), m_normalizedTarget);
1903 }
1904
~SearchBuffer()1905 inline SearchBuffer::~SearchBuffer()
1906 {
1907 // Leave the static object pointing to a valid string.
1908 UErrorCode status = U_ZERO_ERROR;
1909 usearch_setPattern(WebCore::searcher(), &newlineCharacter, 1, &status);
1910 ASSERT(status == U_ZERO_ERROR);
1911
1912 unlockSearcher();
1913 }
1914
append(const UChar * characters,size_t length)1915 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
1916 {
1917 ASSERT(length);
1918
1919 if (m_atBreak) {
1920 m_buffer.shrink(0);
1921 m_prefixLength = 0;
1922 m_atBreak = false;
1923 } else if (m_buffer.size() == m_buffer.capacity()) {
1924 memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
1925 m_prefixLength -= min(m_prefixLength, m_buffer.size() - m_overlap);
1926 m_buffer.shrink(m_overlap);
1927 }
1928
1929 size_t oldLength = m_buffer.size();
1930 size_t usableLength = min(m_buffer.capacity() - oldLength, length);
1931 ASSERT(usableLength);
1932 m_buffer.append(characters, usableLength);
1933 foldQuoteMarksAndSoftHyphens(m_buffer.data() + oldLength, usableLength);
1934 return usableLength;
1935 }
1936
needsMoreContext() const1937 inline bool SearchBuffer::needsMoreContext() const
1938 {
1939 return m_needsMoreContext;
1940 }
1941
prependContext(const UChar * characters,size_t length)1942 inline void SearchBuffer::prependContext(const UChar* characters, size_t length)
1943 {
1944 ASSERT(m_needsMoreContext);
1945 ASSERT(m_prefixLength == m_buffer.size());
1946
1947 if (!length)
1948 return;
1949
1950 m_atBreak = false;
1951
1952 size_t wordBoundaryContextStart = length;
1953 if (wordBoundaryContextStart) {
1954 U16_BACK_1(characters, 0, wordBoundaryContextStart);
1955 wordBoundaryContextStart = startOfLastWordBoundaryContext(characters, wordBoundaryContextStart);
1956 }
1957
1958 size_t usableLength = min(m_buffer.capacity() - m_prefixLength, length - wordBoundaryContextStart);
1959 m_buffer.prepend(characters + length - usableLength, usableLength);
1960 m_prefixLength += usableLength;
1961
1962 if (wordBoundaryContextStart || m_prefixLength == m_buffer.capacity())
1963 m_needsMoreContext = false;
1964 }
1965
atBreak() const1966 inline bool SearchBuffer::atBreak() const
1967 {
1968 return m_atBreak;
1969 }
1970
reachedBreak()1971 inline void SearchBuffer::reachedBreak()
1972 {
1973 m_atBreak = true;
1974 }
1975
isBadMatch(const UChar * match,size_t matchLength) const1976 inline bool SearchBuffer::isBadMatch(const UChar* match, size_t matchLength) const
1977 {
1978 // This function implements the kana workaround. If usearch treats
1979 // it as a match, but we do not want to, then it's a "bad match".
1980 if (!m_targetRequiresKanaWorkaround)
1981 return false;
1982
1983 // Normalize into a match buffer. We reuse a single buffer rather than
1984 // creating a new one each time.
1985 normalizeCharacters(match, matchLength, m_normalizedMatch);
1986
1987 const UChar* a = m_normalizedTarget.begin();
1988 const UChar* aEnd = m_normalizedTarget.end();
1989
1990 const UChar* b = m_normalizedMatch.begin();
1991 const UChar* bEnd = m_normalizedMatch.end();
1992
1993 while (true) {
1994 // Skip runs of non-kana-letter characters. This is necessary so we can
1995 // correctly handle strings where the target and match have different-length
1996 // runs of characters that match, while still double checking the correctness
1997 // of matches of kana letters with other kana letters.
1998 while (a != aEnd && !isKanaLetter(*a))
1999 ++a;
2000 while (b != bEnd && !isKanaLetter(*b))
2001 ++b;
2002
2003 // If we reached the end of either the target or the match, we should have
2004 // reached the end of both; both should have the same number of kana letters.
2005 if (a == aEnd || b == bEnd) {
2006 ASSERT(a == aEnd);
2007 ASSERT(b == bEnd);
2008 return false;
2009 }
2010
2011 // Check for differences in the kana letter character itself.
2012 if (isSmallKanaLetter(*a) != isSmallKanaLetter(*b))
2013 return true;
2014 if (composedVoicedSoundMark(*a) != composedVoicedSoundMark(*b))
2015 return true;
2016 ++a;
2017 ++b;
2018
2019 // Check for differences in combining voiced sound marks found after the letter.
2020 while (1) {
2021 if (!(a != aEnd && isCombiningVoicedSoundMark(*a))) {
2022 if (b != bEnd && isCombiningVoicedSoundMark(*b))
2023 return true;
2024 break;
2025 }
2026 if (!(b != bEnd && isCombiningVoicedSoundMark(*b)))
2027 return true;
2028 if (*a != *b)
2029 return true;
2030 ++a;
2031 ++b;
2032 }
2033 }
2034 }
2035
isWordStartMatch(size_t start,size_t length) const2036 inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
2037 {
2038 ASSERT(m_options & AtWordStarts);
2039
2040 if (!start)
2041 return true;
2042
2043 if (m_options & TreatMedialCapitalAsWordStart) {
2044 int size = m_buffer.size();
2045 int offset = start;
2046 UChar32 firstCharacter;
2047 U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);
2048 UChar32 previousCharacter;
2049 U16_PREV(m_buffer.data(), 0, offset, previousCharacter);
2050
2051 if (isSeparator(firstCharacter)) {
2052 // The start of a separator run is a word start (".org" in "webkit.org").
2053 if (!isSeparator(previousCharacter))
2054 return true;
2055 } else if (isASCIIUpper(firstCharacter)) {
2056 // The start of an uppercase run is a word start ("Kit" in "WebKit").
2057 if (!isASCIIUpper(previousCharacter))
2058 return true;
2059 // The last character of an uppercase run followed by a non-separator, non-digit
2060 // is a word start ("Request" in "XMLHTTPRequest").
2061 offset = start;
2062 U16_FWD_1(m_buffer.data(), offset, size);
2063 UChar32 nextCharacter = 0;
2064 if (offset < size)
2065 U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
2066 if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
2067 return true;
2068 } else if (isASCIIDigit(firstCharacter)) {
2069 // The start of a digit run is a word start ("2" in "WebKit2").
2070 if (!isASCIIDigit(previousCharacter))
2071 return true;
2072 } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
2073 // The start of a non-separator, non-uppercase, non-digit run is a word start,
2074 // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
2075 return true;
2076 }
2077 }
2078
2079 size_t wordBreakSearchStart = start + length;
2080 while (wordBreakSearchStart > start)
2081 wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
2082 return wordBreakSearchStart == start;
2083 }
2084
search(size_t & start)2085 inline size_t SearchBuffer::search(size_t& start)
2086 {
2087 size_t size = m_buffer.size();
2088 if (m_atBreak) {
2089 if (!size)
2090 return 0;
2091 } else {
2092 if (size != m_buffer.capacity())
2093 return 0;
2094 }
2095
2096 UStringSearch* searcher = WebCore::searcher();
2097
2098 UErrorCode status = U_ZERO_ERROR;
2099 usearch_setText(searcher, m_buffer.data(), size, &status);
2100 ASSERT(status == U_ZERO_ERROR);
2101
2102 usearch_setOffset(searcher, m_prefixLength, &status);
2103 ASSERT(status == U_ZERO_ERROR);
2104
2105 int matchStart = usearch_next(searcher, &status);
2106 ASSERT(status == U_ZERO_ERROR);
2107
2108 nextMatch:
2109 if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
2110 ASSERT(matchStart == USEARCH_DONE);
2111 return 0;
2112 }
2113
2114 // Matches that start in the overlap area are only tentative.
2115 // The same match may appear later, matching more characters,
2116 // possibly including a combining character that's not yet in the buffer.
2117 if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
2118 size_t overlap = m_overlap;
2119 if (m_options & AtWordStarts) {
2120 // Ensure that there is sufficient context before matchStart the next time around for
2121 // determining if it is at a word boundary.
2122 int wordBoundaryContextStart = matchStart;
2123 U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart);
2124 wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart);
2125 overlap = min(size - 1, max(overlap, size - wordBoundaryContextStart));
2126 }
2127 memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar));
2128 m_prefixLength -= min(m_prefixLength, size - overlap);
2129 m_buffer.shrink(overlap);
2130 return 0;
2131 }
2132
2133 size_t matchedLength = usearch_getMatchedLength(searcher);
2134 ASSERT(matchStart + matchedLength <= size);
2135
2136 // If this match is "bad", move on to the next match.
2137 if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) {
2138 matchStart = usearch_next(searcher, &status);
2139 ASSERT(status == U_ZERO_ERROR);
2140 goto nextMatch;
2141 }
2142
2143 size_t newSize = size - (matchStart + 1);
2144 memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
2145 m_prefixLength -= min<size_t>(m_prefixLength, matchStart + 1);
2146 m_buffer.shrink(newSize);
2147
2148 start = size - matchStart;
2149 return matchedLength;
2150 }
2151
2152 #else // !ICU_UNICODE
2153
SearchBuffer(const String & target,FindOptions options)2154 inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
2155 : m_target(options & CaseInsensitive ? target.foldCase() : target)
2156 , m_options(options)
2157 , m_buffer(m_target.length())
2158 , m_isCharacterStartBuffer(m_target.length())
2159 , m_isBufferFull(false)
2160 , m_cursor(0)
2161 {
2162 ASSERT(!m_target.isEmpty());
2163 m_target.replace(noBreakSpace, ' ');
2164 foldQuoteMarksAndSoftHyphens(m_target);
2165 }
2166
~SearchBuffer()2167 inline SearchBuffer::~SearchBuffer()
2168 {
2169 }
2170
reachedBreak()2171 inline void SearchBuffer::reachedBreak()
2172 {
2173 m_cursor = 0;
2174 m_isBufferFull = false;
2175 }
2176
atBreak() const2177 inline bool SearchBuffer::atBreak() const
2178 {
2179 return !m_cursor && !m_isBufferFull;
2180 }
2181
append(UChar c,bool isStart)2182 inline void SearchBuffer::append(UChar c, bool isStart)
2183 {
2184 m_buffer[m_cursor] = c == noBreakSpace ? ' ' : foldQuoteMarkOrSoftHyphen(c);
2185 m_isCharacterStartBuffer[m_cursor] = isStart;
2186 if (++m_cursor == m_target.length()) {
2187 m_cursor = 0;
2188 m_isBufferFull = true;
2189 }
2190 }
2191
append(const UChar * characters,size_t length)2192 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
2193 {
2194 ASSERT(length);
2195 if (!(m_options & CaseInsensitive)) {
2196 append(characters[0], true);
2197 return 1;
2198 }
2199 const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
2200 UChar foldedCharacters[maxFoldedCharacters];
2201 bool error;
2202 int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
2203 ASSERT(!error);
2204 ASSERT(numFoldedCharacters);
2205 ASSERT(numFoldedCharacters <= maxFoldedCharacters);
2206 if (!error && numFoldedCharacters) {
2207 numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
2208 append(foldedCharacters[0], true);
2209 for (int i = 1; i < numFoldedCharacters; ++i)
2210 append(foldedCharacters[i], false);
2211 }
2212 return 1;
2213 }
2214
needsMoreContext() const2215 inline bool SearchBuffer::needsMoreContext() const
2216 {
2217 return false;
2218 }
2219
prependContext(const UChar *,size_t)2220 void SearchBuffer::prependContext(const UChar*, size_t)
2221 {
2222 ASSERT_NOT_REACHED();
2223 }
2224
search(size_t & start)2225 inline size_t SearchBuffer::search(size_t& start)
2226 {
2227 if (!m_isBufferFull)
2228 return 0;
2229 if (!m_isCharacterStartBuffer[m_cursor])
2230 return 0;
2231
2232 size_t tailSpace = m_target.length() - m_cursor;
2233 if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
2234 return 0;
2235 if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
2236 return 0;
2237
2238 start = length();
2239
2240 // Now that we've found a match once, we don't want to find it again, because those
2241 // are the SearchBuffer semantics, allowing for a buffer where you append more than one
2242 // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
2243 // we want to get rid of that in the future we could track this with a separate boolean
2244 // or even move the characters to the start of the buffer and set m_isBufferFull to false.
2245 m_isCharacterStartBuffer[m_cursor] = false;
2246
2247 return start;
2248 }
2249
2250 // Returns the number of characters that were appended to the buffer (what we are searching in).
2251 // That's not necessarily the same length as the passed-in target string, because case folding
2252 // can make two strings match even though they're not the same length.
length() const2253 size_t SearchBuffer::length() const
2254 {
2255 size_t bufferSize = m_target.length();
2256 size_t length = 0;
2257 for (size_t i = 0; i < bufferSize; ++i)
2258 length += m_isCharacterStartBuffer[i];
2259 return length;
2260 }
2261
2262 #endif // !ICU_UNICODE
2263
2264 // --------
2265
rangeLength(const Range * r,bool forSelectionPreservation)2266 int TextIterator::rangeLength(const Range* r, bool forSelectionPreservation)
2267 {
2268 int length = 0;
2269 for (TextIterator it(r, forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior); !it.atEnd(); it.advance())
2270 length += it.length();
2271
2272 return length;
2273 }
2274
subrange(Range * entireRange,int characterOffset,int characterCount)2275 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
2276 {
2277 CharacterIterator entireRangeIterator(entireRange);
2278 return characterSubrange(entireRangeIterator, characterOffset, characterCount);
2279 }
2280
rangeFromLocationAndLength(Element * scope,int rangeLocation,int rangeLength,bool forSelectionPreservation)2281 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element* scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
2282 {
2283 RefPtr<Range> resultRange = scope->document()->createRange();
2284
2285 int docTextPosition = 0;
2286 int rangeEnd = rangeLocation + rangeLength;
2287 bool startRangeFound = false;
2288
2289 RefPtr<Range> textRunRange;
2290
2291 TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation ? TextIteratorEmitsCharactersBetweenAllVisiblePositions : TextIteratorDefaultBehavior);
2292
2293 // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
2294 if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
2295 textRunRange = it.range();
2296
2297 ExceptionCode ec = 0;
2298 resultRange->setStart(textRunRange->startContainer(), 0, ec);
2299 ASSERT(!ec);
2300 resultRange->setEnd(textRunRange->startContainer(), 0, ec);
2301 ASSERT(!ec);
2302
2303 return resultRange.release();
2304 }
2305
2306 for (; !it.atEnd(); it.advance()) {
2307 int len = it.length();
2308 textRunRange = it.range();
2309
2310 bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
2311 bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
2312
2313 // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
2314 // in those cases that textRunRange is used.
2315 if (foundEnd) {
2316 // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
2317 // position for emitted '\n's.
2318 if (len == 1 && it.characters()[0] == '\n') {
2319 scope->document()->updateLayoutIgnorePendingStylesheets();
2320 it.advance();
2321 if (!it.atEnd()) {
2322 RefPtr<Range> range = it.range();
2323 ExceptionCode ec = 0;
2324 textRunRange->setEnd(range->startContainer(), range->startOffset(), ec);
2325 ASSERT(!ec);
2326 } else {
2327 Position runStart = textRunRange->startPosition();
2328 Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
2329 if (runEnd.isNotNull()) {
2330 ExceptionCode ec = 0;
2331 textRunRange->setEnd(runEnd.containerNode(), runEnd.computeOffsetInContainerNode(), ec);
2332 ASSERT(!ec);
2333 }
2334 }
2335 }
2336 }
2337
2338 if (foundStart) {
2339 startRangeFound = true;
2340 int exception = 0;
2341 if (textRunRange->startContainer()->isTextNode()) {
2342 int offset = rangeLocation - docTextPosition;
2343 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2344 } else {
2345 if (rangeLocation == docTextPosition)
2346 resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2347 else
2348 resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2349 }
2350 }
2351
2352 if (foundEnd) {
2353 int exception = 0;
2354 if (textRunRange->startContainer()->isTextNode()) {
2355 int offset = rangeEnd - docTextPosition;
2356 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
2357 } else {
2358 if (rangeEnd == docTextPosition)
2359 resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
2360 else
2361 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2362 }
2363 docTextPosition += len;
2364 break;
2365 }
2366 docTextPosition += len;
2367 }
2368
2369 if (!startRangeFound)
2370 return 0;
2371
2372 if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
2373 int exception = 0;
2374 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
2375 }
2376
2377 return resultRange.release();
2378 }
2379
locationAndLengthFromRange(const Range * range,size_t & location,size_t & length)2380 bool TextIterator::locationAndLengthFromRange(const Range* range, size_t& location, size_t& length)
2381 {
2382 location = notFound;
2383 length = 0;
2384
2385 if (!range->startContainer())
2386 return false;
2387
2388 Element* selectionRoot = range->ownerDocument()->frame()->selection()->rootEditableElement();
2389 Element* scope = selectionRoot ? selectionRoot : range->ownerDocument()->documentElement();
2390
2391 // The critical assumption is that this only gets called with ranges that
2392 // concentrate on a given area containing the selection root. This is done
2393 // because of text fields and textareas. The DOM for those is not
2394 // directly in the document DOM, so ensure that the range does not cross a
2395 // boundary of one of those.
2396 if (range->startContainer() != scope && !range->startContainer()->isDescendantOf(scope))
2397 return false;
2398 if (range->endContainer() != scope && !range->endContainer()->isDescendantOf(scope))
2399 return false;
2400
2401 RefPtr<Range> testRange = Range::create(scope->document(), scope, 0, range->startContainer(), range->startOffset());
2402 ASSERT(testRange->startContainer() == scope);
2403 location = TextIterator::rangeLength(testRange.get());
2404
2405 ExceptionCode ec;
2406 testRange->setEnd(range->endContainer(), range->endOffset(), ec);
2407 ASSERT(testRange->startContainer() == scope);
2408 length = TextIterator::rangeLength(testRange.get()) - location;
2409 return true;
2410 }
2411
2412 // --------
2413
plainTextToMallocAllocatedBuffer(const Range * r,unsigned & bufferLength,bool isDisplayString,TextIteratorBehavior defaultBehavior)2414 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString, TextIteratorBehavior defaultBehavior)
2415 {
2416 UChar* result = 0;
2417
2418 // Do this in pieces to avoid massive reallocations if there is a large amount of text.
2419 // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
2420 static const unsigned cMaxSegmentSize = 1 << 16;
2421 bufferLength = 0;
2422 typedef pair<UChar*, unsigned> TextSegment;
2423 OwnPtr<Vector<TextSegment> > textSegments;
2424 Vector<UChar> textBuffer;
2425 textBuffer.reserveInitialCapacity(cMaxSegmentSize);
2426 TextIteratorBehavior behavior = defaultBehavior;
2427 if (!isDisplayString)
2428 behavior = static_cast<TextIteratorBehavior>(behavior | TextIteratorEmitsTextsWithoutTranscoding);
2429
2430 for (TextIterator it(r, behavior); !it.atEnd(); it.advance()) {
2431 if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
2432 UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
2433 if (!newSegmentBuffer)
2434 goto exit;
2435 memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2436 if (!textSegments)
2437 textSegments = adoptPtr(new Vector<TextSegment>);
2438 textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
2439 textBuffer.clear();
2440 }
2441 textBuffer.append(it.characters(), it.length());
2442 bufferLength += it.length();
2443 }
2444
2445 if (!bufferLength)
2446 return 0;
2447
2448 // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
2449 result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
2450 if (!result)
2451 goto exit;
2452
2453 {
2454 UChar* resultPos = result;
2455 if (textSegments) {
2456 unsigned size = textSegments->size();
2457 for (unsigned i = 0; i < size; ++i) {
2458 const TextSegment& segment = textSegments->at(i);
2459 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
2460 resultPos += segment.second;
2461 }
2462 }
2463 memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
2464 }
2465
2466 exit:
2467 if (textSegments) {
2468 unsigned size = textSegments->size();
2469 for (unsigned i = 0; i < size; ++i)
2470 free(textSegments->at(i).first);
2471 }
2472
2473 if (isDisplayString && r->ownerDocument())
2474 r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
2475
2476 return result;
2477 }
2478
plainText(const Range * r,TextIteratorBehavior defaultBehavior)2479 String plainText(const Range* r, TextIteratorBehavior defaultBehavior)
2480 {
2481 unsigned length;
2482 UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false, defaultBehavior);
2483 if (!buf)
2484 return "";
2485 String result(buf, length);
2486 free(buf);
2487 return result;
2488 }
2489
isAllCollapsibleWhitespace(const String & string)2490 static inline bool isAllCollapsibleWhitespace(const String& string)
2491 {
2492 const UChar* characters = string.characters();
2493 unsigned length = string.length();
2494 for (unsigned i = 0; i < length; ++i) {
2495 if (!isCollapsibleWhitespace(characters[i]))
2496 return false;
2497 }
2498 return true;
2499 }
2500
collapsedToBoundary(const Range * range,bool forward)2501 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
2502 {
2503 ExceptionCode ec = 0;
2504 RefPtr<Range> result = range->cloneRange(ec);
2505 ASSERT(!ec);
2506 result->collapse(!forward, ec);
2507 ASSERT(!ec);
2508 return result.release();
2509 }
2510
findPlainText(CharacterIterator & it,const String & target,FindOptions options,size_t & matchStart)2511 static size_t findPlainText(CharacterIterator& it, const String& target, FindOptions options, size_t& matchStart)
2512 {
2513 matchStart = 0;
2514 size_t matchLength = 0;
2515
2516 SearchBuffer buffer(target, options);
2517
2518 if (buffer.needsMoreContext()) {
2519 RefPtr<Range> startRange = it.range();
2520 RefPtr<Range> beforeStartRange = startRange->ownerDocument()->createRange();
2521 ExceptionCode ec = 0;
2522 beforeStartRange->setEnd(startRange->startContainer(), startRange->startOffset(), ec);
2523 for (SimplifiedBackwardsTextIterator backwardsIterator(beforeStartRange.get()); !backwardsIterator.atEnd(); backwardsIterator.advance()) {
2524 buffer.prependContext(backwardsIterator.characters(), backwardsIterator.length());
2525 if (!buffer.needsMoreContext())
2526 break;
2527 }
2528 }
2529
2530 while (!it.atEnd()) {
2531 it.advance(buffer.append(it.characters(), it.length()));
2532 tryAgain:
2533 size_t matchStartOffset;
2534 if (size_t newMatchLength = buffer.search(matchStartOffset)) {
2535 // Note that we found a match, and where we found it.
2536 size_t lastCharacterInBufferOffset = it.characterOffset();
2537 ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
2538 matchStart = lastCharacterInBufferOffset - matchStartOffset;
2539 matchLength = newMatchLength;
2540 // If searching forward, stop on the first match.
2541 // If searching backward, don't stop, so we end up with the last match.
2542 if (!(options & Backwards))
2543 break;
2544 goto tryAgain;
2545 }
2546 if (it.atBreak() && !buffer.atBreak()) {
2547 buffer.reachedBreak();
2548 goto tryAgain;
2549 }
2550 }
2551
2552 return matchLength;
2553 }
2554
findPlainText(const Range * range,const String & target,FindOptions options)2555 PassRefPtr<Range> findPlainText(const Range* range, const String& target, FindOptions options)
2556 {
2557 // First, find the text.
2558 size_t matchStart;
2559 size_t matchLength;
2560 {
2561 CharacterIterator findIterator(range, TextIteratorEntersTextControls);
2562 matchLength = findPlainText(findIterator, target, options, matchStart);
2563 if (!matchLength)
2564 return collapsedToBoundary(range, !(options & Backwards));
2565 }
2566
2567 // Then, find the document position of the start and the end of the text.
2568 CharacterIterator computeRangeIterator(range, TextIteratorEntersTextControls);
2569 return characterSubrange(computeRangeIterator, matchStart, matchLength);
2570 }
2571
2572 }
2573