1 /*
2 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
3 * Copyright (C) 2005 Alexey Proskuryakov.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY APPLE COMPUTER, INC. ``AS IS'' AND ANY
15 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
17 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR
18 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
19 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
20 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
21 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
22 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
24 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25 */
26
27 #include "config.h"
28 #include "TextIterator.h"
29
30 #include "CharacterNames.h"
31 #include "Document.h"
32 #include "Element.h"
33 #include "HTMLNames.h"
34 #include "htmlediting.h"
35 #include "InlineTextBox.h"
36 #include "Position.h"
37 #include "Range.h"
38 #include "RenderTableCell.h"
39 #include "RenderTableRow.h"
40 #include "RenderTextControl.h"
41 #include "VisiblePosition.h"
42 #include "visible_units.h"
43
44 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
45 #include <unicode/usearch.h>
46 #endif
47
48 using namespace WTF::Unicode;
49 using namespace std;
50
51 namespace WebCore {
52
53 using namespace HTMLNames;
54
55 // Buffer that knows how to compare with a search target.
56 // Keeps enough of the previous text to be able to search in the future, but no more.
57 // Non-breaking spaces are always equal to normal spaces.
58 // Case folding is also done if <isCaseSensitive> is false.
59 class SearchBuffer : Noncopyable {
60 public:
61 SearchBuffer(const String& target, bool isCaseSensitive);
62 ~SearchBuffer();
63
64 // Returns number of characters appended; guaranteed to be in the range [1, length].
65 size_t append(const UChar*, size_t length);
66 void reachedBreak();
67
68 // Result is the size in characters of what was found.
69 // And <startOffset> is the number of characters back to the start of what was found.
70 size_t search(size_t& startOffset);
71 bool atBreak() const;
72
73 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
74
75 private:
76 String m_target;
77 Vector<UChar> m_buffer;
78 size_t m_overlap;
79 bool m_atBreak;
80
81 #else
82
83 private:
84 void append(UChar, bool isCharacterStart);
85 size_t length() const;
86
87 String m_target;
88 bool m_isCaseSensitive;
89
90 Vector<UChar> m_buffer;
91 Vector<bool> m_isCharacterStartBuffer;
92 bool m_isBufferFull;
93 size_t m_cursor;
94
95 #endif
96 };
97
98 // --------
99
TextIterator()100 TextIterator::TextIterator()
101 : m_startContainer(0)
102 , m_startOffset(0)
103 , m_endContainer(0)
104 , m_endOffset(0)
105 , m_positionNode(0)
106 , m_lastCharacter(0)
107 , m_emitCharactersBetweenAllVisiblePositions(false)
108 , m_enterTextControls(false)
109 {
110 }
111
TextIterator(const Range * r,bool emitCharactersBetweenAllVisiblePositions,bool enterTextControls)112 TextIterator::TextIterator(const Range* r, bool emitCharactersBetweenAllVisiblePositions, bool enterTextControls)
113 : m_inShadowContent(false)
114 , m_startContainer(0)
115 , m_startOffset(0)
116 , m_endContainer(0)
117 , m_endOffset(0)
118 , m_positionNode(0)
119 , m_emitCharactersBetweenAllVisiblePositions(emitCharactersBetweenAllVisiblePositions)
120 , m_enterTextControls(enterTextControls)
121 {
122 if (!r)
123 return;
124
125 // get and validate the range endpoints
126 Node* startContainer = r->startContainer();
127 if (!startContainer)
128 return;
129 int startOffset = r->startOffset();
130 Node* endContainer = r->endContainer();
131 int endOffset = r->endOffset();
132
133 // Callers should be handing us well-formed ranges. If we discover that this isn't
134 // the case, we could consider changing this assertion to an early return.
135 ASSERT(r->boundaryPointsValid());
136
137 // remember range - this does not change
138 m_startContainer = startContainer;
139 m_startOffset = startOffset;
140 m_endContainer = endContainer;
141 m_endOffset = endOffset;
142
143 for (Node* n = startContainer; n; n = n->parentNode()) {
144 if (n->isShadowNode()) {
145 m_inShadowContent = true;
146 break;
147 }
148 }
149
150 // set up the current node for processing
151 m_node = r->firstNode();
152 if (m_node == 0)
153 return;
154 m_offset = m_node == m_startContainer ? m_startOffset : 0;
155 m_handledNode = false;
156 m_handledChildren = false;
157
158 // calculate first out of bounds node
159 m_pastEndNode = r->pastLastNode();
160
161 // initialize node processing state
162 m_needAnotherNewline = false;
163 m_textBox = 0;
164
165 // initialize record of previous node processing
166 m_haveEmitted = false;
167 m_lastTextNode = 0;
168 m_lastTextNodeEndedWithCollapsedSpace = false;
169 m_lastCharacter = 0;
170
171 #ifndef NDEBUG
172 // need this just because of the assert in advance()
173 m_positionNode = m_node;
174 #endif
175
176 // identify the first run
177 advance();
178 }
179
advance()180 void TextIterator::advance()
181 {
182 // reset the run information
183 m_positionNode = 0;
184 m_textLength = 0;
185
186 // handle remembered node that needed a newline after the text node's newline
187 if (m_needAnotherNewline) {
188 // Emit the extra newline, and position it *inside* m_node, after m_node's
189 // contents, in case it's a block, in the same way that we position the first
190 // newline. The range for the emitted newline should start where the line
191 // break begins.
192 // FIXME: It would be cleaner if we emitted two newlines during the last
193 // iteration, instead of using m_needAnotherNewline.
194 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
195 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
196 m_needAnotherNewline = false;
197 return;
198 }
199
200 // handle remembered text box
201 if (m_textBox) {
202 handleTextBox();
203 if (m_positionNode)
204 return;
205 }
206
207 while (m_node && m_node != m_pastEndNode) {
208 // if the range ends at offset 0 of an element, represent the
209 // position, but not the content, of that element e.g. if the
210 // node is a blockflow element, emit a newline that
211 // precedes the element
212 if (m_node == m_endContainer && m_endOffset == 0) {
213 representNodeOffsetZero();
214 m_node = 0;
215 return;
216 }
217
218 RenderObject *renderer = m_node->renderer();
219 if (!renderer) {
220 m_handledNode = true;
221 m_handledChildren = true;
222 } else {
223 // handle current node according to its type
224 if (!m_handledNode) {
225 if (renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) // FIXME: What about CDATA_SECTION_NODE?
226 m_handledNode = handleTextNode();
227 else if (renderer && (renderer->isImage() || renderer->isWidget() || (renderer->element() && renderer->element()->isControl())))
228 m_handledNode = handleReplacedElement();
229 else
230 m_handledNode = handleNonTextNode();
231 if (m_positionNode)
232 return;
233 }
234 }
235
236 // find a new current node to handle in depth-first manner,
237 // calling exitNode() as we come back thru a parent node
238 Node *next = m_handledChildren ? 0 : m_node->firstChild();
239 m_offset = 0;
240 if (!next) {
241 next = m_node->nextSibling();
242 if (!next) {
243 bool pastEnd = m_node->traverseNextNode() == m_pastEndNode;
244 Node* parentNode = m_node->parentNode();
245 if (!parentNode && m_inShadowContent) {
246 m_inShadowContent = false;
247 parentNode = m_node->shadowParentNode();
248 }
249 while (!next && parentNode) {
250 if (pastEnd && parentNode == m_endContainer || m_endContainer->isDescendantOf(parentNode))
251 return;
252 bool haveRenderer = m_node->renderer();
253 m_node = parentNode;
254 parentNode = m_node->parentNode();
255 if (!parentNode && m_inShadowContent) {
256 m_inShadowContent = false;
257 parentNode = m_node->shadowParentNode();
258 }
259 if (haveRenderer)
260 exitNode();
261 if (m_positionNode) {
262 m_handledNode = true;
263 m_handledChildren = true;
264 return;
265 }
266 next = m_node->nextSibling();
267 }
268 }
269 }
270
271 // set the new current node
272 m_node = next;
273 m_handledNode = false;
274 m_handledChildren = false;
275
276 // how would this ever be?
277 if (m_positionNode)
278 return;
279 }
280 }
281
compareBoxStart(const InlineTextBox * first,const InlineTextBox * second)282 static inline bool compareBoxStart(const InlineTextBox *first, const InlineTextBox *second)
283 {
284 return first->start() < second->start();
285 }
286
handleTextNode()287 bool TextIterator::handleTextNode()
288 {
289 RenderText* renderer = toRenderText(m_node->renderer());
290 if (renderer->style()->visibility() != VISIBLE)
291 return false;
292
293 m_lastTextNode = m_node;
294 String str = renderer->text();
295
296 // handle pre-formatted text
297 if (!renderer->style()->collapseWhiteSpace()) {
298 int runStart = m_offset;
299 if (m_lastTextNodeEndedWithCollapsedSpace) {
300 emitCharacter(' ', m_node, 0, runStart, runStart);
301 return false;
302 }
303 int strLength = str.length();
304 int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
305 int runEnd = min(strLength, end);
306
307 if (runStart >= runEnd)
308 return true;
309
310 emitText(m_node, runStart, runEnd);
311 return true;
312 }
313
314 if (!renderer->firstTextBox() && str.length() > 0) {
315 m_lastTextNodeEndedWithCollapsedSpace = true; // entire block is collapsed space
316 return true;
317 }
318
319 // Used when text boxes are out of order (Hebrew/Arabic w/ embeded LTR text)
320 if (renderer->containsReversedText()) {
321 m_sortedTextBoxes.clear();
322 for (InlineTextBox * textBox = renderer->firstTextBox(); textBox; textBox = textBox->nextTextBox()) {
323 m_sortedTextBoxes.append(textBox);
324 }
325 std::sort(m_sortedTextBoxes.begin(), m_sortedTextBoxes.end(), compareBoxStart);
326 m_sortedTextBoxesPosition = 0;
327 }
328
329 m_textBox = renderer->containsReversedText() ? m_sortedTextBoxes[0] : renderer->firstTextBox();
330 handleTextBox();
331 return true;
332 }
333
handleTextBox()334 void TextIterator::handleTextBox()
335 {
336 RenderText *renderer = toRenderText(m_node->renderer());
337 String str = renderer->text();
338 int start = m_offset;
339 int end = (m_node == m_endContainer) ? m_endOffset : INT_MAX;
340 while (m_textBox) {
341 int textBoxStart = m_textBox->start();
342 int runStart = max(textBoxStart, start);
343
344 // Check for collapsed space at the start of this run.
345 InlineTextBox *firstTextBox = renderer->containsReversedText() ? m_sortedTextBoxes[0] : renderer->firstTextBox();
346 bool needSpace = m_lastTextNodeEndedWithCollapsedSpace
347 || (m_textBox == firstTextBox && textBoxStart == runStart && runStart > 0);
348 if (needSpace && !isCollapsibleWhitespace(m_lastCharacter) && m_lastCharacter) {
349 if (m_lastTextNode == m_node && runStart > 0 && str[runStart - 1] == ' ') {
350 unsigned spaceRunStart = runStart - 1;
351 while (spaceRunStart > 0 && str[spaceRunStart - 1] == ' ')
352 --spaceRunStart;
353 emitText(m_node, spaceRunStart, spaceRunStart + 1);
354 } else
355 emitCharacter(' ', m_node, 0, runStart, runStart);
356 return;
357 }
358 int textBoxEnd = textBoxStart + m_textBox->len();
359 int runEnd = min(textBoxEnd, end);
360
361 // Determine what the next text box will be, but don't advance yet
362 InlineTextBox *nextTextBox = 0;
363 if (renderer->containsReversedText()) {
364 if (m_sortedTextBoxesPosition + 1 < m_sortedTextBoxes.size())
365 nextTextBox = m_sortedTextBoxes[m_sortedTextBoxesPosition + 1];
366 } else
367 nextTextBox = m_textBox->nextTextBox();
368
369 if (runStart < runEnd) {
370 // Handle either a single newline character (which becomes a space),
371 // or a run of characters that does not include a newline.
372 // This effectively translates newlines to spaces without copying the text.
373 if (str[runStart] == '\n') {
374 emitCharacter(' ', m_node, 0, runStart, runStart + 1);
375 m_offset = runStart + 1;
376 } else {
377 int subrunEnd = str.find('\n', runStart);
378 if (subrunEnd == -1 || subrunEnd > runEnd)
379 subrunEnd = runEnd;
380
381 m_offset = subrunEnd;
382 emitText(m_node, runStart, subrunEnd);
383 }
384
385 // If we are doing a subrun that doesn't go to the end of the text box,
386 // come back again to finish handling this text box; don't advance to the next one.
387 if (m_positionEndOffset < textBoxEnd)
388 return;
389
390 // Advance and return
391 int nextRunStart = nextTextBox ? nextTextBox->start() : str.length();
392 if (nextRunStart > runEnd)
393 m_lastTextNodeEndedWithCollapsedSpace = true; // collapsed space between runs or at the end
394 m_textBox = nextTextBox;
395 if (renderer->containsReversedText())
396 ++m_sortedTextBoxesPosition;
397 return;
398 }
399 // Advance and continue
400 m_textBox = nextTextBox;
401 if (renderer->containsReversedText())
402 ++m_sortedTextBoxesPosition;
403 }
404 }
405
handleReplacedElement()406 bool TextIterator::handleReplacedElement()
407 {
408 RenderObject* renderer = m_node->renderer();
409 if (renderer->style()->visibility() != VISIBLE)
410 return false;
411
412 if (m_lastTextNodeEndedWithCollapsedSpace) {
413 emitCharacter(' ', m_lastTextNode->parentNode(), m_lastTextNode, 1, 1);
414 return false;
415 }
416
417 if (m_enterTextControls && (renderer->isTextArea() || renderer->isTextField())) {
418 m_node = static_cast<RenderTextControl*>(renderer)->innerTextElement();
419 m_offset = 0;
420 m_inShadowContent = true;
421 return false;
422 }
423
424 m_haveEmitted = true;
425
426 if (m_emitCharactersBetweenAllVisiblePositions) {
427 // We want replaced elements to behave like punctuation for boundary
428 // finding, and to simply take up space for the selection preservation
429 // code in moveParagraphs, so we use a comma.
430 emitCharacter(',', m_node->parentNode(), m_node, 0, 1);
431 return true;
432 }
433
434 m_positionNode = m_node->parentNode();
435 m_positionOffsetBaseNode = m_node;
436 m_positionStartOffset = 0;
437 m_positionEndOffset = 1;
438
439 m_textCharacters = 0;
440 m_textLength = 0;
441
442 m_lastCharacter = 0;
443
444 return true;
445 }
446
shouldEmitTabBeforeNode(Node * node)447 static bool shouldEmitTabBeforeNode(Node* node)
448 {
449 RenderObject* r = node->renderer();
450
451 // Table cells are delimited by tabs.
452 if (!r || !isTableCell(node))
453 return false;
454
455 // Want a tab before every cell other than the first one
456 RenderTableCell* rc = static_cast<RenderTableCell*>(r);
457 RenderTable* t = rc->table();
458 return t && (t->cellBefore(rc) || t->cellAbove(rc));
459 }
460
shouldEmitNewlineForNode(Node * node)461 static bool shouldEmitNewlineForNode(Node* node)
462 {
463 // br elements are represented by a single newline.
464 RenderObject* r = node->renderer();
465 if (!r)
466 return node->hasTagName(brTag);
467
468 return r->isBR();
469 }
470
shouldEmitNewlinesBeforeAndAfterNode(Node * node)471 static bool shouldEmitNewlinesBeforeAndAfterNode(Node* node)
472 {
473 // Block flow (versus inline flow) is represented by having
474 // a newline both before and after the element.
475 RenderObject* r = node->renderer();
476 if (!r) {
477 return (node->hasTagName(blockquoteTag)
478 || node->hasTagName(ddTag)
479 || node->hasTagName(divTag)
480 || node->hasTagName(dlTag)
481 || node->hasTagName(dtTag)
482 || node->hasTagName(h1Tag)
483 || node->hasTagName(h2Tag)
484 || node->hasTagName(h3Tag)
485 || node->hasTagName(h4Tag)
486 || node->hasTagName(h5Tag)
487 || node->hasTagName(h6Tag)
488 || node->hasTagName(hrTag)
489 || node->hasTagName(liTag)
490 || node->hasTagName(listingTag)
491 || node->hasTagName(olTag)
492 || node->hasTagName(pTag)
493 || node->hasTagName(preTag)
494 || node->hasTagName(trTag)
495 || node->hasTagName(ulTag));
496 }
497
498 // Need to make an exception for table cells, because they are blocks, but we
499 // want them tab-delimited rather than having newlines before and after.
500 if (isTableCell(node))
501 return false;
502
503 // Need to make an exception for table row elements, because they are neither
504 // "inline" or "RenderBlock", but we want newlines for them.
505 if (r->isTableRow()) {
506 RenderTable* t = static_cast<RenderTableRow*>(r)->table();
507 if (t && !t->isInline())
508 return true;
509 }
510
511 return !r->isInline() && r->isRenderBlock() && !r->isFloatingOrPositioned() && !r->isBody();
512 }
513
shouldEmitNewlineAfterNode(Node * node)514 static bool shouldEmitNewlineAfterNode(Node* node)
515 {
516 // FIXME: It should be better but slower to create a VisiblePosition here.
517 if (!shouldEmitNewlinesBeforeAndAfterNode(node))
518 return false;
519 // Check if this is the very last renderer in the document.
520 // If so, then we should not emit a newline.
521 while ((node = node->traverseNextSibling()))
522 if (node->renderer())
523 return true;
524 return false;
525 }
526
shouldEmitNewlineBeforeNode(Node * node)527 static bool shouldEmitNewlineBeforeNode(Node* node)
528 {
529 return shouldEmitNewlinesBeforeAndAfterNode(node);
530 }
531
shouldEmitExtraNewlineForNode(Node * node)532 static bool shouldEmitExtraNewlineForNode(Node* node)
533 {
534 // When there is a significant collapsed bottom margin, emit an extra
535 // newline for a more realistic result. We end up getting the right
536 // result even without margin collapsing. For example: <div><p>text</p></div>
537 // will work right even if both the <div> and the <p> have bottom margins.
538 RenderObject* r = node->renderer();
539 if (!r || !r->isBox())
540 return false;
541
542 // NOTE: We only do this for a select set of nodes, and fwiw WinIE appears
543 // not to do this at all
544 if (node->hasTagName(h1Tag)
545 || node->hasTagName(h2Tag)
546 || node->hasTagName(h3Tag)
547 || node->hasTagName(h4Tag)
548 || node->hasTagName(h5Tag)
549 || node->hasTagName(h6Tag)
550 || node->hasTagName(pTag)) {
551 RenderStyle* style = r->style();
552 if (style) {
553 int bottomMargin = toRenderBox(r)->collapsedMarginBottom();
554 int fontSize = style->fontDescription().computedPixelSize();
555 if (bottomMargin * 2 >= fontSize)
556 return true;
557 }
558 }
559
560 return false;
561 }
562
563 // Whether or not we should emit a character as we enter m_node (if it's a container) or as we hit it (if it's atomic).
shouldRepresentNodeOffsetZero()564 bool TextIterator::shouldRepresentNodeOffsetZero()
565 {
566 if (m_emitCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isTable())
567 return true;
568
569 // Leave element positioned flush with start of a paragraph
570 // (e.g. do not insert tab before a table cell at the start of a paragraph)
571 if (m_lastCharacter == '\n')
572 return false;
573
574 // Otherwise, show the position if we have emitted any characters
575 if (m_haveEmitted)
576 return true;
577
578 // We've not emitted anything yet. Generally, there is no need for any positioning then.
579 // The only exception is when the element is visually not in the same line as
580 // the start of the range (e.g. the range starts at the end of the previous paragraph).
581 // NOTE: Creating VisiblePositions and comparing them is relatively expensive, so we
582 // make quicker checks to possibly avoid that. Another check that we could make is
583 // is whether the inline vs block flow changed since the previous visible element.
584 // I think we're already in a special enough case that that won't be needed, tho.
585
586 // No character needed if this is the first node in the range.
587 if (m_node == m_startContainer)
588 return false;
589
590 // If we are outside the start container's subtree, assume we need to emit.
591 // FIXME: m_startContainer could be an inline block
592 if (!m_node->isDescendantOf(m_startContainer))
593 return true;
594
595 // If we started as m_startContainer offset 0 and the current node is a descendant of
596 // the start container, we already had enough context to correctly decide whether to
597 // emit after a preceding block. We chose not to emit (m_haveEmitted is false),
598 // so don't second guess that now.
599 // NOTE: Is this really correct when m_node is not a leftmost descendant? Probably
600 // immaterial since we likely would have already emitted something by now.
601 if (m_startOffset == 0)
602 return false;
603
604 // If this node is unrendered or invisible the VisiblePosition checks below won't have much meaning.
605 // Additionally, if the range we are iterating over contains huge sections of unrendered content,
606 // we would create VisiblePositions on every call to this function without this check.
607 if (!m_node->renderer() || m_node->renderer()->style()->visibility() != VISIBLE)
608 return false;
609
610 // The currPos.isNotNull() check is needed because positions in non-html content
611 // (like svg) do not have visible positions, and we don't want to emit for them either.
612 VisiblePosition startPos = VisiblePosition(m_startContainer, m_startOffset, DOWNSTREAM);
613 VisiblePosition currPos = VisiblePosition(m_node, 0, DOWNSTREAM);
614 return currPos.isNotNull() && !inSameLine(startPos, currPos);
615 }
616
shouldEmitSpaceBeforeAndAfterNode(Node * node)617 bool TextIterator::shouldEmitSpaceBeforeAndAfterNode(Node* node)
618 {
619 return node->renderer() && node->renderer()->isTable() && (node->renderer()->isInline() || m_emitCharactersBetweenAllVisiblePositions);
620 }
621
representNodeOffsetZero()622 void TextIterator::representNodeOffsetZero()
623 {
624 // Emit a character to show the positioning of m_node.
625
626 // When we haven't been emitting any characters, shouldRepresentNodeOffsetZero() can
627 // create VisiblePositions, which is expensive. So, we perform the inexpensive checks
628 // on m_node to see if it necessitates emitting a character first and will early return
629 // before encountering shouldRepresentNodeOffsetZero()s worse case behavior.
630 if (shouldEmitTabBeforeNode(m_node)) {
631 if (shouldRepresentNodeOffsetZero())
632 emitCharacter('\t', m_node->parentNode(), m_node, 0, 0);
633 } else if (shouldEmitNewlineBeforeNode(m_node)) {
634 if (shouldRepresentNodeOffsetZero())
635 emitCharacter('\n', m_node->parentNode(), m_node, 0, 0);
636 } else if (shouldEmitSpaceBeforeAndAfterNode(m_node)) {
637 if (shouldRepresentNodeOffsetZero())
638 emitCharacter(' ', m_node->parentNode(), m_node, 0, 0);
639 }
640 }
641
handleNonTextNode()642 bool TextIterator::handleNonTextNode()
643 {
644 if (shouldEmitNewlineForNode(m_node))
645 emitCharacter('\n', m_node->parentNode(), m_node, 0, 1);
646 else if (m_emitCharactersBetweenAllVisiblePositions && m_node->renderer() && m_node->renderer()->isHR())
647 emitCharacter(' ', m_node->parentNode(), m_node, 0, 1);
648 else
649 representNodeOffsetZero();
650
651 return true;
652 }
653
exitNode()654 void TextIterator::exitNode()
655 {
656 // prevent emitting a newline when exiting a collapsed block at beginning of the range
657 // FIXME: !m_haveEmitted does not necessarily mean there was a collapsed block... it could
658 // have been an hr (e.g.). Also, a collapsed block could have height (e.g. a table) and
659 // therefore look like a blank line.
660 if (!m_haveEmitted)
661 return;
662
663 // Emit with a position *inside* m_node, after m_node's contents, in
664 // case it is a block, because the run should start where the
665 // emitted character is positioned visually.
666 Node* baseNode = m_node->lastChild() ? m_node->lastChild() : m_node;
667 // FIXME: This shouldn't require the m_lastTextNode to be true, but we can't change that without making
668 // the logic in _web_attributedStringFromRange match. We'll get that for free when we switch to use
669 // TextIterator in _web_attributedStringFromRange.
670 // See <rdar://problem/5428427> for an example of how this mismatch will cause problems.
671 if (m_lastTextNode && shouldEmitNewlineAfterNode(m_node)) {
672 // use extra newline to represent margin bottom, as needed
673 bool addNewline = shouldEmitExtraNewlineForNode(m_node);
674
675 // FIXME: We need to emit a '\n' as we leave an empty block(s) that
676 // contain a VisiblePosition when doing selection preservation.
677 if (m_lastCharacter != '\n') {
678 // insert a newline with a position following this block's contents.
679 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
680 // remember whether to later add a newline for the current node
681 ASSERT(!m_needAnotherNewline);
682 m_needAnotherNewline = addNewline;
683 } else if (addNewline)
684 // insert a newline with a position following this block's contents.
685 emitCharacter('\n', baseNode->parentNode(), baseNode, 1, 1);
686 }
687
688 // If nothing was emitted, see if we need to emit a space.
689 if (!m_positionNode && shouldEmitSpaceBeforeAndAfterNode(m_node))
690 emitCharacter(' ', baseNode->parentNode(), baseNode, 1, 1);
691 }
692
emitCharacter(UChar c,Node * textNode,Node * offsetBaseNode,int textStartOffset,int textEndOffset)693 void TextIterator::emitCharacter(UChar c, Node *textNode, Node *offsetBaseNode, int textStartOffset, int textEndOffset)
694 {
695 m_haveEmitted = true;
696
697 // remember information with which to construct the TextIterator::range()
698 // NOTE: textNode is often not a text node, so the range will specify child nodes of positionNode
699 m_positionNode = textNode;
700 m_positionOffsetBaseNode = offsetBaseNode;
701 m_positionStartOffset = textStartOffset;
702 m_positionEndOffset = textEndOffset;
703
704 // remember information with which to construct the TextIterator::characters() and length()
705 m_singleCharacterBuffer = c;
706 m_textCharacters = &m_singleCharacterBuffer;
707 m_textLength = 1;
708
709 // remember some iteration state
710 m_lastTextNodeEndedWithCollapsedSpace = false;
711 m_lastCharacter = c;
712 }
713
emitText(Node * textNode,int textStartOffset,int textEndOffset)714 void TextIterator::emitText(Node* textNode, int textStartOffset, int textEndOffset)
715 {
716 RenderText* renderer = toRenderText(m_node->renderer());
717 String str = renderer->text();
718 ASSERT(str.characters());
719
720 m_positionNode = textNode;
721 m_positionOffsetBaseNode = 0;
722 m_positionStartOffset = textStartOffset;
723 m_positionEndOffset = textEndOffset;
724 m_textCharacters = str.characters() + textStartOffset;
725 m_textLength = textEndOffset - textStartOffset;
726 m_lastCharacter = str[textEndOffset - 1];
727
728 m_lastTextNodeEndedWithCollapsedSpace = false;
729 m_haveEmitted = true;
730 }
731
range() const732 PassRefPtr<Range> TextIterator::range() const
733 {
734 // use the current run information, if we have it
735 if (m_positionNode) {
736 if (m_positionOffsetBaseNode) {
737 int index = m_positionOffsetBaseNode->nodeIndex();
738 m_positionStartOffset += index;
739 m_positionEndOffset += index;
740 m_positionOffsetBaseNode = 0;
741 }
742 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
743 }
744
745 // otherwise, return the end of the overall range we were given
746 if (m_endContainer)
747 return Range::create(m_endContainer->document(), m_endContainer, m_endOffset, m_endContainer, m_endOffset);
748
749 return 0;
750 }
751
node() const752 Node* TextIterator::node() const
753 {
754 RefPtr<Range> textRange = range();
755 if (!textRange)
756 return 0;
757
758 Node* node = textRange->startContainer();
759 if (!node)
760 return 0;
761 if (node->offsetInCharacters())
762 return node;
763
764 return node->childNode(textRange->startOffset());
765 }
766
767 // --------
768
SimplifiedBackwardsTextIterator()769 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator() : m_positionNode(0)
770 {
771 }
772
SimplifiedBackwardsTextIterator(const Range * r)773 SimplifiedBackwardsTextIterator::SimplifiedBackwardsTextIterator(const Range *r)
774 {
775 m_positionNode = 0;
776
777 if (!r)
778 return;
779
780 Node* startNode = r->startContainer();
781 if (!startNode)
782 return;
783 Node* endNode = r->endContainer();
784 int startOffset = r->startOffset();
785 int endOffset = r->endOffset();
786
787 if (!startNode->offsetInCharacters()) {
788 if (startOffset >= 0 && startOffset < static_cast<int>(startNode->childNodeCount())) {
789 startNode = startNode->childNode(startOffset);
790 startOffset = 0;
791 }
792 }
793 if (!endNode->offsetInCharacters()) {
794 if (endOffset > 0 && endOffset <= static_cast<int>(endNode->childNodeCount())) {
795 endNode = endNode->childNode(endOffset - 1);
796 endOffset = endNode->offsetInCharacters() ? endNode->maxCharacterOffset() : endNode->childNodeCount();
797 }
798 }
799
800 m_node = endNode;
801 m_offset = endOffset;
802 m_handledNode = false;
803 m_handledChildren = endOffset == 0;
804
805 m_startNode = startNode;
806 m_startOffset = startOffset;
807 m_endNode = endNode;
808 m_endOffset = endOffset;
809
810 #ifndef NDEBUG
811 // Need this just because of the assert.
812 m_positionNode = endNode;
813 #endif
814
815 m_lastTextNode = 0;
816 m_lastCharacter = '\n';
817
818 if (startOffset == 0 || !startNode->firstChild()) {
819 m_pastStartNode = startNode->previousSibling();
820 while (!m_pastStartNode && startNode->parentNode()) {
821 startNode = startNode->parentNode();
822 m_pastStartNode = startNode->previousSibling();
823 }
824 } else
825 m_pastStartNode = startNode->childNode(startOffset - 1);
826
827 advance();
828 }
829
advance()830 void SimplifiedBackwardsTextIterator::advance()
831 {
832 ASSERT(m_positionNode);
833
834 m_positionNode = 0;
835 m_textLength = 0;
836
837 while (m_node && m_node != m_pastStartNode) {
838 // Don't handle node if we start iterating at [node, 0].
839 if (!m_handledNode && !(m_node == m_endNode && m_endOffset == 0)) {
840 RenderObject *renderer = m_node->renderer();
841 if (renderer && renderer->isText() && m_node->nodeType() == Node::TEXT_NODE) {
842 // FIXME: What about CDATA_SECTION_NODE?
843 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
844 m_handledNode = handleTextNode();
845 } else if (renderer && (renderer->isImage() || renderer->isWidget())) {
846 if (renderer->style()->visibility() == VISIBLE && m_offset > 0)
847 m_handledNode = handleReplacedElement();
848 } else
849 m_handledNode = handleNonTextNode();
850 if (m_positionNode)
851 return;
852 }
853
854 Node* next = m_handledChildren ? 0 : m_node->lastChild();
855 if (!next) {
856 // Exit empty containers as we pass over them or containers
857 // where [container, 0] is where we started iterating.
858 if (!m_handledNode &&
859 canHaveChildrenForEditing(m_node) &&
860 m_node->parentNode() &&
861 (!m_node->lastChild() || m_node == m_endNode && m_endOffset == 0)) {
862 exitNode();
863 if (m_positionNode) {
864 m_handledNode = true;
865 m_handledChildren = true;
866 return;
867 }
868 }
869 // Exit all other containers.
870 next = m_node->previousSibling();
871 while (!next) {
872 if (!m_node->parentNode())
873 break;
874 m_node = m_node->parentNode();
875 exitNode();
876 if (m_positionNode) {
877 m_handledNode = true;
878 m_handledChildren = true;
879 return;
880 }
881 next = m_node->previousSibling();
882 }
883 }
884
885 m_node = next;
886 m_offset = m_node ? caretMaxOffset(m_node) : 0;
887 m_handledNode = false;
888 m_handledChildren = false;
889
890 if (m_positionNode)
891 return;
892 }
893 }
894
handleTextNode()895 bool SimplifiedBackwardsTextIterator::handleTextNode()
896 {
897 m_lastTextNode = m_node;
898
899 RenderText *renderer = toRenderText(m_node->renderer());
900 String str = renderer->text();
901
902 if (!renderer->firstTextBox() && str.length() > 0)
903 return true;
904
905 m_positionEndOffset = m_offset;
906
907 m_offset = (m_node == m_startNode) ? m_startOffset : 0;
908 m_positionNode = m_node;
909 m_positionStartOffset = m_offset;
910 m_textLength = m_positionEndOffset - m_positionStartOffset;
911 m_textCharacters = str.characters() + m_positionStartOffset;
912
913 m_lastCharacter = str[m_positionEndOffset - 1];
914
915 return true;
916 }
917
handleReplacedElement()918 bool SimplifiedBackwardsTextIterator::handleReplacedElement()
919 {
920 unsigned index = m_node->nodeIndex();
921 // We want replaced elements to behave like punctuation for boundary
922 // finding, and to simply take up space for the selection preservation
923 // code in moveParagraphs, so we use a comma. Unconditionally emit
924 // here because this iterator is only used for boundary finding.
925 emitCharacter(',', m_node->parentNode(), index, index + 1);
926 return true;
927 }
928
handleNonTextNode()929 bool SimplifiedBackwardsTextIterator::handleNonTextNode()
930 {
931 // We can use a linefeed in place of a tab because this simple iterator is only used to
932 // find boundaries, not actual content. A linefeed breaks words, sentences, and paragraphs.
933 if (shouldEmitNewlineForNode(m_node) ||
934 shouldEmitNewlineAfterNode(m_node) ||
935 shouldEmitTabBeforeNode(m_node)) {
936 unsigned index = m_node->nodeIndex();
937 // The start of this emitted range is wrong, ensuring correctness would require
938 // VisiblePositions and so would be slow. previousBoundary expects this.
939 emitCharacter('\n', m_node->parentNode(), index + 1, index + 1);
940 }
941
942 return true;
943 }
944
exitNode()945 void SimplifiedBackwardsTextIterator::exitNode()
946 {
947 if (shouldEmitNewlineForNode(m_node) ||
948 shouldEmitNewlineBeforeNode(m_node) ||
949 shouldEmitTabBeforeNode(m_node))
950 // The start of this emitted range is wrong, ensuring correctness would require
951 // VisiblePositions and so would be slow. previousBoundary expects this.
952 emitCharacter('\n', m_node, 0, 0);
953 }
954
emitCharacter(UChar c,Node * node,int startOffset,int endOffset)955 void SimplifiedBackwardsTextIterator::emitCharacter(UChar c, Node *node, int startOffset, int endOffset)
956 {
957 m_singleCharacterBuffer = c;
958 m_positionNode = node;
959 m_positionStartOffset = startOffset;
960 m_positionEndOffset = endOffset;
961 m_textCharacters = &m_singleCharacterBuffer;
962 m_textLength = 1;
963 m_lastCharacter = c;
964 }
965
range() const966 PassRefPtr<Range> SimplifiedBackwardsTextIterator::range() const
967 {
968 if (m_positionNode)
969 return Range::create(m_positionNode->document(), m_positionNode, m_positionStartOffset, m_positionNode, m_positionEndOffset);
970
971 return Range::create(m_startNode->document(), m_startNode, m_startOffset, m_startNode, m_startOffset);
972 }
973
974 // --------
975
CharacterIterator()976 CharacterIterator::CharacterIterator()
977 : m_offset(0)
978 , m_runOffset(0)
979 , m_atBreak(true)
980 {
981 }
982
CharacterIterator(const Range * r,bool emitCharactersBetweenAllVisiblePositions,bool enterTextControls)983 CharacterIterator::CharacterIterator(const Range *r, bool emitCharactersBetweenAllVisiblePositions, bool enterTextControls)
984 : m_offset(0)
985 , m_runOffset(0)
986 , m_atBreak(true)
987 , m_textIterator(r, emitCharactersBetweenAllVisiblePositions, enterTextControls)
988 {
989 while (!atEnd() && m_textIterator.length() == 0)
990 m_textIterator.advance();
991 }
992
range() const993 PassRefPtr<Range> CharacterIterator::range() const
994 {
995 RefPtr<Range> r = m_textIterator.range();
996 if (!m_textIterator.atEnd()) {
997 if (m_textIterator.length() <= 1) {
998 ASSERT(m_runOffset == 0);
999 } else {
1000 Node* n = r->startContainer();
1001 ASSERT(n == r->endContainer());
1002 int offset = r->startOffset() + m_runOffset;
1003 ExceptionCode ec = 0;
1004 r->setStart(n, offset, ec);
1005 r->setEnd(n, offset + 1, ec);
1006 ASSERT(!ec);
1007 }
1008 }
1009 return r.release();
1010 }
1011
advance(int count)1012 void CharacterIterator::advance(int count)
1013 {
1014 if (count <= 0) {
1015 ASSERT(count == 0);
1016 return;
1017 }
1018
1019 m_atBreak = false;
1020
1021 // easy if there is enough left in the current m_textIterator run
1022 int remaining = m_textIterator.length() - m_runOffset;
1023 if (count < remaining) {
1024 m_runOffset += count;
1025 m_offset += count;
1026 return;
1027 }
1028
1029 // exhaust the current m_textIterator run
1030 count -= remaining;
1031 m_offset += remaining;
1032
1033 // move to a subsequent m_textIterator run
1034 for (m_textIterator.advance(); !atEnd(); m_textIterator.advance()) {
1035 int runLength = m_textIterator.length();
1036 if (runLength == 0)
1037 m_atBreak = true;
1038 else {
1039 // see whether this is m_textIterator to use
1040 if (count < runLength) {
1041 m_runOffset = count;
1042 m_offset += count;
1043 return;
1044 }
1045
1046 // exhaust this m_textIterator run
1047 count -= runLength;
1048 m_offset += runLength;
1049 }
1050 }
1051
1052 // ran to the end of the m_textIterator... no more runs left
1053 m_atBreak = true;
1054 m_runOffset = 0;
1055 }
1056
string(int numChars)1057 String CharacterIterator::string(int numChars)
1058 {
1059 Vector<UChar> result;
1060 result.reserveCapacity(numChars);
1061 while (numChars > 0 && !atEnd()) {
1062 int runSize = min(numChars, length());
1063 result.append(characters(), runSize);
1064 numChars -= runSize;
1065 advance(runSize);
1066 }
1067 return String::adopt(result);
1068 }
1069
characterSubrange(CharacterIterator & it,int offset,int length)1070 static PassRefPtr<Range> characterSubrange(CharacterIterator& it, int offset, int length)
1071 {
1072 it.advance(offset);
1073 RefPtr<Range> start = it.range();
1074
1075 if (length > 1)
1076 it.advance(length - 1);
1077 RefPtr<Range> end = it.range();
1078
1079 return Range::create(start->startContainer()->document(),
1080 start->startContainer(), start->startOffset(),
1081 end->endContainer(), end->endOffset());
1082 }
1083
1084 // --------
1085
WordAwareIterator()1086 WordAwareIterator::WordAwareIterator()
1087 : m_previousText(0), m_didLookAhead(false)
1088 {
1089 }
1090
WordAwareIterator(const Range * r)1091 WordAwareIterator::WordAwareIterator(const Range *r)
1092 : m_previousText(0), m_didLookAhead(false), m_textIterator(r)
1093 {
1094 m_didLookAhead = true; // so we consider the first chunk from the text iterator
1095 advance(); // get in position over the first chunk of text
1096 }
1097
1098 // We're always in one of these modes:
1099 // - The current chunk in the text iterator is our current chunk
1100 // (typically its a piece of whitespace, or text that ended with whitespace)
1101 // - The previous chunk in the text iterator is our current chunk
1102 // (we looked ahead to the next chunk and found a word boundary)
1103 // - We built up our own chunk of text from many chunks from the text iterator
1104
1105 // FIXME: Perf could be bad for huge spans next to each other that don't fall on word boundaries
1106
advance()1107 void WordAwareIterator::advance()
1108 {
1109 m_previousText = 0;
1110 m_buffer.clear(); // toss any old buffer we built up
1111
1112 // If last time we did a look-ahead, start with that looked-ahead chunk now
1113 if (!m_didLookAhead) {
1114 ASSERT(!m_textIterator.atEnd());
1115 m_textIterator.advance();
1116 }
1117 m_didLookAhead = false;
1118
1119 // Go to next non-empty chunk
1120 while (!m_textIterator.atEnd() && m_textIterator.length() == 0)
1121 m_textIterator.advance();
1122 m_range = m_textIterator.range();
1123
1124 if (m_textIterator.atEnd())
1125 return;
1126
1127 while (1) {
1128 // If this chunk ends in whitespace we can just use it as our chunk.
1129 if (isSpaceOrNewline(m_textIterator.characters()[m_textIterator.length() - 1]))
1130 return;
1131
1132 // If this is the first chunk that failed, save it in previousText before look ahead
1133 if (m_buffer.isEmpty()) {
1134 m_previousText = m_textIterator.characters();
1135 m_previousLength = m_textIterator.length();
1136 }
1137
1138 // Look ahead to next chunk. If it is whitespace or a break, we can use the previous stuff
1139 m_textIterator.advance();
1140 if (m_textIterator.atEnd() || m_textIterator.length() == 0 || isSpaceOrNewline(m_textIterator.characters()[0])) {
1141 m_didLookAhead = true;
1142 return;
1143 }
1144
1145 if (m_buffer.isEmpty()) {
1146 // Start gobbling chunks until we get to a suitable stopping point
1147 m_buffer.append(m_previousText, m_previousLength);
1148 m_previousText = 0;
1149 }
1150 m_buffer.append(m_textIterator.characters(), m_textIterator.length());
1151 int exception = 0;
1152 m_range->setEnd(m_textIterator.range()->endContainer(), m_textIterator.range()->endOffset(), exception);
1153 }
1154 }
1155
length() const1156 int WordAwareIterator::length() const
1157 {
1158 if (!m_buffer.isEmpty())
1159 return m_buffer.size();
1160 if (m_previousText)
1161 return m_previousLength;
1162 return m_textIterator.length();
1163 }
1164
characters() const1165 const UChar* WordAwareIterator::characters() const
1166 {
1167 if (!m_buffer.isEmpty())
1168 return m_buffer.data();
1169 if (m_previousText)
1170 return m_previousText;
1171 return m_textIterator.characters();
1172 }
1173
1174 // --------
1175
1176 #if USE(ICU_UNICODE) && !UCONFIG_NO_COLLATION
1177
1178 static const size_t minimumSearchBufferSize = 8192;
1179
1180 #ifndef NDEBUG
1181 static bool searcherInUse;
1182 #endif
1183
createSearcher()1184 static UStringSearch* createSearcher()
1185 {
1186 // Provide a non-empty pattern and non-empty text so usearch_open will not fail,
1187 // but it doesn't matter exactly what it is, since we don't perform any searches
1188 // without setting both the pattern and the text.
1189
1190 // Pass empty string for the locale for now to get the Unicode Collation Algorithm,
1191 // rather than something locale-specific.
1192
1193 UErrorCode status = U_ZERO_ERROR;
1194 UStringSearch* searcher = usearch_open(&newlineCharacter, 1, &newlineCharacter, 1, "", 0, &status);
1195 ASSERT(status == U_ZERO_ERROR);
1196 return searcher;
1197 }
1198
searcher()1199 static UStringSearch* searcher()
1200 {
1201 static UStringSearch* searcher = createSearcher();
1202 return searcher;
1203 }
1204
lockSearcher()1205 static inline void lockSearcher()
1206 {
1207 #ifndef NDEBUG
1208 ASSERT(!searcherInUse);
1209 searcherInUse = true;
1210 #endif
1211 }
1212
unlockSearcher()1213 static inline void unlockSearcher()
1214 {
1215 #ifndef NDEBUG
1216 ASSERT(searcherInUse);
1217 searcherInUse = false;
1218 #endif
1219 }
1220
SearchBuffer(const String & target,bool isCaseSensitive)1221 inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive)
1222 : m_target(target)
1223 , m_atBreak(true)
1224 {
1225 ASSERT(!m_target.isEmpty());
1226
1227 size_t targetLength = target.length();
1228 m_buffer.reserveCapacity(max(targetLength * 8, minimumSearchBufferSize));
1229 m_overlap = m_buffer.capacity() / 4;
1230
1231 // Grab the single global searcher.
1232 // If we ever have a reason to do more than once search buffer at once, we'll have
1233 // to move to multiple searchers.
1234 lockSearcher();
1235
1236 UStringSearch* searcher = WebCore::searcher();
1237 UCollator* collator = usearch_getCollator(searcher);
1238
1239 UCollationStrength strength = isCaseSensitive ? UCOL_TERTIARY : UCOL_PRIMARY;
1240 if (ucol_getStrength(collator) != strength) {
1241 ucol_setStrength(collator, strength);
1242 usearch_reset(searcher);
1243 }
1244
1245 UErrorCode status = U_ZERO_ERROR;
1246 usearch_setPattern(searcher, m_target.characters(), targetLength, &status);
1247 ASSERT(status == U_ZERO_ERROR);
1248 }
1249
~SearchBuffer()1250 inline SearchBuffer::~SearchBuffer()
1251 {
1252 unlockSearcher();
1253 }
1254
append(const UChar * characters,size_t length)1255 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
1256 {
1257 ASSERT(length);
1258
1259 if (m_atBreak) {
1260 m_buffer.shrink(0);
1261 m_atBreak = false;
1262 } else if (m_buffer.size() == m_buffer.capacity()) {
1263 memcpy(m_buffer.data(), m_buffer.data() + m_buffer.size() - m_overlap, m_overlap * sizeof(UChar));
1264 m_buffer.shrink(m_overlap);
1265 }
1266
1267 size_t usableLength = min(m_buffer.capacity() - m_buffer.size(), length);
1268 ASSERT(usableLength);
1269 m_buffer.append(characters, usableLength);
1270 return usableLength;
1271 }
1272
atBreak() const1273 inline bool SearchBuffer::atBreak() const
1274 {
1275 return m_atBreak;
1276 }
1277
reachedBreak()1278 inline void SearchBuffer::reachedBreak()
1279 {
1280 m_atBreak = true;
1281 }
1282
search(size_t & start)1283 inline size_t SearchBuffer::search(size_t& start)
1284 {
1285 size_t size = m_buffer.size();
1286 if (m_atBreak) {
1287 if (!size)
1288 return 0;
1289 } else {
1290 if (size != m_buffer.capacity())
1291 return 0;
1292 }
1293
1294 UStringSearch* searcher = WebCore::searcher();
1295
1296 UErrorCode status = U_ZERO_ERROR;
1297 usearch_setText(searcher, m_buffer.data(), size, &status);
1298 ASSERT(status == U_ZERO_ERROR);
1299
1300 int matchStart = usearch_first(searcher, &status);
1301 ASSERT(status == U_ZERO_ERROR);
1302 if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) {
1303 ASSERT(matchStart == USEARCH_DONE);
1304 return 0;
1305 }
1306
1307 // Matches that start in the overlap area are only tentative.
1308 // The same match may appear later, matching more characters,
1309 // possibly including a combining character that's not yet in the buffer.
1310 if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) {
1311 memcpy(m_buffer.data(), m_buffer.data() + size - m_overlap, m_overlap * sizeof(UChar));
1312 m_buffer.shrink(m_overlap);
1313 return 0;
1314 }
1315
1316 size_t newSize = size - (matchStart + 1);
1317 memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar));
1318 m_buffer.shrink(newSize);
1319
1320 start = size - matchStart;
1321 return usearch_getMatchedLength(searcher);
1322 }
1323
1324 #else // !ICU_UNICODE
1325
SearchBuffer(const String & target,bool isCaseSensitive)1326 inline SearchBuffer::SearchBuffer(const String& target, bool isCaseSensitive)
1327 : m_target(isCaseSensitive ? target : target.foldCase())
1328 , m_isCaseSensitive(isCaseSensitive)
1329 , m_buffer(m_target.length())
1330 , m_isCharacterStartBuffer(m_target.length())
1331 , m_isBufferFull(false)
1332 , m_cursor(0)
1333 {
1334 ASSERT(!m_target.isEmpty());
1335 m_target.replace(noBreakSpace, ' ');
1336 }
1337
~SearchBuffer()1338 inline SearchBuffer::~SearchBuffer()
1339 {
1340 }
1341
reachedBreak()1342 inline void SearchBuffer::reachedBreak()
1343 {
1344 m_cursor = 0;
1345 m_isBufferFull = false;
1346 }
1347
atBreak() const1348 inline bool SearchBuffer::atBreak() const
1349 {
1350 return !m_cursor && !m_isBufferFull;
1351 }
1352
append(UChar c,bool isStart)1353 inline void SearchBuffer::append(UChar c, bool isStart)
1354 {
1355 m_buffer[m_cursor] = c == noBreakSpace ? ' ' : c;
1356 m_isCharacterStartBuffer[m_cursor] = isStart;
1357 if (++m_cursor == m_target.length()) {
1358 m_cursor = 0;
1359 m_isBufferFull = true;
1360 }
1361 }
1362
append(const UChar * characters,size_t length)1363 inline size_t SearchBuffer::append(const UChar* characters, size_t length)
1364 {
1365 ASSERT(length);
1366 if (m_isCaseSensitive) {
1367 append(characters[0], true);
1368 return 1;
1369 }
1370 const int maxFoldedCharacters = 16; // sensible maximum is 3, this should be more than enough
1371 UChar foldedCharacters[maxFoldedCharacters];
1372 bool error;
1373 int numFoldedCharacters = foldCase(foldedCharacters, maxFoldedCharacters, characters, 1, &error);
1374 ASSERT(!error);
1375 ASSERT(numFoldedCharacters);
1376 ASSERT(numFoldedCharacters <= maxFoldedCharacters);
1377 if (!error && numFoldedCharacters) {
1378 numFoldedCharacters = min(numFoldedCharacters, maxFoldedCharacters);
1379 append(foldedCharacters[0], true);
1380 for (int i = 1; i < numFoldedCharacters; ++i)
1381 append(foldedCharacters[i], false);
1382 }
1383 return 1;
1384 }
1385
search(size_t & start)1386 inline size_t SearchBuffer::search(size_t& start)
1387 {
1388 if (!m_isBufferFull)
1389 return 0;
1390 if (!m_isCharacterStartBuffer[m_cursor])
1391 return 0;
1392
1393 size_t tailSpace = m_target.length() - m_cursor;
1394 if (memcmp(&m_buffer[m_cursor], m_target.characters(), tailSpace * sizeof(UChar)) != 0)
1395 return 0;
1396 if (memcmp(&m_buffer[0], m_target.characters() + tailSpace, m_cursor * sizeof(UChar)) != 0)
1397 return 0;
1398
1399 start = length();
1400
1401 // Now that we've found a match once, we don't want to find it again, because those
1402 // are the SearchBuffer semantics, allowing for a buffer where you append more than one
1403 // character at a time. To do this we take advantage of m_isCharacterStartBuffer, but if
1404 // we want to get rid of that in the future we could track this with a separate boolean
1405 // or even move the characters to the start of the buffer and set m_isBufferFull to false.
1406 m_isCharacterStartBuffer[m_cursor] = false;
1407
1408 return start;
1409 }
1410
1411 // Returns the number of characters that were appended to the buffer (what we are searching in).
1412 // That's not necessarily the same length as the passed-in target string, because case folding
1413 // can make two strings match even though they're not the same length.
length() const1414 size_t SearchBuffer::length() const
1415 {
1416 size_t bufferSize = m_target.length();
1417 size_t length = 0;
1418 for (size_t i = 0; i < bufferSize; ++i)
1419 length += m_isCharacterStartBuffer[i];
1420 return length;
1421 }
1422
1423 #endif // !ICU_UNICODE
1424
1425 // --------
1426
rangeLength(const Range * r,bool forSelectionPreservation)1427 int TextIterator::rangeLength(const Range *r, bool forSelectionPreservation)
1428 {
1429 int length = 0;
1430 for (TextIterator it(r, forSelectionPreservation); !it.atEnd(); it.advance())
1431 length += it.length();
1432
1433 return length;
1434 }
1435
subrange(Range * entireRange,int characterOffset,int characterCount)1436 PassRefPtr<Range> TextIterator::subrange(Range* entireRange, int characterOffset, int characterCount)
1437 {
1438 CharacterIterator entireRangeIterator(entireRange);
1439 return characterSubrange(entireRangeIterator, characterOffset, characterCount);
1440 }
1441
rangeFromLocationAndLength(Element * scope,int rangeLocation,int rangeLength,bool forSelectionPreservation)1442 PassRefPtr<Range> TextIterator::rangeFromLocationAndLength(Element *scope, int rangeLocation, int rangeLength, bool forSelectionPreservation)
1443 {
1444 RefPtr<Range> resultRange = scope->document()->createRange();
1445
1446 int docTextPosition = 0;
1447 int rangeEnd = rangeLocation + rangeLength;
1448 bool startRangeFound = false;
1449
1450 RefPtr<Range> textRunRange;
1451
1452 TextIterator it(rangeOfContents(scope).get(), forSelectionPreservation);
1453
1454 // FIXME: the atEnd() check shouldn't be necessary, workaround for <http://bugs.webkit.org/show_bug.cgi?id=6289>.
1455 if (rangeLocation == 0 && rangeLength == 0 && it.atEnd()) {
1456 textRunRange = it.range();
1457
1458 ExceptionCode ec = 0;
1459 resultRange->setStart(textRunRange->startContainer(), 0, ec);
1460 ASSERT(!ec);
1461 resultRange->setEnd(textRunRange->startContainer(), 0, ec);
1462 ASSERT(!ec);
1463
1464 return resultRange.release();
1465 }
1466
1467 for (; !it.atEnd(); it.advance()) {
1468 int len = it.length();
1469 textRunRange = it.range();
1470
1471 bool foundStart = rangeLocation >= docTextPosition && rangeLocation <= docTextPosition + len;
1472 bool foundEnd = rangeEnd >= docTextPosition && rangeEnd <= docTextPosition + len;
1473
1474 // Fix textRunRange->endPosition(), but only if foundStart || foundEnd, because it is only
1475 // in those cases that textRunRange is used.
1476 if (foundStart || foundEnd) {
1477 // FIXME: This is a workaround for the fact that the end of a run is often at the wrong
1478 // position for emitted '\n's.
1479 if (len == 1 && it.characters()[0] == '\n') {
1480 Position runStart = textRunRange->startPosition();
1481 Position runEnd = VisiblePosition(runStart).next().deepEquivalent();
1482 if (runEnd.isNotNull()) {
1483 ExceptionCode ec = 0;
1484 textRunRange->setEnd(runEnd.node(), runEnd.offset(), ec);
1485 ASSERT(!ec);
1486 }
1487 }
1488 }
1489
1490 if (foundStart) {
1491 startRangeFound = true;
1492 int exception = 0;
1493 if (textRunRange->startContainer()->isTextNode()) {
1494 int offset = rangeLocation - docTextPosition;
1495 resultRange->setStart(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
1496 } else {
1497 if (rangeLocation == docTextPosition)
1498 resultRange->setStart(textRunRange->startContainer(), textRunRange->startOffset(), exception);
1499 else
1500 resultRange->setStart(textRunRange->endContainer(), textRunRange->endOffset(), exception);
1501 }
1502 }
1503
1504 if (foundEnd) {
1505 int exception = 0;
1506 if (textRunRange->startContainer()->isTextNode()) {
1507 int offset = rangeEnd - docTextPosition;
1508 resultRange->setEnd(textRunRange->startContainer(), offset + textRunRange->startOffset(), exception);
1509 } else {
1510 if (rangeEnd == docTextPosition)
1511 resultRange->setEnd(textRunRange->startContainer(), textRunRange->startOffset(), exception);
1512 else
1513 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
1514 }
1515 docTextPosition += len;
1516 break;
1517 }
1518 docTextPosition += len;
1519 }
1520
1521 if (!startRangeFound)
1522 return 0;
1523
1524 if (rangeLength != 0 && rangeEnd > docTextPosition) { // rangeEnd is out of bounds
1525 int exception = 0;
1526 resultRange->setEnd(textRunRange->endContainer(), textRunRange->endOffset(), exception);
1527 }
1528
1529 return resultRange.release();
1530 }
1531
1532 // --------
1533
plainTextToMallocAllocatedBuffer(const Range * r,unsigned & bufferLength,bool isDisplayString)1534 UChar* plainTextToMallocAllocatedBuffer(const Range* r, unsigned& bufferLength, bool isDisplayString)
1535 {
1536 UChar* result = 0;
1537
1538 // Do this in pieces to avoid massive reallocations if there is a large amount of text.
1539 // Use system malloc for buffers since they can consume lots of memory and current TCMalloc is unable return it back to OS.
1540 static const unsigned cMaxSegmentSize = 1 << 16;
1541 bufferLength = 0;
1542 typedef pair<UChar*, unsigned> TextSegment;
1543 Vector<TextSegment>* textSegments = 0;
1544 Vector<UChar> textBuffer;
1545 textBuffer.reserveCapacity(cMaxSegmentSize);
1546 for (TextIterator it(r); !it.atEnd(); it.advance()) {
1547 if (textBuffer.size() && textBuffer.size() + it.length() > cMaxSegmentSize) {
1548 UChar* newSegmentBuffer = static_cast<UChar*>(malloc(textBuffer.size() * sizeof(UChar)));
1549 if (!newSegmentBuffer)
1550 goto exit;
1551 memcpy(newSegmentBuffer, textBuffer.data(), textBuffer.size() * sizeof(UChar));
1552 if (!textSegments)
1553 textSegments = new Vector<TextSegment>;
1554 textSegments->append(make_pair(newSegmentBuffer, (unsigned)textBuffer.size()));
1555 textBuffer.clear();
1556 }
1557 textBuffer.append(it.characters(), it.length());
1558 bufferLength += it.length();
1559 }
1560
1561 if (!bufferLength)
1562 return 0;
1563
1564 // Since we know the size now, we can make a single buffer out of the pieces with one big alloc
1565 result = static_cast<UChar*>(malloc(bufferLength * sizeof(UChar)));
1566 if (!result)
1567 goto exit;
1568
1569 {
1570 UChar* resultPos = result;
1571 if (textSegments) {
1572 unsigned size = textSegments->size();
1573 for (unsigned i = 0; i < size; ++i) {
1574 const TextSegment& segment = textSegments->at(i);
1575 memcpy(resultPos, segment.first, segment.second * sizeof(UChar));
1576 resultPos += segment.second;
1577 }
1578 }
1579 memcpy(resultPos, textBuffer.data(), textBuffer.size() * sizeof(UChar));
1580 }
1581
1582 exit:
1583 if (textSegments) {
1584 unsigned size = textSegments->size();
1585 for (unsigned i = 0; i < size; ++i)
1586 free(textSegments->at(i).first);
1587 delete textSegments;
1588 }
1589
1590 if (isDisplayString && r->ownerDocument())
1591 r->ownerDocument()->displayBufferModifiedByEncoding(result, bufferLength);
1592
1593 return result;
1594 }
1595
plainText(const Range * r)1596 String plainText(const Range* r)
1597 {
1598 unsigned length;
1599 UChar* buf = plainTextToMallocAllocatedBuffer(r, length, false);
1600 if (!buf)
1601 return "";
1602 String result(buf, length);
1603 free(buf);
1604 return result;
1605 }
1606
isAllCollapsibleWhitespace(const String & string)1607 static inline bool isAllCollapsibleWhitespace(const String& string)
1608 {
1609 const UChar* characters = string.characters();
1610 unsigned length = string.length();
1611 for (unsigned i = 0; i < length; ++i) {
1612 if (!isCollapsibleWhitespace(characters[i]))
1613 return false;
1614 }
1615 return true;
1616 }
1617
collapsedToBoundary(const Range * range,bool forward)1618 static PassRefPtr<Range> collapsedToBoundary(const Range* range, bool forward)
1619 {
1620 ExceptionCode ec = 0;
1621 RefPtr<Range> result = range->cloneRange(ec);
1622 ASSERT(!ec);
1623 result->collapse(!forward, ec);
1624 ASSERT(!ec);
1625 return result.release();
1626 }
1627
findPlainText(CharacterIterator & it,const String & target,bool forward,bool caseSensitive,size_t & matchStart)1628 static size_t findPlainText(CharacterIterator& it, const String& target, bool forward, bool caseSensitive, size_t& matchStart)
1629 {
1630 matchStart = 0;
1631 size_t matchLength = 0;
1632
1633 SearchBuffer buffer(target, caseSensitive);
1634
1635 while (!it.atEnd()) {
1636 it.advance(buffer.append(it.characters(), it.length()));
1637 tryAgain:
1638 size_t matchStartOffset;
1639 if (size_t newMatchLength = buffer.search(matchStartOffset)) {
1640 // Note that we found a match, and where we found it.
1641 size_t lastCharacterInBufferOffset = it.characterOffset();
1642 ASSERT(lastCharacterInBufferOffset >= matchStartOffset);
1643 matchStart = lastCharacterInBufferOffset - matchStartOffset;
1644 matchLength = newMatchLength;
1645 // If searching forward, stop on the first match.
1646 // If searching backward, don't stop, so we end up with the last match.
1647 if (forward)
1648 break;
1649 goto tryAgain;
1650 }
1651 if (it.atBreak() && !buffer.atBreak()) {
1652 buffer.reachedBreak();
1653 goto tryAgain;
1654 }
1655 }
1656
1657 return matchLength;
1658 }
1659
findPlainText(const Range * range,const String & target,bool forward,bool caseSensitive)1660 PassRefPtr<Range> findPlainText(const Range* range, const String& target, bool forward, bool caseSensitive)
1661 {
1662 // We can't search effectively for a string that's entirely made of collapsible
1663 // whitespace, so we won't even try. This also takes care of the empty string case.
1664 if (isAllCollapsibleWhitespace(target))
1665 return collapsedToBoundary(range, forward);
1666
1667 // First, find the text.
1668 size_t matchStart;
1669 size_t matchLength;
1670 {
1671 CharacterIterator findIterator(range, false, true);
1672 matchLength = findPlainText(findIterator, target, forward, caseSensitive, matchStart);
1673 if (!matchLength)
1674 return collapsedToBoundary(range, forward);
1675 }
1676
1677 // Then, find the document position of the start and the end of the text.
1678 CharacterIterator computeRangeIterator(range, false, true);
1679 return characterSubrange(computeRangeIterator, matchStart, matchLength);
1680 }
1681
1682 }
1683