1 /* 2 * Copyright (C) 2010 Adam Barth. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY 14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE COMPUTER, INC. OR 17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 */ 25 26 #ifndef SuffixTree_h 27 #define SuffixTree_h 28 29 #include "PlatformString.h" 30 #include <wtf/Vector.h> 31 32 namespace WebCore { 33 34 class UnicodeCodebook { 35 public: codeWord(UChar c)36 static int codeWord(UChar c) { return c; } 37 enum { codeSize = 1 << 8 * sizeof(UChar) }; 38 }; 39 40 class ASCIICodebook { 41 public: codeWord(UChar c)42 static int codeWord(UChar c) { return c & (codeSize - 1); } 43 enum { codeSize = 1 << (8 * sizeof(char) - 1) }; 44 }; 45 46 template<typename Codebook> 47 class SuffixTree { 48 public: SuffixTree(const String & text,unsigned depth)49 SuffixTree(const String& text, unsigned depth) 50 : m_depth(depth) 51 , m_leaf(true) 52 { 53 build(text); 54 } 55 mightContain(const String & query)56 bool mightContain(const String& query) 57 { 58 Node* current = &m_root; 59 int limit = std::min(m_depth, query.length()); 60 for (int i = 0; i < limit; ++i) { 61 current = current->at(Codebook::codeWord(query[i])); 62 if (!current) 63 return false; 64 } 65 return true; 66 } 67 68 private: 69 class Node { 70 public: 71 Node(bool isLeaf = false) 72 { 73 m_children.resize(Codebook::codeSize); 74 m_children.fill(0); 75 m_isLeaf = isLeaf; 76 } 77 ~Node()78 ~Node() 79 { 80 for (unsigned i = 0; i < m_children.size(); ++i) { 81 Node* child = m_children.at(i); 82 if (child && !child->m_isLeaf) 83 delete child; 84 } 85 } 86 at(int codeWord)87 Node*& at(int codeWord) { return m_children.at(codeWord); } 88 89 private: 90 typedef Vector<Node*, Codebook::codeSize> ChildrenVector; 91 92 ChildrenVector m_children; 93 bool m_isLeaf; 94 }; 95 build(const String & text)96 void build(const String& text) 97 { 98 for (unsigned base = 0; base < text.length(); ++base) { 99 Node* current = &m_root; 100 unsigned limit = std::min(base + m_depth, text.length()); 101 for (unsigned offset = 0; base + offset < limit; ++offset) { 102 ASSERT(current != &m_leaf); 103 Node*& child = current->at(Codebook::codeWord(text[base + offset])); 104 if (!child) 105 child = base + offset + 1 == limit ? &m_leaf : new Node(); 106 current = child; 107 } 108 } 109 } 110 111 Node m_root; 112 unsigned m_depth; 113 114 // Instead of allocating a fresh empty leaf node for ever leaf in the tree 115 // (there can be a lot of these), we alias all the leaves to this "static" 116 // leaf node. 117 Node m_leaf; 118 }; 119 120 } // namespace WebCore 121 122 #endif // SuffixTree_h 123