1 // Copyright (c) 2011, Mike Samuel 2 // All rights reserved. 3 // 4 // Redistribution and use in source and binary forms, with or without 5 // modification, are permitted provided that the following conditions 6 // are met: 7 // 8 // Redistributions of source code must retain the above copyright 9 // notice, this list of conditions and the following disclaimer. 10 // Redistributions in binary form must reproduce the above copyright 11 // notice, this list of conditions and the following disclaimer in the 12 // documentation and/or other materials provided with the distribution. 13 // Neither the name of the OWASP nor the names of its contributors may 14 // be used to endorse or promote products derived from this software 15 // without specific prior written permission. 16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 19 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 20 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 21 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 22 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 24 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 26 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 27 // POSSIBILITY OF SUCH DAMAGE. 28 29 package org.owasp.html; 30 31 import java.util.ArrayList; 32 import java.util.Arrays; 33 import java.util.List; 34 import java.util.Map; 35 import java.util.TreeMap; 36 37 /** 38 * A trie used to separate punctuation tokens in a run of non-whitespace 39 * characters by preferring the longest punctuation string possible in a 40 * greedy left-to-right scan. 41 * 42 * @author Mike Samuel <mikesamuel@gmail.com> 43 */ 44 final class Trie { 45 private final char[] childMap; 46 private final Trie[] children; 47 private final boolean terminal; 48 private final int value; 49 50 /** 51 * @param elements not empty, non null. 52 */ Trie(Map<String, Integer> elements)53 public Trie(Map<String, Integer> elements) { 54 this(sortedUniqEntries(elements), 0); 55 } 56 Trie(List<Map.Entry<String, Integer>> elements, int depth)57 private Trie(List<Map.Entry<String, Integer>> elements, int depth) { 58 this(elements, depth, 0, elements.size()); 59 } 60 61 /** 62 * @param elements not empty, non null. Not modified. 63 * @param depth the depth in the tree. 64 * @param start an index into punctuationStrings of the first string in this 65 * subtree. 66 * @param end an index into punctuationStrings past the last string in this 67 * subtree. 68 */ Trie( List<Map.Entry<String, Integer>> elements, int depth, int start, int end)69 private Trie( 70 List<Map.Entry<String, Integer>> elements, int depth, 71 int start, int end) { 72 this.terminal = depth == elements.get(start).getKey().length(); 73 if (this.terminal) { 74 this.value = elements.get(start).getValue(); 75 if (start + 1 == end) { // base case 76 this.childMap = ZERO_CHARS; 77 this.children = ZERO_TRIES; 78 return; 79 } else { 80 ++start; 81 } 82 } else { 83 this.value = Integer.MAX_VALUE; 84 } 85 int childCount = 0; 86 { 87 int last = -1; 88 for (int i = start; i < end; ++i) { 89 char ch = elements.get(i).getKey().charAt(depth); 90 if (ch != last) { 91 ++childCount; 92 last = ch; 93 } 94 } 95 } 96 this.childMap = new char[childCount]; 97 this.children = new Trie[childCount]; 98 int childStart = start; 99 int childIndex = 0; 100 char lastCh = elements.get(start).getKey().charAt(depth); 101 for (int i = start + 1; i < end; ++i) { 102 char ch = elements.get(i).getKey().charAt(depth); 103 if (ch != lastCh) { 104 childMap[childIndex] = lastCh; 105 children[childIndex++] = new Trie( 106 elements, depth + 1, childStart, i); 107 childStart = i; 108 lastCh = ch; 109 } 110 } 111 childMap[childIndex] = lastCh; 112 children[childIndex++] = new Trie(elements, depth + 1, childStart, end); 113 } 114 115 /** Does this node correspond to a complete string in the input set. */ isTerminal()116 public boolean isTerminal() { return terminal; } 117 getValue()118 public int getValue() { return value; } 119 120 /** 121 * The child corresponding to the given character. 122 * @return null if no such trie. 123 */ lookup(char ch)124 public Trie lookup(char ch) { 125 int i = Arrays.binarySearch(childMap, ch); 126 return i >= 0 ? children[i] : null; 127 } 128 129 /** 130 * The descendant of this trie corresponding to the string for this trie 131 * appended with s. 132 * @param s non null. 133 * @return null if no such trie. 134 */ lookup(CharSequence s)135 public Trie lookup(CharSequence s) { 136 Trie t = this; 137 for (int i = 0, n = s.length(); i < n; ++i) { 138 t = t.lookup(s.charAt(i)); 139 if (null == t) { break; } 140 } 141 return t; 142 } 143 contains(char ch)144 public boolean contains(char ch) { 145 return Arrays.binarySearch(childMap, ch) >= 0; 146 } 147 sortedUniqEntries( Map<String, T> m)148 private static <T> List<Map.Entry<String, T>> sortedUniqEntries( 149 Map<String, T> m) { 150 return new ArrayList<Map.Entry<String, T>>( 151 new TreeMap<String, T>(m).entrySet()); 152 } 153 154 private static final char[] ZERO_CHARS = new char[0]; 155 private static final Trie[] ZERO_TRIES = new Trie[0]; 156 157 /** 158 * Append all strings s such that {@code this.lookup(s).isTerminal()} to the 159 * given list in lexical order. 160 */ toStringList(List<String> strings)161 public void toStringList(List<String> strings) { 162 toStringList("", strings); 163 } 164 toStringList(String prefix, List<String> strings)165 private void toStringList(String prefix, List<String> strings) { 166 if (terminal) { strings.add(prefix); } 167 for (int i = 0, n = childMap.length; i < n; ++i) { 168 children[i].toStringList(prefix + childMap[i], strings); 169 } 170 } 171 172 @Override toString()173 public String toString() { 174 StringBuilder sb = new StringBuilder(); 175 toStringBuilder(0, sb); 176 return sb.toString(); 177 } 178 toStringBuilder(int depth, StringBuilder sb)179 private void toStringBuilder(int depth, StringBuilder sb) { 180 sb.append(terminal ? "terminal" : "nonterminal"); 181 ++depth; 182 for (int i = 0; i < childMap.length; ++i) { 183 sb.append('\n'); 184 for (int d = 0; d < depth; ++d) { 185 sb.append('\t'); 186 } 187 sb.append('\'').append(childMap[i]).append("' "); 188 children[i].toStringBuilder(depth, sb); 189 } 190 } 191 } 192