1 package org.jsoup.internal; 2 3 import org.jsoup.helper.Validate; 4 import org.jspecify.annotations.Nullable; 5 6 import java.net.MalformedURLException; 7 import java.net.URL; 8 import java.util.Arrays; 9 import java.util.Collection; 10 import java.util.Iterator; 11 import java.util.Stack; 12 import java.util.regex.Pattern; 13 14 /** 15 A minimal String utility class. Designed for <b>internal</b> jsoup use only - the API and outcome may change without 16 notice. 17 */ 18 public final class StringUtil { 19 // memoised padding up to 21 (blocks 0 to 20 spaces) 20 static final String[] padding = {"", " ", " ", " ", " ", " ", " ", " ", " ", 21 " ", " ", " ", " ", " ", " ", " ", 22 " ", " ", " ", " ", " "}; 23 24 /** 25 * Join a collection of strings by a separator 26 * @param strings collection of string objects 27 * @param sep string to place between strings 28 * @return joined string 29 */ join(Collection<?> strings, String sep)30 public static String join(Collection<?> strings, String sep) { 31 return join(strings.iterator(), sep); 32 } 33 34 /** 35 * Join a collection of strings by a separator 36 * @param strings iterator of string objects 37 * @param sep string to place between strings 38 * @return joined string 39 */ join(Iterator<?> strings, String sep)40 public static String join(Iterator<?> strings, String sep) { 41 if (!strings.hasNext()) 42 return ""; 43 44 String start = strings.next().toString(); 45 if (!strings.hasNext()) // only one, avoid builder 46 return start; 47 48 StringJoiner j = new StringJoiner(sep); 49 j.add(start); 50 while (strings.hasNext()) { 51 j.add(strings.next()); 52 } 53 return j.complete(); 54 } 55 56 /** 57 * Join an array of strings by a separator 58 * @param strings collection of string objects 59 * @param sep string to place between strings 60 * @return joined string 61 */ join(String[] strings, String sep)62 public static String join(String[] strings, String sep) { 63 return join(Arrays.asList(strings), sep); 64 } 65 66 /** 67 A StringJoiner allows incremental / filtered joining of a set of stringable objects. 68 @since 1.14.1 69 */ 70 public static class StringJoiner { 71 @Nullable StringBuilder sb = borrowBuilder(); // sets null on builder release so can't accidentally be reused 72 final String separator; 73 boolean first = true; 74 75 /** 76 Create a new joiner, that uses the specified separator. MUST call {@link #complete()} or will leak a thread 77 local string builder. 78 79 @param separator the token to insert between strings 80 */ StringJoiner(String separator)81 public StringJoiner(String separator) { 82 this.separator = separator; 83 } 84 85 /** 86 Add another item to the joiner, will be separated 87 */ add(Object stringy)88 public StringJoiner add(Object stringy) { 89 Validate.notNull(sb); // don't reuse 90 if (!first) 91 sb.append(separator); 92 sb.append(stringy); 93 first = false; 94 return this; 95 } 96 97 /** 98 Append content to the current item; not separated 99 */ append(Object stringy)100 public StringJoiner append(Object stringy) { 101 Validate.notNull(sb); // don't reuse 102 sb.append(stringy); 103 return this; 104 } 105 106 /** 107 Return the joined string, and release the builder back to the pool. This joiner cannot be reused. 108 */ complete()109 public String complete() { 110 String string = releaseBuilder(sb); 111 sb = null; 112 return string; 113 } 114 } 115 116 /** 117 * Returns space padding (up to the default max of 30). Use {@link #padding(int, int)} to specify a different limit. 118 * @param width amount of padding desired 119 * @return string of spaces * width 120 * @see #padding(int, int) 121 */ padding(int width)122 public static String padding(int width) { 123 return padding(width, 30); 124 } 125 126 /** 127 * Returns space padding, up to a max of maxPaddingWidth. 128 * @param width amount of padding desired 129 * @param maxPaddingWidth maximum padding to apply. Set to {@code -1} for unlimited. 130 * @return string of spaces * width 131 */ padding(int width, int maxPaddingWidth)132 public static String padding(int width, int maxPaddingWidth) { 133 Validate.isTrue(width >= 0, "width must be >= 0"); 134 Validate.isTrue(maxPaddingWidth >= -1); 135 if (maxPaddingWidth != -1) 136 width = Math.min(width, maxPaddingWidth); 137 if (width < padding.length) 138 return padding[width]; 139 char[] out = new char[width]; 140 for (int i = 0; i < width; i++) 141 out[i] = ' '; 142 return String.valueOf(out); 143 } 144 145 /** 146 * Tests if a string is blank: null, empty, or only whitespace (" ", \r\n, \t, etc) 147 * @param string string to test 148 * @return if string is blank 149 */ isBlank(final String string)150 public static boolean isBlank(final String string) { 151 if (string == null || string.length() == 0) 152 return true; 153 154 int l = string.length(); 155 for (int i = 0; i < l; i++) { 156 if (!StringUtil.isWhitespace(string.codePointAt(i))) 157 return false; 158 } 159 return true; 160 } 161 162 /** 163 Tests if a string starts with a newline character 164 @param string string to test 165 @return if its first character is a newline 166 */ startsWithNewline(final String string)167 public static boolean startsWithNewline(final String string) { 168 if (string == null || string.length() == 0) 169 return false; 170 return string.charAt(0) == '\n'; 171 } 172 173 /** 174 * Tests if a string is numeric, i.e. contains only digit characters 175 * @param string string to test 176 * @return true if only digit chars, false if empty or null or contains non-digit chars 177 */ isNumeric(String string)178 public static boolean isNumeric(String string) { 179 if (string == null || string.length() == 0) 180 return false; 181 182 int l = string.length(); 183 for (int i = 0; i < l; i++) { 184 if (!Character.isDigit(string.codePointAt(i))) 185 return false; 186 } 187 return true; 188 } 189 190 /** 191 * Tests if a code point is "whitespace" as defined in the HTML spec. Used for output HTML. 192 * @param c code point to test 193 * @return true if code point is whitespace, false otherwise 194 * @see #isActuallyWhitespace(int) 195 */ isWhitespace(int c)196 public static boolean isWhitespace(int c){ 197 return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r'; 198 } 199 200 /** 201 * Tests if a code point is "whitespace" as defined by what it looks like. Used for Element.text etc. 202 * @param c code point to test 203 * @return true if code point is whitespace, false otherwise 204 */ isActuallyWhitespace(int c)205 public static boolean isActuallyWhitespace(int c){ 206 return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r' || c == 160; 207 // 160 is (non-breaking space). Not in the spec but expected. 208 } 209 isInvisibleChar(int c)210 public static boolean isInvisibleChar(int c) { 211 return c == 8203 || c == 173; // zero width sp, soft hyphen 212 // previously also included zw non join, zw join - but removing those breaks semantic meaning of text 213 } 214 215 /** 216 * Normalise the whitespace within this string; multiple spaces collapse to a single, and all whitespace characters 217 * (e.g. newline, tab) convert to a simple space. 218 * @param string content to normalise 219 * @return normalised string 220 */ normaliseWhitespace(String string)221 public static String normaliseWhitespace(String string) { 222 StringBuilder sb = StringUtil.borrowBuilder(); 223 appendNormalisedWhitespace(sb, string, false); 224 return StringUtil.releaseBuilder(sb); 225 } 226 227 /** 228 * After normalizing the whitespace within a string, appends it to a string builder. 229 * @param accum builder to append to 230 * @param string string to normalize whitespace within 231 * @param stripLeading set to true if you wish to remove any leading whitespace 232 */ appendNormalisedWhitespace(StringBuilder accum, String string, boolean stripLeading)233 public static void appendNormalisedWhitespace(StringBuilder accum, String string, boolean stripLeading) { 234 boolean lastWasWhite = false; 235 boolean reachedNonWhite = false; 236 237 int len = string.length(); 238 int c; 239 for (int i = 0; i < len; i+= Character.charCount(c)) { 240 c = string.codePointAt(i); 241 if (isActuallyWhitespace(c)) { 242 if ((stripLeading && !reachedNonWhite) || lastWasWhite) 243 continue; 244 accum.append(' '); 245 lastWasWhite = true; 246 } 247 else if (!isInvisibleChar(c)) { 248 accum.appendCodePoint(c); 249 lastWasWhite = false; 250 reachedNonWhite = true; 251 } 252 } 253 } 254 in(final String needle, final String... haystack)255 public static boolean in(final String needle, final String... haystack) { 256 final int len = haystack.length; 257 for (int i = 0; i < len; i++) { 258 if (haystack[i].equals(needle)) 259 return true; 260 } 261 return false; 262 } 263 inSorted(String needle, String[] haystack)264 public static boolean inSorted(String needle, String[] haystack) { 265 return Arrays.binarySearch(haystack, needle) >= 0; 266 } 267 268 /** 269 Tests that a String contains only ASCII characters. 270 @param string scanned string 271 @return true if all characters are in range 0 - 127 272 */ isAscii(String string)273 public static boolean isAscii(String string) { 274 Validate.notNull(string); 275 for (int i = 0; i < string.length(); i++) { 276 int c = string.charAt(i); 277 if (c > 127) { // ascii range 278 return false; 279 } 280 } 281 return true; 282 } 283 284 private static final Pattern extraDotSegmentsPattern = Pattern.compile("^/((\\.{1,2}/)+)"); 285 /** 286 * Create a new absolute URL, from a provided existing absolute URL and a relative URL component. 287 * @param base the existing absolute base URL 288 * @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned) 289 * @return the resolved absolute URL 290 * @throws MalformedURLException if an error occurred generating the URL 291 */ resolve(URL base, String relUrl)292 public static URL resolve(URL base, String relUrl) throws MalformedURLException { 293 relUrl = stripControlChars(relUrl); 294 // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired 295 if (relUrl.startsWith("?")) 296 relUrl = base.getPath() + relUrl; 297 // workaround: //example.com + ./foo = //example.com/./foo, not //example.com/foo 298 URL url = new URL(base, relUrl); 299 String fixedFile = extraDotSegmentsPattern.matcher(url.getFile()).replaceFirst("/"); 300 if (url.getRef() != null) { 301 fixedFile = fixedFile + "#" + url.getRef(); 302 } 303 return new URL(url.getProtocol(), url.getHost(), url.getPort(), fixedFile); 304 } 305 306 /** 307 * Create a new absolute URL, from a provided existing absolute URL and a relative URL component. 308 * @param baseUrl the existing absolute base URL 309 * @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned) 310 * @return an absolute URL if one was able to be generated, or the empty string if not 311 */ resolve(String baseUrl, String relUrl)312 public static String resolve(String baseUrl, String relUrl) { 313 // workaround: java will allow control chars in a path URL and may treat as relative, but Chrome / Firefox will strip and may see as a scheme. Normalize to browser's view. 314 baseUrl = stripControlChars(baseUrl); relUrl = stripControlChars(relUrl); 315 try { 316 URL base; 317 try { 318 base = new URL(baseUrl); 319 } catch (MalformedURLException e) { 320 // the base is unsuitable, but the attribute/rel may be abs on its own, so try that 321 URL abs = new URL(relUrl); 322 return abs.toExternalForm(); 323 } 324 return resolve(base, relUrl).toExternalForm(); 325 } catch (MalformedURLException e) { 326 // it may still be valid, just that Java doesn't have a registered stream handler for it, e.g. tel 327 // we test here vs at start to normalize supported URLs (e.g. HTTP -> http) 328 return validUriScheme.matcher(relUrl).find() ? relUrl : ""; 329 } 330 } 331 private static final Pattern validUriScheme = Pattern.compile("^[a-zA-Z][a-zA-Z0-9+-.]*:"); 332 333 private static final Pattern controlChars = Pattern.compile("[\\x00-\\x1f]*"); // matches ascii 0 - 31, to strip from url stripControlChars(final String input)334 private static String stripControlChars(final String input) { 335 return controlChars.matcher(input).replaceAll(""); 336 } 337 338 private static final ThreadLocal<Stack<StringBuilder>> threadLocalBuilders = ThreadLocal.withInitial(Stack::new); 339 340 /** 341 * Maintains cached StringBuilders in a flyweight pattern, to minimize new StringBuilder GCs. The StringBuilder is 342 * prevented from growing too large. 343 * <p> 344 * Care must be taken to release the builder once its work has been completed, with {@link #releaseBuilder} 345 * @return an empty StringBuilder 346 */ borrowBuilder()347 public static StringBuilder borrowBuilder() { 348 Stack<StringBuilder> builders = threadLocalBuilders.get(); 349 return builders.empty() ? 350 new StringBuilder(MaxCachedBuilderSize) : 351 builders.pop(); 352 } 353 354 /** 355 * Release a borrowed builder. Care must be taken not to use the builder after it has been returned, as its 356 * contents may be changed by this method, or by a concurrent thread. 357 * @param sb the StringBuilder to release. 358 * @return the string value of the released String Builder (as an incentive to release it!). 359 */ releaseBuilder(StringBuilder sb)360 public static String releaseBuilder(StringBuilder sb) { 361 Validate.notNull(sb); 362 String string = sb.toString(); 363 364 if (sb.length() > MaxCachedBuilderSize) 365 sb = new StringBuilder(MaxCachedBuilderSize); // make sure it hasn't grown too big 366 else 367 sb.delete(0, sb.length()); // make sure it's emptied on release 368 369 Stack<StringBuilder> builders = threadLocalBuilders.get(); 370 builders.push(sb); 371 372 while (builders.size() > MaxIdleBuilders) { 373 builders.pop(); 374 } 375 return string; 376 } 377 378 private static final int MaxCachedBuilderSize = 8 * 1024; 379 private static final int MaxIdleBuilders = 8; 380 } 381