• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.jsoup.internal;
2 
3 import org.jsoup.helper.Validate;
4 import org.jspecify.annotations.Nullable;
5 
6 import java.net.MalformedURLException;
7 import java.net.URL;
8 import java.util.Arrays;
9 import java.util.Collection;
10 import java.util.Iterator;
11 import java.util.Stack;
12 import java.util.regex.Pattern;
13 
14 /**
15  A minimal String utility class. Designed for <b>internal</b> jsoup use only - the API and outcome may change without
16  notice.
17  */
18 public final class StringUtil {
19     // memoised padding up to 21 (blocks 0 to 20 spaces)
20     static final String[] padding = {"", " ", "  ", "   ", "    ", "     ", "      ", "       ", "        ",
21         "         ", "          ", "           ", "            ", "             ", "              ", "               ",
22         "                ", "                 ", "                  ", "                   ", "                    "};
23 
24     /**
25      * Join a collection of strings by a separator
26      * @param strings collection of string objects
27      * @param sep string to place between strings
28      * @return joined string
29      */
join(Collection<?> strings, String sep)30     public static String join(Collection<?> strings, String sep) {
31         return join(strings.iterator(), sep);
32     }
33 
34     /**
35      * Join a collection of strings by a separator
36      * @param strings iterator of string objects
37      * @param sep string to place between strings
38      * @return joined string
39      */
join(Iterator<?> strings, String sep)40     public static String join(Iterator<?> strings, String sep) {
41         if (!strings.hasNext())
42             return "";
43 
44         String start = strings.next().toString();
45         if (!strings.hasNext()) // only one, avoid builder
46             return start;
47 
48         StringJoiner j = new StringJoiner(sep);
49         j.add(start);
50         while (strings.hasNext()) {
51             j.add(strings.next());
52         }
53         return j.complete();
54     }
55 
56     /**
57      * Join an array of strings by a separator
58      * @param strings collection of string objects
59      * @param sep string to place between strings
60      * @return joined string
61      */
join(String[] strings, String sep)62     public static String join(String[] strings, String sep) {
63         return join(Arrays.asList(strings), sep);
64     }
65 
66     /**
67      A StringJoiner allows incremental / filtered joining of a set of stringable objects.
68      @since 1.14.1
69      */
70     public static class StringJoiner {
71         @Nullable StringBuilder sb = borrowBuilder(); // sets null on builder release so can't accidentally be reused
72         final String separator;
73         boolean first = true;
74 
75         /**
76          Create a new joiner, that uses the specified separator. MUST call {@link #complete()} or will leak a thread
77          local string builder.
78 
79          @param separator the token to insert between strings
80          */
StringJoiner(String separator)81         public StringJoiner(String separator) {
82             this.separator = separator;
83         }
84 
85         /**
86          Add another item to the joiner, will be separated
87          */
add(Object stringy)88         public StringJoiner add(Object stringy) {
89             Validate.notNull(sb); // don't reuse
90             if (!first)
91                 sb.append(separator);
92             sb.append(stringy);
93             first = false;
94             return this;
95         }
96 
97         /**
98          Append content to the current item; not separated
99          */
append(Object stringy)100         public StringJoiner append(Object stringy) {
101             Validate.notNull(sb); // don't reuse
102             sb.append(stringy);
103             return this;
104         }
105 
106         /**
107          Return the joined string, and release the builder back to the pool. This joiner cannot be reused.
108          */
complete()109         public String complete() {
110             String string = releaseBuilder(sb);
111             sb = null;
112             return string;
113         }
114     }
115 
116     /**
117      * Returns space padding (up to the default max of 30). Use {@link #padding(int, int)} to specify a different limit.
118      * @param width amount of padding desired
119      * @return string of spaces * width
120      * @see #padding(int, int)
121       */
padding(int width)122     public static String padding(int width) {
123         return padding(width, 30);
124     }
125 
126     /**
127      * Returns space padding, up to a max of maxPaddingWidth.
128      * @param width amount of padding desired
129      * @param maxPaddingWidth maximum padding to apply. Set to {@code -1} for unlimited.
130      * @return string of spaces * width
131      */
padding(int width, int maxPaddingWidth)132     public static String padding(int width, int maxPaddingWidth) {
133         Validate.isTrue(width >= 0, "width must be >= 0");
134         Validate.isTrue(maxPaddingWidth >= -1);
135         if (maxPaddingWidth != -1)
136             width = Math.min(width, maxPaddingWidth);
137         if (width < padding.length)
138             return padding[width];
139         char[] out = new char[width];
140         for (int i = 0; i < width; i++)
141             out[i] = ' ';
142         return String.valueOf(out);
143     }
144 
145     /**
146      * Tests if a string is blank: null, empty, or only whitespace (" ", \r\n, \t, etc)
147      * @param string string to test
148      * @return if string is blank
149      */
isBlank(final String string)150     public static boolean isBlank(final String string) {
151         if (string == null || string.length() == 0)
152             return true;
153 
154         int l = string.length();
155         for (int i = 0; i < l; i++) {
156             if (!StringUtil.isWhitespace(string.codePointAt(i)))
157                 return false;
158         }
159         return true;
160     }
161 
162     /**
163      Tests if a string starts with a newline character
164      @param string string to test
165      @return if its first character is a newline
166      */
startsWithNewline(final String string)167     public static boolean startsWithNewline(final String string) {
168         if (string == null || string.length() == 0)
169             return false;
170         return string.charAt(0) == '\n';
171     }
172 
173     /**
174      * Tests if a string is numeric, i.e. contains only digit characters
175      * @param string string to test
176      * @return true if only digit chars, false if empty or null or contains non-digit chars
177      */
isNumeric(String string)178     public static boolean isNumeric(String string) {
179         if (string == null || string.length() == 0)
180             return false;
181 
182         int l = string.length();
183         for (int i = 0; i < l; i++) {
184             if (!Character.isDigit(string.codePointAt(i)))
185                 return false;
186         }
187         return true;
188     }
189 
190     /**
191      * Tests if a code point is "whitespace" as defined in the HTML spec. Used for output HTML.
192      * @param c code point to test
193      * @return true if code point is whitespace, false otherwise
194      * @see #isActuallyWhitespace(int)
195      */
isWhitespace(int c)196     public static boolean isWhitespace(int c){
197         return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r';
198     }
199 
200     /**
201      * Tests if a code point is "whitespace" as defined by what it looks like. Used for Element.text etc.
202      * @param c code point to test
203      * @return true if code point is whitespace, false otherwise
204      */
isActuallyWhitespace(int c)205     public static boolean isActuallyWhitespace(int c){
206         return c == ' ' || c == '\t' || c == '\n' || c == '\f' || c == '\r' || c == 160;
207         // 160 is &nbsp; (non-breaking space). Not in the spec but expected.
208     }
209 
isInvisibleChar(int c)210     public static boolean isInvisibleChar(int c) {
211         return c == 8203 || c == 173; // zero width sp, soft hyphen
212         // previously also included zw non join, zw join - but removing those breaks semantic meaning of text
213     }
214 
215     /**
216      * Normalise the whitespace within this string; multiple spaces collapse to a single, and all whitespace characters
217      * (e.g. newline, tab) convert to a simple space.
218      * @param string content to normalise
219      * @return normalised string
220      */
normaliseWhitespace(String string)221     public static String normaliseWhitespace(String string) {
222         StringBuilder sb = StringUtil.borrowBuilder();
223         appendNormalisedWhitespace(sb, string, false);
224         return StringUtil.releaseBuilder(sb);
225     }
226 
227     /**
228      * After normalizing the whitespace within a string, appends it to a string builder.
229      * @param accum builder to append to
230      * @param string string to normalize whitespace within
231      * @param stripLeading set to true if you wish to remove any leading whitespace
232      */
appendNormalisedWhitespace(StringBuilder accum, String string, boolean stripLeading)233     public static void appendNormalisedWhitespace(StringBuilder accum, String string, boolean stripLeading) {
234         boolean lastWasWhite = false;
235         boolean reachedNonWhite = false;
236 
237         int len = string.length();
238         int c;
239         for (int i = 0; i < len; i+= Character.charCount(c)) {
240             c = string.codePointAt(i);
241             if (isActuallyWhitespace(c)) {
242                 if ((stripLeading && !reachedNonWhite) || lastWasWhite)
243                     continue;
244                 accum.append(' ');
245                 lastWasWhite = true;
246             }
247             else if (!isInvisibleChar(c)) {
248                 accum.appendCodePoint(c);
249                 lastWasWhite = false;
250                 reachedNonWhite = true;
251             }
252         }
253     }
254 
in(final String needle, final String... haystack)255     public static boolean in(final String needle, final String... haystack) {
256         final int len = haystack.length;
257         for (int i = 0; i < len; i++) {
258             if (haystack[i].equals(needle))
259                return true;
260         }
261         return false;
262     }
263 
inSorted(String needle, String[] haystack)264     public static boolean inSorted(String needle, String[] haystack) {
265         return Arrays.binarySearch(haystack, needle) >= 0;
266     }
267 
268     /**
269      Tests that a String contains only ASCII characters.
270      @param string scanned string
271      @return true if all characters are in range 0 - 127
272      */
isAscii(String string)273     public static boolean isAscii(String string) {
274         Validate.notNull(string);
275         for (int i = 0; i < string.length(); i++) {
276             int c = string.charAt(i);
277             if (c > 127) { // ascii range
278                 return false;
279             }
280         }
281         return true;
282     }
283 
284     private static final Pattern extraDotSegmentsPattern = Pattern.compile("^/((\\.{1,2}/)+)");
285     /**
286      * Create a new absolute URL, from a provided existing absolute URL and a relative URL component.
287      * @param base the existing absolute base URL
288      * @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned)
289      * @return the resolved absolute URL
290      * @throws MalformedURLException if an error occurred generating the URL
291      */
resolve(URL base, String relUrl)292     public static URL resolve(URL base, String relUrl) throws MalformedURLException {
293         relUrl = stripControlChars(relUrl);
294         // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
295         if (relUrl.startsWith("?"))
296             relUrl = base.getPath() + relUrl;
297         // workaround: //example.com + ./foo = //example.com/./foo, not //example.com/foo
298         URL url = new URL(base, relUrl);
299         String fixedFile = extraDotSegmentsPattern.matcher(url.getFile()).replaceFirst("/");
300         if (url.getRef() != null) {
301             fixedFile = fixedFile + "#" + url.getRef();
302         }
303         return new URL(url.getProtocol(), url.getHost(), url.getPort(), fixedFile);
304     }
305 
306     /**
307      * Create a new absolute URL, from a provided existing absolute URL and a relative URL component.
308      * @param baseUrl the existing absolute base URL
309      * @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned)
310      * @return an absolute URL if one was able to be generated, or the empty string if not
311      */
resolve(String baseUrl, String relUrl)312     public static String resolve(String baseUrl, String relUrl) {
313         // workaround: java will allow control chars in a path URL and may treat as relative, but Chrome / Firefox will strip and may see as a scheme. Normalize to browser's view.
314         baseUrl = stripControlChars(baseUrl); relUrl = stripControlChars(relUrl);
315         try {
316             URL base;
317             try {
318                 base = new URL(baseUrl);
319             } catch (MalformedURLException e) {
320                 // the base is unsuitable, but the attribute/rel may be abs on its own, so try that
321                 URL abs = new URL(relUrl);
322                 return abs.toExternalForm();
323             }
324             return resolve(base, relUrl).toExternalForm();
325         } catch (MalformedURLException e) {
326             // it may still be valid, just that Java doesn't have a registered stream handler for it, e.g. tel
327             // we test here vs at start to normalize supported URLs (e.g. HTTP -> http)
328             return validUriScheme.matcher(relUrl).find() ? relUrl : "";
329         }
330     }
331     private static final Pattern validUriScheme = Pattern.compile("^[a-zA-Z][a-zA-Z0-9+-.]*:");
332 
333     private static final Pattern controlChars = Pattern.compile("[\\x00-\\x1f]*"); // matches ascii 0 - 31, to strip from url
stripControlChars(final String input)334     private static String stripControlChars(final String input) {
335         return controlChars.matcher(input).replaceAll("");
336     }
337 
338     private static final ThreadLocal<Stack<StringBuilder>> threadLocalBuilders = ThreadLocal.withInitial(Stack::new);
339 
340     /**
341      * Maintains cached StringBuilders in a flyweight pattern, to minimize new StringBuilder GCs. The StringBuilder is
342      * prevented from growing too large.
343      * <p>
344      * Care must be taken to release the builder once its work has been completed, with {@link #releaseBuilder}
345      * @return an empty StringBuilder
346      */
borrowBuilder()347     public static StringBuilder borrowBuilder() {
348         Stack<StringBuilder> builders = threadLocalBuilders.get();
349         return builders.empty() ?
350             new StringBuilder(MaxCachedBuilderSize) :
351             builders.pop();
352     }
353 
354     /**
355      * Release a borrowed builder. Care must be taken not to use the builder after it has been returned, as its
356      * contents may be changed by this method, or by a concurrent thread.
357      * @param sb the StringBuilder to release.
358      * @return the string value of the released String Builder (as an incentive to release it!).
359      */
releaseBuilder(StringBuilder sb)360     public static String releaseBuilder(StringBuilder sb) {
361         Validate.notNull(sb);
362         String string = sb.toString();
363 
364         if (sb.length() > MaxCachedBuilderSize)
365             sb = new StringBuilder(MaxCachedBuilderSize); // make sure it hasn't grown too big
366         else
367             sb.delete(0, sb.length()); // make sure it's emptied on release
368 
369         Stack<StringBuilder> builders = threadLocalBuilders.get();
370         builders.push(sb);
371 
372         while (builders.size() > MaxIdleBuilders) {
373             builders.pop();
374         }
375         return string;
376     }
377 
378     private static final int MaxCachedBuilderSize = 8 * 1024;
379     private static final int MaxIdleBuilders = 8;
380 }
381