• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.jsoup.helper;
2 
3 import org.jsoup.Connection;
4 import org.jsoup.internal.StringUtil;
5 import org.jspecify.annotations.Nullable;
6 
7 import java.io.UnsupportedEncodingException;
8 import java.net.IDN;
9 import java.net.MalformedURLException;
10 import java.net.URI;
11 import java.net.URISyntaxException;
12 import java.net.URL;
13 import java.net.URLDecoder;
14 import java.net.URLEncoder;
15 
16 import static org.jsoup.helper.DataUtil.UTF_8;
17 
18 /**
19  A utility class to normalize input URLs. jsoup internal; API subject to change.
20  <p>Normalization includes puny-coding the host, and encoding non-ascii path components. Any non-ascii characters in
21  the query string (or the fragment/anchor) are escaped, but any existing escapes in those components are preserved.</p>
22  */
23 final class UrlBuilder {
24     URL u;
25     @Nullable StringBuilder q;
26 
UrlBuilder(URL inputUrl)27     UrlBuilder(URL inputUrl) {
28         this.u = inputUrl;
29         if (u.getQuery() != null)
30             q = StringUtil.borrowBuilder().append(u.getQuery());
31     }
32 
build()33     URL build() {
34         try {
35             // use the URI class to encode non-ascii in path
36             URI uri = new URI(
37                 u.getProtocol(),
38                 u.getUserInfo(),
39                 IDN.toASCII(decodePart(u.getHost())), // puny-code
40                 u.getPort(),
41                 null, null, null // path, query and fragment appended later so as not to encode
42             );
43 
44             StringBuilder normUrl = StringUtil.borrowBuilder().append(uri.toASCIIString());
45             appendToAscii(u.getPath(), false, normUrl);
46             if (q != null) {
47                 normUrl.append('?');
48                 appendToAscii(StringUtil.releaseBuilder(q), true, normUrl);
49             }
50             if (u.getRef() != null) {
51                 normUrl.append('#');
52                 appendToAscii(u.getRef(), false, normUrl);
53             }
54             u = new URL(StringUtil.releaseBuilder(normUrl));
55             return u;
56         } catch (MalformedURLException | URISyntaxException | UnsupportedEncodingException e) {
57             // we assert here so that any incomplete normalization issues can be caught in devel. but in practise,
58             // the remote end will be able to handle it, so in prod we just pass the original URL.
59             // The UnsupportedEncodingException would never happen as always UTF8
60             assert Validate.assertFail(e.toString());
61             return u;
62         }
63     }
64 
appendKeyVal(Connection.KeyVal kv)65     void appendKeyVal(Connection.KeyVal kv) throws UnsupportedEncodingException {
66         if (q == null)
67             q = StringUtil.borrowBuilder();
68         else
69             q.append('&');
70         q
71             .append(URLEncoder.encode(kv.key(), UTF_8.name()))
72             .append('=')
73             .append(URLEncoder.encode(kv.value(), UTF_8.name()));
74     }
75 
decodePart(String encoded)76     private static String decodePart(String encoded) {
77         try {
78             return URLDecoder.decode(encoded, UTF_8.name());
79         } catch (UnsupportedEncodingException e) {
80             throw new RuntimeException(e); // wtf!
81         }
82     }
83 
appendToAscii(String s, boolean spaceAsPlus, StringBuilder sb)84     private static void appendToAscii(String s, boolean spaceAsPlus, StringBuilder sb) throws UnsupportedEncodingException {
85         // minimal normalization of Unicode -> Ascii, and space normal. Existing escapes are left as-is.
86         for (int i = 0; i < s.length(); i++) {
87             int c = s.codePointAt(i);
88             if (c == ' ') {
89                 sb.append(spaceAsPlus ? '+' : "%20");
90             } else if (c > 127) { // out of ascii range
91                 sb.append(URLEncoder.encode(new String(Character.toChars(c)), UTF_8.name()));
92                 // ^^ is a bit heavy-handed - if perf critical, we could optimize
93                 if (Character.charCount(c) == 2) i++; // advance past supplemental
94             } else {
95                 sb.append((char) c);
96             }
97         }
98     }
99 
100 
101 }
102