1 package org.jsoup.helper; 2 3 import org.jsoup.Connection; 4 import org.jsoup.internal.StringUtil; 5 import org.jspecify.annotations.Nullable; 6 7 import java.io.UnsupportedEncodingException; 8 import java.net.IDN; 9 import java.net.MalformedURLException; 10 import java.net.URI; 11 import java.net.URISyntaxException; 12 import java.net.URL; 13 import java.net.URLDecoder; 14 import java.net.URLEncoder; 15 16 import static org.jsoup.helper.DataUtil.UTF_8; 17 18 /** 19 A utility class to normalize input URLs. jsoup internal; API subject to change. 20 <p>Normalization includes puny-coding the host, and encoding non-ascii path components. Any non-ascii characters in 21 the query string (or the fragment/anchor) are escaped, but any existing escapes in those components are preserved.</p> 22 */ 23 final class UrlBuilder { 24 URL u; 25 @Nullable StringBuilder q; 26 UrlBuilder(URL inputUrl)27 UrlBuilder(URL inputUrl) { 28 this.u = inputUrl; 29 if (u.getQuery() != null) 30 q = StringUtil.borrowBuilder().append(u.getQuery()); 31 } 32 build()33 URL build() { 34 try { 35 // use the URI class to encode non-ascii in path 36 URI uri = new URI( 37 u.getProtocol(), 38 u.getUserInfo(), 39 IDN.toASCII(decodePart(u.getHost())), // puny-code 40 u.getPort(), 41 null, null, null // path, query and fragment appended later so as not to encode 42 ); 43 44 StringBuilder normUrl = StringUtil.borrowBuilder().append(uri.toASCIIString()); 45 appendToAscii(u.getPath(), false, normUrl); 46 if (q != null) { 47 normUrl.append('?'); 48 appendToAscii(StringUtil.releaseBuilder(q), true, normUrl); 49 } 50 if (u.getRef() != null) { 51 normUrl.append('#'); 52 appendToAscii(u.getRef(), false, normUrl); 53 } 54 u = new URL(StringUtil.releaseBuilder(normUrl)); 55 return u; 56 } catch (MalformedURLException | URISyntaxException | UnsupportedEncodingException e) { 57 // we assert here so that any incomplete normalization issues can be caught in devel. but in practise, 58 // the remote end will be able to handle it, so in prod we just pass the original URL. 59 // The UnsupportedEncodingException would never happen as always UTF8 60 assert Validate.assertFail(e.toString()); 61 return u; 62 } 63 } 64 appendKeyVal(Connection.KeyVal kv)65 void appendKeyVal(Connection.KeyVal kv) throws UnsupportedEncodingException { 66 if (q == null) 67 q = StringUtil.borrowBuilder(); 68 else 69 q.append('&'); 70 q 71 .append(URLEncoder.encode(kv.key(), UTF_8.name())) 72 .append('=') 73 .append(URLEncoder.encode(kv.value(), UTF_8.name())); 74 } 75 decodePart(String encoded)76 private static String decodePart(String encoded) { 77 try { 78 return URLDecoder.decode(encoded, UTF_8.name()); 79 } catch (UnsupportedEncodingException e) { 80 throw new RuntimeException(e); // wtf! 81 } 82 } 83 appendToAscii(String s, boolean spaceAsPlus, StringBuilder sb)84 private static void appendToAscii(String s, boolean spaceAsPlus, StringBuilder sb) throws UnsupportedEncodingException { 85 // minimal normalization of Unicode -> Ascii, and space normal. Existing escapes are left as-is. 86 for (int i = 0; i < s.length(); i++) { 87 int c = s.codePointAt(i); 88 if (c == ' ') { 89 sb.append(spaceAsPlus ? '+' : "%20"); 90 } else if (c > 127) { // out of ascii range 91 sb.append(URLEncoder.encode(new String(Character.toChars(c)), UTF_8.name())); 92 // ^^ is a bit heavy-handed - if perf critical, we could optimize 93 if (Character.charCount(c) == 2) i++; // advance past supplemental 94 } else { 95 sb.append((char) c); 96 } 97 } 98 } 99 100 101 } 102