/*
 * Copyright (C) 2009 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.common.net;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkNotNull;
import static com.google.common.base.Preconditions.checkState;

import com.google.common.annotations.Beta;
import com.google.common.annotations.GwtCompatible;
import com.google.common.base.Ascii;
import com.google.common.base.CharMatcher;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.thirdparty.publicsuffix.PublicSuffixPatterns;

import java.util.List;

import javax.annotation.Nullable;

/**
 * An immutable well-formed internet domain name, such as {@code com} or {@code
 * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other
 * network interactions take place. Thus there is no guarantee that the domain
 * actually exists on the internet.
 *
 * <p>One common use of this class is to determine whether a given string is
 * likely to represent an addressable domain on the web -- that is, for a
 * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"}
 * result in a webpage being displayed? In the past, this test was frequently
 * done by determining whether the domain ended with a {@linkplain
 * #isPublicSuffix() public suffix} but was not itself a public suffix. However,
 * this test is no longer accurate. There are many domains which are both public
 * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a
 * result, the only useful test to determine if a domain is a plausible web host
 * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains
 * which (currently) are not hosts, such as {@code "com"}, but given that any
 * public suffix may become a host without warning, it is better to err on the
 * side of permissiveness and thus avoid spurious rejection of valid sites.
 *
 * <p>During construction, names are normalized in three ways:
 * <ol>
 * <li>ASCII uppercase characters are converted to lowercase.
 * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are
 * converted to the ASCII period.
 * <li>A single trailing dot, if present, is removed.
 * </ol>
 * <p>The normalized values will be returned from {@link #toString()} and
 * {@link #parts()}, and will be reflected in the result of
 * {@link #equals(Object)}.
 *
 * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
 * Internationalized domain names</a> such as {@code 网络.cn} are supported, as
 * are the equivalent <a
 * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA
 * Punycode-encoded</a> versions.
 *
 * @author Craig Berry
 * @since 5.0
 */
@Beta
@GwtCompatible
public final class InternetDomainName {

  /** Matches every character that is treated as a label separator on input. */
  private static final CharMatcher DOTS_MATCHER =
      CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
  private static final Splitter DOT_SPLITTER = Splitter.on('.');
  private static final Joiner DOT_JOINER = Joiner.on('.');

  /**
   * Value of {@link #publicSuffixIndex} which indicates that no public suffix
   * was found.
   */
  private static final int NO_PUBLIC_SUFFIX_FOUND = -1;

  /** Regular expression matching a literal ASCII period. */
  private static final String DOT_REGEX = "\\.";

  /**
   * Maximum parts (labels) in a domain name. This value arises from
   * the 255-octet limit described in
   * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with
   * the fact that the encoding of each part occupies at least two bytes
   * (dot plus label externally, length byte plus label internally). Thus, if
   * all labels have the minimum size of one byte, 127 of them will fit.
   */
  private static final int MAX_PARTS = 127;

  /**
   * Maximum length of a full domain name, including separators, and
   * leaving room for the root label. See
   * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
   */
  private static final int MAX_LENGTH = 253;

  /**
   * Maximum size of a single part of a domain name. See
   * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
   */
  private static final int MAX_DOMAIN_PART_LENGTH = 63;

  /**
   * The full domain name, converted to lower case.
   */
  private final String name;

  /**
   * The parts of the domain name, converted to lower case.
   */
  private final ImmutableList<String> parts;

  /**
   * The index in the {@link #parts()} list at which the public suffix begins.
   * For example, for the domain name {@code www.google.co.uk}, the value would
   * be 2 (the index of the {@code co} part). The value is negative
   * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
   * found.
   */
  private final int publicSuffixIndex;

  /**
   * Constructor used to implement {@link #from(String)}. Normalizes the
   * supplied text, validates it, and locates the public suffix.
   *
   * @param name the raw domain name text; must not be null
   * @throws IllegalArgumentException if the normalized name is too long, has
   *     too many parts, or contains a syntactically invalid part
   */
  InternetDomainName(String name) {
    // Normalize:
    // * ASCII characters to lowercase
    // * All dot-like characters to '.'
    // * Strip trailing '.'

    name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));

    if (name.endsWith(".")) {
      name = name.substring(0, name.length() - 1);
    }

    checkArgument(name.length() <= MAX_LENGTH,
        "Domain name too long: '%s':", name);
    this.name = name;

    this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
    checkArgument(parts.size() <= MAX_PARTS,
        "Domain has too many parts: '%s'", name);
    checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);

    this.publicSuffixIndex = findPublicSuffix();
  }

  /**
   * Returns the index of the leftmost part of the public suffix, or -1 if not
   * found. Note that the value defined as the "public suffix" may not be a
   * public suffix according to {@link #isPublicSuffix()} if the domain ends
   * with an excluded domain pattern such as {@code "nhs.uk"}.
   */
  private int findPublicSuffix() {
    final int partsSize = parts.size();

    // Try progressively shorter suffixes, starting with the whole name, so
    // that the longest matching suffix wins.
    for (int i = 0; i < partsSize; i++) {
      String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));

      if (PublicSuffixPatterns.EXACT.containsKey(ancestorName)) {
        return i;
      }

      // Excluded domains (e.g. !nhs.uk) use the next highest
      // domain as the effective public suffix (e.g. uk).

      if (PublicSuffixPatterns.EXCLUDED.containsKey(ancestorName)) {
        return i + 1;
      }

      if (matchesWildcardPublicSuffix(ancestorName)) {
        return i;
      }
    }

    return NO_PUBLIC_SUFFIX_FOUND;
  }

  /**
   * Returns an instance of {@link InternetDomainName} after lenient
   * validation. Specifically, validation against <a
   * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
   * ("Internationalizing Domain Names in Applications") is skipped, while
   * validation against <a
   * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in
   * the following ways:
   * <ul>
   * <li>Any part containing non-ASCII characters is considered valid.
   * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
   * <li>Parts other than the final part may start with a digit.
   * </ul>
   *
   * @param domain A domain name (not IP address)
   * @throws NullPointerException if {@code domain} is null
   * @throws IllegalArgumentException if {@code domain} is not syntactically
   *     valid according to {@link #isValid}
   * @since 10.0 (previously named {@code fromLenient})
   */
  public static InternetDomainName from(String domain) {
    return new InternetDomainName(checkNotNull(domain));
  }

  /**
   * Validation method used by {@link #from(String)} to ensure that the domain
   * name is syntactically valid according to the relaxed RFC 1035 rules
   * implemented by {@link #validatePart(String, boolean)}.
   *
   * @param parts the dot-separated parts of the normalized name; expected to
   *     contain at least one element
   * @return Is the domain name syntactically valid?
   */
  private static boolean validateSyntax(List<String> parts) {
    final int lastIndex = parts.size() - 1;

    // Validate the last part specially, as it has different syntax rules.

    if (!validatePart(parts.get(lastIndex), true)) {
      return false;
    }

    for (int i = 0; i < lastIndex; i++) {
      String part = parts.get(i);
      if (!validatePart(part, false)) {
        return false;
      }
    }

    return true;
  }

  /** Characters that may appear inside, but not at either end of, a part. */
  private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");

  /** Characters that the ASCII portion of a part may consist of. */
  private static final CharMatcher PART_CHAR_MATCHER =
      CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);

  /**
   * Helper method for {@link #validateSyntax(List)}. Validates that one part of
   * a domain name is valid.
   *
   * @param part The domain name part to be validated
   * @param isFinalPart Is this the final (rightmost) domain part?
   * @return Whether the part is valid
   */
  private static boolean validatePart(String part, boolean isFinalPart) {

    // These tests could be collapsed into one big boolean expression, but
    // they have been left as independent tests for clarity.

    if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
      return false;
    }

    /*
     * GWT claims to support java.lang.Character's char-classification methods,
     * but it actually only works for ASCII. So for now, assume any non-ASCII
     * characters are valid. The only place this seems to be documented is here:
     * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
     *
     * <p>ASCII characters in the part are expected to be valid per RFC 1035,
     * with underscore also being allowed due to widespread practice.
     */

    String asciiChars = CharMatcher.ASCII.retainFrom(part);

    if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) {
      return false;
    }

    // No initial or final dashes or underscores.

    if (DASH_MATCHER.matches(part.charAt(0))
        || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
      return false;
    }

    /*
     * Note that we allow (in contravention of a strict interpretation of the
     * relevant RFCs) domain parts other than the last may begin with a digit
     * (for example, "3com.com"). It's important to disallow an initial digit in
     * the last part; it's the only thing that stops an IPv4 numeric address
     * like 127.0.0.1 from looking like a valid domain name.
     */

    if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
      return false;
    }

    return true;
  }

  /**
   * Returns the individual components of this domain name, normalized to all
   * lower case. For example, for the domain name {@code mail.google.com}, this
   * method returns the list {@code ["mail", "google", "com"]}.
   */
  public ImmutableList<String> parts() {
    return parts;
  }

  /**
   * Indicates whether this domain name represents a <i>public suffix</i>, as
   * defined by the Mozilla Foundation's
   * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
   * suffix is one under which Internet users can directly register names, such
   * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
   * names that are <i>not</i> public suffixes include {@code google}, {@code
   * google.com} and {@code foo.co.uk}.
   *
   * @return {@code true} if this domain name appears exactly on the public
   *     suffix list
   * @since 6.0
   */
  public boolean isPublicSuffix() {
    return publicSuffixIndex == 0;
  }

  /**
   * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
   * public suffix}, including if it is a public suffix itself. For example,
   * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
   * {@code com}, but not for {@code google} or {@code google.foo}. This is
   * the recommended method for determining whether a domain is potentially an
   * addressable host.
   *
   * @since 6.0
   */
  public boolean hasPublicSuffix() {
    return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
  }

  /**
   * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
   * domain name, or {@code null} if no public suffix is present.
   *
   * @since 6.0
   */
  @Nullable
  public InternetDomainName publicSuffix() {
    return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
  }

  /**
   * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
   * public suffix}, while not being a public suffix itself. For example,
   * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
   * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
   * google.foo}.
   *
   * <p><b>Warning:</b> a {@code false} result from this method does not imply
   * that the domain does not represent an addressable host, as many public
   * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
   * that test.
   *
   * <p>This method can be used to determine whether it will probably be
   * possible to set cookies on the domain, though even that depends on
   * individual browsers' implementations of cookie controls. See
   * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
   *
   * @since 6.0
   */
  public boolean isUnderPublicSuffix() {
    return publicSuffixIndex > 0;
  }

  /**
   * Indicates whether this domain name is composed of exactly one subdomain
   * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
   * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
   * but not for {@code www.google.com} or {@code co.uk}.
   *
   * <p><b>Warning:</b> A {@code true} result from this method does not imply
   * that the domain is at the highest level which is addressable as a host, as
   * many public suffixes are also addressable hosts. For example, the domain
   * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
   * return {@code true} from this method. But {@code uk.com} is itself an
   * addressable host.
   *
   * <p>This method can be used to determine whether a domain is probably the
   * highest level for which cookies may be set, though even that depends on
   * individual browsers' implementations of cookie controls. See
   * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
   *
   * @since 6.0
   */
  public boolean isTopPrivateDomain() {
    return publicSuffixIndex == 1;
  }

  /**
   * Returns the portion of this domain name that is one level beneath the
   * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
   * {@code google.co.uk}, since {@code co.uk} is a public suffix.
   *
   * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
   * instance is returned.
   *
   * <p>This method should not be used to determine the topmost parent domain
   * which is addressable as a host, as many public suffixes are also
   * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
   * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
   * from this method. But {@code uk.com} is itself an addressable host.
   *
   * <p>This method can be used to determine the probable highest level parent
   * domain for which cookies may be set, though even that depends on individual
   * browsers' implementations of cookie controls.
   *
   * @throws IllegalStateException if this domain does not end with a
   *     public suffix
   * @since 6.0
   */
  public InternetDomainName topPrivateDomain() {
    if (isTopPrivateDomain()) {
      return this;
    }
    checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
    return ancestor(publicSuffixIndex - 1);
  }

  /**
   * Indicates whether this domain is composed of two or more parts.
   */
  public boolean hasParent() {
    return parts.size() > 1;
  }

  /**
   * Returns an {@code InternetDomainName} that is the immediate ancestor of
   * this one; that is, the current domain with the leftmost part removed. For
   * example, the parent of {@code www.google.com} is {@code google.com}.
   *
   * @throws IllegalStateException if the domain has no parent, as determined
   *     by {@link #hasParent}
   */
  public InternetDomainName parent() {
    checkState(hasParent(), "Domain '%s' has no parent", name);
    return ancestor(1);
  }

  /**
   * Returns the ancestor of the current domain at the given number of levels
   * "higher" (rightward) in the subdomain list. The number of levels must be
   * non-negative, and less than {@code N}, where {@code N} is the number of
   * parts in the domain, so that at least one part remains.
   *
   * <p>TODO: Reasonable candidate for addition to public API.
   */
  private InternetDomainName ancestor(int levels) {
    return from(DOT_JOINER.join(parts.subList(levels, parts.size())));
  }

  /**
   * Creates and returns a new {@code InternetDomainName} by prepending the
   * argument and a dot to the current name. For example, {@code
   * InternetDomainName.from("foo.com").child("www.bar")} returns a new
   * {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only
   * lenient validation is performed, as described {@link #from(String) here}.
   *
   * @throws NullPointerException if leftParts is null
   * @throws IllegalArgumentException if the resulting name is not valid
   */
  public InternetDomainName child(String leftParts) {
    return from(checkNotNull(leftParts) + "." + name);
  }

  /**
   * Indicates whether the argument is a syntactically valid domain name using
   * lenient validation. Specifically, validation against <a
   * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
   * ("Internationalizing Domain Names in Applications") is skipped.
   *
   * <p>The following two code snippets are equivalent:
   *
   * <pre>   {@code
   *   domainName = InternetDomainName.isValid(name)
   *       ? InternetDomainName.from(name)
   *       : DEFAULT_DOMAIN;}</pre>
   *
   * <pre>   {@code
   *   try {
   *     domainName = InternetDomainName.from(name);
   *   } catch (IllegalArgumentException e) {
   *     domainName = DEFAULT_DOMAIN;
   *   }}</pre>
   *
   * @throws NullPointerException if {@code name} is null (only
   *     {@code IllegalArgumentException} is translated into {@code false})
   * @since 8.0 (previously named {@code isValidLenient})
   */
  public static boolean isValid(String name) {
    try {
      from(name);
      return true;
    } catch (IllegalArgumentException e) {
      return false;
    }
  }

  /**
   * Does the domain name match one of the "wildcard" patterns (e.g.
   * {@code "*.ar"})? That is, once the leftmost part is removed, is the
   * remainder registered as a wildcard base?
   */
  private static boolean matchesWildcardPublicSuffix(String domain) {
    // Limit of 2 splits off only the leftmost part; pieces[1] is the rest.
    final String[] pieces = domain.split(DOT_REGEX, 2);
    return pieces.length == 2 && PublicSuffixPatterns.UNDER.containsKey(pieces[1]);
  }

  /**
   * Returns the domain name, normalized to all lower case.
   */
  @Override
  public String toString() {
    return name;
  }

  /**
   * Equality testing is based on the text supplied by the caller,
   * after normalization as described in the class documentation. For
   * example, a non-ASCII Unicode domain name and the Punycode version
   * of the same domain name would not be considered equal.
   */
  @Override
  public boolean equals(@Nullable Object object) {
    if (object == this) {
      return true;
    }

    if (object instanceof InternetDomainName) {
      InternetDomainName that = (InternetDomainName) object;
      return this.name.equals(that.name);
    }

    return false;
  }

  /** Returns a hash code consistent with {@link #equals(Object)}. */
  @Override
  public int hashCode() {
    return name.hashCode();
  }
}