• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (C) 2009 The Guava Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 package com.google.common.net;
18 
19 import static com.google.common.base.Preconditions.checkArgument;
20 import static com.google.common.base.Preconditions.checkNotNull;
21 import static com.google.common.base.Preconditions.checkState;
22 
23 import com.google.common.annotations.Beta;
24 import com.google.common.annotations.GwtCompatible;
25 import com.google.common.base.Ascii;
26 import com.google.common.base.CharMatcher;
27 import com.google.common.base.Joiner;
28 import com.google.common.base.Objects;
29 import com.google.common.base.Splitter;
30 import com.google.common.collect.ImmutableList;
31 
32 import java.util.List;
33 
34 import javax.annotation.Nullable;
35 
36 /**
37  * An immutable well-formed internet domain name, such as {@code com} or {@code
38  * foo.co.uk}. Only syntactic analysis is performed; no DNS lookups or other
39  * network interactions take place. Thus there is no guarantee that the domain
40  * actually exists on the internet.
41  *
42  * <p>One common use of this class is to determine whether a given string is
43  * likely to represent an addressable domain on the web -- that is, for a
44  * candidate string {@code "xxx"}, might browsing to {@code "http://xxx/"}
45  * result in a webpage being displayed? In the past, this test was frequently
46  * done by determining whether the domain ended with a {@linkplain
47  * #isPublicSuffix() public suffix} but was not itself a public suffix. However,
48  * this test is no longer accurate. There are many domains which are both public
49  * suffixes and addressable as hosts; {@code "uk.com"} is one example. As a
50  * result, the only useful test to determine if a domain is a plausible web host
51  * is {@link #hasPublicSuffix()}. This will return {@code true} for many domains
52  * which (currently) are not hosts, such as {@code "com"}), but given that any
53  * public suffix may become a host without warning, it is better to err on the
54  * side of permissiveness and thus avoid spurious rejection of valid sites.
55  *
56  * <p>During construction, names are normalized in two ways:
57  * <ol>
58  * <li>ASCII uppercase characters are converted to lowercase.
59  * <li>Unicode dot separators other than the ASCII period ({@code '.'}) are
60  * converted to the ASCII period.
61  * </ol>
62  * The normalized values will be returned from {@link #name()} and
63  * {@link #parts()}, and will be reflected in the result of
64  * {@link #equals(Object)}.
65  *
66  * <p><a href="http://en.wikipedia.org/wiki/Internationalized_domain_name">
67  * internationalized domain names</a> such as {@code 网络.cn} are supported, as
68  * are the equivalent <a
69  * href="http://en.wikipedia.org/wiki/Internationalized_domain_name">IDNA
70  * Punycode-encoded</a> versions.
71  *
72  * @author Craig Berry
73  * @since 5.0
74  */
75 @Beta
76 @GwtCompatible(emulated = true)
77 public final class InternetDomainName {
78 
79   private static final CharMatcher DOTS_MATCHER =
80       CharMatcher.anyOf(".\u3002\uFF0E\uFF61");
81   private static final Splitter DOT_SPLITTER = Splitter.on('.');
82   private static final Joiner DOT_JOINER = Joiner.on('.');
83 
84   /**
85    * Value of {@link #publicSuffixIndex} which indicates that no public suffix
86    * was found.
87    */
88   private static final int NO_PUBLIC_SUFFIX_FOUND = -1;
89 
90   private static final String DOT_REGEX = "\\.";
91 
92   /**
93    * Maximum parts (labels) in a domain name. This value arises from
94    * the 255-octet limit described in
95    * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11 with
96    * the fact that the encoding of each part occupies at least two bytes
97    * (dot plus label externally, length byte plus label internally). Thus, if
98    * all labels have the minimum size of one byte, 127 of them will fit.
99    */
100   private static final int MAX_PARTS = 127;
101 
102   /**
103    * Maximum length of a full domain name, including separators, and
104    * leaving room for the root label. See
105    * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
106    */
107   private static final int MAX_LENGTH = 253;
108 
109   /**
110    * Maximum size of a single part of a domain name. See
111    * <a href="http://www.ietf.org/rfc/rfc2181.txt">RFC 2181</a> part 11.
112    */
113   private static final int MAX_DOMAIN_PART_LENGTH = 63;
114 
115   /**
116    * The full domain name, converted to lower case.
117    */
118   private final String name;
119 
120   /**
121    * The parts of the domain name, converted to lower case.
122    */
123   private final ImmutableList<String> parts;
124 
125   /**
126    * The index in the {@link #parts()} list at which the public suffix begins.
127    * For example, for the domain name {@code www.google.co.uk}, the value would
128    * be 2 (the index of the {@code co} part). The value is negative
129    * (specifically, {@link #NO_PUBLIC_SUFFIX_FOUND}) if no public suffix was
130    * found.
131    */
132   private final int publicSuffixIndex;
133 
134   /**
135    * Constructor used to implement {@link #from(String)}, and from subclasses.
136    */
InternetDomainName(String name)137   InternetDomainName(String name) {
138     // Normalize:
139     // * ASCII characters to lowercase
140     // * All dot-like characters to '.'
141     // * Strip trailing '.'
142 
143     name = Ascii.toLowerCase(DOTS_MATCHER.replaceFrom(name, '.'));
144 
145     if (name.endsWith(".")) {
146       name = name.substring(0, name.length() - 1);
147     }
148 
149     checkArgument(name.length() <= MAX_LENGTH, "Domain name too long: '%s':", name);
150     this.name = name;
151 
152     this.parts = ImmutableList.copyOf(DOT_SPLITTER.split(name));
153     checkArgument(parts.size() <= MAX_PARTS, "Domain has too many parts: '%s'", name);
154     checkArgument(validateSyntax(parts), "Not a valid domain name: '%s'", name);
155 
156     this.publicSuffixIndex = findPublicSuffix();
157   }
158 
159   /**
160    * Returns the index of the leftmost part of the public suffix, or -1 if not
161    * found. Note that the value defined as the "public suffix" may not be a
162    * public suffix according to {@link #isPublicSuffix()} if the domain ends
163    * with an excluded domain pattern such as {@code "nhs.uk"}.
164    */
findPublicSuffix()165   private int findPublicSuffix() {
166     final int partsSize = parts.size();
167 
168     for (int i = 0; i < partsSize; i++) {
169       String ancestorName = DOT_JOINER.join(parts.subList(i, partsSize));
170 
171       if (TldPatterns.EXACT.contains(ancestorName)) {
172         return i;
173       }
174 
175       // Excluded domains (e.g. !nhs.uk) use the next highest
176       // domain as the effective public suffix (e.g. uk).
177 
178       if (TldPatterns.EXCLUDED.contains(ancestorName)) {
179         return i + 1;
180       }
181 
182       if (matchesWildcardPublicSuffix(ancestorName)) {
183         return i;
184       }
185     }
186 
187     return NO_PUBLIC_SUFFIX_FOUND;
188   }
189 
190   /**
191    * A deprecated synonym for {@link #from(String)}.
192    *
193    * @param domain A domain name (not IP address)
194    * @throws IllegalArgumentException if {@code name} is not syntactically valid
195    *     according to {@link #isValidLenient}
196    * @since 8.0 (previously named {@code from})
197    * @deprecated Use {@link #from(String)}
198    */
199   @Deprecated
fromLenient(String domain)200   public static InternetDomainName fromLenient(String domain) {
201     return from(domain);
202   }
203 
204   /**
205    * Returns an instance of {@link InternetDomainName} after lenient
206    * validation.  Specifically, validation against <a
207    * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
208    * ("Internationalizing Domain Names in Applications") is skipped, while
209    * validation against <a
210    * href="http://www.ietf.org/rfc/rfc1035.txt">RFC 1035</a> is relaxed in
211    * the following ways:
212    * <ul>
213    * <li>Any part containing non-ASCII characters is considered valid.
214    * <li>Underscores ('_') are permitted wherever dashes ('-') are permitted.
215    * <li>Parts other than the final part may start with a digit.
216    * </ul>
217    *
218    *
219    * @param domain A domain name (not IP address)
220    * @throws IllegalArgumentException if {@code name} is not syntactically valid
221    *     according to {@link #isValid}
222    * @since 10.0 (previously named {@code fromLenient})
223    */
from(String domain)224   public static InternetDomainName from(String domain) {
225     return new InternetDomainName(checkNotNull(domain));
226   }
227 
228   /**
229    * Validation method used by {@from} to ensure that the domain name is
230    * syntactically valid according to RFC 1035.
231    *
232    * @return Is the domain name syntactically valid?
233    */
validateSyntax(List<String> parts)234   private static boolean validateSyntax(List<String> parts) {
235     final int lastIndex = parts.size() - 1;
236 
237     // Validate the last part specially, as it has different syntax rules.
238 
239     if (!validatePart(parts.get(lastIndex), true)) {
240       return false;
241     }
242 
243     for (int i = 0; i < lastIndex; i++) {
244       String part = parts.get(i);
245       if (!validatePart(part, false)) {
246         return false;
247       }
248     }
249 
250     return true;
251   }
252 
253   private static final CharMatcher DASH_MATCHER = CharMatcher.anyOf("-_");
254 
255   private static final CharMatcher PART_CHAR_MATCHER =
256       CharMatcher.JAVA_LETTER_OR_DIGIT.or(DASH_MATCHER);
257 
258   /**
259    * Helper method for {@link #validateSyntax(List)}. Validates that one part of
260    * a domain name is valid.
261    *
262    * @param part The domain name part to be validated
263    * @param isFinalPart Is this the final (rightmost) domain part?
264    * @return Whether the part is valid
265    */
validatePart(String part, boolean isFinalPart)266   private static boolean validatePart(String part, boolean isFinalPart) {
267 
268     // These tests could be collapsed into one big boolean expression, but
269     // they have been left as independent tests for clarity.
270 
271     if (part.length() < 1 || part.length() > MAX_DOMAIN_PART_LENGTH) {
272       return false;
273     }
274 
275     /*
276      * GWT claims to support java.lang.Character's char-classification methods,
277      * but it actually only works for ASCII. So for now, assume any non-ASCII
278      * characters are valid. The only place this seems to be documented is here:
279      * http://osdir.com/ml/GoogleWebToolkitContributors/2010-03/msg00178.html
280      *
281      * <p>ASCII characters in the part are expected to be valid per RFC 1035,
282      * with underscore also being allowed due to widespread practice.
283      */
284 
285     String asciiChars = CharMatcher.ASCII.retainFrom(part);
286 
287     if (!PART_CHAR_MATCHER.matchesAllOf(asciiChars)) {
288       return false;
289     }
290 
291     // No initial or final dashes or underscores.
292 
293     if (DASH_MATCHER.matches(part.charAt(0))
294         || DASH_MATCHER.matches(part.charAt(part.length() - 1))) {
295       return false;
296     }
297 
298     /*
299      * Note that we allow (in contravention of a strict interpretation of the
300      * relevant RFCs) domain parts other than the last may begin with a digit
301      * (for example, "3com.com"). It's important to disallow an initial digit in
302      * the last part; it's the only thing that stops an IPv4 numeric address
303      * like 127.0.0.1 from looking like a valid domain name.
304      */
305 
306     if (isFinalPart && CharMatcher.DIGIT.matches(part.charAt(0))) {
307       return false;
308     }
309 
310     return true;
311   }
312 
313   /**
314    * Returns the domain name, normalized to all lower case.
315    */
name()316   public String name() {
317     return name;
318   }
319 
320   /**
321    * Returns the individual components of this domain name, normalized to all
322    * lower case. For example, for the domain name {@code mail.google.com}, this
323    * method returns the list {@code ["mail", "google", "com"]}.
324    */
parts()325   public ImmutableList<String> parts() {
326     return parts;
327   }
328 
329   /**
330    * Indicates whether this domain name represents a <i>public suffix</i>, as
331    * defined by the Mozilla Foundation's
332    * <a href="http://publicsuffix.org/">Public Suffix List</a> (PSL). A public
333    * suffix is one under which Internet users can directly register names, such
334    * as {@code com}, {@code co.uk} or {@code pvt.k12.wy.us}. Examples of domain
335    * names that are <i>not</i> public suffixes include {@code google}, {@code
336    * google.com} and {@code foo.co.uk}.
337    *
338    * @return {@code true} if this domain name appears exactly on the public
339    *     suffix list
340    * @since 6.0
341    */
isPublicSuffix()342   public boolean isPublicSuffix() {
343     return publicSuffixIndex == 0;
344   }
345 
346   /**
347    * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
348    * public suffix}, including if it is a public suffix itself. For example,
349    * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
350    * {@code com}, but not for {@code google} or {@code google.foo}. This is
351    * the recommended method for determining whether a domain is potentially an
352    * addressable host.
353    *
354    * @since 6.0
355    */
hasPublicSuffix()356   public boolean hasPublicSuffix() {
357     return publicSuffixIndex != NO_PUBLIC_SUFFIX_FOUND;
358   }
359 
360   /**
361    * Returns the {@linkplain #isPublicSuffix() public suffix} portion of the
362    * domain name, or {@code null} if no public suffix is present.
363    *
364    * @since 6.0
365    */
publicSuffix()366   public InternetDomainName publicSuffix() {
367     return hasPublicSuffix() ? ancestor(publicSuffixIndex) : null;
368   }
369 
370   /**
371    * Indicates whether this domain name ends in a {@linkplain #isPublicSuffix()
372    * public suffix}, while not being a public suffix itself. For example,
373    * returns {@code true} for {@code www.google.com}, {@code foo.co.uk} and
374    * {@code bar.ca.us}, but not for {@code google}, {@code com}, or {@code
375    * google.foo}.
376    *
377    * <p><b>Warning:</b> a {@code false} result from this method does not imply
378    * that the domain does not represent an addressable host, as many public
379    * suffixes are also addressable hosts. Use {@link #hasPublicSuffix()} for
380    * that test.
381    *
382    * <p>This method can be used to determine whether it will probably be
383    * possible to set cookies on the domain, though even that depends on
384    * individual browsers' implementations of cookie controls. See
385    * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
386    *
387    * @since 6.0
388    */
isUnderPublicSuffix()389   public boolean isUnderPublicSuffix() {
390     return publicSuffixIndex > 0;
391   }
392 
393   /**
394    * Indicates whether this domain name is composed of exactly one subdomain
395    * component followed by a {@linkplain #isPublicSuffix() public suffix}. For
396    * example, returns {@code true} for {@code google.com} and {@code foo.co.uk},
397    * but not for {@code www.google.com} or {@code co.uk}.
398    *
399    * <p><b>Warning:</b> A {@code true} result from this method does not imply
400    * that the domain is at the highest level which is addressable as a host, as
401    * many public suffixes are also addressable hosts. For example, the domain
402    * {@code bar.uk.com} has a public suffix of {@code uk.com}, so it would
403    * return {@code true} from this method. But {@code uk.com} is itself an
404    * addressable host.
405    *
406    * <p>This method can be used to determine whether a domain is probably the
407    * highest level for which cookies may be set, though even that depends on
408    * individual browsers' implementations of cookie controls. See
409    * <a href="http://www.ietf.org/rfc/rfc2109.txt">RFC 2109</a> for details.
410    *
411    * @since 6.0
412    */
isTopPrivateDomain()413   public boolean isTopPrivateDomain() {
414     return publicSuffixIndex == 1;
415   }
416 
417   /**
418    * Returns the portion of this domain name that is one level beneath the
419    * public suffix. For example, for {@code x.adwords.google.co.uk} it returns
420    * {@code google.co.uk}, since {@code co.uk} is a public suffix.
421    *
422    * <p>If {@link #isTopPrivateDomain()} is true, the current domain name
423    * instance is returned.
424    *
425    * <p>This method should not be used to determine the topmost parent domain
426    * which is addressable as a host, as many public suffixes are also
427    * addressable hosts. For example, the domain {@code foo.bar.uk.com} has
428    * a public suffix of {@code uk.com}, so it would return {@code bar.uk.com}
429    * from this method. But {@code uk.com} is itself an addressable host.
430    *
431    * <p>This method can be used to determine the probable highest level parent
432    * domain for which cookies may be set, though even that depends on individual
433    * browsers' implementations of cookie controls.
434    *
435    * @throws IllegalStateException if this domain does not end with a
436    *     public suffix
437    * @since 6.0
438    */
topPrivateDomain()439   public InternetDomainName topPrivateDomain() {
440     if (isTopPrivateDomain()) {
441       return this;
442     }
443     checkState(isUnderPublicSuffix(), "Not under a public suffix: %s", name);
444     return ancestor(publicSuffixIndex - 1);
445   }
446 
447   /**
448    * Indicates whether this domain is composed of two or more parts.
449    */
hasParent()450   public boolean hasParent() {
451     return parts.size() > 1;
452   }
453 
454   /**
455    * Returns an {@code InternetDomainName} that is the immediate ancestor of
456    * this one; that is, the current domain with the leftmost part removed. For
457    * example, the parent of {@code www.google.com} is {@code google.com}.
458    *
459    * @throws IllegalStateException if the domain has no parent, as determined
460    *     by {@link #hasParent}
461    */
parent()462   public InternetDomainName parent() {
463     checkState(hasParent(), "Domain '%s' has no parent", name);
464     return ancestor(1);
465   }
466 
467   /**
468    * Returns the ancestor of the current domain at the given number of levels
469    * "higher" (rightward) in the subdomain list. The number of levels must be
470    * non-negative, and less than {@code N-1}, where {@code N} is the number of
471    * parts in the domain.
472    *
473    * <p>TODO: Reasonable candidate for addition to public API.
474    */
ancestor(int levels)475   private InternetDomainName ancestor(int levels) {
476     return from(DOT_JOINER.join(parts.subList(levels, parts.size())));
477   }
478 
479   /**
480    * Creates and returns a new {@code InternetDomainName} by prepending the
481    * argument and a dot to the current name. For example, {@code
482    * InternetDomainName.from("foo.com").child("www.bar")} returns a new
483    * {@code InternetDomainName} with the value {@code www.bar.foo.com}. Only
484    * lenient validation is performed, as described {@link #from(String) here}.
485    *
486    * @throws NullPointerException if leftParts is null
487    * @throws IllegalArgumentException if the resulting name is not valid
488    */
child(String leftParts)489   public InternetDomainName child(String leftParts) {
490     return from(checkNotNull(leftParts) + "." + name);
491   }
492 
493   /**
494    * A deprecated synonym for {@link #isValid(String)}.
495    *
496    * @since 8.0 (previously named {@code isValid})
497    * @deprecated Use {@link #isValid(String)} instead
498    */
499   @Deprecated
isValidLenient(String name)500   public static boolean isValidLenient(String name) {
501     return isValid(name);
502   }
503 
504   /**
505    * Indicates whether the argument is a syntactically valid domain name using
506    * lenient validation. Specifically, validation against <a
507    * href="http://www.ietf.org/rfc/rfc3490.txt">RFC 3490</a>
508    * ("Internationalizing Domain Names in Applications") is skipped.
509    *
510    * <p>The following two code snippets are equivalent:
511    *
512    * <pre>   {@code
513    *
514    *   domainName = InternetDomainName.isValid(name)
515    *       ? InternetDomainName.from(name)
516    *       : DEFAULT_DOMAIN;
517    *   }</pre>
518    *
519    * <pre>   {@code
520    *
521    *   try {
522    *     domainName = InternetDomainName.from(name);
523    *   } catch (IllegalArgumentException e) {
524    *     domainName = DEFAULT_DOMAIN;
525    *   }}</pre>
526    *
527    * @since 8.0 (previously named {@code isValidLenient})
528    */
isValid(String name)529   public static boolean isValid(String name) {
530     try {
531       from(name);
532       return true;
533     } catch (IllegalArgumentException e) {
534       return false;
535     }
536   }
537 
538   /**
539    * Does the domain name match one of the "wildcard" patterns (e.g.
540    * {@code "*.ar"})?
541    */
matchesWildcardPublicSuffix(String domain)542   private static boolean matchesWildcardPublicSuffix(String domain) {
543     final String[] pieces = domain.split(DOT_REGEX, 2);
544     return pieces.length == 2 && TldPatterns.UNDER.contains(pieces[1]);
545   }
546 
547   // TODO: specify this to return the same as name(); remove name()
548   @Override
toString()549   public String toString() {
550     return Objects.toStringHelper(this).add("name", name).toString();
551   }
552 
553   /**
554    * Equality testing is based on the text supplied by the caller,
555    * after normalization as described in the class documentation. For
556    * example, a non-ASCII Unicode domain name and the Punycode version
557    * of the same domain name would not be considered equal.
558    *
559    */
560   @Override
equals(@ullable Object object)561   public boolean equals(@Nullable Object object) {
562     if (object == this) {
563       return true;
564     }
565 
566     if (object instanceof InternetDomainName) {
567       InternetDomainName that = (InternetDomainName) object;
568       return this.name.equals(that.name);
569     }
570 
571     return false;
572   }
573 
574   @Override
hashCode()575   public int hashCode() {
576     return name.hashCode();
577   }
578 }
579