• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013-2014 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8 
9 //! [*Unicode IDNA Compatibility Processing*
10 //! (Unicode Technical Standard #46)](http://www.unicode.org/reports/tr46/)
11 
12 use self::Mapping::*;
13 use crate::punycode;
14 use std::{error::Error as StdError, fmt};
15 use unicode_bidi::{bidi_class, BidiClass};
16 use unicode_normalization::char::is_combining_mark;
17 use unicode_normalization::{is_nfc, UnicodeNormalization};
18 
19 include!("uts46_mapping_table.rs");
20 
21 const PUNYCODE_PREFIX: &str = "xn--";
22 
23 #[derive(Debug)]
24 struct StringTableSlice {
25     // Store these as separate fields so the structure will have an
26     // alignment of 1 and thus pack better into the Mapping enum, below.
27     byte_start_lo: u8,
28     byte_start_hi: u8,
29     byte_len: u8,
30 }
31 
decode_slice(slice: &StringTableSlice) -> &'static str32 fn decode_slice(slice: &StringTableSlice) -> &'static str {
33     let lo = slice.byte_start_lo as usize;
34     let hi = slice.byte_start_hi as usize;
35     let start = (hi << 8) | lo;
36     let len = slice.byte_len as usize;
37     &STRING_TABLE[start..(start + len)]
38 }
39 
40 #[repr(u8)]
41 #[derive(Debug)]
42 enum Mapping {
43     Valid,
44     Ignored,
45     Mapped(StringTableSlice),
46     Deviation(StringTableSlice),
47     Disallowed,
48     DisallowedStd3Valid,
49     DisallowedStd3Mapped(StringTableSlice),
50     DisallowedIdna2008,
51 }
52 
find_char(codepoint: char) -> &'static Mapping53 fn find_char(codepoint: char) -> &'static Mapping {
54     let idx = match TABLE.binary_search_by_key(&codepoint, |&val| val.0) {
55         Ok(idx) => idx,
56         Err(idx) => idx - 1,
57     };
58 
59     const SINGLE_MARKER: u16 = 1 << 15;
60 
61     let (base, x) = TABLE[idx];
62     let single = (x & SINGLE_MARKER) != 0;
63     let offset = !SINGLE_MARKER & x;
64 
65     if single {
66         &MAPPING_TABLE[offset as usize]
67     } else {
68         &MAPPING_TABLE[(offset + (codepoint as u16 - base as u16)) as usize]
69     }
70 }
71 
72 struct Mapper<'a> {
73     chars: std::str::Chars<'a>,
74     config: Config,
75     errors: &'a mut Errors,
76     slice: Option<std::str::Chars<'static>>,
77 }
78 
79 impl<'a> Iterator for Mapper<'a> {
80     type Item = char;
81 
next(&mut self) -> Option<Self::Item>82     fn next(&mut self) -> Option<Self::Item> {
83         loop {
84             if let Some(s) = &mut self.slice {
85                 match s.next() {
86                     Some(c) => return Some(c),
87                     None => {
88                         self.slice = None;
89                     }
90                 }
91             }
92 
93             let codepoint = self.chars.next()?;
94             if let '.' | '-' | 'a'..='z' | '0'..='9' = codepoint {
95                 return Some(codepoint);
96             }
97 
98             return Some(match *find_char(codepoint) {
99                 Mapping::Valid => codepoint,
100                 Mapping::Ignored => continue,
101                 Mapping::Mapped(ref slice) => {
102                     self.slice = Some(decode_slice(slice).chars());
103                     continue;
104                 }
105                 Mapping::Deviation(ref slice) => {
106                     if self.config.transitional_processing {
107                         self.slice = Some(decode_slice(slice).chars());
108                         continue;
109                     } else {
110                         codepoint
111                     }
112                 }
113                 Mapping::Disallowed => {
114                     self.errors.disallowed_character = true;
115                     codepoint
116                 }
117                 Mapping::DisallowedStd3Valid => {
118                     if self.config.use_std3_ascii_rules {
119                         self.errors.disallowed_by_std3_ascii_rules = true;
120                     };
121                     codepoint
122                 }
123                 Mapping::DisallowedStd3Mapped(ref slice) => {
124                     if self.config.use_std3_ascii_rules {
125                         self.errors.disallowed_mapped_in_std3 = true;
126                     };
127                     self.slice = Some(decode_slice(slice).chars());
128                     continue;
129                 }
130                 Mapping::DisallowedIdna2008 => {
131                     if self.config.use_idna_2008_rules {
132                         self.errors.disallowed_in_idna_2008 = true;
133                     }
134                     codepoint
135                 }
136             });
137         }
138     }
139 }
140 
141 // http://tools.ietf.org/html/rfc5893#section-2
passes_bidi(label: &str, is_bidi_domain: bool) -> bool142 fn passes_bidi(label: &str, is_bidi_domain: bool) -> bool {
143     // Rule 0: Bidi Rules apply to Bidi Domain Names: a name with at least one RTL label.  A label
144     // is RTL if it contains at least one character of bidi class R, AL or AN.
145     if !is_bidi_domain {
146         return true;
147     }
148 
149     let mut chars = label.chars();
150     let first_char_class = match chars.next() {
151         Some(c) => bidi_class(c),
152         None => return true, // empty string
153     };
154 
155     match first_char_class {
156         // LTR label
157         BidiClass::L => {
158             // Rule 5
159             for c in chars.by_ref() {
160                 if !matches!(
161                     bidi_class(c),
162                     BidiClass::L
163                         | BidiClass::EN
164                         | BidiClass::ES
165                         | BidiClass::CS
166                         | BidiClass::ET
167                         | BidiClass::ON
168                         | BidiClass::BN
169                         | BidiClass::NSM
170                 ) {
171                     return false;
172                 }
173             }
174 
175             // Rule 6
176             // must end in L or EN followed by 0 or more NSM
177             let mut rev_chars = label.chars().rev();
178             let mut last_non_nsm = rev_chars.next();
179             loop {
180                 match last_non_nsm {
181                     Some(c) if bidi_class(c) == BidiClass::NSM => {
182                         last_non_nsm = rev_chars.next();
183                         continue;
184                     }
185                     _ => {
186                         break;
187                     }
188                 }
189             }
190             match last_non_nsm {
191                 Some(c) if bidi_class(c) == BidiClass::L || bidi_class(c) == BidiClass::EN => {}
192                 Some(_) => {
193                     return false;
194                 }
195                 _ => {}
196             }
197         }
198 
199         // RTL label
200         BidiClass::R | BidiClass::AL => {
201             let mut found_en = false;
202             let mut found_an = false;
203 
204             // Rule 2
205             for c in chars {
206                 let char_class = bidi_class(c);
207                 if char_class == BidiClass::EN {
208                     found_en = true;
209                 } else if char_class == BidiClass::AN {
210                     found_an = true;
211                 }
212 
213                 if !matches!(
214                     char_class,
215                     BidiClass::R
216                         | BidiClass::AL
217                         | BidiClass::AN
218                         | BidiClass::EN
219                         | BidiClass::ES
220                         | BidiClass::CS
221                         | BidiClass::ET
222                         | BidiClass::ON
223                         | BidiClass::BN
224                         | BidiClass::NSM
225                 ) {
226                     return false;
227                 }
228             }
229             // Rule 3
230             let mut rev_chars = label.chars().rev();
231             let mut last = rev_chars.next();
232             loop {
233                 // must end in L or EN followed by 0 or more NSM
234                 match last {
235                     Some(c) if bidi_class(c) == BidiClass::NSM => {
236                         last = rev_chars.next();
237                         continue;
238                     }
239                     _ => {
240                         break;
241                     }
242                 }
243             }
244             match last {
245                 Some(c)
246                     if matches!(
247                         bidi_class(c),
248                         BidiClass::R | BidiClass::AL | BidiClass::EN | BidiClass::AN
249                     ) => {}
250                 _ => {
251                     return false;
252                 }
253             }
254 
255             // Rule 4
256             if found_an && found_en {
257                 return false;
258             }
259         }
260 
261         // Rule 1: Should start with L or R/AL
262         _ => {
263             return false;
264         }
265     }
266 
267     true
268 }
269 
270 /// Check the validity criteria for the given label
271 ///
272 /// V1 (NFC) and V8 (Bidi) are checked inside `processing()` to prevent doing duplicate work.
273 ///
274 /// http://www.unicode.org/reports/tr46/#Validity_Criteria
check_validity(label: &str, config: Config, errors: &mut Errors)275 fn check_validity(label: &str, config: Config, errors: &mut Errors) {
276     let first_char = label.chars().next();
277     if first_char == None {
278         // Empty string, pass
279         return;
280     }
281 
282     // V2: No U+002D HYPHEN-MINUS in both third and fourth positions.
283     //
284     // NOTE: Spec says that the label must not contain a HYPHEN-MINUS character in both the
285     // third and fourth positions. But nobody follows this criteria. See the spec issue below:
286     // https://github.com/whatwg/url/issues/53
287 
288     // V3: neither begin nor end with a U+002D HYPHEN-MINUS
289     if config.check_hyphens && (label.starts_with('-') || label.ends_with('-')) {
290         errors.check_hyphens = true;
291         return;
292     }
293 
294     // V4: not contain a U+002E FULL STOP
295     //
296     // Here, label can't contain '.' since the input is from .split('.')
297 
298     // V5: not begin with a GC=Mark
299     if is_combining_mark(first_char.unwrap()) {
300         errors.start_combining_mark = true;
301         return;
302     }
303 
304     // V6: Check against Mapping Table
305     if label.chars().any(|c| match *find_char(c) {
306         Mapping::Valid | Mapping::DisallowedIdna2008 => false,
307         Mapping::Deviation(_) => config.transitional_processing,
308         Mapping::DisallowedStd3Valid => config.use_std3_ascii_rules,
309         _ => true,
310     }) {
311         errors.invalid_mapping = true;
312     }
313 
314     // V7: ContextJ rules
315     //
316     // TODO: Implement rules and add *CheckJoiners* flag.
317 
318     // V8: Bidi rules are checked inside `processing()`
319 }
320 
/// Detects simple domains that can skip full UTS #46 processing.
///
/// A domain is simple when it is non-empty and every `.`-separated label
/// - consists only of ASCII lowercase letters, ASCII digits and `-`,
/// - does not start with the Punycode prefix `xn--`, and
/// - neither starts nor ends with `-`.
///
/// Empty labels (consecutive or trailing dots) are accepted here; length rules
/// are enforced separately by the DNS length verification.
///
/// The previous implementation tracked the prefix and edge hyphens but then
/// rejected *every* `-` in its final character test, so hyphenated domains such
/// as `a-b.example` always fell through to the slow path.
fn is_simple(domain: &str) -> bool {
    // Same text as the file-level `PUNYCODE_PREFIX`, kept local so the check is
    // self-contained.
    const ACE_PREFIX: &str = "xn--";

    !domain.is_empty()
        && domain.split('.').all(|label| {
            !label.starts_with(ACE_PREFIX)
                && !label.starts_with('-')
                && !label.ends_with('-')
                // Any byte of a non-ASCII character fails this test as well.
                && label
                    .bytes()
                    .all(|b| b.is_ascii_lowercase() || b.is_ascii_digit() || b == b'-')
        })
}
355 
356 /// http://www.unicode.org/reports/tr46/#Processing
processing( domain: &str, config: Config, normalized: &mut String, output: &mut String, ) -> Errors357 fn processing(
358     domain: &str,
359     config: Config,
360     normalized: &mut String,
361     output: &mut String,
362 ) -> Errors {
363     normalized.clear();
364     let mut errors = Errors::default();
365     let offset = output.len();
366 
367     let iter = Mapper {
368         chars: domain.chars(),
369         config,
370         errors: &mut errors,
371         slice: None,
372     };
373 
374     normalized.extend(iter.nfc());
375 
376     let mut decoder = punycode::Decoder::default();
377     let non_transitional = config.transitional_processing(false);
378     let (mut first, mut has_bidi_labels) = (true, false);
379     for label in normalized.split('.') {
380         if !first {
381             output.push('.');
382         }
383         first = false;
384         if let Some(remainder) = label.strip_prefix(PUNYCODE_PREFIX) {
385             match decoder.decode(remainder) {
386                 Ok(decode) => {
387                     let start = output.len();
388                     output.extend(decode);
389                     let decoded_label = &output[start..];
390 
391                     if !has_bidi_labels {
392                         has_bidi_labels |= is_bidi_domain(decoded_label);
393                     }
394 
395                     if !errors.is_err() {
396                         if !is_nfc(decoded_label) {
397                             errors.nfc = true;
398                         } else {
399                             check_validity(decoded_label, non_transitional, &mut errors);
400                         }
401                     }
402                 }
403                 Err(()) => {
404                     has_bidi_labels = true;
405                     errors.punycode = true;
406                 }
407             }
408         } else {
409             if !has_bidi_labels {
410                 has_bidi_labels |= is_bidi_domain(label);
411             }
412 
413             // `normalized` is already `NFC` so we can skip that check
414             check_validity(label, config, &mut errors);
415             output.push_str(label)
416         }
417     }
418 
419     for label in output[offset..].split('.') {
420         // V8: Bidi rules
421         //
422         // TODO: Add *CheckBidi* flag
423         if !passes_bidi(label, has_bidi_labels) {
424             errors.check_bidi = true;
425             break;
426         }
427     }
428 
429     errors
430 }
431 
432 #[derive(Default)]
433 pub struct Idna {
434     config: Config,
435     normalized: String,
436     output: String,
437 }
438 
439 impl Idna {
new(config: Config) -> Self440     pub fn new(config: Config) -> Self {
441         Self {
442             config,
443             normalized: String::new(),
444             output: String::new(),
445         }
446     }
447 
to_ascii_inner(&mut self, domain: &str, out: &mut String) -> Errors448     pub fn to_ascii_inner(&mut self, domain: &str, out: &mut String) -> Errors {
449         if is_simple(domain) {
450             out.push_str(domain);
451             return Errors::default();
452         }
453         let mut errors = processing(domain, self.config, &mut self.normalized, out);
454         self.output = std::mem::replace(out, String::with_capacity(out.len()));
455         let mut first = true;
456         for label in self.output.split('.') {
457             if !first {
458                 out.push('.');
459             }
460             first = false;
461 
462             if label.is_ascii() {
463                 out.push_str(label);
464             } else {
465                 let offset = out.len();
466                 out.push_str(PUNYCODE_PREFIX);
467                 if let Err(()) = punycode::encode_into(label.chars(), out) {
468                     errors.punycode = true;
469                     out.truncate(offset);
470                 }
471             }
472         }
473         errors
474     }
475 
476     /// http://www.unicode.org/reports/tr46/#ToASCII
477     #[allow(clippy::wrong_self_convention)]
to_ascii<'a>(&'a mut self, domain: &str, out: &mut String) -> Result<(), Errors>478     pub fn to_ascii<'a>(&'a mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
479         let mut errors = self.to_ascii_inner(domain, out);
480 
481         if self.config.verify_dns_length {
482             let domain = if out.ends_with('.') {
483                 &out[..out.len() - 1]
484             } else {
485                 &*out
486             };
487             if domain.is_empty() || domain.split('.').any(|label| label.is_empty()) {
488                 errors.too_short_for_dns = true;
489             }
490             if domain.len() > 253 || domain.split('.').any(|label| label.len() > 63) {
491                 errors.too_long_for_dns = true;
492             }
493         }
494 
495         errors.into()
496     }
497 
498     /// http://www.unicode.org/reports/tr46/#ToUnicode
499     #[allow(clippy::wrong_self_convention)]
to_unicode<'a>(&'a mut self, domain: &str, out: &mut String) -> Result<(), Errors>500     pub fn to_unicode<'a>(&'a mut self, domain: &str, out: &mut String) -> Result<(), Errors> {
501         if is_simple(domain) {
502             out.push_str(domain);
503             return Errors::default().into();
504         }
505         processing(domain, self.config, &mut self.normalized, out).into()
506     }
507 }
508 
509 #[derive(Clone, Copy)]
510 pub struct Config {
511     use_std3_ascii_rules: bool,
512     transitional_processing: bool,
513     verify_dns_length: bool,
514     check_hyphens: bool,
515     use_idna_2008_rules: bool,
516 }
517 
518 /// The defaults are that of https://url.spec.whatwg.org/#idna
519 impl Default for Config {
default() -> Self520     fn default() -> Self {
521         Config {
522             use_std3_ascii_rules: false,
523             transitional_processing: false,
524             check_hyphens: false,
525             // check_bidi: true,
526             // check_joiners: true,
527 
528             // Only use for to_ascii, not to_unicode
529             verify_dns_length: false,
530             use_idna_2008_rules: false,
531         }
532     }
533 }
534 
535 impl Config {
536     #[inline]
use_std3_ascii_rules(mut self, value: bool) -> Self537     pub fn use_std3_ascii_rules(mut self, value: bool) -> Self {
538         self.use_std3_ascii_rules = value;
539         self
540     }
541 
542     #[inline]
transitional_processing(mut self, value: bool) -> Self543     pub fn transitional_processing(mut self, value: bool) -> Self {
544         self.transitional_processing = value;
545         self
546     }
547 
548     #[inline]
verify_dns_length(mut self, value: bool) -> Self549     pub fn verify_dns_length(mut self, value: bool) -> Self {
550         self.verify_dns_length = value;
551         self
552     }
553 
554     #[inline]
check_hyphens(mut self, value: bool) -> Self555     pub fn check_hyphens(mut self, value: bool) -> Self {
556         self.check_hyphens = value;
557         self
558     }
559 
560     #[inline]
use_idna_2008_rules(mut self, value: bool) -> Self561     pub fn use_idna_2008_rules(mut self, value: bool) -> Self {
562         self.use_idna_2008_rules = value;
563         self
564     }
565 
566     /// http://www.unicode.org/reports/tr46/#ToASCII
to_ascii(self, domain: &str) -> Result<String, Errors>567     pub fn to_ascii(self, domain: &str) -> Result<String, Errors> {
568         let mut result = String::with_capacity(domain.len());
569         let mut codec = Idna::new(self);
570         codec.to_ascii(domain, &mut result).map(|()| result)
571     }
572 
573     /// http://www.unicode.org/reports/tr46/#ToUnicode
to_unicode(self, domain: &str) -> (String, Result<(), Errors>)574     pub fn to_unicode(self, domain: &str) -> (String, Result<(), Errors>) {
575         let mut codec = Idna::new(self);
576         let mut out = String::with_capacity(domain.len());
577         let result = codec.to_unicode(domain, &mut out);
578         (out, result)
579     }
580 }
581 
is_bidi_domain(s: &str) -> bool582 fn is_bidi_domain(s: &str) -> bool {
583     for c in s.chars() {
584         if c.is_ascii_graphic() {
585             continue;
586         }
587         match bidi_class(c) {
588             BidiClass::R | BidiClass::AL | BidiClass::AN => return true,
589             _ => {}
590         }
591     }
592     false
593 }
594 
/// Errors recorded during UTS #46 processing.
///
/// This is opaque for now, indicating what types of errors have been encountered at least once.
/// More details may be exposed in the future.
#[derive(Default)]
pub struct Errors {
    punycode: bool,
    check_hyphens: bool,
    check_bidi: bool,
    start_combining_mark: bool,
    invalid_mapping: bool,
    nfc: bool,
    disallowed_by_std3_ascii_rules: bool,
    disallowed_mapped_in_std3: bool,
    disallowed_character: bool,
    too_long_for_dns: bool,
    too_short_for_dns: bool,
    disallowed_in_idna_2008: bool,
}

impl Errors {
    /// Every flag together with its name, in declaration order.
    fn flags(&self) -> [(&'static str, bool); 12] {
        // Destructure exhaustively so that adding a field without listing it
        // here fails to compile.
        let Errors {
            punycode,
            check_hyphens,
            check_bidi,
            start_combining_mark,
            invalid_mapping,
            nfc,
            disallowed_by_std3_ascii_rules,
            disallowed_mapped_in_std3,
            disallowed_character,
            too_long_for_dns,
            too_short_for_dns,
            disallowed_in_idna_2008,
        } = *self;

        [
            ("punycode", punycode),
            ("check_hyphens", check_hyphens),
            ("check_bidi", check_bidi),
            ("start_combining_mark", start_combining_mark),
            ("invalid_mapping", invalid_mapping),
            ("nfc", nfc),
            (
                "disallowed_by_std3_ascii_rules",
                disallowed_by_std3_ascii_rules,
            ),
            ("disallowed_mapped_in_std3", disallowed_mapped_in_std3),
            ("disallowed_character", disallowed_character),
            ("too_long_for_dns", too_long_for_dns),
            ("too_short_for_dns", too_short_for_dns),
            ("disallowed_in_idna_2008", disallowed_in_idna_2008),
        ]
    }

    /// Returns `true` if at least one error flag is set.
    fn is_err(&self) -> bool {
        self.flags().iter().any(|&(_, set)| set)
    }
}

impl fmt::Debug for Errors {
    /// Writes the names of the set flags, e.g. `Errors { punycode, nfc }`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let flags = self.flags();

        f.write_str("Errors { ")?;
        let mut wrote_any = false;
        for &(name, _) in flags.iter().filter(|&&(_, set)| set) {
            if wrote_any {
                f.write_str(", ")?;
            }
            f.write_str(name)?;
            wrote_any = true;
        }

        f.write_str(if wrote_any { " }" } else { "}" })
    }
}

impl From<Errors> for Result<(), Errors> {
    /// `Ok(())` when no flag is set, otherwise `Err` carrying the flags.
    fn from(e: Errors) -> Result<(), Errors> {
        if e.is_err() {
            Err(e)
        } else {
            Ok(())
        }
    }
}

impl StdError for Errors {}

impl fmt::Display for Errors {
    /// Same text as the `Debug` representation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Debug::fmt(self, f)
    }
}
718 
#[cfg(test)]
mod tests {
    use super::{find_char, Mapping};

    /// `-`, `.`, the ASCII digits and the ASCII lowercase letters must all be
    /// `Valid` in the mapping table.
    #[test]
    fn mapping_fast_path() {
        let fast_path_chars = "-.".chars().chain('0'..='9').chain('a'..='z');
        for c in fast_path_chars {
            assert_matches!(find_char(c), &Mapping::Valid);
        }
    }
}
738