• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 //! Unicode Extensions provide a mechanism to extend the [`LanguageIdentifier`] with
6 //! additional bits of information - a combination of a [`LanguageIdentifier`] and [`Extensions`]
7 //! is called [`Locale`].
8 //!
9 //! There are four types of extensions:
10 //!
11 //!  * [`Unicode Extensions`] - marked as `u`.
12 //!  * [`Transform Extensions`] - marked as `t`.
13 //!  * [`Private Use Extensions`] - marked as `x`.
//!  * [`Other Extensions`] - marked as any `a-z` except `u`, `t` and `x`.
15 //!
//! One can think of extensions as a bag of extra information on top of the four basic [`subtags`].
17 //!
18 //! Notice: `Other` extension type is currently not supported.
19 //!
20 //! # Examples
21 //!
22 //! ```
23 //! use icu::locale::extensions::unicode::{Key, Value};
24 //! use icu::locale::Locale;
25 //!
26 //! let loc: Locale = "en-US-u-ca-buddhist-t-en-us-h0-hybrid-x-foo"
27 //!     .parse()
28 //!     .expect("Failed to parse.");
29 //!
30 //! assert_eq!(loc.id.language, "en".parse().unwrap());
31 //! assert_eq!(loc.id.script, None);
32 //! assert_eq!(loc.id.region, Some("US".parse().unwrap()));
33 //! assert_eq!(loc.id.variants.len(), 0);
34 //!
35 //! let key: Key = "ca".parse().expect("Parsing key failed.");
36 //! let value: Value = "buddhist".parse().expect("Parsing value failed.");
37 //! assert_eq!(loc.extensions.unicode.keywords.get(&key), Some(&value));
38 //! ```
39 //!
40 //! # Syntactic vs Semantic Extension Handling
41 //!
42 //! This module is useful when you need to work with Locale extensions at a syntactic level,
43 //! perhaps for parsing or generating locale identifiers that include any syntactically valid
44 //! extensions.
45 //! For handling and validating known CLDR values with semantic meaning, see the
46 //! [`crate::preferences::extensions`] module.
47 //!
48 //! [`LanguageIdentifier`]: super::LanguageIdentifier
49 //! [`Locale`]: super::Locale
50 //! [`subtags`]: super::subtags
51 //! [`Other Extensions`]: other
52 //! [`Private Use Extensions`]: private
53 //! [`Transform Extensions`]: transform
54 //! [`Unicode Extensions`]: unicode
55 pub mod other;
56 pub mod private;
57 pub mod transform;
58 pub mod unicode;
59 
60 use core::cmp::Ordering;
61 
62 use other::Other;
63 use private::{Private, PRIVATE_EXT_CHAR};
64 use transform::{Transform, TRANSFORM_EXT_CHAR};
65 use unicode::{Unicode, UNICODE_EXT_CHAR};
66 
67 #[cfg(feature = "alloc")]
68 use alloc::vec::Vec;
69 
70 use crate::parser::ParseError;
71 #[cfg(feature = "alloc")]
72 use crate::parser::SubtagIterator;
73 use crate::subtags;
74 
/// Identifies which kind of extension a singleton subtag introduces.
///
/// The derived ordering follows the declaration order of the variants
/// (`Transform`, `Unicode`, `Private`, then `Other` ordered by its byte).
#[derive(Clone, Copy, Debug, Eq, Hash, Ord, PartialEq, PartialOrd)]
#[non_exhaustive]
pub enum ExtensionType {
    /// The transform extension, introduced by the singleton `t`.
    Transform,
    /// The Unicode extension, introduced by the singleton `u`.
    Unicode,
    /// The private-use extension, introduced by the singleton `x`.
    Private,
    /// Any other extension; the payload is the singleton byte that introduced it.
    Other(u8),
}
88 
89 impl ExtensionType {
90     #[allow(dead_code)]
try_from_byte_slice(key: &[u8]) -> Result<Self, ParseError>91     pub(crate) const fn try_from_byte_slice(key: &[u8]) -> Result<Self, ParseError> {
92         if let [b] = key {
93             Self::try_from_byte(*b)
94         } else {
95             Err(ParseError::InvalidExtension)
96         }
97     }
98 
try_from_byte(key: u8) -> Result<Self, ParseError>99     pub(crate) const fn try_from_byte(key: u8) -> Result<Self, ParseError> {
100         let key = key.to_ascii_lowercase();
101         match key as char {
102             UNICODE_EXT_CHAR => Ok(Self::Unicode),
103             TRANSFORM_EXT_CHAR => Ok(Self::Transform),
104             PRIVATE_EXT_CHAR => Ok(Self::Private),
105             'a'..='z' => Ok(Self::Other(key)),
106             _ => Err(ParseError::InvalidExtension),
107         }
108     }
109 
try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError>110     pub(crate) const fn try_from_utf8(code_units: &[u8]) -> Result<Self, ParseError> {
111         let &[first] = code_units else {
112             return Err(ParseError::InvalidExtension);
113         };
114 
115         Self::try_from_byte(first)
116     }
117 }
118 
119 /// A map of extensions associated with a given [`Locale`](crate::Locale).
120 #[derive(Debug, Default, PartialEq, Eq, Clone, Hash)]
121 #[non_exhaustive]
122 pub struct Extensions {
123     /// A representation of the data for a Unicode extension, when present in the locale identifier.
124     pub unicode: Unicode,
125     /// A representation of the data for a transform extension, when present in the locale identifier.
126     pub transform: Transform,
127     /// A representation of the data for a private-use extension, when present in the locale identifier.
128     pub private: Private,
129     /// A sequence of any other extensions that are present in the locale identifier but are not formally
130     /// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`],
131     /// and [`Private`] are.
132     #[cfg(feature = "alloc")]
133     pub other: Vec<Other>,
134     /// A sequence of any other extensions that are present in the locale identifier but are not formally
135     /// [defined](https://unicode.org/reports/tr35/) and represented explicitly as [`Unicode`], [`Transform`],
136     /// and [`Private`] are.
137     #[cfg(not(feature = "alloc"))]
138     pub other: &'static [Other],
139 }
140 
141 impl Extensions {
142     /// Returns a new empty map of extensions. Same as [`default()`](Default::default()), but is `const`.
143     ///
144     /// # Examples
145     ///
146     /// ```
147     /// use icu::locale::extensions::Extensions;
148     ///
149     /// assert_eq!(Extensions::new(), Extensions::default());
150     /// ```
151     #[inline]
new() -> Self152     pub const fn new() -> Self {
153         Self {
154             unicode: Unicode::new(),
155             transform: Transform::new(),
156             private: Private::new(),
157             #[cfg(feature = "alloc")]
158             other: Vec::new(),
159             #[cfg(not(feature = "alloc"))]
160             other: &[],
161         }
162     }
163 
164     /// Function to create a new map of extensions containing exactly one unicode extension, callable in `const`
165     /// context.
166     #[inline]
from_unicode(unicode: Unicode) -> Self167     pub const fn from_unicode(unicode: Unicode) -> Self {
168         Self {
169             unicode,
170             transform: Transform::new(),
171             private: Private::new(),
172             #[cfg(feature = "alloc")]
173             other: Vec::new(),
174             #[cfg(not(feature = "alloc"))]
175             other: &[],
176         }
177     }
178 
179     /// Returns whether there are no extensions present.
180     ///
181     /// # Examples
182     ///
183     /// ```
184     /// use icu::locale::Locale;
185     ///
186     /// let loc: Locale = "en-US-u-foo".parse().expect("Parsing failed.");
187     ///
188     /// assert!(!loc.extensions.is_empty());
189     /// ```
is_empty(&self) -> bool190     pub fn is_empty(&self) -> bool {
191         self.unicode.is_empty()
192             && self.transform.is_empty()
193             && self.private.is_empty()
194             && self.other.is_empty()
195     }
196 
197     #[allow(clippy::type_complexity)]
as_tuple( &self, ) -> ( (&unicode::Attributes, &unicode::Keywords), ( Option<( subtags::Language, Option<subtags::Script>, Option<subtags::Region>, &subtags::Variants, )>, &transform::Fields, ), &private::Private, &[other::Other], )198     pub(crate) fn as_tuple(
199         &self,
200     ) -> (
201         (&unicode::Attributes, &unicode::Keywords),
202         (
203             Option<(
204                 subtags::Language,
205                 Option<subtags::Script>,
206                 Option<subtags::Region>,
207                 &subtags::Variants,
208             )>,
209             &transform::Fields,
210         ),
211         &private::Private,
212         &[other::Other],
213     ) {
214         (
215             self.unicode.as_tuple(),
216             self.transform.as_tuple(),
217             &self.private,
218             &self.other,
219         )
220     }
221 
222     /// Returns an ordering suitable for use in [`BTreeSet`].
223     ///
224     /// The ordering may or may not be equivalent to string ordering, and it
225     /// may or may not be stable across ICU4X releases.
226     ///
227     /// [`BTreeSet`]: alloc::collections::BTreeSet
total_cmp(&self, other: &Self) -> Ordering228     pub fn total_cmp(&self, other: &Self) -> Ordering {
229         self.as_tuple().cmp(&other.as_tuple())
230     }
231 
232     /// Retains the specified extension types, clearing all others.
233     ///
234     /// # Examples
235     ///
236     /// ```
237     /// use icu::locale::extensions::ExtensionType;
238     /// use icu::locale::Locale;
239     ///
240     /// let loc: Locale =
241     ///     "und-a-hello-t-mul-u-world-z-zzz-x-extra".parse().unwrap();
242     ///
243     /// let mut only_unicode = loc.clone();
244     /// only_unicode
245     ///     .extensions
246     ///     .retain_by_type(|t| t == ExtensionType::Unicode);
247     /// assert_eq!(only_unicode, "und-u-world".parse().unwrap());
248     ///
249     /// let mut only_t_z = loc.clone();
250     /// only_t_z.extensions.retain_by_type(|t| {
251     ///     t == ExtensionType::Transform || t == ExtensionType::Other(b'z')
252     /// });
253     /// assert_eq!(only_t_z, "und-t-mul-z-zzz".parse().unwrap());
254     /// ```
retain_by_type<F>(&mut self, mut predicate: F) where F: FnMut(ExtensionType) -> bool,255     pub fn retain_by_type<F>(&mut self, mut predicate: F)
256     where
257         F: FnMut(ExtensionType) -> bool,
258     {
259         if !predicate(ExtensionType::Unicode) {
260             self.unicode.clear();
261         }
262         if !predicate(ExtensionType::Transform) {
263             self.transform.clear();
264         }
265         if !predicate(ExtensionType::Private) {
266             self.private.clear();
267         }
268         #[cfg(feature = "alloc")]
269         self.other
270             .retain(|o| predicate(ExtensionType::Other(o.get_ext_byte())));
271     }
272 
273     #[cfg(feature = "alloc")]
try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError>274     pub(crate) fn try_from_iter(iter: &mut SubtagIterator) -> Result<Self, ParseError> {
275         let mut unicode = None;
276         let mut transform = None;
277         let mut private = None;
278         let mut other = Vec::new();
279 
280         while let Some(subtag) = iter.next() {
281             if subtag.is_empty() {
282                 return Err(ParseError::InvalidExtension);
283             }
284 
285             let &[subtag] = subtag else {
286                 return Err(ParseError::InvalidExtension);
287             };
288 
289             match ExtensionType::try_from_byte(subtag) {
290                 Ok(ExtensionType::Unicode) => {
291                     if unicode.is_some() {
292                         return Err(ParseError::DuplicatedExtension);
293                     }
294                     unicode = Some(Unicode::try_from_iter(iter)?);
295                 }
296                 Ok(ExtensionType::Transform) => {
297                     if transform.is_some() {
298                         return Err(ParseError::DuplicatedExtension);
299                     }
300                     transform = Some(Transform::try_from_iter(iter)?);
301                 }
302                 Ok(ExtensionType::Private) => {
303                     if private.is_some() {
304                         return Err(ParseError::DuplicatedExtension);
305                     }
306                     private = Some(Private::try_from_iter(iter)?);
307                 }
308                 Ok(ExtensionType::Other(ext)) => {
309                     if other.iter().any(|o: &Other| o.get_ext_byte() == ext) {
310                         return Err(ParseError::DuplicatedExtension);
311                     }
312                     let parsed = Other::try_from_iter(ext, iter)?;
313                     if let Err(idx) = other.binary_search(&parsed) {
314                         other.insert(idx, parsed);
315                     } else {
316                         return Err(ParseError::InvalidExtension);
317                     }
318                 }
319                 _ => return Err(ParseError::InvalidExtension),
320             }
321         }
322 
323         Ok(Self {
324             unicode: unicode.unwrap_or_default(),
325             transform: transform.unwrap_or_default(),
326             private: private.unwrap_or_default(),
327             other,
328         })
329     }
330 
for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E> where F: FnMut(&str) -> Result<(), E>,331     pub(crate) fn for_each_subtag_str<E, F>(&self, f: &mut F) -> Result<(), E>
332     where
333         F: FnMut(&str) -> Result<(), E>,
334     {
335         let mut wrote_tu = false;
336         // Alphabetic by singleton
337         self.other.iter().try_for_each(|other| {
338             if other.get_ext() > TRANSFORM_EXT_CHAR && !wrote_tu {
339                 // Since 't' and 'u' are next to each other in alphabetical
340                 // order, write both now.
341                 self.transform.for_each_subtag_str(f, true)?;
342                 self.unicode.for_each_subtag_str(f, true)?;
343                 wrote_tu = true;
344             }
345             other.for_each_subtag_str(f, true)?;
346             Ok(())
347         })?;
348 
349         if !wrote_tu {
350             self.transform.for_each_subtag_str(f, true)?;
351             self.unicode.for_each_subtag_str(f, true)?;
352         }
353 
354         // Private must be written last, since it allows single character
355         // keys. Extensions must also be written in alphabetical order,
356         // which would seem to imply that other extensions `y` and `z` are
357         // invalid, but this is not specified.
358         self.private.for_each_subtag_str(f, true)?;
359         Ok(())
360     }
361 }
362 
// NOTE(review): the macro is defined elsewhere in the crate; presumably it
// generates the `Writeable` implementation for `Extensions` on top of
// `Extensions::for_each_subtag_str` (the `_no_test` suffix suggests it skips
// an auto-generated test) - confirm against the macro definition.
// Only compiled when the `alloc` feature is enabled.
#[cfg(feature = "alloc")]
impl_writeable_for_each_subtag_str_no_test!(Extensions);
365 
/// Checks that the extensions of a parsed locale are written back in the
/// expected canonical form.
#[test]
fn test_writeable() {
    use crate::Locale;
    use writeable::assert_writeable_eq;

    // An empty set of extensions writes nothing.
    assert_writeable_eq!(Extensions::new(), "");

    // (locale to parse, expected serialization of its extensions)
    let cases: [(&str, &str); 5] = [
        ("my-t-my-d0-zawgyi", "t-my-d0-zawgyi"),
        ("ar-SA-u-ca-islamic-civil", "u-ca-islamic-civil"),
        ("en-001-x-foo-bar", "x-foo-bar"),
        ("und-t-m0-true", "t-m0-true"),
        (
            "und-a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
            "a-foo-t-foo-u-foo-w-foo-z-foo-x-foo",
        ),
    ];

    for &(locale, expected) in &cases {
        let extensions = locale.parse::<Locale>().unwrap().extensions;
        assert_writeable_eq!(extensions, expected);
    }
}
398