• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 //! Unicode character composition and decomposition utilities
12 //! as described in
13 //! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
14 //!
15 //! ```rust
16 //! extern crate unicode_normalization;
17 //!
18 //! use unicode_normalization::char::compose;
19 //! use unicode_normalization::UnicodeNormalization;
20 //!
21 //! fn main() {
22 //!     assert_eq!(compose('A','\u{30a}'), Some('Å'));
23 //!
24 //!     let s = "ÅΩ";
25 //!     let c = s.nfc().collect::<String>();
26 //!     assert_eq!(c, "ÅΩ");
27 //! }
28 //! ```
29 //!
30 //! # crates.io
31 //!
32 //! You can use this package in your project by adding the following
33 //! to your `Cargo.toml`:
34 //!
35 //! ```toml
36 //! [dependencies]
37 //! unicode-normalization = "0.1.20"
38 //! ```
39 
40 #![deny(missing_docs, unsafe_code)]
41 #![doc(
42     html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png",
43     html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png"
44 )]
45 #![cfg_attr(not(feature = "std"), no_std)]
46 
47 #[cfg(not(feature = "std"))]
48 extern crate alloc;
49 
50 #[cfg(feature = "std")]
51 extern crate core;
52 
53 extern crate tinyvec;
54 
55 pub use crate::decompose::Decompositions;
56 pub use crate::quick_check::{
57     is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick,
58     is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick,
59     IsNormalized,
60 };
61 pub use crate::recompose::Recompositions;
62 pub use crate::replace::Replacements;
63 pub use crate::stream_safe::StreamSafe;
64 pub use crate::tables::UNICODE_VERSION;
65 use core::{
66     str::Chars,
67     option,
68 };
69 
70 mod no_std_prelude;
71 
72 mod decompose;
73 mod lookups;
74 mod normalize;
75 mod perfect_hash;
76 mod quick_check;
77 mod recompose;
78 mod replace;
79 mod stream_safe;
80 
81 #[rustfmt::skip]
82 mod tables;
83 
84 #[doc(hidden)]
85 pub mod __test_api;
86 #[cfg(test)]
87 mod test;
88 
89 /// Methods for composing and decomposing characters.
90 pub mod char {
91     pub use crate::normalize::{
92         compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible,
93     };
94 
95     pub use crate::lookups::{canonical_combining_class, is_combining_mark};
96 
97     /// Return whether the given character is assigned (`General_Category` != `Unassigned`)
98     /// and not Private-Use (`General_Category` != `Private_Use`), in the supported version
99     /// of Unicode.
100     pub use crate::tables::is_public_assigned;
101 }
102 
103 /// Methods for iterating over strings while applying Unicode normalizations
104 /// as described in
105 /// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/).
106 pub trait UnicodeNormalization<I: Iterator<Item = char>> {
107     /// Returns an iterator over the string in Unicode Normalization Form D
108     /// (canonical decomposition).
nfd(self) -> Decompositions<I>109     fn nfd(self) -> Decompositions<I>;
110 
111     /// Returns an iterator over the string in Unicode Normalization Form KD
112     /// (compatibility decomposition).
nfkd(self) -> Decompositions<I>113     fn nfkd(self) -> Decompositions<I>;
114 
115     /// An Iterator over the string in Unicode Normalization Form C
116     /// (canonical decomposition followed by canonical composition).
nfc(self) -> Recompositions<I>117     fn nfc(self) -> Recompositions<I>;
118 
119     /// An Iterator over the string in Unicode Normalization Form KC
120     /// (compatibility decomposition followed by canonical composition).
nfkc(self) -> Recompositions<I>121     fn nfkc(self) -> Recompositions<I>;
122 
123     /// A transformation which replaces CJK Compatibility Ideograph codepoints
124     /// with normal forms using Standardized Variation Sequences. This is not
125     /// part of the canonical or compatibility decomposition algorithms, but
126     /// performing it before those algorithms produces normalized output which
127     /// better preserves the intent of the original text.
128     ///
129     /// Note that many systems today ignore variation selectors, so these
130     /// may not immediately help text display as intended, but they at
131     /// least preserve the information in a standardized form, giving
132     /// implementations the option to recognize them.
cjk_compat_variants(self) -> Replacements<I>133     fn cjk_compat_variants(self) -> Replacements<I>;
134 
135     /// An Iterator over the string with Conjoining Grapheme Joiner characters
136     /// inserted according to the Stream-Safe Text Process (UAX15-D4)
stream_safe(self) -> StreamSafe<I>137     fn stream_safe(self) -> StreamSafe<I>;
138 }
139 
140 impl<'a> UnicodeNormalization<Chars<'a>> for &'a str {
141     #[inline]
nfd(self) -> Decompositions<Chars<'a>>142     fn nfd(self) -> Decompositions<Chars<'a>> {
143         decompose::new_canonical(self.chars())
144     }
145 
146     #[inline]
nfkd(self) -> Decompositions<Chars<'a>>147     fn nfkd(self) -> Decompositions<Chars<'a>> {
148         decompose::new_compatible(self.chars())
149     }
150 
151     #[inline]
nfc(self) -> Recompositions<Chars<'a>>152     fn nfc(self) -> Recompositions<Chars<'a>> {
153         recompose::new_canonical(self.chars())
154     }
155 
156     #[inline]
nfkc(self) -> Recompositions<Chars<'a>>157     fn nfkc(self) -> Recompositions<Chars<'a>> {
158         recompose::new_compatible(self.chars())
159     }
160 
161     #[inline]
cjk_compat_variants(self) -> Replacements<Chars<'a>>162     fn cjk_compat_variants(self) -> Replacements<Chars<'a>> {
163         replace::new_cjk_compat_variants(self.chars())
164     }
165 
166     #[inline]
stream_safe(self) -> StreamSafe<Chars<'a>>167     fn stream_safe(self) -> StreamSafe<Chars<'a>> {
168         StreamSafe::new(self.chars())
169     }
170 }
171 
172 
173 impl UnicodeNormalization<option::IntoIter<char>> for char {
174     #[inline]
nfd(self) -> Decompositions<option::IntoIter<char>>175     fn nfd(self) -> Decompositions<option::IntoIter<char>> {
176         decompose::new_canonical(Some(self).into_iter())
177     }
178 
179     #[inline]
nfkd(self) -> Decompositions<option::IntoIter<char>>180     fn nfkd(self) -> Decompositions<option::IntoIter<char>> {
181         decompose::new_compatible(Some(self).into_iter())
182     }
183 
184     #[inline]
nfc(self) -> Recompositions<option::IntoIter<char>>185     fn nfc(self) -> Recompositions<option::IntoIter<char>> {
186         recompose::new_canonical(Some(self).into_iter())
187     }
188 
189     #[inline]
nfkc(self) -> Recompositions<option::IntoIter<char>>190     fn nfkc(self) -> Recompositions<option::IntoIter<char>> {
191         recompose::new_compatible(Some(self).into_iter())
192     }
193 
194     #[inline]
cjk_compat_variants(self) -> Replacements<option::IntoIter<char>>195     fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> {
196         replace::new_cjk_compat_variants(Some(self).into_iter())
197     }
198 
199     #[inline]
stream_safe(self) -> StreamSafe<option::IntoIter<char>>200     fn stream_safe(self) -> StreamSafe<option::IntoIter<char>> {
201         StreamSafe::new(Some(self).into_iter())
202     }
203 }
204 
205 impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I {
206     #[inline]
nfd(self) -> Decompositions<I>207     fn nfd(self) -> Decompositions<I> {
208         decompose::new_canonical(self)
209     }
210 
211     #[inline]
nfkd(self) -> Decompositions<I>212     fn nfkd(self) -> Decompositions<I> {
213         decompose::new_compatible(self)
214     }
215 
216     #[inline]
nfc(self) -> Recompositions<I>217     fn nfc(self) -> Recompositions<I> {
218         recompose::new_canonical(self)
219     }
220 
221     #[inline]
nfkc(self) -> Recompositions<I>222     fn nfkc(self) -> Recompositions<I> {
223         recompose::new_compatible(self)
224     }
225 
226     #[inline]
cjk_compat_variants(self) -> Replacements<I>227     fn cjk_compat_variants(self) -> Replacements<I> {
228         replace::new_cjk_compat_variants(self)
229     }
230 
231     #[inline]
stream_safe(self) -> StreamSafe<I>232     fn stream_safe(self) -> StreamSafe<I> {
233         StreamSafe::new(self)
234     }
235 }
236