1 // Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT 2 // file at the top-level directory of this distribution and at 3 // http://rust-lang.org/COPYRIGHT. 4 // 5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 8 // option. This file may not be copied, modified, or distributed 9 // except according to those terms. 10 11 //! Unicode character composition and decomposition utilities 12 //! as described in 13 //! [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/). 14 //! 15 //! ```rust 16 //! extern crate unicode_normalization; 17 //! 18 //! use unicode_normalization::char::compose; 19 //! use unicode_normalization::UnicodeNormalization; 20 //! 21 //! fn main() { 22 //! assert_eq!(compose('A','\u{30a}'), Some('Å')); 23 //! 24 //! let s = "ÅΩ"; 25 //! let c = s.nfc().collect::<String>(); 26 //! assert_eq!(c, "ÅΩ"); 27 //! } 28 //! ``` 29 //! 30 //! # crates.io 31 //! 32 //! You can use this package in your project by adding the following 33 //! to your `Cargo.toml`: 34 //! 35 //! ```toml 36 //! [dependencies] 37 //! unicode-normalization = "0.1.20" 38 //! ``` 39 40 #![deny(missing_docs, unsafe_code)] 41 #![doc( 42 html_logo_url = "https://unicode-rs.github.io/unicode-rs_sm.png", 43 html_favicon_url = "https://unicode-rs.github.io/unicode-rs_sm.png" 44 )] 45 #![cfg_attr(not(feature = "std"), no_std)] 46 47 #[cfg(not(feature = "std"))] 48 extern crate alloc; 49 50 #[cfg(feature = "std")] 51 extern crate core; 52 53 extern crate tinyvec; 54 55 pub use crate::decompose::Decompositions; 56 pub use crate::quick_check::{ 57 is_nfc, is_nfc_quick, is_nfc_stream_safe, is_nfc_stream_safe_quick, is_nfd, is_nfd_quick, 58 is_nfd_stream_safe, is_nfd_stream_safe_quick, is_nfkc, is_nfkc_quick, is_nfkd, is_nfkd_quick, 59 IsNormalized, 60 }; 61 pub use crate::recompose::Recompositions; 62 pub use crate::replace::Replacements; 63 pub use crate::stream_safe::StreamSafe; 64 pub use crate::tables::UNICODE_VERSION; 65 use core::{ 66 str::Chars, 67 option, 68 }; 69 70 mod no_std_prelude; 71 72 mod decompose; 73 mod lookups; 74 mod normalize; 75 mod perfect_hash; 76 mod quick_check; 77 mod recompose; 78 mod replace; 79 mod stream_safe; 80 81 #[rustfmt::skip] 82 mod tables; 83 84 #[doc(hidden)] 85 pub mod __test_api; 86 #[cfg(test)] 87 mod test; 88 89 /// Methods for composing and decomposing characters. 90 pub mod char { 91 pub use crate::normalize::{ 92 compose, decompose_canonical, decompose_cjk_compat_variants, decompose_compatible, 93 }; 94 95 pub use crate::lookups::{canonical_combining_class, is_combining_mark}; 96 97 /// Return whether the given character is assigned (`General_Category` != `Unassigned`) 98 /// and not Private-Use (`General_Category` != `Private_Use`), in the supported version 99 /// of Unicode. 100 pub use crate::tables::is_public_assigned; 101 } 102 103 /// Methods for iterating over strings while applying Unicode normalizations 104 /// as described in 105 /// [Unicode Standard Annex #15](http://www.unicode.org/reports/tr15/). 106 pub trait UnicodeNormalization<I: Iterator<Item = char>> { 107 /// Returns an iterator over the string in Unicode Normalization Form D 108 /// (canonical decomposition). nfd(self) -> Decompositions<I>109 fn nfd(self) -> Decompositions<I>; 110 111 /// Returns an iterator over the string in Unicode Normalization Form KD 112 /// (compatibility decomposition). nfkd(self) -> Decompositions<I>113 fn nfkd(self) -> Decompositions<I>; 114 115 /// An Iterator over the string in Unicode Normalization Form C 116 /// (canonical decomposition followed by canonical composition). nfc(self) -> Recompositions<I>117 fn nfc(self) -> Recompositions<I>; 118 119 /// An Iterator over the string in Unicode Normalization Form KC 120 /// (compatibility decomposition followed by canonical composition). nfkc(self) -> Recompositions<I>121 fn nfkc(self) -> Recompositions<I>; 122 123 /// A transformation which replaces CJK Compatibility Ideograph codepoints 124 /// with normal forms using Standardized Variation Sequences. This is not 125 /// part of the canonical or compatibility decomposition algorithms, but 126 /// performing it before those algorithms produces normalized output which 127 /// better preserves the intent of the original text. 128 /// 129 /// Note that many systems today ignore variation selectors, so these 130 /// may not immediately help text display as intended, but they at 131 /// least preserve the information in a standardized form, giving 132 /// implementations the option to recognize them. cjk_compat_variants(self) -> Replacements<I>133 fn cjk_compat_variants(self) -> Replacements<I>; 134 135 /// An Iterator over the string with Conjoining Grapheme Joiner characters 136 /// inserted according to the Stream-Safe Text Process (UAX15-D4) stream_safe(self) -> StreamSafe<I>137 fn stream_safe(self) -> StreamSafe<I>; 138 } 139 140 impl<'a> UnicodeNormalization<Chars<'a>> for &'a str { 141 #[inline] nfd(self) -> Decompositions<Chars<'a>>142 fn nfd(self) -> Decompositions<Chars<'a>> { 143 decompose::new_canonical(self.chars()) 144 } 145 146 #[inline] nfkd(self) -> Decompositions<Chars<'a>>147 fn nfkd(self) -> Decompositions<Chars<'a>> { 148 decompose::new_compatible(self.chars()) 149 } 150 151 #[inline] nfc(self) -> Recompositions<Chars<'a>>152 fn nfc(self) -> Recompositions<Chars<'a>> { 153 recompose::new_canonical(self.chars()) 154 } 155 156 #[inline] nfkc(self) -> Recompositions<Chars<'a>>157 fn nfkc(self) -> Recompositions<Chars<'a>> { 158 recompose::new_compatible(self.chars()) 159 } 160 161 #[inline] cjk_compat_variants(self) -> Replacements<Chars<'a>>162 fn cjk_compat_variants(self) -> Replacements<Chars<'a>> { 163 replace::new_cjk_compat_variants(self.chars()) 164 } 165 166 #[inline] stream_safe(self) -> StreamSafe<Chars<'a>>167 fn stream_safe(self) -> StreamSafe<Chars<'a>> { 168 StreamSafe::new(self.chars()) 169 } 170 } 171 172 173 impl UnicodeNormalization<option::IntoIter<char>> for char { 174 #[inline] nfd(self) -> Decompositions<option::IntoIter<char>>175 fn nfd(self) -> Decompositions<option::IntoIter<char>> { 176 decompose::new_canonical(Some(self).into_iter()) 177 } 178 179 #[inline] nfkd(self) -> Decompositions<option::IntoIter<char>>180 fn nfkd(self) -> Decompositions<option::IntoIter<char>> { 181 decompose::new_compatible(Some(self).into_iter()) 182 } 183 184 #[inline] nfc(self) -> Recompositions<option::IntoIter<char>>185 fn nfc(self) -> Recompositions<option::IntoIter<char>> { 186 recompose::new_canonical(Some(self).into_iter()) 187 } 188 189 #[inline] nfkc(self) -> Recompositions<option::IntoIter<char>>190 fn nfkc(self) -> Recompositions<option::IntoIter<char>> { 191 recompose::new_compatible(Some(self).into_iter()) 192 } 193 194 #[inline] cjk_compat_variants(self) -> Replacements<option::IntoIter<char>>195 fn cjk_compat_variants(self) -> Replacements<option::IntoIter<char>> { 196 replace::new_cjk_compat_variants(Some(self).into_iter()) 197 } 198 199 #[inline] stream_safe(self) -> StreamSafe<option::IntoIter<char>>200 fn stream_safe(self) -> StreamSafe<option::IntoIter<char>> { 201 StreamSafe::new(Some(self).into_iter()) 202 } 203 } 204 205 impl<I: Iterator<Item = char>> UnicodeNormalization<I> for I { 206 #[inline] nfd(self) -> Decompositions<I>207 fn nfd(self) -> Decompositions<I> { 208 decompose::new_canonical(self) 209 } 210 211 #[inline] nfkd(self) -> Decompositions<I>212 fn nfkd(self) -> Decompositions<I> { 213 decompose::new_compatible(self) 214 } 215 216 #[inline] nfc(self) -> Recompositions<I>217 fn nfc(self) -> Recompositions<I> { 218 recompose::new_canonical(self) 219 } 220 221 #[inline] nfkc(self) -> Recompositions<I>222 fn nfkc(self) -> Recompositions<I> { 223 recompose::new_compatible(self) 224 } 225 226 #[inline] cjk_compat_variants(self) -> Replacements<I>227 fn cjk_compat_variants(self) -> Replacements<I> { 228 replace::new_cjk_compat_variants(self) 229 } 230 231 #[inline] stream_safe(self) -> StreamSafe<I>232 fn stream_safe(self) -> StreamSafe<I> { 233 StreamSafe::new(self) 234 } 235 } 236