lib.rs - OpenGrok cross reference for /external/rust/crates/cesu8/src/lib.rs

Lines Matching +full:utf +full:- +full:8
1 // Copyright 2012-2014 The Rust Project Developers and Eric Kidd.  See the
2 // COPYRIGHT-RUST.txt file at the top-level directory of this distribution.
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
11 //! A simple library implementing the [CESU-8 compatibility encoding
12 //! scheme](http://www.unicode.org/reports/tr26/tr26-2.html).  This is a
13 //! non-standard variant of UTF-8 that is used internally by some systems
14 //! that need to represent UTF-16 data as 8-bit characters.  Yes, this is
25 //! // 16-bit Unicode characters are the same in UTF-8 and CESU-8.
31 //! // This string is CESU-8 data containing a 6-byte surrogate pair,
32 //! // which decodes to a 4-byte UTF-8 string.
41 //! unexpected input.  CESU-8 is supposed to be an internal-only format,
46 //! If you have a use case for lossy conversion to UTF-8, or conversion
47 //! from mixed UTF-8/CESU-8 data, please feel free to submit a pull request
52 //! Java uses the CESU-8 encoding as described above, but with one
54 //! UTF-8 sequence `C0 80`. This is supported by the `from_java_cesu8` and
57 //! ### Surrogate pairs and UTF-8
59 //! The UTF-16 encoding uses "surrogate pairs" to represent Unicode code
60 //! points in the range from U+10000 to U+10FFFF.  These are 16-bit numbers
64 //!   CESU-8, these become **1110**1101 **10**100000 **10**000000 to
71 //! Wikipedia [explains](http://en.wikipedia.org/wiki/UTF-16) the
72 //! code point to UTF-16 conversion process:
78 //! > * Split this into the high 10-bit value and the low 10-bit value:
103 /// The CESU-8 data could not be decoded as valid UTF-8 data.
108     fn description(&self) -> &str { "decoding error" }  in description()
109     fn cause(&self) -> Option<&Error> { None }  in cause()
113     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {  in fmt()
114         write!(f, "could not convert CESU-8 data to UTF-8")  in fmt()
121     /// Regular CESU-8, with '\0' represented by itself.
123     /// This is technically Java's "Modified UTF-8", which is supposedly
124     /// like CESU-8, except that it UTF-8 encodes the '\0' byte.  I'm sure
129 /// Convert CESU-8 data to a Rust string, re-encoding only if necessary.
130 /// Returns an error if the data cannot be represented as valid UTF-8.
136 /// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
141 /// // This string is CESU-8 data containing a 6-byte surrogate pair,
142 /// // which becomes a 4-byte UTF-8 string.
147 pub fn from_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {  in from_cesu8()
151 /// Convert Java's modified UTF-8 data to a Rust string, re-encoding only if
153 /// UTF-8.
159 /// // This string is valid as UTF-8 or modified UTF-8, so it doesn't change,
164 /// // This string is modified UTF-8 data containing a 6-byte surrogate pair,
165 /// // which becomes a 4-byte UTF-8 string.
170 /// // This string is modified UTF-8 data containing null code-points.
175 pub fn from_java_cesu8(bytes: &[u8]) -> Result<Cow<str>, Cesu8DecodingError> {  in from_java_cesu8()
180 fn from_cesu8_internal(bytes: &[u8], variant: Variant) ->  in from_cesu8_internal()
190                 // all UTF-8 strings are valid.  in from_cesu8_internal()
202     // The surrogate-encoded character below is from the ICU library's  in test_from_cesu8()
208     // We used to have test data from the CESU-8 specification, but when we  in test_from_cesu8()
214     // 0b1101_101110_000000 -> 0xDB80  in test_from_cesu8()
215     // 0b1101_110000_000000 -> 0xDC00  in test_from_cesu8()
217     // ((0xDB80 - 0xD800) << 10) | (0xDC00 - 0xDC00) -> 0xE0000  in test_from_cesu8()
218     // 0x10000 + 0xE0000 -> 0xF0000  in test_from_cesu8()
228     -> bool  in decode_from_iter()
260             // Java's modified UTF-8 should never contain \0 directly.  in decode_from_iter()
274                 // Two-byte sequences can be used directly.  in decode_from_iter()
279                         // These are valid UTF-8, so pass them through.  in decode_from_iter()
305 /// Convert the two trailing bytes from a CESU-8 surrogate to a regular
307 fn dec_surrogate(second: u8, third: u8) -> u32 {  in dec_surrogate()
311 /// Convert the bytes from a CESU-8 surrogate pair into a valid UTF-8
313 fn dec_surrogates(second: u8, third: u8, fifth: u8, sixth: u8) -> [u8; 4] {  in dec_surrogates()
314     // Convert to a 32-bit code point.
317     let c = 0x10000 + (((s1 - 0xD800) << 10) | (s2 - 0xDC00));
318     //println!("{:0>8b} {:0>8b} {:0>8b} -> {:0>16b}", 0xEDu8, second, third, s1);
319     //println!("{:0>8b} {:0>8b} {:0>8b} -> {:0>16b}", 0xEDu8, fifth, sixth, s2);
320     //println!("-> {:0>32b}", c);
323     // Convert to UTF-8.
331 /// Convert a Rust `&str` to CESU-8 bytes.
337 /// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
341 /// // This string is a 4-byte UTF-8 string, which becomes a 6-byte CESU-8
346 pub fn to_cesu8(text: &str) -> Cow<[u8]> {  in to_cesu8()
354 /// Convert a Rust `&str` to Java's modified UTF-8 bytes.
360 /// // This string is valid as UTF-8 or CESU-8, so it doesn't change,
364 /// // This string is a 4-byte UTF-8 string, which becomes a 6-byte modified
365 /// // UTF-8 vector.
369 /// // This string contains null, which becomes 2-byte modified UTF-8 encoding
373 pub fn to_java_cesu8(text: &str) -> Cow<[u8]> {  in to_java_cesu8()
381 fn to_cesu8_internal(text: &str, variant: Variant) -> Vec<u8> {  in to_cesu8_internal()
401                 // Pass through short UTF-8 sequences unmodified.  in to_cesu8_internal()
404                 // Encode 4-byte sequences as 6 bytes.  in to_cesu8_internal()
406                 let c = s.chars().next().unwrap() as u32 - 0x10000;  in to_cesu8_internal()
419 /// Check whether a Rust string contains valid CESU-8 data.
420 pub fn is_valid_cesu8(text: &str) -> bool {  in is_valid_cesu8()
422     // UTF-8.  in is_valid_cesu8()
430 /// Check whether a Rust string contains valid Java modified UTF-8 data.
431 pub fn is_valid_java_cesu8(text: &str) -> bool {  in is_valid_java_cesu8()
446 /// Encode a single surrogate as CESU-8.
447 fn enc_surrogate(surrogate: u16) -> [u8; 3] {  in enc_surrogate()