1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 #[diplomat::bridge] 6 #[diplomat::abi_rename = "icu4x_{0}_mv1"] 7 #[diplomat::attr(auto, namespace = "icu4x")] 8 pub mod ffi { 9 use alloc::boxed::Box; 10 11 #[cfg(feature = "buffer_provider")] 12 use crate::provider::ffi::DataProvider; 13 #[cfg(any(feature = "compiled_data", feature = "buffer_provider"))] 14 use crate::{errors::ffi::DataError, locale_core::ffi::Locale}; 15 16 #[diplomat::opaque] 17 /// An ICU4X sentence-break segmenter, capable of finding sentence breakpoints in strings. 18 #[diplomat::rust_link(icu::segmenter::SentenceSegmenter, Struct)] 19 pub struct SentenceSegmenter(icu_segmenter::SentenceSegmenter); 20 21 #[diplomat::opaque] 22 #[diplomat::rust_link(icu::segmenter::SentenceBreakIterator, Struct)] 23 #[diplomat::rust_link( 24 icu::segmenter::SentenceBreakIteratorPotentiallyIllFormedUtf8, 25 Typedef, 26 hidden 27 )] 28 #[diplomat::rust_link(icu::segmenter::SentenceBreakIteratorUtf8, Typedef, hidden)] 29 pub struct SentenceBreakIteratorUtf8<'a>( 30 icu_segmenter::SentenceBreakIteratorPotentiallyIllFormedUtf8<'a, 'a>, 31 ); 32 33 #[diplomat::opaque] 34 #[diplomat::rust_link(icu::segmenter::SentenceBreakIterator, Struct)] 35 #[diplomat::rust_link(icu::segmenter::SentenceBreakIteratorUtf16, Typedef, hidden)] 36 pub struct SentenceBreakIteratorUtf16<'a>(icu_segmenter::SentenceBreakIteratorUtf16<'a, 'a>); 37 38 #[diplomat::opaque] 39 #[diplomat::rust_link(icu::segmenter::SentenceBreakIterator, Struct)] 40 #[diplomat::rust_link(icu::segmenter::SentenceBreakIteratorLatin1, Typedef, hidden)] 41 pub struct SentenceBreakIteratorLatin1<'a>(icu_segmenter::SentenceBreakIteratorLatin1<'a, 'a>); 42 43 impl SentenceSegmenter { 44 /// Construct a [`SentenceSegmenter`] using compiled data. This does not assume any content locale. 45 #[diplomat::rust_link(icu::segmenter::SentenceSegmenter::new, FnInStruct)] 46 #[diplomat::rust_link( 47 icu::segmenter::options::SentenceBreakInvariantOptions, 48 Struct, 49 hidden 50 )] 51 #[diplomat::attr(auto, constructor)] 52 #[cfg(feature = "compiled_data")] create() -> Box<SentenceSegmenter>53 pub fn create() -> Box<SentenceSegmenter> { 54 Box::new(SentenceSegmenter(icu_segmenter::SentenceSegmenter::new( 55 Default::default(), 56 ))) 57 } 58 /// Construct a [`SentenceSegmenter`] for content known to be of a given locale, using compiled data. 59 #[diplomat::rust_link(icu::segmenter::SentenceSegmenter::try_new, FnInStruct, hidden)] 60 #[diplomat::rust_link(icu::segmenter::options::SentenceBreakOptions, Struct, hidden)] 61 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "with_content_locale")] 62 #[cfg(feature = "compiled_data")] create_with_content_locale( locale: &Locale, ) -> Result<Box<SentenceSegmenter>, DataError>63 pub fn create_with_content_locale( 64 locale: &Locale, 65 ) -> Result<Box<SentenceSegmenter>, DataError> { 66 Ok(Box::new(SentenceSegmenter( 67 icu_segmenter::SentenceSegmenter::try_new(locale.into())?, 68 ))) 69 } 70 71 /// Construct a [`SentenceSegmenter`] for content known to be of a given locale, using a particular data source. 72 #[diplomat::rust_link(icu::segmenter::SentenceSegmenter::try_new, FnInStruct, hidden)] 73 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "with_content_locale_and_provider")] 74 #[cfg(feature = "buffer_provider")] create_with_content_locale_and_provider( provider: &DataProvider, locale: &Locale, ) -> Result<Box<SentenceSegmenter>, DataError>75 pub fn create_with_content_locale_and_provider( 76 provider: &DataProvider, 77 locale: &Locale, 78 ) -> Result<Box<SentenceSegmenter>, DataError> { 79 Ok(Box::new(SentenceSegmenter( 80 icu_segmenter::SentenceSegmenter::try_new_with_buffer_provider( 81 provider.get()?, 82 locale.into(), 83 )?, 84 ))) 85 } 86 87 /// Segments a string. 88 /// 89 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 90 /// to the WHATWG Encoding Standard. 91 #[diplomat::rust_link(icu::segmenter::SentenceSegmenter::segment_utf8, FnInStruct)] 92 #[diplomat::rust_link(icu::segmenter::SentenceSegmenter::segment_str, FnInStruct, hidden)] 93 #[diplomat::attr(not(supports = utf8_strings), disable)] 94 #[diplomat::attr(*, rename = "segment")] segment_utf8<'a>( &'a self, input: &'a DiplomatStr, ) -> Box<SentenceBreakIteratorUtf8<'a>>95 pub fn segment_utf8<'a>( 96 &'a self, 97 input: &'a DiplomatStr, 98 ) -> Box<SentenceBreakIteratorUtf8<'a>> { 99 Box::new(SentenceBreakIteratorUtf8(self.0.segment_utf8(input))) 100 } 101 102 /// Segments a string. 103 /// 104 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 105 /// to the WHATWG Encoding Standard. 106 #[diplomat::rust_link(icu::segmenter::SentenceSegmenter::segment_utf16, FnInStruct)] 107 #[diplomat::attr(not(supports = utf8_strings), rename = "segment")] 108 #[diplomat::attr(supports = utf8_strings, rename = "segment16")] segment_utf16<'a>( &'a self, input: &'a DiplomatStr16, ) -> Box<SentenceBreakIteratorUtf16<'a>>109 pub fn segment_utf16<'a>( 110 &'a self, 111 input: &'a DiplomatStr16, 112 ) -> Box<SentenceBreakIteratorUtf16<'a>> { 113 Box::new(SentenceBreakIteratorUtf16(self.0.segment_utf16(input))) 114 } 115 116 /// Segments a Latin-1 string. 117 #[diplomat::rust_link(icu::segmenter::SentenceSegmenter::segment_latin1, FnInStruct)] 118 #[diplomat::attr(not(supports = utf8_strings), disable)] segment_latin1<'a>( &'a self, input: &'a [u8], ) -> Box<SentenceBreakIteratorLatin1<'a>>119 pub fn segment_latin1<'a>( 120 &'a self, 121 input: &'a [u8], 122 ) -> Box<SentenceBreakIteratorLatin1<'a>> { 123 Box::new(SentenceBreakIteratorLatin1(self.0.segment_latin1(input))) 124 } 125 } 126 127 impl<'a> SentenceBreakIteratorUtf8<'a> { 128 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 129 /// out of range of a 32-bit signed integer. 130 #[diplomat::rust_link(icu::segmenter::SentenceBreakIterator::next, FnInStruct)] 131 #[diplomat::rust_link( 132 icu::segmenter::SentenceBreakIterator::Item, 133 AssociatedTypeInStruct, 134 hidden 135 )] next(&mut self) -> i32136 pub fn next(&mut self) -> i32 { 137 self.0 138 .next() 139 .and_then(|u| i32::try_from(u).ok()) 140 .unwrap_or(-1) 141 } 142 } 143 144 impl<'a> SentenceBreakIteratorUtf16<'a> { 145 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 146 /// out of range of a 32-bit signed integer. 147 #[diplomat::rust_link(icu::segmenter::SentenceBreakIterator::next, FnInStruct)] 148 #[diplomat::rust_link( 149 icu::segmenter::SentenceBreakIterator::Item, 150 AssociatedTypeInStruct, 151 hidden 152 )] next(&mut self) -> i32153 pub fn next(&mut self) -> i32 { 154 self.0 155 .next() 156 .and_then(|u| i32::try_from(u).ok()) 157 .unwrap_or(-1) 158 } 159 } 160 161 impl<'a> SentenceBreakIteratorLatin1<'a> { 162 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 163 /// out of range of a 32-bit signed integer. 164 #[diplomat::rust_link(icu::segmenter::SentenceBreakIterator::next, FnInStruct)] 165 #[diplomat::rust_link( 166 icu::segmenter::SentenceBreakIterator::Item, 167 AssociatedTypeInStruct, 168 hidden 169 )] next(&mut self) -> i32170 pub fn next(&mut self) -> i32 { 171 self.0 172 .next() 173 .and_then(|u| i32::try_from(u).ok()) 174 .unwrap_or(-1) 175 } 176 } 177 } 178 179 impl<'a> From<&'a crate::locale_core::ffi::Locale> 180 for icu_segmenter::options::SentenceBreakOptions<'a> 181 { from(other: &'a crate::locale_core::ffi::Locale) -> Self182 fn from(other: &'a crate::locale_core::ffi::Locale) -> Self { 183 let mut options = icu_segmenter::options::SentenceBreakOptions::default(); 184 options.content_locale = Some(&other.0.id); 185 options 186 } 187 } 188