1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 #[diplomat::bridge] 6 #[diplomat::abi_rename = "icu4x_{0}_mv1"] 7 #[diplomat::attr(auto, namespace = "icu4x")] 8 pub mod ffi { 9 use alloc::boxed::Box; 10 11 #[cfg(feature = "buffer_provider")] 12 use crate::provider::ffi::DataProvider; 13 #[cfg(any(feature = "compiled_data", feature = "buffer_provider"))] 14 use crate::{errors::ffi::DataError, locale_core::ffi::Locale}; 15 16 #[diplomat::enum_convert(icu_segmenter::options::WordType, needs_wildcard)] 17 #[diplomat::rust_link(icu::segmenter::options::WordType, Enum)] 18 pub enum SegmenterWordType { 19 None = 0, 20 Number = 1, 21 Letter = 2, 22 } 23 24 #[diplomat::opaque] 25 /// An ICU4X word-break segmenter, capable of finding word breakpoints in strings. 26 #[diplomat::rust_link(icu::segmenter::WordSegmenter, Struct)] 27 #[diplomat::demo(custom_func = "../../npm/demo_gen_custom/WordSegmenter.mjs")] 28 pub struct WordSegmenter(icu_segmenter::WordSegmenter); 29 30 #[diplomat::opaque] 31 #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)] 32 #[diplomat::rust_link( 33 icu::segmenter::WordBreakIteratorPotentiallyIllFormedUtf8, 34 Typedef, 35 hidden 36 )] 37 #[diplomat::rust_link(icu::segmenter::WordBreakIteratorUtf8, Typedef, hidden)] 38 pub struct WordBreakIteratorUtf8<'a>( 39 icu_segmenter::WordBreakIteratorPotentiallyIllFormedUtf8<'a, 'a>, 40 ); 41 42 #[diplomat::opaque] 43 #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)] 44 #[diplomat::rust_link(icu::segmenter::WordBreakIteratorUtf16, Typedef, hidden)] 45 pub struct WordBreakIteratorUtf16<'a>(icu_segmenter::WordBreakIteratorUtf16<'a, 'a>); 46 47 #[diplomat::opaque] 48 #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)] 49 #[diplomat::rust_link(icu::segmenter::WordBreakIteratorLatin1, Typedef, hidden)] 50 pub struct WordBreakIteratorLatin1<'a>(icu_segmenter::WordBreakIteratorLatin1<'a, 'a>); 51 52 impl SegmenterWordType { 53 #[diplomat::rust_link(icu::segmenter::options::WordType::is_word_like, FnInEnum)] 54 #[diplomat::attr(auto, getter)] is_word_like(self) -> bool55 pub fn is_word_like(self) -> bool { 56 icu_segmenter::options::WordType::from(self).is_word_like() 57 } 58 } 59 60 impl WordSegmenter { 61 /// Construct an [`WordSegmenter`] with automatically selecting the best available LSTM 62 /// or dictionary payload data, using compiled data. This does not assume any content locale. 63 /// 64 /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese, 65 /// Khmer, Lao, and Thai. 66 #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_auto, FnInStruct)] 67 #[diplomat::rust_link(icu::segmenter::options::WordBreakInvariantOptions, Struct, hidden)] 68 #[diplomat::attr(auto, named_constructor = "auto")] 69 #[cfg(feature = "compiled_data")] create_auto() -> Box<WordSegmenter>70 pub fn create_auto() -> Box<WordSegmenter> { 71 Box::new(WordSegmenter(icu_segmenter::WordSegmenter::new_auto( 72 Default::default(), 73 ))) 74 } 75 76 /// Construct an [`WordSegmenter`] with automatically selecting the best available LSTM 77 /// or dictionary payload data, using compiled data. 78 /// 79 /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese, 80 /// Khmer, Lao, and Thai. 81 #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_auto, FnInStruct)] 82 #[diplomat::rust_link(icu::segmenter::options::WordBreakOptions, Struct, hidden)] 83 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_content_locale")] 84 #[cfg(feature = "compiled_data")] create_auto_with_content_locale( locale: &Locale, ) -> Result<Box<WordSegmenter>, DataError>85 pub fn create_auto_with_content_locale( 86 locale: &Locale, 87 ) -> Result<Box<WordSegmenter>, DataError> { 88 Ok(Box::new(WordSegmenter( 89 icu_segmenter::WordSegmenter::try_new_auto(locale.into())?, 90 ))) 91 } 92 93 /// Construct an [`WordSegmenter`] with automatically selecting the best available LSTM 94 /// or dictionary payload data, using a particular data source. 95 /// 96 /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese, 97 /// Khmer, Lao, and Thai. 98 #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_auto, FnInStruct)] 99 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_content_locale_and_provider")] 100 #[cfg(feature = "buffer_provider")] create_auto_with_content_locale_and_provider( provider: &DataProvider, locale: &Locale, ) -> Result<Box<WordSegmenter>, DataError>101 pub fn create_auto_with_content_locale_and_provider( 102 provider: &DataProvider, 103 locale: &Locale, 104 ) -> Result<Box<WordSegmenter>, DataError> { 105 Ok(Box::new(WordSegmenter( 106 icu_segmenter::WordSegmenter::try_new_auto_with_buffer_provider( 107 provider.get()?, 108 locale.into(), 109 )?, 110 ))) 111 } 112 113 /// Construct an [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and 114 /// Thai, using compiled data. This does not assume any content locale. 115 /// 116 /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese, 117 /// Khmer, Lao, and Thai. 118 #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_lstm, FnInStruct)] 119 #[diplomat::attr(auto, named_constructor = "lstm")] 120 #[cfg(feature = "compiled_data")] create_lstm() -> Box<WordSegmenter>121 pub fn create_lstm() -> Box<WordSegmenter> { 122 Box::new(WordSegmenter(icu_segmenter::WordSegmenter::new_lstm( 123 Default::default(), 124 ))) 125 } 126 127 /// Construct an [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and 128 /// Thai, using compiled data. 129 /// 130 /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese, 131 /// Khmer, Lao, and Thai. 132 #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_lstm, FnInStruct)] 133 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_content_locale")] 134 #[cfg(feature = "compiled_data")] create_lstm_with_content_locale( locale: &Locale, ) -> Result<Box<WordSegmenter>, DataError>135 pub fn create_lstm_with_content_locale( 136 locale: &Locale, 137 ) -> Result<Box<WordSegmenter>, DataError> { 138 Ok(Box::new(WordSegmenter( 139 icu_segmenter::WordSegmenter::try_new_lstm(locale.into())?, 140 ))) 141 } 142 143 /// Construct an [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and 144 /// Thai, using a particular data source. 145 /// 146 /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese, 147 /// Khmer, Lao, and Thai. 148 #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_lstm, FnInStruct)] 149 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_content_locale_and_provider")] 150 #[cfg(feature = "buffer_provider")] create_lstm_with_content_locale_and_provider( provider: &DataProvider, locale: &Locale, ) -> Result<Box<WordSegmenter>, DataError>151 pub fn create_lstm_with_content_locale_and_provider( 152 provider: &DataProvider, 153 locale: &Locale, 154 ) -> Result<Box<WordSegmenter>, DataError> { 155 Ok(Box::new(WordSegmenter( 156 icu_segmenter::WordSegmenter::try_new_lstm_with_buffer_provider( 157 provider.get()?, 158 locale.into(), 159 )?, 160 ))) 161 } 162 163 /// Construct an [`WordSegmenter`] with with dictionary payload data for Chinese, Japanese, 164 /// Burmese, Khmer, Lao, and Thai, using compiled data. This does not assume any content locale. 165 /// 166 /// Note: currently, it uses dictionary for Chinese and Japanese, and dictionary for Burmese, 167 /// Khmer, Lao, and Thai. 168 #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_dictionary, FnInStruct)] 169 #[diplomat::attr(auto, named_constructor = "dictionary")] 170 #[cfg(feature = "compiled_data")] create_dictionary() -> Box<WordSegmenter>171 pub fn create_dictionary() -> Box<WordSegmenter> { 172 Box::new(WordSegmenter(icu_segmenter::WordSegmenter::new_dictionary( 173 Default::default(), 174 ))) 175 } 176 177 /// Construct an [`WordSegmenter`] with dictionary payload data for Chinese, Japanese, 178 /// Burmese, Khmer, Lao, and Thai, using compiled data. 179 /// 180 /// Note: currently, it uses dictionary for Chinese and Japanese, and dictionary for Burmese, 181 /// Khmer, Lao, and Thai. 182 #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_dictionary, FnInStruct)] 183 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_content_locale")] 184 #[cfg(feature = "compiled_data")] create_dictionary_with_content_locale( locale: &Locale, ) -> Result<Box<WordSegmenter>, DataError>185 pub fn create_dictionary_with_content_locale( 186 locale: &Locale, 187 ) -> Result<Box<WordSegmenter>, DataError> { 188 Ok(Box::new(WordSegmenter( 189 icu_segmenter::WordSegmenter::try_new_dictionary(locale.into())?, 190 ))) 191 } 192 193 /// Construct an [`WordSegmenter`] with dictionary payload data for Chinese, Japanese, 194 /// Burmese, Khmer, Lao, and Thai, using a particular data source. 195 /// 196 /// Note: currently, it uses dictionary for Chinese and Japanese, and dictionary for Burmese, 197 /// Khmer, Lao, and Thai. 198 #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_dictionary, FnInStruct)] 199 #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_content_locale_and_provider")] 200 #[cfg(feature = "buffer_provider")] create_dictionary_with_content_locale_and_provider( provider: &DataProvider, locale: &Locale, ) -> Result<Box<WordSegmenter>, DataError>201 pub fn create_dictionary_with_content_locale_and_provider( 202 provider: &DataProvider, 203 locale: &Locale, 204 ) -> Result<Box<WordSegmenter>, DataError> { 205 Ok(Box::new(WordSegmenter( 206 icu_segmenter::WordSegmenter::try_new_dictionary_with_buffer_provider( 207 provider.get()?, 208 locale.into(), 209 )?, 210 ))) 211 } 212 /// Segments a string. 213 /// 214 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 215 /// to the WHATWG Encoding Standard. 216 #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_utf8, FnInStruct)] 217 #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_str, FnInStruct, hidden)] 218 #[diplomat::attr(not(supports = utf8_strings), disable)] 219 #[diplomat::attr(*, rename = "segment")] segment_utf8<'a>( &'a self, input: &'a DiplomatStr, ) -> Box<WordBreakIteratorUtf8<'a>>220 pub fn segment_utf8<'a>( 221 &'a self, 222 input: &'a DiplomatStr, 223 ) -> Box<WordBreakIteratorUtf8<'a>> { 224 Box::new(WordBreakIteratorUtf8(self.0.segment_utf8(input))) 225 } 226 227 /// Segments a string. 228 /// 229 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 230 /// to the WHATWG Encoding Standard. 231 #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_utf16, FnInStruct)] 232 #[diplomat::attr(not(supports = utf8_strings), rename = "segment")] 233 #[diplomat::attr(supports = utf8_strings, rename = "segment16")] segment_utf16<'a>( &'a self, input: &'a DiplomatStr16, ) -> Box<WordBreakIteratorUtf16<'a>>234 pub fn segment_utf16<'a>( 235 &'a self, 236 input: &'a DiplomatStr16, 237 ) -> Box<WordBreakIteratorUtf16<'a>> { 238 Box::new(WordBreakIteratorUtf16(self.0.segment_utf16(input))) 239 } 240 241 /// Segments a Latin-1 string. 242 #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_latin1, FnInStruct)] 243 #[diplomat::attr(not(supports = utf8_strings), disable)] segment_latin1<'a>(&'a self, input: &'a [u8]) -> Box<WordBreakIteratorLatin1<'a>>244 pub fn segment_latin1<'a>(&'a self, input: &'a [u8]) -> Box<WordBreakIteratorLatin1<'a>> { 245 Box::new(WordBreakIteratorLatin1(self.0.segment_latin1(input))) 246 } 247 } 248 249 impl<'a> WordBreakIteratorUtf8<'a> { 250 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 251 /// out of range of a 32-bit signed integer. 252 #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)] 253 #[diplomat::rust_link( 254 icu::segmenter::WordBreakIterator::Item, 255 AssociatedTypeInStruct, 256 hidden 257 )] next(&mut self) -> i32258 pub fn next(&mut self) -> i32 { 259 self.0 260 .next() 261 .and_then(|u| i32::try_from(u).ok()) 262 .unwrap_or(-1) 263 } 264 265 /// Return the status value of break boundary. 266 #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)] 267 #[diplomat::attr(auto, getter)] word_type(&self) -> SegmenterWordType268 pub fn word_type(&self) -> SegmenterWordType { 269 self.0.word_type().into() 270 } 271 272 /// Return true when break boundary is word-like such as letter/number/CJK 273 #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)] 274 #[diplomat::attr(auto, getter)] is_word_like(&self) -> bool275 pub fn is_word_like(&self) -> bool { 276 self.0.is_word_like() 277 } 278 } 279 280 impl<'a> WordBreakIteratorUtf16<'a> { 281 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 282 /// out of range of a 32-bit signed integer. 283 #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)] 284 #[diplomat::rust_link( 285 icu::segmenter::WordBreakIterator::Item, 286 AssociatedTypeInStruct, 287 hidden 288 )] next(&mut self) -> i32289 pub fn next(&mut self) -> i32 { 290 self.0 291 .next() 292 .and_then(|u| i32::try_from(u).ok()) 293 .unwrap_or(-1) 294 } 295 296 /// Return the status value of break boundary. 297 #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)] 298 #[diplomat::rust_link( 299 icu::segmenter::WordBreakIterator::iter_with_word_type, 300 FnInStruct, 301 hidden 302 )] 303 #[diplomat::attr(auto, getter)] word_type(&self) -> SegmenterWordType304 pub fn word_type(&self) -> SegmenterWordType { 305 self.0.word_type().into() 306 } 307 308 /// Return true when break boundary is word-like such as letter/number/CJK 309 #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)] 310 #[diplomat::attr(auto, getter)] is_word_like(&self) -> bool311 pub fn is_word_like(&self) -> bool { 312 self.0.is_word_like() 313 } 314 } 315 316 impl<'a> WordBreakIteratorLatin1<'a> { 317 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 318 /// out of range of a 32-bit signed integer. 319 #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)] 320 #[diplomat::rust_link( 321 icu::segmenter::WordBreakIterator::Item, 322 AssociatedTypeInStruct, 323 hidden 324 )] next(&mut self) -> i32325 pub fn next(&mut self) -> i32 { 326 self.0 327 .next() 328 .and_then(|u| i32::try_from(u).ok()) 329 .unwrap_or(-1) 330 } 331 332 /// Return the status value of break boundary. 333 #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)] 334 #[diplomat::attr(auto, getter)] word_type(&self) -> SegmenterWordType335 pub fn word_type(&self) -> SegmenterWordType { 336 self.0.word_type().into() 337 } 338 339 /// Return true when break boundary is word-like such as letter/number/CJK 340 #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)] 341 #[diplomat::attr(auto, getter)] is_word_like(&self) -> bool342 pub fn is_word_like(&self) -> bool { 343 self.0.is_word_like() 344 } 345 } 346 } 347 348 impl<'a> From<&'a crate::locale_core::ffi::Locale> 349 for icu_segmenter::options::WordBreakOptions<'a> 350 { from(other: &'a crate::locale_core::ffi::Locale) -> Self351 fn from(other: &'a crate::locale_core::ffi::Locale) -> Self { 352 let mut options = icu_segmenter::options::WordBreakOptions::default(); 353 options.content_locale = Some(&other.0.id); 354 options 355 } 356 } 357