1 // This file is part of ICU4X. For terms of use, please see the file 2 // called LICENSE at the top level of the ICU4X source tree 3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 4 5 #[diplomat::bridge] 6 #[diplomat::abi_rename = "icu4x_{0}_mv1"] 7 #[diplomat::attr(auto, namespace = "icu4x")] 8 pub mod ffi { 9 use alloc::boxed::Box; 10 11 #[cfg(any(feature = "compiled_data", feature = "buffer_provider"))] 12 use crate::locale_core::ffi::Locale; 13 #[cfg(feature = "buffer_provider")] 14 use crate::{errors::ffi::DataError, provider::ffi::DataProvider}; 15 use diplomat_runtime::DiplomatOption; 16 #[cfg(any(feature = "compiled_data", feature = "buffer_provider"))] 17 use icu_segmenter::options::LineBreakOptions; 18 19 #[diplomat::opaque] 20 /// An ICU4X line-break segmenter, capable of finding breakpoints in strings. 21 #[diplomat::rust_link(icu::segmenter::LineSegmenter, Struct)] 22 pub struct LineSegmenter(icu_segmenter::LineSegmenter); 23 24 #[diplomat::rust_link(icu::segmenter::options::LineBreakStrictness, Enum)] 25 #[diplomat::enum_convert(icu_segmenter::options::LineBreakStrictness, needs_wildcard)] 26 pub enum LineBreakStrictness { 27 Loose, 28 Normal, 29 Strict, 30 Anywhere, 31 } 32 33 #[diplomat::rust_link(icu::segmenter::options::LineBreakWordOption, Enum)] 34 #[diplomat::enum_convert(icu_segmenter::options::LineBreakWordOption, needs_wildcard)] 35 pub enum LineBreakWordOption { 36 Normal, 37 BreakAll, 38 KeepAll, 39 } 40 41 #[diplomat::rust_link(icu::segmenter::options::LineBreakOptions, Struct)] 42 #[diplomat::attr(supports = non_exhaustive_structs, rename = "LineBreakOptions")] 43 pub struct LineBreakOptionsV2 { 44 pub strictness: DiplomatOption<LineBreakStrictness>, 45 pub word_option: DiplomatOption<LineBreakWordOption>, 46 } 47 48 #[diplomat::opaque] 49 #[diplomat::rust_link(icu::segmenter::LineBreakIterator, Struct)] 50 #[diplomat::rust_link( 51 icu::segmenter::LineBreakIteratorPotentiallyIllFormedUtf8, 52 Typedef, 53 compact 54 )] 55 #[diplomat::rust_link(icu::segmenter::LineBreakIteratorUtf8, Typedef, hidden)] 56 pub struct LineBreakIteratorUtf8<'a>( 57 icu_segmenter::LineBreakIteratorPotentiallyIllFormedUtf8<'a, 'a>, 58 ); 59 60 #[diplomat::opaque] 61 #[diplomat::rust_link(icu::segmenter::LineBreakIterator, Struct)] 62 #[diplomat::rust_link(icu::segmenter::LineBreakIteratorUtf16, Typedef, compact)] 63 pub struct LineBreakIteratorUtf16<'a>(icu_segmenter::LineBreakIteratorUtf16<'a, 'a>); 64 65 #[diplomat::opaque] 66 #[diplomat::rust_link(icu::segmenter::LineBreakIterator, Struct)] 67 #[diplomat::rust_link(icu::segmenter::LineBreakIteratorLatin1, Typedef, compact)] 68 pub struct LineBreakIteratorLatin1<'a>(icu_segmenter::LineBreakIteratorLatin1<'a, 'a>); 69 70 impl LineSegmenter { 71 /// Construct a [`LineSegmenter`] with default options (no locale-based tailoring) using compiled data. It automatically loads the best 72 /// available payload data for Burmese, Khmer, Lao, and Thai. 73 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_auto, FnInStruct)] 74 #[diplomat::attr(auto, named_constructor = "auto")] 75 #[cfg(feature = "compiled_data")] create_auto() -> Box<LineSegmenter>76 pub fn create_auto() -> Box<LineSegmenter> { 77 Box::new(LineSegmenter(icu_segmenter::LineSegmenter::new_auto( 78 Default::default(), 79 ))) 80 } 81 82 /// Construct a [`LineSegmenter`] with default options (no locale-based tailoring) and LSTM payload data for 83 /// Burmese, Khmer, Lao, and Thai, using compiled data. 84 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_lstm, FnInStruct)] 85 #[diplomat::attr(auto, named_constructor = "lstm")] 86 #[cfg(feature = "compiled_data")] create_lstm() -> Box<LineSegmenter>87 pub fn create_lstm() -> Box<LineSegmenter> { 88 Box::new(LineSegmenter(icu_segmenter::LineSegmenter::new_lstm( 89 Default::default(), 90 ))) 91 } 92 93 /// Construct a [`LineSegmenter`] with default options (no locale-based tailoring) and dictionary payload data for 94 /// Burmese, Khmer, Lao, and Thai, using compiled data 95 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_dictionary, FnInStruct)] 96 #[diplomat::attr(auto, named_constructor = "dictionary")] 97 #[cfg(feature = "compiled_data")] create_dictionary() -> Box<LineSegmenter>98 pub fn create_dictionary() -> Box<LineSegmenter> { 99 Box::new(LineSegmenter(icu_segmenter::LineSegmenter::new_dictionary( 100 Default::default(), 101 ))) 102 } 103 104 /// Construct a [`LineSegmenter`] with custom options using compiled data. It automatically loads the best 105 /// available payload data for Burmese, Khmer, Lao, and Thai. 106 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_auto, FnInStruct)] 107 #[diplomat::attr(supports = non_exhaustive_structs, rename = "auto_with_options")] 108 #[diplomat::attr(all(supports = non_exhaustive_structs, supports = named_constructors), named_constructor = "auto_with_options")] 109 #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = named_constructors), named_constructor = "auto_with_options_v2")] 110 #[cfg(feature = "compiled_data")] create_auto_with_options_v2( content_locale: Option<&Locale>, options: LineBreakOptionsV2, ) -> Box<LineSegmenter>111 pub fn create_auto_with_options_v2( 112 content_locale: Option<&Locale>, 113 options: LineBreakOptionsV2, 114 ) -> Box<LineSegmenter> { 115 let mut options: LineBreakOptions = options.into(); 116 options.content_locale = content_locale.map(|c| &c.0.id); 117 Box::new(LineSegmenter(icu_segmenter::LineSegmenter::new_auto( 118 options, 119 ))) 120 } 121 /// Construct a [`LineSegmenter`] with custom options. It automatically loads the best 122 /// available payload data for Burmese, Khmer, Lao, and Thai, using a particular data source. 123 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_auto, FnInStruct)] 124 #[diplomat::attr(supports = non_exhaustive_structs, rename = "auto_with_options_and_provider")] 125 #[diplomat::attr(all(supports = non_exhaustive_structs, supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_options_and_provider")] 126 #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_options_v2_and_provider")] 127 #[cfg(feature = "buffer_provider")] create_auto_with_options_v2_and_provider( provider: &DataProvider, content_locale: Option<&Locale>, options: LineBreakOptionsV2, ) -> Result<Box<LineSegmenter>, DataError>128 pub fn create_auto_with_options_v2_and_provider( 129 provider: &DataProvider, 130 content_locale: Option<&Locale>, 131 options: LineBreakOptionsV2, 132 ) -> Result<Box<LineSegmenter>, DataError> { 133 let mut options: LineBreakOptions = options.into(); 134 options.content_locale = content_locale.map(|c| &c.0.id); 135 136 Ok(Box::new(LineSegmenter( 137 icu_segmenter::LineSegmenter::try_new_auto_with_buffer_provider( 138 provider.get()?, 139 options, 140 )?, 141 ))) 142 } 143 /// Construct a [`LineSegmenter`] with custom options and LSTM payload data for 144 /// Burmese, Khmer, Lao, and Thai, using compiled data. 145 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_lstm, FnInStruct)] 146 #[diplomat::attr(supports = non_exhaustive_structs, rename = "lstm_with_options")] 147 #[diplomat::attr(all(supports = non_exhaustive_structs, supports = named_constructors), named_constructor = "lstm_with_options")] 148 #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = named_constructors), named_constructor = "lstm_with_options_v2")] 149 #[cfg(feature = "compiled_data")] create_lstm_with_options_v2( content_locale: Option<&Locale>, options: LineBreakOptionsV2, ) -> Box<LineSegmenter>150 pub fn create_lstm_with_options_v2( 151 content_locale: Option<&Locale>, 152 options: LineBreakOptionsV2, 153 ) -> Box<LineSegmenter> { 154 let mut options: LineBreakOptions = options.into(); 155 options.content_locale = content_locale.map(|c| &c.0.id); 156 157 Box::new(LineSegmenter(icu_segmenter::LineSegmenter::new_lstm( 158 options, 159 ))) 160 } 161 /// Construct a [`LineSegmenter`] with custom options and LSTM payload data for 162 /// Burmese, Khmer, Lao, and Thai, using a particular data source. 163 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_lstm, FnInStruct)] 164 #[diplomat::attr(supports = non_exhaustive_structs, rename = "lstm_with_options_and_provider")] 165 #[diplomat::attr(all(supports = non_exhaustive_structs, supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_options_and_provider")] 166 #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_options_v2_and_provider")] 167 #[cfg(feature = "buffer_provider")] create_lstm_with_options_v2_and_provider( provider: &DataProvider, content_locale: Option<&Locale>, options: LineBreakOptionsV2, ) -> Result<Box<LineSegmenter>, DataError>168 pub fn create_lstm_with_options_v2_and_provider( 169 provider: &DataProvider, 170 content_locale: Option<&Locale>, 171 options: LineBreakOptionsV2, 172 ) -> Result<Box<LineSegmenter>, DataError> { 173 let mut options: LineBreakOptions = options.into(); 174 options.content_locale = content_locale.map(|c| &c.0.id); 175 176 Ok(Box::new(LineSegmenter( 177 icu_segmenter::LineSegmenter::try_new_lstm_with_buffer_provider( 178 provider.get()?, 179 options, 180 )?, 181 ))) 182 } 183 /// Construct a [`LineSegmenter`] with custom options and dictionary payload data for 184 /// Burmese, Khmer, Lao, and Thai, using compiled data. 185 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_dictionary, FnInStruct)] 186 #[diplomat::attr(supports = non_exhaustive_structs, rename = "dictionary_with_options")] 187 #[diplomat::attr(all(supports = non_exhaustive_structs, supports = named_constructors), named_constructor = "dictionary_with_options")] 188 #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = named_constructors), named_constructor = "dictionary_with_options_v2")] 189 #[cfg(feature = "compiled_data")] create_dictionary_with_options_v2( content_locale: Option<&Locale>, options: LineBreakOptionsV2, ) -> Box<LineSegmenter>190 pub fn create_dictionary_with_options_v2( 191 content_locale: Option<&Locale>, 192 options: LineBreakOptionsV2, 193 ) -> Box<LineSegmenter> { 194 let mut options: LineBreakOptions = options.into(); 195 options.content_locale = content_locale.map(|c| &c.0.id); 196 197 Box::new(LineSegmenter(icu_segmenter::LineSegmenter::new_dictionary( 198 options, 199 ))) 200 } 201 /// Construct a [`LineSegmenter`] with custom options and dictionary payload data for 202 /// Burmese, Khmer, Lao, and Thai, using a particular data source. 203 #[diplomat::rust_link(icu::segmenter::LineSegmenter::new_dictionary, FnInStruct)] 204 #[diplomat::attr(supports = non_exhaustive_structs, rename = "dictionary_with_options_and_provider")] 205 #[diplomat::attr(all(supports = non_exhaustive_structs, supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_options_and_provider")] 206 #[diplomat::attr(all(not(supports = non_exhaustive_structs), supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_options_v2_and_provider")] 207 #[cfg(feature = "buffer_provider")] create_dictionary_with_options_v2_and_provider( provider: &DataProvider, content_locale: Option<&Locale>, options: LineBreakOptionsV2, ) -> Result<Box<LineSegmenter>, DataError>208 pub fn create_dictionary_with_options_v2_and_provider( 209 provider: &DataProvider, 210 content_locale: Option<&Locale>, 211 options: LineBreakOptionsV2, 212 ) -> Result<Box<LineSegmenter>, DataError> { 213 let mut options: LineBreakOptions = options.into(); 214 options.content_locale = content_locale.map(|c| &c.0.id); 215 216 Ok(Box::new(LineSegmenter( 217 icu_segmenter::LineSegmenter::try_new_dictionary_with_buffer_provider( 218 provider.get()?, 219 options, 220 )?, 221 ))) 222 } 223 /// Segments a string. 224 /// 225 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 226 /// to the WHATWG Encoding Standard. 227 #[diplomat::rust_link(icu::segmenter::LineSegmenter::segment_utf8, FnInStruct)] 228 #[diplomat::rust_link(icu::segmenter::LineSegmenter::segment_str, FnInStruct, hidden)] 229 #[diplomat::attr(not(supports = utf8_strings), disable)] 230 #[diplomat::attr(*, rename = "segment")] segment_utf8<'a>( &'a self, input: &'a DiplomatStr, ) -> Box<LineBreakIteratorUtf8<'a>>231 pub fn segment_utf8<'a>( 232 &'a self, 233 input: &'a DiplomatStr, 234 ) -> Box<LineBreakIteratorUtf8<'a>> { 235 Box::new(LineBreakIteratorUtf8(self.0.segment_utf8(input))) 236 } 237 238 /// Segments a string. 239 /// 240 /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according 241 /// to the WHATWG Encoding Standard. 242 #[diplomat::rust_link(icu::segmenter::LineSegmenter::segment_utf16, FnInStruct)] 243 #[diplomat::attr(not(supports = utf8_strings), rename = "segment")] 244 #[diplomat::attr(supports = utf8_strings, rename = "segment16")] segment_utf16<'a>( &'a self, input: &'a DiplomatStr16, ) -> Box<LineBreakIteratorUtf16<'a>>245 pub fn segment_utf16<'a>( 246 &'a self, 247 input: &'a DiplomatStr16, 248 ) -> Box<LineBreakIteratorUtf16<'a>> { 249 Box::new(LineBreakIteratorUtf16(self.0.segment_utf16(input))) 250 } 251 252 /// Segments a Latin-1 string. 253 #[diplomat::rust_link(icu::segmenter::LineSegmenter::segment_latin1, FnInStruct)] 254 #[diplomat::attr(not(supports = utf8_strings), disable)] segment_latin1<'a>(&'a self, input: &'a [u8]) -> Box<LineBreakIteratorLatin1<'a>>255 pub fn segment_latin1<'a>(&'a self, input: &'a [u8]) -> Box<LineBreakIteratorLatin1<'a>> { 256 Box::new(LineBreakIteratorLatin1(self.0.segment_latin1(input))) 257 } 258 } 259 260 impl<'a> LineBreakIteratorUtf8<'a> { 261 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 262 /// out of range of a 32-bit signed integer. 263 #[diplomat::rust_link(icu::segmenter::LineBreakIterator::next, FnInStruct)] 264 #[diplomat::rust_link( 265 icu::segmenter::LineBreakIterator::Item, 266 AssociatedTypeInStruct, 267 hidden 268 )] next(&mut self) -> i32269 pub fn next(&mut self) -> i32 { 270 self.0 271 .next() 272 .and_then(|u| i32::try_from(u).ok()) 273 .unwrap_or(-1) 274 } 275 } 276 277 impl<'a> LineBreakIteratorUtf16<'a> { 278 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 279 /// out of range of a 32-bit signed integer. 280 #[diplomat::rust_link(icu::segmenter::LineBreakIterator::next, FnInStruct)] 281 #[diplomat::rust_link( 282 icu::segmenter::LineBreakIterator::Item, 283 AssociatedTypeInStruct, 284 hidden 285 )] next(&mut self) -> i32286 pub fn next(&mut self) -> i32 { 287 self.0 288 .next() 289 .and_then(|u| i32::try_from(u).ok()) 290 .unwrap_or(-1) 291 } 292 } 293 294 impl<'a> LineBreakIteratorLatin1<'a> { 295 /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is 296 /// out of range of a 32-bit signed integer. 297 #[diplomat::rust_link(icu::segmenter::LineBreakIterator::next, FnInStruct)] 298 #[diplomat::rust_link( 299 icu::segmenter::LineBreakIterator::Item, 300 AssociatedTypeInStruct, 301 hidden 302 )] next(&mut self) -> i32303 pub fn next(&mut self) -> i32 { 304 self.0 305 .next() 306 .and_then(|u| i32::try_from(u).ok()) 307 .unwrap_or(-1) 308 } 309 } 310 } 311 312 impl From<ffi::LineBreakOptionsV2> for icu_segmenter::options::LineBreakOptions<'_> { from(other: ffi::LineBreakOptionsV2) -> Self313 fn from(other: ffi::LineBreakOptionsV2) -> Self { 314 let mut options = icu_segmenter::options::LineBreakOptions::default(); 315 options.strictness = other.strictness.into_converted_option(); 316 options.word_option = other.word_option.into_converted_option(); 317 options 318 } 319 } 320