• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 #[diplomat::bridge]
6 #[diplomat::abi_rename = "icu4x_{0}_mv1"]
7 #[diplomat::attr(auto, namespace = "icu4x")]
8 pub mod ffi {
9     use alloc::boxed::Box;
10 
11     #[cfg(feature = "buffer_provider")]
12     use crate::provider::ffi::DataProvider;
13     #[cfg(any(feature = "compiled_data", feature = "buffer_provider"))]
14     use crate::{errors::ffi::DataError, locale_core::ffi::Locale};
15 
    /// Status value that the word break iterators report for a break boundary
    /// through their `word_type` getters.
    ///
    /// Converted to and from `icu_segmenter::options::WordType` via the
    /// `enum_convert` attribute below.
    // NOTE(review): the explicit discriminants are presumably part of the exported
    // ABI — confirm before renumbering or reordering variants.
    #[diplomat::enum_convert(icu_segmenter::options::WordType, needs_wildcard)]
    #[diplomat::rust_link(icu::segmenter::options::WordType, Enum)]
    pub enum SegmenterWordType {
        None = 0,
        Number = 1,
        Letter = 2,
    }
23 
    #[diplomat::opaque]
    /// An ICU4X word-break segmenter, capable of finding word breakpoints in strings.
    ///
    /// This is an opaque wrapper around `icu_segmenter::WordSegmenter`; instances are
    /// handed out boxed by the `create_*` constructors.
    #[diplomat::rust_link(icu::segmenter::WordSegmenter, Struct)]
    #[diplomat::demo(custom_func = "../../npm/demo_gen_custom/WordSegmenter.mjs")]
    pub struct WordSegmenter(icu_segmenter::WordSegmenter);
29 
    /// Iterator over the word breakpoints of a (potentially ill-formed) UTF-8 string.
    ///
    /// Returned by `WordSegmenter::segment_utf8`; the lifetime `'a` ties the iterator
    /// to both the segmenter and the input it was created from.
    #[diplomat::opaque]
    #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)]
    #[diplomat::rust_link(
        icu::segmenter::WordBreakIteratorPotentiallyIllFormedUtf8,
        Typedef,
        hidden
    )]
    #[diplomat::rust_link(icu::segmenter::WordBreakIteratorUtf8, Typedef, hidden)]
    pub struct WordBreakIteratorUtf8<'a>(
        icu_segmenter::WordBreakIteratorPotentiallyIllFormedUtf8<'a, 'a>,
    );
41 
    /// Iterator over the word breakpoints of a UTF-16 string.
    ///
    /// Returned by `WordSegmenter::segment_utf16`; the lifetime `'a` ties the iterator
    /// to both the segmenter and the input it was created from.
    #[diplomat::opaque]
    #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)]
    #[diplomat::rust_link(icu::segmenter::WordBreakIteratorUtf16, Typedef, hidden)]
    pub struct WordBreakIteratorUtf16<'a>(icu_segmenter::WordBreakIteratorUtf16<'a, 'a>);
46 
    /// Iterator over the word breakpoints of a Latin-1 string.
    ///
    /// Returned by `WordSegmenter::segment_latin1`; the lifetime `'a` ties the iterator
    /// to both the segmenter and the input it was created from.
    #[diplomat::opaque]
    #[diplomat::rust_link(icu::segmenter::WordBreakIterator, Struct)]
    #[diplomat::rust_link(icu::segmenter::WordBreakIteratorLatin1, Typedef, hidden)]
    pub struct WordBreakIteratorLatin1<'a>(icu_segmenter::WordBreakIteratorLatin1<'a, 'a>);
51 
52     impl SegmenterWordType {
53         #[diplomat::rust_link(icu::segmenter::options::WordType::is_word_like, FnInEnum)]
54         #[diplomat::attr(auto, getter)]
is_word_like(self) -> bool55         pub fn is_word_like(self) -> bool {
56             icu_segmenter::options::WordType::from(self).is_word_like()
57         }
58     }
59 
60     impl WordSegmenter {
61         /// Construct an [`WordSegmenter`] with automatically selecting the best available LSTM
62         /// or dictionary payload data, using compiled data. This does not assume any content locale.
63         ///
64         /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
65         /// Khmer, Lao, and Thai.
66         #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_auto, FnInStruct)]
67         #[diplomat::rust_link(icu::segmenter::options::WordBreakInvariantOptions, Struct, hidden)]
68         #[diplomat::attr(auto, named_constructor = "auto")]
69         #[cfg(feature = "compiled_data")]
create_auto() -> Box<WordSegmenter>70         pub fn create_auto() -> Box<WordSegmenter> {
71             Box::new(WordSegmenter(icu_segmenter::WordSegmenter::new_auto(
72                 Default::default(),
73             )))
74         }
75 
76         /// Construct an [`WordSegmenter`] with automatically selecting the best available LSTM
77         /// or dictionary payload data, using compiled data.
78         ///
79         /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
80         /// Khmer, Lao, and Thai.
81         #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_auto, FnInStruct)]
82         #[diplomat::rust_link(icu::segmenter::options::WordBreakOptions, Struct, hidden)]
83         #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_content_locale")]
84         #[cfg(feature = "compiled_data")]
create_auto_with_content_locale( locale: &Locale, ) -> Result<Box<WordSegmenter>, DataError>85         pub fn create_auto_with_content_locale(
86             locale: &Locale,
87         ) -> Result<Box<WordSegmenter>, DataError> {
88             Ok(Box::new(WordSegmenter(
89                 icu_segmenter::WordSegmenter::try_new_auto(locale.into())?,
90             )))
91         }
92 
93         /// Construct an [`WordSegmenter`] with automatically selecting the best available LSTM
94         /// or dictionary payload data, using a particular data source.
95         ///
96         /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
97         /// Khmer, Lao, and Thai.
98         #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_auto, FnInStruct)]
99         #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "auto_with_content_locale_and_provider")]
100         #[cfg(feature = "buffer_provider")]
create_auto_with_content_locale_and_provider( provider: &DataProvider, locale: &Locale, ) -> Result<Box<WordSegmenter>, DataError>101         pub fn create_auto_with_content_locale_and_provider(
102             provider: &DataProvider,
103             locale: &Locale,
104         ) -> Result<Box<WordSegmenter>, DataError> {
105             Ok(Box::new(WordSegmenter(
106                 icu_segmenter::WordSegmenter::try_new_auto_with_buffer_provider(
107                     provider.get()?,
108                     locale.into(),
109                 )?,
110             )))
111         }
112 
113         /// Construct an [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and
114         /// Thai, using compiled data.  This does not assume any content locale.
115         ///
116         /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
117         /// Khmer, Lao, and Thai.
118         #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_lstm, FnInStruct)]
119         #[diplomat::attr(auto, named_constructor = "lstm")]
120         #[cfg(feature = "compiled_data")]
create_lstm() -> Box<WordSegmenter>121         pub fn create_lstm() -> Box<WordSegmenter> {
122             Box::new(WordSegmenter(icu_segmenter::WordSegmenter::new_lstm(
123                 Default::default(),
124             )))
125         }
126 
127         /// Construct an [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and
128         /// Thai, using compiled data.
129         ///
130         /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
131         /// Khmer, Lao, and Thai.
132         #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_lstm, FnInStruct)]
133         #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_content_locale")]
134         #[cfg(feature = "compiled_data")]
create_lstm_with_content_locale( locale: &Locale, ) -> Result<Box<WordSegmenter>, DataError>135         pub fn create_lstm_with_content_locale(
136             locale: &Locale,
137         ) -> Result<Box<WordSegmenter>, DataError> {
138             Ok(Box::new(WordSegmenter(
139                 icu_segmenter::WordSegmenter::try_new_lstm(locale.into())?,
140             )))
141         }
142 
143         /// Construct an [`WordSegmenter`] with LSTM payload data for Burmese, Khmer, Lao, and
144         /// Thai, using a particular data source.
145         ///
146         /// Note: currently, it uses dictionary for Chinese and Japanese, and LSTM for Burmese,
147         /// Khmer, Lao, and Thai.
148         #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_lstm, FnInStruct)]
149         #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "lstm_with_content_locale_and_provider")]
150         #[cfg(feature = "buffer_provider")]
create_lstm_with_content_locale_and_provider( provider: &DataProvider, locale: &Locale, ) -> Result<Box<WordSegmenter>, DataError>151         pub fn create_lstm_with_content_locale_and_provider(
152             provider: &DataProvider,
153             locale: &Locale,
154         ) -> Result<Box<WordSegmenter>, DataError> {
155             Ok(Box::new(WordSegmenter(
156                 icu_segmenter::WordSegmenter::try_new_lstm_with_buffer_provider(
157                     provider.get()?,
158                     locale.into(),
159                 )?,
160             )))
161         }
162 
163         /// Construct an [`WordSegmenter`] with with dictionary payload data for Chinese, Japanese,
164         /// Burmese, Khmer, Lao, and Thai, using compiled data.  This does not assume any content locale.
165         ///
166         /// Note: currently, it uses dictionary for Chinese and Japanese, and dictionary for Burmese,
167         /// Khmer, Lao, and Thai.
168         #[diplomat::rust_link(icu::segmenter::WordSegmenter::new_dictionary, FnInStruct)]
169         #[diplomat::attr(auto, named_constructor = "dictionary")]
170         #[cfg(feature = "compiled_data")]
create_dictionary() -> Box<WordSegmenter>171         pub fn create_dictionary() -> Box<WordSegmenter> {
172             Box::new(WordSegmenter(icu_segmenter::WordSegmenter::new_dictionary(
173                 Default::default(),
174             )))
175         }
176 
177         /// Construct an [`WordSegmenter`] with dictionary payload data for Chinese, Japanese,
178         /// Burmese, Khmer, Lao, and Thai, using compiled data.
179         ///
180         /// Note: currently, it uses dictionary for Chinese and Japanese, and dictionary for Burmese,
181         /// Khmer, Lao, and Thai.
182         #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_dictionary, FnInStruct)]
183         #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_content_locale")]
184         #[cfg(feature = "compiled_data")]
create_dictionary_with_content_locale( locale: &Locale, ) -> Result<Box<WordSegmenter>, DataError>185         pub fn create_dictionary_with_content_locale(
186             locale: &Locale,
187         ) -> Result<Box<WordSegmenter>, DataError> {
188             Ok(Box::new(WordSegmenter(
189                 icu_segmenter::WordSegmenter::try_new_dictionary(locale.into())?,
190             )))
191         }
192 
193         /// Construct an [`WordSegmenter`] with dictionary payload data for Chinese, Japanese,
194         /// Burmese, Khmer, Lao, and Thai, using a particular data source.
195         ///
196         /// Note: currently, it uses dictionary for Chinese and Japanese, and dictionary for Burmese,
197         /// Khmer, Lao, and Thai.
198         #[diplomat::rust_link(icu::segmenter::WordSegmenter::try_new_dictionary, FnInStruct)]
199         #[diplomat::attr(all(supports = fallible_constructors, supports = named_constructors), named_constructor = "dictionary_with_content_locale_and_provider")]
200         #[cfg(feature = "buffer_provider")]
create_dictionary_with_content_locale_and_provider( provider: &DataProvider, locale: &Locale, ) -> Result<Box<WordSegmenter>, DataError>201         pub fn create_dictionary_with_content_locale_and_provider(
202             provider: &DataProvider,
203             locale: &Locale,
204         ) -> Result<Box<WordSegmenter>, DataError> {
205             Ok(Box::new(WordSegmenter(
206                 icu_segmenter::WordSegmenter::try_new_dictionary_with_buffer_provider(
207                     provider.get()?,
208                     locale.into(),
209                 )?,
210             )))
211         }
212         /// Segments a string.
213         ///
214         /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
215         /// to the WHATWG Encoding Standard.
216         #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_utf8, FnInStruct)]
217         #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_str, FnInStruct, hidden)]
218         #[diplomat::attr(not(supports = utf8_strings), disable)]
219         #[diplomat::attr(*, rename = "segment")]
segment_utf8<'a>( &'a self, input: &'a DiplomatStr, ) -> Box<WordBreakIteratorUtf8<'a>>220         pub fn segment_utf8<'a>(
221             &'a self,
222             input: &'a DiplomatStr,
223         ) -> Box<WordBreakIteratorUtf8<'a>> {
224             Box::new(WordBreakIteratorUtf8(self.0.segment_utf8(input)))
225         }
226 
227         /// Segments a string.
228         ///
229         /// Ill-formed input is treated as if errors had been replaced with REPLACEMENT CHARACTERs according
230         /// to the WHATWG Encoding Standard.
231         #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_utf16, FnInStruct)]
232         #[diplomat::attr(not(supports = utf8_strings), rename = "segment")]
233         #[diplomat::attr(supports = utf8_strings, rename = "segment16")]
segment_utf16<'a>( &'a self, input: &'a DiplomatStr16, ) -> Box<WordBreakIteratorUtf16<'a>>234         pub fn segment_utf16<'a>(
235             &'a self,
236             input: &'a DiplomatStr16,
237         ) -> Box<WordBreakIteratorUtf16<'a>> {
238             Box::new(WordBreakIteratorUtf16(self.0.segment_utf16(input)))
239         }
240 
241         /// Segments a Latin-1 string.
242         #[diplomat::rust_link(icu::segmenter::WordSegmenter::segment_latin1, FnInStruct)]
243         #[diplomat::attr(not(supports = utf8_strings), disable)]
segment_latin1<'a>(&'a self, input: &'a [u8]) -> Box<WordBreakIteratorLatin1<'a>>244         pub fn segment_latin1<'a>(&'a self, input: &'a [u8]) -> Box<WordBreakIteratorLatin1<'a>> {
245             Box::new(WordBreakIteratorLatin1(self.0.segment_latin1(input)))
246         }
247     }
248 
249     impl<'a> WordBreakIteratorUtf8<'a> {
250         /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
251         /// out of range of a 32-bit signed integer.
252         #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)]
253         #[diplomat::rust_link(
254             icu::segmenter::WordBreakIterator::Item,
255             AssociatedTypeInStruct,
256             hidden
257         )]
next(&mut self) -> i32258         pub fn next(&mut self) -> i32 {
259             self.0
260                 .next()
261                 .and_then(|u| i32::try_from(u).ok())
262                 .unwrap_or(-1)
263         }
264 
265         /// Return the status value of break boundary.
266         #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)]
267         #[diplomat::attr(auto, getter)]
word_type(&self) -> SegmenterWordType268         pub fn word_type(&self) -> SegmenterWordType {
269             self.0.word_type().into()
270         }
271 
272         /// Return true when break boundary is word-like such as letter/number/CJK
273         #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)]
274         #[diplomat::attr(auto, getter)]
is_word_like(&self) -> bool275         pub fn is_word_like(&self) -> bool {
276             self.0.is_word_like()
277         }
278     }
279 
280     impl<'a> WordBreakIteratorUtf16<'a> {
281         /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
282         /// out of range of a 32-bit signed integer.
283         #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)]
284         #[diplomat::rust_link(
285             icu::segmenter::WordBreakIterator::Item,
286             AssociatedTypeInStruct,
287             hidden
288         )]
next(&mut self) -> i32289         pub fn next(&mut self) -> i32 {
290             self.0
291                 .next()
292                 .and_then(|u| i32::try_from(u).ok())
293                 .unwrap_or(-1)
294         }
295 
296         /// Return the status value of break boundary.
297         #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)]
298         #[diplomat::rust_link(
299             icu::segmenter::WordBreakIterator::iter_with_word_type,
300             FnInStruct,
301             hidden
302         )]
303         #[diplomat::attr(auto, getter)]
word_type(&self) -> SegmenterWordType304         pub fn word_type(&self) -> SegmenterWordType {
305             self.0.word_type().into()
306         }
307 
308         /// Return true when break boundary is word-like such as letter/number/CJK
309         #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)]
310         #[diplomat::attr(auto, getter)]
is_word_like(&self) -> bool311         pub fn is_word_like(&self) -> bool {
312             self.0.is_word_like()
313         }
314     }
315 
316     impl<'a> WordBreakIteratorLatin1<'a> {
317         /// Finds the next breakpoint. Returns -1 if at the end of the string or if the index is
318         /// out of range of a 32-bit signed integer.
319         #[diplomat::rust_link(icu::segmenter::WordBreakIterator::next, FnInStruct)]
320         #[diplomat::rust_link(
321             icu::segmenter::WordBreakIterator::Item,
322             AssociatedTypeInStruct,
323             hidden
324         )]
next(&mut self) -> i32325         pub fn next(&mut self) -> i32 {
326             self.0
327                 .next()
328                 .and_then(|u| i32::try_from(u).ok())
329                 .unwrap_or(-1)
330         }
331 
332         /// Return the status value of break boundary.
333         #[diplomat::rust_link(icu::segmenter::WordBreakIterator::word_type, FnInStruct)]
334         #[diplomat::attr(auto, getter)]
word_type(&self) -> SegmenterWordType335         pub fn word_type(&self) -> SegmenterWordType {
336             self.0.word_type().into()
337         }
338 
339         /// Return true when break boundary is word-like such as letter/number/CJK
340         #[diplomat::rust_link(icu::segmenter::WordBreakIterator::is_word_like, FnInStruct)]
341         #[diplomat::attr(auto, getter)]
is_word_like(&self) -> bool342         pub fn is_word_like(&self) -> bool {
343             self.0.is_word_like()
344         }
345     }
346 }
347 
348 impl<'a> From<&'a crate::locale_core::ffi::Locale>
349     for icu_segmenter::options::WordBreakOptions<'a>
350 {
from(other: &'a crate::locale_core::ffi::Locale) -> Self351     fn from(other: &'a crate::locale_core::ffi::Locale) -> Self {
352         let mut options = icu_segmenter::options::WordBreakOptions::default();
353         options.content_locale = Some(&other.0.id);
354         options
355     }
356 }
357