• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 // Provider structs must be stable
6 #![allow(clippy::exhaustive_structs, clippy::exhaustive_enums)]
7 
8 //! 🚧 \[Unstable\] Data provider struct definitions for this ICU4X component.
9 //!
10 //! <div class="stab unstable">
11 //! 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
12 //! including in SemVer minor releases. While the serde representation of data structs is guaranteed
13 //! to be stable, their Rust representation might not be. Use with caution.
14 //! </div>
15 //!
16 //! Read more about data providers: [`icu_provider`]
17 
18 #[cfg(feature = "compiled_data")]
19 #[derive(Debug)]
20 /// Baked data provider type, compiled only when the `compiled_data` feature is enabled.
21 ///
22 /// <div class="stab unstable">
23 /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
24 /// including in SemVer minor releases. In particular, the `DataProvider` implementations are only
25 /// guaranteed to match with this version's `*_unstable` providers. Use with caution.
26 /// </div>
27 pub struct Baked;
28 
29 #[cfg(feature = "compiled_data")]
30 #[allow(unused_imports)] // NOTE(review): presumably needed because of the glob import below — confirm
31 const _: () = {
32     use icu_locale_data::*; // presumably the source of the `make_provider!`/`impl_*!` macros — verify
33     pub mod icu { // local `icu` module exposing this crate as `locale` and `icu_collections` as `collections`
34         pub use crate as locale;
35         pub use icu_collections as collections;
36     }
37     make_provider!(Baked); // NOTE(review): macro defined outside this file; expansion not visible here
38     impl_locale_aliases_v1!(Baked);
39     impl_locale_likely_subtags_extended_v1!(Baked);
40     impl_locale_likely_subtags_language_v1!(Baked);
41     impl_locale_likely_subtags_script_region_v1!(Baked);
42     impl_locale_parents_v1!(Baked);
43     impl_locale_script_direction_v1!(Baked);
44 
45     impl_locale_exemplar_characters_auxiliary_v1!(Baked);
46     impl_locale_exemplar_characters_index_v1!(Baked);
47     impl_locale_exemplar_characters_main_v1!(Baked);
48     impl_locale_exemplar_characters_numbers_v1!(Baked);
49     impl_locale_exemplar_characters_punctuation_v1!(Baked);
50 };
51 
52 icu_provider::data_marker!(
53     /// Singleton marker for locale alias data; the payload is [`Aliases`].
54     LocaleAliasesV1,
55     "locale/aliases/v1",
56     Aliases<'static>,
57     is_singleton = true
58 );
59 icu_provider::data_marker!(
60     /// Singleton marker for likely subtags data for languages; the payload is [`LikelySubtagsForLanguage`].
61     LocaleLikelySubtagsLanguageV1,
62     "locale/likely/subtags/language/v1",
63     LikelySubtagsForLanguage<'static>,
64     is_singleton = true
65 );
66 icu_provider::data_marker!(
67     /// Singleton marker for likely subtags data for scripts and regions; the payload is [`LikelySubtagsForScriptRegion`].
68     LocaleLikelySubtagsScriptRegionV1,
69     "locale/likely/subtags/script/region/v1",
70     LikelySubtagsForScriptRegion<'static>,
71     is_singleton = true
72 );
73 icu_provider::data_marker!(
74     /// Singleton marker for extended likely subtags data; the payload is [`LikelySubtagsExtended`].
75     LocaleLikelySubtagsExtendedV1,
76     "locale/likely/subtags/extended/v1",
77     LikelySubtagsExtended<'static>,
78     is_singleton = true
79 );
80 icu_provider::data_marker!(
81     /// Singleton marker for locale fallback parents data; the payload is [`Parents`].
82     LocaleParentsV1,
83     "locale/parents/v1",
84     Parents<'static>,
85     is_singleton = true
86 );
87 
88 icu_provider::data_marker!(
89     /// Singleton marker for script direction data; the payload is [`ScriptDirection`].
90     LocaleScriptDirectionV1,
91     "locale/script/direction/v1",
92     ScriptDirection<'static>,
93     is_singleton = true
94 );
95 
96 icu_provider::data_marker!(
97     /// Marker for auxiliary exemplar characters data; the payload is [`ExemplarCharactersData`].
98     LocaleExemplarCharactersAuxiliaryV1,
99     "locale/exemplar/characters/auxiliary/v1",
100     ExemplarCharactersData<'static>,
101 );
102 icu_provider::data_marker!(
103     /// Marker for index exemplar characters data; the payload is [`ExemplarCharactersData`].
104     LocaleExemplarCharactersIndexV1,
105     "locale/exemplar/characters/index/v1",
106     ExemplarCharactersData<'static>,
107 );
108 icu_provider::data_marker!(
109     /// Marker for main exemplar characters data; the payload is [`ExemplarCharactersData`].
110     LocaleExemplarCharactersMainV1,
111     "locale/exemplar/characters/main/v1",
112     ExemplarCharactersData<'static>,
113 );
114 icu_provider::data_marker!(
115     /// Marker for numbers exemplar characters data; the payload is [`ExemplarCharactersData`].
116     LocaleExemplarCharactersNumbersV1,
117     "locale/exemplar/characters/numbers/v1",
118     ExemplarCharactersData<'static>,
119 );
120 icu_provider::data_marker!(
121     /// Marker for punctuation exemplar characters data; the payload is [`ExemplarCharactersData`].
122     LocaleExemplarCharactersPunctuationV1,
123     "locale/exemplar/characters/punctuation/v1",
124     ExemplarCharactersData<'static>,
125 );
126 
127 #[cfg(feature = "datagen")]
128 /// The latest minimum set of markers required by this component (only built with the `datagen` feature).
129 pub const MARKERS: &[DataMarkerInfo] = &[ // entries are listed in alphabetical order of marker name
130     LocaleAliasesV1::INFO,
131     LocaleExemplarCharactersAuxiliaryV1::INFO,
132     LocaleExemplarCharactersIndexV1::INFO,
133     LocaleExemplarCharactersMainV1::INFO,
134     LocaleExemplarCharactersNumbersV1::INFO,
135     LocaleExemplarCharactersPunctuationV1::INFO,
136     LocaleLikelySubtagsExtendedV1::INFO,
137     LocaleLikelySubtagsLanguageV1::INFO,
138     LocaleLikelySubtagsScriptRegionV1::INFO,
139     LocaleParentsV1::INFO,
140     LocaleScriptDirectionV1::INFO,
141 ];
142 
143 use alloc::borrow::Cow;
144 use icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList;
145 use icu_locale_core::subtags::{Language, Region, Script, Variant};
146 use icu_provider::prelude::*;
147 use potential_utf::PotentialUtf8;
148 use tinystr::{TinyAsciiStr, UnvalidatedTinyAsciiStr};
149 use zerovec::{VarZeroVec, ZeroMap, ZeroSlice, ZeroVec};
150 
151 // We use raw TinyAsciiStrs for map keys, so that we don't have to
152 // validate them as subtags on deserialization. Map lookup can be
153 // done even if they are not valid tags (an invalid key will just
154 // become inaccessible).
155 type UnvalidatedLanguage = UnvalidatedTinyAsciiStr<3>;
156 type UnvalidatedScript = UnvalidatedTinyAsciiStr<4>;
157 type UnvalidatedRegion = UnvalidatedTinyAsciiStr<3>;
158 type UnvalidatedVariant = UnvalidatedTinyAsciiStr<8>;
159 type UnvalidatedSubdivision = UnvalidatedTinyAsciiStr<7>; // key type of `Aliases::subdivision`
160 type SemivalidatedSubdivision = TinyAsciiStr<7>; // value type of `Aliases::subdivision`; a plain `TinyAsciiStr`
161 
162 // LanguageIdentifier doesn't have an AsULE implementation, so we have
163 // to store strs and parse them when needed.
164 type UnvalidatedLanguageIdentifier = str;
165 type UnvalidatedLanguageIdentifierPair = StrStrPairVarULE; // element type of `Aliases::language`
166 type UnvalidatedLanguageVariantsPair = LanguageStrStrPairVarULE; // element type of `Aliases::language_variants`
167 
168 #[zerovec::make_varule(StrStrPairVarULE)]
169 #[zerovec::derive(Debug)]
170 #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
171 #[cfg_attr(
172     feature = "serde",
173     derive(serde::Deserialize),
174     zerovec::derive(Deserialize)
175 )]
176 #[cfg_attr(
177     feature = "datagen",
178     derive(serde::Serialize, databake::Bake),
179     zerovec::derive(Serialize),
180     databake(path = icu_locale::provider),
181 )]
182 /// A pair of strings with an `EncodeAsVarULE` implementation (its `VarULE` form is `StrStrPairVarULE`).
183 ///
184 /// <div class="stab unstable">
185 /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
186 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
187 /// to be stable, their Rust representation might not be. Use with caution.
188 /// </div>
189 pub struct StrStrPair<'a>(
190     #[cfg_attr(feature = "serde", serde(borrow))] pub Cow<'a, str>,
191     #[cfg_attr(feature = "serde", serde(borrow))] pub Cow<'a, str>,
192 );
193 
194 #[zerovec::make_varule(LanguageStrStrPairVarULE)]
195 #[zerovec::derive(Debug)]
196 #[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
197 #[cfg_attr(
198     feature = "serde",
199     derive(serde::Deserialize),
200     zerovec::derive(Deserialize)
201 )]
202 #[cfg_attr(
203     feature = "datagen",
204     derive(serde::Serialize, databake::Bake),
205     zerovec::derive(Serialize),
206     databake(path = icu_locale::provider),
207 )]
208 /// A [`Language`] followed by a pair of strings, with an `EncodeAsVarULE` implementation.
209 pub struct LanguageStrStrPair<'a>(
210     pub Language,
211     #[cfg_attr(feature = "serde", serde(borrow))] pub Cow<'a, str>,
212     #[cfg_attr(feature = "serde", serde(borrow))] pub Cow<'a, str>,
213 );
214 
215 #[derive(PartialEq, Clone, Default, yoke::Yokeable, zerofrom::ZeroFrom)]
216 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
217 #[cfg_attr(feature = "datagen", databake(path = icu_locale::provider))]
218 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
219 #[yoke(prove_covariance_manually)]
220 /// This alias data is used for locale canonicalization.
221 ///
222 /// Each field defines a
223 /// mapping from an old identifier to a new identifier, based upon the rules in
224 /// <http://unicode.org/reports/tr35/#LocaleId_Canonicalization>. The data
225 /// is stored in sorted order, allowing for binary search to identify rules to
226 /// apply. It is broken down into smaller vectors based upon some characteristic
227 /// of the data, to help avoid unnecessary searches. For example, the `sgn_region`
228 /// field contains aliases for sign language and region, so that it is not
229 /// necessary to search the data unless the input is a sign language.
230 ///
231 /// The algorithm in tr35 is not guaranteed to terminate on data other than what
232 /// is currently in CLDR. For this reason, it is not a good idea to attempt to add
233 /// or modify aliases for use in this structure.
234 ///
235 /// <div class="stab unstable">
236 /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
237 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
238 /// to be stable, their Rust representation might not be. Use with caution.
239 /// </div>
240 // TODO: Use validated types as value types
241 // Notice: the layout from the PR linked below improves the alignment of `language_variants`,
242 // speeding up canonicalization by up to 40%: https://github.com/unicode-org/icu4x/pull/2935
243 #[derive(Debug)]
244 pub struct Aliases<'data> {
245     /// `[language, variant(-variant)*] -> [langid]`
246     /// This is not a map as it's searched linearly according to the canonicalization rules.
247     #[cfg_attr(feature = "serde", serde(borrow))]
248     pub language_variants: VarZeroVec<'data, UnvalidatedLanguageVariantsPair>,
249     /// `sgn-[region] -> [language]`
250     #[cfg_attr(feature = "serde", serde(borrow))]
251     pub sgn_region: ZeroMap<'data, UnvalidatedRegion, Language>,
252     /// `[language{2}] -> [langid]`
253     #[cfg_attr(feature = "serde", serde(borrow))]
254     pub language_len2: ZeroMap<'data, UnvalidatedTinyAsciiStr<2>, UnvalidatedLanguageIdentifier>,
255     /// `[language{3}] -> [langid]`
256     #[cfg_attr(feature = "serde", serde(borrow))]
257     pub language_len3: ZeroMap<'data, UnvalidatedLanguage, UnvalidatedLanguageIdentifier>,
258     /// `[langid] -> [langid]`
259     /// This is not a map as it's searched linearly according to the canonicalization rules.
260     #[cfg_attr(feature = "serde", serde(borrow))]
261     pub language: VarZeroVec<'data, UnvalidatedLanguageIdentifierPair>,
262 
263     /// `[script] -> [script]`
264     #[cfg_attr(feature = "serde", serde(borrow))]
265     pub script: ZeroMap<'data, UnvalidatedScript, Script>,
266 
267     /// `[region{2}] -> [region]`
268     #[cfg_attr(feature = "serde", serde(borrow))]
269     pub region_alpha: ZeroMap<'data, UnvalidatedTinyAsciiStr<2>, Region>,
270     /// `[region{3}] -> [region]`
271     #[cfg_attr(feature = "serde", serde(borrow))]
272     pub region_num: ZeroMap<'data, UnvalidatedRegion, Region>,
273 
274     /// `[region] -> [region]+`
275     #[cfg_attr(feature = "serde", serde(borrow))]
276     pub complex_region: ZeroMap<'data, UnvalidatedRegion, ZeroSlice<Region>>,
277 
278     /// `[variant] -> [variant]`
279     #[cfg_attr(feature = "serde", serde(borrow))]
280     pub variant: ZeroMap<'data, UnvalidatedVariant, Variant>,
281 
282     /// `[value{7}] -> [value{7}]`
283     #[cfg_attr(feature = "serde", serde(borrow))]
284     pub subdivision: ZeroMap<'data, UnvalidatedSubdivision, SemivalidatedSubdivision>,
285 }
286 
287 icu_provider::data_struct!(
288     Aliases<'_>,
289     #[cfg(feature = "datagen")]
290 );
291 
292 #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
293 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
294 #[cfg_attr(feature = "datagen", databake(path = icu_locale::provider))]
295 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
296 /// This likely subtags data is used for the minimize and maximize operations.
297 ///
298 /// Each map field goes from the subtags present in an identifier to the likely values of the
299 /// missing subtags, based upon the rules in
300 /// <https://www.unicode.org/reports/tr35/#Likely_Subtags>.
301 ///
302 /// The data is broken down into smaller vectors based upon the rules
303 /// defined for the likely subtags maximize algorithm.
304 ///
305 /// For efficiency, only the relevant part of the LanguageIdentifier is stored
306 /// for searching and replacing. E.g., the `language_script` field is used to store
307 /// rules for `LanguageIdentifier`s that contain a language and a script, but not a
308 /// region.
309 ///
310 /// This struct contains mappings when the input contains a language subtag.
311 /// Also see [`LikelySubtagsForScriptRegion`].
312 ///
313 /// <div class="stab unstable">
314 /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
315 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
316 /// to be stable, their Rust representation might not be. Use with caution.
317 /// </div>
318 #[yoke(prove_covariance_manually)]
319 pub struct LikelySubtagsForLanguage<'data> {
320     /// Language and script -> region.
321     #[cfg_attr(feature = "serde", serde(borrow))]
322     pub language_script: ZeroMap<'data, (UnvalidatedLanguage, UnvalidatedScript), Region>,
323     /// Language and region -> script.
324     #[cfg_attr(feature = "serde", serde(borrow))]
325     pub language_region: ZeroMap<'data, (UnvalidatedLanguage, UnvalidatedRegion), Script>,
326     /// Just language -> script and region.
327     #[cfg_attr(feature = "serde", serde(borrow))]
328     pub language: ZeroMap<'data, UnvalidatedLanguage, (Script, Region)>,
329     /// Undefined (`und`): the language, script and region stored for it.
330     pub und: (Language, Script, Region),
331 }
332 
333 icu_provider::data_struct!(
334     LikelySubtagsForLanguage<'_>,
335     #[cfg(feature = "datagen")]
336 );
337 
338 #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
339 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
340 #[cfg_attr(feature = "datagen", databake(path = icu_locale::provider))]
341 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
342 /// This likely subtags data is used for the minimize and maximize operations.
343 ///
344 /// Each field maps the subtags present in an identifier to the likely values of the
345 /// missing subtags, based upon the rules in
346 /// <https://www.unicode.org/reports/tr35/#Likely_Subtags>.
347 ///
348 /// The data is broken down into smaller vectors based upon the rules
349 /// defined for the likely subtags maximize algorithm.
350 ///
351 /// For efficiency, only the relevant part of the LanguageIdentifier is stored
352 /// for searching and replacing. E.g., the `script_region` field is used to store
353 /// rules for `LanguageIdentifier`s that contain a script and a region, but not a
354 /// language.
355 ///
356 /// This struct contains mappings when the input does not contain a language subtag.
357 /// Also see [`LikelySubtagsForLanguage`].
358 ///
359 /// <div class="stab unstable">
360 /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
361 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
362 /// to be stable, their Rust representation might not be. Use with caution.
363 /// </div>
364 #[yoke(prove_covariance_manually)]
365 pub struct LikelySubtagsForScriptRegion<'data> {
366     /// Script and region -> language.
367     #[cfg_attr(feature = "serde", serde(borrow))]
368     pub script_region: ZeroMap<'data, (UnvalidatedScript, UnvalidatedRegion), Language>,
369     /// Just script -> language and region.
370     #[cfg_attr(feature = "serde", serde(borrow))]
371     pub script: ZeroMap<'data, UnvalidatedScript, (Language, Region)>,
372     /// Just region -> language and script.
373     #[cfg_attr(feature = "serde", serde(borrow))]
374     pub region: ZeroMap<'data, UnvalidatedRegion, (Language, Script)>,
375 }
376 
377 icu_provider::data_struct!(
378     LikelySubtagsForScriptRegion<'_>,
379     #[cfg(feature = "datagen")]
380 );
381 
382 #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
383 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
384 #[cfg_attr(feature = "datagen", databake(path = icu_locale::provider))]
385 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
386 /// This likely subtags data is used for full coverage of locales, including ones that
387 /// don't otherwise have data in the Common Locale Data Repository (CLDR).
388 ///
389 /// <div class="stab unstable">
390 /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
391 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
392 /// to be stable, their Rust representation might not be. Use with caution.
393 /// </div>
394 #[yoke(prove_covariance_manually)]
395 pub struct LikelySubtagsExtended<'data> {
396     /// Language and script -> region.
397     #[cfg_attr(feature = "serde", serde(borrow))]
398     pub language_script: ZeroMap<'data, (UnvalidatedLanguage, UnvalidatedScript), Region>,
399     /// Language and region -> script.
400     #[cfg_attr(feature = "serde", serde(borrow))]
401     pub language_region: ZeroMap<'data, (UnvalidatedLanguage, UnvalidatedRegion), Script>,
402     /// Just language -> script and region.
403     #[cfg_attr(feature = "serde", serde(borrow))]
404     pub language: ZeroMap<'data, UnvalidatedLanguage, (Script, Region)>,
405     /// Script and region -> language.
406     #[cfg_attr(feature = "serde", serde(borrow))]
407     pub script_region: ZeroMap<'data, (UnvalidatedScript, UnvalidatedRegion), Language>,
408     /// Just script -> language and region.
409     #[cfg_attr(feature = "serde", serde(borrow))]
410     pub script: ZeroMap<'data, UnvalidatedScript, (Language, Region)>,
411     /// Just region -> language and script.
412     #[cfg_attr(feature = "serde", serde(borrow))]
413     pub region: ZeroMap<'data, UnvalidatedRegion, (Language, Script)>,
414 }
415 
416 icu_provider::data_struct!(
417     LikelySubtagsExtended<'_>,
418     #[cfg(feature = "datagen")]
419 );
420 
421 /// Locale fallback rules derived from CLDR parent locales data.
422 #[derive(Default, Clone, PartialEq, Debug, yoke::Yokeable, zerofrom::ZeroFrom)]
423 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
424 #[cfg_attr(feature = "datagen", databake(path = icu_locale::provider))]
425 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
426 #[yoke(prove_covariance_manually)]
427 pub struct Parents<'data> {
428     /// Map from a language identifier (the key, stored as [`PotentialUtf8`]) to its parent language identifier
429     /// (the value, a `(Language, Option<Script>, Option<Region>)` tuple): the key inherits from the value.
430     #[cfg_attr(feature = "serde", serde(borrow))]
431     pub parents: ZeroMap<'data, PotentialUtf8, (Language, Option<Script>, Option<Region>)>,
432 }
433 
434 icu_provider::data_struct!(
435     Parents<'_>,
436     #[cfg(feature = "datagen")]
437 );
438 
439 #[derive(Debug, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
440 #[cfg_attr(feature = "datagen", derive(serde::Serialize, databake::Bake))]
441 #[cfg_attr(feature = "datagen", databake(path = icu_locale::provider))]
442 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
443 /// This directionality data is used to determine the script directionality of a locale.
444 ///
445 /// <div class="stab unstable">
446 /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
447 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
448 /// to be stable, their Rust representation might not be. Use with caution.
449 /// </div>
450 #[yoke(prove_covariance_manually)]
451 pub struct ScriptDirection<'data> {
452     /// Scripts written in the right-to-left direction.
453     #[cfg_attr(feature = "serde", serde(borrow))]
454     pub rtl: ZeroVec<'data, UnvalidatedScript>,
455     /// Scripts written in the left-to-right direction.
456     #[cfg_attr(feature = "serde", serde(borrow))]
457     pub ltr: ZeroVec<'data, UnvalidatedScript>,
458 }
459 
460 icu_provider::data_struct!(
461     ScriptDirection<'_>,
462     #[cfg(feature = "datagen")]
463 );
464 
465 /// A set of characters and strings (a [`CodePointInversionListAndStringList`]) which share a particular property value.
466 ///
467 /// <div class="stab unstable">
468 /// 🚧 This code is considered unstable; it may change at any time, in breaking or non-breaking ways,
469 /// including in SemVer minor releases. While the serde representation of data structs is guaranteed
470 /// to be stable, their Rust representation might not be. Use with caution.
471 /// </div>
472 #[derive(Debug, Eq, PartialEq, Clone, yoke::Yokeable, zerofrom::ZeroFrom)]
473 #[cfg_attr(
474     feature = "datagen",
475     derive(serde::Serialize, databake::Bake),
476     databake(path = icu_locale::provider),
477 )]
478 #[cfg_attr(feature = "serde", derive(serde::Deserialize))]
479 pub struct ExemplarCharactersData<'data>(
480     #[cfg_attr(feature = "serde", serde(borrow))] pub CodePointInversionListAndStringList<'data>,
481 );
482 
483 icu_provider::data_struct!(
484     ExemplarCharactersData<'_>,
485     #[cfg(feature = "datagen")]
486 );
487