• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4 
5 //! This module contains internal data handling tools for the special-cased Greek uppercasing
6 //! code. The Greek uppercasing algorithm is code-driven, not user-data-driven, however the code
7 //! relies on a CodePointTrie generated based on some Unicode rules.
8 //!
9 //! We try to keep most of the Greek-specific logic in here, though the actual logic to remove
10 //! accents is in full_helper() as it must integrate with the control flow.
11 //!
12 //! This is public and doc(hidden) so that it can be accessed from the gen_greek_to_me test file,
13 //! and should not be used otherwise.
14 
15 #[rustfmt::skip]
16 mod data;
17 
get_data(ch: char) -> Option<GreekPrecomposedLetterData>18 pub(crate) fn get_data(ch: char) -> Option<GreekPrecomposedLetterData> {
19     let ch_i = ch as usize;
20     let packed = if (0x370..=0x3FF).contains(&ch_i) {
21         *data::DATA_370.get(ch_i - 0x370)?
22     } else if (0x1f00..0x1fff).contains(&ch_i) {
23         *data::DATA_1F00.get(ch_i - 0x1f00)?
24     } else {
25         data::match_extras(ch)?
26     };
27 
28     let packed = PackedGreekPrecomposedLetterData(packed);
29 
30     GreekPrecomposedLetterData::try_from(packed).ok()
31 }
32 
/// A packed representation of [`GreekPrecomposedLetterData`]
///
/// Bit layout:
///
/// ```text
///   7       6   5   4     3   2   1       0
/// discr=0 | [diacritics]  | [vowel            ]
/// discr=1 | [  unused = 0     ]      | [is_rho]
/// ```
///
/// Bit 7 is the discriminant. If 0, it is a vowel, else, it is a consonant.
/// If the whole thing is a zero then it is assumed to be an empty entry.
///
/// In the vowel case, the next three bits are the three fields of [`GreekDiacritics`],
/// in order (accented, dialytika, ypogegrammeni), and the four bits after that identify
/// a [`GreekVowel`] value.
///
/// In the consonant case, bit 0 records whether the consonant is a rho and the remaining
/// bits are unused (zero), so the only consonant encodings are `0x80` and `0x81`.
#[derive(Debug, Clone, Copy)]
pub struct PackedGreekPrecomposedLetterData(pub u8);
53 
54 impl TryFrom<PackedGreekPrecomposedLetterData> for GreekPrecomposedLetterData {
55     type Error = ();
try_from(other: PackedGreekPrecomposedLetterData) -> Result<GreekPrecomposedLetterData, ()>56     fn try_from(other: PackedGreekPrecomposedLetterData) -> Result<GreekPrecomposedLetterData, ()> {
57         if other.0 == 0 {
58             return Err(());
59         }
60         if other.0 & 0x80 == 0 {
61             // vowel
62             let diacritics = GreekDiacritics {
63                 accented: other.0 & 0x40 != 0,
64                 dialytika: other.0 & 0x20 != 0,
65                 ypogegrammeni: other.0 & 0x10 != 0,
66             };
67             let vowel = GreekVowel::try_from(other.0 & 0b1111);
68             debug_assert!(vowel.is_ok());
69             let vowel = vowel.unwrap_or(GreekVowel::Α);
70             Ok(GreekPrecomposedLetterData::Vowel(vowel, diacritics))
71         } else {
72             // consonant
73             // 0x80 is is_rho = false, 0x81 is is_rho = true
74             Ok(GreekPrecomposedLetterData::Consonant(other.0 == 0x81))
75         }
76     }
77 }
78 
79 impl From<GreekPrecomposedLetterData> for PackedGreekPrecomposedLetterData {
from(other: GreekPrecomposedLetterData) -> Self80     fn from(other: GreekPrecomposedLetterData) -> Self {
81         match other {
82             GreekPrecomposedLetterData::Vowel(vowel, diacritics) => {
83                 let mut bits = 0;
84                 if diacritics.accented {
85                     bits |= 0x40;
86                 }
87                 if diacritics.dialytika {
88                     bits |= 0x20;
89                 }
90                 if diacritics.ypogegrammeni {
91                     bits |= 0x10;
92                 }
93                 bits |= vowel as u8;
94                 PackedGreekPrecomposedLetterData(bits)
95             }
96             GreekPrecomposedLetterData::Consonant(is_rho) => {
97                 PackedGreekPrecomposedLetterData(0x80 + is_rho as u8)
98             }
99         }
100     }
101 }
102 
/// The precomposed letter data stored in the hardcoded data in `mod data`
///
/// Converts to and from the single-byte [`PackedGreekPrecomposedLetterData`] form.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum GreekPrecomposedLetterData {
    /// A vowel, with a capitalized base letter, and the diacritics found
    Vowel(GreekVowel, GreekDiacritics),
    /// A consonant or vowel that does not take diacritics
    ///
    /// The boolean is true when the consonant is a rho, which is handled specially since
    /// it can take breathing marks (but is *not* a vowel)
    Consonant(bool),
}
114 
/// The base vowels tracked by the Greek uppercasing data.
///
/// n.b. these are Greek capital letters, not Latin
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq)]
pub enum GreekVowel {
    // 0 is purposely left out so that the all-zero case is unambiguous
    Α = 1,
    Ε = 2,
    Η = 3,
    Ι = 4,
    Ο = 5,
    Υ = 6,
    Ω = 7,
    ϒ = 8,
}

/// Greek capital letter rho.
pub const CAPITAL_RHO: char = 'Ρ';

impl From<GreekVowel> for char {
    /// Returns the Greek capital letter that the variant is named after.
    fn from(vowel: GreekVowel) -> Self {
        match vowel {
            GreekVowel::Α => 'Α',
            GreekVowel::Ε => 'Ε',
            GreekVowel::Η => 'Η',
            GreekVowel::Ι => 'Ι',
            GreekVowel::Ο => 'Ο',
            GreekVowel::Υ => 'Υ',
            GreekVowel::Ω => 'Ω',
            GreekVowel::ϒ => 'ϒ',
        }
    }
}

impl TryFrom<char> for GreekVowel {
    type Error = ();

    /// Maps one of the eight capital vowel characters to its variant; any other
    /// character is rejected with `Err(())`.
    fn try_from(letter: char) -> Result<Self, ()> {
        let vowel = match letter {
            'Α' => Self::Α,
            'Ε' => Self::Ε,
            'Η' => Self::Η,
            'Ι' => Self::Ι,
            'Ο' => Self::Ο,
            'Υ' => Self::Υ,
            'Ω' => Self::Ω,
            'ϒ' => Self::ϒ,
            _ => return Err(()),
        };
        Ok(vowel)
    }
}

impl TryFrom<u8> for GreekVowel {
    type Error = ();

    /// Maps a discriminant in `1..=8` back to its variant; any other value
    /// (including 0) is rejected with `Err(())`.
    fn try_from(discriminant: u8) -> Result<Self, ()> {
        match discriminant {
            1 => Ok(GreekVowel::Α),
            2 => Ok(GreekVowel::Ε),
            3 => Ok(GreekVowel::Η),
            4 => Ok(GreekVowel::Ι),
            5 => Ok(GreekVowel::Ο),
            6 => Ok(GreekVowel::Υ),
            7 => Ok(GreekVowel::Ω),
            8 => Ok(GreekVowel::ϒ),
            _ => Err(()),
        }
    }
}
178 
/// General diacritic information about a character or combining character sequence.
///
/// The derived default has every flag cleared.
#[derive(Copy, Clone, Default, PartialEq, Eq, Debug)]
pub struct GreekDiacritics {
    /// Whether it has an accent.
    ///
    /// This is also set for the combined dialytika-tonos mark.
    pub accented: bool,
    /// Whether it has a dialytika.
    pub dialytika: bool,
    /// Whether it has a ypogegrammeni.
    pub ypogegrammeni: bool,
}
189 
/// General diacritic information about a combining character sequence,
/// identifying the source of the diacritics.
#[derive(Copy, Clone, Default, PartialEq, Eq, Debug)]
pub struct GreekCombiningCharacterSequenceDiacritics {
    /// Diacritics precomposed on the base.
    pub precomposed: GreekDiacritics,
    /// Combining diacritics.
    pub combining: GreekDiacritics,
}
199 
/// U+0301 COMBINING ACUTE ACCENT: the monotonic tonos, unified with the polytonic oxia.
pub const TONOS: char = '\u{0301}';
/// U+0308 COMBINING DIAERESIS: the dialytika.
pub const DIALYTIKA: char = '\u{0308}';
/// U+0344 COMBINING GREEK DIALYTIKA TONOS: dialytika and tonos as a single mark.
pub const DIALYTIKA_TONOS: char = '\u{0344}';
/// U+0345 COMBINING GREEK YPOGEGRAMMENI.
pub const YPOGEGRAMMENI: char = '\u{0345}';
204 
// Expands to a pattern matching one named group of combining diacritics, or several
// groups joined with `|` (e.g. `diacritics!(ACCENTS | DIALYTIKA_ALL)`).
//
// Every path in the expansion, including the recursive invocation, goes through `$crate`
// so that the macro expands the same way no matter what is in scope at the call site.
#[macro_export]
#[doc(hidden)] // macro
macro_rules! diacritics {
    // Accents.
    // These are mostly removed when uppercasing, but their presence may require
    // adding a διαλυτικά to a following vowel.
    (ACCENTS) => {
        // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cu0300+%5Cu0301+%5Cu0342+%5Cu0302+%5Cu0303+%5Cu0311%5D&g=&i=
        '\u{0300}' // Polytonic βαρεία (varia), grave accent.
        | $crate::greek_to_me::TONOS // Polytonic οξεία (oxia) unified with monotonic τόνος (tonos), acute accent.
        | '\u{0342}' // Polytonic περισπωμένη (perispomeni), often translated to circumflex.
        | '\u{0302}' // Circumflex accent, sometimes a lookalike of the περισπωμένη.
        | '\u{0303}' // Tilde, sometimes a lookalike of the περισπωμένη.
        | '\u{0311}' // Inverted breve, sometimes a lookalike of the περισπωμένη.
    };
    // Breathings and length marks.
    // These are expected to occur in Greek combining sequences, and are removed when uppercasing.
    // This removal has no other effect.
    (BREATHING_AND_LENGTH) => {
        // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cu0304+%5Cu0306+%5Cu0313+%5Cu0314+%5Cu0343%5D&g=&i=
        '\u{0304}'  // Macron, marking long vowels.
        | '\u{0306}'  // Breve, marking short vowels.
        | '\u{0313}'  // Comma above, smooth breathing or κορωνίς marking crasis.
        | '\u{0314}'  // Reversed comma above, rough breathing.
        | '\u{0343}'  // κορωνίς (koronis), canonically decomposes to comma above.
    };
    // All diacritics containing a dialytika
    (DIALYTIKA_ALL) => { $crate::greek_to_me::DIALYTIKA | $crate::greek_to_me::DIALYTIKA_TONOS };
    (DIALYTIKA) => { $crate::greek_to_me::DIALYTIKA };
    (DIALYTIKA_TONOS) => { $crate::greek_to_me::DIALYTIKA_TONOS };
    (YPOGEGRAMMENI) => { $crate::greek_to_me::YPOGEGRAMMENI };
    // Several groups at once: expand each group and or the resulting patterns together.
    // The recursive call is `$crate`-qualified because an unqualified `diacritics!` would
    // be resolved at the call site, where the name may be missing or shadowed.
    ($($i:ident)|+) => { $($crate::diacritics!($i))|+ };
}
238 
239 /// Macro that generates match arms for various diacritic groupings.
240 ///
241 /// Groupings supported:
242 ///
243 /// - ACCENTS
244 /// - BREATHING_AND_LENGTH
/// - DIALYTIKA, DIALYTIKA_TONOS, and DIALYTIKA_ALL
246 /// - YPOGEGRAMMENI
247 ///
248 /// This is a macro to make it easy to keep the lists of accents in sync.
249 pub use crate::diacritics;
250 
251 impl GreekDiacritics {
252     /// Whilst forwards-iterating from an existing character,
253     /// consume all further greek diacritics and store their existence into this struct.
consume_greek_diacritics(&mut self, context_after: &str)254     pub(crate) fn consume_greek_diacritics(&mut self, context_after: &str) {
255         for c in context_after.chars() {
256             match c {
257                 diacritics!(ACCENTS) => self.accented = true,
258                 DIALYTIKA_TONOS => {
259                     self.dialytika = true;
260                     self.accented = true;
261                 }
262                 DIALYTIKA => self.dialytika = true,
263                 YPOGEGRAMMENI => self.ypogegrammeni = true,
264                 // Ignore other accent marks that are expected to co-occur with Greek.
265                 diacritics!(BREATHING_AND_LENGTH) => (),
266                 _ => break,
267             }
268         }
269     }
270 }
271 
272 /// Given the context before a character, check if it is preceded by a Greek letter.
preceded_by_greek_letter(context_before: &str) -> bool273 pub(crate) fn preceded_by_greek_letter(context_before: &str) -> bool {
274     for c in context_before.chars().rev() {
275         match c {
276             diacritics!(ACCENTS | BREATHING_AND_LENGTH | DIALYTIKA_ALL | YPOGEGRAMMENI) => continue,
277             _ => return get_data(c).is_some(),
278         }
279     }
280     false
281 }
282 
283 /// Returns diacritic information for the combining character sequence preceding the current character
284 /// if it that preceding combining character sequence is a greek vowel.
preceding_greek_vowel_diacritics( context_before: &str, ) -> Option<GreekCombiningCharacterSequenceDiacritics>285 pub(crate) fn preceding_greek_vowel_diacritics(
286     context_before: &str,
287 ) -> Option<GreekCombiningCharacterSequenceDiacritics> {
288     let mut combining: GreekDiacritics = Default::default();
289     for c in context_before.chars().rev() {
290         match c {
291             diacritics!(ACCENTS) => combining.accented = true,
292             diacritics!(DIALYTIKA_TONOS) => {
293                 combining.dialytika = true;
294                 combining.accented = true;
295             }
296             diacritics!(DIALYTIKA) => combining.dialytika = true,
297             diacritics!(BREATHING_AND_LENGTH) => continue,
298             _ => {
299                 let data = get_data(c);
300                 if let Some(GreekPrecomposedLetterData::Vowel(_vowel, diacritics)) = data {
301                     return Some(GreekCombiningCharacterSequenceDiacritics {
302                         precomposed: diacritics,
303                         combining,
304                     });
305                 } else {
306                     // Not a greek vowel.
307                     return None;
308                 }
309             }
310         }
311     }
312     None
313 }
314 
315 /// Is the character a diacritic expected to be used with greek (except ypogegrammeni).
is_greek_diacritic_except_ypogegrammeni(c: char) -> bool316 pub(crate) fn is_greek_diacritic_except_ypogegrammeni(c: char) -> bool {
317     matches!(
318         c,
319         diacritics!(ACCENTS | BREATHING_AND_LENGTH | DIALYTIKA_ALL)
320     )
321 }
322