1 // This file is part of ICU4X. For terms of use, please see the file
2 // called LICENSE at the top level of the ICU4X source tree
3 // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).
4
5 //! This module contains internal data handling tools for the special-cased Greek uppercasing
6 //! code. The Greek uppercasing algorithm is code-driven, not user-data-driven, however the code
7 //! relies on a CodePointTrie generated based on some Unicode rules.
8 //!
9 //! We try to keep most of the Greek-specific logic in here, though the actual logic to remove
10 //! accents is in full_helper() as it must integrate with the control flow.
11 //!
12 //! This is public and doc(hidden) so that it can be accessed from the gen_greek_to_me test file,
13 //! and should not be used otherwise.
14
15 #[rustfmt::skip]
16 mod data;
17
get_data(ch: char) -> Option<GreekPrecomposedLetterData>18 pub(crate) fn get_data(ch: char) -> Option<GreekPrecomposedLetterData> {
19 let ch_i = ch as usize;
20 let packed = if (0x370..=0x3FF).contains(&ch_i) {
21 *data::DATA_370.get(ch_i - 0x370)?
22 } else if (0x1f00..0x1fff).contains(&ch_i) {
23 *data::DATA_1F00.get(ch_i - 0x1f00)?
24 } else {
25 data::match_extras(ch)?
26 };
27
28 let packed = PackedGreekPrecomposedLetterData(packed);
29
30 GreekPrecomposedLetterData::try_from(packed).ok()
31 }
32
/// A packed, single-byte representation of [`GreekPrecomposedLetterData`].
///
/// Bit layout:
///
/// ```text
///   7       6   5   4     3   2   1   0
/// discr=0 | [diacritics] | [   vowel   ]
/// discr=1 | [      unused = 0      ] | [is_rho]
/// ```
///
/// Bit 7 is the discriminant: if 0, the entry is a vowel; if 1, it is a consonant.
/// If the whole byte is zero, it is treated as an empty entry.
///
/// In the vowel case, bits 6, 5 and 4 are the fields of [`GreekDiacritics`], in order
/// (accented, dialytika, ypogegrammeni), and the low four bits identify a [`GreekVowel`] value.
///
/// In the consonant case, bit 0 is the `is_rho` flag and the bits in between are unused
/// (zero): `0x80` is a non-rho consonant and `0x81` is rho.
#[derive(Debug, Clone, Copy)]
pub struct PackedGreekPrecomposedLetterData(pub u8);
53
54 impl TryFrom<PackedGreekPrecomposedLetterData> for GreekPrecomposedLetterData {
55 type Error = ();
try_from(other: PackedGreekPrecomposedLetterData) -> Result<GreekPrecomposedLetterData, ()>56 fn try_from(other: PackedGreekPrecomposedLetterData) -> Result<GreekPrecomposedLetterData, ()> {
57 if other.0 == 0 {
58 return Err(());
59 }
60 if other.0 & 0x80 == 0 {
61 // vowel
62 let diacritics = GreekDiacritics {
63 accented: other.0 & 0x40 != 0,
64 dialytika: other.0 & 0x20 != 0,
65 ypogegrammeni: other.0 & 0x10 != 0,
66 };
67 let vowel = GreekVowel::try_from(other.0 & 0b1111);
68 debug_assert!(vowel.is_ok());
69 let vowel = vowel.unwrap_or(GreekVowel::Α);
70 Ok(GreekPrecomposedLetterData::Vowel(vowel, diacritics))
71 } else {
72 // consonant
73 // 0x80 is is_rho = false, 0x81 is is_rho = true
74 Ok(GreekPrecomposedLetterData::Consonant(other.0 == 0x81))
75 }
76 }
77 }
78
79 impl From<GreekPrecomposedLetterData> for PackedGreekPrecomposedLetterData {
from(other: GreekPrecomposedLetterData) -> Self80 fn from(other: GreekPrecomposedLetterData) -> Self {
81 match other {
82 GreekPrecomposedLetterData::Vowel(vowel, diacritics) => {
83 let mut bits = 0;
84 if diacritics.accented {
85 bits |= 0x40;
86 }
87 if diacritics.dialytika {
88 bits |= 0x20;
89 }
90 if diacritics.ypogegrammeni {
91 bits |= 0x10;
92 }
93 bits |= vowel as u8;
94 PackedGreekPrecomposedLetterData(bits)
95 }
96 GreekPrecomposedLetterData::Consonant(is_rho) => {
97 PackedGreekPrecomposedLetterData(0x80 + is_rho as u8)
98 }
99 }
100 }
101 }
102
/// The precomposed letter data stored in the hardcoded data in `mod data`.
///
/// This is the decoded form of a [`PackedGreekPrecomposedLetterData`] byte.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum GreekPrecomposedLetterData {
    /// A vowel, with a capitalized base letter, and the diacritics found
    /// precomposed on it.
    Vowel(GreekVowel, GreekDiacritics),
    /// A consonant or vowel that does not take diacritics
    ///
    /// The boolean is true when the consonant is a rho, which is handled specially since
    /// it can take breathing marks (but is *not* a vowel)
    Consonant(bool),
}
114
/// The base vowels distinguished by the Greek uppercasing data.
///
/// n.b. the variant names are Greek capital letters, not their Latin lookalikes.
#[derive(Debug, Clone, Copy, PartialOrd, Ord, PartialEq, Eq)]
pub enum GreekVowel {
    // Discriminants start at 1: 0 is purposely left out so that the all-zero
    // packed byte is unambiguous.
    Α = 1,
    Ε = 2,
    Η = 3,
    Ι = 4,
    Ο = 5,
    Υ = 6,
    Ω = 7,
    ϒ = 8,
}

/// The Greek capital letter rho (U+03A1).
pub const CAPITAL_RHO: char = 'Ρ';

impl From<GreekVowel> for char {
    /// Returns the capital letter that the vowel variant is named after.
    fn from(other: GreekVowel) -> Self {
        let letter = match other {
            GreekVowel::Α => 'Α',
            GreekVowel::Ε => 'Ε',
            GreekVowel::Η => 'Η',
            GreekVowel::Ι => 'Ι',
            GreekVowel::Ο => 'Ο',
            GreekVowel::Υ => 'Υ',
            GreekVowel::Ω => 'Ω',
            GreekVowel::ϒ => 'ϒ',
        };
        letter
    }
}

impl TryFrom<char> for GreekVowel {
    type Error = ();

    /// Maps one of the eight capital base letters back to its variant.
    ///
    /// Any other character yields `Err(())`.
    fn try_from(other: char) -> Result<Self, ()> {
        match other {
            'Α' => Ok(Self::Α),
            'Ε' => Ok(Self::Ε),
            'Η' => Ok(Self::Η),
            'Ι' => Ok(Self::Ι),
            'Ο' => Ok(Self::Ο),
            'Υ' => Ok(Self::Υ),
            'Ω' => Ok(Self::Ω),
            'ϒ' => Ok(Self::ϒ),
            _ => Err(()),
        }
    }
}

impl TryFrom<u8> for GreekVowel {
    type Error = ();

    /// Maps a discriminant (1 through 8) back to its variant.
    ///
    /// Any other value, including 0, yields `Err(())`.
    fn try_from(other: u8) -> Result<Self, ()> {
        const ALL_VOWELS: [GreekVowel; 8] = [
            GreekVowel::Α,
            GreekVowel::Ε,
            GreekVowel::Η,
            GreekVowel::Ι,
            GreekVowel::Ο,
            GreekVowel::Υ,
            GreekVowel::Ω,
            GreekVowel::ϒ,
        ];
        // Compare against each variant's own discriminant so the lookup does not
        // depend on the order of the table.
        ALL_VOWELS
            .iter()
            .copied()
            .find(|&vowel| vowel as u8 == other)
            .ok_or(())
    }
}
178
/// General diacritic information about a character or combining character sequence.
///
/// All flags default to `false`.
#[derive(Copy, Clone, Default, PartialEq, Eq, Debug)]
pub struct GreekDiacritics {
    /// Whether it has an accent.
    pub accented: bool,
    /// Whether it has a dialytika.
    pub dialytika: bool,
    /// Whether it has a ypogegrammeni.
    pub ypogegrammeni: bool,
}
189
/// General diacritic information about a combining character sequence,
/// identifying the source of the diacritics.
#[derive(Copy, Clone, Default, PartialEq, Eq, Debug)]
pub struct GreekCombiningCharacterSequenceDiacritics {
    /// Diacritics precomposed on the base.
    pub precomposed: GreekDiacritics,
    /// Combining diacritics.
    pub combining: GreekDiacritics,
}
199
/// U+0301 COMBINING ACUTE ACCENT, used for the Greek tonos / oxia.
pub const TONOS: char = '\u{0301}';
/// U+0308 COMBINING DIAERESIS, used for the Greek dialytika.
pub const DIALYTIKA: char = '\u{0308}';
/// U+0344 COMBINING GREEK DIALYTIKA TONOS (dialytika and tonos in a single code point).
pub const DIALYTIKA_TONOS: char = '\u{0344}';
/// U+0345 COMBINING GREEK YPOGEGRAMMENI (iota subscript).
pub const YPOGEGRAMMENI: char = '\u{0345}';
204
// Expands a named diacritic grouping (or several groupings joined with `|`) into an
// or-pattern of characters, for use in `match` arms and `matches!`.
#[macro_export]
#[doc(hidden)] // macro
macro_rules! diacritics {
    // Accents.
    // These are mostly removed when uppercasing, but their presence may require
    // adding a διαλυτικά to a following vowel.
    (ACCENTS) => {
        // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cu0300+%5Cu0301+%5Cu0342+%5Cu0302+%5Cu0303+%5Cu0311%5D&g=&i=
        '\u{0300}'   // Polytonic βαρεία (varia), grave accent.
        | $crate::greek_to_me::TONOS // Polytonic οξεία (oxia) unified with monotonic τόνος (tonos), acute accent.
        | '\u{0342}' // Polytonic περισπωμένη (perispomeni), often translated to circumflex.
        | '\u{0302}' // Circumflex accent, sometimes a lookalike of the περισπωμένη.
        | '\u{0303}' // Tilde, sometimes a lookalike of the περισπωμένη.
        | '\u{0311}' // Inverted breve, sometimes a lookalike of the περισπωμένη.
    };
    // Breathings and length marks.
    // These are expected to occur in Greek combining sequences, and are removed when uppercasing.
    // This removal has no other effect.
    (BREATHING_AND_LENGTH) => {
        // https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5B%5Cu0304+%5Cu0306+%5Cu0313+%5Cu0314+%5Cu0343%5D&g=&i=
        '\u{0304}'   // Macron, marking long vowels.
        | '\u{0306}' // Breve, marking short vowels.
        | '\u{0313}' // Comma above, smooth breathing or κορωνίς marking crasis.
        | '\u{0314}' // Reversed comma above, rough breathing.
        | '\u{0343}' // κορωνίς (koronis), canonically decomposes to comma above.
    };
    // All diacritics containing a dialytika
    (DIALYTIKA_ALL) => { $crate::greek_to_me::DIALYTIKA | $crate::greek_to_me::DIALYTIKA_TONOS };
    // The individual marks, exposed so that callers can name them through the same macro.
    (DIALYTIKA) => { $crate::greek_to_me::DIALYTIKA };
    (DIALYTIKA_TONOS) => { $crate::greek_to_me::DIALYTIKA_TONOS };
    (YPOGEGRAMMENI) => { $crate::greek_to_me::YPOGEGRAMMENI };
    // Several groupings separated by `|`: expand each one and join the results into a
    // single or-pattern.
    ($($i:ident)|+) => { $(diacritics!($i))|+};
}
238
239 /// Macro that generates match arms for various diacritic groupings.
240 ///
241 /// Groupings supported:
242 ///
243 /// - ACCENTS
244 /// - BREATHING_AND_LENGTH
/// - DIALYTIKA, DIALYTIKA_TONOS, and DIALYTIKA_ALL
246 /// - YPOGEGRAMMENI
247 ///
248 /// This is a macro to make it easy to keep the lists of accents in sync.
249 pub use crate::diacritics;
250
251 impl GreekDiacritics {
252 /// Whilst forwards-iterating from an existing character,
253 /// consume all further greek diacritics and store their existence into this struct.
consume_greek_diacritics(&mut self, context_after: &str)254 pub(crate) fn consume_greek_diacritics(&mut self, context_after: &str) {
255 for c in context_after.chars() {
256 match c {
257 diacritics!(ACCENTS) => self.accented = true,
258 DIALYTIKA_TONOS => {
259 self.dialytika = true;
260 self.accented = true;
261 }
262 DIALYTIKA => self.dialytika = true,
263 YPOGEGRAMMENI => self.ypogegrammeni = true,
264 // Ignore other accent marks that are expected to co-occur with Greek.
265 diacritics!(BREATHING_AND_LENGTH) => (),
266 _ => break,
267 }
268 }
269 }
270 }
271
272 /// Given the context before a character, check if it is preceded by a Greek letter.
preceded_by_greek_letter(context_before: &str) -> bool273 pub(crate) fn preceded_by_greek_letter(context_before: &str) -> bool {
274 for c in context_before.chars().rev() {
275 match c {
276 diacritics!(ACCENTS | BREATHING_AND_LENGTH | DIALYTIKA_ALL | YPOGEGRAMMENI) => continue,
277 _ => return get_data(c).is_some(),
278 }
279 }
280 false
281 }
282
283 /// Returns diacritic information for the combining character sequence preceding the current character
284 /// if it that preceding combining character sequence is a greek vowel.
preceding_greek_vowel_diacritics( context_before: &str, ) -> Option<GreekCombiningCharacterSequenceDiacritics>285 pub(crate) fn preceding_greek_vowel_diacritics(
286 context_before: &str,
287 ) -> Option<GreekCombiningCharacterSequenceDiacritics> {
288 let mut combining: GreekDiacritics = Default::default();
289 for c in context_before.chars().rev() {
290 match c {
291 diacritics!(ACCENTS) => combining.accented = true,
292 diacritics!(DIALYTIKA_TONOS) => {
293 combining.dialytika = true;
294 combining.accented = true;
295 }
296 diacritics!(DIALYTIKA) => combining.dialytika = true,
297 diacritics!(BREATHING_AND_LENGTH) => continue,
298 _ => {
299 let data = get_data(c);
300 if let Some(GreekPrecomposedLetterData::Vowel(_vowel, diacritics)) = data {
301 return Some(GreekCombiningCharacterSequenceDiacritics {
302 precomposed: diacritics,
303 combining,
304 });
305 } else {
306 // Not a greek vowel.
307 return None;
308 }
309 }
310 }
311 }
312 None
313 }
314
315 /// Is the character a diacritic expected to be used with greek (except ypogegrammeni).
is_greek_diacritic_except_ypogegrammeni(c: char) -> bool316 pub(crate) fn is_greek_diacritic_except_ypogegrammeni(c: char) -> bool {
317 matches!(
318 c,
319 diacritics!(ACCENTS | BREATHING_AND_LENGTH | DIALYTIKA_ALL)
320 )
321 }
322