1 /* 2 * Copyright 2024 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 use std::cmp; 18 19 /// An implementation of hyphenation for Android. 20 /// 21 /// The hyphenation in Android is done with two steps: first performs the Knuth-Liang hyphenation 22 /// algorithm for populating all possible hyphenation points. Then, resolve hyphenation type from 23 /// the scripts and locales. 24 /// 25 /// The Knuth-Liang hyphenation works like as follows: 26 /// The Knuth-Liang hyphenation uses two dictionary: pattern dictionary and exception files. The 27 /// files end with ".hyp.txt" are exception files and the files end with ".pat.txt" are pattern 28 /// files. If the word is in exception file, the hyphenation is performed as it is specified in the 29 /// exception file. 30 /// 31 /// Then, if the word is not in the exception file, the Knuth-Liang hyphenation is performed with 32 /// hyphenation pattern dictionary. The hyphenation pattern dictionary is a list of sub-word with 33 /// hyphenation level as number. The level values are assigned between each letters including before 34 /// the first letter and last letter. If the value is odd number, then the position is a hyphenation 35 /// point. If the value is even number, then the position is not a hyphenation point. In the pattern 36 /// file, the 0 is dropped, so the meaning of "re4ti4z" is level values "0040040" for sub-word 37 /// "retiz". The hyphenation is performed by iterating all patterns and assigning level values to 38 /// the possible break points. If the break point can be assigned from multiple patterns, the 39 /// maximum value is used. If none of the pattern matches the break point, the level is zero, 40 /// therefore do not break. And finally the odd numbered positions are the break points. 41 /// 42 /// Here is an example how the "hyphenation" is broken into "hy-phen-ation". 43 /// The relevant patterns in the pattern dictionary are 44 /// - hy3ph 45 /// - he2n 46 /// - hena4 47 /// - hen5at 48 /// - 1na 49 /// - n2at 50 /// - 1tio 51 /// - 2io 52 /// - o2n 53 /// 54 /// Then when these patterns are applied to the word "hyphenation", it becomes like 55 /// 56 /// h y p h e n a t i o n 57 /// 0 0 3 0 0 : hy3ph 58 /// 0 0 2 0 : he2n 59 /// 0 0 0 0 4 : hena4 60 /// 0 0 0 5 0 0 : hen5at 61 /// 1 0 0 : 1na 62 /// 0 2 0 0 : n2at 63 /// 1 0 0 0 : 1tio 64 /// 2 0 0 : 2io 65 /// 0 2 0: o2n 66 /// --------------------------------- 67 /// 0 0 3 0 0 2 5 4 2 0 2 0: max 68 /// 69 /// Then, the odd-numbered break points are hyphenation allowed break points, so the result is 70 /// "hy-phen-ation". 71 /// 72 /// In the Android implementation, the hyphenation pattern file is preprocessed to Trie in build 73 /// time. For the detailed file format, see comments of HyphenationData struct. 74 /// 75 /// Once the all possible hyphenation break points are collected, the decide the hyphenation break 76 /// type is determined based on the script and locale. For example, in case of Arabic, the letter 77 /// form should not be changed by hyphenation, so ZWJ can be inserted before and after hyphen 78 /// letter. 79 80 const CHAR_SOFT_HYPHEN: u16 = 0x00AD; 81 const CHAR_MIDDLE_DOT: u16 = 0x00B7; 82 const CHAR_HYPHEN_MINUS: u16 = 0x002D; 83 const CHAR_HYPHEN: u16 = 0x2010; 84 85 // The following U_JT_* constants must be same to the ones defined in 86 // frameworks/minikin/lib/minikin/ffi/IciBridge.h 87 // TODO: Replace with ICU4X once it becomes available in Android. 88 const U_JT_NON_JOINING: u8 = 0; 89 const U_JT_DUAL_JOINING: u8 = 1; 90 const U_JT_RIGHT_JOINING: u8 = 2; 91 const U_JT_LEFT_JOINING: u8 = 3; 92 const U_JT_JOIN_CAUSING: u8 = 4; 93 const U_JT_TRANSPARENT: u8 = 5; 94 95 // The following USCRIPT_* constants must be same to the ones defined in 96 // frameworks/minikin/lib/minikin/ffi/IciBridge.h 97 // TODO: Replace with ICU4X once it becomes available in Android. 98 const USCRIPT_LATIN: u8 = 0; 99 const USCRIPT_ARABIC: u8 = 1; 100 const USCRIPT_KANNADA: u8 = 2; 101 const USCRIPT_MALAYALAM: u8 = 3; 102 const USCRIPT_TAMIL: u8 = 4; 103 const USCRIPT_TELUGU: u8 = 5; 104 const USCRIPT_ARMENIAN: u8 = 6; 105 const USCRIPT_CANADIAN_ABORIGINAL: u8 = 7; 106 107 use crate::ffi::getJoiningType; 108 use crate::ffi::getScript; 109 110 /// Hyphenation types 111 /// The following values must be equal to the ones in 112 /// frameworks/minikin/include/minikin/Hyphenator.h 113 #[repr(u8)] 114 #[derive(PartialEq, Copy, Clone)] 115 pub enum HyphenationType { 116 /// Do not break. 117 DontBreak = 0, 118 /// Break the line and insert a normal hyphen. 119 BreakAndInsertHyphen = 1, 120 /// Break the line and insert an Armenian hyphen (U+058A). 121 BreakAndInsertArmenianHyphen = 2, 122 /// Break the line and insert a Canadian Syllabics hyphen (U+1400). 123 BreakAndInsertUcasHyphen = 4, 124 /// Break the line, but don't insert a hyphen. Used for cases when there is already a hyphen 125 /// present or the script does not use a hyphen (e.g. in Malayalam). 126 BreakAndDontInsertHyphen = 5, 127 /// Break and replace the last code unit with hyphen. Used for Catalan "l·l" which hyphenates 128 /// as "l-/l". 129 BreakAndReplaceWithHyphen = 6, 130 /// Break the line, and repeat the hyphen (which is the last character) at the beginning of the 131 /// next line. Used in Polish (where "czerwono-niebieska" should hyphenate as 132 /// "czerwono-/-niebieska") and Slovenian. 133 BreakAndInsertHyphenAtNextLine = 7, 134 /// Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line. 135 /// This is used in Arabic script, mostly for writing systems of Central Asia. It's our default 136 /// behavior when a soft hyphen is used in Arabic script. 137 BreakAndInsertHyphenAndZwj = 8, 138 } 139 140 /// Hyphenation locale 141 #[repr(u8)] 142 #[derive(PartialEq, Copy, Clone)] 143 pub enum HyphenationLocale { 144 /// Other locale 145 Other = 0, 146 /// Catalan 147 Catalan = 1, 148 /// Polish 149 Polish = 2, 150 /// Slovenian 151 Slovenian = 3, 152 } 153 154 const MAX_HYPHEN_SIZE: u32 = 64; 155 156 struct HyphenationData<'a> { 157 bytes: &'a [u8], 158 } 159 160 /// The Hyphenation pattern file is encoded into binary format during build time. 161 /// The hyphenation pattern file is encoded into three objects: AlphabetTable, Trie, Patterns. 162 /// 163 /// First, to avoid high value of utf16 char values in Trie object, char values are mapped to 164 /// internal alphabet codes. The AlphabetTable0 and AndroidTable1 has a map from utf16 char values 165 /// to internal u16 alphabet codes. The AlphabetTable0 is used if the min and max used code points 166 /// has less than 1024, i.e. max_codepoint - min_codepoint < 1024. The AlphabetTable1 is used 167 /// otherwise. 168 /// 169 /// Then, the pattern file is encoded with Trie and Pattern object with using internal 170 /// alphabet code. For example, in case of the entry "ef5i5nite", the hyphenation score "00550000" 171 /// is stored in the Pattern object and the subword "efinite" is stored in the Trie object. 172 /// 173 /// The Trie object is encoded as one dimensional u32 arrays. Each u32 integer contains packed 174 /// index to the Pattern object, index to the next node entry and alphabet code. 175 /// Trie Entry: 176 /// 0 1 2 3 177 /// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 (bits) 178 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 179 /// | index to pattern data | index to the next node | code | 180 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 181 /// Note: the layout is as an example of pattern_shift = 19, link_shift = 5. 182 /// 183 /// The Pattern object is encoded into two data: entry list and data payload. The entry is a packed 184 /// u32 integer that contains length of the pattern, an amount of shift of the pattern index and 185 /// an offset from the payload head. 186 /// 187 /// Pattern Entry: 188 /// 0 1 2 3 189 /// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 (bits) 190 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 191 /// |len of pat | pat shift | offset to the pattern data | 192 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 193 /// 194 /// The pattern data and related information can be obtained as follows: 195 /// 196 /// // Pattern information 197 /// let entry = pattern[16 + pattern_index * 4] // read u32 as little endian. 198 /// let pattern_length = entry >> 26 199 /// let pattern_shift = (entry > 20) & 0x3f 200 /// 201 /// // Pattern value retrieval: i-th offset in the word. 202 /// let pattern_offset = pattern[8] // read u32 as little endian. 203 /// let pattern_value = pattern[pattern_offset + (entry & 0xfffff) + i] 204 impl<'a> HyphenationData<'a> { new(bytes: &'a [u8]) -> Self205 pub const fn new(bytes: &'a [u8]) -> Self { 206 HyphenationData { bytes } 207 } 208 read_u32(&self, offset: u32) -> u32209 pub fn read_u32(&self, offset: u32) -> u32 { 210 let usize_offset = offset as usize; 211 self.bytes 212 .get(usize_offset..usize_offset + 4) 213 .map(|x: &[u8]| u32::from_le_bytes(x.try_into().unwrap())) 214 .unwrap() 215 } 216 } 217 218 /// Header struct of the hyphenation pattern file. 219 /// The object layout follows: 220 /// 0 1 2 3 4 5 6 7 8 9 A B C D E F (bytes) 221 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 222 /// | magic | version |alphabet offset| trie offset | 223 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 224 /// |pattern offset | file size | 225 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 226 pub struct Header<'a> { 227 data: HyphenationData<'a>, 228 } 229 230 /// Alphabet Table version 0 struct of the hyphenation pattern file. 231 /// The object layout follows: 232 /// 0 1 2 3 4 5 6 7 8 9 A B C D E F (bytes) 233 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 234 /// | version | min codepoint | max codepoint | payload 235 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 236 pub struct AlphabetTable0<'a> { 237 data: HyphenationData<'a>, 238 min_codepoint: u32, 239 max_codepoint: u32, 240 } 241 242 /// Alphabet Table version 1 struct of the hyphenation pattern file. 243 /// The object layout follows: 244 /// 0 1 2 3 4 5 6 7 8 9 A B C D E F (bytes) 245 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 246 /// | version | num of entries| payload 247 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 248 pub struct AlphabetTable1<'a> { 249 data: HyphenationData<'a>, 250 num_entries: u32, 251 } 252 253 /// An entry of alphabet table version 1 struct of the hyphenation pattern file. 254 /// The entry is packed u32 value: the high 21 bits are code point and low 11 bits 255 /// are alphabet code value. 256 /// 0 1 2 3 257 /// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 (bits) 258 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 259 /// | code point | code value | 260 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 261 pub struct AlphabetTable1Entry { 262 entry: u32, 263 } 264 265 /// Trie struct of the hyphenation pattern file. 266 /// The object layout follows: 267 /// 0 1 2 3 4 5 6 7 8 9 A B C D E F (bytes) 268 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 269 /// | version | char mask | link shift | link mask | 270 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 271 /// | pattern shift | num entries | payload 272 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 273 pub struct Trie<'a> { 274 data: HyphenationData<'a>, 275 } 276 277 /// Pattern struct of the hyphenation pattern file. 278 /// The object layout follows: 279 /// 0 1 2 3 4 5 6 7 8 9 A B C D E F (bytes) 280 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 281 /// | version | num entries | pattern offset| pattern size | 282 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 283 /// | payload 284 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 285 pub struct Pattern<'a> { 286 data: HyphenationData<'a>, 287 pattern_offset: u32, 288 } 289 290 /// An entry of pattern struct of the hyphenation pattern file. 291 /// The entry is packed u32 value: the highest 6 bits are for length, next 6 bits are amount of 292 /// shift, and lowest 20 bits are offset of the first value from the pattern offset value. 293 /// 0 1 2 3 294 /// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 (bits) 295 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 296 /// | length | shift | offset of the first value | 297 /// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 298 pub struct PatternEntry<'a> { 299 data: HyphenationData<'a>, 300 pattern_offset: u32, 301 entry: u32, 302 } 303 304 impl<'a> Header<'a> { 305 /// Construct a reader of the Header struct from the byte array. new(bytes: &'a [u8]) -> Self306 pub const fn new(bytes: &'a [u8]) -> Self { 307 Header { data: HyphenationData::new(bytes) } 308 } 309 310 /// Returns the reader of the alphabet code. alphabet_table(&self) -> Option<Box<dyn AlphabetLookup + 'a>>311 pub fn alphabet_table(&self) -> Option<Box<dyn AlphabetLookup + 'a>> { 312 let offset = self.data.read_u32(8); 313 let version = self.data.read_u32(offset); 314 return match version { 315 0 => Some(Box::new(AlphabetTable0::new(self.read_offset_and_slice(8)))), 316 1 => Some(Box::new(AlphabetTable1::new(self.read_offset_and_slice(8)))), 317 _ => None, 318 }; 319 } 320 321 /// Returns the reader of the trie struct. trie_table(&self) -> Trie<'a>322 pub fn trie_table(&self) -> Trie<'a> { 323 Trie::new(self.read_offset_and_slice(12)) 324 } 325 326 /// Returns the reader of the pattern struct. pattern_table(&self) -> Pattern<'a>327 pub fn pattern_table(&self) -> Pattern<'a> { 328 Pattern::new(self.read_offset_and_slice(16)) 329 } 330 read_offset_and_slice(&self, offset: u32) -> &'a [u8]331 fn read_offset_and_slice(&self, offset: u32) -> &'a [u8] { 332 let offset = self.data.read_u32(offset) as usize; 333 self.data.bytes.get(offset..).unwrap() 334 } 335 } 336 337 pub trait AlphabetLookup { 338 /// Get the alphabet code for the code point. get_at(&self, c: u32) -> Option<u16>339 fn get_at(&self, c: u32) -> Option<u16>; 340 341 /// Lookup the internal alphabet codes from UTF-16 character codes. lookup( &self, alpha_codes: &mut [u16; MAX_HYPHEN_SIZE as usize], word: &[u16], ) -> HyphenationType342 fn lookup( 343 &self, 344 alpha_codes: &mut [u16; MAX_HYPHEN_SIZE as usize], 345 word: &[u16], 346 ) -> HyphenationType { 347 let mut result = HyphenationType::BreakAndInsertHyphen; 348 alpha_codes[0] = 0; // word start 349 for i in 0..word.len() { 350 let c = word[i] as u32; 351 if let Some(code) = self.get_at(c) { 352 alpha_codes[i + 1] = code; 353 } else { 354 return HyphenationType::DontBreak; 355 } 356 if result == HyphenationType::BreakAndInsertHyphen { 357 result = Hyphenator::hyphenation_type_based_on_script(c); 358 } 359 } 360 alpha_codes[word.len() + 1] = 0; // word termination 361 result 362 } 363 } 364 365 /// Map from utf16 code unit to the internal alphabet code. 366 impl<'a> AlphabetTable0<'a> { 367 /// Construct a reader of the Alphabet Table version 0 struct from the byte array. new(bytes: &'a [u8]) -> Self368 pub fn new(bytes: &'a [u8]) -> Self { 369 let data = HyphenationData::new(bytes); 370 let min_codepoint = data.read_u32(4); 371 let max_codepoint = data.read_u32(8); 372 AlphabetTable0 { data, min_codepoint, max_codepoint } 373 } 374 } 375 376 impl<'a> AlphabetLookup for AlphabetTable0<'a> { 377 /// Returns an entry of the specified offset. get_at(&self, offset: u32) -> Option<u16>378 fn get_at(&self, offset: u32) -> Option<u16> { 379 if offset < self.min_codepoint || offset >= self.max_codepoint { 380 None 381 } else { 382 let code = self.data.bytes[(offset - self.min_codepoint) as usize + 12] as u16; 383 if code == 0 { 384 None 385 } else { 386 Some(code) 387 } 388 } 389 } 390 } 391 392 /// Map from utf16 code unit to the internal alphabet code. 393 impl<'a> AlphabetTable1<'a> { 394 /// Construct a reader of the Alphabet Table version 1 struct from the byte array. new(bytes: &'a [u8]) -> Self395 pub fn new(bytes: &'a [u8]) -> Self { 396 let data = HyphenationData::new(bytes); 397 let num_entries = data.read_u32(4); 398 AlphabetTable1 { data, num_entries } 399 } 400 lower_bounds(&self, value: u32) -> Option<u32>401 fn lower_bounds(&self, value: u32) -> Option<u32> { 402 let mut b = 0; 403 let mut e = self.num_entries; 404 while b != e { 405 let m = b + (e - b) / 2; 406 let c = self.data.read_u32(8 + m * 4); 407 if c >= value { 408 e = m; 409 } else { 410 b = m + 1; 411 } 412 } 413 if b == self.num_entries { 414 None 415 } else { 416 Some(b) 417 } 418 } 419 } 420 421 impl<'a> AlphabetLookup for AlphabetTable1<'a> { get_at(&self, c: u32) -> Option<u16>422 fn get_at(&self, c: u32) -> Option<u16> { 423 if let Some(r) = self.lower_bounds(c << 11) { 424 let entry = AlphabetTable1Entry::new(self.data.read_u32(8 + r * 4)); 425 if entry.codepoint() == c { 426 Some(entry.value()) 427 } else { 428 None 429 } 430 } else { 431 None 432 } 433 } 434 } 435 436 /// A packed u32 entry of the AlphabetTable1. 437 impl AlphabetTable1Entry { new(entry_value: u32) -> Self438 pub const fn new(entry_value: u32) -> Self { 439 AlphabetTable1Entry { entry: entry_value } 440 } 441 442 /// Unpack code point from entry value. codepoint(&self) -> u32443 pub fn codepoint(&self) -> u32 { 444 self.entry >> 11 445 } 446 447 /// Unpack value from entry value. value(&self) -> u16448 pub fn value(&self) -> u16 { 449 (self.entry & 0x7ff).try_into().unwrap() 450 } 451 } 452 453 /// A Trie object. 454 /// See the function comment of HyphenationData for the details. 455 impl<'a> Trie<'a> { 456 /// Construct a reader of the Trie struct from the byte array. new(bytes: &'a [u8]) -> Self457 pub const fn new(bytes: &'a [u8]) -> Self { 458 Trie { data: HyphenationData::new(bytes) } 459 } 460 461 /// Returns an entry of at the offset. 462 /// The entry of the next alphabet code is 463 /// 464 /// let entry = trie.get_at(node + alphabet_codes[char]) get_at(&self, offset: u32) -> u32465 pub fn get_at(&self, offset: u32) -> u32 { 466 self.data.read_u32(24 + offset * 4) 467 } 468 469 /// Returns the bit mask for the character code point of the node. 470 /// You can get node's character code point by 471 /// 472 /// let node_character = entry & char_mask. char_mask(&self) -> u32473 pub fn char_mask(&self) -> u32 { 474 self.data.read_u32(4) 475 } 476 477 /// Returns the amount of shift of the node index. 478 /// You can get node number as following 479 /// 480 /// let next_node = (entry & link_mask) >> link_shift link_shift(&self) -> u32481 pub fn link_shift(&self) -> u32 { 482 self.data.read_u32(8) 483 } 484 485 /// Returns the mask for the node index. 486 /// You can get node number as following 487 /// 488 /// let next_node = (entry & link_mask) >> link_shift link_mask(&self) -> u32489 pub fn link_mask(&self) -> u32 { 490 self.data.read_u32(12) 491 } 492 493 /// Returns the amount of shift of the pattern index. 494 /// You can get pattern index as following 495 /// 496 /// let pattern_index = entry >> pattern_shift pattern_shift(&self) -> u32497 pub fn pattern_shift(&self) -> u32 { 498 self.data.read_u32(16) 499 } 500 } 501 502 /// A Pattern object. 503 /// See the function comment of HyphenationData for the details. 504 impl<'a> Pattern<'a> { 505 /// Construct a reader of the Pattern struct from the byte array. new(bytes: &'a [u8]) -> Self506 pub fn new(bytes: &'a [u8]) -> Self { 507 let data = HyphenationData::new(bytes); 508 let pattern_offset = data.read_u32(8); 509 Pattern { data, pattern_offset } 510 } 511 512 /// Returns a packed u32 entry at the given offset. entry_at(&self, offset: u32) -> PatternEntry<'a>513 pub fn entry_at(&self, offset: u32) -> PatternEntry<'a> { 514 let entry = self.data.read_u32(16 + offset * 4); 515 PatternEntry::new(self.data.bytes, self.pattern_offset, entry) 516 } 517 } 518 519 /// An entry of the pattern object. 520 impl<'a> PatternEntry<'a> { 521 /// Construct a reader of the Pattern struct from the byte array. new(bytes: &'a [u8], pattern_offset: u32, entry: u32) -> Self522 pub const fn new(bytes: &'a [u8], pattern_offset: u32, entry: u32) -> Self { 523 PatternEntry { data: HyphenationData::new(bytes), pattern_offset, entry } 524 } 525 526 /// Unpack length of the pattern from the packed entry value. len(&self) -> u32527 pub fn len(&self) -> u32 { 528 self.entry >> 26 529 } 530 531 /// Unpack an amount of shift of the pattern data from the packed entry value. shift(&self) -> u32532 pub fn shift(&self) -> u32 { 533 (self.entry >> 20) & 0x3f 534 } 535 536 /// Returns a hyphenation score value at the offset in word with the entry. value_at(&self, offset: u32) -> u8537 pub fn value_at(&self, offset: u32) -> u8 { 538 self.data.bytes[(self.pattern_offset + (self.entry & 0xfffff) + offset) as usize] 539 } 540 } 541 542 /// Performs hyphenation 543 pub struct Hyphenator { 544 data: &'static [u8], 545 min_prefix: u32, 546 min_suffix: u32, 547 locale: HyphenationLocale, 548 } 549 550 impl Hyphenator { 551 /// Create a new hyphenator instance new(data: &'static [u8], min_prefix: u32, min_suffix: u32, locale: &str) -> Self552 pub fn new(data: &'static [u8], min_prefix: u32, min_suffix: u32, locale: &str) -> Self { 553 logger::init( 554 logger::Config::default() 555 .with_tag_on_device("Minikin") 556 .with_max_level(log::LevelFilter::Trace), 557 ); 558 Self { 559 data, 560 min_prefix, 561 min_suffix, 562 locale: if locale == "pl" { 563 HyphenationLocale::Polish 564 } else if locale == "ca" { 565 HyphenationLocale::Catalan 566 } else if locale == "sl" { 567 HyphenationLocale::Slovenian 568 } else { 569 HyphenationLocale::Other 570 }, 571 } 572 } 573 574 /// Performs a hyphenation hyphenate(&self, word: &[u16], out: &mut [u8])575 pub fn hyphenate(&self, word: &[u16], out: &mut [u8]) { 576 let len: u32 = word.len().try_into().unwrap(); 577 let padded_len = len + 2; 578 if !self.data.is_empty() 579 && len >= self.min_prefix + self.min_suffix 580 && padded_len <= MAX_HYPHEN_SIZE 581 { 582 let header = Header::new(self.data); 583 let mut alpha_codes: [u16; MAX_HYPHEN_SIZE as usize] = [0; MAX_HYPHEN_SIZE as usize]; 584 let hyphen_value = if let Some(alphabet) = header.alphabet_table() { 585 alphabet.lookup(&mut alpha_codes, word) 586 } else { 587 HyphenationType::DontBreak 588 }; 589 590 if hyphen_value != HyphenationType::DontBreak { 591 self.hyphenate_from_codes(alpha_codes, padded_len, hyphen_value, out); 592 return; 593 } 594 // TODO: try NFC normalization 595 // TODO: handle non-BMP Unicode (requires remapping of offsets) 596 } 597 // Note that we will always get here if the word contains a hyphen or a soft hyphen, because 598 // the alphabet is not expected to contain a hyphen or a soft hyphen character, so 599 // alphabetLookup would return DONT_BREAK. 600 self.hyphenate_with_no_pattern(word, out); 601 } 602 603 /// This function determines whether a character is like U+2010 HYPHEN in line breaking and 604 /// usage: a character immediately after which line breaks are allowed, but words containing 605 /// it should not be automatically hyphenated using patterns. This is a curated set, created by 606 /// manually inspecting all the characters that have the Unicode line breaking property of BA or 607 /// HY and seeing which ones are hyphens. is_line_breaking_hyphen(c: u16) -> bool608 fn is_line_breaking_hyphen(c: u16) -> bool { 609 c == 0x002D || // HYPHEN-MINUS 610 c == 0x058A || // ARMENIAN HYPHEN 611 c == 0x05BE || // HEBREW PUNCTUATION MAQAF 612 c == 0x1400 || // CANADIAN SYLLABICS HYPHEN 613 c == 0x2010 || // HYPHEN 614 c == 0x2013 || // EN DASH 615 c == 0x2027 || // HYPHENATION POINT 616 c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN 617 c == 0x2E40 // DOUBLE HYPHEN 618 } 619 620 /// Resolves the hyphenation type for Arabic text. 621 /// In case of Arabic text, the letter form should not be changed by hyphenation. 622 /// So, if the hyphenation is in the middle of the joining context, insert ZWJ for keeping the 623 /// form from the original text. get_hyph_type_for_arabic(word: &[u16], location: u32) -> HyphenationType624 fn get_hyph_type_for_arabic(word: &[u16], location: u32) -> HyphenationType { 625 let mut i = location; 626 let mut join_type: u8 = U_JT_NON_JOINING; 627 while i < word.len().try_into().unwrap() { 628 join_type = getJoiningType(word[i as usize].into()); 629 if join_type != U_JT_TRANSPARENT { 630 break; 631 } 632 i += 1; 633 } 634 if join_type == U_JT_DUAL_JOINING 635 || join_type == U_JT_RIGHT_JOINING 636 || join_type == U_JT_JOIN_CAUSING 637 { 638 // The next character is of the type that may join the last character. See if the last 639 // character is also of the right type. 640 join_type = U_JT_NON_JOINING; 641 if i >= 2 { 642 i = location - 2; // skip the soft hyphen 643 loop { 644 join_type = getJoiningType(word[i as usize].into()); 645 if join_type != U_JT_TRANSPARENT { 646 break; 647 } 648 if i == 0 { 649 break; 650 } 651 i -= 1; 652 } 653 } 654 if join_type == U_JT_DUAL_JOINING 655 || join_type == U_JT_LEFT_JOINING 656 || join_type == U_JT_JOIN_CAUSING 657 { 658 return HyphenationType::BreakAndInsertHyphenAndZwj; 659 } 660 } 661 HyphenationType::BreakAndInsertHyphen 662 } 663 664 /// Performs the hyphenation without pattern files. hyphenate_with_no_pattern(&self, word: &[u16], out: &mut [u8])665 fn hyphenate_with_no_pattern(&self, word: &[u16], out: &mut [u8]) { 666 let word_len: u32 = word.len().try_into().unwrap(); 667 out[0] = HyphenationType::DontBreak as u8; 668 for i in 1..word_len { 669 let prev_char = word[i as usize - 1]; 670 if i > 1 && Self::is_line_breaking_hyphen(prev_char) { 671 if (prev_char == CHAR_HYPHEN_MINUS || prev_char == CHAR_HYPHEN) 672 && (self.locale == HyphenationLocale::Polish 673 || self.locale == HyphenationLocale::Slovenian) 674 && getScript(word[i as usize].into()) == USCRIPT_LATIN 675 { 676 // In Polish and Slovenian, hyphens get repeated at the next line. To be safe, 677 // we will do this only if the next character is Latin. 678 out[i as usize] = HyphenationType::BreakAndInsertHyphenAtNextLine as u8; 679 } else { 680 out[i as usize] = HyphenationType::BreakAndDontInsertHyphen as u8; 681 } 682 } else if i > 1 && prev_char == CHAR_SOFT_HYPHEN { 683 // Break after soft hyphens, but only if they don't start the word (a soft hyphen 684 // starting the word doesn't give any useful break opportunities). The type of the 685 // break is based on the script of the character we break on. 686 if getScript(word[i as usize].into()) == USCRIPT_ARABIC { 687 // For Arabic, we need to look and see if the characters around the soft hyphen 688 // actually join. If they don't, we'll just insert a normal hyphen. 689 out[i as usize] = Self::get_hyph_type_for_arabic(word, i) as u8; 690 } else { 691 out[i as usize] = 692 Self::hyphenation_type_based_on_script(word[i as usize] as u32) as u8; 693 } 694 } else if prev_char == CHAR_MIDDLE_DOT 695 && self.min_prefix < i 696 && i <= word_len - self.min_suffix 697 && ((word[i as usize - 2] == 'l' as u16 && word[i as usize] == 'l' as u16) 698 || (word[i as usize - 2] == 'L' as u16 && word[i as usize] == 'L' as u16)) 699 && self.locale == HyphenationLocale::Catalan 700 { 701 // In Catalan, "l·l" should break as "l-" on the first line 702 // and "l" on the next line. 703 out[i as usize] = HyphenationType::BreakAndReplaceWithHyphen as u8; 704 } else { 705 out[i as usize] = HyphenationType::DontBreak as u8; 706 } 707 } 708 } 709 710 /// Performs the hyphenation with pattern file. hyphenate_from_codes( &self, codes: [u16; MAX_HYPHEN_SIZE as usize], len: u32, hyphen_value: HyphenationType, out: &mut [u8], )711 fn hyphenate_from_codes( 712 &self, 713 codes: [u16; MAX_HYPHEN_SIZE as usize], 714 len: u32, 715 hyphen_value: HyphenationType, 716 out: &mut [u8], 717 ) { 718 let header = Header::new(self.data); 719 let trie = header.trie_table(); 720 let pattern = header.pattern_table(); 721 let char_mask = trie.char_mask(); 722 let link_shift = trie.link_shift(); 723 let link_mask = trie.link_mask(); 724 let pattern_shift = trie.pattern_shift(); 725 let max_offset = len - self.min_suffix - 1; 726 727 for i in 0..(len - 1) { 728 let mut node: u32 = 0; // index into Trie table 729 for j in i..len { 730 let c: u32 = codes[j as usize].into(); 731 let entry = trie.get_at(node + c); 732 if (entry & char_mask) == c { 733 node = (entry & link_mask) >> link_shift; 734 } else { 735 break; 736 } 737 let pat_ix = trie.get_at(node) >> pattern_shift; 738 // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an 739 // offset into the buf pool. This is the pattern for the substring (i..j) we just 740 // matched, which we combine (via point-wise max) into the buffer vector. 741 if pat_ix != 0 { 742 let pat_entry = pattern.entry_at(pat_ix); 743 let pat_len = pat_entry.len(); 744 let pat_shift = pat_entry.shift(); 745 let offset = j + 1 - (pat_len + pat_shift); 746 // offset is the index within buffer that lines up with the start of pat_buf 747 let start = if self.min_prefix < offset { 0 } else { self.min_prefix - offset }; 748 if offset > max_offset { 749 continue; 750 } 751 let end = cmp::min(pat_len, max_offset - offset); 752 for k in start..end { 753 out[(offset + k) as usize] = 754 cmp::max(out[(offset + k) as usize], pat_entry.value_at(k)); 755 } 756 } 757 } 758 } 759 760 // Since the above calculation does not modify values outside 761 // [mMinPrefix, len - mMinSuffix], they are left as 0 = DONT_BREAK. 762 for r in out.iter_mut().take(max_offset as usize).skip(self.min_prefix as usize) { 763 *r = if *r & 1 != 0 { hyphen_value as u8 } else { HyphenationType::DontBreak as u8 }; 764 } 765 } 766 hyphenation_type_based_on_script(code_point: u32) -> HyphenationType767 fn hyphenation_type_based_on_script(code_point: u32) -> HyphenationType { 768 let script = getScript(code_point); 769 if script == USCRIPT_KANNADA 770 || script == USCRIPT_MALAYALAM 771 || script == USCRIPT_TAMIL 772 || script == USCRIPT_TELUGU 773 { 774 HyphenationType::BreakAndDontInsertHyphen 775 } else if script == USCRIPT_ARMENIAN { 776 HyphenationType::BreakAndInsertArmenianHyphen 777 } else if script == USCRIPT_CANADIAN_ABORIGINAL { 778 HyphenationType::BreakAndInsertUcasHyphen 779 } else { 780 HyphenationType::BreakAndInsertHyphen 781 } 782 } 783 } 784