1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "minikin/Hyphenator.h"
18
19 #include <unicode/uchar.h>
20 #include <unicode/uscript.h>
21
22 #include <algorithm>
23 #include <memory>
24 #include <string>
25 #include <vector>
26
27 #include "FeatureFlags.h"
28 #include "MinikinInternal.h"
29 #include "minikin/Characters.h"
30
31 #ifndef _WIN32
32 #include "minikin_cxx_bridge.rs.h"
33 #endif // _WIN32
34
35 namespace minikin {
36
37 // The following are structs that correspond to tables inside the hyb file format
38
39 struct AlphabetTable0 {
40 uint32_t version;
41 uint32_t min_codepoint;
42 uint32_t max_codepoint;
43 uint8_t data[1]; // actually flexible array, size is known at runtime
44 };
45
46 struct AlphabetTable1 {
47 uint32_t version;
48 uint32_t n_entries;
49 uint32_t data[1]; // actually flexible array, size is known at runtime
50
codepointminikin::AlphabetTable151 static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
valueminikin::AlphabetTable152 static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
53 };
54
55 struct Trie {
56 uint32_t version;
57 uint32_t char_mask;
58 uint32_t link_shift;
59 uint32_t link_mask;
60 uint32_t pattern_shift;
61 uint32_t n_entries;
62 uint32_t data[1]; // actually flexible array, size is known at runtime
63 };
64
65 struct Pattern {
66 uint32_t version;
67 uint32_t n_entries;
68 uint32_t pattern_offset;
69 uint32_t pattern_size;
70 uint32_t data[1]; // actually flexible array, size is known at runtime
71
72 // accessors
lenminikin::Pattern73 static uint32_t len(uint32_t entry) { return entry >> 26; }
shiftminikin::Pattern74 static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
bufminikin::Pattern75 const uint8_t* buf(uint32_t entry) const {
76 return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff);
77 }
78 };
79
80 struct Header {
81 uint32_t magic;
82 uint32_t version;
83 uint32_t alphabet_offset;
84 uint32_t trie_offset;
85 uint32_t pattern_offset;
86 uint32_t file_size;
87
88 // accessors
bytesminikin::Header89 const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); }
alphabetVersionminikin::Header90 uint32_t alphabetVersion() const {
91 return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
92 }
alphabetTable0minikin::Header93 const AlphabetTable0* alphabetTable0() const {
94 return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
95 }
alphabetTable1minikin::Header96 const AlphabetTable1* alphabetTable1() const {
97 return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
98 }
trieTableminikin::Header99 const Trie* trieTable() const { return reinterpret_cast<const Trie*>(bytes() + trie_offset); }
patternTableminikin::Header100 const Pattern* patternTable() const {
101 return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
102 }
103 };
104
105 #ifndef _WIN32
106 class HyphenatorRust : public Hyphenator {
107 public:
HyphenatorRust(const uint8_t * patternData,size_t dataSize,size_t minPrefix,size_t minSuffix,const std::string & locale)108 HyphenatorRust(const uint8_t* patternData, size_t dataSize, size_t minPrefix, size_t minSuffix,
109 const std::string& locale)
110 : mHyphenator(rust::load_hyphenator(::rust::cxxbridge1::Slice(patternData, dataSize),
111 minPrefix, minSuffix, locale)) {}
112
hyphenate(const U16StringPiece & word,HyphenationType * out) const113 virtual void hyphenate(const U16StringPiece& word, HyphenationType* out) const override {
114 static_assert(sizeof(HyphenationType) == sizeof(uint8_t),
115 "HyphnationType must be uint8_t.");
116 rust::hyphenate(*mHyphenator, ::rust::cxxbridge1::Slice(word.data(), word.size()),
117 ::rust::cxxbridge1::Slice(reinterpret_cast<uint8_t*>(out), word.size()));
118 }
119
120 private:
121 ::rust::Box<rust::Hyphenator> mHyphenator;
122 };
123 #endif // _WIN32
124
125 // static
loadBinary(const uint8_t * patternData,size_t dataSize,size_t minPrefix,size_t minSuffix,const std::string & locale)126 Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData, size_t dataSize, size_t minPrefix,
127 size_t minSuffix, const std::string& locale) {
128 #ifdef _WIN32
129 return HyphenatorCXX::loadBinary(patternData, dataSize, minPrefix, minSuffix, locale);
130 #else // _WIN32
131 if (features::rust_hyphenator()) {
132 return new HyphenatorRust(patternData, dataSize, minPrefix, minSuffix, locale);
133 } else {
134 return HyphenatorCXX::loadBinary(patternData, dataSize, minPrefix, minSuffix, locale);
135 }
136 #endif // _WIN32
137 }
138
139 #ifdef _WIN32
loadBinaryForRust(const uint8_t *,size_t,size_t,size_t,const std::string &)140 Hyphenator* Hyphenator::loadBinaryForRust(const uint8_t* /*patternData*/, size_t /*dataSize*/,
141 size_t /*minPrefix*/, size_t /*minSuffix*/,
142 const std::string& /*locale*/) {
143 MINIKIN_NOT_REACHED("Rust implementation is not available on Win32");
144 }
145 #else // _WIN32
loadBinaryForRust(const uint8_t * patternData,size_t dataSize,size_t minPrefix,size_t minSuffix,const std::string & locale)146 Hyphenator* Hyphenator::loadBinaryForRust(const uint8_t* patternData, size_t dataSize,
147 size_t minPrefix, size_t minSuffix,
148 const std::string& locale) {
149 return new HyphenatorRust(patternData, dataSize, minPrefix, minSuffix, locale);
150 }
151 #endif // _WIN32
152 // static
loadBinary(const uint8_t * patternData,size_t,size_t minPrefix,size_t minSuffix,const std::string & locale)153 Hyphenator* HyphenatorCXX::loadBinary(const uint8_t* patternData, size_t, size_t minPrefix,
154 size_t minSuffix, const std::string& locale) {
155 HyphenationLocale hyphenLocale = HyphenationLocale::OTHER;
156 if (locale == "pl") {
157 hyphenLocale = HyphenationLocale::POLISH;
158 } else if (locale == "ca") {
159 hyphenLocale = HyphenationLocale::CATALAN;
160 } else if (locale == "sl") {
161 hyphenLocale = HyphenationLocale::SLOVENIAN;
162 }
163 return new HyphenatorCXX(patternData, minPrefix, minSuffix, hyphenLocale);
164 }
165
HyphenatorCXX(const uint8_t * patternData,size_t minPrefix,size_t minSuffix,HyphenationLocale hyphenLocale)166 HyphenatorCXX::HyphenatorCXX(const uint8_t* patternData, size_t minPrefix, size_t minSuffix,
167 HyphenationLocale hyphenLocale)
168 : mPatternData(patternData),
169 mMinPrefix(minPrefix),
170 mMinSuffix(minSuffix),
171 mHyphenationLocale(hyphenLocale) {}
172
hyphenate(const U16StringPiece & word,HyphenationType * out) const173 void HyphenatorCXX::hyphenate(const U16StringPiece& word, HyphenationType* out) const {
174 const size_t len = word.size();
175 const size_t paddedLen = len + 2; // start and stop code each count for 1
176 if (mPatternData != nullptr && len >= mMinPrefix + mMinSuffix &&
177 paddedLen <= MAX_HYPHENATED_SIZE) {
178 uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
179 const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word);
180
181 if (hyphenValue != HyphenationType::DONT_BREAK) {
182 hyphenateFromCodes(alpha_codes, paddedLen, hyphenValue, out);
183 return;
184 }
185 // TODO: try NFC normalization
186 // TODO: handle non-BMP Unicode (requires remapping of offsets)
187 }
188 // Note that we will always get here if the word contains a hyphen or a soft hyphen, because the
189 // alphabet is not expected to contain a hyphen or a soft hyphen character, so alphabetLookup
190 // would return DONT_BREAK.
191 hyphenateWithNoPatterns(word, out);
192 }
193
194 // This function determines whether a character is like U+2010 HYPHEN in
195 // line breaking and usage: a character immediately after which line breaks
196 // are allowed, but words containing it should not be automatically
197 // hyphenated using patterns. This is a curated set, created by manually
198 // inspecting all the characters that have the Unicode line breaking
199 // property of BA or HY and seeing which ones are hyphens.
isLineBreakingHyphen(uint32_t c)200 bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
201 return (c == 0x002D || // HYPHEN-MINUS
202 c == 0x058A || // ARMENIAN HYPHEN
203 c == 0x05BE || // HEBREW PUNCTUATION MAQAF
204 c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
205 c == 0x2010 || // HYPHEN
206 c == 0x2013 || // EN DASH
207 c == 0x2027 || // HYPHENATION POINT
208 c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
209 c == 0x2E40); // DOUBLE HYPHEN
210 }
211
editForThisLine(HyphenationType type)212 EndHyphenEdit editForThisLine(HyphenationType type) {
213 switch (type) {
214 case HyphenationType::BREAK_AND_INSERT_HYPHEN:
215 return EndHyphenEdit::INSERT_HYPHEN;
216 case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
217 return EndHyphenEdit::INSERT_ARMENIAN_HYPHEN;
218 case HyphenationType::BREAK_AND_INSERT_MAQAF:
219 return EndHyphenEdit::INSERT_MAQAF;
220 case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN:
221 return EndHyphenEdit::INSERT_UCAS_HYPHEN;
222 case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN:
223 return EndHyphenEdit::REPLACE_WITH_HYPHEN;
224 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
225 return EndHyphenEdit::INSERT_ZWJ_AND_HYPHEN;
226 case HyphenationType::DONT_BREAK: // Hyphen edit for non breaking case doesn't make sense.
227 default:
228 return EndHyphenEdit::NO_EDIT;
229 }
230 }
231
editForNextLine(HyphenationType type)232 StartHyphenEdit editForNextLine(HyphenationType type) {
233 switch (type) {
234 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
235 return StartHyphenEdit::INSERT_HYPHEN;
236 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
237 return StartHyphenEdit::INSERT_ZWJ;
238 case HyphenationType::DONT_BREAK: // Hyphen edit for non breaking case doesn't make sense.
239 default:
240 return StartHyphenEdit::NO_EDIT;
241 }
242 }
243
getScript(uint32_t codePoint)244 static UScriptCode getScript(uint32_t codePoint) {
245 UErrorCode errorCode = U_ZERO_ERROR;
246 const UScriptCode script = uscript_getScript(static_cast<UChar32>(codePoint), &errorCode);
247 if (U_SUCCESS(errorCode)) {
248 return script;
249 } else {
250 return USCRIPT_INVALID_CODE;
251 }
252 }
253
hyphenationTypeBasedOnScript(uint32_t codePoint)254 static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
255 // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
256 // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
257 // for now to be safe. BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
258 // to maqaf for Hebrew, we can simply add a condition here.
259 const UScriptCode script = getScript(codePoint);
260 if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM || script == USCRIPT_TAMIL ||
261 script == USCRIPT_TELUGU) {
262 // Grantha is not included, since we don't support non-BMP hyphenation yet.
263 return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
264 } else if (script == USCRIPT_ARMENIAN) {
265 return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
266 } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
267 return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
268 } else {
269 return HyphenationType::BREAK_AND_INSERT_HYPHEN;
270 }
271 }
272
getJoiningType(UChar32 codepoint)273 static inline int32_t getJoiningType(UChar32 codepoint) {
274 return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
275 }
276
277 // Assumption for caller: location must be >= 2 and word[location] == CHAR_SOFT_HYPHEN.
278 // This function decides if the letters before and after the hyphen should appear as joining.
getHyphTypeForArabic(const U16StringPiece & word,size_t location)279 static inline HyphenationType getHyphTypeForArabic(const U16StringPiece& word, size_t location) {
280 ssize_t i = location;
281 int32_t type = U_JT_NON_JOINING;
282 while (static_cast<size_t>(i) < word.size() &&
283 (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
284 i++;
285 }
286 if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || type == U_JT_JOIN_CAUSING) {
287 // The next character is of the type that may join the last character. See if the last
288 // character is also of the right type.
289 i = location - 2; // Skip the soft hyphen
290 type = U_JT_NON_JOINING;
291 while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
292 i--;
293 }
294 if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || type == U_JT_JOIN_CAUSING) {
295 return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ;
296 }
297 }
298 return HyphenationType::BREAK_AND_INSERT_HYPHEN;
299 }
300
301 // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words
302 // that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections
303 // 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen).
hyphenateWithNoPatterns(const U16StringPiece & word,HyphenationType * out) const304 void HyphenatorCXX::hyphenateWithNoPatterns(const U16StringPiece& word,
305 HyphenationType* out) const {
306 out[0] = HyphenationType::DONT_BREAK;
307 for (size_t i = 1; i < word.size(); i++) {
308 const uint16_t prevChar = word[i - 1];
309 if (i > 1 && isLineBreakingHyphen(prevChar)) {
310 // Break after hyphens, but only if they don't start the word.
311
312 if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) &&
313 (mHyphenationLocale == HyphenationLocale::POLISH ||
314 mHyphenationLocale == HyphenationLocale::SLOVENIAN) &&
315 getScript(word[i]) == USCRIPT_LATIN) {
316 // In Polish and Slovenian, hyphens get repeated at the next line. To be safe,
317 // we will do this only if the next character is Latin.
318 out[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
319 } else {
320 out[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
321 }
322 } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) {
323 // Break after soft hyphens, but only if they don't start the word (a soft hyphen
324 // starting the word doesn't give any useful break opportunities). The type of the break
325 // is based on the script of the character we break on.
326 if (getScript(word[i]) == USCRIPT_ARABIC) {
327 // For Arabic, we need to look and see if the characters around the soft hyphen
328 // actually join. If they don't, we'll just insert a normal hyphen.
329 out[i] = getHyphTypeForArabic(word, i);
330 } else {
331 out[i] = hyphenationTypeBasedOnScript(word[i]);
332 }
333 } else if (prevChar == CHAR_MIDDLE_DOT && mMinPrefix < i && i <= word.size() - mMinSuffix &&
334 ((word[i - 2] == 'l' && word[i] == 'l') ||
335 (word[i - 2] == 'L' && word[i] == 'L')) &&
336 mHyphenationLocale == HyphenationLocale::CATALAN) {
337 // In Catalan, "l·l" should break as "l-" on the first line
338 // and "l" on the next line.
339 out[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN;
340 } else {
341 out[i] = HyphenationType::DONT_BREAK;
342 }
343 }
344 }
345
alphabetLookup(uint16_t * alpha_codes,const U16StringPiece & word) const346 HyphenationType HyphenatorCXX::alphabetLookup(uint16_t* alpha_codes,
347 const U16StringPiece& word) const {
348 const Header* header = getHeader();
349 HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN;
350 // TODO: check header magic
351 uint32_t alphabetVersion = header->alphabetVersion();
352 if (alphabetVersion == 0) {
353 const AlphabetTable0* alphabet = header->alphabetTable0();
354 uint32_t min_codepoint = alphabet->min_codepoint;
355 uint32_t max_codepoint = alphabet->max_codepoint;
356 alpha_codes[0] = 0; // word start
357 for (size_t i = 0; i < word.size(); i++) {
358 uint16_t c = word[i];
359 if (c < min_codepoint || c >= max_codepoint) {
360 return HyphenationType::DONT_BREAK;
361 }
362 uint8_t code = alphabet->data[c - min_codepoint];
363 if (code == 0) {
364 return HyphenationType::DONT_BREAK;
365 }
366 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
367 result = hyphenationTypeBasedOnScript(c);
368 }
369 alpha_codes[i + 1] = code;
370 }
371 alpha_codes[word.size() + 1] = 0; // word termination
372 return result;
373 } else if (alphabetVersion == 1) {
374 const AlphabetTable1* alphabet = header->alphabetTable1();
375 size_t n_entries = alphabet->n_entries;
376 const uint32_t* begin = alphabet->data;
377 const uint32_t* end = begin + n_entries;
378 alpha_codes[0] = 0;
379 for (size_t i = 0; i < word.size(); i++) {
380 uint16_t c = word[i];
381 auto p = std::lower_bound(begin, end, c << 11);
382 if (p == end) {
383 return HyphenationType::DONT_BREAK;
384 }
385 uint32_t entry = *p;
386 if (AlphabetTable1::codepoint(entry) != c) {
387 return HyphenationType::DONT_BREAK;
388 }
389 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
390 result = hyphenationTypeBasedOnScript(c);
391 }
392 alpha_codes[i + 1] = AlphabetTable1::value(entry);
393 }
394 alpha_codes[word.size() + 1] = 0;
395 return result;
396 }
397 return HyphenationType::DONT_BREAK;
398 }
399
400 /**
401 * Internal implementation, after conversion to codes. All case folding and normalization
402 * has been done by now, and all characters have been found in the alphabet.
403 * Note: len here is the padded length including 0 codes at start and end.
404 **/
hyphenateFromCodes(const uint16_t * codes,size_t len,HyphenationType hyphenValue,HyphenationType * out) const405 void HyphenatorCXX::hyphenateFromCodes(const uint16_t* codes, size_t len,
406 HyphenationType hyphenValue, HyphenationType* out) const {
407 static_assert(sizeof(HyphenationType) == sizeof(uint8_t), "HyphnationType must be uint8_t.");
408 // Reuse the result array as a buffer for calculating intermediate hyphenation numbers.
409 uint8_t* buffer = reinterpret_cast<uint8_t*>(out);
410
411 const Header* header = getHeader();
412 const Trie* trie = header->trieTable();
413 const Pattern* pattern = header->patternTable();
414 uint32_t char_mask = trie->char_mask;
415 uint32_t link_shift = trie->link_shift;
416 uint32_t link_mask = trie->link_mask;
417 uint32_t pattern_shift = trie->pattern_shift;
418 size_t maxOffset = len - mMinSuffix - 1;
419 for (size_t i = 0; i < len - 1; i++) {
420 uint32_t node = 0; // index into Trie table
421 for (size_t j = i; j < len; j++) {
422 uint16_t c = codes[j];
423 uint32_t entry = trie->data[node + c];
424 if ((entry & char_mask) == c) {
425 node = (entry & link_mask) >> link_shift;
426 } else {
427 break;
428 }
429 uint32_t pat_ix = trie->data[node] >> pattern_shift;
430 // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset
431 // into the buf pool. This is the pattern for the substring (i..j) we just matched,
432 // which we combine (via point-wise max) into the buffer vector.
433 if (pat_ix != 0) {
434 uint32_t pat_entry = pattern->data[pat_ix];
435 int pat_len = Pattern::len(pat_entry);
436 int pat_shift = Pattern::shift(pat_entry);
437 const uint8_t* pat_buf = pattern->buf(pat_entry);
438 int offset = j + 1 - (pat_len + pat_shift);
439 // offset is the index within buffer that lines up with the start of pat_buf
440 int start = std::max((int)mMinPrefix - offset, 0);
441 int end = std::min(pat_len, (int)maxOffset - offset);
442 for (int k = start; k < end; k++) {
443 buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]);
444 }
445 }
446 }
447 }
448 // Since the above calculation does not modify values outside
449 // [mMinPrefix, len - mMinSuffix], they are left as 0 = DONT_BREAK.
450 for (size_t i = mMinPrefix; i < maxOffset; i++) {
451 // Hyphenation opportunities happen when the hyphenation numbers are odd.
452 out[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK;
453 }
454 }
455
456 } // namespace minikin
457