1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <unicode/uchar.h>
18 #include <unicode/uscript.h>
19 #include <algorithm>
20 #include <memory>
21 #include <string>
22 #include <cstring>
23 #include <vector>
24
25 // HACK: for reading pattern file
26 #include <fcntl.h>
27
28 #define LOG_TAG "Minikin"
29
30 #include "minikin/Hyphenator.h"
31 #include "utils/WindowsUtils.h"
32
33 using std::vector;
34
35 namespace minikin {
36
37 static const uint16_t CHAR_HYPHEN_MINUS = 0x002D;
38 static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
39 static const uint16_t CHAR_MIDDLE_DOT = 0x00B7;
40 static const uint16_t CHAR_HYPHEN = 0x2010;
41
42 // The following are structs that correspond to tables inside the hyb file
43 // format
44
45 struct AlphabetTable0 {
46 uint32_t version;
47 uint32_t min_codepoint;
48 uint32_t max_codepoint;
49 uint8_t data[1]; // actually flexible array, size is known at runtime
50 };
51
52 struct AlphabetTable1 {
53 uint32_t version;
54 uint32_t n_entries;
55 uint32_t data[1]; // actually flexible array, size is known at runtime
56
codepointminikin::AlphabetTable157 static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
valueminikin::AlphabetTable158 static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
59 };
60
61 struct Trie {
62 uint32_t version;
63 uint32_t char_mask;
64 uint32_t link_shift;
65 uint32_t link_mask;
66 uint32_t pattern_shift;
67 uint32_t n_entries;
68 uint32_t data[1]; // actually flexible array, size is known at runtime
69 };
70
71 struct Pattern {
72 uint32_t version;
73 uint32_t n_entries;
74 uint32_t pattern_offset;
75 uint32_t pattern_size;
76 uint32_t data[1]; // actually flexible array, size is known at runtime
77
78 // accessors
lenminikin::Pattern79 static uint32_t len(uint32_t entry) { return entry >> 26; }
shiftminikin::Pattern80 static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
bufminikin::Pattern81 const uint8_t* buf(uint32_t entry) const {
82 return reinterpret_cast<const uint8_t*>(this) + pattern_offset +
83 (entry & 0xfffff);
84 }
85 };
86
87 struct Header {
88 uint32_t magic;
89 uint32_t version;
90 uint32_t alphabet_offset;
91 uint32_t trie_offset;
92 uint32_t pattern_offset;
93 uint32_t file_size;
94
95 // accessors
bytesminikin::Header96 const uint8_t* bytes() const {
97 return reinterpret_cast<const uint8_t*>(this);
98 }
alphabetVersionminikin::Header99 uint32_t alphabetVersion() const {
100 return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
101 }
alphabetTable0minikin::Header102 const AlphabetTable0* alphabetTable0() const {
103 return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
104 }
alphabetTable1minikin::Header105 const AlphabetTable1* alphabetTable1() const {
106 return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
107 }
trieTableminikin::Header108 const Trie* trieTable() const {
109 return reinterpret_cast<const Trie*>(bytes() + trie_offset);
110 }
patternTableminikin::Header111 const Pattern* patternTable() const {
112 return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
113 }
114 };
115
loadBinary(const uint8_t * patternData,size_t minPrefix,size_t minSuffix)116 Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData,
117 size_t minPrefix,
118 size_t minSuffix) {
119 Hyphenator* result = new Hyphenator;
120 result->patternData = patternData;
121 result->minPrefix = minPrefix;
122 result->minSuffix = minSuffix;
123 return result;
124 }
125
hyphenate(vector<HyphenationType> * result,const uint16_t * word,size_t len,const icu::Locale & locale)126 void Hyphenator::hyphenate(vector<HyphenationType>* result,
127 const uint16_t* word,
128 size_t len,
129 const icu::Locale& locale) {
130 result->clear();
131 result->resize(len);
132 const size_t paddedLen = len + 2; // start and stop code each count for 1
133 if (patternData != nullptr && len >= minPrefix + minSuffix &&
134 paddedLen <= MAX_HYPHENATED_SIZE) {
135 uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
136 const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word, len);
137 if (hyphenValue != HyphenationType::DONT_BREAK) {
138 hyphenateFromCodes(result->data(), alpha_codes, paddedLen, hyphenValue);
139 return;
140 }
141 // TODO: try NFC normalization
142 // TODO: handle non-BMP Unicode (requires remapping of offsets)
143 }
144 // Note that we will always get here if the word contains a hyphen or a soft
145 // hyphen, because the alphabet is not expected to contain a hyphen or a soft
146 // hyphen character, so alphabetLookup would return DONT_BREAK.
147 hyphenateWithNoPatterns(result->data(), word, len, locale);
148 }
149
150 // This function determines whether a character is like U+2010 HYPHEN in
151 // line breaking and usage: a character immediately after which line breaks
152 // are allowed, but words containing it should not be automatically
153 // hyphenated using patterns. This is a curated set, created by manually
154 // inspecting all the characters that have the Unicode line breaking
155 // property of BA or HY and seeing which ones are hyphens.
isLineBreakingHyphen(uint32_t c)156 bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
157 return (c == 0x002D || // HYPHEN-MINUS
158 c == 0x058A || // ARMENIAN HYPHEN
159 c == 0x05BE || // HEBREW PUNCTUATION MAQAF
160 c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
161 c == 0x2010 || // HYPHEN
162 c == 0x2013 || // EN DASH
163 c == 0x2027 || // HYPHENATION POINT
164 c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
165 c == 0x2E40); // DOUBLE HYPHEN
166 }
167
168 const static uint32_t HYPHEN_STR[] = {0x2010, 0};
169 const static uint32_t ARMENIAN_HYPHEN_STR[] = {0x058A, 0};
170 const static uint32_t MAQAF_STR[] = {0x05BE, 0};
171 const static uint32_t UCAS_HYPHEN_STR[] = {0x1400, 0};
172 const static uint32_t ZWJ_STR[] = {0x200D, 0};
173 const static uint32_t ZWJ_AND_HYPHEN_STR[] = {0x200D, 0x2010, 0};
174
getHyphenString(uint32_t hyph)175 const uint32_t* HyphenEdit::getHyphenString(uint32_t hyph) {
176 switch (hyph) {
177 case INSERT_HYPHEN_AT_END:
178 case REPLACE_WITH_HYPHEN_AT_END:
179 case INSERT_HYPHEN_AT_START:
180 return HYPHEN_STR;
181 case INSERT_ARMENIAN_HYPHEN_AT_END:
182 return ARMENIAN_HYPHEN_STR;
183 case INSERT_MAQAF_AT_END:
184 return MAQAF_STR;
185 case INSERT_UCAS_HYPHEN_AT_END:
186 return UCAS_HYPHEN_STR;
187 case INSERT_ZWJ_AND_HYPHEN_AT_END:
188 return ZWJ_AND_HYPHEN_STR;
189 case INSERT_ZWJ_AT_START:
190 return ZWJ_STR;
191 default:
192 return nullptr;
193 }
194 }
195
editForThisLine(HyphenationType type)196 uint32_t HyphenEdit::editForThisLine(HyphenationType type) {
197 switch (type) {
198 case HyphenationType::DONT_BREAK:
199 return NO_EDIT;
200 case HyphenationType::BREAK_AND_INSERT_HYPHEN:
201 return INSERT_HYPHEN_AT_END;
202 case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
203 return INSERT_ARMENIAN_HYPHEN_AT_END;
204 case HyphenationType::BREAK_AND_INSERT_MAQAF:
205 return INSERT_MAQAF_AT_END;
206 case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN:
207 return INSERT_UCAS_HYPHEN_AT_END;
208 case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN:
209 return REPLACE_WITH_HYPHEN_AT_END;
210 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
211 return INSERT_ZWJ_AND_HYPHEN_AT_END;
212 default:
213 return BREAK_AT_END;
214 }
215 }
216
editForNextLine(HyphenationType type)217 uint32_t HyphenEdit::editForNextLine(HyphenationType type) {
218 switch (type) {
219 case HyphenationType::DONT_BREAK:
220 return NO_EDIT;
221 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
222 return INSERT_HYPHEN_AT_START;
223 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
224 return INSERT_ZWJ_AT_START;
225 default:
226 return BREAK_AT_START;
227 }
228 }
229
getScript(uint32_t codePoint)230 static UScriptCode getScript(uint32_t codePoint) {
231 UErrorCode errorCode = U_ZERO_ERROR;
232 const UScriptCode script =
233 uscript_getScript(static_cast<UChar32>(codePoint), &errorCode);
234 if (U_SUCCESS(errorCode)) {
235 return script;
236 } else {
237 return USCRIPT_INVALID_CODE;
238 }
239 }
240
hyphenationTypeBasedOnScript(uint32_t codePoint)241 static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
242 // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the
243 // "correct" hyphen for Hebrew, modern practice may have shifted towards
244 // Western hyphens. We use normal hyphens for now to be safe.
245 // BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch to
246 // maqaf for Hebrew, we can simply add a condition here.
247 const UScriptCode script = getScript(codePoint);
248 if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM ||
249 script == USCRIPT_TAMIL || script == USCRIPT_TELUGU) {
250 // Grantha is not included, since we don't support non-BMP hyphenation yet.
251 return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
252 } else if (script == USCRIPT_ARMENIAN) {
253 return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
254 } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
255 return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
256 } else {
257 return HyphenationType::BREAK_AND_INSERT_HYPHEN;
258 }
259 }
260
getJoiningType(UChar32 codepoint)261 static inline int32_t getJoiningType(UChar32 codepoint) {
262 return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
263 }
264
265 // Assumption for caller: location must be >= 2 and word[location] ==
266 // CHAR_SOFT_HYPHEN. This function decides if the letters before and after the
267 // hyphen should appear as joining.
getHyphTypeForArabic(const uint16_t * word,size_t len,size_t location)268 static inline HyphenationType getHyphTypeForArabic(const uint16_t* word,
269 size_t len,
270 size_t location) {
271 ssize_t i = location;
272 int32_t type = U_JT_NON_JOINING;
273 while (static_cast<size_t>(i) < len &&
274 (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
275 i++;
276 }
277 if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING ||
278 type == U_JT_JOIN_CAUSING) {
279 // The next character is of the type that may join the last character. See
280 // if the last character is also of the right type.
281 i = location - 2; // Skip the soft hyphen
282 type = U_JT_NON_JOINING;
283 while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
284 i--;
285 }
286 if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING ||
287 type == U_JT_JOIN_CAUSING) {
288 return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ;
289 }
290 }
291 return HyphenationType::BREAK_AND_INSERT_HYPHEN;
292 }
293
294 // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for
295 // hyphenating words that didn't match patterns, especially words that contain
296 // hyphens or soft hyphens (See sections 5.3, Use of Hyphen, and 5.4, Use of
297 // Soft Hyphen).
hyphenateWithNoPatterns(HyphenationType * result,const uint16_t * word,size_t len,const icu::Locale & locale)298 void Hyphenator::hyphenateWithNoPatterns(HyphenationType* result,
299 const uint16_t* word,
300 size_t len,
301 const icu::Locale& locale) {
302 result[0] = HyphenationType::DONT_BREAK;
303 for (size_t i = 1; i < len; i++) {
304 const uint16_t prevChar = word[i - 1];
305 if (i > 1 && isLineBreakingHyphen(prevChar)) {
306 // Break after hyphens, but only if they don't start the word.
307
308 if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) &&
309 strcmp(locale.getLanguage(), "pl") == 0 &&
310 getScript(word[i]) == USCRIPT_LATIN) {
311 // In Polish, hyphens get repeated at the next line. To be safe,
312 // we will do this only if the next character is Latin.
313 result[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
314 } else {
315 result[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
316 }
317 } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) {
318 // Break after soft hyphens, but only if they don't start the word (a soft
319 // hyphen starting the word doesn't give any useful break opportunities).
320 // The type of the break is based on the script of the character we break
321 // on.
322 if (getScript(word[i]) == USCRIPT_ARABIC) {
323 // For Arabic, we need to look and see if the characters around the soft
324 // hyphen actually join. If they don't, we'll just insert a normal
325 // hyphen.
326 result[i] = getHyphTypeForArabic(word, len, i);
327 } else {
328 result[i] = hyphenationTypeBasedOnScript(word[i]);
329 }
330 } else if (prevChar == CHAR_MIDDLE_DOT && minPrefix < i &&
331 i <= len - minSuffix &&
332 ((word[i - 2] == 'l' && word[i] == 'l') ||
333 (word[i - 2] == 'L' && word[i] == 'L')) &&
334 strcmp(locale.getLanguage(), "ca") == 0) {
335 // In Catalan, "l·l" should break as "l-" on the first line
336 // and "l" on the next line.
337 result[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN;
338 } else {
339 result[i] = HyphenationType::DONT_BREAK;
340 }
341 }
342 }
343
alphabetLookup(uint16_t * alpha_codes,const uint16_t * word,size_t len)344 HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes,
345 const uint16_t* word,
346 size_t len) {
347 const Header* header = getHeader();
348 HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN;
349 // TODO: check header magic
350 uint32_t alphabetVersion = header->alphabetVersion();
351 if (alphabetVersion == 0) {
352 const AlphabetTable0* alphabet = header->alphabetTable0();
353 uint32_t min_codepoint = alphabet->min_codepoint;
354 uint32_t max_codepoint = alphabet->max_codepoint;
355 alpha_codes[0] = 0; // word start
356 for (size_t i = 0; i < len; i++) {
357 uint16_t c = word[i];
358 if (c < min_codepoint || c >= max_codepoint) {
359 return HyphenationType::DONT_BREAK;
360 }
361 uint8_t code = alphabet->data[c - min_codepoint];
362 if (code == 0) {
363 return HyphenationType::DONT_BREAK;
364 }
365 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
366 result = hyphenationTypeBasedOnScript(c);
367 }
368 alpha_codes[i + 1] = code;
369 }
370 alpha_codes[len + 1] = 0; // word termination
371 return result;
372 } else if (alphabetVersion == 1) {
373 const AlphabetTable1* alphabet = header->alphabetTable1();
374 size_t n_entries = alphabet->n_entries;
375 const uint32_t* begin = alphabet->data;
376 const uint32_t* end = begin + n_entries;
377 alpha_codes[0] = 0;
378 for (size_t i = 0; i < len; i++) {
379 uint16_t c = word[i];
380 auto p = std::lower_bound<const uint32_t*, uint32_t>(begin, end, c << 11);
381 if (p == end) {
382 return HyphenationType::DONT_BREAK;
383 }
384 uint32_t entry = *p;
385 if (AlphabetTable1::codepoint(entry) != c) {
386 return HyphenationType::DONT_BREAK;
387 }
388 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
389 result = hyphenationTypeBasedOnScript(c);
390 }
391 alpha_codes[i + 1] = AlphabetTable1::value(entry);
392 }
393 alpha_codes[len + 1] = 0;
394 return result;
395 }
396 return HyphenationType::DONT_BREAK;
397 }
398
399 /**
400 * Internal implementation, after conversion to codes. All case folding and
401 *normalization has been done by now, and all characters have been found in the
402 *alphabet. Note: len here is the padded length including 0 codes at start and
403 *end.
404 **/
hyphenateFromCodes(HyphenationType * result,const uint16_t * codes,size_t len,HyphenationType hyphenValue)405 void Hyphenator::hyphenateFromCodes(HyphenationType* result,
406 const uint16_t* codes,
407 size_t len,
408 HyphenationType hyphenValue) {
409 static_assert(sizeof(HyphenationType) == sizeof(uint8_t),
410 "HyphnationType must be uint8_t.");
411 // Reuse the result array as a buffer for calculating intermediate hyphenation
412 // numbers.
413 uint8_t* buffer = reinterpret_cast<uint8_t*>(result);
414
415 const Header* header = getHeader();
416 const Trie* trie = header->trieTable();
417 const Pattern* pattern = header->patternTable();
418 uint32_t char_mask = trie->char_mask;
419 uint32_t link_shift = trie->link_shift;
420 uint32_t link_mask = trie->link_mask;
421 uint32_t pattern_shift = trie->pattern_shift;
422 size_t maxOffset = len - minSuffix - 1;
423 for (size_t i = 0; i < len - 1; i++) {
424 uint32_t node = 0; // index into Trie table
425 for (size_t j = i; j < len; j++) {
426 uint16_t c = codes[j];
427 uint32_t entry = trie->data[node + c];
428 if ((entry & char_mask) == c) {
429 node = (entry & link_mask) >> link_shift;
430 } else {
431 break;
432 }
433 uint32_t pat_ix = trie->data[node] >> pattern_shift;
434 // pat_ix contains a 3-tuple of length, shift (number of trailing zeros),
435 // and an offset into the buf pool. This is the pattern for the substring
436 // (i..j) we just matched, which we combine (via point-wise max) into the
437 // buffer vector.
438 if (pat_ix != 0) {
439 uint32_t pat_entry = pattern->data[pat_ix];
440 int pat_len = Pattern::len(pat_entry);
441 int pat_shift = Pattern::shift(pat_entry);
442 const uint8_t* pat_buf = pattern->buf(pat_entry);
443 int offset = j + 1 - (pat_len + pat_shift);
444 // offset is the index within buffer that lines up with the start of
445 // pat_buf
446 int start = std::max((int)minPrefix - offset, 0);
447 int end = std::min(pat_len, (int)maxOffset - offset);
448 for (int k = start; k < end; k++) {
449 buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]);
450 }
451 }
452 }
453 }
454 // Since the above calculation does not modify values outside
455 // [minPrefix, len - minSuffix], they are left as 0 = DONT_BREAK.
456 for (size_t i = minPrefix; i < maxOffset; i++) {
457 // Hyphenation opportunities happen when the hyphenation numbers are odd.
458 result[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK;
459 }
460 }
461
462 } // namespace minikin
463