1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <vector>
18 #include <memory>
19 #include <algorithm>
20 #include <string>
21 #include <unicode/uchar.h>
22 #include <unicode/uscript.h>
23
24 // HACK: for reading pattern file
25 #include <fcntl.h>
26
27 #define LOG_TAG "Minikin"
28 #include "utils/Log.h"
29
30 #include "minikin/Hyphenator.h"
31
32 using std::vector;
33
34 namespace minikin {
35
36 static const uint16_t CHAR_HYPHEN_MINUS = 0x002D;
37 static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
38 static const uint16_t CHAR_MIDDLE_DOT = 0x00B7;
39 static const uint16_t CHAR_HYPHEN = 0x2010;
40
41 // The following are structs that correspond to tables inside the hyb file format
42
43 struct AlphabetTable0 {
44 uint32_t version;
45 uint32_t min_codepoint;
46 uint32_t max_codepoint;
47 uint8_t data[1]; // actually flexible array, size is known at runtime
48 };
49
50 struct AlphabetTable1 {
51 uint32_t version;
52 uint32_t n_entries;
53 uint32_t data[1]; // actually flexible array, size is known at runtime
54
codepointminikin::AlphabetTable155 static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
valueminikin::AlphabetTable156 static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
57 };
58
59 struct Trie {
60 uint32_t version;
61 uint32_t char_mask;
62 uint32_t link_shift;
63 uint32_t link_mask;
64 uint32_t pattern_shift;
65 uint32_t n_entries;
66 uint32_t data[1]; // actually flexible array, size is known at runtime
67 };
68
69 struct Pattern {
70 uint32_t version;
71 uint32_t n_entries;
72 uint32_t pattern_offset;
73 uint32_t pattern_size;
74 uint32_t data[1]; // actually flexible array, size is known at runtime
75
76 // accessors
lenminikin::Pattern77 static uint32_t len(uint32_t entry) { return entry >> 26; }
shiftminikin::Pattern78 static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
bufminikin::Pattern79 const uint8_t* buf(uint32_t entry) const {
80 return reinterpret_cast<const uint8_t*>(this) + pattern_offset + (entry & 0xfffff);
81 }
82 };
83
84 struct Header {
85 uint32_t magic;
86 uint32_t version;
87 uint32_t alphabet_offset;
88 uint32_t trie_offset;
89 uint32_t pattern_offset;
90 uint32_t file_size;
91
92 // accessors
bytesminikin::Header93 const uint8_t* bytes() const { return reinterpret_cast<const uint8_t*>(this); }
alphabetVersionminikin::Header94 uint32_t alphabetVersion() const {
95 return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
96 }
alphabetTable0minikin::Header97 const AlphabetTable0* alphabetTable0() const {
98 return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
99 }
alphabetTable1minikin::Header100 const AlphabetTable1* alphabetTable1() const {
101 return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
102 }
trieTableminikin::Header103 const Trie* trieTable() const {
104 return reinterpret_cast<const Trie*>(bytes() + trie_offset);
105 }
patternTableminikin::Header106 const Pattern* patternTable() const {
107 return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
108 }
109 };
110
loadBinary(const uint8_t * patternData,size_t minPrefix,size_t minSuffix)111 Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix) {
112 Hyphenator* result = new Hyphenator;
113 result->patternData = patternData;
114 result->minPrefix = minPrefix;
115 result->minSuffix = minSuffix;
116 return result;
117 }
118
hyphenate(vector<HyphenationType> * result,const uint16_t * word,size_t len,const icu::Locale & locale)119 void Hyphenator::hyphenate(vector<HyphenationType>* result, const uint16_t* word, size_t len,
120 const icu::Locale& locale) {
121 result->clear();
122 result->resize(len);
123 const size_t paddedLen = len + 2; // start and stop code each count for 1
124 if (patternData != nullptr &&
125 len >= minPrefix + minSuffix && paddedLen <= MAX_HYPHENATED_SIZE) {
126 uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
127 const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word, len);
128 if (hyphenValue != HyphenationType::DONT_BREAK) {
129 hyphenateFromCodes(result->data(), alpha_codes, paddedLen, hyphenValue);
130 return;
131 }
132 // TODO: try NFC normalization
133 // TODO: handle non-BMP Unicode (requires remapping of offsets)
134 }
135 // Note that we will always get here if the word contains a hyphen or a soft hyphen, because the
136 // alphabet is not expected to contain a hyphen or a soft hyphen character, so alphabetLookup
137 // would return DONT_BREAK.
138 hyphenateWithNoPatterns(result->data(), word, len, locale);
139 }
140
141 // This function determines whether a character is like U+2010 HYPHEN in
142 // line breaking and usage: a character immediately after which line breaks
143 // are allowed, but words containing it should not be automatically
144 // hyphenated using patterns. This is a curated set, created by manually
145 // inspecting all the characters that have the Unicode line breaking
146 // property of BA or HY and seeing which ones are hyphens.
isLineBreakingHyphen(uint32_t c)147 bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
148 return (c == 0x002D || // HYPHEN-MINUS
149 c == 0x058A || // ARMENIAN HYPHEN
150 c == 0x05BE || // HEBREW PUNCTUATION MAQAF
151 c == 0x1400 || // CANADIAN SYLLABICS HYPHEN
152 c == 0x2010 || // HYPHEN
153 c == 0x2013 || // EN DASH
154 c == 0x2027 || // HYPHENATION POINT
155 c == 0x2E17 || // DOUBLE OBLIQUE HYPHEN
156 c == 0x2E40); // DOUBLE HYPHEN
157 }
158
159 const static uint32_t HYPHEN_STR[] = {0x2010, 0};
160 const static uint32_t ARMENIAN_HYPHEN_STR[] = {0x058A, 0};
161 const static uint32_t MAQAF_STR[] = {0x05BE, 0};
162 const static uint32_t UCAS_HYPHEN_STR[] = {0x1400, 0};
163 const static uint32_t ZWJ_STR[] = {0x200D, 0};
164 const static uint32_t ZWJ_AND_HYPHEN_STR[] = {0x200D, 0x2010, 0};
165
getHyphenString(uint32_t hyph)166 const uint32_t* HyphenEdit::getHyphenString(uint32_t hyph) {
167 switch (hyph) {
168 case INSERT_HYPHEN_AT_END:
169 case REPLACE_WITH_HYPHEN_AT_END:
170 case INSERT_HYPHEN_AT_START:
171 return HYPHEN_STR;
172 case INSERT_ARMENIAN_HYPHEN_AT_END:
173 return ARMENIAN_HYPHEN_STR;
174 case INSERT_MAQAF_AT_END:
175 return MAQAF_STR;
176 case INSERT_UCAS_HYPHEN_AT_END:
177 return UCAS_HYPHEN_STR;
178 case INSERT_ZWJ_AND_HYPHEN_AT_END:
179 return ZWJ_AND_HYPHEN_STR;
180 case INSERT_ZWJ_AT_START:
181 return ZWJ_STR;
182 default:
183 return nullptr;
184 }
185 }
186
editForThisLine(HyphenationType type)187 uint32_t HyphenEdit::editForThisLine(HyphenationType type) {
188 switch (type) {
189 case HyphenationType::DONT_BREAK:
190 return NO_EDIT;
191 case HyphenationType::BREAK_AND_INSERT_HYPHEN:
192 return INSERT_HYPHEN_AT_END;
193 case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
194 return INSERT_ARMENIAN_HYPHEN_AT_END;
195 case HyphenationType::BREAK_AND_INSERT_MAQAF:
196 return INSERT_MAQAF_AT_END;
197 case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN:
198 return INSERT_UCAS_HYPHEN_AT_END;
199 case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN:
200 return REPLACE_WITH_HYPHEN_AT_END;
201 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
202 return INSERT_ZWJ_AND_HYPHEN_AT_END;
203 default:
204 return BREAK_AT_END;
205 }
206 }
207
editForNextLine(HyphenationType type)208 uint32_t HyphenEdit::editForNextLine(HyphenationType type) {
209 switch (type) {
210 case HyphenationType::DONT_BREAK:
211 return NO_EDIT;
212 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
213 return INSERT_HYPHEN_AT_START;
214 case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
215 return INSERT_ZWJ_AT_START;
216 default:
217 return BREAK_AT_START;
218 }
219 }
220
getScript(uint32_t codePoint)221 static UScriptCode getScript(uint32_t codePoint) {
222 UErrorCode errorCode = U_ZERO_ERROR;
223 const UScriptCode script = uscript_getScript(static_cast<UChar32>(codePoint), &errorCode);
224 if (U_SUCCESS(errorCode)) {
225 return script;
226 } else {
227 return USCRIPT_INVALID_CODE;
228 }
229 }
230
hyphenationTypeBasedOnScript(uint32_t codePoint)231 static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
232 // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the "correct" hyphen
233 // for Hebrew, modern practice may have shifted towards Western hyphens. We use normal hyphens
234 // for now to be safe. BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch
235 // to maqaf for Hebrew, we can simply add a condition here.
236 const UScriptCode script = getScript(codePoint);
237 if (script == USCRIPT_KANNADA
238 || script == USCRIPT_MALAYALAM
239 || script == USCRIPT_TAMIL
240 || script == USCRIPT_TELUGU) {
241 // Grantha is not included, since we don't support non-BMP hyphenation yet.
242 return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
243 } else if (script == USCRIPT_ARMENIAN) {
244 return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
245 } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
246 return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
247 } else {
248 return HyphenationType::BREAK_AND_INSERT_HYPHEN;
249 }
250 }
251
getJoiningType(UChar32 codepoint)252 static inline int32_t getJoiningType(UChar32 codepoint) {
253 return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
254 }
255
256 // Assumption for caller: location must be >= 2 and word[location] == CHAR_SOFT_HYPHEN.
257 // This function decides if the letters before and after the hyphen should appear as joining.
getHyphTypeForArabic(const uint16_t * word,size_t len,size_t location)258 static inline HyphenationType getHyphTypeForArabic(const uint16_t* word, size_t len,
259 size_t location) {
260 ssize_t i = location;
261 int32_t type = U_JT_NON_JOINING;
262 while (static_cast<size_t>(i) < len && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
263 i++;
264 }
265 if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING || type == U_JT_JOIN_CAUSING) {
266 // The next character is of the type that may join the last character. See if the last
267 // character is also of the right type.
268 i = location - 2; // Skip the soft hyphen
269 type = U_JT_NON_JOINING;
270 while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
271 i--;
272 }
273 if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING || type == U_JT_JOIN_CAUSING) {
274 return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ;
275 }
276 }
277 return HyphenationType::BREAK_AND_INSERT_HYPHEN;
278 }
279
280 // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for hyphenating words
281 // that didn't match patterns, especially words that contain hyphens or soft hyphens (See sections
282 // 5.3, Use of Hyphen, and 5.4, Use of Soft Hyphen).
hyphenateWithNoPatterns(HyphenationType * result,const uint16_t * word,size_t len,const icu::Locale & locale)283 void Hyphenator::hyphenateWithNoPatterns(HyphenationType* result, const uint16_t* word, size_t len,
284 const icu::Locale& locale) {
285 result[0] = HyphenationType::DONT_BREAK;
286 for (size_t i = 1; i < len; i++) {
287 const uint16_t prevChar = word[i - 1];
288 if (i > 1 && isLineBreakingHyphen(prevChar)) {
289 // Break after hyphens, but only if they don't start the word.
290
291 if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN)
292 && strcmp(locale.getLanguage(), "pl") == 0
293 && getScript(word[i]) == USCRIPT_LATIN ) {
294 // In Polish, hyphens get repeated at the next line. To be safe,
295 // we will do this only if the next character is Latin.
296 result[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
297 } else {
298 result[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
299 }
300 } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) {
301 // Break after soft hyphens, but only if they don't start the word (a soft hyphen
302 // starting the word doesn't give any useful break opportunities). The type of the break
303 // is based on the script of the character we break on.
304 if (getScript(word[i]) == USCRIPT_ARABIC) {
305 // For Arabic, we need to look and see if the characters around the soft hyphen
306 // actually join. If they don't, we'll just insert a normal hyphen.
307 result[i] = getHyphTypeForArabic(word, len, i);
308 } else {
309 result[i] = hyphenationTypeBasedOnScript(word[i]);
310 }
311 } else if (prevChar == CHAR_MIDDLE_DOT
312 && minPrefix < i && i <= len - minSuffix
313 && ((word[i - 2] == 'l' && word[i] == 'l')
314 || (word[i - 2] == 'L' && word[i] == 'L'))
315 && strcmp(locale.getLanguage(), "ca") == 0) {
316 // In Catalan, "l·l" should break as "l-" on the first line
317 // and "l" on the next line.
318 result[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN;
319 } else {
320 result[i] = HyphenationType::DONT_BREAK;
321 }
322 }
323 }
324
alphabetLookup(uint16_t * alpha_codes,const uint16_t * word,size_t len)325 HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes, const uint16_t* word,
326 size_t len) {
327 const Header* header = getHeader();
328 HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN;
329 // TODO: check header magic
330 uint32_t alphabetVersion = header->alphabetVersion();
331 if (alphabetVersion == 0) {
332 const AlphabetTable0* alphabet = header->alphabetTable0();
333 uint32_t min_codepoint = alphabet->min_codepoint;
334 uint32_t max_codepoint = alphabet->max_codepoint;
335 alpha_codes[0] = 0; // word start
336 for (size_t i = 0; i < len; i++) {
337 uint16_t c = word[i];
338 if (c < min_codepoint || c >= max_codepoint) {
339 return HyphenationType::DONT_BREAK;
340 }
341 uint8_t code = alphabet->data[c - min_codepoint];
342 if (code == 0) {
343 return HyphenationType::DONT_BREAK;
344 }
345 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
346 result = hyphenationTypeBasedOnScript(c);
347 }
348 alpha_codes[i + 1] = code;
349 }
350 alpha_codes[len + 1] = 0; // word termination
351 return result;
352 } else if (alphabetVersion == 1) {
353 const AlphabetTable1* alphabet = header->alphabetTable1();
354 size_t n_entries = alphabet->n_entries;
355 const uint32_t* begin = alphabet->data;
356 const uint32_t* end = begin + n_entries;
357 alpha_codes[0] = 0;
358 for (size_t i = 0; i < len; i++) {
359 uint16_t c = word[i];
360 auto p = std::lower_bound(begin, end, c << 11);
361 if (p == end) {
362 return HyphenationType::DONT_BREAK;
363 }
364 uint32_t entry = *p;
365 if (AlphabetTable1::codepoint(entry) != c) {
366 return HyphenationType::DONT_BREAK;
367 }
368 if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
369 result = hyphenationTypeBasedOnScript(c);
370 }
371 alpha_codes[i + 1] = AlphabetTable1::value(entry);
372 }
373 alpha_codes[len + 1] = 0;
374 return result;
375 }
376 return HyphenationType::DONT_BREAK;
377 }
378
379 /**
380 * Internal implementation, after conversion to codes. All case folding and normalization
381 * has been done by now, and all characters have been found in the alphabet.
382 * Note: len here is the padded length including 0 codes at start and end.
383 **/
hyphenateFromCodes(HyphenationType * result,const uint16_t * codes,size_t len,HyphenationType hyphenValue)384 void Hyphenator::hyphenateFromCodes(HyphenationType* result, const uint16_t* codes, size_t len,
385 HyphenationType hyphenValue) {
386 static_assert(sizeof(HyphenationType) == sizeof(uint8_t), "HyphnationType must be uint8_t.");
387 // Reuse the result array as a buffer for calculating intermediate hyphenation numbers.
388 uint8_t* buffer = reinterpret_cast<uint8_t*>(result);
389
390 const Header* header = getHeader();
391 const Trie* trie = header->trieTable();
392 const Pattern* pattern = header->patternTable();
393 uint32_t char_mask = trie->char_mask;
394 uint32_t link_shift = trie->link_shift;
395 uint32_t link_mask = trie->link_mask;
396 uint32_t pattern_shift = trie->pattern_shift;
397 size_t maxOffset = len - minSuffix - 1;
398 for (size_t i = 0; i < len - 1; i++) {
399 uint32_t node = 0; // index into Trie table
400 for (size_t j = i; j < len; j++) {
401 uint16_t c = codes[j];
402 uint32_t entry = trie->data[node + c];
403 if ((entry & char_mask) == c) {
404 node = (entry & link_mask) >> link_shift;
405 } else {
406 break;
407 }
408 uint32_t pat_ix = trie->data[node] >> pattern_shift;
409 // pat_ix contains a 3-tuple of length, shift (number of trailing zeros), and an offset
410 // into the buf pool. This is the pattern for the substring (i..j) we just matched,
411 // which we combine (via point-wise max) into the buffer vector.
412 if (pat_ix != 0) {
413 uint32_t pat_entry = pattern->data[pat_ix];
414 int pat_len = Pattern::len(pat_entry);
415 int pat_shift = Pattern::shift(pat_entry);
416 const uint8_t* pat_buf = pattern->buf(pat_entry);
417 int offset = j + 1 - (pat_len + pat_shift);
418 // offset is the index within buffer that lines up with the start of pat_buf
419 int start = std::max((int)minPrefix - offset, 0);
420 int end = std::min(pat_len, (int)maxOffset - offset);
421 for (int k = start; k < end; k++) {
422 buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]);
423 }
424 }
425 }
426 }
427 // Since the above calculation does not modify values outside
428 // [minPrefix, len - minSuffix], they are left as 0 = DONT_BREAK.
429 for (size_t i = minPrefix; i < maxOffset; i++) {
430 // Hyphenation opportunities happen when the hyphenation numbers are odd.
431 result[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK;
432 }
433 }
434
435 } // namespace minikin
436