• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <unicode/uchar.h>
18 #include <unicode/uscript.h>
19 #include <algorithm>
20 #include <memory>
21 #include <string>
22 #include <cstring>
23 #include <vector>
24 
25 // HACK: for reading pattern file
26 #include <fcntl.h>
27 
28 #define LOG_TAG "Minikin"
29 
30 #include "minikin/Hyphenator.h"
31 #include "utils/WindowsUtils.h"
32 
33 using std::vector;
34 
35 namespace minikin {
36 
37 static const uint16_t CHAR_HYPHEN_MINUS = 0x002D;
38 static const uint16_t CHAR_SOFT_HYPHEN = 0x00AD;
39 static const uint16_t CHAR_MIDDLE_DOT = 0x00B7;
40 static const uint16_t CHAR_HYPHEN = 0x2010;
41 
42 // The following are structs that correspond to tables inside the hyb file
43 // format
44 
45 struct AlphabetTable0 {
46   uint32_t version;
47   uint32_t min_codepoint;
48   uint32_t max_codepoint;
49   uint8_t data[1];  // actually flexible array, size is known at runtime
50 };
51 
52 struct AlphabetTable1 {
53   uint32_t version;
54   uint32_t n_entries;
55   uint32_t data[1];  // actually flexible array, size is known at runtime
56 
codepointminikin::AlphabetTable157   static uint32_t codepoint(uint32_t entry) { return entry >> 11; }
valueminikin::AlphabetTable158   static uint32_t value(uint32_t entry) { return entry & 0x7ff; }
59 };
60 
61 struct Trie {
62   uint32_t version;
63   uint32_t char_mask;
64   uint32_t link_shift;
65   uint32_t link_mask;
66   uint32_t pattern_shift;
67   uint32_t n_entries;
68   uint32_t data[1];  // actually flexible array, size is known at runtime
69 };
70 
71 struct Pattern {
72   uint32_t version;
73   uint32_t n_entries;
74   uint32_t pattern_offset;
75   uint32_t pattern_size;
76   uint32_t data[1];  // actually flexible array, size is known at runtime
77 
78   // accessors
lenminikin::Pattern79   static uint32_t len(uint32_t entry) { return entry >> 26; }
shiftminikin::Pattern80   static uint32_t shift(uint32_t entry) { return (entry >> 20) & 0x3f; }
bufminikin::Pattern81   const uint8_t* buf(uint32_t entry) const {
82     return reinterpret_cast<const uint8_t*>(this) + pattern_offset +
83            (entry & 0xfffff);
84   }
85 };
86 
87 struct Header {
88   uint32_t magic;
89   uint32_t version;
90   uint32_t alphabet_offset;
91   uint32_t trie_offset;
92   uint32_t pattern_offset;
93   uint32_t file_size;
94 
95   // accessors
bytesminikin::Header96   const uint8_t* bytes() const {
97     return reinterpret_cast<const uint8_t*>(this);
98   }
alphabetVersionminikin::Header99   uint32_t alphabetVersion() const {
100     return *reinterpret_cast<const uint32_t*>(bytes() + alphabet_offset);
101   }
alphabetTable0minikin::Header102   const AlphabetTable0* alphabetTable0() const {
103     return reinterpret_cast<const AlphabetTable0*>(bytes() + alphabet_offset);
104   }
alphabetTable1minikin::Header105   const AlphabetTable1* alphabetTable1() const {
106     return reinterpret_cast<const AlphabetTable1*>(bytes() + alphabet_offset);
107   }
trieTableminikin::Header108   const Trie* trieTable() const {
109     return reinterpret_cast<const Trie*>(bytes() + trie_offset);
110   }
patternTableminikin::Header111   const Pattern* patternTable() const {
112     return reinterpret_cast<const Pattern*>(bytes() + pattern_offset);
113   }
114 };
115 
loadBinary(const uint8_t * patternData,size_t minPrefix,size_t minSuffix)116 Hyphenator* Hyphenator::loadBinary(const uint8_t* patternData,
117                                    size_t minPrefix,
118                                    size_t minSuffix) {
119   Hyphenator* result = new Hyphenator;
120   result->patternData = patternData;
121   result->minPrefix = minPrefix;
122   result->minSuffix = minSuffix;
123   return result;
124 }
125 
hyphenate(vector<HyphenationType> * result,const uint16_t * word,size_t len,const icu::Locale & locale)126 void Hyphenator::hyphenate(vector<HyphenationType>* result,
127                            const uint16_t* word,
128                            size_t len,
129                            const icu::Locale& locale) {
130   result->clear();
131   result->resize(len);
132   const size_t paddedLen = len + 2;  // start and stop code each count for 1
133   if (patternData != nullptr && len >= minPrefix + minSuffix &&
134       paddedLen <= MAX_HYPHENATED_SIZE) {
135     uint16_t alpha_codes[MAX_HYPHENATED_SIZE];
136     const HyphenationType hyphenValue = alphabetLookup(alpha_codes, word, len);
137     if (hyphenValue != HyphenationType::DONT_BREAK) {
138       hyphenateFromCodes(result->data(), alpha_codes, paddedLen, hyphenValue);
139       return;
140     }
141     // TODO: try NFC normalization
142     // TODO: handle non-BMP Unicode (requires remapping of offsets)
143   }
144   // Note that we will always get here if the word contains a hyphen or a soft
145   // hyphen, because the alphabet is not expected to contain a hyphen or a soft
146   // hyphen character, so alphabetLookup would return DONT_BREAK.
147   hyphenateWithNoPatterns(result->data(), word, len, locale);
148 }
149 
150 // This function determines whether a character is like U+2010 HYPHEN in
151 // line breaking and usage: a character immediately after which line breaks
152 // are allowed, but words containing it should not be automatically
153 // hyphenated using patterns. This is a curated set, created by manually
154 // inspecting all the characters that have the Unicode line breaking
155 // property of BA or HY and seeing which ones are hyphens.
isLineBreakingHyphen(uint32_t c)156 bool Hyphenator::isLineBreakingHyphen(uint32_t c) {
157   return (c == 0x002D ||  // HYPHEN-MINUS
158           c == 0x058A ||  // ARMENIAN HYPHEN
159           c == 0x05BE ||  // HEBREW PUNCTUATION MAQAF
160           c == 0x1400 ||  // CANADIAN SYLLABICS HYPHEN
161           c == 0x2010 ||  // HYPHEN
162           c == 0x2013 ||  // EN DASH
163           c == 0x2027 ||  // HYPHENATION POINT
164           c == 0x2E17 ||  // DOUBLE OBLIQUE HYPHEN
165           c == 0x2E40);   // DOUBLE HYPHEN
166 }
167 
168 const static uint32_t HYPHEN_STR[] = {0x2010, 0};
169 const static uint32_t ARMENIAN_HYPHEN_STR[] = {0x058A, 0};
170 const static uint32_t MAQAF_STR[] = {0x05BE, 0};
171 const static uint32_t UCAS_HYPHEN_STR[] = {0x1400, 0};
172 const static uint32_t ZWJ_STR[] = {0x200D, 0};
173 const static uint32_t ZWJ_AND_HYPHEN_STR[] = {0x200D, 0x2010, 0};
174 
getHyphenString(uint32_t hyph)175 const uint32_t* HyphenEdit::getHyphenString(uint32_t hyph) {
176   switch (hyph) {
177     case INSERT_HYPHEN_AT_END:
178     case REPLACE_WITH_HYPHEN_AT_END:
179     case INSERT_HYPHEN_AT_START:
180       return HYPHEN_STR;
181     case INSERT_ARMENIAN_HYPHEN_AT_END:
182       return ARMENIAN_HYPHEN_STR;
183     case INSERT_MAQAF_AT_END:
184       return MAQAF_STR;
185     case INSERT_UCAS_HYPHEN_AT_END:
186       return UCAS_HYPHEN_STR;
187     case INSERT_ZWJ_AND_HYPHEN_AT_END:
188       return ZWJ_AND_HYPHEN_STR;
189     case INSERT_ZWJ_AT_START:
190       return ZWJ_STR;
191     default:
192       return nullptr;
193   }
194 }
195 
editForThisLine(HyphenationType type)196 uint32_t HyphenEdit::editForThisLine(HyphenationType type) {
197   switch (type) {
198     case HyphenationType::DONT_BREAK:
199       return NO_EDIT;
200     case HyphenationType::BREAK_AND_INSERT_HYPHEN:
201       return INSERT_HYPHEN_AT_END;
202     case HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN:
203       return INSERT_ARMENIAN_HYPHEN_AT_END;
204     case HyphenationType::BREAK_AND_INSERT_MAQAF:
205       return INSERT_MAQAF_AT_END;
206     case HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN:
207       return INSERT_UCAS_HYPHEN_AT_END;
208     case HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN:
209       return REPLACE_WITH_HYPHEN_AT_END;
210     case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
211       return INSERT_ZWJ_AND_HYPHEN_AT_END;
212     default:
213       return BREAK_AT_END;
214   }
215 }
216 
editForNextLine(HyphenationType type)217 uint32_t HyphenEdit::editForNextLine(HyphenationType type) {
218   switch (type) {
219     case HyphenationType::DONT_BREAK:
220       return NO_EDIT;
221     case HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE:
222       return INSERT_HYPHEN_AT_START;
223     case HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ:
224       return INSERT_ZWJ_AT_START;
225     default:
226       return BREAK_AT_START;
227   }
228 }
229 
getScript(uint32_t codePoint)230 static UScriptCode getScript(uint32_t codePoint) {
231   UErrorCode errorCode = U_ZERO_ERROR;
232   const UScriptCode script =
233       uscript_getScript(static_cast<UChar32>(codePoint), &errorCode);
234   if (U_SUCCESS(errorCode)) {
235     return script;
236   } else {
237     return USCRIPT_INVALID_CODE;
238   }
239 }
240 
hyphenationTypeBasedOnScript(uint32_t codePoint)241 static HyphenationType hyphenationTypeBasedOnScript(uint32_t codePoint) {
242   // Note: It's not clear what the best hyphen for Hebrew is. While maqaf is the
243   // "correct" hyphen for Hebrew, modern practice may have shifted towards
244   // Western hyphens. We use normal hyphens for now to be safe.
245   // BREAK_AND_INSERT_MAQAF is already implemented, so if we want to switch to
246   // maqaf for Hebrew, we can simply add a condition here.
247   const UScriptCode script = getScript(codePoint);
248   if (script == USCRIPT_KANNADA || script == USCRIPT_MALAYALAM ||
249       script == USCRIPT_TAMIL || script == USCRIPT_TELUGU) {
250     // Grantha is not included, since we don't support non-BMP hyphenation yet.
251     return HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
252   } else if (script == USCRIPT_ARMENIAN) {
253     return HyphenationType::BREAK_AND_INSERT_ARMENIAN_HYPHEN;
254   } else if (script == USCRIPT_CANADIAN_ABORIGINAL) {
255     return HyphenationType::BREAK_AND_INSERT_UCAS_HYPHEN;
256   } else {
257     return HyphenationType::BREAK_AND_INSERT_HYPHEN;
258   }
259 }
260 
getJoiningType(UChar32 codepoint)261 static inline int32_t getJoiningType(UChar32 codepoint) {
262   return u_getIntPropertyValue(codepoint, UCHAR_JOINING_TYPE);
263 }
264 
265 // Assumption for caller: location must be >= 2 and word[location] ==
266 // CHAR_SOFT_HYPHEN. This function decides if the letters before and after the
267 // hyphen should appear as joining.
getHyphTypeForArabic(const uint16_t * word,size_t len,size_t location)268 static inline HyphenationType getHyphTypeForArabic(const uint16_t* word,
269                                                    size_t len,
270                                                    size_t location) {
271   ssize_t i = location;
272   int32_t type = U_JT_NON_JOINING;
273   while (static_cast<size_t>(i) < len &&
274          (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
275     i++;
276   }
277   if (type == U_JT_DUAL_JOINING || type == U_JT_RIGHT_JOINING ||
278       type == U_JT_JOIN_CAUSING) {
279     // The next character is of the type that may join the last character. See
280     // if the last character is also of the right type.
281     i = location - 2;  // Skip the soft hyphen
282     type = U_JT_NON_JOINING;
283     while (i >= 0 && (type = getJoiningType(word[i])) == U_JT_TRANSPARENT) {
284       i--;
285     }
286     if (type == U_JT_DUAL_JOINING || type == U_JT_LEFT_JOINING ||
287         type == U_JT_JOIN_CAUSING) {
288       return HyphenationType::BREAK_AND_INSERT_HYPHEN_AND_ZWJ;
289     }
290   }
291   return HyphenationType::BREAK_AND_INSERT_HYPHEN;
292 }
293 
294 // Use various recommendations of UAX #14 Unicode Line Breaking Algorithm for
295 // hyphenating words that didn't match patterns, especially words that contain
296 // hyphens or soft hyphens (See sections 5.3, Use of Hyphen, and 5.4, Use of
297 // Soft Hyphen).
hyphenateWithNoPatterns(HyphenationType * result,const uint16_t * word,size_t len,const icu::Locale & locale)298 void Hyphenator::hyphenateWithNoPatterns(HyphenationType* result,
299                                          const uint16_t* word,
300                                          size_t len,
301                                          const icu::Locale& locale) {
302   result[0] = HyphenationType::DONT_BREAK;
303   for (size_t i = 1; i < len; i++) {
304     const uint16_t prevChar = word[i - 1];
305     if (i > 1 && isLineBreakingHyphen(prevChar)) {
306       // Break after hyphens, but only if they don't start the word.
307 
308       if ((prevChar == CHAR_HYPHEN_MINUS || prevChar == CHAR_HYPHEN) &&
309           strcmp(locale.getLanguage(), "pl") == 0 &&
310           getScript(word[i]) == USCRIPT_LATIN) {
311         // In Polish, hyphens get repeated at the next line. To be safe,
312         // we will do this only if the next character is Latin.
313         result[i] = HyphenationType::BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE;
314       } else {
315         result[i] = HyphenationType::BREAK_AND_DONT_INSERT_HYPHEN;
316       }
317     } else if (i > 1 && prevChar == CHAR_SOFT_HYPHEN) {
318       // Break after soft hyphens, but only if they don't start the word (a soft
319       // hyphen starting the word doesn't give any useful break opportunities).
320       // The type of the break is based on the script of the character we break
321       // on.
322       if (getScript(word[i]) == USCRIPT_ARABIC) {
323         // For Arabic, we need to look and see if the characters around the soft
324         // hyphen actually join. If they don't, we'll just insert a normal
325         // hyphen.
326         result[i] = getHyphTypeForArabic(word, len, i);
327       } else {
328         result[i] = hyphenationTypeBasedOnScript(word[i]);
329       }
330     } else if (prevChar == CHAR_MIDDLE_DOT && minPrefix < i &&
331                i <= len - minSuffix &&
332                ((word[i - 2] == 'l' && word[i] == 'l') ||
333                 (word[i - 2] == 'L' && word[i] == 'L')) &&
334                strcmp(locale.getLanguage(), "ca") == 0) {
335       // In Catalan, "l·l" should break as "l-" on the first line
336       // and "l" on the next line.
337       result[i] = HyphenationType::BREAK_AND_REPLACE_WITH_HYPHEN;
338     } else {
339       result[i] = HyphenationType::DONT_BREAK;
340     }
341   }
342 }
343 
alphabetLookup(uint16_t * alpha_codes,const uint16_t * word,size_t len)344 HyphenationType Hyphenator::alphabetLookup(uint16_t* alpha_codes,
345                                            const uint16_t* word,
346                                            size_t len) {
347   const Header* header = getHeader();
348   HyphenationType result = HyphenationType::BREAK_AND_INSERT_HYPHEN;
349   // TODO: check header magic
350   uint32_t alphabetVersion = header->alphabetVersion();
351   if (alphabetVersion == 0) {
352     const AlphabetTable0* alphabet = header->alphabetTable0();
353     uint32_t min_codepoint = alphabet->min_codepoint;
354     uint32_t max_codepoint = alphabet->max_codepoint;
355     alpha_codes[0] = 0;  // word start
356     for (size_t i = 0; i < len; i++) {
357       uint16_t c = word[i];
358       if (c < min_codepoint || c >= max_codepoint) {
359         return HyphenationType::DONT_BREAK;
360       }
361       uint8_t code = alphabet->data[c - min_codepoint];
362       if (code == 0) {
363         return HyphenationType::DONT_BREAK;
364       }
365       if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
366         result = hyphenationTypeBasedOnScript(c);
367       }
368       alpha_codes[i + 1] = code;
369     }
370     alpha_codes[len + 1] = 0;  // word termination
371     return result;
372   } else if (alphabetVersion == 1) {
373     const AlphabetTable1* alphabet = header->alphabetTable1();
374     size_t n_entries = alphabet->n_entries;
375     const uint32_t* begin = alphabet->data;
376     const uint32_t* end = begin + n_entries;
377     alpha_codes[0] = 0;
378     for (size_t i = 0; i < len; i++) {
379       uint16_t c = word[i];
380       auto p = std::lower_bound<const uint32_t*, uint32_t>(begin, end, c << 11);
381       if (p == end) {
382         return HyphenationType::DONT_BREAK;
383       }
384       uint32_t entry = *p;
385       if (AlphabetTable1::codepoint(entry) != c) {
386         return HyphenationType::DONT_BREAK;
387       }
388       if (result == HyphenationType::BREAK_AND_INSERT_HYPHEN) {
389         result = hyphenationTypeBasedOnScript(c);
390       }
391       alpha_codes[i + 1] = AlphabetTable1::value(entry);
392     }
393     alpha_codes[len + 1] = 0;
394     return result;
395   }
396   return HyphenationType::DONT_BREAK;
397 }
398 
399 /**
400  * Internal implementation, after conversion to codes. All case folding and
401  *normalization has been done by now, and all characters have been found in the
402  *alphabet. Note: len here is the padded length including 0 codes at start and
403  *end.
404  **/
hyphenateFromCodes(HyphenationType * result,const uint16_t * codes,size_t len,HyphenationType hyphenValue)405 void Hyphenator::hyphenateFromCodes(HyphenationType* result,
406                                     const uint16_t* codes,
407                                     size_t len,
408                                     HyphenationType hyphenValue) {
409   static_assert(sizeof(HyphenationType) == sizeof(uint8_t),
410                 "HyphnationType must be uint8_t.");
411   // Reuse the result array as a buffer for calculating intermediate hyphenation
412   // numbers.
413   uint8_t* buffer = reinterpret_cast<uint8_t*>(result);
414 
415   const Header* header = getHeader();
416   const Trie* trie = header->trieTable();
417   const Pattern* pattern = header->patternTable();
418   uint32_t char_mask = trie->char_mask;
419   uint32_t link_shift = trie->link_shift;
420   uint32_t link_mask = trie->link_mask;
421   uint32_t pattern_shift = trie->pattern_shift;
422   size_t maxOffset = len - minSuffix - 1;
423   for (size_t i = 0; i < len - 1; i++) {
424     uint32_t node = 0;  // index into Trie table
425     for (size_t j = i; j < len; j++) {
426       uint16_t c = codes[j];
427       uint32_t entry = trie->data[node + c];
428       if ((entry & char_mask) == c) {
429         node = (entry & link_mask) >> link_shift;
430       } else {
431         break;
432       }
433       uint32_t pat_ix = trie->data[node] >> pattern_shift;
434       // pat_ix contains a 3-tuple of length, shift (number of trailing zeros),
435       // and an offset into the buf pool. This is the pattern for the substring
436       // (i..j) we just matched, which we combine (via point-wise max) into the
437       // buffer vector.
438       if (pat_ix != 0) {
439         uint32_t pat_entry = pattern->data[pat_ix];
440         int pat_len = Pattern::len(pat_entry);
441         int pat_shift = Pattern::shift(pat_entry);
442         const uint8_t* pat_buf = pattern->buf(pat_entry);
443         int offset = j + 1 - (pat_len + pat_shift);
444         // offset is the index within buffer that lines up with the start of
445         // pat_buf
446         int start = std::max((int)minPrefix - offset, 0);
447         int end = std::min(pat_len, (int)maxOffset - offset);
448         for (int k = start; k < end; k++) {
449           buffer[offset + k] = std::max(buffer[offset + k], pat_buf[k]);
450         }
451       }
452     }
453   }
454   // Since the above calculation does not modify values outside
455   // [minPrefix, len - minSuffix], they are left as 0 = DONT_BREAK.
456   for (size_t i = minPrefix; i < maxOffset; i++) {
457     // Hyphenation opportunities happen when the hyphenation numbers are odd.
458     result[i] = (buffer[i] & 1u) ? hyphenValue : HyphenationType::DONT_BREAK;
459   }
460 }
461 
462 }  // namespace minikin
463