1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 /** 18 * An implementation of Liang's hyphenation algorithm. 19 */ 20 21 #include "unicode/locid.h" 22 #include <memory> 23 #include <unordered_map> 24 25 #ifndef MINIKIN_HYPHENATOR_H 26 #define MINIKIN_HYPHENATOR_H 27 28 namespace minikin { 29 30 enum class HyphenationType : uint8_t { 31 // Note: There are implicit assumptions scattered in the code that DONT_BREAK is 0. 32 33 // Do not break. 34 DONT_BREAK = 0, 35 // Break the line and insert a normal hyphen. 36 BREAK_AND_INSERT_HYPHEN = 1, 37 // Break the line and insert an Armenian hyphen (U+058A). 38 BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2, 39 // Break the line and insert a maqaf (Hebrew hyphen, U+05BE). 40 BREAK_AND_INSERT_MAQAF = 3, 41 // Break the line and insert a Canadian Syllabics hyphen (U+1400). 42 BREAK_AND_INSERT_UCAS_HYPHEN = 4, 43 // Break the line, but don't insert a hyphen. Used for cases when there is already a hyphen 44 // present or the script does not use a hyphen (e.g. in Malayalam). 45 BREAK_AND_DONT_INSERT_HYPHEN = 5, 46 // Break and replace the last code unit with hyphen. Used for Catalan "l·l" which hyphenates 47 // as "l-/l". 48 BREAK_AND_REPLACE_WITH_HYPHEN = 6, 49 // Break the line, and repeat the hyphen (which is the last character) at the beginning of the 50 // next line. Used in Polish, where "czerwono-niebieska" should hyphenate as 51 // "czerwono-/-niebieska". 52 BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7, 53 // Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the second line. 54 // This is used in Arabic script, mostly for writing systems of Central Asia. It's our default 55 // behavior when a soft hyphen is used in Arabic script. 56 BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8 57 }; 58 59 // The hyphen edit represents an edit to the string when a word is 60 // hyphenated. The most common hyphen edit is adding a "-" at the end 61 // of a syllable, but nonstandard hyphenation allows for more choices. 62 // Note that a HyphenEdit can hold two types of edits at the same time, 63 // One at the beginning of the string/line and one at the end. 64 class HyphenEdit { 65 public: 66 static const uint32_t NO_EDIT = 0x00; 67 68 static const uint32_t INSERT_HYPHEN_AT_END = 0x01; 69 static const uint32_t INSERT_ARMENIAN_HYPHEN_AT_END = 0x02; 70 static const uint32_t INSERT_MAQAF_AT_END = 0x03; 71 static const uint32_t INSERT_UCAS_HYPHEN_AT_END = 0x04; 72 static const uint32_t INSERT_ZWJ_AND_HYPHEN_AT_END = 0x05; 73 static const uint32_t REPLACE_WITH_HYPHEN_AT_END = 0x06; 74 static const uint32_t BREAK_AT_END = 0x07; 75 76 static const uint32_t INSERT_HYPHEN_AT_START = 0x01 << 3; 77 static const uint32_t INSERT_ZWJ_AT_START = 0x02 << 3; 78 static const uint32_t BREAK_AT_START = 0x03 << 3; 79 80 // Keep in sync with the definitions in the Java code at: 81 // frameworks/base/graphics/java/android/graphics/Paint.java 82 static const uint32_t MASK_END_OF_LINE = 0x07; 83 static const uint32_t MASK_START_OF_LINE = 0x03 << 3; 84 isReplacement(uint32_t hyph)85 inline static bool isReplacement(uint32_t hyph) { 86 return hyph == REPLACE_WITH_HYPHEN_AT_END; 87 } 88 isInsertion(uint32_t hyph)89 inline static bool isInsertion(uint32_t hyph) { 90 return (hyph == INSERT_HYPHEN_AT_END 91 || hyph == INSERT_ARMENIAN_HYPHEN_AT_END 92 || hyph == INSERT_MAQAF_AT_END 93 || hyph == INSERT_UCAS_HYPHEN_AT_END 94 || hyph == INSERT_ZWJ_AND_HYPHEN_AT_END 95 || hyph == INSERT_HYPHEN_AT_START 96 || hyph == INSERT_ZWJ_AT_START); 97 } 98 99 const static uint32_t* getHyphenString(uint32_t hyph); 100 static uint32_t editForThisLine(HyphenationType type); 101 static uint32_t editForNextLine(HyphenationType type); 102 HyphenEdit()103 HyphenEdit() : hyphen(NO_EDIT) { } HyphenEdit(uint32_t hyphenInt)104 HyphenEdit(uint32_t hyphenInt) : hyphen(hyphenInt) { } // NOLINT(implicit) getHyphen()105 uint32_t getHyphen() const { return hyphen; } 106 bool operator==(const HyphenEdit &other) const { return hyphen == other.hyphen; } 107 getEnd()108 uint32_t getEnd() const { return hyphen & MASK_END_OF_LINE; } getStart()109 uint32_t getStart() const { return hyphen & MASK_START_OF_LINE; } 110 111 private: 112 uint32_t hyphen; 113 }; 114 115 // hyb file header; implementation details are in the .cpp file 116 struct Header; 117 118 class Hyphenator { 119 public: 120 // Compute the hyphenation of a word, storing the hyphenation in result vector. Each entry in 121 // the vector is a "hyphenation type" for a potential hyphenation that can be applied at the 122 // corresponding code unit offset in the word. 123 // 124 // Example: word is "hyphen", result is the following, corresponding to "hy-phen": 125 // [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK, DONT_BREAK, DONT_BREAK] 126 void hyphenate(std::vector<HyphenationType>* result, const uint16_t* word, size_t len, 127 const icu::Locale& locale); 128 129 // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and usage: a character 130 // immediately after which line breaks are allowed, but words containing it should not be 131 // automatically hyphenated. 132 static bool isLineBreakingHyphen(uint32_t cp); 133 134 // pattern data is in binary format, as described in doc/hyb_file_format.md. Note: 135 // the caller is responsible for ensuring that the lifetime of the pattern data is 136 // at least as long as the Hyphenator object. 137 138 // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens. 139 static Hyphenator* loadBinary(const uint8_t* patternData, size_t minPrefix, size_t minSuffix); 140 141 private: 142 // apply various hyphenation rules including hard and soft hyphens, ignoring patterns 143 void hyphenateWithNoPatterns(HyphenationType* result, const uint16_t* word, size_t len, 144 const icu::Locale& locale); 145 146 // Try looking up word in alphabet table, return DONT_BREAK if any code units fail to map. 147 // Otherwise, returns BREAK_AND_INSERT_HYPHEN, BREAK_AND_INSERT_ARMENIAN_HYPHEN, or 148 // BREAK_AND_DONT_INSERT_HYPHEN based on the the script of the characters seen. 149 // Note that this method writes len+2 entries into alpha_codes (including start and stop) 150 HyphenationType alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len); 151 152 // calculate hyphenation from patterns, assuming alphabet lookup has already been done 153 void hyphenateFromCodes(HyphenationType* result, const uint16_t* codes, size_t len, 154 HyphenationType hyphenValue); 155 156 // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so 157 // that temporary buffers can be stack-allocated without waste, which is a slightly 158 // different use case. It measures UTF-16 code units. 159 static const size_t MAX_HYPHENATED_SIZE = 64; 160 161 const uint8_t* patternData; 162 size_t minPrefix, minSuffix; 163 164 // accessors for binary data getHeader()165 const Header* getHeader() const { 166 return reinterpret_cast<const Header*>(patternData); 167 } 168 169 }; 170 171 } // namespace minikin 172 173 #endif // MINIKIN_HYPHENATOR_H 174