1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 /** 18 * An implementation of Liang's hyphenation algorithm. 19 */ 20 21 #ifndef U_USING_ICU_NAMESPACE 22 #define U_USING_ICU_NAMESPACE 0 23 #endif // U_USING_ICU_NAMESPACE 24 25 #include <memory> 26 #include <unordered_map> 27 #include <vector> 28 #include "unicode/locid.h" 29 30 #ifndef MINIKIN_HYPHENATOR_H 31 #define MINIKIN_HYPHENATOR_H 32 33 namespace minikin { 34 35 enum class HyphenationType : uint8_t { 36 // Note: There are implicit assumptions scattered in the code that DONT_BREAK 37 // is 0. 38 39 // Do not break. 40 DONT_BREAK = 0, 41 // Break the line and insert a normal hyphen. 42 BREAK_AND_INSERT_HYPHEN = 1, 43 // Break the line and insert an Armenian hyphen (U+058A). 44 BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2, 45 // Break the line and insert a maqaf (Hebrew hyphen, U+05BE). 46 BREAK_AND_INSERT_MAQAF = 3, 47 // Break the line and insert a Canadian Syllabics hyphen (U+1400). 48 BREAK_AND_INSERT_UCAS_HYPHEN = 4, 49 // Break the line, but don't insert a hyphen. Used for cases when there is 50 // already a hyphen 51 // present or the script does not use a hyphen (e.g. in Malayalam). 52 BREAK_AND_DONT_INSERT_HYPHEN = 5, 53 // Break and replace the last code unit with hyphen. Used for Catalan "l·l" 54 // which hyphenates 55 // as "l-/l". 56 BREAK_AND_REPLACE_WITH_HYPHEN = 6, 57 // Break the line, and repeat the hyphen (which is the last character) at the 58 // beginning of the 59 // next line. Used in Polish, where "czerwono-niebieska" should hyphenate as 60 // "czerwono-/-niebieska". 61 BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7, 62 // Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the 63 // second line. 64 // This is used in Arabic script, mostly for writing systems of Central Asia. 65 // It's our default 66 // behavior when a soft hyphen is used in Arabic script. 67 BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8 68 }; 69 70 // The hyphen edit represents an edit to the string when a word is 71 // hyphenated. The most common hyphen edit is adding a "-" at the end 72 // of a syllable, but nonstandard hyphenation allows for more choices. 73 // Note that a HyphenEdit can hold two types of edits at the same time, 74 // One at the beginning of the string/line and one at the end. 75 class HyphenEdit { 76 public: 77 static const uint32_t NO_EDIT = 0x00; 78 79 static const uint32_t INSERT_HYPHEN_AT_END = 0x01; 80 static const uint32_t INSERT_ARMENIAN_HYPHEN_AT_END = 0x02; 81 static const uint32_t INSERT_MAQAF_AT_END = 0x03; 82 static const uint32_t INSERT_UCAS_HYPHEN_AT_END = 0x04; 83 static const uint32_t INSERT_ZWJ_AND_HYPHEN_AT_END = 0x05; 84 static const uint32_t REPLACE_WITH_HYPHEN_AT_END = 0x06; 85 static const uint32_t BREAK_AT_END = 0x07; 86 87 static const uint32_t INSERT_HYPHEN_AT_START = 0x01 << 3; 88 static const uint32_t INSERT_ZWJ_AT_START = 0x02 << 3; 89 static const uint32_t BREAK_AT_START = 0x03 << 3; 90 91 // Keep in sync with the definitions in the Java code at: 92 // frameworks/base/graphics/java/android/graphics/Paint.java 93 static const uint32_t MASK_END_OF_LINE = 0x07; 94 static const uint32_t MASK_START_OF_LINE = 0x03 << 3; 95 isReplacement(uint32_t hyph)96 inline static bool isReplacement(uint32_t hyph) { 97 return hyph == REPLACE_WITH_HYPHEN_AT_END; 98 } 99 isInsertion(uint32_t hyph)100 inline static bool isInsertion(uint32_t hyph) { 101 return (hyph == INSERT_HYPHEN_AT_END || 102 hyph == INSERT_ARMENIAN_HYPHEN_AT_END || 103 hyph == INSERT_MAQAF_AT_END || hyph == INSERT_UCAS_HYPHEN_AT_END || 104 hyph == INSERT_ZWJ_AND_HYPHEN_AT_END || 105 hyph == INSERT_HYPHEN_AT_START || hyph == INSERT_ZWJ_AT_START); 106 } 107 108 const static uint32_t* getHyphenString(uint32_t hyph); 109 static uint32_t editForThisLine(HyphenationType type); 110 static uint32_t editForNextLine(HyphenationType type); 111 HyphenEdit()112 HyphenEdit() : hyphen(NO_EDIT) {} HyphenEdit(uint32_t hyphenInt)113 HyphenEdit(uint32_t hyphenInt) : hyphen(hyphenInt) {} // NOLINT(implicit) getHyphen()114 uint32_t getHyphen() const { return hyphen; } 115 bool operator==(const HyphenEdit& other) const { 116 return hyphen == other.hyphen; 117 } 118 getEnd()119 uint32_t getEnd() const { return hyphen & MASK_END_OF_LINE; } getStart()120 uint32_t getStart() const { return hyphen & MASK_START_OF_LINE; } 121 122 private: 123 uint32_t hyphen; 124 }; 125 126 // hyb file header; implementation details are in the .cpp file 127 struct Header; 128 129 class Hyphenator { 130 public: 131 // Compute the hyphenation of a word, storing the hyphenation in result 132 // vector. Each entry in the vector is a "hyphenation type" for a potential 133 // hyphenation that can be applied at the corresponding code unit offset in 134 // the word. 135 // 136 // Example: word is "hyphen", result is the following, corresponding to 137 // "hy-phen": [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK, 138 // DONT_BREAK, DONT_BREAK] 139 void hyphenate(std::vector<HyphenationType>* result, 140 const uint16_t* word, 141 size_t len, 142 const icu::Locale& locale); 143 144 // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and 145 // usage: a character immediately after which line breaks are allowed, but 146 // words containing it should not be automatically hyphenated. 147 static bool isLineBreakingHyphen(uint32_t cp); 148 149 // pattern data is in binary format, as described in doc/hyb_file_format.md. 150 // Note: the caller is responsible for ensuring that the lifetime of the 151 // pattern data is at least as long as the Hyphenator object. 152 153 // Note: nullptr is valid input, in which case the hyphenator only processes 154 // soft hyphens. 155 static Hyphenator* loadBinary(const uint8_t* patternData, 156 size_t minPrefix, 157 size_t minSuffix); 158 159 private: 160 // apply various hyphenation rules including hard and soft hyphens, ignoring 161 // patterns 162 void hyphenateWithNoPatterns(HyphenationType* result, 163 const uint16_t* word, 164 size_t len, 165 const icu::Locale& locale); 166 167 // Try looking up word in alphabet table, return DONT_BREAK if any code units 168 // fail to map. Otherwise, returns BREAK_AND_INSERT_HYPHEN, 169 // BREAK_AND_INSERT_ARMENIAN_HYPHEN, or BREAK_AND_DONT_INSERT_HYPHEN based on 170 // the script of the characters seen. Note that this method writes len+2 171 // entries into alpha_codes (including start and stop) 172 HyphenationType alphabetLookup(uint16_t* alpha_codes, 173 const uint16_t* word, 174 size_t len); 175 176 // calculate hyphenation from patterns, assuming alphabet lookup has already 177 // been done 178 void hyphenateFromCodes(HyphenationType* result, 179 const uint16_t* codes, 180 size_t len, 181 HyphenationType hyphenValue); 182 183 // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is 184 // used so that temporary buffers can be stack-allocated without waste, which 185 // is a slightly different use case. It measures UTF-16 code units. 186 static const size_t MAX_HYPHENATED_SIZE = 64; 187 188 const uint8_t* patternData; 189 size_t minPrefix, minSuffix; 190 191 // accessors for binary data getHeader()192 const Header* getHeader() const { 193 return reinterpret_cast<const Header*>(patternData); 194 } 195 }; 196 197 } // namespace minikin 198 199 #endif // MINIKIN_HYPHENATOR_H 200