/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
/**
 * An implementation of Liang's hyphenation algorithm.
 */
20 
// Default U_USING_ICU_NAMESPACE to 0 unless the build already set it, so that
// including ICU headers does not pull a `using namespace icu;` into every file
// that includes this header. ICU types in this header are written with an
// explicit icu:: qualifier. This has to appear before the first ICU #include.
#ifndef U_USING_ICU_NAMESPACE
#define U_USING_ICU_NAMESPACE 0
#endif  // U_USING_ICU_NAMESPACE
24 
#include <cstddef>
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>

#include "unicode/locid.h"
29 
30 #ifndef MINIKIN_HYPHENATOR_H
31 #define MINIKIN_HYPHENATOR_H
32 
33 namespace minikin {
34 
// What to do at a potential hyphenation point: whether to break the line
// there and, if so, which edit (if any) accompanies the break.
enum class HyphenationType : uint8_t {
  // Note: There are implicit assumptions scattered in the code that DONT_BREAK
  // is 0.

  // Do not break.
  DONT_BREAK = 0,

  // Break the line and insert a normal hyphen.
  BREAK_AND_INSERT_HYPHEN = 1,

  // Break the line and insert an Armenian hyphen (U+058A).
  BREAK_AND_INSERT_ARMENIAN_HYPHEN = 2,

  // Break the line and insert a maqaf (Hebrew hyphen, U+05BE).
  BREAK_AND_INSERT_MAQAF = 3,

  // Break the line and insert a Canadian Syllabics hyphen (U+1400).
  BREAK_AND_INSERT_UCAS_HYPHEN = 4,

  // Break the line, but don't insert a hyphen. Used for cases when there is
  // already a hyphen present or the script does not use a hyphen (e.g. in
  // Malayalam).
  BREAK_AND_DONT_INSERT_HYPHEN = 5,

  // Break and replace the last code unit with hyphen. Used for Catalan "l·l"
  // which hyphenates as "l-/l".
  BREAK_AND_REPLACE_WITH_HYPHEN = 6,

  // Break the line, and repeat the hyphen (which is the last character) at the
  // beginning of the next line. Used in Polish, where "czerwono-niebieska"
  // should hyphenate as "czerwono-/-niebieska".
  BREAK_AND_INSERT_HYPHEN_AT_NEXT_LINE = 7,

  // Break the line, insert a ZWJ and hyphen at the first line, and a ZWJ at the
  // second line. This is used in Arabic script, mostly for writing systems of
  // Central Asia. It's our default behavior when a soft hyphen is used in
  // Arabic script.
  BREAK_AND_INSERT_HYPHEN_AND_ZWJ = 8
};
69 
70 // The hyphen edit represents an edit to the string when a word is
71 // hyphenated. The most common hyphen edit is adding a "-" at the end
72 // of a syllable, but nonstandard hyphenation allows for more choices.
73 // Note that a HyphenEdit can hold two types of edits at the same time,
74 // One at the beginning of the string/line and one at the end.
75 class HyphenEdit {
76  public:
77   static const uint32_t NO_EDIT = 0x00;
78 
79   static const uint32_t INSERT_HYPHEN_AT_END = 0x01;
80   static const uint32_t INSERT_ARMENIAN_HYPHEN_AT_END = 0x02;
81   static const uint32_t INSERT_MAQAF_AT_END = 0x03;
82   static const uint32_t INSERT_UCAS_HYPHEN_AT_END = 0x04;
83   static const uint32_t INSERT_ZWJ_AND_HYPHEN_AT_END = 0x05;
84   static const uint32_t REPLACE_WITH_HYPHEN_AT_END = 0x06;
85   static const uint32_t BREAK_AT_END = 0x07;
86 
87   static const uint32_t INSERT_HYPHEN_AT_START = 0x01 << 3;
88   static const uint32_t INSERT_ZWJ_AT_START = 0x02 << 3;
89   static const uint32_t BREAK_AT_START = 0x03 << 3;
90 
91   // Keep in sync with the definitions in the Java code at:
92   // frameworks/base/graphics/java/android/graphics/Paint.java
93   static const uint32_t MASK_END_OF_LINE = 0x07;
94   static const uint32_t MASK_START_OF_LINE = 0x03 << 3;
95 
isReplacement(uint32_t hyph)96   inline static bool isReplacement(uint32_t hyph) {
97     return hyph == REPLACE_WITH_HYPHEN_AT_END;
98   }
99 
isInsertion(uint32_t hyph)100   inline static bool isInsertion(uint32_t hyph) {
101     return (hyph == INSERT_HYPHEN_AT_END ||
102             hyph == INSERT_ARMENIAN_HYPHEN_AT_END ||
103             hyph == INSERT_MAQAF_AT_END || hyph == INSERT_UCAS_HYPHEN_AT_END ||
104             hyph == INSERT_ZWJ_AND_HYPHEN_AT_END ||
105             hyph == INSERT_HYPHEN_AT_START || hyph == INSERT_ZWJ_AT_START);
106   }
107 
108   const static uint32_t* getHyphenString(uint32_t hyph);
109   static uint32_t editForThisLine(HyphenationType type);
110   static uint32_t editForNextLine(HyphenationType type);
111 
HyphenEdit()112   HyphenEdit() : hyphen(NO_EDIT) {}
HyphenEdit(uint32_t hyphenInt)113   HyphenEdit(uint32_t hyphenInt) : hyphen(hyphenInt) {}  // NOLINT(implicit)
getHyphen()114   uint32_t getHyphen() const { return hyphen; }
115   bool operator==(const HyphenEdit& other) const {
116     return hyphen == other.hyphen;
117   }
118 
getEnd()119   uint32_t getEnd() const { return hyphen & MASK_END_OF_LINE; }
getStart()120   uint32_t getStart() const { return hyphen & MASK_START_OF_LINE; }
121 
122  private:
123   uint32_t hyphen;
124 };
125 
// hyb file header; only forward-declared here, implementation details are in
// the .cpp file.
struct Header;
128 
129 class Hyphenator {
130  public:
131   // Compute the hyphenation of a word, storing the hyphenation in result
132   // vector. Each entry in the vector is a "hyphenation type" for a potential
133   // hyphenation that can be applied at the corresponding code unit offset in
134   // the word.
135   //
136   // Example: word is "hyphen", result is the following, corresponding to
137   // "hy-phen": [DONT_BREAK, DONT_BREAK, BREAK_AND_INSERT_HYPHEN, DONT_BREAK,
138   // DONT_BREAK, DONT_BREAK]
139   void hyphenate(std::vector<HyphenationType>* result,
140                  const uint16_t* word,
141                  size_t len,
142                  const icu::Locale& locale);
143 
144   // Returns true if the codepoint is like U+2010 HYPHEN in line breaking and
145   // usage: a character immediately after which line breaks are allowed, but
146   // words containing it should not be automatically hyphenated.
147   static bool isLineBreakingHyphen(uint32_t cp);
148 
149   // pattern data is in binary format, as described in doc/hyb_file_format.md.
150   // Note: the caller is responsible for ensuring that the lifetime of the
151   // pattern data is at least as long as the Hyphenator object.
152 
153   // Note: nullptr is valid input, in which case the hyphenator only processes
154   // soft hyphens.
155   static Hyphenator* loadBinary(const uint8_t* patternData,
156                                 size_t minPrefix,
157                                 size_t minSuffix);
158 
159  private:
160   // apply various hyphenation rules including hard and soft hyphens, ignoring
161   // patterns
162   void hyphenateWithNoPatterns(HyphenationType* result,
163                                const uint16_t* word,
164                                size_t len,
165                                const icu::Locale& locale);
166 
167   // Try looking up word in alphabet table, return DONT_BREAK if any code units
168   // fail to map. Otherwise, returns BREAK_AND_INSERT_HYPHEN,
169   // BREAK_AND_INSERT_ARMENIAN_HYPHEN, or BREAK_AND_DONT_INSERT_HYPHEN based on
170   // the the script of the characters seen. Note that this method writes len+2
171   // entries into alpha_codes (including start and stop)
172   HyphenationType alphabetLookup(uint16_t* alpha_codes,
173                                  const uint16_t* word,
174                                  size_t len);
175 
176   // calculate hyphenation from patterns, assuming alphabet lookup has already
177   // been done
178   void hyphenateFromCodes(HyphenationType* result,
179                           const uint16_t* codes,
180                           size_t len,
181                           HyphenationType hyphenValue);
182 
183   // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is
184   // used so that temporary buffers can be stack-allocated without waste, which
185   // is a slightly different use case. It measures UTF-16 code units.
186   static const size_t MAX_HYPHENATED_SIZE = 64;
187 
188   const uint8_t* patternData;
189   size_t minPrefix, minSuffix;
190 
191   // accessors for binary data
getHeader()192   const Header* getHeader() const {
193     return reinterpret_cast<const Header*>(patternData);
194   }
195 };
196 
197 }  // namespace minikin
198 
199 #endif  // MINIKIN_HYPHENATOR_H
200