• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 /**
18  * An implementation of Liang's hyphenation algorithm.
19  */
20 
#include <cstddef>
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>
23 
24 #ifndef MINIKIN_HYPHENATOR_H
25 #define MINIKIN_HYPHENATOR_H
26 
27 namespace android {
28 
29 // hyb file header; implementation details are in the .cpp file
30 struct Header;
31 
32 class Hyphenator {
33 public:
34     // Note: this will also require a locale, for proper case folding behavior
35     static Hyphenator* load(const uint16_t* patternData, size_t size);
36 
37     // Compute the hyphenation of a word, storing the hyphenation in result vector. Each
38     // entry in the vector is a "hyphen edit" to be applied at the corresponding code unit
39     // offset in the word. Currently 0 means no hyphen and 1 means insert hyphen and break,
40     // but this will be expanded to other edits for nonstandard hyphenation.
41     // Example: word is "hyphen", result is [0 0 1 0 0 0], corresponding to "hy-phen".
42     void hyphenate(std::vector<uint8_t>* result, const uint16_t* word, size_t len);
43 
44     // pattern data is in binary format, as described in doc/hyb_file_format.md. Note:
45     // the caller is responsible for ensuring that the lifetime of the pattern data is
46     // at least as long as the Hyphenator object.
47 
48     // Note: nullptr is valid input, in which case the hyphenator only processes soft hyphens
49     static Hyphenator* loadBinary(const uint8_t* patternData);
50 
51 private:
52     // apply soft hyphens only, ignoring patterns
53     void hyphenateSoft(uint8_t* result, const uint16_t* word, size_t len);
54 
55     // try looking up word in alphabet table, return false if any code units fail to map
56     // Note that this methor writes len+2 entries into alpha_codes (including start and stop)
57     bool alphabetLookup(uint16_t* alpha_codes, const uint16_t* word, size_t len);
58 
59     // calculate hyphenation from patterns, assuming alphabet lookup has already been done
60     void hyphenateFromCodes(uint8_t* result, const uint16_t* codes, size_t len);
61 
62     // TODO: these should become parameters, as they might vary by locale, screen size, and
63     // possibly explicit user control.
64     static const int MIN_PREFIX = 2;
65     static const int MIN_SUFFIX = 3;
66 
67     // See also LONGEST_HYPHENATED_WORD in LineBreaker.cpp. Here the constant is used so
68     // that temporary buffers can be stack-allocated without waste, which is a slightly
69     // different use case. It measures UTF-16 code units.
70     static const size_t MAX_HYPHENATED_SIZE = 64;
71 
72     const uint8_t* patternData;
73 
74     // accessors for binary data
getHeader()75     const Header* getHeader() const {
76         return reinterpret_cast<const Header*>(patternData);
77     }
78 
79 };
80 
81 }  // namespace android
82 
83 #endif   // MINIKIN_HYPHENATOR_H
84