1 /*
2 * Copyright (c) 2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "hyphen_pattern.h"
17
18 #include <algorithm>
19 #include <cstddef>
20 #include <climits>
21 #include <fstream>
22 #include <iostream>
23 #include <sys/stat.h>
24 #include <sys/types.h>
25 #include <map>
26 #include <unicode/utf.h>
27 #include <unicode/utf8.h>
28
29 using namespace std;
30
31 // to enable more information on development time
32 // define VERBOSE_PATTERNS
33
34 namespace OHOS::Hyphenate {
35 // upper limit for direct pointing arrays
36 #define MAXIMUM_DIRECT_CODE_POINT 0x7a
37
38 struct Leaf {
39 uint16_t offset{0};
40 uint16_t usecount{0};
41 };
42
43 struct Rule {
44 uint16_t offset{0};
45 map<uint16_t, vector<vector<uint16_t>>> patterns;
46 map<uint16_t, Leaf> uniqLeafs;
47 };
48
49 static map<vector<uint8_t>, Rule> g_allRules;
50
ConvertToUtf16(const string & utf8Str)51 vector<uint16_t> ConvertToUtf16(const string& utf8Str)
52 {
53 int32_t i = 0;
54 UChar32 c = 0;
55 vector<uint16_t> target;
56 const int32_t textLength = static_cast<int32_t>(utf8Str.size());
57 while (i < textLength) {
58 U8_NEXT(reinterpret_cast<const uint8_t*>(utf8Str.c_str()), i, textLength, c);
59 if (U16_LENGTH(c) == 1) {
60 target.push_back(c);
61 } else {
62 target.push_back(U16_LEAD(static_cast<uint32_t>(c)));
63 target.push_back(U16_TRAIL(static_cast<uint32_t>(c)));
64 }
65 }
66 return target;
67 }
68
69 // Recursive path implementation.
70 // Collects static information and the leafs that provide access to patterns
71 // The implementation is reversed to pattern code point order; end to beginning of pattern;
72 struct Path {
PathOHOS::Hyphenate::Path73 explicit Path(const vector<uint16_t>& path, const vector<uint8_t>* pat)
74 {
75 count++;
76 size_t targetIndex = path.size();
77 if (targetIndex > 0) {
78 code = path[--targetIndex];
79 }
80 if ((code <= MAXIMUM_DIRECT_CODE_POINT)) {
81 maximumCP = max(maximumCP, code);
82 minimumCP = min(minimumCP, code);
83 }
84
85 // Process children recursively
86 if (targetIndex > 0) {
87 Process(path, targetIndex, pat);
88 } else {
89 // store pattern to leafs
90 pattern = pat;
91 leafCount++;
92 }
93 }
94
ProcessOHOS::Hyphenate::Path95 void Process(const vector<uint16_t>& path, size_t targetIndex, const vector<uint8_t>* pat)
96 {
97 if (targetIndex == 0) {
98 pattern = pat;
99 return;
100 }
101
102 uint16_t key = path[--targetIndex];
103 if (auto ite = paths.find(key); ite != paths.end()) {
104 ite->second.Process(path, targetIndex, pat);
105 } else {
106 if (key > MAXIMUM_DIRECT_CODE_POINT) {
107 // if we have direct children with distinct code points, we need to use
108 // value pairs
109 haveNoncontiguousChildren = true;
110 }
111 vector<uint16_t> substr(path.cbegin(), path.cbegin() + targetIndex + 1);
112 // recurse
113 paths.emplace(key, Path(substr, pat));
114 }
115 }
116
117 // The graph is built using pattern end characters
118 // while the rules may have different leaf nodes
119 // Check the dictionary terminated graph for unified rules and leafs
FindSharedLeavesOHOS::Hyphenate::Path120 void FindSharedLeaves()
121 {
122 if (paths.size() != 0) {
123 for (auto& path : paths) {
124 path.second.FindSharedLeaves();
125 }
126 } else if (g_allRules.count(*pattern) != 0) {
127 auto ite = g_allRules[*pattern].uniqLeafs.find(code);
128 if (ite != g_allRules[*pattern].uniqLeafs.cend()) {
129 ite->second.usecount += 1;
130 }
131 }
132 }
133
134 // Once this node is reached, we can access pattern
135 // however traversing further may be needed
HasPatternOHOS::Hyphenate::Path136 bool HasPattern() const { return pattern != nullptr; }
137
138 // This instance of Path and its children implement a straight path without ambquity.
139 // No need to traverse through tables to reach pattern.
140 // Calculate the depth of the graph.
IsLinearOHOS::Hyphenate::Path141 bool IsLinear() const
142 {
143 if (paths.size() == 0) {
144 return true;
145 } else if (paths.size() == 1) {
146 return paths.begin()->second.IsLinear();
147 }
148 return false;
149 }
150
151 // debug print misc info
PrintOHOS::Hyphenate::Path152 void Print(size_t indent) const
153 {
154 #ifdef VERBOSE_PATTERNS
155 indent += HYPHEN_INDENT_INCREMENT;
156 for (size_t i = 0; i < indent; i++) {
157 cout << " ";
158 }
159 if (indent == ROOT_INDENT) {
160 cout << char(code) << "rootsize***: " << paths.size();
161 } else {
162 cout << char(code) << "***: " << paths.size();
163 }
164 if (paths.size() >= LARGE_PATH_SIZE)
165 cout << " LARGE";
166 else if (IsLinear()) {
167 cout << " LINEAR";
168 } else {
169 cout << " @@@";
170 }
171
172 cout << endl;
173 if (paths.size() == 0) {
174 return;
175 }
176 for (auto path : paths) {
177 path.second.Print(indent);
178 }
179 cout << endl;
180 #endif
181 }
182
WritePackedOHOS::Hyphenate::Path183 static void WritePacked(vector<uint16_t>& data, ostream& out, bool writeCount = true)
184 {
185 uint16_t size = data.size();
186 if (writeCount) {
187 out.write(reinterpret_cast<const char*>(&size), sizeof(size));
188 }
189
190 for (size_t i = 0; i < data.size(); i++) {
191 uint16_t bytes = data[i];
192 out.write(reinterpret_cast<const char*>(&bytes), sizeof(bytes));
193 }
194 }
195
WritePackedOHOS::Hyphenate::Path196 static uint16_t WritePacked(const vector<uint8_t>& data, ostream& out, bool writeSize = true)
197 {
198 constexpr size_t ALIGN_4BYTES = 0x03;
199 uint16_t size = data.size();
200 if (writeSize) {
201 out.write(reinterpret_cast<const char*>(&size), sizeof(size));
202 }
203
204 if ((data.size() & ALIGN_4BYTES) != 0) {
205 cerr << "### uint8_t vectors should be aligned in 4 bytes !!!" << endl;
206 size = size & ~ALIGN_4BYTES;
207 }
208
209 /* convert uint8 to uint32 */
210 for (size_t i = 0; i < size; i += BYTES_PRE_WORD) {
211 uint32_t bytes = data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24);
212 out.write(reinterpret_cast<const char*>(&bytes), sizeof(bytes));
213 }
214 return size;
215 }
216
217 // no need to twiddle the bytes or words currently
WritePackedOHOS::Hyphenate::Path218 static void WritePacked(uint32_t word, ostream& out)
219 {
220 out.write(reinterpret_cast<const char*>(&word), sizeof(word));
221 }
222
223 // We make assumption that 14 bytes is enough to represent offset
224 // so we get two first bits in the array for path type
225 // we have two bytes on the offset arrays
226 // for these
227 enum class PathType : uint8_t { PATTERN = 0, LINEAR = 1, PAIRS = 2, DIRECT = 3 };
228
WritePackedLineOHOS::Hyphenate::Path229 static void WritePackedLine(const Path& pathSrc, ostream& out, PathType& type)
230 {
231 bool wroteSomething{false};
232 vector<uint16_t> output;
233
234 // we do NOT need to write local pattern if we don't have children
235 if (pathSrc.paths.empty()) {
236 type = PathType::PATTERN;
237 return;
238 }
239
240 type = PathType::LINEAR;
241 auto ite = pathSrc.paths.cbegin();
242 const auto* path = &(ite->second);
243 auto localPattern = path->pattern;
244 output.push_back(path->code);
245
246 while (path) {
247 if (localPattern) {
248 // if we have children, they need to be checked when collecting rules
249 if (output.size() > 0) {
250 WritePacked(output, out);
251 }
252 path->WritePatternOrNull(out);
253 output.clear();
254 localPattern = nullptr;
255 wroteSomething = true;
256 } else {
257 // traverse further
258 if (!path->paths.empty()) {
259 auto itr = path->paths.cbegin();
260 path = &(itr->second);
261 localPattern = path->pattern;
262 output.push_back(path->code);
263 } else {
264 break;
265 }
266 }
267 }
268 if (!wroteSomething) {
269 cerr << "Did not write anything linear" << endl;
270 type = PathType::PATTERN;
271 } else {
272 // mark array end so that reader knows when to stop recursing
273 uint16_t size = 0;
274 out.write(reinterpret_cast<const char*>(&size), sizeof(size));
275 }
276 }
277
WritePatternOrNullOHOS::Hyphenate::Path278 void WritePatternOrNull(ostream& out) const
279 {
280 uint16_t size = 0;
281 if (HasPattern()) {
282 auto ite = g_allRules.find(*pattern);
283 size = ite->second.offset;
284 }
285
286 out.write(reinterpret_cast<const char*>(&size), sizeof(size));
287 }
288
WriteTypedNodeOHOS::Hyphenate::Path289 void WriteTypedNode(ostream& out, uint32_t offset, uint32_t& pos, PathType& type) const
290 {
291 // check if we are linear or should write a table
292 if (IsLinear()) {
293 WritePatternOrNull(out);
294 WritePackedLine(*this, out, type);
295 } else if ((paths.size() < static_cast<size_t>(maximumCP - minimumCP) / HYPHEN_BASE_CODE_SHIFT)
296 || haveNoncontiguousChildren) {
297 // Using dense table, i.e. value pairs
298 vector<uint16_t> output;
299 for (const auto& path : paths) {
300 output.push_back(path.first);
301 output.push_back(path.second.Write(out, offset));
302 }
303 pos = static_cast<uint32_t>(out.tellp()); // our header is after children data
304 type = PathType::PAIRS;
305 WritePatternOrNull(out);
306 WritePacked(output, out);
307 } else {
308 // Direct pointing, initialize full mapping table
309 vector<uint16_t> output;
310 output.resize(maximumCP - minimumCP + 1, 0);
311 if ((output.size() & 0x1) != 0) {
312 output.push_back(0); // pad
313 }
314 for (const auto& path : paths) {
315 // traverse children recursively (dfs)
316 if (path.first >= minimumCP && path.first <= maximumCP) {
317 output[path.first - minimumCP] = path.second.Write(out, offset);
318 } else {
319 cerr << " ### Encountered distinct code point 0x'" << hex << static_cast<int>(path.first) <<
320 " when writing direct array" << endl;
321 }
322 }
323 pos = static_cast<uint32_t>(out.tellp()); // children first
324 WritePatternOrNull(out); // pattern first
325 WritePacked(output, out, false); // then children table
326 }
327 }
328
WriteOHOS::Hyphenate::Path329 uint16_t Write(ostream& out, uint32_t offset = 0, uint32_t* endPos = nullptr) const
330 {
331 if (HasPattern() && paths.size() == 0) { // currently only leafs are shared
332 // if we have a shared leaf for shared pattern, use it
333 if (auto ite = g_allRules[*pattern].uniqLeafs.find(code); ite != g_allRules[*pattern].uniqLeafs.cend()) {
334 if (ite->second.offset != 0) {
335 return ite->second.offset;
336 }
337 }
338 }
339
340 PathType type = PathType::DIRECT;
341 uint32_t pos = static_cast<uint32_t>(out.tellp());
342 uint32_t oPos = pos;
343
344 WriteTypedNode(out, offset, pos, type);
345
346 // return overall offset in 16bit
347 CheckThatDataFits(pos, offset, out, type, oPos);
348 if (endPos) {
349 *endPos = static_cast<uint32_t>(out.tellp()) >> 1;
350 }
351 return (((pos >> 1) - offset) | (static_cast<uint32_t>(type) << SHIFT_BITS_14));
352 }
353
CheckThatDataFitsOHOS::Hyphenate::Path354 void CheckThatDataFits(uint32_t& pos, uint32_t offset, ostream& out, PathType& type, uint32_t oPos) const
355 {
356 // return overall offset in 16bit
357 if (((pos >> 1) > offset) && ((pos >> 1) - offset) > 0x3fff) {
358 cerr << " ### Cannot fit offset " << hex << pos << " : " << offset
359 << " into 14 bits, dropping node" << endl;
360 out.seekp(oPos, ios_base::beg); // roll back to the beginning of this entry
361 if (!out.good()) {
362 // failing to roll back, terminate
363 cerr << "Could not roll back outfile, terminating" << endl;
364 exit(-1);
365 }
366 WritePatternOrNull(out);
367 type = PathType::PATTERN;
368 pos = static_cast<uint32_t>(out.tellp());
369 }
370 }
371
372 static size_t count;
373 static size_t leafCount;
374 static uint16_t minimumCP;
375 static uint16_t maximumCP;
376
377 uint16_t code{0};
378 map<uint16_t, Path> paths;
379 const vector<uint8_t>* pattern{nullptr};
380 bool haveNoncontiguousChildren{false};
381 };
382
383 size_t Path::count{0};
384 size_t Path::leafCount{0};
385 uint16_t Path::minimumCP = 0x7a;
386 uint16_t Path::maximumCP = 0x5f;
387
388 // Struct to hold all the patterns that end with the code.
389 struct PatternHolder {
390 uint16_t code{0};
391 map<vector<uint16_t>, vector<uint8_t>> patterns;
392 map<uint16_t, Path> paths;
393 };
394
395 struct CpRange {
396 uint16_t minimumCp{0};
397 uint16_t maximumCp{0};
398 };
399
400 struct PathOffset {
PathOffsetOHOS::Hyphenate::PathOffset401 PathOffset(uint32_t o, uint32_t e, uint16_t t, uint16_t c) : offset(o), end(e), type(t), code(c) {}
402 int32_t offset;
403 int32_t end;
404 uint32_t type;
405 uint16_t code;
406 };
407
408 struct WriteOffestsParams {
WriteOffestsParamsOHOS::Hyphenate::WriteOffestsParams409 WriteOffestsParams(vector<PathOffset> offsets, uint32_t mappingsPos, CpRange cpRange)
410 : fOffsets(offsets), fMappingsPos(mappingsPos), fCpRange(cpRange)
411 {
412 }
413 vector<PathOffset> fOffsets;
414 uint32_t fMappingsPos;
415 CpRange fCpRange;
416 uint16_t fCommonNodeOffset;
417 };
418
processSection(const string & line,map<string,vector<string>> & sections,vector<string> * & current)419 void processSection(const string& line, map<string, vector<string>>& sections, vector<string>*& current)
420 {
421 string pat;
422 for (size_t i = 1; i < line.size() && !iswspace(line[i]) && line[i] != '{'; i++) {
423 pat += line[i];
424 }
425 cout << "resolved section: " << pat << endl;
426 if (!pat.empty()) {
427 sections[pat] = vector<string>();
428 current = §ions[pat];
429 }
430 }
431
ProcessContent(const string & line,vector<string> * current)432 static void ProcessContent(const string& line, vector<string>* current)
433 {
434 string pat;
435 for (auto code : line) {
436 if (iswspace(code)) {
437 if (!pat.empty()) {
438 current->push_back(pat);
439 }
440 pat.clear();
441 continue;
442 }
443 if (code == '%') {
444 break;
445 }
446 // cout << code;
447 pat += code;
448 }
449 if (!pat.empty()) {
450 current->push_back(pat);
451 }
452 }
453
ProcessLine(const string & line,vector<string> * & current,vector<string> & uncategorized,map<string,vector<string>> & sections)454 static void ProcessLine(const string& line, vector<string>*& current, vector<string>& uncategorized,
455 map<string, vector<string>>& sections)
456 {
457 string pat;
458 if (line.empty()) {
459 return;
460 } else if (line[0] == '\\') {
461 processSection(line, sections, current);
462 } else if (line[0] == '}') {
463 current = &uncategorized;
464 } else {
465 ProcessContent(line, current);
466 }
467 }
468
ResolveSectionsFromFile(const std::string & fileName,map<string,vector<string>> & sections)469 static int32_t ResolveSectionsFromFile(const std::string& fileName, map<string, vector<string>>& sections)
470 {
471 char resolvedPath[PATH_MAX] = {0};
472 if (fileName.size() > PATH_MAX) {
473 cout << "The file name is too long" << endl;
474 return FAILED;
475 }
476 if (realpath(fileName.c_str(), resolvedPath) == nullptr) {
477 cout << "file name exception" << endl;
478 return FAILED;
479 }
480
481 ifstream input(resolvedPath);
482 if (!input.good()) {
483 cerr << "could not open '" << resolvedPath << "' for reading" << endl;
484 return FAILED;
485 }
486
487 string line;
488 vector<string> uncategorized;
489 vector<string>* current = &uncategorized;
490 while (getline(input, line)) {
491 ProcessLine(line, current, uncategorized, sections);
492 }
493
494 cout << "Uncategorized data size: " << uncategorized.size() << endl;
495 cout << "Amount of sections: " << sections.size() << endl;
496 for (const auto& section : sections) {
497 cout << " '" << section.first << "' size: " << section.second.size() << endl;
498 }
499 return SUCCEED;
500 }
501
ProcessWord(const string & wordString)502 static vector<uint16_t> ProcessWord(const string& wordString)
503 {
504 auto word = ConvertToUtf16(wordString);
505 vector<uint16_t> result;
506 bool addedBreak = false;
507 for (const auto code : word) {
508 if (code == '-') {
509 result.push_back(BREAK_FLAG);
510 addedBreak = true;
511 } else {
512 if (!addedBreak) {
513 result.push_back(NO_BREAK_FLAG);
514 }
515 result.push_back(code);
516 addedBreak = false;
517 }
518 }
519 // match exceptions in full words only
520 result.insert(result.cbegin(), '.');
521 result.push_back('.');
522 cout << "Adding exception: " << wordString << endl;
523 return result;
524 }
525
ResolvePatternsFromSections(map<string,vector<string>> & sections,vector<vector<uint16_t>> & utf16Patterns)526 static void ResolvePatternsFromSections(map<string, vector<string>>& sections, vector<vector<uint16_t>>& utf16Patterns)
527 {
528 for (const auto& pattern : sections["patterns"]) {
529 utf16Patterns.push_back(ConvertToUtf16(pattern));
530 }
531 for (const auto& word : sections["hyphenation"]) {
532 utf16Patterns.push_back(ProcessWord(word));
533 }
534 }
535
CollectLeaves(const vector<uint16_t> & pattern,uint16_t & ix)536 static void CollectLeaves(const vector<uint16_t>& pattern, uint16_t& ix)
537 {
538 for (size_t i = pattern.size(); i > 0;) {
539 if (!isdigit(pattern[--i])) {
540 ix = pattern[i];
541 break;
542 }
543 }
544 }
545
ProcessPattern(const vector<uint16_t> & pattern,vector<uint16_t> & codepoints,vector<uint8_t> & rules)546 static void ProcessPattern(const vector<uint16_t>& pattern, vector<uint16_t>& codepoints, vector<uint8_t>& rules)
547 {
548 bool addedRule = false;
549 for (size_t i = 0; i < pattern.size(); i++) {
550 uint16_t code = pattern[i];
551 if (isdigit(code)) {
552 rules.push_back(code - '0');
553 addedRule = true;
554 } else {
555 if (!addedRule) {
556 rules.push_back(0);
557 }
558 // These have been collected empirically from the existing pattern files.
559 // Remap typical distinct codepoints
560 // below 'a' to the beginning of contiguous range
561 // This same thing needs to be done in 'tolower'
562 // when parsing the results on runtime
563 if (code == '.') {
564 code = '`';
565 } else if (code == '-') {
566 code = '_';
567 } else if (code == '\'') {
568 code = '^';
569 }
570 codepoints.push_back(code);
571 addedRule = false;
572 }
573 }
574 }
575
PadRules(vector<uint8_t> & rules)576 static void PadRules(vector<uint8_t>& rules)
577 {
578 while ((rules.size() % PADDING_SIZE) != 0) {
579 if (rules.back() == 0) {
580 rules.pop_back();
581 } else {
582 break;
583 }
584 }
585 while ((rules.size() % PADDING_SIZE) != 0) {
586 rules.push_back(0);
587 }
588 }
589
ResolveLeavesFromPatterns(const vector<vector<uint16_t>> & utf16Patterns,map<uint16_t,PatternHolder> & leaves)590 void ResolveLeavesFromPatterns(const vector<vector<uint16_t>>& utf16Patterns, map<uint16_t, PatternHolder>& leaves)
591 {
592 for (const auto& pattern : utf16Patterns) {
593 uint16_t ix{0};
594 CollectLeaves(pattern, ix);
595 if (ix == 0) {
596 continue;
597 }
598
599 if (leaves.find(ix) == leaves.end()) {
600 leaves[ix] = {PatternHolder()};
601 }
602
603 vector<uint16_t> codepoints;
604 vector<uint8_t> rules;
605 ProcessPattern(pattern, codepoints, rules);
606
607 leaves[ix].code = ix;
608 if (leaves[ix].patterns.find(codepoints) != leaves[ix].patterns.cend()) {
609 cerr << "### Multiple definitions for pattern with size: " << codepoints.size() << endl;
610 cerr << "###";
611 for (auto codepoint : codepoints) {
612 cerr << " 0x" << hex << static_cast<int>(codepoint);
613 }
614 cerr << endl;
615 }
616
617 PadRules(rules);
618 leaves[ix].patterns[codepoints] = rules;
619 // collect a list of unique rules
620 if (auto it = OHOS::Hyphenate::g_allRules.find(rules); it != OHOS::Hyphenate::g_allRules.end()) {
621 it->second.patterns[ix].push_back(codepoints);
622 } else {
623 OHOS::Hyphenate::g_allRules[rules] = Rule();
624 Hyphenate::g_allRules[rules].patterns[ix].push_back(codepoints);
625 }
626 }
627
628 cout << "leaves: " << leaves.size() << endl;
629 cout << "unique rules: " << OHOS::Hyphenate::g_allRules.size() << endl;
630 }
631
BreakLeavesIntoPaths(map<uint16_t,PatternHolder> & leaves,CpRange & range,int & countPat)632 static void BreakLeavesIntoPaths(map<uint16_t, PatternHolder>& leaves, CpRange& range, int& countPat)
633 {
634 bool printCounts = true;
635 // break leave information to Path instances
636 for (auto& leave : leaves) {
637 cout << " '" << char(leave.first) << "' rootsize: " << leave.second.patterns.size() << endl;
638 for (const auto& pat : leave.second.patterns) {
639 if (auto ite = leave.second.paths.find(pat.first[pat.first.size() - 1]); ite != leave.second.paths.end()) {
640 ite->second.Process(pat.first, pat.first.size() - 1, &pat.second);
641 } else {
642 leave.second.paths.emplace(pat.first[pat.first.size() - 1], Path(pat.first, &pat.second));
643 }
644 #ifdef VERBOSE_PATTERNS
645 cout << " '";
646 for (const auto& digit : pat.first) {
647 cout << "'0x" << hex << static_cast<int>(digit) << "' ";
648 }
649 cout << "' size: " << pat.second.size() << endl;
650 cout << " ";
651 #endif
652 for (const auto& digit : pat.second) {
653 (void)digit;
654 countPat++;
655 #ifdef VERBOSE_PATTERNS
656 cout << "'" << to_string(digit) << "' ";
657 }
658 cout << endl;
659 #else
660 }
661 #endif
662 }
663
664 // collect some stats
665 for (auto path : leave.second.paths) {
666 if (printCounts) {
667 cout << "leafs-nodes: " << path.second.leafCount << " / " << path.second.count << endl;
668 cout << "min-max: " << path.second.minimumCP << " / " << path.second.maximumCP << endl;
669 range.minimumCp = path.second.minimumCP;
670 range.maximumCp = path.second.maximumCP;
671 break;
672 }
673 path.second.Print(HYPHEN_DEFAULT_INDENT);
674 }
675 }
676 }
677
678 const size_t FULL_TALBLE = 4;
679
InitOutFileHead(ofstream & out)680 static uint32_t InitOutFileHead(ofstream& out)
681 {
682 // reserve space for:
683 // - header
684 // - main toc. and
685 // - mapping array for large code points
686 // - version
687 for (size_t i = FULL_TALBLE; i != 0; i--) {
688 uint32_t bytes{0};
689 out.write(reinterpret_cast<const char*>(&bytes), sizeof(bytes));
690 }
691 return FULL_TALBLE * 2; // return 2 multiple talble size, check this number
692 }
693
FormatOutFileHead(ofstream & out,const WriteOffestsParams & params,const uint32_t toc)694 static int32_t FormatOutFileHead(ofstream& out, const WriteOffestsParams& params, const uint32_t toc)
695 {
696 out.seekp(ios::beg); // roll back to the beginning
697 if (!out.good()) {
698 cerr << "failed to write toc" << endl;
699 return FAILED;
700 }
701
702 // very minimalistic magic, perhaps more would be in order including
703 // possible version number
704 uint32_t header = ('H' | ('H' << 8) | (params.fCpRange.minimumCp << 16) | (params.fCpRange.maximumCp << 24));
705 // write header
706 out.write(reinterpret_cast<const char*>(&header), sizeof(header));
707 // write toc
708 out.write(reinterpret_cast<const char*>(&toc), sizeof(toc));
709 // write mappings
710 out.write(reinterpret_cast<const char*>(¶ms.fMappingsPos), sizeof(params.fMappingsPos));
711 // write binary version 8 top bits, using the lower 24 bits for common node offset without
712 // needing to increase header size overall offset on the binary file
713 // we may want to change this at some point
714 const uint32_t version = (0x2 << 0x18) | params.fCommonNodeOffset;
715 out.write(reinterpret_cast<const char*>(&version), sizeof(version));
716
717 return SUCCEED;
718 }
719
ProcessUniqueRule(std::pair<const vector<uint8_t>,Rule> & uniqueRule)720 void ProcessUniqueRule(std::pair<const vector<uint8_t>, Rule>& uniqueRule)
721 {
722 for (auto ite : uniqueRule.second.patterns) {
723 for (auto rule : ite.second) {
724 if (!uniqueRule.second.uniqLeafs.count(*rule.cbegin())) {
725 uniqueRule.second.uniqLeafs[*rule.cbegin()] = {0, 0};
726 }
727 }
728 }
729 }
730
WriteUniqueRules(ofstream & out)731 void WriteUniqueRules(ofstream& out)
732 {
733 for (auto& uniqueRule : OHOS::Hyphenate::g_allRules) {
734 uint32_t pos = static_cast<uint32_t>(out.tellp());
735 uint16_t size = Path::WritePacked(uniqueRule.first, out, false) / 0x4; // save bits by padding size
736 uniqueRule.second.offset = (size << 0xc) | pos;
737 ProcessUniqueRule(uniqueRule);
738 if ((pos >> 0xc) != 0) {
739 cerr << "PATTERNS: RUNNING OUT OF ADDRESS SPACE, file a bug" << endl;
740 exit(-1);
741 }
742 }
743 }
744
WriteSharedLeafs(ofstream & out,uint16_t & pos,uint32_t & end)745 void WriteSharedLeafs(ofstream& out, uint16_t& pos, uint32_t& end)
746 {
747 for (auto& uniqueRule : OHOS::Hyphenate::g_allRules) {
748 cout << "###### UniqueRule with " << uniqueRule.second.patterns.size() << " leaves" << endl;
749 for (auto& sharedLeaf : uniqueRule.second.uniqLeafs) {
750 if (sharedLeaf.second.usecount > 0) {
751 Path path({sharedLeaf.first}, &uniqueRule.first);
752 sharedLeaf.second.offset = path.Write(out, pos, &end);
753 cout << "found unique " << hex << static_cast<int>(sharedLeaf.first) <<
754 " wrote: '" << sharedLeaf.second.offset << "' " << endl;
755 }
756 }
757 }
758 }
759
CheckSharedLeaves(ofstream & out,map<uint16_t,PatternHolder> & leaves)760 uint16_t CheckSharedLeaves(ofstream& out, map<uint16_t, PatternHolder>& leaves)
761 {
762 // check how many of the unique rules remain valid once all the rules are combined
763 for (auto& leave : leaves) {
764 for (auto& path : leave.second.paths) {
765 path.second.FindSharedLeaves();
766 }
767 }
768 uint32_t end{0};
769 if ((out.tellp() % 1) != 0) {
770 out.write(reinterpret_cast<const char*>(&end), 1);
771 }
772 uint16_t pos = static_cast<uint16_t>(out.tellp()) >> 1;
773 cout << "NOW THIS IS PURE MAGIC NUMBER FOR NOW: " << hex << pos << endl;
774 // pad first offset with 16bit zero to make empty patterns ignore the zero offset
775 out.write(reinterpret_cast<const char*>(&end), 2);
776 WriteSharedLeafs(out, pos, end);
777 return pos;
778 }
779
WriteLeavePathsToOutFile(map<uint16_t,PatternHolder> & leaves,const CpRange & range,ofstream & out,uint32_t & tableOffset,vector<PathOffset> & offsets)780 static bool WriteLeavePathsToOutFile(map<uint16_t, PatternHolder>& leaves, const CpRange& range, ofstream& out,
781 uint32_t& tableOffset, vector<PathOffset>& offsets)
782 {
783 // unique rules have no offset
784 WriteUniqueRules(out);
785 // shared nodes offset needs to be stored to header
786 auto sharedOffset = CheckSharedLeaves(out, leaves);
787
788 vector<Path*> bigOnes;
789 bool hasDirect{false};
790 for (auto& leave : leaves) {
791 for (auto& path : leave.second.paths) {
792 if (path.first < range.minimumCp || path.first > range.maximumCp) {
793 bigOnes.push_back(&path.second);
794 continue;
795 }
796 uint32_t end{0};
797 uint16_t value = path.second.Write(out, tableOffset, &end);
798 uint16_t offset = value & 0x3fff;
799 uint32_t type = value & 0x0000c000;
800 uint16_t code = path.first;
801 cout << "direct:" << hex << static_cast<int>(code) << ": " << tableOffset << " : " << end << " type " <<
802 type << endl;
803 tableOffset = end;
804 offsets.push_back(PathOffset(offset, end, type, code));
805 hasDirect = true;
806 }
807 }
808
809 // write distinc code points array after the direct ones
810 for (auto path : bigOnes) {
811 uint32_t end{0};
812 uint16_t value = path->Write(out, tableOffset, &end);
813 uint16_t offset = value & 0x3fff;
814 uint32_t type = value & 0x0000c000;
815 uint16_t code = path->code;
816 cout << "distinct: 0x" << hex << static_cast<int>(code) << ": " << hex << tableOffset << " : " << end <<
817 " type " << type << dec << endl;
818 tableOffset = end;
819 offsets.push_back(PathOffset(offset, end, type, code));
820 }
821
822 offsets.push_back(PathOffset(sharedOffset, 0, 0, 0));
823 return hasDirect;
824 }
825
ProcessDirectPointingValues(std::vector<PathOffset>::const_iterator & lastEffectiveIterator,std::ofstream & out,WriteOffestsParams & params,uint32_t & currentEnd,bool hasDirect)826 void ProcessDirectPointingValues(std::vector<PathOffset>::const_iterator& lastEffectiveIterator, std::ofstream& out,
827 WriteOffestsParams& params, uint32_t& currentEnd, bool hasDirect)
828 {
829 for (size_t i = params.fCpRange.minimumCp; i <= params.fCpRange.maximumCp; i++) {
830 auto iterator = params.fOffsets.cbegin();
831 while (iterator != params.fOffsets.cend()) {
832 if (iterator->code == i) {
833 break;
834 }
835 iterator++;
836 }
837 if (iterator == params.fOffsets.cend()) {
838 if (!hasDirect) {
839 break;
840 }
841 uint32_t dummy{0};
842 Path::WritePacked(dummy, out);
843 Path::WritePacked(currentEnd, out);
844 std::cout << "Direct: padded " << std::endl;
845 continue;
846 }
847 lastEffectiveIterator = iterator;
848 uint32_t type = static_cast<uint32_t>(iterator->type);
849 uint32_t bytes = static_cast<uint32_t>(iterator->offset) | type << 16;
850 currentEnd = iterator->end;
851 std::cout << "Direct: " << std::hex << "o: 0x" << iterator->offset << " e: 0x" << iterator->end << " t: 0x" <<
852 type << " c: 0x" << bytes << std::endl;
853 Path::WritePacked(bytes, out);
854 Path::WritePacked(currentEnd, out);
855 }
856 }
857
ProcessDistinctCodepoints(std::vector<PathOffset>::const_iterator & lastEffectiveIterator,std::ofstream & out,WriteOffestsParams & params,std::vector<uint16_t> & mappings,uint32_t & currentEnd)858 void ProcessDistinctCodepoints(std::vector<PathOffset>::const_iterator& lastEffectiveIterator, std::ofstream& out,
859 WriteOffestsParams& params, std::vector<uint16_t>& mappings, uint32_t& currentEnd)
860 {
861 auto pos = params.fCpRange.maximumCp;
862 if (params.fCpRange.maximumCp != 0) {
863 pos++;
864 }
865 if (lastEffectiveIterator != params.fOffsets.cbegin()) {
866 ++lastEffectiveIterator;
867 }
868 while (lastEffectiveIterator != params.fOffsets.cend()) {
869 mappings.push_back(lastEffectiveIterator->code);
870 mappings.push_back(pos++);
871 uint32_t type = static_cast<uint32_t>(lastEffectiveIterator->type);
872 uint32_t bytes = static_cast<uint32_t>(lastEffectiveIterator->offset) | type << 16;
873 currentEnd = lastEffectiveIterator->end;
874 std::cout << "Distinct: " << std::hex << "code: 0x" << static_cast<int>(lastEffectiveIterator->code) <<
875 " o: 0x" << lastEffectiveIterator->offset << " e: 0x" << lastEffectiveIterator->end << " t: " << type <<
876 " c: 0x" << bytes << std::endl;
877 Path::WritePacked(bytes, out);
878 Path::WritePacked(currentEnd, out);
879 ++lastEffectiveIterator;
880 }
881 }
882
WriteOffestsToOutFile(ofstream & out,WriteOffestsParams & params,uint32_t currentEnd,bool hasDirect)883 static void WriteOffestsToOutFile(ofstream& out, WriteOffestsParams& params, uint32_t currentEnd, bool hasDirect)
884 {
885 if (!params.fOffsets.empty() && params.fOffsets.rbegin()->code == 0) {
886 params.fCommonNodeOffset = params.fOffsets.rbegin()->offset;
887 params.fOffsets.pop_back();
888 }
889
890 auto lastEffectiveIterator = params.fOffsets.cbegin();
891 vector<uint16_t> mappings;
892
893 ProcessDirectPointingValues(lastEffectiveIterator, out, params, currentEnd, hasDirect);
894 // If we don't have direct code points, mapped ones will have to be differently
895 // handled
896 if (!hasDirect) {
897 params.fCpRange.minimumCp = 0;
898 params.fCpRange.maximumCp = 0;
899 }
900
901 if (lastEffectiveIterator != params.fOffsets.cend()) {
902 // distinct codepoints that cannot be addressed by flat array index
903 ProcessDistinctCodepoints(lastEffectiveIterator, out, params, mappings, currentEnd);
904 }
905
906 params.fMappingsPos = static_cast<uint32_t>(out.tellp());
907
908 if (!mappings.empty()) {
909 Path::WritePacked(mappings, out);
910 } else {
911 uint32_t dummy{0};
912 Path::WritePacked(dummy, out);
913 }
914 }
915
GetFileNameWithoutSuffix(const std::string & filePath)916 std::string GetFileNameWithoutSuffix(const std::string& filePath)
917 {
918 size_t lastSlashPos = filePath.find_last_of("/\\");
919 size_t lastDotPos = filePath.find_last_of(".");
920 std::string fileName = filePath.substr(lastSlashPos + 1, lastDotPos - lastSlashPos - 1);
921 return fileName;
922 }
923
CreateDirectory(const std::string & folderPath)924 void CreateDirectory(const std::string& folderPath)
925 {
926 if (mkdir(folderPath.c_str(), 0755) == 0) { // 0755 means the owner has read, write, and execute permissions,
927 std::cout << "Directory created successfully: " << folderPath << std::endl;
928 } else {
929 std::cout << "Directory already exists: " << folderPath << std::endl;
930 }
931 }
932
Proccess(const std::string & filePath,const std::string & outFilePath) const933 void HyphenProcessor::Proccess(const std::string& filePath, const std::string& outFilePath) const
934 {
935 map<string, vector<string>> sections;
936 if (ResolveSectionsFromFile(filePath, sections) != SUCCEED) {
937 return;
938 }
939
940 char resolvedPath[PATH_MAX] = {0};
941 if (outFilePath.size() > PATH_MAX) {
942 cout << "The file name is too long" << endl;
943 return;
944 }
945 if (realpath(outFilePath.c_str(), resolvedPath) == nullptr) {
946 CreateDirectory(resolvedPath);
947 }
948
949 vector<vector<uint16_t>> utf16Patterns;
950 ResolvePatternsFromSections(sections, utf16Patterns);
951
952 map<uint16_t, PatternHolder> leaves;
953 ResolveLeavesFromPatterns(utf16Patterns, leaves);
954
955 CpRange range = {0, 0};
956 int countPat = 0;
957 BreakLeavesIntoPaths(leaves, range, countPat);
958
959 string filename = GetFileNameWithoutSuffix(filePath);
960 std::cout << "output file: " << (outFilePath + "/" + filename + ".hpb") << std::endl;
961 ofstream out((outFilePath + "/" + filename + ".hpb"), ios::binary);
962 uint32_t tableOffset = InitOutFileHead(out);
963 vector<PathOffset> offsets;
964 uint32_t toc = 0;
965
966 bool hasDirect = WriteLeavePathsToOutFile(leaves, range, out, tableOffset, offsets);
967 toc = static_cast<uint32_t>(out.tellp());
968 if ((toc % 0x4) != 0) {
969 out.write(reinterpret_cast<const char*>(&toc), toc % 0x4);
970 toc = static_cast<uint32_t>(out.tellp());
971 }
972 // and main table offsets
973 cout << "Produced " << offsets.size() << " paths with z: " << toc << endl;
974
975 uint32_t currentEnd = FULL_TALBLE * 2; // initial offset (in 16 bits)
976 Path::WritePacked(currentEnd, out);
977
978 uint32_t mappingsPos = 0;
979 WriteOffestsParams writeOffestsParams(offsets, mappingsPos, range);
980 WriteOffestsToOutFile(out, writeOffestsParams, currentEnd, hasDirect);
981 if (FormatOutFileHead(out, writeOffestsParams, toc) != SUCCEED) {
982 cout << "DONE: With " << to_string(countPat) << "patterns (8bit)" << endl;
983 }
984 }
985 } // namespace OHOS::Hyphenate
986
main(int argc,char ** argv)987 int main(int argc, char** argv)
988 {
989 if (argc != 3) { // 3: valid argument number
990 cout << "usage: './transform hyph-en-us.tex ./out/'" << endl;
991 return FAILED;
992 }
993
994 // open output
995 string filePath = argv[1];
996 string outFilePath = argv[2];
997
998 OHOS::Hyphenate::HyphenProcessor hyphenProcessor;
999 hyphenProcessor.Proccess(filePath, outFilePath);
1000
1001 return SUCCEED;
1002 }
1003