• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "hyphen_pattern.h"
17 
18 #include <algorithm>
19 #include <cstddef>
20 #include <climits>
21 #include <fstream>
22 #include <iostream>
23 #include <sys/stat.h>
24 #include <sys/types.h>
25 #include <map>
26 #include <unicode/utf.h>
27 #include <unicode/utf8.h>
28 
29 using namespace std;
30 
31 // to enable more information on development time
32 // define VERBOSE_PATTERNS
33 
34 namespace OHOS::Hyphenate {
35 // upper limit for direct pointing arrays
36 #define MAXIMUM_DIRECT_CODE_POINT 0x7a
37 
38 struct Leaf {
39     uint16_t offset{0};
40     uint16_t usecount{0};
41 };
42 
43 struct Rule {
44     uint16_t offset{0};
45     map<uint16_t, vector<vector<uint16_t>>> patterns;
46     map<uint16_t, Leaf> uniqLeafs;
47 };
48 
49 static map<vector<uint8_t>, Rule> g_allRules;
50 
ConvertToUtf16(const string & utf8Str)51 vector<uint16_t> ConvertToUtf16(const string& utf8Str)
52 {
53     int32_t i = 0;
54     UChar32 c = 0;
55     vector<uint16_t> target;
56     const int32_t textLength = static_cast<int32_t>(utf8Str.size());
57     while (i < textLength) {
58         U8_NEXT(reinterpret_cast<const uint8_t*>(utf8Str.c_str()), i, textLength, c);
59         if (U16_LENGTH(c) == 1) {
60             target.push_back(c);
61         } else {
62             target.push_back(U16_LEAD(static_cast<uint32_t>(c)));
63             target.push_back(U16_TRAIL(static_cast<uint32_t>(c)));
64         }
65     }
66     return target;
67 }
68 
69 // Recursive path implementation.
70 // Collects static information and the leafs that provide access to patterns
71 // The implementation is reversed to pattern code point order; end to beginning of pattern;
72 struct Path {
PathOHOS::Hyphenate::Path73     explicit Path(const vector<uint16_t>& path, const vector<uint8_t>* pat)
74     {
75         count++;
76         size_t targetIndex = path.size();
77         if (targetIndex > 0) {
78             code = path[--targetIndex];
79         }
80         if ((code <= MAXIMUM_DIRECT_CODE_POINT)) {
81             maximumCP = max(maximumCP, code);
82             minimumCP = min(minimumCP, code);
83         }
84 
85         // Process children recursively
86         if (targetIndex > 0) {
87             Process(path, targetIndex, pat);
88         } else {
89             // store pattern to leafs
90             pattern = pat;
91             leafCount++;
92         }
93     }
94 
ProcessOHOS::Hyphenate::Path95     void Process(const vector<uint16_t>& path, size_t targetIndex, const vector<uint8_t>* pat)
96     {
97         if (targetIndex == 0) {
98             pattern = pat;
99             return;
100         }
101 
102         uint16_t key = path[--targetIndex];
103         if (auto ite = paths.find(key); ite != paths.end()) {
104             ite->second.Process(path, targetIndex, pat);
105         } else {
106             if (key > MAXIMUM_DIRECT_CODE_POINT) {
107                 // if we have direct children with distinct code points, we need to use
108                 // value pairs
109                 haveNoncontiguousChildren = true;
110             }
111             vector<uint16_t> substr(path.cbegin(), path.cbegin() + targetIndex + 1);
112             // recurse
113             paths.emplace(key, Path(substr, pat));
114         }
115     }
116 
117     // The graph is built using pattern end characters
118     // while the rules may have different leaf nodes
119     // Check the dictionary terminated graph for unified rules and leafs
FindSharedLeavesOHOS::Hyphenate::Path120     void FindSharedLeaves()
121     {
122         if (paths.size() != 0) {
123             for (auto& path : paths) {
124                 path.second.FindSharedLeaves();
125             }
126         } else if (g_allRules.count(*pattern) != 0) {
127             auto ite = g_allRules[*pattern].uniqLeafs.find(code);
128             if (ite != g_allRules[*pattern].uniqLeafs.cend()) {
129                 ite->second.usecount += 1;
130             }
131         }
132     }
133 
134     // Once this node is reached, we can access pattern
135     // however traversing further may be needed
HasPatternOHOS::Hyphenate::Path136     bool HasPattern() const { return pattern != nullptr; }
137 
138     // This instance of Path and its children implement a straight path without ambquity.
139     // No need to traverse through tables to reach pattern.
140     // Calculate the depth of the graph.
IsLinearOHOS::Hyphenate::Path141     bool IsLinear() const
142     {
143         if (paths.size() == 0) {
144             return true;
145         } else if (paths.size() == 1) {
146             return paths.begin()->second.IsLinear();
147         }
148         return false;
149     }
150 
151     // debug print misc info
PrintOHOS::Hyphenate::Path152     void Print(size_t indent) const
153     {
154 #ifdef VERBOSE_PATTERNS
155         indent += HYPHEN_INDENT_INCREMENT;
156         for (size_t i = 0; i < indent; i++) {
157             cout << " ";
158         }
159         if (indent == ROOT_INDENT) {
160             cout << char(code) << "rootsize***: " << paths.size();
161         } else {
162             cout << char(code) << "***: " << paths.size();
163         }
164         if (paths.size() >= LARGE_PATH_SIZE)
165             cout << " LARGE";
166         else if (IsLinear()) {
167             cout << " LINEAR";
168         } else {
169             cout << " @@@";
170         }
171 
172         cout << endl;
173         if (paths.size() == 0) {
174             return;
175         }
176         for (auto path : paths) {
177             path.second.Print(indent);
178         }
179         cout << endl;
180 #endif
181     }
182 
WritePackedOHOS::Hyphenate::Path183     static void WritePacked(vector<uint16_t>& data, ostream& out, bool writeCount = true)
184     {
185         uint16_t size = data.size();
186         if (writeCount) {
187             out.write(reinterpret_cast<const char*>(&size), sizeof(size));
188         }
189 
190         for (size_t i = 0; i < data.size(); i++) {
191             uint16_t bytes = data[i];
192             out.write(reinterpret_cast<const char*>(&bytes), sizeof(bytes));
193         }
194     }
195 
WritePackedOHOS::Hyphenate::Path196     static uint16_t WritePacked(const vector<uint8_t>& data, ostream& out, bool writeSize = true)
197     {
198         constexpr size_t ALIGN_4BYTES = 0x03;
199         uint16_t size = data.size();
200         if (writeSize) {
201             out.write(reinterpret_cast<const char*>(&size), sizeof(size));
202         }
203 
204         if ((data.size() & ALIGN_4BYTES) != 0) {
205             cerr << "### uint8_t vectors should be aligned in 4 bytes !!!" << endl;
206             size = size & ~ALIGN_4BYTES;
207         }
208 
209         /* convert uint8 to uint32 */
210         for (size_t i = 0; i < size; i += BYTES_PRE_WORD) {
211             uint32_t bytes = data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24);
212             out.write(reinterpret_cast<const char*>(&bytes), sizeof(bytes));
213         }
214         return size;
215     }
216 
217     // no need to twiddle the bytes or words currently
WritePackedOHOS::Hyphenate::Path218     static void WritePacked(uint32_t word, ostream& out)
219     {
220         out.write(reinterpret_cast<const char*>(&word), sizeof(word));
221     }
222 
223     // We make assumption that 14 bytes is enough to represent offset
224     // so we get two first bits in the array for path type
225     // we have two bytes on the offset arrays
226     // for these
227     enum class PathType : uint8_t { PATTERN = 0, LINEAR = 1, PAIRS = 2, DIRECT = 3 };
228 
WritePackedLineOHOS::Hyphenate::Path229     static void WritePackedLine(const Path& pathSrc, ostream& out, PathType& type)
230     {
231         bool wroteSomething{false};
232         vector<uint16_t> output;
233 
234         // we do NOT need to write local pattern if we don't have children
235         if (pathSrc.paths.empty()) {
236             type = PathType::PATTERN;
237             return;
238         }
239 
240         type = PathType::LINEAR;
241         auto ite = pathSrc.paths.cbegin();
242         const auto* path = &(ite->second);
243         auto localPattern = path->pattern;
244         output.push_back(path->code);
245 
246         while (path) {
247             if (localPattern) {
248                 // if we have children, they need to be checked when collecting rules
249                 if (output.size() > 0) {
250                     WritePacked(output, out);
251                 }
252                 path->WritePatternOrNull(out);
253                 output.clear();
254                 localPattern = nullptr;
255                 wroteSomething = true;
256             } else {
257                 // traverse further
258                 if (!path->paths.empty()) {
259                     auto itr = path->paths.cbegin();
260                     path = &(itr->second);
261                     localPattern = path->pattern;
262                     output.push_back(path->code);
263                 } else {
264                     break;
265                 }
266             }
267         }
268         if (!wroteSomething) {
269             cerr << "Did not write anything linear" << endl;
270             type = PathType::PATTERN;
271         } else {
272             // mark array end so that reader knows when to stop recursing
273             uint16_t size = 0;
274             out.write(reinterpret_cast<const char*>(&size), sizeof(size));
275         }
276     }
277 
WritePatternOrNullOHOS::Hyphenate::Path278     void WritePatternOrNull(ostream& out) const
279     {
280         uint16_t size = 0;
281         if (HasPattern()) {
282             auto ite = g_allRules.find(*pattern);
283             size = ite->second.offset;
284         }
285 
286         out.write(reinterpret_cast<const char*>(&size), sizeof(size));
287     }
288 
WriteTypedNodeOHOS::Hyphenate::Path289     void WriteTypedNode(ostream& out, uint32_t offset, uint32_t& pos, PathType& type) const
290     {
291         // check if we are linear or should write a table
292         if (IsLinear()) {
293             WritePatternOrNull(out);
294             WritePackedLine(*this, out, type);
295         } else if ((paths.size() < static_cast<size_t>(maximumCP - minimumCP) / HYPHEN_BASE_CODE_SHIFT)
296                    || haveNoncontiguousChildren) {
297             // Using dense table, i.e. value pairs
298             vector<uint16_t> output;
299             for (const auto& path : paths) {
300                 output.push_back(path.first);
301                 output.push_back(path.second.Write(out, offset));
302             }
303             pos = static_cast<uint32_t>(out.tellp()); // our header is after children data
304             type = PathType::PAIRS;
305             WritePatternOrNull(out);
306             WritePacked(output, out);
307         } else {
308             // Direct pointing, initialize full mapping table
309             vector<uint16_t> output;
310             output.resize(maximumCP - minimumCP + 1, 0);
311             if ((output.size() & 0x1) != 0) {
312                 output.push_back(0); // pad
313             }
314             for (const auto& path : paths) {
315                 // traverse children recursively (dfs)
316                 if (path.first >= minimumCP && path.first <= maximumCP) {
317                     output[path.first - minimumCP] = path.second.Write(out, offset);
318                 } else {
319                     cerr << " ### Encountered distinct code point 0x'" << hex << static_cast<int>(path.first) <<
320                         " when writing direct array" << endl;
321                 }
322             }
323             pos = static_cast<uint32_t>(out.tellp()); // children first
324             WritePatternOrNull(out);                  // pattern first
325             WritePacked(output, out, false);          // then children table
326         }
327     }
328 
WriteOHOS::Hyphenate::Path329     uint16_t Write(ostream& out, uint32_t offset = 0, uint32_t* endPos = nullptr) const
330     {
331         if (HasPattern() && paths.size() == 0) { // currently only leafs are shared
332             // if we have a shared leaf for shared pattern, use it
333             if (auto ite = g_allRules[*pattern].uniqLeafs.find(code); ite != g_allRules[*pattern].uniqLeafs.cend()) {
334                 if (ite->second.offset != 0) {
335                     return ite->second.offset;
336                 }
337             }
338         }
339 
340         PathType type = PathType::DIRECT;
341         uint32_t pos = static_cast<uint32_t>(out.tellp());
342         uint32_t oPos = pos;
343 
344         WriteTypedNode(out, offset, pos, type);
345 
346         // return overall offset in 16bit
347         CheckThatDataFits(pos, offset, out, type, oPos);
348         if (endPos) {
349             *endPos = static_cast<uint32_t>(out.tellp()) >> 1;
350         }
351         return (((pos >> 1) - offset) | (static_cast<uint32_t>(type) << SHIFT_BITS_14));
352     }
353 
CheckThatDataFitsOHOS::Hyphenate::Path354     void CheckThatDataFits(uint32_t& pos, uint32_t offset, ostream& out, PathType& type, uint32_t oPos) const
355     {
356         // return overall offset in 16bit
357         if (((pos >> 1) > offset) && ((pos >> 1) - offset) > 0x3fff) {
358             cerr << " ### Cannot fit offset " << hex << pos << " : " << offset
359                  << " into 14 bits, dropping node" << endl;
360             out.seekp(oPos, ios_base::beg); // roll back to the beginning of this entry
361             if (!out.good()) {
362                 // failing to roll back, terminate
363                 cerr << "Could not roll back outfile, terminating" << endl;
364                 exit(-1);
365             }
366             WritePatternOrNull(out);
367             type = PathType::PATTERN;
368             pos = static_cast<uint32_t>(out.tellp());
369         }
370     }
371 
372     static size_t count;
373     static size_t leafCount;
374     static uint16_t minimumCP;
375     static uint16_t maximumCP;
376 
377     uint16_t code{0};
378     map<uint16_t, Path> paths;
379     const vector<uint8_t>* pattern{nullptr};
380     bool haveNoncontiguousChildren{false};
381 };
382 
383 size_t Path::count{0};
384 size_t Path::leafCount{0};
385 uint16_t Path::minimumCP = 0x7a;
386 uint16_t Path::maximumCP = 0x5f;
387 
388 // Struct to hold all the patterns that end with the code.
389 struct PatternHolder {
390     uint16_t code{0};
391     map<vector<uint16_t>, vector<uint8_t>> patterns;
392     map<uint16_t, Path> paths;
393 };
394 
395 struct CpRange {
396     uint16_t minimumCp{0};
397     uint16_t maximumCp{0};
398 };
399 
400 struct PathOffset {
PathOffsetOHOS::Hyphenate::PathOffset401     PathOffset(uint32_t o, uint32_t e, uint16_t t, uint16_t c) : offset(o), end(e), type(t), code(c) {}
402     int32_t offset;
403     int32_t end;
404     uint32_t type;
405     uint16_t code;
406 };
407 
408 struct WriteOffestsParams {
WriteOffestsParamsOHOS::Hyphenate::WriteOffestsParams409     WriteOffestsParams(vector<PathOffset> offsets, uint32_t mappingsPos, CpRange cpRange)
410         : fOffsets(offsets), fMappingsPos(mappingsPos), fCpRange(cpRange)
411     {
412     }
413     vector<PathOffset> fOffsets;
414     uint32_t fMappingsPos;
415     CpRange fCpRange;
416     uint16_t fCommonNodeOffset;
417 };
418 
processSection(const string & line,map<string,vector<string>> & sections,vector<string> * & current)419 void processSection(const string& line, map<string, vector<string>>& sections, vector<string>*& current)
420 {
421     string pat;
422     for (size_t i = 1; i < line.size() && !iswspace(line[i]) && line[i] != '{'; i++) {
423         pat += line[i];
424     }
425     cout << "resolved section: " << pat << endl;
426     if (!pat.empty()) {
427         sections[pat] = vector<string>();
428         current = &sections[pat];
429     }
430 }
431 
ProcessContent(const string & line,vector<string> * current)432 static void ProcessContent(const string& line, vector<string>* current)
433 {
434     string pat;
435     for (auto code : line) {
436         if (iswspace(code)) {
437             if (!pat.empty()) {
438                 current->push_back(pat);
439             }
440             pat.clear();
441             continue;
442         }
443         if (code == '%') {
444             break;
445         }
446         // cout << code;
447         pat += code;
448     }
449     if (!pat.empty()) {
450         current->push_back(pat);
451     }
452 }
453 
ProcessLine(const string & line,vector<string> * & current,vector<string> & uncategorized,map<string,vector<string>> & sections)454 static void ProcessLine(const string& line, vector<string>*& current, vector<string>& uncategorized,
455                         map<string, vector<string>>& sections)
456 {
457     string pat;
458     if (line.empty()) {
459         return;
460     } else if (line[0] == '\\') {
461         processSection(line, sections, current);
462     } else if (line[0] == '}') {
463         current = &uncategorized;
464     } else {
465         ProcessContent(line, current);
466     }
467 }
468 
ResolveSectionsFromFile(const std::string & fileName,map<string,vector<string>> & sections)469 static int32_t ResolveSectionsFromFile(const std::string& fileName, map<string, vector<string>>& sections)
470 {
471     char resolvedPath[PATH_MAX] = {0};
472     if (fileName.size() > PATH_MAX) {
473         cout << "The file name is too long" << endl;
474         return FAILED;
475     }
476     if (realpath(fileName.c_str(), resolvedPath) == nullptr) {
477         cout << "file name exception" << endl;
478         return FAILED;
479     }
480 
481     ifstream input(resolvedPath);
482     if (!input.good()) {
483         cerr << "could not open '" << resolvedPath << "' for reading" << endl;
484         return FAILED;
485     }
486 
487     string line;
488     vector<string> uncategorized;
489     vector<string>* current = &uncategorized;
490     while (getline(input, line)) {
491         ProcessLine(line, current, uncategorized, sections);
492     }
493 
494     cout << "Uncategorized data size: " << uncategorized.size() << endl;
495     cout << "Amount of sections: " << sections.size() << endl;
496     for (const auto& section : sections) {
497         cout << "  '" << section.first << "' size: " << section.second.size() << endl;
498     }
499     return SUCCEED;
500 }
501 
ProcessWord(const string & wordString)502 static vector<uint16_t> ProcessWord(const string& wordString)
503 {
504     auto word = ConvertToUtf16(wordString);
505     vector<uint16_t> result;
506     bool addedBreak = false;
507     for (const auto code : word) {
508         if (code == '-') {
509             result.push_back(BREAK_FLAG);
510             addedBreak = true;
511         } else {
512             if (!addedBreak) {
513                 result.push_back(NO_BREAK_FLAG);
514             }
515             result.push_back(code);
516             addedBreak = false;
517         }
518     }
519     // match exceptions in full words only
520     result.insert(result.cbegin(), '.');
521     result.push_back('.');
522     cout << "Adding exception: " << wordString << endl;
523     return result;
524 }
525 
ResolvePatternsFromSections(map<string,vector<string>> & sections,vector<vector<uint16_t>> & utf16Patterns)526 static void ResolvePatternsFromSections(map<string, vector<string>>& sections, vector<vector<uint16_t>>& utf16Patterns)
527 {
528     for (const auto& pattern : sections["patterns"]) {
529         utf16Patterns.push_back(ConvertToUtf16(pattern));
530     }
531     for (const auto& word : sections["hyphenation"]) {
532         utf16Patterns.push_back(ProcessWord(word));
533     }
534 }
535 
CollectLeaves(const vector<uint16_t> & pattern,uint16_t & ix)536 static void CollectLeaves(const vector<uint16_t>& pattern, uint16_t& ix)
537 {
538     for (size_t i = pattern.size(); i > 0;) {
539         if (!isdigit(pattern[--i])) {
540             ix = pattern[i];
541             break;
542         }
543     }
544 }
545 
ProcessPattern(const vector<uint16_t> & pattern,vector<uint16_t> & codepoints,vector<uint8_t> & rules)546 static void ProcessPattern(const vector<uint16_t>& pattern, vector<uint16_t>& codepoints, vector<uint8_t>& rules)
547 {
548     bool addedRule = false;
549     for (size_t i = 0; i < pattern.size(); i++) {
550         uint16_t code = pattern[i];
551         if (isdigit(code)) {
552             rules.push_back(code - '0');
553             addedRule = true;
554         } else {
555             if (!addedRule) {
556                 rules.push_back(0);
557             }
558             // These have been collected empirically from the existing pattern files.
559             // Remap typical distinct codepoints
560             // below 'a' to the beginning of contiguous range
561             // This same thing needs to be done in 'tolower'
562             // when parsing the results on runtime
563             if (code == '.') {
564                 code = '`';
565             } else if (code == '-') {
566                 code = '_';
567             } else if (code == '\'') {
568                 code = '^';
569             }
570             codepoints.push_back(code);
571             addedRule = false;
572         }
573     }
574 }
575 
PadRules(vector<uint8_t> & rules)576 static void PadRules(vector<uint8_t>& rules)
577 {
578     while ((rules.size() % PADDING_SIZE) != 0) {
579         if (rules.back() == 0) {
580             rules.pop_back();
581         } else {
582             break;
583         }
584     }
585     while ((rules.size() % PADDING_SIZE) != 0) {
586         rules.push_back(0);
587     }
588 }
589 
ResolveLeavesFromPatterns(const vector<vector<uint16_t>> & utf16Patterns,map<uint16_t,PatternHolder> & leaves)590 void ResolveLeavesFromPatterns(const vector<vector<uint16_t>>& utf16Patterns, map<uint16_t, PatternHolder>& leaves)
591 {
592     for (const auto& pattern : utf16Patterns) {
593         uint16_t ix{0};
594         CollectLeaves(pattern, ix);
595         if (ix == 0) {
596             continue;
597         }
598 
599         if (leaves.find(ix) == leaves.end()) {
600             leaves[ix] = {PatternHolder()};
601         }
602 
603         vector<uint16_t> codepoints;
604         vector<uint8_t> rules;
605         ProcessPattern(pattern, codepoints, rules);
606 
607         leaves[ix].code = ix;
608         if (leaves[ix].patterns.find(codepoints) != leaves[ix].patterns.cend()) {
609             cerr << "### Multiple definitions for pattern with size: " << codepoints.size() << endl;
610             cerr << "###";
611             for (auto codepoint : codepoints) {
612                 cerr << " 0x" << hex << static_cast<int>(codepoint);
613             }
614             cerr << endl;
615         }
616 
617         PadRules(rules);
618         leaves[ix].patterns[codepoints] = rules;
619         // collect a list of unique rules
620         if (auto it = OHOS::Hyphenate::g_allRules.find(rules); it != OHOS::Hyphenate::g_allRules.end()) {
621             it->second.patterns[ix].push_back(codepoints);
622         } else {
623             OHOS::Hyphenate::g_allRules[rules] = Rule();
624             Hyphenate::g_allRules[rules].patterns[ix].push_back(codepoints);
625         }
626     }
627 
628     cout << "leaves: " << leaves.size() << endl;
629     cout << "unique rules: " << OHOS::Hyphenate::g_allRules.size() << endl;
630 }
631 
BreakLeavesIntoPaths(map<uint16_t,PatternHolder> & leaves,CpRange & range,int & countPat)632 static void BreakLeavesIntoPaths(map<uint16_t, PatternHolder>& leaves, CpRange& range, int& countPat)
633 {
634     bool printCounts = true;
635     // break leave information to Path instances
636     for (auto& leave : leaves) {
637         cout << "  '" << char(leave.first) << "' rootsize: " << leave.second.patterns.size() << endl;
638         for (const auto& pat : leave.second.patterns) {
639             if (auto ite = leave.second.paths.find(pat.first[pat.first.size() - 1]); ite != leave.second.paths.end()) {
640                 ite->second.Process(pat.first, pat.first.size() - 1, &pat.second);
641             } else {
642                 leave.second.paths.emplace(pat.first[pat.first.size() - 1], Path(pat.first, &pat.second));
643             }
644 #ifdef VERBOSE_PATTERNS
645             cout << "    '";
646             for (const auto& digit : pat.first) {
647                 cout << "'0x" << hex << static_cast<int>(digit) << "' ";
648             }
649             cout << "' size: " << pat.second.size() << endl;
650             cout << "       ";
651 #endif
652             for (const auto& digit : pat.second) {
653                 (void)digit;
654                 countPat++;
655 #ifdef VERBOSE_PATTERNS
656                 cout << "'" << to_string(digit) << "' ";
657             }
658             cout << endl;
659 #else
660             }
661 #endif
662         }
663 
664         // collect some stats
665         for (auto path : leave.second.paths) {
666             if (printCounts) {
667                 cout << "leafs-nodes: " << path.second.leafCount << " / " << path.second.count << endl;
668                 cout << "min-max: " << path.second.minimumCP << " / " << path.second.maximumCP << endl;
669                 range.minimumCp = path.second.minimumCP;
670                 range.maximumCp = path.second.maximumCP;
671                 break;
672             }
673             path.second.Print(HYPHEN_DEFAULT_INDENT);
674         }
675     }
676 }
677 
678 const size_t FULL_TALBLE = 4;
679 
InitOutFileHead(ofstream & out)680 static uint32_t InitOutFileHead(ofstream& out)
681 {
682     // reserve space for:
683     // - header
684     // - main toc. and
685     // - mapping array for large code points
686     // - version
687     for (size_t i = FULL_TALBLE; i != 0; i--) {
688         uint32_t bytes{0};
689         out.write(reinterpret_cast<const char*>(&bytes), sizeof(bytes));
690     }
691     return FULL_TALBLE * 2; // return 2 multiple talble size, check this number
692 }
693 
FormatOutFileHead(ofstream & out,const WriteOffestsParams & params,const uint32_t toc)694 static int32_t FormatOutFileHead(ofstream& out, const WriteOffestsParams& params, const uint32_t toc)
695 {
696     out.seekp(ios::beg); // roll back to the beginning
697     if (!out.good()) {
698         cerr << "failed to write toc" << endl;
699         return FAILED;
700     }
701 
702     // very minimalistic magic, perhaps more would be in order including
703     // possible version number
704     uint32_t header = ('H' | ('H' << 8) | (params.fCpRange.minimumCp << 16) | (params.fCpRange.maximumCp << 24));
705     // write header
706     out.write(reinterpret_cast<const char*>(&header), sizeof(header));
707     // write toc
708     out.write(reinterpret_cast<const char*>(&toc), sizeof(toc));
709     // write mappings
710     out.write(reinterpret_cast<const char*>(&params.fMappingsPos), sizeof(params.fMappingsPos));
711     // write binary version 8 top bits, using the lower 24 bits for common node offset without
712     // needing to increase header size overall offset on the binary file
713     // we may want to change this at some point
714     const uint32_t version = (0x2 << 0x18) | params.fCommonNodeOffset;
715     out.write(reinterpret_cast<const char*>(&version), sizeof(version));
716 
717     return SUCCEED;
718 }
719 
ProcessUniqueRule(std::pair<const vector<uint8_t>,Rule> & uniqueRule)720 void ProcessUniqueRule(std::pair<const vector<uint8_t>, Rule>& uniqueRule)
721 {
722     for (auto ite : uniqueRule.second.patterns) {
723         for (auto rule : ite.second) {
724             if (!uniqueRule.second.uniqLeafs.count(*rule.cbegin())) {
725                 uniqueRule.second.uniqLeafs[*rule.cbegin()] = {0, 0};
726             }
727         }
728     }
729 }
730 
WriteUniqueRules(ofstream & out)731 void WriteUniqueRules(ofstream& out)
732 {
733     for (auto& uniqueRule : OHOS::Hyphenate::g_allRules) {
734         uint32_t pos = static_cast<uint32_t>(out.tellp());
735         uint16_t size = Path::WritePacked(uniqueRule.first, out, false) / 0x4; // save bits by padding size
736         uniqueRule.second.offset = (size << 0xc) | pos;
737         ProcessUniqueRule(uniqueRule);
738         if ((pos >> 0xc) != 0) {
739             cerr << "PATTERNS: RUNNING OUT OF ADDRESS SPACE, file a bug" << endl;
740             exit(-1);
741         }
742     }
743 }
744 
WriteSharedLeafs(ofstream & out,uint16_t & pos,uint32_t & end)745 void WriteSharedLeafs(ofstream& out, uint16_t& pos, uint32_t& end)
746 {
747     for (auto& uniqueRule : OHOS::Hyphenate::g_allRules) {
748         cout << "###### UniqueRule with " << uniqueRule.second.patterns.size() << " leaves" << endl;
749         for (auto& sharedLeaf : uniqueRule.second.uniqLeafs) {
750             if (sharedLeaf.second.usecount > 0) {
751                 Path path({sharedLeaf.first}, &uniqueRule.first);
752                 sharedLeaf.second.offset = path.Write(out, pos, &end);
753                 cout << "found unique " << hex << static_cast<int>(sharedLeaf.first) <<
754                     " wrote: '" << sharedLeaf.second.offset << "' " << endl;
755             }
756         }
757     }
758 }
759 
CheckSharedLeaves(ofstream & out,map<uint16_t,PatternHolder> & leaves)760 uint16_t CheckSharedLeaves(ofstream& out, map<uint16_t, PatternHolder>& leaves)
761 {
762     // check how many of the unique rules remain valid once all the rules are combined
763     for (auto& leave : leaves) {
764         for (auto& path : leave.second.paths) {
765             path.second.FindSharedLeaves();
766         }
767     }
768     uint32_t end{0};
769     if ((out.tellp() % 1) != 0) {
770         out.write(reinterpret_cast<const char*>(&end), 1);
771     }
772     uint16_t pos = static_cast<uint16_t>(out.tellp()) >> 1;
773     cout << "NOW THIS IS PURE MAGIC NUMBER FOR NOW: " << hex << pos << endl;
774     // pad first offset with 16bit zero to make empty patterns ignore the zero offset
775     out.write(reinterpret_cast<const char*>(&end), 2);
776     WriteSharedLeafs(out, pos, end);
777     return pos;
778 }
779 
WriteLeavePathsToOutFile(map<uint16_t,PatternHolder> & leaves,const CpRange & range,ofstream & out,uint32_t & tableOffset,vector<PathOffset> & offsets)780 static bool WriteLeavePathsToOutFile(map<uint16_t, PatternHolder>& leaves, const CpRange& range, ofstream& out,
781                                      uint32_t& tableOffset, vector<PathOffset>& offsets)
782 {
783     // unique rules have no offset
784     WriteUniqueRules(out);
785     // shared nodes offset needs to be stored to header
786     auto sharedOffset = CheckSharedLeaves(out, leaves);
787 
788     vector<Path*> bigOnes;
789     bool hasDirect{false};
790     for (auto& leave : leaves) {
791         for (auto& path : leave.second.paths) {
792             if (path.first < range.minimumCp || path.first > range.maximumCp) {
793                 bigOnes.push_back(&path.second);
794                 continue;
795             }
796             uint32_t end{0};
797             uint16_t value = path.second.Write(out, tableOffset, &end);
798             uint16_t offset = value & 0x3fff;
799             uint32_t type = value & 0x0000c000;
800             uint16_t code = path.first;
801             cout << "direct:" << hex << static_cast<int>(code) << ": " << tableOffset << " : " << end << " type " <<
802                 type << endl;
803             tableOffset = end;
804             offsets.push_back(PathOffset(offset, end, type, code));
805             hasDirect = true;
806         }
807     }
808 
809     // write distinc code points array after the direct ones
810     for (auto path : bigOnes) {
811         uint32_t end{0};
812         uint16_t value = path->Write(out, tableOffset, &end);
813         uint16_t offset = value & 0x3fff;
814         uint32_t type = value & 0x0000c000;
815         uint16_t code = path->code;
816         cout << "distinct: 0x" << hex << static_cast<int>(code) << ": " << hex << tableOffset << " : " << end <<
817             " type " << type << dec << endl;
818         tableOffset = end;
819         offsets.push_back(PathOffset(offset, end, type, code));
820     }
821 
822     offsets.push_back(PathOffset(sharedOffset, 0, 0, 0));
823     return hasDirect;
824 }
825 
ProcessDirectPointingValues(std::vector<PathOffset>::const_iterator & lastEffectiveIterator,std::ofstream & out,WriteOffestsParams & params,uint32_t & currentEnd,bool hasDirect)826 void ProcessDirectPointingValues(std::vector<PathOffset>::const_iterator& lastEffectiveIterator, std::ofstream& out,
827                                  WriteOffestsParams& params, uint32_t& currentEnd, bool hasDirect)
828 {
829     for (size_t i = params.fCpRange.minimumCp; i <= params.fCpRange.maximumCp; i++) {
830         auto iterator = params.fOffsets.cbegin();
831         while (iterator != params.fOffsets.cend()) {
832             if (iterator->code == i) {
833                 break;
834             }
835             iterator++;
836         }
837         if (iterator == params.fOffsets.cend()) {
838             if (!hasDirect) {
839                 break;
840             }
841             uint32_t dummy{0};
842             Path::WritePacked(dummy, out);
843             Path::WritePacked(currentEnd, out);
844             std::cout << "Direct: padded " << std::endl;
845             continue;
846         }
847         lastEffectiveIterator = iterator;
848         uint32_t type = static_cast<uint32_t>(iterator->type);
849         uint32_t bytes = static_cast<uint32_t>(iterator->offset) | type << 16;
850         currentEnd = iterator->end;
851         std::cout << "Direct: " << std::hex << "o: 0x" << iterator->offset << " e: 0x" << iterator->end << " t: 0x" <<
852             type << " c: 0x" << bytes << std::endl;
853         Path::WritePacked(bytes, out);
854         Path::WritePacked(currentEnd, out);
855     }
856 }
857 
ProcessDistinctCodepoints(std::vector<PathOffset>::const_iterator & lastEffectiveIterator,std::ofstream & out,WriteOffestsParams & params,std::vector<uint16_t> & mappings,uint32_t & currentEnd)858 void ProcessDistinctCodepoints(std::vector<PathOffset>::const_iterator& lastEffectiveIterator, std::ofstream& out,
859                                WriteOffestsParams& params, std::vector<uint16_t>& mappings, uint32_t& currentEnd)
860 {
861     auto pos = params.fCpRange.maximumCp;
862     if (params.fCpRange.maximumCp != 0) {
863         pos++;
864     }
865     if (lastEffectiveIterator != params.fOffsets.cbegin()) {
866         ++lastEffectiveIterator;
867     }
868     while (lastEffectiveIterator != params.fOffsets.cend()) {
869         mappings.push_back(lastEffectiveIterator->code);
870         mappings.push_back(pos++);
871         uint32_t type = static_cast<uint32_t>(lastEffectiveIterator->type);
872         uint32_t bytes = static_cast<uint32_t>(lastEffectiveIterator->offset) | type << 16;
873         currentEnd = lastEffectiveIterator->end;
874         std::cout << "Distinct: " << std::hex << "code: 0x" << static_cast<int>(lastEffectiveIterator->code) <<
875             " o: 0x" << lastEffectiveIterator->offset << " e: 0x" << lastEffectiveIterator->end << " t: " << type <<
876             " c: 0x" << bytes << std::endl;
877         Path::WritePacked(bytes, out);
878         Path::WritePacked(currentEnd, out);
879         ++lastEffectiveIterator;
880     }
881 }
882 
WriteOffestsToOutFile(ofstream & out,WriteOffestsParams & params,uint32_t currentEnd,bool hasDirect)883 static void WriteOffestsToOutFile(ofstream& out, WriteOffestsParams& params, uint32_t currentEnd, bool hasDirect)
884 {
885     if (!params.fOffsets.empty() && params.fOffsets.rbegin()->code == 0) {
886         params.fCommonNodeOffset = params.fOffsets.rbegin()->offset;
887         params.fOffsets.pop_back();
888     }
889 
890     auto lastEffectiveIterator = params.fOffsets.cbegin();
891     vector<uint16_t> mappings;
892 
893     ProcessDirectPointingValues(lastEffectiveIterator, out, params, currentEnd, hasDirect);
894     // If we don't have direct code points, mapped ones will have to be differently
895     // handled
896     if (!hasDirect) {
897         params.fCpRange.minimumCp = 0;
898         params.fCpRange.maximumCp = 0;
899     }
900 
901     if (lastEffectiveIterator != params.fOffsets.cend()) {
902         // distinct codepoints that cannot be addressed by flat array index
903         ProcessDistinctCodepoints(lastEffectiveIterator, out, params, mappings, currentEnd);
904     }
905 
906     params.fMappingsPos = static_cast<uint32_t>(out.tellp());
907 
908     if (!mappings.empty()) {
909         Path::WritePacked(mappings, out);
910     } else {
911         uint32_t dummy{0};
912         Path::WritePacked(dummy, out);
913     }
914 }
915 
GetFileNameWithoutSuffix(const std::string & filePath)916 std::string GetFileNameWithoutSuffix(const std::string& filePath)
917 {
918     size_t lastSlashPos = filePath.find_last_of("/\\");
919     size_t lastDotPos = filePath.find_last_of(".");
920     std::string fileName = filePath.substr(lastSlashPos + 1, lastDotPos - lastSlashPos - 1);
921     return fileName;
922 }
923 
CreateDirectory(const std::string & folderPath)924 void CreateDirectory(const std::string& folderPath)
925 {
926     if (mkdir(folderPath.c_str(), 0755) == 0) { // 0755 means the owner has read, write, and execute permissions,
927         std::cout << "Directory created successfully: " << folderPath << std::endl;
928     } else {
929         std::cout << "Directory already exists: " << folderPath << std::endl;
930     }
931 }
932 
Proccess(const std::string & filePath,const std::string & outFilePath) const933 void HyphenProcessor::Proccess(const std::string& filePath, const std::string& outFilePath) const
934 {
935     map<string, vector<string>> sections;
936     if (ResolveSectionsFromFile(filePath, sections) != SUCCEED) {
937         return;
938     }
939 
940     char resolvedPath[PATH_MAX] = {0};
941     if (outFilePath.size() > PATH_MAX) {
942         cout << "The file name is too long" << endl;
943         return;
944     }
945     if (realpath(outFilePath.c_str(), resolvedPath) == nullptr) {
946         CreateDirectory(resolvedPath);
947     }
948 
949     vector<vector<uint16_t>> utf16Patterns;
950     ResolvePatternsFromSections(sections, utf16Patterns);
951 
952     map<uint16_t, PatternHolder> leaves;
953     ResolveLeavesFromPatterns(utf16Patterns, leaves);
954 
955     CpRange range = {0, 0};
956     int countPat = 0;
957     BreakLeavesIntoPaths(leaves, range, countPat);
958 
959     string filename = GetFileNameWithoutSuffix(filePath);
960     std::cout << "output file: " << (outFilePath + "/" + filename + ".hpb") << std::endl;
961     ofstream out((outFilePath + "/" + filename + ".hpb"), ios::binary);
962     uint32_t tableOffset = InitOutFileHead(out);
963     vector<PathOffset> offsets;
964     uint32_t toc = 0;
965 
966     bool hasDirect = WriteLeavePathsToOutFile(leaves, range, out, tableOffset, offsets);
967     toc = static_cast<uint32_t>(out.tellp());
968     if ((toc % 0x4) != 0) {
969         out.write(reinterpret_cast<const char*>(&toc), toc % 0x4);
970         toc = static_cast<uint32_t>(out.tellp());
971     }
972     // and main table offsets
973     cout << "Produced " << offsets.size() << " paths with z: " << toc << endl;
974 
975     uint32_t currentEnd = FULL_TALBLE * 2; // initial offset (in 16 bits)
976     Path::WritePacked(currentEnd, out);
977 
978     uint32_t mappingsPos = 0;
979     WriteOffestsParams writeOffestsParams(offsets, mappingsPos, range);
980     WriteOffestsToOutFile(out, writeOffestsParams, currentEnd, hasDirect);
981     if (FormatOutFileHead(out, writeOffestsParams, toc) != SUCCEED) {
982         cout << "DONE: With " << to_string(countPat) << "patterns (8bit)" << endl;
983     }
984 }
985 } // namespace OHOS::Hyphenate
986 
main(int argc,char ** argv)987 int main(int argc, char** argv)
988 {
989     if (argc != 3) { // 3: valid argument number
990         cout << "usage: './transform hyph-en-us.tex ./out/'" << endl;
991         return FAILED;
992     }
993 
994     // open output
995     string filePath = argv[1];
996     string outFilePath = argv[2];
997 
998     OHOS::Hyphenate::HyphenProcessor hyphenProcessor;
999     hyphenProcessor.Proccess(filePath, outFilePath);
1000 
1001     return SUCCEED;
1002 }
1003