/*
 * Copyright (c) 2024 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "hyphen_pattern.h"

// NOTE(review): the original include names were lost in extraction; this list is
// rebuilt from the names this file actually uses. Confirm against the repository.
#include <algorithm>
#include <cctype>
#include <climits>
#include <cstdint>
#include <cstdlib>
#include <cwctype>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <sys/stat.h>
#include <unicode/utf.h>
#include <unicode/utf16.h>
#include <unicode/utf8.h>
#include <vector>

using namespace std;

// to enable more information on development time
// define VERBOSE_PATTERNS

namespace OHOS::Hyphenate {
// upper limit for direct pointing arrays
#define MAXIMUM_DIRECT_CODE_POINT 0x7a

// Bookkeeping for one leaf node that several patterns with the same rule may share.
struct Leaf {
    uint16_t offset{0};   // value returned by Path::Write() for the shared node; 0 while not yet written
    uint16_t usecount{0}; // number of graph leaves that matched this entry (see Path::FindSharedLeaves)
};

// One unique rule (digit vector) together with every pattern that uses it.
struct Rule {
    uint16_t offset{0};                               // packed size/position, assigned in WriteUniqueRules()
    map<uint16_t, vector<vector<uint16_t>>> patterns; // key code unit -> code point sequences using this rule
    map<uint16_t, Leaf> uniqLeafs;                    // first code unit of a pattern -> shareable leaf
};

// All unique rules, keyed by the padded rule bytes.
static map<vector<uint8_t>, Rule> g_allRules;

// Converts a UTF-8 string into UTF-16 code units.
//
// Code points that need two UTF-16 units are emitted as a lead/trail surrogate pair.
// U8_NEXT reports an ill-formed byte sequence by setting the code point to a negative
// value; such sequences are replaced by U+FFFD instead of being fed to the surrogate
// macros, which would produce meaningless code units.
vector<uint16_t> ConvertToUtf16(const string& utf8Str)
{
    constexpr uint16_t REPLACEMENT_CHARACTER = 0xFFFD;

    const auto* bytes = reinterpret_cast<const uint8_t*>(utf8Str.data());
    const int32_t textLength = static_cast<int32_t>(utf8Str.size());

    vector<uint16_t> target;
    target.reserve(utf8Str.size()); // UTF-16 never needs more units than UTF-8 needs bytes

    int32_t i = 0;
    while (i < textLength) {
        UChar32 c = 0;
        U8_NEXT(bytes, i, textLength, c); // advances i past the consumed bytes
        if (c < 0) {
            target.push_back(REPLACEMENT_CHARACTER);
        } else if (U16_LENGTH(c) == 1) {
            target.push_back(static_cast<uint16_t>(c));
        } else {
            target.push_back(U16_LEAD(static_cast<uint32_t>(c)));
            target.push_back(U16_TRAIL(static_cast<uint32_t>(c)));
        }
    }
    return target;
}

// Recursive path implementation.
// Collects static information and the leafs that provide access to patterns // The implementation is reversed to pattern code point order; end to beginning of pattern; struct Path { explicit Path(const vector& path, const vector* pat) { count++; size_t targetIndex = path.size(); if (targetIndex > 0) { code = path[--targetIndex]; } if ((code <= MAXIMUM_DIRECT_CODE_POINT)) { maximumCP = max(maximumCP, code); minimumCP = min(minimumCP, code); } // Process children recursively if (targetIndex > 0) { Process(path, targetIndex, pat); } else { // store pattern to leafs pattern = pat; leafCount++; } } void Process(const vector& path, size_t targetIndex, const vector* pat) { if (targetIndex == 0) { pattern = pat; return; } uint16_t key = path[--targetIndex]; if (auto ite = paths.find(key); ite != paths.end()) { ite->second.Process(path, targetIndex, pat); } else { if (key > MAXIMUM_DIRECT_CODE_POINT) { // if we have direct children with distinct code points, we need to use // value pairs haveNoncontiguousChildren = true; } vector substr(path.cbegin(), path.cbegin() + targetIndex + 1); // recurse paths.emplace(key, Path(substr, pat)); } } // The graph is built using pattern end characters // while the rules may have different leaf nodes // Check the dictionary terminated graph for unified rules and leafs void FindSharedLeaves() { if (paths.size() != 0) { for (auto& path : paths) { path.second.FindSharedLeaves(); } } else if (g_allRules.count(*pattern) != 0) { auto ite = g_allRules[*pattern].uniqLeafs.find(code); if (ite != g_allRules[*pattern].uniqLeafs.cend()) { ite->second.usecount += 1; } } } // Once this node is reached, we can access pattern // however traversing further may be needed bool HasPattern() const { return pattern != nullptr; } // This instance of Path and its children implement a straight path without ambquity. // No need to traverse through tables to reach pattern. // Calculate the depth of the graph. 
bool IsLinear() const { if (paths.size() == 0) { return true; } else if (paths.size() == 1) { return paths.begin()->second.IsLinear(); } return false; } // debug print misc info void Print(size_t indent) const { #ifdef VERBOSE_PATTERNS indent += HYPHEN_INDENT_INCREMENT; for (size_t i = 0; i < indent; i++) { cout << " "; } if (indent == ROOT_INDENT) { cout << char(code) << "rootsize***: " << paths.size(); } else { cout << char(code) << "***: " << paths.size(); } if (paths.size() >= LARGE_PATH_SIZE) cout << " LARGE"; else if (IsLinear()) { cout << " LINEAR"; } else { cout << " @@@"; } cout << endl; if (paths.size() == 0) { return; } for (auto path : paths) { path.second.Print(indent); } cout << endl; #endif } static void WritePacked(vector& data, ostream& out, bool writeCount = true) { uint16_t size = data.size(); if (writeCount) { out.write(reinterpret_cast(&size), sizeof(size)); } for (size_t i = 0; i < data.size(); i++) { uint16_t bytes = data[i]; out.write(reinterpret_cast(&bytes), sizeof(bytes)); } } static uint16_t WritePacked(const vector& data, ostream& out, bool writeSize = true) { constexpr size_t ALIGN_4BYTES = 0x03; uint16_t size = data.size(); if (writeSize) { out.write(reinterpret_cast(&size), sizeof(size)); } if ((data.size() & ALIGN_4BYTES) != 0) { cerr << "### uint8_t vectors should be aligned in 4 bytes !!!" 
<< endl; size = size & ~ALIGN_4BYTES; } /* convert uint8 to uint32 */ for (size_t i = 0; i < size; i += BYTES_PRE_WORD) { uint32_t bytes = data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24); out.write(reinterpret_cast(&bytes), sizeof(bytes)); } return size; } // no need to twiddle the bytes or words currently static void WritePacked(uint32_t word, ostream& out) { out.write(reinterpret_cast(&word), sizeof(word)); } // We make assumption that 14 bytes is enough to represent offset // so we get two first bits in the array for path type // we have two bytes on the offset arrays // for these enum class PathType : uint8_t { PATTERN = 0, LINEAR = 1, PAIRS = 2, DIRECT = 3 }; static void WritePackedLine(const Path& pathSrc, ostream& out, PathType& type) { bool wroteSomething{false}; vector output; // we do NOT need to write local pattern if we don't have children if (pathSrc.paths.empty()) { type = PathType::PATTERN; return; } type = PathType::LINEAR; auto ite = pathSrc.paths.cbegin(); const auto* path = &(ite->second); auto localPattern = path->pattern; output.push_back(path->code); while (path) { if (localPattern) { // if we have children, they need to be checked when collecting rules if (output.size() > 0) { WritePacked(output, out); } path->WritePatternOrNull(out); output.clear(); localPattern = nullptr; wroteSomething = true; } else { // traverse further if (!path->paths.empty()) { auto itr = path->paths.cbegin(); path = &(itr->second); localPattern = path->pattern; output.push_back(path->code); } else { break; } } } if (!wroteSomething) { cerr << "Did not write anything linear" << endl; type = PathType::PATTERN; } else { // mark array end so that reader knows when to stop recursing uint16_t size = 0; out.write(reinterpret_cast(&size), sizeof(size)); } } void WritePatternOrNull(ostream& out) const { uint16_t size = 0; if (HasPattern()) { auto ite = g_allRules.find(*pattern); size = ite->second.offset; } out.write(reinterpret_cast(&size), 
sizeof(size)); } void WriteTypedNode(ostream& out, uint32_t offset, uint32_t& pos, PathType& type) const { // check if we are linear or should write a table if (IsLinear()) { WritePatternOrNull(out); WritePackedLine(*this, out, type); } else if ((paths.size() < static_cast(maximumCP - minimumCP) / HYPHEN_BASE_CODE_SHIFT) || haveNoncontiguousChildren) { // Using dense table, i.e. value pairs vector output; for (const auto& path : paths) { output.push_back(path.first); output.push_back(path.second.Write(out, offset)); } pos = static_cast(out.tellp()); // our header is after children data type = PathType::PAIRS; WritePatternOrNull(out); WritePacked(output, out); } else { // Direct pointing, initialize full mapping table vector output; output.resize(maximumCP - minimumCP + 1, 0); if ((output.size() & 0x1) != 0) { output.push_back(0); // pad } for (const auto& path : paths) { // traverse children recursively (dfs) if (path.first >= minimumCP && path.first <= maximumCP) { output[path.first - minimumCP] = path.second.Write(out, offset); } else { cerr << " ### Encountered distinct code point 0x'" << hex << static_cast(path.first) << " when writing direct array" << endl; } } pos = static_cast(out.tellp()); // children first WritePatternOrNull(out); // pattern first WritePacked(output, out, false); // then children table } } uint16_t Write(ostream& out, uint32_t offset = 0, uint32_t* endPos = nullptr) const { if (HasPattern() && paths.size() == 0) { // currently only leafs are shared // if we have a shared leaf for shared pattern, use it if (auto ite = g_allRules[*pattern].uniqLeafs.find(code); ite != g_allRules[*pattern].uniqLeafs.cend()) { if (ite->second.offset != 0) { return ite->second.offset; } } } PathType type = PathType::DIRECT; uint32_t pos = static_cast(out.tellp()); uint32_t oPos = pos; WriteTypedNode(out, offset, pos, type); // return overall offset in 16bit CheckThatDataFits(pos, offset, out, type, oPos); if (endPos) { *endPos = static_cast(out.tellp()) >> 1; } 
return (((pos >> 1) - offset) | (static_cast(type) << SHIFT_BITS_14)); } void CheckThatDataFits(uint32_t& pos, uint32_t offset, ostream& out, PathType& type, uint32_t oPos) const { // return overall offset in 16bit if (((pos >> 1) > offset) && ((pos >> 1) - offset) > 0x3fff) { cerr << " ### Cannot fit offset " << hex << pos << " : " << offset << " into 14 bits, dropping node" << endl; out.seekp(oPos, ios_base::beg); // roll back to the beginning of this entry if (!out.good()) { // failing to roll back, terminate cerr << "Could not roll back outfile, terminating" << endl; exit(-1); } WritePatternOrNull(out); type = PathType::PATTERN; pos = static_cast(out.tellp()); } } static size_t count; static size_t leafCount; static uint16_t minimumCP; static uint16_t maximumCP; uint16_t code{0}; map paths; const vector* pattern{nullptr}; bool haveNoncontiguousChildren{false}; }; size_t Path::count{0}; size_t Path::leafCount{0}; uint16_t Path::minimumCP = 0x7a; uint16_t Path::maximumCP = 0x5f; // Struct to hold all the patterns that end with the code. 
struct PatternHolder { uint16_t code{0}; map, vector> patterns; map paths; }; struct CpRange { uint16_t minimumCp{0}; uint16_t maximumCp{0}; }; struct PathOffset { PathOffset(uint32_t o, uint32_t e, uint16_t t, uint16_t c) : offset(o), end(e), type(t), code(c) {} int32_t offset; int32_t end; uint32_t type; uint16_t code; }; struct WriteOffestsParams { WriteOffestsParams(vector offsets, uint32_t mappingsPos, CpRange cpRange) : fOffsets(offsets), fMappingsPos(mappingsPos), fCpRange(cpRange) { } vector fOffsets; uint32_t fMappingsPos; CpRange fCpRange; uint16_t fCommonNodeOffset; }; void processSection(const string& line, map>& sections, vector*& current) { string pat; for (size_t i = 1; i < line.size() && !iswspace(line[i]) && line[i] != '{'; i++) { pat += line[i]; } cout << "resolved section: " << pat << endl; if (!pat.empty()) { sections[pat] = vector(); current = §ions[pat]; } } static void ProcessContent(const string& line, vector* current) { string pat; for (auto code : line) { if (iswspace(code)) { if (!pat.empty()) { current->push_back(pat); } pat.clear(); continue; } if (code == '%') { break; } // cout << code; pat += code; } if (!pat.empty()) { current->push_back(pat); } } static void ProcessLine(const string& line, vector*& current, vector& uncategorized, map>& sections) { string pat; if (line.empty()) { return; } else if (line[0] == '\\') { processSection(line, sections, current); } else if (line[0] == '}') { current = &uncategorized; } else { ProcessContent(line, current); } } static int32_t ResolveSectionsFromFile(const std::string& fileName, map>& sections) { char resolvedPath[PATH_MAX] = {0}; if (fileName.size() > PATH_MAX) { cout << "The file name is too long" << endl; return FAILED; } if (realpath(fileName.c_str(), resolvedPath) == nullptr) { cout << "file name exception" << endl; return FAILED; } ifstream input(resolvedPath); if (!input.good()) { cerr << "could not open '" << resolvedPath << "' for reading" << endl; return FAILED; } string line; 
vector uncategorized; vector* current = &uncategorized; while (getline(input, line)) { ProcessLine(line, current, uncategorized, sections); } cout << "Uncategorized data size: " << uncategorized.size() << endl; cout << "Amount of sections: " << sections.size() << endl; for (const auto& section : sections) { cout << " '" << section.first << "' size: " << section.second.size() << endl; } return SUCCEED; } static vector ProcessWord(const string& wordString) { auto word = ConvertToUtf16(wordString); vector result; bool addedBreak = false; for (const auto code : word) { if (code == '-') { result.push_back(BREAK_FLAG); addedBreak = true; } else { if (!addedBreak) { result.push_back(NO_BREAK_FLAG); } result.push_back(code); addedBreak = false; } } // match exceptions in full words only result.insert(result.cbegin(), '.'); result.push_back('.'); cout << "Adding exception: " << wordString << endl; return result; } static void ResolvePatternsFromSections(map>& sections, vector>& utf16Patterns) { for (const auto& pattern : sections["patterns"]) { utf16Patterns.push_back(ConvertToUtf16(pattern)); } for (const auto& word : sections["hyphenation"]) { utf16Patterns.push_back(ProcessWord(word)); } } static void CollectLeaves(const vector& pattern, uint16_t& ix) { for (size_t i = pattern.size(); i > 0;) { if (!isdigit(pattern[--i])) { ix = pattern[i]; break; } } } static void ProcessPattern(const vector& pattern, vector& codepoints, vector& rules) { bool addedRule = false; for (size_t i = 0; i < pattern.size(); i++) { uint16_t code = pattern[i]; if (isdigit(code)) { rules.push_back(code - '0'); addedRule = true; } else { if (!addedRule) { rules.push_back(0); } // These have been collected empirically from the existing pattern files. 
// Remap typical distinct codepoints // below 'a' to the beginning of contiguous range // This same thing needs to be done in 'tolower' // when parsing the results on runtime if (code == '.') { code = '`'; } else if (code == '-') { code = '_'; } else if (code == '\'') { code = '^'; } codepoints.push_back(code); addedRule = false; } } } static void PadRules(vector& rules) { while ((rules.size() % PADDING_SIZE) != 0) { if (rules.back() == 0) { rules.pop_back(); } else { break; } } while ((rules.size() % PADDING_SIZE) != 0) { rules.push_back(0); } } void ResolveLeavesFromPatterns(const vector>& utf16Patterns, map& leaves) { for (const auto& pattern : utf16Patterns) { uint16_t ix{0}; CollectLeaves(pattern, ix); if (ix == 0) { continue; } if (leaves.find(ix) == leaves.end()) { leaves[ix] = {PatternHolder()}; } vector codepoints; vector rules; ProcessPattern(pattern, codepoints, rules); leaves[ix].code = ix; if (leaves[ix].patterns.find(codepoints) != leaves[ix].patterns.cend()) { cerr << "### Multiple definitions for pattern with size: " << codepoints.size() << endl; cerr << "###"; for (auto codepoint : codepoints) { cerr << " 0x" << hex << static_cast(codepoint); } cerr << endl; } PadRules(rules); leaves[ix].patterns[codepoints] = rules; // collect a list of unique rules if (auto it = OHOS::Hyphenate::g_allRules.find(rules); it != OHOS::Hyphenate::g_allRules.end()) { it->second.patterns[ix].push_back(codepoints); } else { OHOS::Hyphenate::g_allRules[rules] = Rule(); Hyphenate::g_allRules[rules].patterns[ix].push_back(codepoints); } } cout << "leaves: " << leaves.size() << endl; cout << "unique rules: " << OHOS::Hyphenate::g_allRules.size() << endl; } static void BreakLeavesIntoPaths(map& leaves, CpRange& range, int& countPat) { bool printCounts = true; // break leave information to Path instances for (auto& leave : leaves) { cout << " '" << char(leave.first) << "' rootsize: " << leave.second.patterns.size() << endl; for (const auto& pat : leave.second.patterns) { if 
(auto ite = leave.second.paths.find(pat.first[pat.first.size() - 1]); ite != leave.second.paths.end()) { ite->second.Process(pat.first, pat.first.size() - 1, &pat.second); } else { leave.second.paths.emplace(pat.first[pat.first.size() - 1], Path(pat.first, &pat.second)); } #ifdef VERBOSE_PATTERNS cout << " '"; for (const auto& digit : pat.first) { cout << "'0x" << hex << static_cast(digit) << "' "; } cout << "' size: " << pat.second.size() << endl; cout << " "; #endif for (const auto& digit : pat.second) { (void)digit; countPat++; #ifdef VERBOSE_PATTERNS cout << "'" << to_string(digit) << "' "; } cout << endl; #else } #endif } // collect some stats for (auto path : leave.second.paths) { if (printCounts) { cout << "leafs-nodes: " << path.second.leafCount << " / " << path.second.count << endl; cout << "min-max: " << path.second.minimumCP << " / " << path.second.maximumCP << endl; range.minimumCp = path.second.minimumCP; range.maximumCp = path.second.maximumCP; break; } path.second.Print(HYPHEN_DEFAULT_INDENT); } } } const size_t FULL_TALBLE = 4; static uint32_t InitOutFileHead(ofstream& out) { // reserve space for: // - header // - main toc. 
and // - mapping array for large code points // - version for (size_t i = FULL_TALBLE; i != 0; i--) { uint32_t bytes{0}; out.write(reinterpret_cast(&bytes), sizeof(bytes)); } return FULL_TALBLE * 2; // return 2 multiple talble size, check this number } static int32_t FormatOutFileHead(ofstream& out, const WriteOffestsParams& params, const uint32_t toc) { out.seekp(ios::beg); // roll back to the beginning if (!out.good()) { cerr << "failed to write toc" << endl; return FAILED; } // very minimalistic magic, perhaps more would be in order including // possible version number uint32_t header = ('H' | ('H' << 8) | (params.fCpRange.minimumCp << 16) | (params.fCpRange.maximumCp << 24)); // write header out.write(reinterpret_cast(&header), sizeof(header)); // write toc out.write(reinterpret_cast(&toc), sizeof(toc)); // write mappings out.write(reinterpret_cast(¶ms.fMappingsPos), sizeof(params.fMappingsPos)); // write binary version 8 top bits, using the lower 24 bits for common node offset without // needing to increase header size overall offset on the binary file // we may want to change this at some point const uint32_t version = (0x2 << 0x18) | params.fCommonNodeOffset; out.write(reinterpret_cast(&version), sizeof(version)); return SUCCEED; } void ProcessUniqueRule(std::pair, Rule>& uniqueRule) { for (auto ite : uniqueRule.second.patterns) { for (auto rule : ite.second) { if (!uniqueRule.second.uniqLeafs.count(*rule.cbegin())) { uniqueRule.second.uniqLeafs[*rule.cbegin()] = {0, 0}; } } } } void WriteUniqueRules(ofstream& out) { for (auto& uniqueRule : OHOS::Hyphenate::g_allRules) { uint32_t pos = static_cast(out.tellp()); uint16_t size = Path::WritePacked(uniqueRule.first, out, false) / 0x4; // save bits by padding size uniqueRule.second.offset = (size << 0xc) | pos; ProcessUniqueRule(uniqueRule); if ((pos >> 0xc) != 0) { cerr << "PATTERNS: RUNNING OUT OF ADDRESS SPACE, file a bug" << endl; exit(-1); } } } void WriteSharedLeafs(ofstream& out, uint16_t& pos, uint32_t& 
end) { for (auto& uniqueRule : OHOS::Hyphenate::g_allRules) { cout << "###### UniqueRule with " << uniqueRule.second.patterns.size() << " leaves" << endl; for (auto& sharedLeaf : uniqueRule.second.uniqLeafs) { if (sharedLeaf.second.usecount > 0) { Path path({sharedLeaf.first}, &uniqueRule.first); sharedLeaf.second.offset = path.Write(out, pos, &end); cout << "found unique " << hex << static_cast(sharedLeaf.first) << " wrote: '" << sharedLeaf.second.offset << "' " << endl; } } } } uint16_t CheckSharedLeaves(ofstream& out, map& leaves) { // check how many of the unique rules remain valid once all the rules are combined for (auto& leave : leaves) { for (auto& path : leave.second.paths) { path.second.FindSharedLeaves(); } } uint32_t end{0}; if ((out.tellp() % 1) != 0) { out.write(reinterpret_cast(&end), 1); } uint16_t pos = static_cast(out.tellp()) >> 1; cout << "NOW THIS IS PURE MAGIC NUMBER FOR NOW: " << hex << pos << endl; // pad first offset with 16bit zero to make empty patterns ignore the zero offset out.write(reinterpret_cast(&end), 2); WriteSharedLeafs(out, pos, end); return pos; } static bool WriteLeavePathsToOutFile(map& leaves, const CpRange& range, ofstream& out, uint32_t& tableOffset, vector& offsets) { // unique rules have no offset WriteUniqueRules(out); // shared nodes offset needs to be stored to header auto sharedOffset = CheckSharedLeaves(out, leaves); vector bigOnes; bool hasDirect{false}; for (auto& leave : leaves) { for (auto& path : leave.second.paths) { if (path.first < range.minimumCp || path.first > range.maximumCp) { bigOnes.push_back(&path.second); continue; } uint32_t end{0}; uint16_t value = path.second.Write(out, tableOffset, &end); uint16_t offset = value & 0x3fff; uint32_t type = value & 0x0000c000; uint16_t code = path.first; cout << "direct:" << hex << static_cast(code) << ": " << tableOffset << " : " << end << " type " << type << endl; tableOffset = end; offsets.push_back(PathOffset(offset, end, type, code)); hasDirect = true; } } // 
write distinc code points array after the direct ones for (auto path : bigOnes) { uint32_t end{0}; uint16_t value = path->Write(out, tableOffset, &end); uint16_t offset = value & 0x3fff; uint32_t type = value & 0x0000c000; uint16_t code = path->code; cout << "distinct: 0x" << hex << static_cast(code) << ": " << hex << tableOffset << " : " << end << " type " << type << dec << endl; tableOffset = end; offsets.push_back(PathOffset(offset, end, type, code)); } offsets.push_back(PathOffset(sharedOffset, 0, 0, 0)); return hasDirect; } void ProcessDirectPointingValues(std::vector::const_iterator& lastEffectiveIterator, std::ofstream& out, WriteOffestsParams& params, uint32_t& currentEnd, bool hasDirect) { for (size_t i = params.fCpRange.minimumCp; i <= params.fCpRange.maximumCp; i++) { auto iterator = params.fOffsets.cbegin(); while (iterator != params.fOffsets.cend()) { if (iterator->code == i) { break; } iterator++; } if (iterator == params.fOffsets.cend()) { if (!hasDirect) { break; } uint32_t dummy{0}; Path::WritePacked(dummy, out); Path::WritePacked(currentEnd, out); std::cout << "Direct: padded " << std::endl; continue; } lastEffectiveIterator = iterator; uint32_t type = static_cast(iterator->type); uint32_t bytes = static_cast(iterator->offset) | type << 16; currentEnd = iterator->end; std::cout << "Direct: " << std::hex << "o: 0x" << iterator->offset << " e: 0x" << iterator->end << " t: 0x" << type << " c: 0x" << bytes << std::endl; Path::WritePacked(bytes, out); Path::WritePacked(currentEnd, out); } } void ProcessDistinctCodepoints(std::vector::const_iterator& lastEffectiveIterator, std::ofstream& out, WriteOffestsParams& params, std::vector& mappings, uint32_t& currentEnd) { auto pos = params.fCpRange.maximumCp; if (params.fCpRange.maximumCp != 0) { pos++; } if (lastEffectiveIterator != params.fOffsets.cbegin()) { ++lastEffectiveIterator; } while (lastEffectiveIterator != params.fOffsets.cend()) { mappings.push_back(lastEffectiveIterator->code); 
mappings.push_back(pos++); uint32_t type = static_cast(lastEffectiveIterator->type); uint32_t bytes = static_cast(lastEffectiveIterator->offset) | type << 16; currentEnd = lastEffectiveIterator->end; std::cout << "Distinct: " << std::hex << "code: 0x" << static_cast(lastEffectiveIterator->code) << " o: 0x" << lastEffectiveIterator->offset << " e: 0x" << lastEffectiveIterator->end << " t: " << type << " c: 0x" << bytes << std::endl; Path::WritePacked(bytes, out); Path::WritePacked(currentEnd, out); ++lastEffectiveIterator; } } static void WriteOffestsToOutFile(ofstream& out, WriteOffestsParams& params, uint32_t currentEnd, bool hasDirect) { if (!params.fOffsets.empty() && params.fOffsets.rbegin()->code == 0) { params.fCommonNodeOffset = params.fOffsets.rbegin()->offset; params.fOffsets.pop_back(); } auto lastEffectiveIterator = params.fOffsets.cbegin(); vector mappings; ProcessDirectPointingValues(lastEffectiveIterator, out, params, currentEnd, hasDirect); // If we don't have direct code points, mapped ones will have to be differently // handled if (!hasDirect) { params.fCpRange.minimumCp = 0; params.fCpRange.maximumCp = 0; } if (lastEffectiveIterator != params.fOffsets.cend()) { // distinct codepoints that cannot be addressed by flat array index ProcessDistinctCodepoints(lastEffectiveIterator, out, params, mappings, currentEnd); } params.fMappingsPos = static_cast(out.tellp()); if (!mappings.empty()) { Path::WritePacked(mappings, out); } else { uint32_t dummy{0}; Path::WritePacked(dummy, out); } } std::string GetFileNameWithoutSuffix(const std::string& filePath) { size_t lastSlashPos = filePath.find_last_of("/\\"); size_t lastDotPos = filePath.find_last_of("."); std::string fileName = filePath.substr(lastSlashPos + 1, lastDotPos - lastSlashPos - 1); return fileName; } void CreateDirectory(const std::string& folderPath) { if (mkdir(folderPath.c_str(), 0755) == 0) { // 0755 means the owner has read, write, and execute permissions, std::cout << "Directory created 
successfully: " << folderPath << std::endl; } else { std::cout << "Directory already exists: " << folderPath << std::endl; } } void HyphenProcessor::Proccess(const std::string& filePath, const std::string& outFilePath) const { map> sections; if (ResolveSectionsFromFile(filePath, sections) != SUCCEED) { return; } char resolvedPath[PATH_MAX] = {0}; if (outFilePath.size() > PATH_MAX) { cout << "The file name is too long" << endl; return; } if (realpath(outFilePath.c_str(), resolvedPath) == nullptr) { CreateDirectory(resolvedPath); } vector> utf16Patterns; ResolvePatternsFromSections(sections, utf16Patterns); map leaves; ResolveLeavesFromPatterns(utf16Patterns, leaves); CpRange range = {0, 0}; int countPat = 0; BreakLeavesIntoPaths(leaves, range, countPat); string filename = GetFileNameWithoutSuffix(filePath); std::cout << "output file: " << (outFilePath + "/" + filename + ".hpb") << std::endl; ofstream out((outFilePath + "/" + filename + ".hpb"), ios::binary); uint32_t tableOffset = InitOutFileHead(out); vector offsets; uint32_t toc = 0; bool hasDirect = WriteLeavePathsToOutFile(leaves, range, out, tableOffset, offsets); toc = static_cast(out.tellp()); if ((toc % 0x4) != 0) { out.write(reinterpret_cast(&toc), toc % 0x4); toc = static_cast(out.tellp()); } // and main table offsets cout << "Produced " << offsets.size() << " paths with z: " << toc << endl; uint32_t currentEnd = FULL_TALBLE * 2; // initial offset (in 16 bits) Path::WritePacked(currentEnd, out); uint32_t mappingsPos = 0; WriteOffestsParams writeOffestsParams(offsets, mappingsPos, range); WriteOffestsToOutFile(out, writeOffestsParams, currentEnd, hasDirect); if (FormatOutFileHead(out, writeOffestsParams, toc) != SUCCEED) { cout << "DONE: With " << to_string(countPat) << "patterns (8bit)" << endl; } } } // namespace OHOS::Hyphenate int main(int argc, char** argv) { if (argc != 3) { // 3: valid argument number cout << "usage: './transform hyph-en-us.tex ./out/'" << endl; return FAILED; } // open output string 
filePath = argv[1]; string outFilePath = argv[2]; OHOS::Hyphenate::HyphenProcessor hyphenProcessor; hyphenProcessor.Proccess(filePath, outFilePath); return SUCCEED; }