• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 #include "hyphen_pattern.h"
16 
17 #include <codecvt>
18 #include <cstdio>
19 #include <cerrno>
20 #include <fcntl.h>
21 #include <fstream>
22 #include <iostream>
23 #include <map>
24 #include <cstdint>
25 #include <cstdio>
26 #include <cstdlib>
27 #include <sys/mman.h>
28 #include <sys/stat.h>
29 #include <sys/types.h>
30 #include <unicode/utf.h>
31 #include <unicode/utf8.h>
32 #include <unistd.h>
33 
34 using namespace std;
35 
36 namespace OHOS::Hyphenate {
37 
ConvertToUtf16(const string & utf8Str)38 vector<uint16_t> ConvertToUtf16(const string& utf8Str)
39 {
40     int32_t i = 0;
41     UChar32 c = 0;
42     vector<uint16_t> target;
43     const int32_t textLength = utf8Str.size();
44     while (i < textLength) {
45         U8_NEXT(reinterpret_cast<const uint8_t*>(utf8Str.c_str()), i, textLength, c);
46         if (U16_LENGTH(c) == 1) {
47             target.push_back(c);
48         } else {
49             target.push_back(U16_LEAD(c));
50             target.push_back(U16_TRAIL(c));
51         }
52     }
53     return target;
54 }
55 
// Overlay used to read pattern bytes straight out of the mapped file.
// ProcessPattern indexes `patterns` up to the count stored with the node,
// so the declared bound of 8 is only nominal.
struct Pattern {
    uint8_t patterns[8]; // dynamic
};
59 
// Overlay for a counted run of 16-bit values in the mapped file: `count`
// followed by the values themselves. Readers index `codes` up to `count`,
// so the declared bound of 3 is only nominal.
struct ArrayOf16bits {
    uint16_t count;
    uint16_t codes[3]; // dynamic
};
64 
65 struct Header {
66     uint8_t magic1;
67     uint8_t magic2;
68     uint8_t minCp;
69     uint8_t maxCp;
70     uint32_t toc;
71     uint32_t mappings;
72     uint32_t version;
73 
CodeOffsetOHOS::Hyphenate::Header74     inline uint16_t CodeOffset(uint16_t code, const ArrayOf16bits* maps = nullptr) const
75     {
76         if (maps && (code < minCp || code > maxCp)) {
77             for (size_t i = maps->count; i != 0;) {
78                 i -= HYPHEN_BASE_CODE_SHIFT;
79                 if (maps->codes[i] == code) {
80                     // cout << "resolved mapping ix: " << static_cast<int>(m->codes[i + 1]) << endl;
81                     auto offset = maps->codes[i + 1];
82                     return (maxCp - minCp) * HYPHEN_BASE_CODE_SHIFT + (offset - maxCp) * HYPHEN_BASE_CODE_SHIFT + 1;
83                 }
84             }
85             return MaxCount(maps);
86         }
87         if (maps) {
88             // + 1 because previous end is before next start
89             // 2x because every second value to beginning addres
90             return (code - minCp) * HYPHEN_BASE_CODE_SHIFT + 1;
91         } else {
92             if (code < minCp || code > maxCp) {
93                 return maxCp + 1;
94             }
95             return (code - minCp);
96         }
97     }
98 
ToLowerOHOS::Hyphenate::Header99     inline static void ToLower(uint16_t& code)
100     {
101         if (code == '.') {
102             code = '`';
103         } else if (code == '\'') {
104             code = '^';
105         } else if (code == '-') {
106             code = '_';
107         } else {
108             code = tolower(code);
109         }
110         cout << "tolower: " << hex << static_cast<int>(code) << endl;
111     }
112 
MaxCountOHOS::Hyphenate::Header113     inline uint16_t MaxCount(const ArrayOf16bits* maps) const
114     {
115         // need to write this in binary provider !!
116         return (maxCp - minCp) * HYPHEN_BASE_CODE_SHIFT + maps->count;
117     }
118 };
119 
120 struct CodeInfo {
121     int32_t OpenPatFile(const char* filePath);
122     int32_t GetHeader();
123     int32_t GetCodeInfo(uint16_t code);
124     void ProcessPattern(const size_t& offset, vector<uint8_t>& result, bool direct);
125     bool ProcessDirect(const std::vector<uint16_t>& target, const size_t& offset);
126     void ProcessLinear(const std::vector<uint16_t>& target, const size_t& offset, vector<uint8_t>& result);
127     bool ProcessNextCode(const std::vector<uint16_t>& target, const size_t& offset);
128     void ClearResource();
129     Header* fHeader{nullptr};
130     uint8_t* fAddress{nullptr};
131     FILE* fFile{nullptr};
132     size_t fFileSize{0};
133     uint16_t fMaxCount{0};
134     PathType fType{PathType::PATTERN};
135     uint16_t fOffset{0};
136     uint16_t fCode{0};
137     uint32_t fIndex{0};
138     uint32_t fNextOffset;
139     uint16_t* fStaticOffset{nullptr};
140     ArrayOf16bits* fMappings{nullptr};
141 };
142 
OpenPatFile(const char * filePath)143 int32_t CodeInfo::OpenPatFile(const char* filePath)
144 {
145     cout << "Attempt to mmap " << filePath << endl;
146 
147     FILE* file = fopen(filePath, "r");
148     if (file == nullptr) {
149         cerr << "FATAL: " << errno << endl;
150         return FAILED;
151     }
152 
153     struct stat st;
154     if (fstat(fileno(file), &st) != 0) {
155         cerr << "FATAL: fstat" << endl;
156         fclose(file);
157         return FAILED;
158     }
159     size_t length = st.st_size;
160     uint8_t* address = static_cast<uint8_t*>(mmap(nullptr, length, PROT_READ, MAP_PRIVATE, fileno(file), 0u));
161     if (address == MAP_FAILED) {
162         cerr << "FATAL: mmap" << endl;
163         fclose(file);
164         return FAILED;
165     }
166 
167     cout << "Magic: " << hex << *reinterpret_cast<uint32_t*>(address) << dec << endl;
168     this->fFile = file;
169     this->fFileSize = length;
170     this->fAddress = address;
171     return SUCCEED;
172 }
173 
GetInputWord(const char * input)174 static std::vector<uint16_t> GetInputWord(const char* input)
175 {
176     const std::string utf8Str = "." + std::string(input) + ".";
177     std::vector<uint16_t> target = ConvertToUtf16(utf8Str);
178     for (auto& code : target) {
179         Header::ToLower(code);
180     }
181     return target;
182 }
183 
GetHeader()184 int32_t CodeInfo::GetHeader()
185 {
186     fHeader = reinterpret_cast<Header*>(fAddress);
187     uint16_t minCp = fHeader->minCp;
188     uint16_t maxCp = fHeader->maxCp;
189     // get master table, it always is in direct mode
190     fMappings = reinterpret_cast<ArrayOf16bits*>(reinterpret_cast<uint32_t*>(fAddress + fHeader->mappings));
191     // this is actually beyond the real 32 bit address, but just to have an offset that
192     // is clearly out of bounds without recalculating it again
193     fMaxCount = fHeader->MaxCount(fMappings);
194     cout << "min/max: " << minCp << "/" << maxCp << " count " << static_cast<int>(fMaxCount) << endl;
195     cout << "size of top level mappings: " << static_cast<int>(fMappings->count) << endl;
196     if (minCp == maxCp && fMappings->count == 0) {
197         cerr << "### unexpected min/max in input file-> exit" << endl;
198         return FAILED;
199     }
200     return SUCCEED;
201 }
202 
ClearResource()203 void CodeInfo::ClearResource()
204 {
205     (void)munmap(fAddress, fFileSize);
206     fAddress = nullptr;
207     (void)fclose(fFile);
208     fFile = nullptr;
209     fFileSize = 0;
210 }
211 
GetCodeInfo(uint16_t code)212 int32_t CodeInfo::GetCodeInfo(uint16_t code)
213 {
214     fType = PathType::PATTERN;
215     this->fCode = code;
216     this->fIndex = 0;
217     fOffset = fHeader->CodeOffset(code, fMappings);
218     if (fOffset == fMaxCount) {
219         cout << hex << char(code) << " unable to map, contiue straight" << endl;
220         return FAILED;
221     }
222 
223     // previous entry end
224     uint32_t baseOffset =
225         *reinterpret_cast<uint32_t*>(reinterpret_cast<uint32_t*>(fAddress + fHeader->toc) + fOffset - 1);
226     uint32_t initialValue = *(reinterpret_cast<uint32_t*>(fAddress + fHeader->toc) + fOffset);
227     fType = static_cast<PathType>(initialValue >> SHIFT_BITS_30);
228     // direct and pairs need to have offset different from zero
229     if (initialValue == 0 && (fType == PathType::DIRECT || fType == PathType::PAIRS)) {
230         cout << char(code) << " is not in main dict, contiue straight" << endl;
231         return FAILED;
232     }
233     // base offset is 16 bit
234     fStaticOffset = reinterpret_cast<uint16_t*>(fAddress + HYPHEN_BASE_CODE_SHIFT * baseOffset);
235 
236     // get a subtable according character
237     // once: read as 32bit, the rest of the access will be 16bit (13bit for offsets)
238     fNextOffset = (initialValue & 0x3fffffff);
239 
240     cout << hex << baseOffset << " top level code: 0x" << hex << static_cast<int>(code) <<
241         " starting with offset: 0x" << hex << fOffset << " table-offset 0x" << fNextOffset << endl;
242     return SUCCEED;
243 }
244 
ProcessPattern(const size_t & offset,vector<uint8_t> & result,bool direct)245 void CodeInfo::ProcessPattern(const size_t& offset, vector<uint8_t>& result, bool direct)
246 {
247     cout << "direct : " << direct << " " << hex << fNextOffset << endl;
248     uint16_t poffset = 0;
249     if (direct && (fHeader->version >> 0x18) >= 0x2) {
250         poffset = *(reinterpret_cast<uint16_t*>(fAddress) + fNextOffset + (fHeader->version & 0xffff));
251     } else {
252         poffset = *(fStaticOffset + fNextOffset);
253     }
254     fNextOffset++; // there now is always at least pattern count before next node
255     if (!poffset) {
256         return;
257     }
258     uint16_t count = (poffset >> 0xc) * 0x4; // patterns are padded to 4 byte arrays.
259                                              // to save bits, the count is multiplied by four
260     poffset = 0xfff & poffset;
261 
262     //   if we have reached pattern, apply it to result
263     auto p = reinterpret_cast<const Pattern*>(fAddress + poffset);
264     if (count != 0) {
265         cout << "Node with a pattern, count " << count << hex << " offset: " << poffset << endl;
266         size_t i = 0;
267         for (size_t j = offset - fIndex; j < result.size() && i < count; j++) {
268             cout << "    " << static_cast<int>(j) << ": pattern index: " << i << " value: 0x" << hex
269                  << static_cast<int>(p->patterns[i]) << endl;
270             result[j] = std::max(result[j], (p->patterns[i]));
271             i++;
272         }
273     }
274 }
275 
ProcessDirect(const std::vector<uint16_t> & target,const size_t & offset)276 bool CodeInfo::ProcessDirect(const std::vector<uint16_t>& target, const size_t& offset)
277 {
278     // resolve new code point
279     if (fIndex == offset) { // should never be the case
280         cout << "# break loop on direct" << endl;
281         return true;
282     }
283 
284     fIndex++;
285     fCode = target[offset - fIndex];
286     fOffset = fHeader->CodeOffset(fCode);
287     if (fHeader->minCp != fHeader->maxCp && fOffset > fHeader->maxCp) {
288         cout << "# break loop on direct" << endl;
289         return true;
290     }
291 
292     auto nextValue = *(fStaticOffset + fNextOffset + fOffset);
293     fNextOffset = nextValue & 0x3fff;
294     fType = static_cast<PathType>(nextValue >> SHIFT_BITS_14);
295     cout << "  found direct: " << char(fCode) << " : " << hex << nextValue << " with offset: " << fNextOffset << endl;
296     return false;
297 }
298 
ProcessLinear(const std::vector<uint16_t> & target,const size_t & offset,vector<uint8_t> & result)299 void CodeInfo::ProcessLinear(const std::vector<uint16_t>& target, const size_t& offset, vector<uint8_t>& result)
300 {
301     auto p = reinterpret_cast<const ArrayOf16bits*>(fStaticOffset + fNextOffset);
302     auto count = p->count;
303 
304     fIndex++;
305     cout << "# linear " << offset << " " << fIndex << endl;
306     if (fIndex > offset || count > (offset - fIndex + 1)) {
307         // the pattern is longer than the remaining word
308         cout << "# break loop on linear " << offset << " " << fIndex << endl;
309         return;
310     }
311     // check the rest of the string
312     for (auto j = 0; j < count; j++) {
313         cout << "    linear " << offset << " index: " << j << " value: " << hex << static_cast<int>(p->codes[j]) <<
314             " vs " << static_cast<int>(target[offset - fIndex]) << endl;
315         if (p->codes[j] != target[offset - fIndex]) {
316             return;
317         } else {
318             fIndex++;
319         }
320     }
321     // if we reach the end, apply pattern
322     fNextOffset += count + 1; // array items + one for the count
323     fIndex--;                 // because of recursion
324     ProcessPattern(offset, result, false);
325     if (*(fStaticOffset + fNextOffset) != 0 && offset > count) { // peek if there is more to come
326         // make it tail recursive to save stack
327         return ProcessLinear(target, offset, result);
328     }
329 }
330 
ProcessNextCode(const std::vector<uint16_t> & target,const size_t & offset)331 bool CodeInfo::ProcessNextCode(const std::vector<uint16_t>& target, const size_t& offset)
332 {
333     // resolve new code point
334     if (fIndex == offset) { // should detect this sooner
335         cout << "# break loop on pairs" << endl;
336         return true;
337     }
338     auto p = reinterpret_cast<const ArrayOf16bits*>(fStaticOffset + fNextOffset);
339     uint16_t count = p->count;
340     fIndex++;
341     cout << "  continue to value pairs with size: " << count << " and code '" <<
342         static_cast<int>(target[offset - fIndex]) << "'" << endl;
343 
344     //     check pairs, array is sorted (but small)
345     bool match = false;
346     for (size_t j = 0; j < count; j += HYPHEN_BASE_CODE_SHIFT) {
347         cout << "    checking pair: " << j << " value: " << hex << static_cast<int>(p->codes[j]) << " vs " <<
348             static_cast<int>(target[offset - fIndex]) << endl;
349         if (p->codes[j] == target[offset - fIndex]) {
350             fCode = target[offset - fIndex];
351             cout << "      new value pair in : 0x" << j << " with code 0x" << hex << static_cast<int>(fCode) << "'" <<
352                 endl;
353             fOffset = fHeader->CodeOffset(fCode);
354             if (fHeader->minCp != fHeader->maxCp && fOffset > fHeader->maxCp) {
355                 cout << "# could not resolve debug offset in pairs" << endl;
356             }
357 
358             fNextOffset = p->codes[j + 1] & 0x3fff;
359             fType = static_cast<PathType>(p->codes[j + 1] >> SHIFT_BITS_14);
360             match = true;
361             break;
362         } else if (p->codes[j] > target[offset - fIndex]) {
363             break;
364         }
365     }
366     if (!match) {
367         cout << "# break loop on pairs" << endl;
368         return true;
369     }
370     return false;
371 }
372 
/**
 * Prints both sizes and then, one per line, each code unit of the word (hex)
 * with the value computed for its position (decimal).
 *
 * The per-position lines are skipped when `result` is more than one element
 * longer than `target`. Only positions present in both vectors are printed,
 * so `target` is never read past its end.
 *
 * @param result per-position values
 * @param target normalized word the values belong to
 */
void PrintResult(const std::vector<uint8_t>& result, const std::vector<uint16_t>& target)
{
    std::cout << std::dec << "result size: " << result.size() << " while expecting " << target.size() << std::endl;
    if (result.size() > target.size() + 1) {
        return;
    }
    for (size_t i = 0; i < result.size() && i < target.size(); ++i) {
        std::cout << std::hex << static_cast<int>(target[i]) << ": " << std::to_string(result[i]) << std::endl;
    }
}
383 
InitializeCodeInfo(OHOS::Hyphenate::CodeInfo & codeInfo,const char * filePath)384 bool InitializeCodeInfo(OHOS::Hyphenate::CodeInfo& codeInfo, const char* filePath)
385 {
386     if (codeInfo.OpenPatFile(filePath) != SUCCEED) {
387         return false;
388     }
389     if (codeInfo.GetHeader() != SUCCEED) {
390         codeInfo.ClearResource();
391         return false;
392     }
393     return true;
394 }
395 
ProcessCodeLoop(OHOS::Hyphenate::CodeInfo & codeInfo,const std::vector<uint16_t> & target,size_t i,std::vector<uint8_t> & result)396 void ProcessCodeLoop(OHOS::Hyphenate::CodeInfo& codeInfo, const std::vector<uint16_t>& target, size_t i,
397                      std::vector<uint8_t>& result)
398 {
399     bool continueLoop = true;
400     while (continueLoop) {
401         std::cout << "#loop c: '" << codeInfo.fCode << "' starting with offset: 0x" << std::hex << codeInfo.fOffset <<
402             " table-offset 0x" << codeInfo.fNextOffset << " index: " << codeInfo.fIndex << std::endl;
403 
404         codeInfo.ProcessPattern(i, result, codeInfo.fType == OHOS::Hyphenate::PathType::PATTERN);
405         if (codeInfo.fType == OHOS::Hyphenate::PathType::PATTERN) {
406             continueLoop = false;
407         } else if (codeInfo.fType == OHOS::Hyphenate::PathType::DIRECT) {
408             if (codeInfo.ProcessDirect(target, i)) {
409                 continueLoop = false;
410             }
411         } else if (codeInfo.fType == OHOS::Hyphenate::PathType::LINEAR) {
412             codeInfo.ProcessLinear(target, i, result);
413             continueLoop = false;
414         } else {
415             if (codeInfo.ProcessNextCode(target, i)) {
416                 continueLoop = false;
417             }
418         }
419     }
420 }
421 
ProcessCodeInfo(OHOS::Hyphenate::CodeInfo & codeInfo,const std::vector<uint16_t> & target,std::vector<uint8_t> & result)422 void ProcessCodeInfo(OHOS::Hyphenate::CodeInfo& codeInfo, const std::vector<uint16_t>& target,
423                      std::vector<uint8_t>& result)
424 {
425     for (size_t i = target.size() - 1; i != 0; --i) {
426         if (codeInfo.GetCodeInfo(target[i]) != SUCCEED) {
427             continue;
428         }
429         codeInfo.fIndex = 0;
430         ProcessCodeLoop(codeInfo, target, i, result);
431     }
432 }
433 
Read(const char * filePath,const std::vector<uint16_t> & utf16Target) const434 int32_t HyphenReader::Read(const char* filePath, const std::vector<uint16_t>& utf16Target) const
435 {
436     CodeInfo codeInfo;
437     if (!InitializeCodeInfo(codeInfo, filePath)) {
438         return FAILED;
439     }
440 
441     std::vector<uint8_t> result(utf16Target.size(), 0);
442     ProcessCodeInfo(codeInfo, utf16Target, result);
443 
444     codeInfo.ClearResource();
445     PrintResult(result, utf16Target);
446     return SUCCEED;
447 }
448 } // namespace OHOS::Hyphenate
449 
450 namespace {
451 constexpr size_t ARG_NUM = 2;
452 
CheckArgs(int argc,char ** argv)453 std::vector<uint16_t> CheckArgs(int argc, char** argv)
454 {
455     std::vector<uint16_t> target;
456     if (argc != 3) { // 3: valid argument number
457         cout << "usage: './hyphen hyph-en-us.hpb <mytestword>' " << endl;
458         return target;
459     }
460     target = OHOS::Hyphenate::GetInputWord(argv[ARG_NUM]);
461     if (target.empty()) {
462         cout << "usage: './hyphen hyph-en-us.hpb <mytestword>' " << endl;
463     }
464     return target;
465 }
466 } // namespace
467 
main(int argc,char ** argv)468 int main(int argc, char** argv)
469 {
470     std::vector<uint16_t> target = CheckArgs(argc, argv);
471     if (target.empty()) {
472         return FAILED;
473     }
474 
475     OHOS::Hyphenate::HyphenReader hyphenReader;
476     return hyphenReader.Read(argv[1], target);
477 }
478