1 /*
2 * Copyright (c) 2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15 #include "hyphen_pattern.h"
16
17 #include <codecvt>
18 #include <cstdio>
19 #include <cerrno>
20 #include <fcntl.h>
21 #include <fstream>
22 #include <iostream>
23 #include <map>
24 #include <cstdint>
25 #include <cstdio>
26 #include <cstdlib>
27 #include <sys/mman.h>
28 #include <sys/stat.h>
29 #include <sys/types.h>
30 #include <unicode/utf.h>
31 #include <unicode/utf8.h>
32 #include <unistd.h>
33
34 using namespace std;
35
36 namespace OHOS::Hyphenate {
37
ConvertToUtf16(const string & utf8Str)38 vector<uint16_t> ConvertToUtf16(const string& utf8Str)
39 {
40 int32_t i = 0;
41 UChar32 c = 0;
42 vector<uint16_t> target;
43 const int32_t textLength = utf8Str.size();
44 while (i < textLength) {
45 U8_NEXT(reinterpret_cast<const uint8_t*>(utf8Str.c_str()), i, textLength, c);
46 if (U16_LENGTH(c) == 1) {
47 target.push_back(c);
48 } else {
49 target.push_back(U16_LEAD(c));
50 target.push_back(U16_TRAIL(c));
51 }
52 }
53 return target;
54 }
55
// View onto a run of pattern bytes inside the mapped pattern file.
// The array is declared with eight entries, but ProcessPattern indexes it
// with a count decoded from the file, so the declared bound is nominal.
struct Pattern {
    uint8_t patterns[8]; // dynamic: real length comes from the file
};
59
// View onto a counted array of 16-bit values inside the mapped pattern file:
// one count word followed by the values themselves. The readers in this file
// use it for the top-level mappings table as well as for linear and pair nodes.
struct ArrayOf16bits {
    uint16_t count;    // number of entries that follow
    uint16_t codes[3]; // dynamic: real length is `count`
};
64
65 struct Header {
66 uint8_t magic1;
67 uint8_t magic2;
68 uint8_t minCp;
69 uint8_t maxCp;
70 uint32_t toc;
71 uint32_t mappings;
72 uint32_t version;
73
CodeOffsetOHOS::Hyphenate::Header74 inline uint16_t CodeOffset(uint16_t code, const ArrayOf16bits* maps = nullptr) const
75 {
76 if (maps && (code < minCp || code > maxCp)) {
77 for (size_t i = maps->count; i != 0;) {
78 i -= HYPHEN_BASE_CODE_SHIFT;
79 if (maps->codes[i] == code) {
80 // cout << "resolved mapping ix: " << static_cast<int>(m->codes[i + 1]) << endl;
81 auto offset = maps->codes[i + 1];
82 return (maxCp - minCp) * HYPHEN_BASE_CODE_SHIFT + (offset - maxCp) * HYPHEN_BASE_CODE_SHIFT + 1;
83 }
84 }
85 return MaxCount(maps);
86 }
87 if (maps) {
88 // + 1 because previous end is before next start
89 // 2x because every second value to beginning addres
90 return (code - minCp) * HYPHEN_BASE_CODE_SHIFT + 1;
91 } else {
92 if (code < minCp || code > maxCp) {
93 return maxCp + 1;
94 }
95 return (code - minCp);
96 }
97 }
98
ToLowerOHOS::Hyphenate::Header99 inline static void ToLower(uint16_t& code)
100 {
101 if (code == '.') {
102 code = '`';
103 } else if (code == '\'') {
104 code = '^';
105 } else if (code == '-') {
106 code = '_';
107 } else {
108 code = tolower(code);
109 }
110 cout << "tolower: " << hex << static_cast<int>(code) << endl;
111 }
112
MaxCountOHOS::Hyphenate::Header113 inline uint16_t MaxCount(const ArrayOf16bits* maps) const
114 {
115 // need to write this in binary provider !!
116 return (maxCp - minCp) * HYPHEN_BASE_CODE_SHIFT + maps->count;
117 }
118 };
119
120 struct CodeInfo {
121 int32_t OpenPatFile(const char* filePath);
122 int32_t GetHeader();
123 int32_t GetCodeInfo(uint16_t code);
124 void ProcessPattern(const size_t& offset, vector<uint8_t>& result, bool direct);
125 bool ProcessDirect(const std::vector<uint16_t>& target, const size_t& offset);
126 void ProcessLinear(const std::vector<uint16_t>& target, const size_t& offset, vector<uint8_t>& result);
127 bool ProcessNextCode(const std::vector<uint16_t>& target, const size_t& offset);
128 void ClearResource();
129 Header* fHeader{nullptr};
130 uint8_t* fAddress{nullptr};
131 FILE* fFile{nullptr};
132 size_t fFileSize{0};
133 uint16_t fMaxCount{0};
134 PathType fType{PathType::PATTERN};
135 uint16_t fOffset{0};
136 uint16_t fCode{0};
137 uint32_t fIndex{0};
138 uint32_t fNextOffset;
139 uint16_t* fStaticOffset{nullptr};
140 ArrayOf16bits* fMappings{nullptr};
141 };
142
OpenPatFile(const char * filePath)143 int32_t CodeInfo::OpenPatFile(const char* filePath)
144 {
145 cout << "Attempt to mmap " << filePath << endl;
146
147 FILE* file = fopen(filePath, "r");
148 if (file == nullptr) {
149 cerr << "FATAL: " << errno << endl;
150 return FAILED;
151 }
152
153 struct stat st;
154 if (fstat(fileno(file), &st) != 0) {
155 cerr << "FATAL: fstat" << endl;
156 fclose(file);
157 return FAILED;
158 }
159 size_t length = st.st_size;
160 uint8_t* address = static_cast<uint8_t*>(mmap(nullptr, length, PROT_READ, MAP_PRIVATE, fileno(file), 0u));
161 if (address == MAP_FAILED) {
162 cerr << "FATAL: mmap" << endl;
163 fclose(file);
164 return FAILED;
165 }
166
167 cout << "Magic: " << hex << *reinterpret_cast<uint32_t*>(address) << dec << endl;
168 this->fFile = file;
169 this->fFileSize = length;
170 this->fAddress = address;
171 return SUCCEED;
172 }
173
GetInputWord(const char * input)174 static std::vector<uint16_t> GetInputWord(const char* input)
175 {
176 const std::string utf8Str = "." + std::string(input) + ".";
177 std::vector<uint16_t> target = ConvertToUtf16(utf8Str);
178 for (auto& code : target) {
179 Header::ToLower(code);
180 }
181 return target;
182 }
183
GetHeader()184 int32_t CodeInfo::GetHeader()
185 {
186 fHeader = reinterpret_cast<Header*>(fAddress);
187 uint16_t minCp = fHeader->minCp;
188 uint16_t maxCp = fHeader->maxCp;
189 // get master table, it always is in direct mode
190 fMappings = reinterpret_cast<ArrayOf16bits*>(reinterpret_cast<uint32_t*>(fAddress + fHeader->mappings));
191 // this is actually beyond the real 32 bit address, but just to have an offset that
192 // is clearly out of bounds without recalculating it again
193 fMaxCount = fHeader->MaxCount(fMappings);
194 cout << "min/max: " << minCp << "/" << maxCp << " count " << static_cast<int>(fMaxCount) << endl;
195 cout << "size of top level mappings: " << static_cast<int>(fMappings->count) << endl;
196 if (minCp == maxCp && fMappings->count == 0) {
197 cerr << "### unexpected min/max in input file-> exit" << endl;
198 return FAILED;
199 }
200 return SUCCEED;
201 }
202
ClearResource()203 void CodeInfo::ClearResource()
204 {
205 (void)munmap(fAddress, fFileSize);
206 fAddress = nullptr;
207 (void)fclose(fFile);
208 fFile = nullptr;
209 fFileSize = 0;
210 }
211
GetCodeInfo(uint16_t code)212 int32_t CodeInfo::GetCodeInfo(uint16_t code)
213 {
214 fType = PathType::PATTERN;
215 this->fCode = code;
216 this->fIndex = 0;
217 fOffset = fHeader->CodeOffset(code, fMappings);
218 if (fOffset == fMaxCount) {
219 cout << hex << char(code) << " unable to map, contiue straight" << endl;
220 return FAILED;
221 }
222
223 // previous entry end
224 uint32_t baseOffset =
225 *reinterpret_cast<uint32_t*>(reinterpret_cast<uint32_t*>(fAddress + fHeader->toc) + fOffset - 1);
226 uint32_t initialValue = *(reinterpret_cast<uint32_t*>(fAddress + fHeader->toc) + fOffset);
227 fType = static_cast<PathType>(initialValue >> SHIFT_BITS_30);
228 // direct and pairs need to have offset different from zero
229 if (initialValue == 0 && (fType == PathType::DIRECT || fType == PathType::PAIRS)) {
230 cout << char(code) << " is not in main dict, contiue straight" << endl;
231 return FAILED;
232 }
233 // base offset is 16 bit
234 fStaticOffset = reinterpret_cast<uint16_t*>(fAddress + HYPHEN_BASE_CODE_SHIFT * baseOffset);
235
236 // get a subtable according character
237 // once: read as 32bit, the rest of the access will be 16bit (13bit for offsets)
238 fNextOffset = (initialValue & 0x3fffffff);
239
240 cout << hex << baseOffset << " top level code: 0x" << hex << static_cast<int>(code) <<
241 " starting with offset: 0x" << hex << fOffset << " table-offset 0x" << fNextOffset << endl;
242 return SUCCEED;
243 }
244
ProcessPattern(const size_t & offset,vector<uint8_t> & result,bool direct)245 void CodeInfo::ProcessPattern(const size_t& offset, vector<uint8_t>& result, bool direct)
246 {
247 cout << "direct : " << direct << " " << hex << fNextOffset << endl;
248 uint16_t poffset = 0;
249 if (direct && (fHeader->version >> 0x18) >= 0x2) {
250 poffset = *(reinterpret_cast<uint16_t*>(fAddress) + fNextOffset + (fHeader->version & 0xffff));
251 } else {
252 poffset = *(fStaticOffset + fNextOffset);
253 }
254 fNextOffset++; // there now is always at least pattern count before next node
255 if (!poffset) {
256 return;
257 }
258 uint16_t count = (poffset >> 0xc) * 0x4; // patterns are padded to 4 byte arrays.
259 // to save bits, the count is multiplied by four
260 poffset = 0xfff & poffset;
261
262 // if we have reached pattern, apply it to result
263 auto p = reinterpret_cast<const Pattern*>(fAddress + poffset);
264 if (count != 0) {
265 cout << "Node with a pattern, count " << count << hex << " offset: " << poffset << endl;
266 size_t i = 0;
267 for (size_t j = offset - fIndex; j < result.size() && i < count; j++) {
268 cout << " " << static_cast<int>(j) << ": pattern index: " << i << " value: 0x" << hex
269 << static_cast<int>(p->patterns[i]) << endl;
270 result[j] = std::max(result[j], (p->patterns[i]));
271 i++;
272 }
273 }
274 }
275
ProcessDirect(const std::vector<uint16_t> & target,const size_t & offset)276 bool CodeInfo::ProcessDirect(const std::vector<uint16_t>& target, const size_t& offset)
277 {
278 // resolve new code point
279 if (fIndex == offset) { // should never be the case
280 cout << "# break loop on direct" << endl;
281 return true;
282 }
283
284 fIndex++;
285 fCode = target[offset - fIndex];
286 fOffset = fHeader->CodeOffset(fCode);
287 if (fHeader->minCp != fHeader->maxCp && fOffset > fHeader->maxCp) {
288 cout << "# break loop on direct" << endl;
289 return true;
290 }
291
292 auto nextValue = *(fStaticOffset + fNextOffset + fOffset);
293 fNextOffset = nextValue & 0x3fff;
294 fType = static_cast<PathType>(nextValue >> SHIFT_BITS_14);
295 cout << " found direct: " << char(fCode) << " : " << hex << nextValue << " with offset: " << fNextOffset << endl;
296 return false;
297 }
298
ProcessLinear(const std::vector<uint16_t> & target,const size_t & offset,vector<uint8_t> & result)299 void CodeInfo::ProcessLinear(const std::vector<uint16_t>& target, const size_t& offset, vector<uint8_t>& result)
300 {
301 auto p = reinterpret_cast<const ArrayOf16bits*>(fStaticOffset + fNextOffset);
302 auto count = p->count;
303
304 fIndex++;
305 cout << "# linear " << offset << " " << fIndex << endl;
306 if (fIndex > offset || count > (offset - fIndex + 1)) {
307 // the pattern is longer than the remaining word
308 cout << "# break loop on linear " << offset << " " << fIndex << endl;
309 return;
310 }
311 // check the rest of the string
312 for (auto j = 0; j < count; j++) {
313 cout << " linear " << offset << " index: " << j << " value: " << hex << static_cast<int>(p->codes[j]) <<
314 " vs " << static_cast<int>(target[offset - fIndex]) << endl;
315 if (p->codes[j] != target[offset - fIndex]) {
316 return;
317 } else {
318 fIndex++;
319 }
320 }
321 // if we reach the end, apply pattern
322 fNextOffset += count + 1; // array items + one for the count
323 fIndex--; // because of recursion
324 ProcessPattern(offset, result, false);
325 if (*(fStaticOffset + fNextOffset) != 0 && offset > count) { // peek if there is more to come
326 // make it tail recursive to save stack
327 return ProcessLinear(target, offset, result);
328 }
329 }
330
ProcessNextCode(const std::vector<uint16_t> & target,const size_t & offset)331 bool CodeInfo::ProcessNextCode(const std::vector<uint16_t>& target, const size_t& offset)
332 {
333 // resolve new code point
334 if (fIndex == offset) { // should detect this sooner
335 cout << "# break loop on pairs" << endl;
336 return true;
337 }
338 auto p = reinterpret_cast<const ArrayOf16bits*>(fStaticOffset + fNextOffset);
339 uint16_t count = p->count;
340 fIndex++;
341 cout << " continue to value pairs with size: " << count << " and code '" <<
342 static_cast<int>(target[offset - fIndex]) << "'" << endl;
343
344 // check pairs, array is sorted (but small)
345 bool match = false;
346 for (size_t j = 0; j < count; j += HYPHEN_BASE_CODE_SHIFT) {
347 cout << " checking pair: " << j << " value: " << hex << static_cast<int>(p->codes[j]) << " vs " <<
348 static_cast<int>(target[offset - fIndex]) << endl;
349 if (p->codes[j] == target[offset - fIndex]) {
350 fCode = target[offset - fIndex];
351 cout << " new value pair in : 0x" << j << " with code 0x" << hex << static_cast<int>(fCode) << "'" <<
352 endl;
353 fOffset = fHeader->CodeOffset(fCode);
354 if (fHeader->minCp != fHeader->maxCp && fOffset > fHeader->maxCp) {
355 cout << "# could not resolve debug offset in pairs" << endl;
356 }
357
358 fNextOffset = p->codes[j + 1] & 0x3fff;
359 fType = static_cast<PathType>(p->codes[j + 1] >> SHIFT_BITS_14);
360 match = true;
361 break;
362 } else if (p->codes[j] > target[offset - fIndex]) {
363 break;
364 }
365 }
366 if (!match) {
367 cout << "# break loop on pairs" << endl;
368 return true;
369 }
370 return false;
371 }
372
// Prints the computed value for every position of the word, one
// "<code in hex>: <value>" line per position, preceded by a size summary.
//
// Nothing but the summary is printed when `result` is more than one entry
// longer than `target`. A result that is exactly one entry longer is printed
// up to the last code of `target`; the surplus entry has no code to pair with
// and is skipped instead of reading past the end of `target`.
//
// Leaves std::cout in hex mode when at least one line was printed.
void PrintResult(const std::vector<uint8_t>& result, const std::vector<uint16_t>& target)
{
    std::cout << std::dec << "result size: " << result.size() << " while expecting " << target.size() << std::endl;
    if (result.size() > target.size() + 1) {
        return;
    }
    for (size_t i = 0; i < result.size() && i < target.size(); ++i) {
        std::cout << std::hex << static_cast<int>(target[i]) << ": " << std::to_string(result[i]) << std::endl;
    }
}
383
InitializeCodeInfo(OHOS::Hyphenate::CodeInfo & codeInfo,const char * filePath)384 bool InitializeCodeInfo(OHOS::Hyphenate::CodeInfo& codeInfo, const char* filePath)
385 {
386 if (codeInfo.OpenPatFile(filePath) != SUCCEED) {
387 return false;
388 }
389 if (codeInfo.GetHeader() != SUCCEED) {
390 codeInfo.ClearResource();
391 return false;
392 }
393 return true;
394 }
395
ProcessCodeLoop(OHOS::Hyphenate::CodeInfo & codeInfo,const std::vector<uint16_t> & target,size_t i,std::vector<uint8_t> & result)396 void ProcessCodeLoop(OHOS::Hyphenate::CodeInfo& codeInfo, const std::vector<uint16_t>& target, size_t i,
397 std::vector<uint8_t>& result)
398 {
399 bool continueLoop = true;
400 while (continueLoop) {
401 std::cout << "#loop c: '" << codeInfo.fCode << "' starting with offset: 0x" << std::hex << codeInfo.fOffset <<
402 " table-offset 0x" << codeInfo.fNextOffset << " index: " << codeInfo.fIndex << std::endl;
403
404 codeInfo.ProcessPattern(i, result, codeInfo.fType == OHOS::Hyphenate::PathType::PATTERN);
405 if (codeInfo.fType == OHOS::Hyphenate::PathType::PATTERN) {
406 continueLoop = false;
407 } else if (codeInfo.fType == OHOS::Hyphenate::PathType::DIRECT) {
408 if (codeInfo.ProcessDirect(target, i)) {
409 continueLoop = false;
410 }
411 } else if (codeInfo.fType == OHOS::Hyphenate::PathType::LINEAR) {
412 codeInfo.ProcessLinear(target, i, result);
413 continueLoop = false;
414 } else {
415 if (codeInfo.ProcessNextCode(target, i)) {
416 continueLoop = false;
417 }
418 }
419 }
420 }
421
ProcessCodeInfo(OHOS::Hyphenate::CodeInfo & codeInfo,const std::vector<uint16_t> & target,std::vector<uint8_t> & result)422 void ProcessCodeInfo(OHOS::Hyphenate::CodeInfo& codeInfo, const std::vector<uint16_t>& target,
423 std::vector<uint8_t>& result)
424 {
425 for (size_t i = target.size() - 1; i != 0; --i) {
426 if (codeInfo.GetCodeInfo(target[i]) != SUCCEED) {
427 continue;
428 }
429 codeInfo.fIndex = 0;
430 ProcessCodeLoop(codeInfo, target, i, result);
431 }
432 }
433
Read(const char * filePath,const std::vector<uint16_t> & utf16Target) const434 int32_t HyphenReader::Read(const char* filePath, const std::vector<uint16_t>& utf16Target) const
435 {
436 CodeInfo codeInfo;
437 if (!InitializeCodeInfo(codeInfo, filePath)) {
438 return FAILED;
439 }
440
441 std::vector<uint8_t> result(utf16Target.size(), 0);
442 ProcessCodeInfo(codeInfo, utf16Target, result);
443
444 codeInfo.ClearResource();
445 PrintResult(result, utf16Target);
446 return SUCCEED;
447 }
448 } // namespace OHOS::Hyphenate
449
450 namespace {
451 constexpr size_t ARG_NUM = 2;
452
CheckArgs(int argc,char ** argv)453 std::vector<uint16_t> CheckArgs(int argc, char** argv)
454 {
455 std::vector<uint16_t> target;
456 if (argc != 3) { // 3: valid argument number
457 cout << "usage: './hyphen hyph-en-us.hpb <mytestword>' " << endl;
458 return target;
459 }
460 target = OHOS::Hyphenate::GetInputWord(argv[ARG_NUM]);
461 if (target.empty()) {
462 cout << "usage: './hyphen hyph-en-us.hpb <mytestword>' " << endl;
463 }
464 return target;
465 }
466 } // namespace
467
main(int argc,char ** argv)468 int main(int argc, char** argv)
469 {
470 std::vector<uint16_t> target = CheckArgs(argc, argv);
471 if (target.empty()) {
472 return FAILED;
473 }
474
475 OHOS::Hyphenate::HyphenReader hyphenReader;
476 return hyphenReader.Read(argv[1], target);
477 }
478