1 /*
2 * Copyright (c) 2023 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "namemangler.h"
17 #include <regex>
18 #include <cassert>
19 #include <map>
20
21 namespace namemangler {
// DEBUG_ASSERT(f) forwards to assert(f) when __MRT_DEBUG is defined. In every other
// build it expands to a no-op, so the expression f is not evaluated at all.
#ifndef __MRT_DEBUG
#define DEBUG_ASSERT(f) ((void)0)
#else
#define DEBUG_ASSERT(f) assert(f)
#endif
27
// Inputs of at most this many bytes get a codec buffer sized from the input length.
constexpr int KLOCAL_CODE_BUF_SIZE = 1024;
// Largest number of input bytes the codec processes (65536).
constexpr int KMAX_CODEC_BUF_SIZE = (1 << 16);

// Map a nibble value n (0..15) to its hex digit: GETHEXCHAR yields lowercase digits,
// GETHEXCHARU uppercase ones. n is expanded more than once, so it must not have side effects.
#define GETHEXCHAR(n) static_cast<char>(((n) < 10) ? ((n) + '0') : ((n) - 10 + 'a'))
#define GETHEXCHARU(n) static_cast<char>(((n) < 10) ? ((n) + '0') : ((n) - 10 + 'A'))

using StringMap = std::map<const std::string, const std::string>;
35
36 // The returned buffer needs to be explicitly freed
AllocCodecBuf(size_t maxLen)37 static inline char *AllocCodecBuf(size_t maxLen)
38 {
39 if (maxLen == 0) {
40 return nullptr;
41 }
42 // each char may have 2 more char, so give out the max space buffer
43 constexpr int multi = 3;
44 return reinterpret_cast<char *>(
45 malloc((maxLen <= KLOCAL_CODE_BUF_SIZE) ? multi * maxLen : multi * KMAX_CODEC_BUF_SIZE));
46 }
47
// Releases a buffer obtained from AllocCodecBuf. A null pointer is accepted and ignored.
static inline void FreeCodecBuf(char *buf)
{
    if (buf == nullptr) {
        return;
    }
    free(buf);
}
52
EncodeName(const std::string & name)53 std::string EncodeName(const std::string &name)
54 {
55 // name is guaranteed to be null-terminated
56 size_t nameLen = name.length();
57 nameLen = nameLen > KMAX_CODEC_BUF_SIZE ? KMAX_CODEC_BUF_SIZE : nameLen;
58 char *buf = AllocCodecBuf(nameLen);
59 if (buf == nullptr) {
60 return std::string(name);
61 }
62
63 size_t pos = 0;
64 size_t i = 0;
65 std::string str(name);
66 std::u16string str16;
67 while (i < nameLen) {
68 unsigned char c = static_cast<unsigned char>(name[i]);
69 if (c == '_') {
70 buf[pos++] = '_';
71 buf[pos++] = '_';
72 } else if (c == '[') {
73 buf[pos++] = 'A';
74 } else if (isalnum(c)) {
75 buf[pos++] = static_cast<char>(c);
76 } else if (c <= 0x7F) {
77 // _XX: '_' followed by ascii code in hex
78 if (c == '.') {
79 c = '/'; // use / in package name
80 }
81 buf[pos++] = '_';
82 unsigned char n = c >> 4; // get the high 4 bit and calculate
83 buf[pos++] = GETHEXCHARU(n);
84 n = static_cast<unsigned char>(c - static_cast<unsigned char>(n << 4)); // revert the high 4 bit
85 buf[pos++] = GETHEXCHARU(n);
86 } else {
87 str16.clear();
88 // process one 16-bit char at a time
89 unsigned int n = UTF8ToUTF16(str16, str.substr(i), 1, false);
90 buf[pos++] = '_';
91 if ((n >> 16) == 1) { // if n is 16-bit
92 unsigned short m = str16[0];
93 buf[pos++] = 'u';
94 buf[pos++] = GETHEXCHAR((m & 0xF000) >> 12);
95 buf[pos++] = GETHEXCHAR((m & 0x0F00) >> 8);
96 buf[pos++] = GETHEXCHAR((m & 0x00F0) >> 4);
97 buf[pos++] = GETHEXCHAR(m & 0x000F);
98 } else {
99 unsigned short m = str16[0];
100 buf[pos++] = 'U';
101 buf[pos++] = GETHEXCHAR((m & 0xF000) >> 12);
102 buf[pos++] = GETHEXCHAR((m & 0x0F00) >> 8);
103 buf[pos++] = GETHEXCHAR((m & 0x00F0) >> 4);
104 buf[pos++] = GETHEXCHAR(m & 0x000F);
105 m = str16[1];
106 buf[pos++] = GETHEXCHAR((m & 0xF000) >> 12);
107 buf[pos++] = GETHEXCHAR((m & 0x0F00) >> 8);
108 buf[pos++] = GETHEXCHAR((m & 0x00F0) >> 4);
109 buf[pos++] = GETHEXCHAR(m & 0x000F);
110 }
111 i += static_cast<size_t>(int32_t(n & 0xFFFF) - 1);
112 }
113 i++;
114 }
115
116 buf[pos] = '\0';
117 std::string newName = std::string(buf, pos);
118 FreeCodecBuf(buf);
119 return newName;
120 }
121
// Computes the next value of the decoder's primType flag for character ch:
//   'L'               -> false
//   ';', '(' or ')'   -> true when splitNo > 1, otherwise unchanged
//   anything else     -> unchanged
static inline bool UpdatePrimType(bool primType, int splitNo, uint32_t ch)
{
    switch (ch) {
        case 'L':
            return false;
        case ';':
        case '(':
        case ')':
            return (splitNo > 1) ? true : primType;
        default:
            return primType;
    }
}
134
namespace {
// File-local numeric constants.
constexpr size_t k64BitShift = 6;  // 64 is 1 << 6
constexpr int kCodeOffset = 4;     // bit shift of the second-lowest hex digit
constexpr int kCodeOffset2 = 8;    // bit shift of the third-lowest hex digit (one byte)
constexpr int kCodeOffset3 = 12;   // bit shift of the highest of four hex digits
constexpr int kNumLimit = 10;      // value of the first alphabetic hex digit
}  // namespace
142
DecodeName(const std::string & name)143 std::string DecodeName(const std::string &name)
144 {
145 if (name.find(';') != std::string::npos) { // no need Decoding a non-encoded string
146 return name;
147 }
148 std::string decompressedName;
149 const char *namePtr = nullptr;
150 size_t nameLen;
151 namePtr = name.c_str();
152 nameLen = name.length();
153
154 // Demangled name is supposed to be shorter. No buffer overflow issue here.
155 std::string newName(nameLen, '\0');
156
157 bool primType = true;
158 int splitNo = 0; // split: class 0 | method 1 | signature 2
159 size_t pos = 0;
160 std::string str;
161 std::u16string str16;
162 for (size_t i = 0; i < nameLen;) {
163 unsigned char c = static_cast<unsigned char>(namePtr[i]);
164 ++i;
165 if (c == '_') { // _XX: '_' followed by ascii code in hex
166 if (i >= nameLen) {
167 break;
168 }
169 if (namePtr[i] == '_') {
170 newName[pos++] = namePtr[i++];
171 } else if (namePtr[i] == 'u') {
172 str.clear();
173 str16.clear();
174 i++;
175 c = static_cast<unsigned char>(namePtr[i++]);
176 uint8_t b1 = (c <= '9') ? c - '0' : c - 'a' + kNumLimit;
177 c = static_cast<unsigned char>(namePtr[i++]);
178 uint8_t b2 = (c <= '9') ? c - '0' : c - 'a' + kNumLimit;
179 c = static_cast<unsigned char>(namePtr[i++]);
180 uint8_t b3 = (c <= '9') ? c - '0' : c - 'a' + kNumLimit;
181 c = static_cast<unsigned char>(namePtr[i++]);
182 uint8_t b4 = (c <= '9') ? c - '0' : c - 'a' + kNumLimit;
183 uint32_t codepoint = (b1 << kCodeOffset3) | (b2 << kCodeOffset2) | (b3 << kCodeOffset) | b4;
184 str16 += static_cast<char16_t>(codepoint);
185 unsigned int count = UTF16ToUTF8(str, str16, 1, false) >> 16; // shift 16 to get count
186 if (count == 2) { // the count of str equal 2 to 4, use array to save the utf8
187 newName[pos++] = str[0];
188 newName[pos++] = str[1];
189 } else if (count == 3) { // the count of str equal 2 to 4, deal 3 new
190 newName[pos++] = str[0];
191 newName[pos++] = str[1];
192 newName[pos++] = str[2]; // 2 is index of third char
193 } else if (count == 4) { // the count of str equal 2 to 4
194 newName[pos++] = str[0];
195 newName[pos++] = str[1];
196 newName[pos++] = str[2]; // 2 is index of third char
197 newName[pos++] = str[3]; // 3 is index of fourth char
198 }
199 } else {
200 c = static_cast<unsigned char>(namePtr[i++]);
201 unsigned int v = (c <= '9') ? c - '0' : c - 'A' + kNumLimit;
202 unsigned int asc = v << kCodeOffset;
203 if (i >= nameLen) {
204 break;
205 }
206 c = static_cast<unsigned char>(namePtr[i++]);
207 v = (c <= '9') ? c - '0' : c - 'A' + kNumLimit;
208 asc += v;
209
210 newName[pos++] = static_cast<char>(asc);
211
212 if (asc == '|') {
213 splitNo++;
214 }
215
216 primType = UpdatePrimType(primType, splitNo, asc);
217 }
218 } else {
219 if (splitNo < 2) { // split: class 0 | method 1 | signature 2
220 newName[pos++] = static_cast<char>(c);
221 continue;
222 }
223
224 primType = UpdatePrimType(primType, splitNo, c);
225 if (primType) {
226 newName[pos++] = (c == 'A') ? '[' : c;
227 } else {
228 newName[pos++] = static_cast<char>(c);
229 }
230 }
231 }
232
233 newName.resize(pos);
234 return newName;
235 }
236
237 // input: maple name
238 // output: Lj/lang/Object; [Lj/lang/Object;
DecodeMapleNameToJDescriptor(const std::string & nameIn,std::string & nameOut)239 void DecodeMapleNameToJDescriptor(const std::string &nameIn, std::string &nameOut)
240 {
241 nameOut = DecodeName(nameIn);
242 if (nameOut[0] == 'A') {
243 size_t i = 0;
244 while (nameOut[i] == 'A') {
245 nameOut[i++] = '[';
246 }
247 }
248 }
249
ChangeEndian16(uint16_t u16)250 static uint16_t ChangeEndian16(uint16_t u16)
251 {
252 return ((u16 & 0xFF00) >> kCodeOffset2) | ((u16 & 0xFF) << kCodeOffset2);
253 }
254
255 /* UTF8
256 * U+0000 - U+007F 0xxxxxxx
257 * U+0080 - U+07FF 110xxxxx 10xxxxxx
258 * U+0800 - U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
259 * U+10000- U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
260 *
261 * UTF16
262 * U+0000 - U+D7FF codePoint
263 * U+E000 - U+FFFF codePoint
264 * U+10000- U+10FFFF XXXX YYYY
265 * code = codePoint - 0x010000, ie, 20-bit number in the range 0x000000..0x0FFFFF
266 * XXXX: top 10 bits of code + 0xD800: 0xD800..0xDBFF
267 * YYYY: low 10 bits of code + 0xDC00: 0xDC00..0xDFFF
268 *
 * convert up to num UTF8 elements
 * return two 16-bit values packed into one: (return_number_of_elements << 16) | consumed_input_number_of_elements
271 */
// Bit shifts that place the payload of each byte of a UTF-8 sequence inside the code point.
constexpr int kCodepointOffset1 = 6;   // U+0080 - U+07FF 110xxxxx 10xxxxxx
constexpr int kCodepointOffset2 = 12;  // U+0800 - U+FFFF 1110xxxx 10xxxxxx 10xxxxxx
constexpr int kCodepointOffset3 = 18;  // U+10000- U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// The converters return the produced element count shifted left by this many bits.
constexpr int kCountOffset = 16;
// Payload bits per UTF-16 surrogate.
constexpr int kCodeAfterMinusOffset = 10;  // codePoint equals itself minus 0x10000
277
UTF16ToUTF8(std::string & str,const std::u16string & str16,unsigned short num,bool isBigEndian)278 unsigned UTF16ToUTF8(std::string &str, const std::u16string &str16, unsigned short num, bool isBigEndian)
279 {
280 uint32_t codePoint = 0;
281 uint32_t i = 0;
282 unsigned short count = 0;
283 unsigned short retNum = 0;
284 while (i < str16.length()) {
285 if (isBigEndian || num == 1) {
286 codePoint = str16[i++];
287 } else {
288 codePoint = ChangeEndian16(str16[i++]);
289 }
290 if (codePoint > 0xFFFF) {
291 codePoint &= 0x3FF;
292 codePoint <<= kNumLimit;
293 if (isBigEndian) {
294 codePoint += str16[i++] & 0x3FF;
295 } else {
296 codePoint += ChangeEndian16(str16[i++]) & 0x3FF;
297 }
298 }
299 if (codePoint <= 0x7F) {
300 str += static_cast<char>(codePoint);
301 retNum += 1; // 1 UTF8 char
302 } else if (codePoint <= 0x7FF) {
303 str += static_cast<char>(0xC0 + (codePoint >> kCodepointOffset1));
304 str += static_cast<char>(0x80 + (codePoint & 0x3F));
305 retNum += 2; // 2 UTF8 chars
306 } else if (codePoint <= 0xFFFF) {
307 str += static_cast<char>(0xE0 + ((codePoint >> kCodepointOffset2) & 0xF));
308 str += static_cast<char>(0x80 + ((codePoint >> kCodepointOffset1) & 0x3F));
309 str += static_cast<char>(0x80 + (codePoint & 0x3F));
310 retNum += 3; // 3 UTF8 chars
311 } else {
312 str += static_cast<char>(0xF0 + ((codePoint >> kCodepointOffset3) & 0x7));
313 str += static_cast<char>(0x80 + ((codePoint >> kCodepointOffset2) & 0x3F));
314 str += static_cast<char>(0x80 + ((codePoint >> kCodepointOffset1) & 0x3F));
315 str += static_cast<char>(0x80 + (codePoint & 0x3F));
316 retNum += 4; // 4 UTF8 chars
317 }
318 count++;
319 if (num == count) {
320 return ((static_cast<unsigned>(retNum)) << kCountOffset) | static_cast<unsigned>(i);
321 }
322 }
323 return i;
324 }
325
// Returns true when str8 contains at least one byte above 0x7F, i.e. it is not pure ASCII.
bool NeedConvertUTF16(const std::string &str8)
{
    constexpr uint8_t maxValidAscii = 0x7F;
    for (const char ch : str8) {
        if (static_cast<uint8_t>(ch) > maxValidAscii) {
            return true;
        }
    }
    return false;
}
340
GetCodePoint(const std::string & str8,uint32_t & i)341 uint32_t GetCodePoint(const std::string &str8, uint32_t &i)
342 {
343 uint32_t b;
344 uint32_t c;
345 uint32_t d;
346 uint32_t codePoint = 0;
347 uint32_t a = static_cast<uint8_t>(str8[i++]);
348 if (a <= 0x7F) { // 0...
349 codePoint = a;
350 } else if (a >= 0xF0) { // 11110...
351 b = static_cast<uint32_t>(str8[i++]);
352 c = static_cast<uint32_t>(str8[i++]);
353 d = static_cast<uint32_t>(str8[i++]);
354 codePoint = ((a & 0x7) << kCodepointOffset3) | ((b & 0x3F) << kCodepointOffset2) |
355 ((c & 0x3F) << kCodepointOffset1) | (d & 0x3F);
356 } else if (a >= 0xE0) { // 1110...
357 b = static_cast<uint32_t>(str8[i++]);
358 c = static_cast<uint32_t>(str8[i++]);
359 codePoint = ((a & 0xF) << kCodepointOffset2) | ((b & 0x3F) << kCodepointOffset1) | (c & 0x3F);
360 } else if (a >= 0xC0) { // 110...
361 b = static_cast<uint32_t>(str8[i++]);
362 codePoint = ((a & 0x1F) << kCodepointOffset1) | (b & 0x3F);
363 } else {
364 DEBUG_ASSERT(false && "invalid UTF-8");
365 }
366 return codePoint;
367 }
368
369 // convert upto num UTF16 elements
370 // two 16-bit values: return_number_of_elements | consumed_input_number_of_elements
UTF8ToUTF16(std::u16string & str16,const std::string & str8,unsigned short num,bool isBigEndian)371 unsigned UTF8ToUTF16(std::u16string &str16, const std::string &str8, unsigned short num, bool isBigEndian)
372 {
373 uint32_t i = 0;
374 unsigned short count = 0;
375 unsigned short retNum = 0;
376 while (i < str8.length()) {
377 uint32_t codePoint = GetCodePoint(str8, i);
378 if (codePoint <= 0xFFFF) {
379 if (isBigEndian || num == 1) {
380 str16 += static_cast<char16_t>(codePoint);
381 } else {
382 str16 += static_cast<char16_t>(ChangeEndian16(static_cast<uint16_t>(codePoint)));
383 }
384 retNum += 1; // 1 utf16
385 } else {
386 codePoint -= 0x10000;
387 if (isBigEndian || num == 1) {
388 str16 += static_cast<char16_t>((codePoint >> kCodeAfterMinusOffset) | 0xD800);
389 str16 += static_cast<char16_t>((codePoint & 0x3FF) | 0xDC00);
390 } else {
391 str16 += static_cast<char16_t>(
392 ChangeEndian16(static_cast<uint16_t>((codePoint >> kCodeAfterMinusOffset) | 0xD800)));
393 str16 += static_cast<char16_t>(ChangeEndian16((codePoint & 0x3FF) | 0xDC00));
394 }
395 retNum += 2; // 2 utf16
396 }
397 count++;
398 // only convert num elmements
399 if (num == count) {
400 return (static_cast<char16_t>(retNum) << kCountOffset) | static_cast<char16_t>(i);
401 }
402 }
403 return i;
404 }
405
// Number of payload bits carried by each LEB128 byte.
const uint32_t kGreybackOffset = 7;

// Appends the unsigned LEB128 encoding of value to dest: 7 payload bits per byte, least
// significant group first, with bit 0x80 set on every byte except the last one.
void GetUnsignedLeb128Encode(std::vector<uint8_t> &dest, uint32_t value)
{
    while (true) {
        const uint8_t group = static_cast<uint8_t>(value & 0x7f);
        value >>= kGreybackOffset;
        if (value == 0) {
            dest.push_back(group);
            return;
        }
        dest.push_back(static_cast<uint8_t>(group | 0x80));
    }
}
420
GetUnsignedLeb128Decode(const uint8_t ** data)421 uint32_t GetUnsignedLeb128Decode(const uint8_t **data)
422 {
423 DEBUG_ASSERT(data != nullptr && "data in GetUnsignedLeb128Decode() is nullptr");
424 const uint8_t *ptr = *data;
425 uint32_t result = 0;
426 uint32_t shift = 0;
427 uint8_t byte = 0;
428 while (true) {
429 byte = *(ptr++);
430 result |= (byte & 0x7f) << shift;
431 if ((byte & 0x80) == 0) {
432 break;
433 }
434 shift += kGreybackOffset;
435 }
436 *data = ptr;
437 return result;
438 }
439
GetLEB128Encode(int64_t val,bool isUnsigned)440 uint64_t GetLEB128Encode(int64_t val, bool isUnsigned)
441 {
442 uint64_t res = 0;
443 uint8_t byte = 0;
444 uint8_t count = 0;
445 bool done = false;
446 do {
447 byte = static_cast<uint64_t>(val) & 0x7f;
448 val >>= kGreybackOffset; // intended signed shift: block codedex here
449 done = (isUnsigned ? val == 0 : (val == 0 || val == -1));
450 if (!done) {
451 byte |= 0x80;
452 }
453 res |= (static_cast<uint64_t>(byte) << (count++ << 3)); // each byte need 8 bit (left shift 3)
454 } while (!done);
455 return res;
456 }
457
GetUleb128Encode(uint64_t val)458 uint64_t GetUleb128Encode(uint64_t val)
459 {
460 return GetLEB128Encode(int64_t(val), true);
461 }
462
GetSleb128Encode(int64_t val)463 uint64_t GetSleb128Encode(int64_t val)
464 {
465 return GetLEB128Encode(val, false);
466 }
467
// Returns val unchanged.
// NOTE(review): no LEB128 unpacking is performed here - confirm callers expect an identity.
uint64_t GetUleb128Decode(uint64_t val)
{
    const uint64_t decoded = val;
    return decoded;
}
472
// Returns val reinterpreted as a signed 64-bit value.
// NOTE(review): no LEB128 unpacking is performed here - confirm callers expect a plain cast.
int64_t GetSleb128Decode(uint64_t val)
{
    const int64_t asSigned = static_cast<int64_t>(val);
    return asSigned;
}
477
// Returns the number of bytes the unsigned LEB128 encoding of v occupies (1 to 10).
// Zero is handled explicitly: it is encoded as one byte, and __builtin_clzll(0) is undefined.
size_t GetUleb128Size(uint64_t v)
{
    constexpr size_t kValueBits = 64;
    constexpr size_t kGroupBits = 7;
    if (v == 0) {
        return 1;
    }
    const size_t significantBits = kValueBits - static_cast<size_t>(__builtin_clzll(v));
    // num of 7-bit groups, rounded up
    return (significantBits + kGroupBits - 1) / kGroupBits;
}
485
GetSleb128Size(int32_t v)486 size_t GetSleb128Size(int32_t v)
487 {
488 size_t size = 0;
489 int rem = v >> kGreybackOffset;
490 bool hasMore = true;
491 int end = ((v >= 0) ? 0 : -1);
492
493 while (hasMore) {
494 hasMore = (rem != end) || ((rem & 1) != ((v >> k64BitShift) & 1)); // judege whether has More valid rem
495 size++;
496 v = rem;
497 rem >>= static_cast<int>(kGreybackOffset); // intended signed shift: block codedex here
498 }
499 return size;
500 }
501 } // namespace namemangler
502