• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2023 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "namemangler.h"
17 #include <regex>
18 #include <cassert>
19 #include <map>
20 
21 namespace namemangler {
// DEBUG_ASSERT(f) forwards to assert(f) only when __MRT_DEBUG is defined;
// in every other build it expands to a no-op and f is not evaluated.
#ifndef __MRT_DEBUG
#define DEBUG_ASSERT(f) ((void)0)
#else
#define DEBUG_ASSERT(f) assert(f)
#endif

// Input-length threshold used when sizing the codec buffer.
constexpr int KLOCAL_CODE_BUF_SIZE = 1024;
// Upper bound on the number of input bytes the codec processes (1 << 16).
constexpr int KMAX_CODEC_BUF_SIZE = (1 << 16);

// Map a value in [0, 15] to its hexadecimal digit: lower-case (GETHEXCHAR) or upper-case (GETHEXCHARU).
#define GETHEXCHAR(n) static_cast<char>((n) < 10 ? (n) + '0' : (n) - 10 + 'a')
#define GETHEXCHARU(n) static_cast<char>((n) < 10 ? (n) + '0' : (n) - 10 + 'A')

// Ordered string-to-string map with immutable keys and values.
using StringMap = std::map<const std::string, const std::string>;
35 
36 // The returned buffer needs to be explicitly freed
AllocCodecBuf(size_t maxLen)37 static inline char *AllocCodecBuf(size_t maxLen)
38 {
39     if (maxLen == 0) {
40         return nullptr;
41     }
42     // each char may have 2 more char, so give out the max space buffer
43     constexpr int multi = 3;
44     return reinterpret_cast<char *>(
45         malloc((maxLen <= KLOCAL_CODE_BUF_SIZE) ? multi * maxLen : multi * KMAX_CODEC_BUF_SIZE));
46 }
47 
// Release a buffer previously handed out by AllocCodecBuf.
// Passing nullptr is harmless because free() ignores it.
static inline void FreeCodecBuf(char *buf)
{
    free(buf);
}
52 
EncodeName(const std::string & name)53 std::string EncodeName(const std::string &name)
54 {
55     // name is guaranteed to be null-terminated
56     size_t nameLen = name.length();
57     nameLen = nameLen > KMAX_CODEC_BUF_SIZE ? KMAX_CODEC_BUF_SIZE : nameLen;
58     char *buf = AllocCodecBuf(nameLen);
59     if (buf == nullptr) {
60         return std::string(name);
61     }
62 
63     size_t pos = 0;
64     size_t i = 0;
65     std::string str(name);
66     std::u16string str16;
67     while (i < nameLen) {
68         unsigned char c = static_cast<unsigned char>(name[i]);
69         if (c == '_') {
70             buf[pos++] = '_';
71             buf[pos++] = '_';
72         } else if (c == '[') {
73             buf[pos++] = 'A';
74         } else if (isalnum(c)) {
75             buf[pos++] = static_cast<char>(c);
76         } else if (c <= 0x7F) {
77             // _XX: '_' followed by ascii code in hex
78             if (c == '.') {
79                 c = '/';  // use / in package name
80             }
81             buf[pos++] = '_';
82             unsigned char n = c >> 4;  // get the high 4 bit and calculate
83             buf[pos++] = GETHEXCHARU(n);
84             n = static_cast<unsigned char>(c - static_cast<unsigned char>(n << 4));  // revert the high 4 bit
85             buf[pos++] = GETHEXCHARU(n);
86         } else {
87             str16.clear();
88             // process one 16-bit char at a time
89             unsigned int n = UTF8ToUTF16(str16, str.substr(i), 1, false);
90             buf[pos++] = '_';
91             if ((n >> 16) == 1) {  // if n is 16-bit
92                 unsigned short m = str16[0];
93                 buf[pos++] = 'u';
94                 buf[pos++] = GETHEXCHAR((m & 0xF000) >> 12);
95                 buf[pos++] = GETHEXCHAR((m & 0x0F00) >> 8);
96                 buf[pos++] = GETHEXCHAR((m & 0x00F0) >> 4);
97                 buf[pos++] = GETHEXCHAR(m & 0x000F);
98             } else {
99                 unsigned short m = str16[0];
100                 buf[pos++] = 'U';
101                 buf[pos++] = GETHEXCHAR((m & 0xF000) >> 12);
102                 buf[pos++] = GETHEXCHAR((m & 0x0F00) >> 8);
103                 buf[pos++] = GETHEXCHAR((m & 0x00F0) >> 4);
104                 buf[pos++] = GETHEXCHAR(m & 0x000F);
105                 m = str16[1];
106                 buf[pos++] = GETHEXCHAR((m & 0xF000) >> 12);
107                 buf[pos++] = GETHEXCHAR((m & 0x0F00) >> 8);
108                 buf[pos++] = GETHEXCHAR((m & 0x00F0) >> 4);
109                 buf[pos++] = GETHEXCHAR(m & 0x000F);
110             }
111             i += static_cast<size_t>(int32_t(n & 0xFFFF) - 1);
112         }
113         i++;
114     }
115 
116     buf[pos] = '\0';
117     std::string newName = std::string(buf, pos);
118     FreeCodecBuf(buf);
119     return newName;
120 }
121 
// Compute the next value of the "currently inside a primitive type" flag used by DecodeName.
//   'L'                 -> false
//   ';', '(' or ')'     -> true, but only when splitNo > 1
//   anything else       -> primType unchanged
static inline bool UpdatePrimType(bool primType, int splitNo, uint32_t ch)
{
    switch (ch) {
        case 'L':
            return false;
        case ';':
        case '(':
        case ')':
            return (splitNo > 1) ? true : primType;
        default:
            return primType;
    }
}
134 
namespace {
// Value added to a hex letter's offset from 'a'/'A' (a -> 10, b -> 11, ...).
constexpr int kNumLimit = 10;
// Bit positions of the hex digits within a 16-bit value, most significant first.
constexpr int kCodeOffset3 = 12;
constexpr int kCodeOffset2 = 8;
constexpr int kCodeOffset = 4;
// 64 is 1 << 6
constexpr size_t k64BitShift = 6;
}  // namespace
142 
DecodeName(const std::string & name)143 std::string DecodeName(const std::string &name)
144 {
145     if (name.find(';') != std::string::npos) {  // no need Decoding a non-encoded string
146         return name;
147     }
148     std::string decompressedName;
149     const char *namePtr = nullptr;
150     size_t nameLen;
151     namePtr = name.c_str();
152     nameLen = name.length();
153 
154     // Demangled name is supposed to be shorter. No buffer overflow issue here.
155     std::string newName(nameLen, '\0');
156 
157     bool primType = true;
158     int splitNo = 0;  // split: class 0 | method 1 | signature 2
159     size_t pos = 0;
160     std::string str;
161     std::u16string str16;
162     for (size_t i = 0; i < nameLen;) {
163         unsigned char c = static_cast<unsigned char>(namePtr[i]);
164         ++i;
165         if (c == '_') {  // _XX: '_' followed by ascii code in hex
166             if (i >= nameLen) {
167                 break;
168             }
169             if (namePtr[i] == '_') {
170                 newName[pos++] = namePtr[i++];
171             } else if (namePtr[i] == 'u') {
172                 str.clear();
173                 str16.clear();
174                 i++;
175                 c = static_cast<unsigned char>(namePtr[i++]);
176                 uint8_t b1 = (c <= '9') ? c - '0' : c - 'a' + kNumLimit;
177                 c = static_cast<unsigned char>(namePtr[i++]);
178                 uint8_t b2 = (c <= '9') ? c - '0' : c - 'a' + kNumLimit;
179                 c = static_cast<unsigned char>(namePtr[i++]);
180                 uint8_t b3 = (c <= '9') ? c - '0' : c - 'a' + kNumLimit;
181                 c = static_cast<unsigned char>(namePtr[i++]);
182                 uint8_t b4 = (c <= '9') ? c - '0' : c - 'a' + kNumLimit;
183                 uint32_t codepoint = (b1 << kCodeOffset3) | (b2 << kCodeOffset2) | (b3 << kCodeOffset) | b4;
184                 str16 += static_cast<char16_t>(codepoint);
185                 unsigned int count = UTF16ToUTF8(str, str16, 1, false) >> 16; // shift 16 to get count
186                 if (count == 2) {  // the count of str equal 2 to 4, use array to save the utf8
187                     newName[pos++] = str[0];
188                     newName[pos++] = str[1];
189                 } else if (count == 3) {  // the count of str equal 2 to 4, deal 3 new
190                     newName[pos++] = str[0];
191                     newName[pos++] = str[1];
192                     newName[pos++] = str[2];  // 2 is index of third char
193                 } else if (count == 4) {      // the count of str equal 2 to 4
194                     newName[pos++] = str[0];
195                     newName[pos++] = str[1];
196                     newName[pos++] = str[2];  // 2 is index of third char
197                     newName[pos++] = str[3];  // 3 is index of fourth char
198                 }
199             } else {
200                 c = static_cast<unsigned char>(namePtr[i++]);
201                 unsigned int v = (c <= '9') ? c - '0' : c - 'A' + kNumLimit;
202                 unsigned int asc = v << kCodeOffset;
203                 if (i >= nameLen) {
204                     break;
205                 }
206                 c = static_cast<unsigned char>(namePtr[i++]);
207                 v = (c <= '9') ? c - '0' : c - 'A' + kNumLimit;
208                 asc += v;
209 
210                 newName[pos++] = static_cast<char>(asc);
211 
212                 if (asc == '|') {
213                     splitNo++;
214                 }
215 
216                 primType = UpdatePrimType(primType, splitNo, asc);
217             }
218         } else {
219             if (splitNo < 2) {  // split: class 0 | method 1 | signature 2
220                 newName[pos++] = static_cast<char>(c);
221                 continue;
222             }
223 
224             primType = UpdatePrimType(primType, splitNo, c);
225             if (primType) {
226                 newName[pos++] = (c == 'A') ? '[' : c;
227             } else {
228                 newName[pos++] = static_cast<char>(c);
229             }
230         }
231     }
232 
233     newName.resize(pos);
234     return newName;
235 }
236 
237 // input: maple name
238 // output: Lj/lang/Object;  [Lj/lang/Object;
DecodeMapleNameToJDescriptor(const std::string & nameIn,std::string & nameOut)239 void DecodeMapleNameToJDescriptor(const std::string &nameIn, std::string &nameOut)
240 {
241     nameOut = DecodeName(nameIn);
242     if (nameOut[0] == 'A') {
243         size_t i = 0;
244         while (nameOut[i] == 'A') {
245             nameOut[i++] = '[';
246         }
247     }
248 }
249 
ChangeEndian16(uint16_t u16)250 static uint16_t ChangeEndian16(uint16_t u16)
251 {
252     return ((u16 & 0xFF00) >> kCodeOffset2) | ((u16 & 0xFF) << kCodeOffset2);
253 }
254 
/* UTF8
 * U+0000 - U+007F   0xxxxxxx
 * U+0080 - U+07FF   110xxxxx 10xxxxxx
 * U+0800 - U+FFFF   1110xxxx 10xxxxxx 10xxxxxx
 * U+10000- U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 *
 * UTF16
 * U+0000 - U+D7FF   codePoint
 * U+E000 - U+FFFF   codePoint
 * U+10000- U+10FFFF XXXX YYYY
 *   code = codePoint - 0x010000, i.e. a 20-bit number in the range 0x000000..0x0FFFFF
 *   XXXX: top 10 bits of code + 0xD800: 0xD800..0xDBFF
 *   YYYY: low 10 bits of code + 0xDC00: 0xDC00..0xDFFF
 *
 * The converters below convert up to num elements and return two 16-bit values
 * packed into one integer: return_number_of_elements | consumed_input_number_of_elements
 */
constexpr int kCodepointOffset1 = 6;   // U+0080 - U+07FF   110xxxxx 10xxxxxx
constexpr int kCodepointOffset2 = 12;  // U+0800 - U+FFFF   1110xxxx 10xxxxxx 10xxxxxx
constexpr int kCodepointOffset3 = 18;  // U+10000- U+10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
constexpr int kCountOffset = 16;       // position of the returned-element count in the packed result
constexpr int kCodeAfterMinusOffset = 10;  // codePoint equals itself minus 0x10000
277 
UTF16ToUTF8(std::string & str,const std::u16string & str16,unsigned short num,bool isBigEndian)278 unsigned UTF16ToUTF8(std::string &str, const std::u16string &str16, unsigned short num, bool isBigEndian)
279 {
280     uint32_t codePoint = 0;
281     uint32_t i = 0;
282     unsigned short count = 0;
283     unsigned short retNum = 0;
284     while (i < str16.length()) {
285         if (isBigEndian || num == 1) {
286             codePoint = str16[i++];
287         } else {
288             codePoint = ChangeEndian16(str16[i++]);
289         }
290         if (codePoint > 0xFFFF) {
291             codePoint &= 0x3FF;
292             codePoint <<= kNumLimit;
293             if (isBigEndian) {
294                 codePoint += str16[i++] & 0x3FF;
295             } else {
296                 codePoint += ChangeEndian16(str16[i++]) & 0x3FF;
297             }
298         }
299         if (codePoint <= 0x7F) {
300             str += static_cast<char>(codePoint);
301             retNum += 1;  // 1 UTF8 char
302         } else if (codePoint <= 0x7FF) {
303             str += static_cast<char>(0xC0 + (codePoint >> kCodepointOffset1));
304             str += static_cast<char>(0x80 + (codePoint & 0x3F));
305             retNum += 2;  // 2 UTF8 chars
306         } else if (codePoint <= 0xFFFF) {
307             str += static_cast<char>(0xE0 + ((codePoint >> kCodepointOffset2) & 0xF));
308             str += static_cast<char>(0x80 + ((codePoint >> kCodepointOffset1) & 0x3F));
309             str += static_cast<char>(0x80 + (codePoint & 0x3F));
310             retNum += 3;  // 3 UTF8 chars
311         } else {
312             str += static_cast<char>(0xF0 + ((codePoint >> kCodepointOffset3) & 0x7));
313             str += static_cast<char>(0x80 + ((codePoint >> kCodepointOffset2) & 0x3F));
314             str += static_cast<char>(0x80 + ((codePoint >> kCodepointOffset1) & 0x3F));
315             str += static_cast<char>(0x80 + (codePoint & 0x3F));
316             retNum += 4;  // 4 UTF8 chars
317         }
318         count++;
319         if (num == count) {
320             return ((static_cast<unsigned>(retNum)) << kCountOffset) | static_cast<unsigned>(i);
321         }
322     }
323     return i;
324 }
325 
// Returns true when str8 contains at least one byte above 0x7F, i.e. it is not pure ASCII.
bool NeedConvertUTF16(const std::string &str8)
{
    constexpr uint8_t maxValidAscii = 0x7F;
    for (const char ch : str8) {
        if (static_cast<uint8_t>(ch) > maxValidAscii) {
            return true;
        }
    }
    return false;
}
340 
GetCodePoint(const std::string & str8,uint32_t & i)341 uint32_t GetCodePoint(const std::string &str8, uint32_t &i)
342 {
343     uint32_t b;
344     uint32_t c;
345     uint32_t d;
346     uint32_t codePoint = 0;
347     uint32_t a = static_cast<uint8_t>(str8[i++]);
348     if (a <= 0x7F) {  // 0...
349         codePoint = a;
350     } else if (a >= 0xF0) {  // 11110...
351         b = static_cast<uint32_t>(str8[i++]);
352         c = static_cast<uint32_t>(str8[i++]);
353         d = static_cast<uint32_t>(str8[i++]);
354         codePoint = ((a & 0x7) << kCodepointOffset3) | ((b & 0x3F) << kCodepointOffset2) |
355                     ((c & 0x3F) << kCodepointOffset1) | (d & 0x3F);
356     } else if (a >= 0xE0) {  // 1110...
357         b = static_cast<uint32_t>(str8[i++]);
358         c = static_cast<uint32_t>(str8[i++]);
359         codePoint = ((a & 0xF) << kCodepointOffset2) | ((b & 0x3F) << kCodepointOffset1) | (c & 0x3F);
360     } else if (a >= 0xC0) {  // 110...
361         b = static_cast<uint32_t>(str8[i++]);
362         codePoint = ((a & 0x1F) << kCodepointOffset1) | (b & 0x3F);
363     } else {
364         DEBUG_ASSERT(false && "invalid UTF-8");
365     }
366     return codePoint;
367 }
368 
369 // convert upto num UTF16 elements
370 // two 16-bit values: return_number_of_elements | consumed_input_number_of_elements
UTF8ToUTF16(std::u16string & str16,const std::string & str8,unsigned short num,bool isBigEndian)371 unsigned UTF8ToUTF16(std::u16string &str16, const std::string &str8, unsigned short num, bool isBigEndian)
372 {
373     uint32_t i = 0;
374     unsigned short count = 0;
375     unsigned short retNum = 0;
376     while (i < str8.length()) {
377         uint32_t codePoint = GetCodePoint(str8, i);
378         if (codePoint <= 0xFFFF) {
379             if (isBigEndian || num == 1) {
380                 str16 += static_cast<char16_t>(codePoint);
381             } else {
382                 str16 += static_cast<char16_t>(ChangeEndian16(static_cast<uint16_t>(codePoint)));
383             }
384             retNum += 1;  // 1 utf16
385         } else {
386             codePoint -= 0x10000;
387             if (isBigEndian || num == 1) {
388                 str16 += static_cast<char16_t>((codePoint >> kCodeAfterMinusOffset) | 0xD800);
389                 str16 += static_cast<char16_t>((codePoint & 0x3FF) | 0xDC00);
390             } else {
391                 str16 += static_cast<char16_t>(
392                     ChangeEndian16(static_cast<uint16_t>((codePoint >> kCodeAfterMinusOffset) | 0xD800)));
393                 str16 += static_cast<char16_t>(ChangeEndian16((codePoint & 0x3FF) | 0xDC00));
394             }
395             retNum += 2;  // 2 utf16
396         }
397         count++;
398         // only convert num elmements
399         if (num == count) {
400             return (static_cast<char16_t>(retNum) << kCountOffset) | static_cast<char16_t>(i);
401         }
402     }
403     return i;
404 }
405 
// Number of payload bits carried by each LEB128 byte.
const uint32_t kGreybackOffset = 7;

// Append the unsigned LEB128 encoding of value to dest, least-significant group first.
// Every byte except the last has its top bit (0x80) set.
void GetUnsignedLeb128Encode(std::vector<uint8_t> &dest, uint32_t value)
{
    while ((value >> kGreybackOffset) != 0) {
        dest.push_back(static_cast<uint8_t>((value & 0x7f) | 0x80));
        value >>= kGreybackOffset;
    }
    dest.push_back(static_cast<uint8_t>(value & 0x7f));
}
420 
GetUnsignedLeb128Decode(const uint8_t ** data)421 uint32_t GetUnsignedLeb128Decode(const uint8_t **data)
422 {
423     DEBUG_ASSERT(data != nullptr && "data in GetUnsignedLeb128Decode() is nullptr");
424     const uint8_t *ptr = *data;
425     uint32_t result = 0;
426     uint32_t shift = 0;
427     uint8_t byte = 0;
428     while (true) {
429         byte = *(ptr++);
430         result |= (byte & 0x7f) << shift;
431         if ((byte & 0x80) == 0) {
432             break;
433         }
434         shift += kGreybackOffset;
435     }
436     *data = ptr;
437     return result;
438 }
439 
GetLEB128Encode(int64_t val,bool isUnsigned)440 uint64_t GetLEB128Encode(int64_t val, bool isUnsigned)
441 {
442     uint64_t res = 0;
443     uint8_t byte = 0;
444     uint8_t count = 0;
445     bool done = false;
446     do {
447         byte = static_cast<uint64_t>(val) & 0x7f;
448         val >>= kGreybackOffset;  // intended signed shift: block codedex here
449         done = (isUnsigned ? val == 0 : (val == 0 || val == -1));
450         if (!done) {
451             byte |= 0x80;
452         }
453         res |= (static_cast<uint64_t>(byte) << (count++ << 3));  // each byte need 8 bit (left shift 3)
454     } while (!done);
455     return res;
456 }
457 
GetUleb128Encode(uint64_t val)458 uint64_t GetUleb128Encode(uint64_t val)
459 {
460     return GetLEB128Encode(int64_t(val), true);
461 }
462 
GetSleb128Encode(int64_t val)463 uint64_t GetSleb128Encode(int64_t val)
464 {
465     return GetLEB128Encode(val, false);
466 }
467 
// Pass-through: returns val unchanged; no LEB128 decoding is performed here.
uint64_t GetUleb128Decode(uint64_t val)
{
    const uint64_t unchanged = val;
    return unchanged;
}
472 
// Pass-through: returns val converted to int64_t; no LEB128 decoding is performed here.
int64_t GetSleb128Decode(uint64_t val)
{
    const auto asSigned = static_cast<int64_t>(val);
    return asSigned;
}
477 
// Number of bytes in the unsigned LEB128 encoding of v.
// Zero is handled explicitly: it encodes as a single byte, and __builtin_clzll(0) is undefined.
size_t GetUleb128Size(uint64_t v)
{
    if (v == 0) {
        return 1;
    }
    const size_t clz = static_cast<size_t>(__builtin_clzll(v));
    // num of 7-bit groups, (64 - clz + 6) / 7
    return size_t((64 - clz + 6) / 7);
}
485 
GetSleb128Size(int32_t v)486 size_t GetSleb128Size(int32_t v)
487 {
488     size_t size = 0;
489     int rem = v >> kGreybackOffset;
490     bool hasMore = true;
491     int end = ((v >= 0) ? 0 : -1);
492 
493     while (hasMore) {
494         hasMore = (rem != end) || ((rem & 1) != ((v >> k64BitShift) & 1));  // judege whether has More valid rem
495         size++;
496         v = rem;
497         rem >>= static_cast<int>(kGreybackOffset);  // intended signed shift: block codedex here
498     }
499     return size;
500 }
501 }  // namespace namemangler
502