1 /*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "descriptors_names.h"
18
19 #include <algorithm>
20
21 #include "android-base/stringprintf.h"
22 #include "android-base/strings.h"
23
24 #include "base/macros.h"
25 #include "dex/utf-inl.h"
26
27 namespace art {
28
29 using android::base::StringAppendF;
30
// Appends the human-readable form of the type descriptor `descriptor` to `*result`.
//
// Leading '['s are turned into trailing "[]" pairs, "Lfully/qualified/Type;" becomes
// "fully.qualified.Type", and the single-character primitive descriptors (including 'V')
// are spelled out, e.g. "[[La/b/C;" -> "a.b.C[][]" and "[[B" -> "byte[][]".
// If the element type is neither a reference nor a known primitive, `descriptor` is
// appended verbatim (brackets included) and nothing else is added.
void AppendPrettyDescriptor(const char* descriptor, std::string* result) {
  // Count the number of '['s to get the dimensionality.
  const char* c = descriptor;
  size_t dim = 0;
  while (*c == '[') {
    dim++;
    c++;
  }

  // Reference or primitive?
  bool primitive = false;
  if (*c == 'L') {
    // "[[La/b/C;" -> "a.b.C[][]".
    c++;  // Skip the 'L'.
  } else {
    primitive = true;
    // "[[B" -> "byte[][]".
    switch (*c) {
      case 'B':
        c = "byte";
        break;
      case 'C':
        c = "char";
        break;
      case 'D':
        c = "double";
        break;
      case 'F':
        c = "float";
        break;
      case 'I':
        c = "int";
        break;
      case 'J':
        c = "long";
        break;
      case 'S':
        c = "short";
        break;
      case 'Z':
        c = "boolean";
        break;
      case 'V':
        c = "void";
        break;  // Used when decoding return types.
      default:
        // Not something we know how to prettify; hand back the input untouched.
        result->append(descriptor);
        return;
    }
  }

  // At this point, 'c' is a string of the form "fully/qualified/Type;" or
  // "primitive". In the former case, rewrite the type with '.' instead of '/':
  std::string temp(c);
  if (!primitive) {
    std::replace(temp.begin(), temp.end(), '/', '.');
    // ...and remove the semicolon. A truncated descriptor such as "L" or "[L" leaves
    // `temp` empty, and back() on an empty string is undefined, so check for that first.
    if (!temp.empty() && temp.back() == ';') {
      temp.pop_back();
    }
  }
  result->append(temp);

  // Finally, add 'dim' "[]" pairs:
  for (size_t i = 0; i < dim; ++i) {
    result->append("[]");
  }
}
97
PrettyDescriptor(const char * descriptor)98 std::string PrettyDescriptor(const char* descriptor) {
99 std::string result;
100 AppendPrettyDescriptor(descriptor, &result);
101 return result;
102 }
103
// Converts a pretty type name back into a descriptor, e.g. "int[][]" -> "[[I" and
// "java.lang.String[]" -> "[Ljava/lang/String;".
//
// Every occurrence of "[]" in the input contributes one leading '['; the text before the
// first "[]" (or the whole input when there is none) is taken as the element type. Primitive
// names map to their one-character descriptor; anything else is treated as a class name and
// wrapped in 'L' ... ';' with '.' replaced by '/'.
std::string InversePrettyDescriptor(const std::string& pretty_descriptor) {
  static constexpr char kArraySuffix[] = "[]";
  static constexpr size_t kArraySuffixLength = sizeof(kArraySuffix) - 1;

  struct PrimitiveName {
    const char* name;
    char descriptor;
  };
  static constexpr PrimitiveName kPrimitiveNames[] = {
      {"byte", 'B'},
      {"char", 'C'},
      {"double", 'D'},
      {"float", 'F'},
      {"int", 'I'},
      {"long", 'J'},
      {"short", 'S'},
      {"boolean", 'Z'},
      {"void", 'V'},
  };

  // The element type ends where the first "[]" starts; npos means "no array suffix".
  const size_t first_suffix = pretty_descriptor.find(kArraySuffix);

  // Count the (non-overlapping) "[]" occurrences to get the dimensionality.
  size_t dimensions = 0;
  for (size_t at = first_suffix;
       at != std::string::npos;
       at = pretty_descriptor.find(kArraySuffix, at + kArraySuffixLength)) {
    ++dimensions;
  }

  // Start the descriptor with one '[' per dimension.
  std::string descriptor(dimensions, '[');

  // element_name is now in the form of "some.pretty.Type" or "primitive".
  std::string element_name = pretty_descriptor.substr(0, first_suffix);
  for (const PrimitiveName& primitive : kPrimitiveNames) {
    if (element_name == primitive.name) {
      descriptor += primitive.descriptor;
      return descriptor;
    }
  }

  // Not a primitive: emit a reference descriptor.
  std::replace(element_name.begin(), element_name.end(), '.', '/');
  descriptor += 'L';
  descriptor += element_name;
  descriptor += ';';
  return descriptor;
}
153
GetJniShortName(const std::string & class_descriptor,const std::string & method)154 std::string GetJniShortName(const std::string& class_descriptor, const std::string& method) {
155 // Remove the leading 'L' and trailing ';'...
156 std::string class_name(class_descriptor);
157 CHECK_EQ(class_name[0], 'L') << class_name;
158 CHECK_EQ(class_name[class_name.size() - 1], ';') << class_name;
159 class_name.erase(0, 1);
160 class_name.erase(class_name.size() - 1, 1);
161
162 std::string short_name;
163 short_name += "Java_";
164 short_name += MangleForJni(class_name);
165 short_name += "_";
166 short_name += MangleForJni(method);
167 return short_name;
168 }
169
170 // See http://java.sun.com/j2se/1.5.0/docs/guide/jni/spec/design.html#wp615 for the full rules.
MangleForJni(const std::string & s)171 std::string MangleForJni(const std::string& s) {
172 std::string result;
173 size_t char_count = CountModifiedUtf8Chars(s.c_str());
174 const char* cp = &s[0];
175 for (size_t i = 0; i < char_count; ++i) {
176 uint32_t ch = GetUtf16FromUtf8(&cp);
177 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')) {
178 result.push_back(ch);
179 } else if (ch == '.' || ch == '/') {
180 result += "_";
181 } else if (ch == '_') {
182 result += "_1";
183 } else if (ch == ';') {
184 result += "_2";
185 } else if (ch == '[') {
186 result += "_3";
187 } else {
188 const uint16_t leading = GetLeadingUtf16Char(ch);
189 const uint32_t trailing = GetTrailingUtf16Char(ch);
190
191 StringAppendF(&result, "_0%04x", leading);
192 if (trailing != 0) {
193 StringAppendF(&result, "_0%04x", trailing);
194 }
195 }
196 }
197 return result;
198 }
199
// Converts a dotted class name into a descriptor: "java.lang.String" becomes
// "Ljava/lang/String;". Names that already start with '[' only get their dots replaced,
// and the empty string is returned unchanged.
std::string DotToDescriptor(const char* class_name) {
  std::string slashed(class_name);
  std::replace(slashed.begin(), slashed.end(), '.', '/');

  // Empty input and array names are not wrapped in 'L' ... ';'.
  if (slashed.empty() || slashed[0] == '[') {
    return slashed;
  }

  std::string wrapped;
  wrapped.reserve(slashed.size() + 2);
  wrapped += 'L';
  wrapped += slashed;
  wrapped += ';';
  return wrapped;
}
208
// Converts a descriptor into dotted form. "Ljava/lang/String;" becomes "java.lang.String";
// any other descriptor longer than one character (e.g. an array) keeps its 'L' and ';' and
// only has '/' replaced by '.'. Descriptors of length zero or one are returned as-is.
std::string DescriptorToDot(const char* descriptor) {
  const size_t len = strlen(descriptor);
  if (len <= 1) {
    // Do nothing for non-class/array descriptors.
    return descriptor;
  }

  const bool is_class = (descriptor[0] == 'L') && (descriptor[len - 1] == ';');
  // Class descriptors have the leading 'L' and trailing ';' stripped; for arrays they
  // remain intact.
  std::string dotted = is_class ? std::string(descriptor + 1, len - 2)
                                : std::string(descriptor, len);
  std::replace(dotted.begin(), dotted.end(), '/', '.');
  return dotted;
}
227
// Strips the leading 'L' and trailing ';' from a class descriptor, leaving the '/'
// separators alone: "Ljava/lang/String;" -> "java/lang/String". Any other input is
// returned unchanged.
std::string DescriptorToName(const char* descriptor) {
  const size_t len = strlen(descriptor);
  // The first test fails for the empty string, so descriptor[len - 1] is only evaluated
  // when len >= 1.
  const bool is_class = (descriptor[0] == 'L') && (descriptor[len - 1] == ';');
  if (!is_class) {
    return descriptor;
  }
  return std::string(descriptor + 1, len - 2);
}
236
// Helper for IsValidPartOfMemberNameUtf8(), a bit vector indicating valid low ascii.
// Each word covers 32 consecutive characters: for a 7-bit character `c`, the word index is
// `c >> 5` and the bit within that word is `c & 0x1f`. A set bit means the character is
// accepted as part of a member name.
static constexpr uint32_t DEX_MEMBER_VALID_LOW_ASCII[4] = {
  0x00000000,  // 00..1f low control characters; nothing valid
  0x03ff2011,  // 20..3f space, digits and symbols; valid: ' ', '0'..'9', '$', '-'
  0x87fffffe,  // 40..5f uppercase etc.; valid: 'A'..'Z', '_'
  0x07fffffe   // 60..7f lowercase etc.; valid: 'a'..'z'
};
244
245 // Helper for IsValidPartOfMemberNameUtf8(); do not call directly.
246 COLD_ATTR
IsValidPartOfMemberNameUtf8Slow(const char ** pUtf8Ptr)247 static bool IsValidPartOfMemberNameUtf8Slow(const char** pUtf8Ptr) {
248 /*
249 * It's a multibyte encoded character. Decode it and analyze. We
250 * accept anything that isn't:
251 * - an improperly encoded low value
252 * - an improper surrogate pair
253 * - an encoded '\0'
254 * - a C1 control character U+0080..U+009f
255 * - a format character U+200b..U+200f, U+2028..U+202e
256 * - a special character U+fff0..U+ffff
257 * Prior to DEX format version 040, we also excluded some of the Unicode
258 * space characters:
259 * - U+00a0, U+2000..U+200a, U+202f
260 * This is all specified in the dex format document.
261 */
262
263 const uint32_t pair = GetUtf16FromUtf8(pUtf8Ptr);
264 const uint16_t leading = GetLeadingUtf16Char(pair);
265
266 // We have a surrogate pair resulting from a valid 4 byte UTF sequence.
267 // No further checks are necessary because 4 byte sequences span code
268 // points [U+10000, U+1FFFFF], which are valid codepoints in a dex
269 // identifier. Furthermore, GetUtf16FromUtf8 guarantees that each of
270 // the surrogate halves are valid and well formed in this instance.
271 if (GetTrailingUtf16Char(pair) != 0) {
272 return true;
273 }
274
275
276 // We've encountered a one, two or three byte UTF-8 sequence. The
277 // three byte UTF-8 sequence could be one half of a surrogate pair.
278 switch (leading >> 8) {
279 case 0x00:
280 // It's in the range that has C1 control characters.
281 return (leading >= 0x00a0);
282 case 0xd8:
283 case 0xd9:
284 case 0xda:
285 case 0xdb:
286 {
287 // We found a three byte sequence encoding one half of a surrogate.
288 // Look for the other half.
289 const uint32_t pair2 = GetUtf16FromUtf8(pUtf8Ptr);
290 const uint16_t trailing = GetLeadingUtf16Char(pair2);
291
292 return (GetTrailingUtf16Char(pair2) == 0) && (0xdc00 <= trailing && trailing <= 0xdfff);
293 }
294 case 0xdc:
295 case 0xdd:
296 case 0xde:
297 case 0xdf:
298 // It's a trailing surrogate, which is not valid at this point.
299 return false;
300 case 0x20:
301 case 0xff:
302 // It's in the range that has format characters and specials.
303 switch (leading & 0xfff8) {
304 case 0x2008:
305 return (leading <= 0x200a);
306 case 0x2028:
307 return (leading == 0x202f);
308 case 0xfff0:
309 case 0xfff8:
310 return false;
311 }
312 return true;
313 default:
314 return true;
315 }
316
317 UNREACHABLE();
318 }
319
320 /* Return whether the pointed-at modified-UTF-8 encoded character is
321 * valid as part of a member name, updating the pointer to point past
322 * the consumed character. This will consume two encoded UTF-16 code
323 * points if the character is encoded as a surrogate pair. Also, if
324 * this function returns false, then the given pointer may only have
325 * been partially advanced.
326 */
327 ALWAYS_INLINE
IsValidPartOfMemberNameUtf8(const char ** pUtf8Ptr)328 static bool IsValidPartOfMemberNameUtf8(const char** pUtf8Ptr) {
329 uint8_t c = (uint8_t) **pUtf8Ptr;
330 if (LIKELY(c <= 0x7f)) {
331 // It's low-ascii, so check the table.
332 uint32_t wordIdx = c >> 5;
333 uint32_t bitIdx = c & 0x1f;
334 (*pUtf8Ptr)++;
335 return (DEX_MEMBER_VALID_LOW_ASCII[wordIdx] & (1 << bitIdx)) != 0;
336 }
337
338 // It's a multibyte encoded character. Call a non-inline function
339 // for the heavy lifting.
340 return IsValidPartOfMemberNameUtf8Slow(pUtf8Ptr);
341 }
342
IsValidMemberName(const char * s)343 bool IsValidMemberName(const char* s) {
344 bool angle_name = false;
345
346 switch (*s) {
347 case '\0':
348 // The empty string is not a valid name.
349 return false;
350 case '<':
351 angle_name = true;
352 s++;
353 break;
354 }
355
356 while (true) {
357 switch (*s) {
358 case '\0':
359 return !angle_name;
360 case '>':
361 return angle_name && s[1] == '\0';
362 }
363
364 if (!IsValidPartOfMemberNameUtf8(&s)) {
365 return false;
366 }
367 }
368 }
369
370 enum ClassNameType { kName, kDescriptor };
371 template<ClassNameType kType, char kSeparator>
IsValidClassName(const char * s)372 static bool IsValidClassName(const char* s) {
373 int arrayCount = 0;
374 while (*s == '[') {
375 arrayCount++;
376 s++;
377 }
378
379 if (arrayCount > 255) {
380 // Arrays may have no more than 255 dimensions.
381 return false;
382 }
383
384 ClassNameType type = kType;
385 if (type != kDescriptor && arrayCount != 0) {
386 /*
387 * If we're looking at an array of some sort, then it doesn't
388 * matter if what is being asked for is a class name; the
389 * format looks the same as a type descriptor in that case, so
390 * treat it as such.
391 */
392 type = kDescriptor;
393 }
394
395 if (type == kDescriptor) {
396 /*
397 * We are looking for a descriptor. Either validate it as a
398 * single-character primitive type, or continue on to check the
399 * embedded class name (bracketed by "L" and ";").
400 */
401 switch (*(s++)) {
402 case 'B':
403 case 'C':
404 case 'D':
405 case 'F':
406 case 'I':
407 case 'J':
408 case 'S':
409 case 'Z':
410 // These are all single-character descriptors for primitive types.
411 return (*s == '\0');
412 case 'V':
413 // Non-array void is valid, but you can't have an array of void.
414 return (arrayCount == 0) && (*s == '\0');
415 case 'L':
416 // Class name: Break out and continue below.
417 break;
418 default:
419 // Oddball descriptor character.
420 return false;
421 }
422 }
423
424 /*
425 * We just consumed the 'L' that introduces a class name as part
426 * of a type descriptor, or we are looking for an unadorned class
427 * name.
428 */
429
430 bool sepOrFirst = true; // first character or just encountered a separator.
431 for (;;) {
432 uint8_t c = (uint8_t) *s;
433 switch (c) {
434 case '\0':
435 /*
436 * Premature end for a type descriptor, but valid for
437 * a class name as long as we haven't encountered an
438 * empty component (including the degenerate case of
439 * the empty string "").
440 */
441 return (type == kName) && !sepOrFirst;
442 case ';':
443 /*
444 * Invalid character for a class name, but the
445 * legitimate end of a type descriptor. In the latter
446 * case, make sure that this is the end of the string
447 * and that it doesn't end with an empty component
448 * (including the degenerate case of "L;").
449 */
450 return (type == kDescriptor) && !sepOrFirst && (s[1] == '\0');
451 case '/':
452 case '.':
453 if (c != kSeparator) {
454 // The wrong separator character.
455 return false;
456 }
457 if (sepOrFirst) {
458 // Separator at start or two separators in a row.
459 return false;
460 }
461 sepOrFirst = true;
462 s++;
463 break;
464 default:
465 if (!IsValidPartOfMemberNameUtf8(&s)) {
466 return false;
467 }
468 sepOrFirst = false;
469 break;
470 }
471 }
472 }
473
IsValidBinaryClassName(const char * s)474 bool IsValidBinaryClassName(const char* s) {
475 return IsValidClassName<kName, '.'>(s);
476 }
477
IsValidJniClassName(const char * s)478 bool IsValidJniClassName(const char* s) {
479 return IsValidClassName<kName, '/'>(s);
480 }
481
IsValidDescriptor(const char * s)482 bool IsValidDescriptor(const char* s) {
483 return IsValidClassName<kDescriptor, '/'>(s);
484 }
485
PrettyDescriptor(Primitive::Type type)486 std::string PrettyDescriptor(Primitive::Type type) {
487 return PrettyDescriptor(Primitive::Descriptor(type));
488 }
489
490 } // namespace art
491