1 /*
2 * Copyright (C) 2011 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 /*
18 * Validate and manipulate MUTF-8 (modified UTF-8) encoded string data.
19 */
20
21 #ifndef LIBDEX_DEXUTF_H_
22 #define LIBDEX_DEXUTF_H_
23
24 #include "DexFile.h"
25
26 /*
27 * Retrieve the next UTF-16 character from a UTF-8 string.
28 *
29 * Advances "*pUtf8Ptr" to the start of the next character.
30 *
31 * WARNING: If a string is corrupted by dropping a '\0' in the middle
32 * of a 3-byte sequence, you can end up overrunning the buffer with
33 * reads (and possibly with the writes if the length was computed and
34 * cached before the damage). For performance reasons, this function
35 * assumes that the string being parsed is known to be valid (e.g., by
36 * already being verified). Most strings we process here are coming
37 * out of dex files or other internal translations, so the only real
38 * risk comes from the JNI NewStringUTF call.
39 */
dexGetUtf16FromUtf8(const char ** pUtf8Ptr)40 DEX_INLINE u2 dexGetUtf16FromUtf8(const char** pUtf8Ptr)
41 {
42 unsigned int one, two, three;
43
44 one = *(*pUtf8Ptr)++;
45 if ((one & 0x80) != 0) {
46 /* two- or three-byte encoding */
47 two = *(*pUtf8Ptr)++;
48 if ((one & 0x20) != 0) {
49 /* three-byte encoding */
50 three = *(*pUtf8Ptr)++;
51 return ((one & 0x0f) << 12) |
52 ((two & 0x3f) << 6) |
53 (three & 0x3f);
54 } else {
55 /* two-byte encoding */
56 return ((one & 0x1f) << 6) |
57 (two & 0x3f);
58 }
59 } else {
60 /* one-byte encoding */
61 return one;
62 }
63 }
64
65 /* Compare two '\0'-terminated modified UTF-8 strings, using Unicode
 * code point values for comparison. Different encodings of the same
 * code point are treated as equivalent, except that only a real '\0'
 * byte is considered the string terminator.
 *
 * s1, s2: the '\0'-terminated modified UTF-8 strings to compare.
 *
 * The return value is as for strcmp(): negative if s1 orders before s2,
 * zero if the two compare equal, and positive if s1 orders after s2. */
70 int dexUtf8Cmp(const char* s1, const char* s2);
71
72 /* For dexIsValidMemberNameUtf8(): a bit vector indicating which low-ASCII
 * characters (0x00..0x7f) are valid. dexIsValidMemberNameUtf8() selects
 * the word with (c >> 5) and tests bit (c & 0x1f) of that word, so the
 * four entries cover 32 characters each; a set bit means character c is
 * accepted. */
73 extern u4 DEX_MEMBER_VALID_LOW_ASCII[4];
74
75 /* Helper for dexIsValidMemberNameUtf8(); do not call directly.
 * dexIsValidMemberNameUtf8() handles lead bytes <= 0x7f inline and
 * forwards everything else (multibyte encoded characters) here, passing
 * pUtf8Ptr through without having advanced it. */
76 bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr);
77
78 /* Return whether the pointed-at modified-UTF-8 encoded character is
79 * valid as part of a member name, updating the pointer to point past
80 * the consumed character. This will consume two encoded UTF-16 code
81 * points if the character is encoded as a surrogate pair. Also, if
82 * this function returns false, then the given pointer may only have
83 * been partially advanced. */
dexIsValidMemberNameUtf8(const char ** pUtf8Ptr)84 DEX_INLINE bool dexIsValidMemberNameUtf8(const char** pUtf8Ptr) {
85 u1 c = (u1) **pUtf8Ptr;
86 if (c <= 0x7f) {
87 // It's low-ascii, so check the table.
88 u4 wordIdx = c >> 5;
89 u4 bitIdx = c & 0x1f;
90 (*pUtf8Ptr)++;
91 return (DEX_MEMBER_VALID_LOW_ASCII[wordIdx] & (1 << bitIdx)) != 0;
92 }
93
94 /*
95 * It's a multibyte encoded character. Call a non-inline function
96 * for the heavy lifting.
97 */
98 return dexIsValidMemberNameUtf8_0(pUtf8Ptr);
99 }
100
101 /* Return whether the given string is a valid field or method name.
 *
 * NOTE(review): s is presumably a '\0'-terminated modified UTF-8 string,
 * like the strings taken by dexUtf8Cmp() -- confirm against the
 * implementation. */
102 bool dexIsValidMemberName(const char* s);
103
104 /* Return whether the given string is a valid type descriptor.
 *
 * The narrower checks dexIsReferenceDescriptor(), dexIsClassDescriptor(),
 * and dexIsFieldDescriptor() are each documented as this check plus a
 * restriction on the kind of type the descriptor names. */
105 bool dexIsValidTypeDescriptor(const char* s);
106
107 /* Return whether the given string is a valid internal-form class
 * name, with components separated either by dots or slashes as
 * specified. A class name is like a type descriptor, except that it
 * can't name a primitive type (including void). In terms of syntax,
 * the form is either:
 *   (a) the name of the class without adornment (that is, not bracketed
 *       by "L" and ";"); or
 *   (b) identical to the type descriptor syntax for array types.
 *
 * NOTE(review): dotSeparator presumably selects dots when true and
 * slashes when false -- confirm against the implementation. */
114 bool dexIsValidClassName(const char* s, bool dotSeparator);
115
116 /* Return whether the given string is a valid reference descriptor. This
 * is true if dexIsValidTypeDescriptor() returns true and the descriptor
 * is for a class or array and not a primitive type.
 *
 * Compare dexIsClassDescriptor(), which additionally rejects array
 * descriptors. */
119 bool dexIsReferenceDescriptor(const char* s);
120
121 /* Return whether the given string is a valid class descriptor. This
 * is true if dexIsValidTypeDescriptor() returns true and the descriptor
 * is for a class and not an array or primitive type.
 *
 * Compare dexIsReferenceDescriptor(), which also accepts array
 * descriptors. */
124 bool dexIsClassDescriptor(const char* s);
125
126 /* Return whether the given string is a valid field type descriptor. This
 * is true if dexIsValidTypeDescriptor() returns true and the descriptor
 * is for anything but "void".
 *
 * Unlike dexIsReferenceDescriptor(), descriptors for primitive types
 * other than void are accepted here. */
129 bool dexIsFieldDescriptor(const char* s);
130
131 #endif // LIBDEX_DEXUTF_H_
132