/*
 * Copyright (C) 2011 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * Validate and manipulate MUTF-8 encoded string data.
 */
20  
21  #include "DexUtf.h"
22  
/* Compare two '\0'-terminated modified UTF-8 strings element by element,
 * where each element is the value decoded by dexGetUtf16FromUtf8().
 * Because decoded values are compared, different byte encodings of the
 * same value compare equal; only a real '\0' byte ends a string.
 *
 * Returns 0 if the strings are equal, a negative value if s1 sorts
 * before s2 (including when s1 is a proper prefix of s2), and a
 * positive value otherwise, in the manner of strcmp(). */
int dexUtf8Cmp(const char* s1, const char* s2) {
    /* Walk both strings in lock step while neither has ended. */
    while ((*s1 != '\0') && (*s2 != '\0')) {
        int left = dexGetUtf16FromUtf8(&s1);
        int right = dexGetUtf16FromUtf8(&s2);

        if (left != right) {
            return left - right;
        }
    }

    /* At least one string is exhausted; the shorter one sorts first. */
    if (*s1 == '\0') {
        return (*s2 == '\0') ? 0 : -1;
    }

    return 1;
}
48  
49  /* for dexIsValidMemberNameUtf8(), a bit vector indicating valid low ascii */
50  u4 DEX_MEMBER_VALID_LOW_ASCII[4] = {
51      0x00000000, // 00..1f low control characters; nothing valid
52      0x03ff2010, // 20..3f digits and symbols; valid: '0'..'9', '$', '-'
53      0x87fffffe, // 40..5f uppercase etc.; valid: 'A'..'Z', '_'
54      0x07fffffe  // 60..7f lowercase etc.; valid: 'a'..'z'
55  };
56  
57  /* Helper for dexIsValidMemberNameUtf8(); do not call directly. */
dexIsValidMemberNameUtf8_0(const char ** pUtf8Ptr)58  bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr) {
59      /*
60       * It's a multibyte encoded character. Decode it and analyze. We
61       * accept anything that isn't (a) an improperly encoded low value,
62       * (b) an improper surrogate pair, (c) an encoded '\0', (d) a high
63       * control character, or (e) a high space, layout, or special
64       * character (U+00a0, U+2000..U+200f, U+2028..U+202f,
65       * U+fff0..U+ffff). This is all specified in the dex format
66       * document.
67       */
68  
69      u2 utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
70  
71      // Perform follow-up tests based on the high 8 bits.
72      switch (utf16 >> 8) {
73          case 0x00: {
74              // It's only valid if it's above the ISO-8859-1 high space (0xa0).
75              return (utf16 > 0x00a0);
76          }
77          case 0xd8:
78          case 0xd9:
79          case 0xda:
80          case 0xdb: {
81              /*
82               * It's a leading surrogate. Check to see that a trailing
83               * surrogate follows.
84               */
85              utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
86              return (utf16 >= 0xdc00) && (utf16 <= 0xdfff);
87          }
88          case 0xdc:
89          case 0xdd:
90          case 0xde:
91          case 0xdf: {
92              // It's a trailing surrogate, which is not valid at this point.
93              return false;
94          }
95          case 0x20:
96          case 0xff: {
97              // It's in the range that has spaces, controls, and specials.
98              switch (utf16 & 0xfff8) {
99                  case 0x2000:
100                  case 0x2008:
101                  case 0x2028:
102                  case 0xfff0:
103                  case 0xfff8: {
104                      return false;
105                  }
106              }
107              break;
108          }
109      }
110  
111      return true;
112  }
113  
/* Return whether the given string is a valid field or method name.
 *
 * A name is either a non-empty run of characters accepted by
 * dexIsValidMemberNameUtf8(), or such a run enclosed in exactly one
 * leading '<' and one trailing '>' (such as "<init>"). The empty string
 * and the bracket-only forms "<" and "<>" are rejected, since the dex
 * format requires at least one name character in both forms.
 *
 * s must point to a '\0'-terminated MUTF-8 string. */
bool dexIsValidMemberName(const char* s) {
    bool angleName = false;

    if (*s == '<') {
        /*
         * '<' is allowed only at the start of a name, and if present,
         * means that the name must end with '>'.
         */
        angleName = true;
        s++;
    }

    if ((*s == '\0') || (*s == '>')) {
        /*
         * There are no name characters at all: the input is "", "<",
         * or starts with ">" / "<>". None of those is a valid name.
         */
        return false;
    }

    for (;;) {
        if (*s == '\0') {
            // End of string: valid unless a closing '>' is still owed.
            return !angleName;
        }

        if (*s == '>') {
            // '>' may only appear as the very last character of a
            // name that started with '<'.
            return angleName && (s[1] == '\0');
        }

        // Validates one character; relied upon to advance s past it.
        if (!dexIsValidMemberNameUtf8(&s)) {
            return false;
        }
    }
}
148  
149  /* Helper for validating type descriptors and class names, which is parametric
150   * with respect to type vs. class and dot vs. slash. */
isValidTypeDescriptorOrClassName(const char * s,bool isClassName,bool dotSeparator)151  static bool isValidTypeDescriptorOrClassName(const char* s, bool isClassName,
152          bool dotSeparator) {
153      int arrayCount = 0;
154  
155      while (*s == '[') {
156          arrayCount++;
157          s++;
158      }
159  
160      if (arrayCount > 255) {
161          // Arrays may have no more than 255 dimensions.
162          return false;
163      }
164  
165      if (arrayCount != 0) {
166          /*
167           * If we're looking at an array of some sort, then it doesn't
168           * matter if what is being asked for is a class name; the
169           * format looks the same as a type descriptor in that case, so
170           * treat it as such.
171           */
172          isClassName = false;
173      }
174  
175      if (!isClassName) {
176          /*
177           * We are looking for a descriptor. Either validate it as a
178           * single-character primitive type, or continue on to check the
179           * embedded class name (bracketed by "L" and ";").
180           */
181          switch (*(s++)) {
182              case 'B':
183              case 'C':
184              case 'D':
185              case 'F':
186              case 'I':
187              case 'J':
188              case 'S':
189              case 'Z': {
190                  // These are all single-character descriptors for primitive types.
191                  return (*s == '\0');
192              }
193              case 'V': {
194                  // Non-array void is valid, but you can't have an array of void.
195                  return (arrayCount == 0) && (*s == '\0');
196              }
197              case 'L': {
198                  // Class name: Break out and continue below.
199                  break;
200              }
201              default: {
202                  // Oddball descriptor character.
203                  return false;
204              }
205          }
206      }
207  
208      /*
209       * We just consumed the 'L' that introduces a class name as part
210       * of a type descriptor, or we are looking for an unadorned class
211       * name.
212       */
213  
214      bool sepOrFirst = true; // first character or just encountered a separator.
215      for (;;) {
216          u1 c = (u1) *s;
217          switch (c) {
218              case '\0': {
219                  /*
220                   * Premature end for a type descriptor, but valid for
221                   * a class name as long as we haven't encountered an
222                   * empty component (including the degenerate case of
223                   * the empty string "").
224                   */
225                  return isClassName && !sepOrFirst;
226              }
227              case ';': {
228                  /*
229                   * Invalid character for a class name, but the
230                   * legitimate end of a type descriptor. In the latter
231                   * case, make sure that this is the end of the string
232                   * and that it doesn't end with an empty component
233                   * (including the degenerate case of "L;").
234                   */
235                  return !isClassName && !sepOrFirst && (s[1] == '\0');
236              }
237              case '/':
238              case '.': {
239                  if (dotSeparator != (c == '.')) {
240                      // The wrong separator character.
241                      return false;
242                  }
243                  if (sepOrFirst) {
244                      // Separator at start or two separators in a row.
245                      return false;
246                  }
247                  sepOrFirst = true;
248                  s++;
249                  break;
250              }
251              default: {
252                  if (!dexIsValidMemberNameUtf8(&s)) {
253                      return false;
254                  }
255                  sepOrFirst = false;
256                  break;
257              }
258          }
259      }
260  }
261  
/* Return whether the given string is a valid type descriptor.
 * Descriptors are checked with '/' as the package separator. */
bool dexIsValidTypeDescriptor(const char* s) {
    const bool isClassName = false;
    const bool dotSeparator = false;

    return isValidTypeDescriptorOrClassName(s, isClassName, dotSeparator);
}
266  
/* (documented in header)
 * Validates s as a class name, using '.' as the component separator when
 * dotSeparator is true and '/' otherwise. */
bool dexIsValidClassName(const char* s, bool dotSeparator) {
    const bool isClassName = true;

    return isValidTypeDescriptorOrClassName(s, isClassName, dotSeparator);
}
271  
/* Return whether the given string is a valid reference descriptor, i.e.
 * dexIsValidTypeDescriptor() accepts it and it denotes a class ('L...')
 * or an array ('[...') rather than a primitive type. */
bool dexIsReferenceDescriptor(const char* s) {
    // Validate first; the leading character then tells the kind apart.
    return dexIsValidTypeDescriptor(s) && ((s[0] == 'L') || (s[0] == '['));
}
282  
/* Return whether the given string is a valid class descriptor, i.e.
 * dexIsValidTypeDescriptor() accepts it and it denotes a class ('L...')
 * rather than an array or a primitive type. */
bool dexIsClassDescriptor(const char* s) {
    // Validate first; the leading character then tells the kind apart.
    return dexIsValidTypeDescriptor(s) && (s[0] == 'L');
}
293  
/* Return whether the given string is a valid field type descriptor, i.e.
 * dexIsValidTypeDescriptor() accepts it and it is anything other than
 * "void" (a field cannot have type void). */
bool dexIsFieldDescriptor(const char* s) {
    // Validate first; among valid descriptors only void starts with 'V'.
    return dexIsValidTypeDescriptor(s) && (s[0] != 'V');
}
304  
305