1 /*
2 * Copyright (C) 2008 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 /*
17 * UTF-8 and Unicode string manipulation, plus java/lang/String convenience
18 * functions.
19 *
20 * In most cases we populate the fields in the String object directly,
21 * rather than going through an instance field lookup.
22 */
23 #include "Dalvik.h"
24 #include <stdlib.h>
25
26 /*
27 * Initialize string globals.
28 *
29 * This isn't part of the VM init sequence because it's hard to get the
30 * timing right -- we need it to happen after java/lang/String has been
31 * loaded, but before anybody wants to use a string. It's easiest to
32 * just initialize it on first use.
33 *
34 * In some unusual circumstances (e.g. trying to throw an exception because
35 * String implements java/lang/CharSequence, but CharSequence doesn't exist)
36 * we can try to create an exception string internally before anything has
37 * really tried to use String. In that case we basically self-destruct.
38 */
stringStartup()39 static bool stringStartup()
40 {
41 if (gDvm.javaLangStringReady < 0) {
42 LOGE("ERROR: reentrant string initialization\n");
43 assert(false);
44 return false;
45 }
46 assert(gDvm.javaLangStringReady == 0);
47
48 gDvm.javaLangStringReady = -1;
49
50 if (gDvm.classJavaLangString == NULL)
51 gDvm.classJavaLangString =
52 dvmFindSystemClassNoInit("Ljava/lang/String;");
53
54 gDvm.offJavaLangString_value =
55 dvmFindFieldOffset(gDvm.classJavaLangString, "value", "[C");
56 gDvm.offJavaLangString_count =
57 dvmFindFieldOffset(gDvm.classJavaLangString, "count", "I");
58 gDvm.offJavaLangString_offset =
59 dvmFindFieldOffset(gDvm.classJavaLangString, "offset", "I");
60 gDvm.offJavaLangString_hashCode =
61 dvmFindFieldOffset(gDvm.classJavaLangString, "hashCode", "I");
62
63 if (gDvm.offJavaLangString_value < 0 ||
64 gDvm.offJavaLangString_count < 0 ||
65 gDvm.offJavaLangString_offset < 0 ||
66 gDvm.offJavaLangString_hashCode < 0)
67 {
68 LOGE("VM-required field missing from java/lang/String\n");
69 return false;
70 }
71
72 gDvm.javaLangStringReady = 1;
73
74 return true;
75 }
76
/*
 * Release any heap storage owned by the string code.
 *
 * Nothing is held at present, so this does nothing.
 */
void dvmStringShutdown()
{
    /* nothing to free */
}
84
85 /*
86 * Compute a hash code on a UTF-8 string, for use with internal hash tables.
87 *
88 * This may or may not yield the same results as the java/lang/String
89 * computeHashCode() function. (To make sure this doesn't get abused,
90 * I'm initializing the hash code to 1 so they *don't* match up.)
91 *
92 * It would be more correct to invoke dexGetUtf16FromUtf8() here and compute
93 * the hash with the result. That way, if something encoded the same
94 * character in two different ways, the hash value would be the same. For
95 * our purposes that isn't necessary.
96 */
dvmComputeUtf8Hash(const char * utf8Str)97 u4 dvmComputeUtf8Hash(const char* utf8Str)
98 {
99 u4 hash = 1;
100
101 while (*utf8Str != '\0')
102 hash = hash * 31 + *utf8Str++;
103
104 return hash;
105 }
106
/*
 * "strlen" for strings encoded with "modified" UTF-8.
 *
 * Returns the number of encoded characters, which may or may not equal
 * the number of bytes.  The lead byte alone decides how many bytes an
 * encoding occupies: high bit clear means one byte, high bit set means
 * two, and high bit plus 0x20 means three.
 *
 * Continuation bytes are stepped over without being inspected, so the
 * input must be well-formed; a sequence cut short by the terminator
 * would carry the scan past the end of the string.
 *
 * (If this needs optimizing, try: mask against 0xa0, shift right 5,
 * get increment {1-3} from table of 8 values.)
 */
int dvmUtf8Len(const char* utf8Str)
{
    const char* cursor = utf8Str;
    int count = 0;

    for (;;) {
        int lead = *cursor++;

        if (lead == '\0')
            break;
        count++;

        if ((lead & 0x80) == 0)
            continue;           /* single-byte encoding */

        /* skip one trailing byte, or two for a three-byte encoding */
        cursor += ((lead & 0x20) != 0) ? 2 : 1;
    }

    return count;
}
134
135 /*
136 * Convert a "modified" UTF-8 string to UTF-16.
137 */
dvmConvertUtf8ToUtf16(u2 * utf16Str,const char * utf8Str)138 void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str)
139 {
140 while (*utf8Str != '\0')
141 *utf16Str++ = dexGetUtf16FromUtf8(&utf8Str);
142 }
143
144 /*
145 * Given a UTF-16 string, compute the length of the corresponding UTF-8
146 * string in bytes.
147 */
utf16_utf8ByteLen(const u2 * utf16Str,int len)148 static int utf16_utf8ByteLen(const u2* utf16Str, int len)
149 {
150 int utf8Len = 0;
151
152 while (len--) {
153 unsigned int uic = *utf16Str++;
154
155 /*
156 * The most common case is (uic > 0 && uic <= 0x7f).
157 */
158 if (uic == 0 || uic > 0x7f) {
159 if (uic > 0x07ff)
160 utf8Len += 3;
161 else /*(uic > 0x7f || uic == 0) */
162 utf8Len += 2;
163 } else
164 utf8Len++;
165 }
166 return utf8Len;
167 }
168
169 /*
170 * Convert a UTF-16 string to UTF-8.
171 *
172 * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen(),
173 * not just "len".
174 */
convertUtf16ToUtf8(char * utf8Str,const u2 * utf16Str,int len)175 static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len)
176 {
177 assert(len >= 0);
178
179 while (len--) {
180 unsigned int uic = *utf16Str++;
181
182 /*
183 * The most common case is (uic > 0 && uic <= 0x7f).
184 */
185 if (uic == 0 || uic > 0x7f) {
186 if (uic > 0x07ff) {
187 *utf8Str++ = (uic >> 12) | 0xe0;
188 *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80;
189 *utf8Str++ = (uic & 0x3f) | 0x80;
190 } else /*(uic > 0x7f || uic == 0)*/ {
191 *utf8Str++ = (uic >> 6) | 0xc0;
192 *utf8Str++ = (uic & 0x3f) | 0x80;
193 }
194 } else {
195 *utf8Str++ = uic;
196 }
197 }
198
199 *utf8Str = '\0';
200 }
201
202 /*
203 * Use the java/lang/String.computeHashCode() algorithm.
204 */
dvmComputeUtf16Hash(const u2 * utf16Str,int len)205 static inline u4 dvmComputeUtf16Hash(const u2* utf16Str, int len)
206 {
207 u4 hash = 0;
208
209 while (len--)
210 hash = hash * 31 + *utf16Str++;
211
212 return hash;
213 }
dvmComputeStringHash(StringObject * strObj)214 u4 dvmComputeStringHash(StringObject* strObj) {
215 ArrayObject* chars = (ArrayObject*) dvmGetFieldObject((Object*) strObj,
216 gDvm.offJavaLangString_value);
217 int offset, len;
218
219 len = dvmGetFieldInt((Object*) strObj, gDvm.offJavaLangString_count);
220 offset = dvmGetFieldInt((Object*) strObj, gDvm.offJavaLangString_offset);
221
222 return dvmComputeUtf16Hash((u2*) chars->contents + offset, len);
223 }
224
225 /*
226 * Create a new java/lang/String object, using the string data in "utf8Str".
227 *
228 * Note that "allocFlags" affects both of the allocations here. If you
229 * use ALLOC_DONT_TRACK in a context where a GC could happen between the
230 * two allocations, you could lose the array reference.
231 *
232 * Returns NULL and throws an exception on failure.
233 */
dvmCreateStringFromCstr(const char * utf8Str,int allocFlags)234 StringObject* dvmCreateStringFromCstr(const char* utf8Str, int allocFlags)
235 {
236 assert(utf8Str != NULL);
237
238 return dvmCreateStringFromCstrAndLength(utf8Str, dvmUtf8Len(utf8Str),
239 allocFlags);
240 }
241
242 /*
243 * Create a java/lang/String from a C string, given its UTF-16 length
244 * (number of UTF-16 code points).
245 *
246 * The caller must call dvmReleaseTrackedAlloc() on the return value or
247 * use a non-default value for "allocFlags". It is never appropriate
248 * to use ALLOC_DONT_TRACK with this function.
249 *
250 * Returns NULL and throws an exception on failure.
251 */
dvmCreateStringFromCstrAndLength(const char * utf8Str,u4 utf16Length,int allocFlags)252 StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str,
253 u4 utf16Length, int allocFlags)
254 {
255 StringObject* newObj;
256 ArrayObject* chars;
257 u4 hashCode = 0;
258
259 //LOGV("Creating String from '%s'\n", utf8Str);
260 assert(allocFlags != ALLOC_DONT_TRACK); /* don't currently need */
261 assert(utf8Str != NULL);
262
263 if (gDvm.javaLangStringReady <= 0) {
264 if (!stringStartup())
265 return NULL;
266 }
267
268 /* init before alloc */
269 if (!dvmIsClassInitialized(gDvm.classJavaLangString) &&
270 !dvmInitClass(gDvm.classJavaLangString))
271 {
272 return NULL;
273 }
274
275 newObj = (StringObject*) dvmAllocObject(gDvm.classJavaLangString,
276 allocFlags);
277 if (newObj == NULL)
278 return NULL;
279
280 chars = dvmAllocPrimitiveArray('C', utf16Length, allocFlags);
281 if (chars == NULL) {
282 dvmReleaseTrackedAllocIFN((Object*) newObj, NULL, allocFlags);
283 return NULL;
284 }
285 dvmConvertUtf8ToUtf16((u2*)chars->contents, utf8Str);
286 hashCode = dvmComputeUtf16Hash((u2*) chars->contents, utf16Length);
287
288 dvmSetFieldObject((Object*)newObj, gDvm.offJavaLangString_value,
289 (Object*)chars);
290 dvmReleaseTrackedAllocIFN((Object*) chars, NULL, allocFlags);
291 dvmSetFieldInt((Object*)newObj, gDvm.offJavaLangString_count, utf16Length);
292 dvmSetFieldInt((Object*)newObj, gDvm.offJavaLangString_hashCode, hashCode);
293 /* leave offset set to zero */
294
295 /* debugging stuff */
296 //dvmDumpObject((Object*)newObj);
297 //printHexDumpEx(ANDROID_LOG_DEBUG, chars->contents, utf16Length * 2,
298 // kHexDumpMem);
299
300 /* caller may need to dvmReleaseTrackedAlloc(newObj) */
301 return newObj;
302 }
303
304 /*
305 * Create a new java/lang/String object, using the Unicode data.
306 */
dvmCreateStringFromUnicode(const u2 * unichars,int len)307 StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len)
308 {
309 StringObject* newObj;
310 ArrayObject* chars;
311 u4 hashCode = 0;
312
313 /* we allow a null pointer if the length is zero */
314 assert(len == 0 || unichars != NULL);
315
316 if (gDvm.javaLangStringReady <= 0) {
317 if (!stringStartup())
318 return NULL;
319 }
320
321 /* init before alloc */
322 if (!dvmIsClassInitialized(gDvm.classJavaLangString) &&
323 !dvmInitClass(gDvm.classJavaLangString))
324 {
325 return NULL;
326 }
327
328 newObj = (StringObject*) dvmAllocObject(gDvm.classJavaLangString,
329 ALLOC_DEFAULT);
330 if (newObj == NULL)
331 return NULL;
332
333 chars = dvmAllocPrimitiveArray('C', len, ALLOC_DEFAULT);
334 if (chars == NULL) {
335 dvmReleaseTrackedAlloc((Object*) newObj, NULL);
336 return NULL;
337 }
338 if (len > 0)
339 memcpy(chars->contents, unichars, len * sizeof(u2));
340 hashCode = dvmComputeUtf16Hash((u2*) chars->contents, len);
341
342 dvmSetFieldObject((Object*)newObj, gDvm.offJavaLangString_value,
343 (Object*)chars);
344 dvmReleaseTrackedAlloc((Object*) chars, NULL);
345 dvmSetFieldInt((Object*)newObj, gDvm.offJavaLangString_count, len);
346 dvmSetFieldInt((Object*)newObj, gDvm.offJavaLangString_hashCode, hashCode);
347 /* leave offset set to zero */
348
349 /* debugging stuff */
350 //dvmDumpObject((Object*)newObj);
351 //printHexDumpEx(ANDROID_LOG_DEBUG, chars->contents, len*2, kHexDumpMem);
352
353 /* caller must dvmReleaseTrackedAlloc(newObj) */
354 return newObj;
355 }
356
357 /*
358 * Create a new C string from a java/lang/String object.
359 *
360 * Returns NULL if the object is NULL.
361 */
dvmCreateCstrFromString(StringObject * jstr)362 char* dvmCreateCstrFromString(StringObject* jstr)
363 {
364 char* newStr;
365 ArrayObject* chars;
366 int len, byteLen, offset;
367 const u2* data;
368
369 assert(gDvm.javaLangStringReady > 0);
370
371 if (jstr == NULL)
372 return NULL;
373
374 len = dvmGetFieldInt((Object*) jstr, gDvm.offJavaLangString_count);
375 offset = dvmGetFieldInt((Object*) jstr, gDvm.offJavaLangString_offset);
376 chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
377 gDvm.offJavaLangString_value);
378 data = (const u2*) chars->contents + offset;
379 assert(offset + len <= (int) chars->length);
380
381 byteLen = utf16_utf8ByteLen(data, len);
382 newStr = (char*) malloc(byteLen+1);
383 if (newStr == NULL)
384 return NULL;
385 convertUtf16ToUtf8(newStr, data, len);
386
387 return newStr;
388 }
389
390 /*
391 * Create a UTF-8 C string from a region of a java/lang/String. (Used by
392 * the JNI GetStringUTFRegion call.)
393 */
dvmCreateCstrFromStringRegion(StringObject * jstr,int start,int len,char * buf)394 void dvmCreateCstrFromStringRegion(StringObject* jstr, int start, int len,
395 char* buf)
396 {
397 const u2* data;
398
399 data = dvmStringChars(jstr) + start;
400 convertUtf16ToUtf8(buf, data, len);
401 }
402
403 /*
404 * Compute the length, in modified UTF-8, of a java/lang/String object.
405 *
406 * Does not include the terminating null byte.
407 */
dvmStringUtf8ByteLen(StringObject * jstr)408 int dvmStringUtf8ByteLen(StringObject* jstr)
409 {
410 ArrayObject* chars;
411 int len, offset;
412 const u2* data;
413
414 assert(gDvm.javaLangStringReady > 0);
415
416 if (jstr == NULL)
417 return 0; // should we throw something? assert?
418
419 len = dvmGetFieldInt((Object*) jstr, gDvm.offJavaLangString_count);
420 offset = dvmGetFieldInt((Object*) jstr, gDvm.offJavaLangString_offset);
421 chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
422 gDvm.offJavaLangString_value);
423 data = (const u2*) chars->contents + offset;
424 assert(offset + len <= (int) chars->length);
425
426 return utf16_utf8ByteLen(data, len);
427 }
428
429 /*
430 * Get the string's length.
431 */
dvmStringLen(StringObject * jstr)432 int dvmStringLen(StringObject* jstr)
433 {
434 return dvmGetFieldInt((Object*) jstr, gDvm.offJavaLangString_count);
435 }
436
437 /*
438 * Get the string's data.
439 */
dvmStringChars(StringObject * jstr)440 const u2* dvmStringChars(StringObject* jstr)
441 {
442 ArrayObject* chars;
443 int offset;
444
445 offset = dvmGetFieldInt((Object*) jstr, gDvm.offJavaLangString_offset);
446 chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
447 gDvm.offJavaLangString_value);
448 return (const u2*) chars->contents + offset;
449 }
450
451
452 /*
453 * Compare two String objects.
454 *
455 * This is a dvmHashTableLookup() callback. The function has already
456 * compared their hash values; we need to do a full compare to ensure
457 * that the strings really match.
458 */
dvmHashcmpStrings(const void * vstrObj1,const void * vstrObj2)459 int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2)
460 {
461 const StringObject* strObj1 = (const StringObject*) vstrObj1;
462 const StringObject* strObj2 = (const StringObject*) vstrObj2;
463 ArrayObject* chars1;
464 ArrayObject* chars2;
465 int len1, len2, offset1, offset2;
466
467 assert(gDvm.javaLangStringReady > 0);
468
469 /* get offset and length into char array; all values are in 16-bit units */
470 len1 = dvmGetFieldInt((Object*) strObj1, gDvm.offJavaLangString_count);
471 offset1 = dvmGetFieldInt((Object*) strObj1, gDvm.offJavaLangString_offset);
472 len2 = dvmGetFieldInt((Object*) strObj2, gDvm.offJavaLangString_count);
473 offset2 = dvmGetFieldInt((Object*) strObj2, gDvm.offJavaLangString_offset);
474 if (len1 != len2)
475 return len1 - len2;
476
477 chars1 = (ArrayObject*) dvmGetFieldObject((Object*) strObj1,
478 gDvm.offJavaLangString_value);
479 chars2 = (ArrayObject*) dvmGetFieldObject((Object*) strObj2,
480 gDvm.offJavaLangString_value);
481
482 /* damage here actually indicates a broken java/lang/String */
483 assert(offset1 + len1 <= (int) chars1->length);
484 assert(offset2 + len2 <= (int) chars2->length);
485
486 return memcmp((const u2*) chars1->contents + offset1,
487 (const u2*) chars2->contents + offset2,
488 len1 * sizeof(u2));
489 }
490
491