• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2007 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <objmng/drm_i18n.h>
18 
/*
 * Single-byte range predicates, named after the lead ("high") and trail
 * ("low") byte of the GB2312, GBK and Big5 encodings, plus a 7-bit ASCII test.
 *
 * Each macro evaluates its argument more than once, so do not pass an
 * expression with side effects. IS_ASCII only checks the upper bound: pass an
 * unsigned byte value (a negative signed value would also satisfy it).
 */
#define IS_GB2312_HIGH_BYTE(c)  ((c) >= 0xA1 && (c) <= 0xF7)
#define IS_GB2312_LOW_BYTE(c)   ((c) >= 0xA1 && (c) <= 0xFE)
#define IS_GBK_HIGH_BYTE(c)     ((c) >= 0x81 && (c) <= 0xFE)
#define IS_GBK_LOW_BYTE(c)      ((c) >= 0x40 && (c) <= 0xFE && (c) != 0x7F)
#define IS_BIG5_HIGH_BYTE(c)    ((c) >= 0xA1 && (c) <= 0xF9)
#define IS_BIG5_LOW_BYTE(c)     (((c) >= 0x40 && (c) <= 0x7E) \
                                 || ((c) >= 0xA1 && (c) <= 0xFE))
#define IS_ASCII(c)             ((c) <= 127)

/* Value stored in the output when an input sequence cannot be converted. */
#define INVALID_UNICODE         0xFFFD

/*
 * Build switches: the converters and the matching switch cases in this file
 * are compiled only when the corresponding I18N_*_SUPPORT macro is defined.
 */
#define I18N_LATIN1_SUPPORT
#define I18N_UTF8_UTF16_SUPPORT
32 
33 
/**
 * Convert ISO 8859-1 (latin1) bytes to unicode by widening each byte to one
 * 16-bit unit.
 */
static int32_t latin1ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one unicode char to a single ISO 8859-1 (latin1) byte.
 * Returns the number of bytes the char needs.
 */
static int32_t wcToLatin1(uint16_t wc, uint8_t * mbs, int32_t bufSize);

/**
 * Convert UTF-8 bytes to 16-bit unicode units.
 */
static int32_t utf8ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one unicode char to UTF-8 bytes.
 * Returns the number of bytes the char needs.
 */
static int32_t wcToUtf8(uint16_t wc, uint8_t * mbs, int32_t bufSize);

/**
 * Convert UTF-16 BE (high byte first) bytes to 16-bit unicode units.
 */
static int32_t utf16beToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one unicode char to two UTF-16 BE bytes.
 * Returns the number of bytes the char needs.
 */
static int32_t wcToUtf16be(uint16_t wc, uint8_t * mbs, int32_t bufSize);

/**
 * Convert UTF-16 LE (low byte first) bytes to 16-bit unicode units.
 */
static int32_t utf16leToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed);

/**
 * Convert one unicode char to two UTF-16 LE bytes.
 * Returns the number of bytes the char needs.
 */
static int32_t wcToUtf16le(uint16_t wc, uint8_t * mbs, int32_t bufSize);
81 
82 /*
83  * see drm_i18n.h
84  */
DRM_i18n_mbsToWcs(DRM_Charset_t charset,const uint8_t * mbs,int32_t mbsLen,uint16_t * wcsBuf,int32_t bufSizeInWideChar,int32_t * bytesConsumed)85 int32_t DRM_i18n_mbsToWcs(DRM_Charset_t charset,
86         const uint8_t *mbs, int32_t mbsLen,
87         uint16_t *wcsBuf, int32_t bufSizeInWideChar,
88         int32_t *bytesConsumed)
89 {
90     switch (charset)
91     {
92 #ifdef I18N_GB2312_SUPPORT
93         case DRM_CHARSET_GB2312:
94             return gb2312ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
95 #endif
96 #ifdef I18N_GBK_SUPPORT
97         case DRM_CHARSET_GBK:
98             return gbkToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
99 #endif
100 #ifdef I18N_BIG5_SUPPORT
101         case DRM_CHARSET_BIG5:
102             return big5ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
103 #endif
104 #ifdef I18N_LATIN1_SUPPORT
105         case DRM_CHARSET_LATIN1:
106             return latin1ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
107 #endif
108 #ifdef I18N_ISO8859X_SUPPORT
109         case DRM_CHARSET_LATIN2:
110         case DRM_CHARSET_LATIN3:
111         case DRM_CHARSET_LATIN4:
112         case DRM_CHARSET_CYRILLIC:
113         case DRM_CHARSET_ARABIC:
114         case DRM_CHARSET_GREEK:
115         case DRM_CHARSET_HEBREW:
116         case DRM_CHARSET_LATIN5:
117         case DRM_CHARSET_LATIN6:
118         case DRM_CHARSET_THAI:
119         case DRM_CHARSET_LATIN7:
120         case DRM_CHARSET_LATIN8:
121         case DRM_CHARSET_LATIN9:
122         case DRM_CHARSET_LATIN10:
123             return iso8859xToWcs(charset, mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
124 #endif
125 #ifdef I18N_UTF8_UTF16_SUPPORT
126         case DRM_CHARSET_UTF8:
127             return utf8ToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
128         case DRM_CHARSET_UTF16BE:
129             return utf16beToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
130         case DRM_CHARSET_UTF16LE:
131             return utf16leToWcs(mbs, mbsLen, wcsBuf, bufSizeInWideChar, bytesConsumed);
132 #endif
133         default:
134             return -1;
135     }
136 }
137 
138 /*
139  * see drm_i18n.h
140  */
DRM_i18n_wcsToMbs(DRM_Charset_t charset,const uint16_t * wcs,int32_t wcsLen,uint8_t * mbsBuf,int32_t bufSizeInByte)141 int32_t DRM_i18n_wcsToMbs(DRM_Charset_t charset,
142         const uint16_t *wcs, int32_t wcsLen,
143         uint8_t *mbsBuf, int32_t bufSizeInByte)
144 {
145     int32_t (* wcToMbFunc)(uint16_t, uint8_t *, int32_t);
146     int32_t charIndex = 0;
147     int32_t numMultiBytes = 0;
148 
149     switch (charset)
150     {
151 #ifdef I18N_LATIN1_SUPPORT
152         case DRM_CHARSET_LATIN1:
153             wcToMbFunc = wcToLatin1;
154             break;
155 #endif
156 #ifdef I18N_UTF8_UTF16_SUPPORT
157         case DRM_CHARSET_UTF8:
158             wcToMbFunc = wcToUtf8;
159             break;
160         case DRM_CHARSET_UTF16BE:
161             wcToMbFunc = wcToUtf16be;
162             break;
163         case DRM_CHARSET_UTF16LE:
164             wcToMbFunc = wcToUtf16le;
165             break;
166 #endif
167 #ifdef I18N_ISO8859X_SUPPORT
168         case DRM_CHARSET_LATIN2:
169         case DRM_CHARSET_LATIN3:
170         case DRM_CHARSET_LATIN4:
171         case DRM_CHARSET_CYRILLIC:
172         case DRM_CHARSET_ARABIC:
173         case DRM_CHARSET_GREEK:
174         case DRM_CHARSET_HEBREW:
175         case DRM_CHARSET_LATIN5:
176         case DRM_CHARSET_LATIN6:
177         case DRM_CHARSET_THAI:
178         case DRM_CHARSET_LATIN7:
179         case DRM_CHARSET_LATIN8:
180         case DRM_CHARSET_LATIN9:
181         case DRM_CHARSET_LATIN10:
182             return wcsToIso8859x(charset, wcs, wcsLen, mbsBuf, bufSizeInByte);
183 #endif
184         default:
185             return -1;
186     }
187 
188     if (mbsBuf) {
189         while (numMultiBytes < bufSizeInByte && charIndex < wcsLen) {
190             /* TODO: handle surrogate pair values here */
191             int32_t mbLen = wcToMbFunc(wcs[charIndex],
192                     &mbsBuf[numMultiBytes], bufSizeInByte - numMultiBytes);
193 
194             if (numMultiBytes + mbLen > bufSizeInByte) {
195                 /* Insufficient buffer. Don't update numMultiBytes */
196                 break;
197             }
198             charIndex++;
199             numMultiBytes += mbLen;
200         }
201     } else {
202         while (charIndex < wcsLen) {
203             /* TODO: handle surrogate pair values here */
204             numMultiBytes += wcToMbFunc(wcs[charIndex], NULL, 0);
205             charIndex++;
206         }
207     }
208 
209     return numMultiBytes;
210 }
211 
212 
213 #ifdef I18N_LATIN1_SUPPORT
214 
/**
 * Convert ISO 8859-1 (latin1) bytes to unicode; every input byte becomes one
 * 16-bit unit with the same value.
 *
 * @param mbs               Input bytes.
 * @param mbsLen            Number of input bytes.
 * @param wcsBuf            Output buffer, or NULL to only query the length.
 * @param bufSizeInWideChar Capacity of wcsBuf in 16-bit units.
 * @param bytesConsumed     Optional; receives the number of input bytes
 *                          converted. Not written when wcsBuf is NULL.
 *
 * @return mbsLen when wcsBuf is NULL; otherwise the number of units written,
 *         which is min(mbsLen, bufSizeInWideChar), or 0 if that is negative.
 */
int32_t latin1ToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t charsToConvert;
    int32_t i;

    if (wcsBuf == NULL) {
        return mbsLen;
    }

    charsToConvert = mbsLen > bufSizeInWideChar ? bufSizeInWideChar : mbsLen;
    if (charsToConvert < 0) {
        /* Negative length or capacity: convert nothing, but still fall
         * through so *bytesConsumed is set instead of left uninitialised. */
        charsToConvert = 0;
    }

    for (i = 0; i < charsToConvert; i++) {
        wcsBuf[i] = mbs[i];
    }

    if (bytesConsumed)
        *bytesConsumed = charsToConvert;

    return charsToConvert;
}
238 
/**
 * Convert one unicode char to an ISO 8859-1 (latin1) byte.
 *
 * Values below 0x100 map to the byte of the same value; anything else is
 * replaced by '?'. The byte is stored only when mbs is non-NULL and bufSize
 * is positive.
 *
 * @return Always 1, the number of bytes one latin1 character occupies.
 */
int32_t wcToLatin1(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    uint8_t ch;

    if (wc < 0x100) {
        ch = (uint8_t)(wc & 0xff);
    } else {
        /* Not representable in latin1. */
        ch = '?';
    }
    if (mbs && bufSize > 0)
        *mbs = ch;
    return 1;
}
252 
253 #endif /* I18N_LATIN1_SUPPORT */
254 
255 #ifdef I18N_UTF8_UTF16_SUPPORT
256 
utf8ToWcs(const uint8_t * mbs,int32_t mbsLen,uint16_t * wcsBuf,int32_t bufSizeInWideChar,int32_t * bytesConsumed)257 int32_t utf8ToWcs(const uint8_t *mbs, int32_t mbsLen,
258         uint16_t *wcsBuf, int32_t bufSizeInWideChar,
259         int32_t *bytesConsumed)
260 {
261     int32_t charsConverted = 0;
262     int32_t i = 0;
263     int32_t wideChar;
264 
265     if (wcsBuf == NULL) {
266         /* No conversion but we're still going to calculate bytesConsumed */
267         bufSizeInWideChar = mbsLen * 2;
268     }
269 
270     while((i < mbsLen) && (charsConverted < bufSizeInWideChar)) {
271         uint8_t ch = mbs[i];
272         uint8_t ch2, ch3, ch4;
273 
274         wideChar = -1;
275 
276         if(IS_ASCII(ch)) {
277             wideChar = ch;
278             i++;
279         } else if ((ch & 0xc0) == 0xc0) {
280             int utfStart = i;
281             if ((ch & 0xe0) == 0xc0) {
282                 /* 2 byte sequence */
283                 if (i + 1 < mbsLen && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80) {
284                     wideChar = (uint16_t)(((ch & 0x1F) << 6) | (ch2 & 0x3F));
285                     i += 2;
286                 } else {
287                     /* skip incomplete sequence */
288                     i++;
289                 }
290             } else if ((ch & 0xf0) == 0xe0) {
291                 /* 3 byte sequence */
292                 if (i + 2 < mbsLen
293                         && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80
294                         && ((ch3 = mbs[i + 2]) & 0xc0) == 0x80) {
295                     wideChar = (uint16_t)(((ch & 0x0F) << 12) | ((ch2 & 0x3F) << 6) | (ch3 & 0x3F));
296                     i += 3;
297                 } else {
298                     /* skip incomplete sequence (up to 2 bytes) */
299                     i++;
300                     if (i < mbsLen && (mbs[i] & 0xc0) == 0x80)
301                         i++;
302                 }
303             } else if ((ch & 0xf8) == 0xf0) {
304                 /* 4 byte sequence */
305                 if (i + 3 < mbsLen
306                         && ((ch2 = mbs[i + 1]) & 0xc0) == 0x80
307                         && ((ch3 = mbs[i + 2]) & 0xc0) == 0x80
308                         && ((ch4 = mbs[i + 3]) & 0xc0) == 0x80) {
309                     /* FIXME: we do NOT support U+10000 - U+10FFFF for now.
310                      *        leave it as 0xFFFD. */
311                     wideChar = INVALID_UNICODE;
312                     i += 4;
313                 } else {
314                     /* skip incomplete sequence (up to 3 bytes) */
315                     i++;
316                     if (i < mbsLen && (mbs[i] & 0xc0) == 0x80) {
317                         i++;
318                         if (i < mbsLen && (mbs[i] & 0xc0) == 0x80) {
319                             i++;
320                         }
321                     }
322                 }
323             } else {
324                 /* invalid */
325                 i++;
326             }
327             if (i >= mbsLen && wideChar == -1) {
328                 /* Possible incomplete UTF-8 sequence at the end of mbs.
329                  * Leave it to the caller.
330                  */
331                 i = utfStart;
332                 break;
333             }
334         } else {
335             /* invalid */
336             i++;
337         }
338         if(wcsBuf) {
339             if (wideChar == -1)
340                 wideChar = INVALID_UNICODE;
341             wcsBuf[charsConverted] = (uint16_t)wideChar;
342         }
343         charsConverted++;
344     }
345 
346     if (bytesConsumed)
347         *bytesConsumed = i;
348 
349     return charsConverted;
350 }
351 
/**
 * Convert one unicode char to UTF-8 bytes.
 *
 * Values up to 0x7F take 1 byte, up to 0x7FF take 2 bytes, everything else
 * takes 3 bytes. The bytes are stored only when mbs is non-NULL and bufSize
 * can hold the whole sequence; otherwise nothing is written.
 *
 * @return Number of bytes the character needs (1, 2 or 3), whether or not
 *         they were written.
 */
int32_t wcToUtf8(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    if (wc <= 0x7f) {
        if (mbs && (bufSize >= 1)) {
            *mbs = (uint8_t)wc;
        }
        return 1;
    } else if (wc <= 0x7ff) {
        /* 110xxxxx 10xxxxxx */
        if (mbs && (bufSize >= 2)) {
            *mbs++ = (uint8_t)((wc >> 6) | 0xc0);
            *mbs = (uint8_t)((wc & 0x3f) | 0x80);
        }
        return 2;
    } else {
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        if (mbs && (bufSize >= 3)) {
            *mbs++ = (uint8_t)((wc >> 12) | 0xe0);
            *mbs++ = (uint8_t)(((wc >> 6) & 0x3f)| 0x80);
            *mbs = (uint8_t)((wc & 0x3f) | 0x80);
        }
        return 3;
    }
}
374 
/**
 * Convert UTF-16 BE bytes (high byte first) to 16-bit unicode units.
 *
 * @param mbs               Input bytes.
 * @param mbsLen            Number of input bytes; a trailing odd byte is
 *                          ignored.
 * @param wcsBuf            Output buffer, or NULL to only query the length.
 * @param bufSizeInWideChar Capacity of wcsBuf in 16-bit units.
 * @param bytesConsumed     Optional; receives the number of input bytes
 *                          converted. Not written when wcsBuf is NULL.
 *
 * @return mbsLen / 2 when wcsBuf is NULL; otherwise the number of units
 *         written, which is min(mbsLen / 2, bufSizeInWideChar), or 0 if that
 *         is negative.
 */
int32_t utf16beToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t charsToConvert;
    int32_t len;

    if (wcsBuf == NULL) {
        return mbsLen / 2;
    }

    charsToConvert = (mbsLen / 2) > bufSizeInWideChar ? bufSizeInWideChar : (mbsLen / 2);
    if (charsToConvert < 0) {
        /* A negative count would make the countdown loop below run away
         * through memory; treat it as "nothing to convert". */
        charsToConvert = 0;
    }

    len = charsToConvert;
    while (len--) {
        /* TODO: handle surrogate pair values */
        *wcsBuf++ = (uint16_t)((*mbs << 8) | *(mbs + 1));
        mbs += 2;
    }

    if (bytesConsumed)
        *bytesConsumed = charsToConvert * 2;

    return charsToConvert;
}
398 
/**
 * Convert one unicode char to two UTF-16 BE bytes (high byte first).
 *
 * The bytes are stored only when mbs is non-NULL and bufSize is at least 2.
 *
 * @return Always 2, the number of bytes the character needs.
 */
int32_t wcToUtf16be(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    if (mbs && bufSize >= 2) {
        /* TODO: handle surrogate pair values */
        *mbs = (uint8_t)(wc >> 8);
        *(mbs + 1) = (uint8_t)(wc & 0xff);
    }
    return 2;
}
408 
/**
 * Convert UTF-16 LE bytes (low byte first) to 16-bit unicode units.
 *
 * @param mbs               Input bytes.
 * @param mbsLen            Number of input bytes; a trailing odd byte is
 *                          ignored.
 * @param wcsBuf            Output buffer, or NULL to only query the length.
 * @param bufSizeInWideChar Capacity of wcsBuf in 16-bit units.
 * @param bytesConsumed     Optional; receives the number of input bytes
 *                          converted. Not written when wcsBuf is NULL.
 *
 * @return mbsLen / 2 when wcsBuf is NULL; otherwise the number of units
 *         written, which is min(mbsLen / 2, bufSizeInWideChar), or 0 if that
 *         is negative.
 */
int32_t utf16leToWcs(const uint8_t *mbs, int32_t mbsLen,
        uint16_t *wcsBuf, int32_t bufSizeInWideChar,
        int32_t *bytesConsumed)
{
    int32_t charsToConvert;
    int32_t len;

    if (wcsBuf == NULL) {
        return mbsLen / 2;
    }

    charsToConvert = (mbsLen / 2) > bufSizeInWideChar ? bufSizeInWideChar : (mbsLen / 2);
    if (charsToConvert < 0) {
        /* A negative count would make the countdown loop below run away
         * through memory; treat it as "nothing to convert". */
        charsToConvert = 0;
    }

    len = charsToConvert;
    while (len--) {
        /* TODO: handle surrogate pair values */
        *wcsBuf++ = (uint16_t)(*mbs | (*(mbs + 1) << 8));
        mbs += 2;
    }

    if (bytesConsumed)
        *bytesConsumed = charsToConvert * 2;

    return charsToConvert;
}
432 
/**
 * Convert one unicode char to two UTF-16 LE bytes (low byte first).
 *
 * The bytes are stored only when mbs is non-NULL and bufSize is at least 2.
 *
 * @return Always 2, the number of bytes the character needs.
 */
int32_t wcToUtf16le(uint16_t wc, uint8_t * mbs, int32_t bufSize)
{
    if (mbs && bufSize >= 2) {
        /* TODO: handle surrogate pair values */
        *mbs = (uint8_t)(wc & 0xff);
        *(mbs + 1) = (uint8_t)(wc >> 8);
    }
    return 2;
}
442 
443 #endif /* I18N_UTF8_UTF16_SUPPORT */
444 
445