• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 *******************************************************************************
3 *
4 *   Copyright (C) 1999-2010, International Business Machines
5 *   Corporation and others.  All Rights Reserved.
6 *
7 *******************************************************************************
8 *   file name:  unistr_cnv.cpp
9 *   encoding:   US-ASCII
10 *   tab size:   8 (not used)
11 *   indentation:2
12 *
13 *   created on: 2004aug19
14 *   created by: Markus W. Scherer
15 *
16 *   Character conversion functions moved here from unistr.cpp
17 */
18 
19 #include "unicode/utypes.h"
20 
21 #if !UCONFIG_NO_CONVERSION
22 
23 #include "unicode/putil.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/ucnv.h"
29 #include "ucnv_imp.h"
30 #include "putilimp.h"
31 #include "ustr_cnv.h"
32 #include "ustr_imp.h"
33 
34 U_NAMESPACE_BEGIN
35 
36 //========================================
37 // Constructors
38 //========================================
39 
40 #if !U_CHARSET_IS_UTF8
41 
UnicodeString(const char * codepageData)42 UnicodeString::UnicodeString(const char *codepageData)
43   : fShortLength(0),
44     fFlags(kShortString)
45 {
46     if(codepageData != 0) {
47         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), 0);
48     }
49 }
50 
UnicodeString(const char * codepageData,int32_t dataLength)51 UnicodeString::UnicodeString(const char *codepageData,
52                              int32_t dataLength)
53   : fShortLength(0),
54     fFlags(kShortString)
55 {
56     if(codepageData != 0) {
57         doCodepageCreate(codepageData, dataLength, 0);
58     }
59 }
60 
61 // else see unistr.cpp
62 #endif
63 
UnicodeString(const char * codepageData,const char * codepage)64 UnicodeString::UnicodeString(const char *codepageData,
65                              const char *codepage)
66   : fShortLength(0),
67     fFlags(kShortString)
68 {
69     if(codepageData != 0) {
70         doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
71     }
72 }
73 
UnicodeString(const char * codepageData,int32_t dataLength,const char * codepage)74 UnicodeString::UnicodeString(const char *codepageData,
75                              int32_t dataLength,
76                              const char *codepage)
77   : fShortLength(0),
78     fFlags(kShortString)
79 {
80     if(codepageData != 0) {
81         doCodepageCreate(codepageData, dataLength, codepage);
82     }
83 }
84 
UnicodeString(const char * src,int32_t srcLength,UConverter * cnv,UErrorCode & errorCode)85 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
86                              UConverter *cnv,
87                              UErrorCode &errorCode)
88   : fShortLength(0),
89     fFlags(kShortString)
90 {
91     if(U_SUCCESS(errorCode)) {
92         // check arguments
93         if(src==NULL) {
94             // treat as an empty string, do nothing more
95         } else if(srcLength<-1) {
96             errorCode=U_ILLEGAL_ARGUMENT_ERROR;
97         } else {
98             // get input length
99             if(srcLength==-1) {
100                 srcLength=(int32_t)uprv_strlen(src);
101             }
102             if(srcLength>0) {
103                 if(cnv!=0) {
104                     // use the provided converter
105                     ucnv_resetToUnicode(cnv);
106                     doCodepageCreate(src, srcLength, cnv, errorCode);
107                 } else {
108                     // use the default converter
109                     cnv=u_getDefaultConverter(&errorCode);
110                     doCodepageCreate(src, srcLength, cnv, errorCode);
111                     u_releaseDefaultConverter(cnv);
112                 }
113             }
114         }
115 
116         if(U_FAILURE(errorCode)) {
117             setToBogus();
118         }
119     }
120 }
121 
122 //========================================
123 // Codeset conversion
124 //========================================
125 
126 #if !U_CHARSET_IS_UTF8
127 
128 int32_t
extract(int32_t start,int32_t length,char * target,uint32_t dstSize) const129 UnicodeString::extract(int32_t start,
130                        int32_t length,
131                        char *target,
132                        uint32_t dstSize) const {
133     return extract(start, length, target, dstSize, 0);
134 }
135 
136 // else see unistr.cpp
137 #endif
138 
139 int32_t
extract(int32_t start,int32_t length,char * target,uint32_t dstSize,const char * codepage) const140 UnicodeString::extract(int32_t start,
141                        int32_t length,
142                        char *target,
143                        uint32_t dstSize,
144                        const char *codepage) const
145 {
146     // if the arguments are illegal, then do nothing
147     if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
148         return 0;
149     }
150 
151     // pin the indices to legal values
152     pinIndices(start, length);
153 
154     // We need to cast dstSize to int32_t for all subsequent code.
155     // I don't know why the API was defined with uint32_t but we are stuck with it.
156     // Also, dstSize==0xffffffff means "unlimited" but if we use target+dstSize
157     // as a limit in some functions, it may wrap around and yield a pointer
158     // that compares less-than target.
159     int32_t capacity;
160     if(dstSize < 0x7fffffff) {
161         // Assume that the capacity is real and a limit pointer won't wrap around.
162         capacity = (int32_t)dstSize;
163     } else {
164         // Pin the capacity so that a limit pointer does not wrap around.
165         char *targetLimit = (char *)U_MAX_PTR(target);
166         // U_MAX_PTR(target) returns a targetLimit that is at most 0x7fffffff
167         // greater than target and does not wrap around the top of the address space.
168         capacity = (int32_t)(targetLimit - target);
169     }
170 
171     // create the converter
172     UConverter *converter;
173     UErrorCode status = U_ZERO_ERROR;
174 
175     // just write the NUL if the string length is 0
176     if(length == 0) {
177         return u_terminateChars(target, capacity, 0, &status);
178     }
179 
180     // if the codepage is the default, use our cache
181     // if it is an empty string, then use the "invariant character" conversion
182     if (codepage == 0) {
183         const char *defaultName = ucnv_getDefaultName();
184         if(UCNV_FAST_IS_UTF8(defaultName)) {
185             return toUTF8(start, length, target, capacity);
186         }
187         converter = u_getDefaultConverter(&status);
188     } else if (*codepage == 0) {
189         // use the "invariant characters" conversion
190         int32_t destLength;
191         if(length <= capacity) {
192             destLength = length;
193         } else {
194             destLength = capacity;
195         }
196         u_UCharsToChars(getArrayStart() + start, target, destLength);
197         return u_terminateChars(target, capacity, length, &status);
198     } else {
199         converter = ucnv_open(codepage, &status);
200     }
201 
202     length = doExtract(start, length, target, capacity, converter, status);
203 
204     // close the converter
205     if (codepage == 0) {
206         u_releaseDefaultConverter(converter);
207     } else {
208         ucnv_close(converter);
209     }
210 
211     return length;
212 }
213 
214 int32_t
extract(char * dest,int32_t destCapacity,UConverter * cnv,UErrorCode & errorCode) const215 UnicodeString::extract(char *dest, int32_t destCapacity,
216                        UConverter *cnv,
217                        UErrorCode &errorCode) const
218 {
219     if(U_FAILURE(errorCode)) {
220         return 0;
221     }
222 
223     if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
224         errorCode=U_ILLEGAL_ARGUMENT_ERROR;
225         return 0;
226     }
227 
228     // nothing to do?
229     if(isEmpty()) {
230         return u_terminateChars(dest, destCapacity, 0, &errorCode);
231     }
232 
233     // get the converter
234     UBool isDefaultConverter;
235     if(cnv==0) {
236         isDefaultConverter=TRUE;
237         cnv=u_getDefaultConverter(&errorCode);
238         if(U_FAILURE(errorCode)) {
239             return 0;
240         }
241     } else {
242         isDefaultConverter=FALSE;
243         ucnv_resetFromUnicode(cnv);
244     }
245 
246     // convert
247     int32_t len=doExtract(0, length(), dest, destCapacity, cnv, errorCode);
248 
249     // release the converter
250     if(isDefaultConverter) {
251         u_releaseDefaultConverter(cnv);
252     }
253 
254     return len;
255 }
256 
257 int32_t
doExtract(int32_t start,int32_t length,char * dest,int32_t destCapacity,UConverter * cnv,UErrorCode & errorCode) const258 UnicodeString::doExtract(int32_t start, int32_t length,
259                          char *dest, int32_t destCapacity,
260                          UConverter *cnv,
261                          UErrorCode &errorCode) const
262 {
263     if(U_FAILURE(errorCode)) {
264         if(destCapacity!=0) {
265             *dest=0;
266         }
267         return 0;
268     }
269 
270     const UChar *src=getArrayStart()+start, *srcLimit=src+length;
271     char *originalDest=dest;
272     const char *destLimit;
273 
274     if(destCapacity==0) {
275         destLimit=dest=0;
276     } else if(destCapacity==-1) {
277         // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
278         destLimit=(char*)U_MAX_PTR(dest);
279         // for NUL-termination, translate into highest int32_t
280         destCapacity=0x7fffffff;
281     } else {
282         destLimit=dest+destCapacity;
283     }
284 
285     // perform the conversion
286     ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
287     length=(int32_t)(dest-originalDest);
288 
289     // if an overflow occurs, then get the preflighting length
290     if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
291         char buffer[1024];
292 
293         destLimit=buffer+sizeof(buffer);
294         do {
295             dest=buffer;
296             errorCode=U_ZERO_ERROR;
297             ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
298             length+=(int32_t)(dest-buffer);
299         } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
300     }
301 
302     return u_terminateChars(originalDest, destCapacity, length, &errorCode);
303 }
304 
305 void
doCodepageCreate(const char * codepageData,int32_t dataLength,const char * codepage)306 UnicodeString::doCodepageCreate(const char *codepageData,
307                                 int32_t dataLength,
308                                 const char *codepage)
309 {
310     // if there's nothing to convert, do nothing
311     if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
312         return;
313     }
314     if(dataLength == -1) {
315         dataLength = (int32_t)uprv_strlen(codepageData);
316     }
317 
318     UErrorCode status = U_ZERO_ERROR;
319 
320     // create the converter
321     // if the codepage is the default, use our cache
322     // if it is an empty string, then use the "invariant character" conversion
323     UConverter *converter;
324     if (codepage == 0) {
325         const char *defaultName = ucnv_getDefaultName();
326         if(UCNV_FAST_IS_UTF8(defaultName)) {
327             setToUTF8(StringPiece(codepageData, dataLength));
328             return;
329         }
330         converter = u_getDefaultConverter(&status);
331     } else if(*codepage == 0) {
332         // use the "invariant characters" conversion
333         if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
334             u_charsToUChars(codepageData, getArrayStart(), dataLength);
335             setLength(dataLength);
336         } else {
337             setToBogus();
338         }
339         return;
340     } else {
341         converter = ucnv_open(codepage, &status);
342     }
343 
344     // if we failed, set the appropriate flags and return
345     if(U_FAILURE(status)) {
346         setToBogus();
347         return;
348     }
349 
350     // perform the conversion
351     doCodepageCreate(codepageData, dataLength, converter, status);
352     if(U_FAILURE(status)) {
353         setToBogus();
354     }
355 
356     // close the converter
357     if(codepage == 0) {
358         u_releaseDefaultConverter(converter);
359     } else {
360         ucnv_close(converter);
361     }
362 }
363 
364 void
doCodepageCreate(const char * codepageData,int32_t dataLength,UConverter * converter,UErrorCode & status)365 UnicodeString::doCodepageCreate(const char *codepageData,
366                                 int32_t dataLength,
367                                 UConverter *converter,
368                                 UErrorCode &status)
369 {
370     if(U_FAILURE(status)) {
371         return;
372     }
373 
374     // set up the conversion parameters
375     const char *mySource     = codepageData;
376     const char *mySourceEnd  = mySource + dataLength;
377     UChar *array, *myTarget;
378 
379     // estimate the size needed:
380     int32_t arraySize;
381     if(dataLength <= US_STACKBUF_SIZE) {
382         // try to use the stack buffer
383         arraySize = US_STACKBUF_SIZE;
384     } else {
385         // 1.25 UChar's per source byte should cover most cases
386         arraySize = dataLength + (dataLength >> 2);
387     }
388 
389     // we do not care about the current contents
390     UBool doCopyArray = FALSE;
391     for(;;) {
392         if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
393             setToBogus();
394             break;
395         }
396 
397         // perform the conversion
398         array = getArrayStart();
399         myTarget = array + length();
400         ucnv_toUnicode(converter, &myTarget,  array + getCapacity(),
401             &mySource, mySourceEnd, 0, TRUE, &status);
402 
403         // update the conversion parameters
404         setLength((int32_t)(myTarget - array));
405 
406         // allocate more space and copy data, if needed
407         if(status == U_BUFFER_OVERFLOW_ERROR) {
408             // reset the error code
409             status = U_ZERO_ERROR;
410 
411             // keep the previous conversion results
412             doCopyArray = TRUE;
413 
414             // estimate the new size needed, larger than before
415             // try 2 UChar's per remaining source byte
416             arraySize = (int32_t)(length() + 2 * (mySourceEnd - mySource));
417         } else {
418             break;
419         }
420     }
421 }
422 
423 U_NAMESPACE_END
424 
425 #endif
426