1 /*
2 *******************************************************************************
3 *
4 * Copyright (C) 1999-2007, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *
7 *******************************************************************************
8 * file name: unistr_cnv.cpp
9 * encoding: US-ASCII
10 * tab size: 8 (not used)
11 * indentation:2
12 *
13 * created on: 2004aug19
14 * created by: Markus W. Scherer
15 *
16 * Character conversion functions moved here from unistr.cpp
17 */
18
19 #include "unicode/utypes.h"
20
21 #if !UCONFIG_NO_CONVERSION
22
23 #include "unicode/putil.h"
24 #include "cstring.h"
25 #include "cmemory.h"
26 #include "unicode/ustring.h"
27 #include "unicode/unistr.h"
28 #include "unicode/ucnv.h"
29 #include "putilimp.h"
30 #include "ustr_cnv.h"
31 #include "ustr_imp.h"
32
33 U_NAMESPACE_BEGIN
34
35 //========================================
36 // Constructors
37 //========================================
38
UnicodeString(const char * codepageData,const char * codepage)39 UnicodeString::UnicodeString(const char *codepageData,
40 const char *codepage)
41 : fLength(0),
42 fCapacity(US_STACKBUF_SIZE),
43 fArray(fStackBuffer),
44 fFlags(kShortString)
45 {
46 if(codepageData != 0) {
47 doCodepageCreate(codepageData, (int32_t)uprv_strlen(codepageData), codepage);
48 }
49 }
50
51
UnicodeString(const char * codepageData,int32_t dataLength,const char * codepage)52 UnicodeString::UnicodeString(const char *codepageData,
53 int32_t dataLength,
54 const char *codepage)
55 : fLength(0),
56 fCapacity(US_STACKBUF_SIZE),
57 fArray(fStackBuffer),
58 fFlags(kShortString)
59 {
60 if(codepageData != 0) {
61 doCodepageCreate(codepageData, dataLength, codepage);
62 }
63 }
64
UnicodeString(const char * src,int32_t srcLength,UConverter * cnv,UErrorCode & errorCode)65 UnicodeString::UnicodeString(const char *src, int32_t srcLength,
66 UConverter *cnv,
67 UErrorCode &errorCode)
68 : fLength(0),
69 fCapacity(US_STACKBUF_SIZE),
70 fArray(fStackBuffer),
71 fFlags(kShortString)
72 {
73 if(U_SUCCESS(errorCode)) {
74 // check arguments
75 if(src==NULL) {
76 // treat as an empty string, do nothing more
77 } else if(srcLength<-1) {
78 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
79 } else {
80 // get input length
81 if(srcLength==-1) {
82 srcLength=(int32_t)uprv_strlen(src);
83 }
84 if(srcLength>0) {
85 if(cnv!=0) {
86 // use the provided converter
87 ucnv_resetToUnicode(cnv);
88 doCodepageCreate(src, srcLength, cnv, errorCode);
89 } else {
90 // use the default converter
91 cnv=u_getDefaultConverter(&errorCode);
92 doCodepageCreate(src, srcLength, cnv, errorCode);
93 u_releaseDefaultConverter(cnv);
94 }
95 }
96 }
97
98 if(U_FAILURE(errorCode)) {
99 setToBogus();
100 }
101 }
102 }
103
104 //========================================
105 // Codeset conversion
106 //========================================
107 int32_t
extract(int32_t start,int32_t length,char * target,uint32_t dstSize,const char * codepage) const108 UnicodeString::extract(int32_t start,
109 int32_t length,
110 char *target,
111 uint32_t dstSize,
112 const char *codepage) const
113 {
114 // if the arguments are illegal, then do nothing
115 if(/*dstSize < 0 || */(dstSize > 0 && target == 0)) {
116 return 0;
117 }
118
119 // pin the indices to legal values
120 pinIndices(start, length);
121
122 // create the converter
123 UConverter *converter;
124 UErrorCode status = U_ZERO_ERROR;
125
126 // just write the NUL if the string length is 0
127 if(length == 0) {
128 if(dstSize >= 0x80000000) {
129 // careful: dstSize is unsigned! (0xffffffff means "unlimited")
130 // make sure that the NUL-termination works (takes int32_t)
131 dstSize=0x7fffffff;
132 }
133 return u_terminateChars(target, dstSize, 0, &status);
134 }
135
136 // if the codepage is the default, use our cache
137 // if it is an empty string, then use the "invariant character" conversion
138 if (codepage == 0) {
139 converter = u_getDefaultConverter(&status);
140 } else if (*codepage == 0) {
141 // use the "invariant characters" conversion
142 int32_t destLength;
143 // careful: dstSize is unsigned! (0xffffffff means "unlimited")
144 if(dstSize >= 0x80000000) {
145 destLength = length;
146 // make sure that the NUL-termination works (takes int32_t)
147 dstSize=0x7fffffff;
148 } else if(length <= (int32_t)dstSize) {
149 destLength = length;
150 } else {
151 destLength = (int32_t)dstSize;
152 }
153 u_UCharsToChars(getArrayStart() + start, target, destLength);
154 return u_terminateChars(target, (int32_t)dstSize, length, &status);
155 } else {
156 converter = ucnv_open(codepage, &status);
157 }
158
159 length = doExtract(start, length, target, (int32_t)dstSize, converter, status);
160
161 // close the converter
162 if (codepage == 0) {
163 u_releaseDefaultConverter(converter);
164 } else {
165 ucnv_close(converter);
166 }
167
168 return length;
169 }
170
171 int32_t
extract(char * dest,int32_t destCapacity,UConverter * cnv,UErrorCode & errorCode) const172 UnicodeString::extract(char *dest, int32_t destCapacity,
173 UConverter *cnv,
174 UErrorCode &errorCode) const
175 {
176 if(U_FAILURE(errorCode)) {
177 return 0;
178 }
179
180 if(isBogus() || destCapacity<0 || (destCapacity>0 && dest==0)) {
181 errorCode=U_ILLEGAL_ARGUMENT_ERROR;
182 return 0;
183 }
184
185 // nothing to do?
186 if(fLength<=0) {
187 return u_terminateChars(dest, destCapacity, 0, &errorCode);
188 }
189
190 // get the converter
191 UBool isDefaultConverter;
192 if(cnv==0) {
193 isDefaultConverter=TRUE;
194 cnv=u_getDefaultConverter(&errorCode);
195 if(U_FAILURE(errorCode)) {
196 return 0;
197 }
198 } else {
199 isDefaultConverter=FALSE;
200 ucnv_resetFromUnicode(cnv);
201 }
202
203 // convert
204 int32_t length=doExtract(0, fLength, dest, destCapacity, cnv, errorCode);
205
206 // release the converter
207 if(isDefaultConverter) {
208 u_releaseDefaultConverter(cnv);
209 }
210
211 return length;
212 }
213
214 int32_t
doExtract(int32_t start,int32_t length,char * dest,int32_t destCapacity,UConverter * cnv,UErrorCode & errorCode) const215 UnicodeString::doExtract(int32_t start, int32_t length,
216 char *dest, int32_t destCapacity,
217 UConverter *cnv,
218 UErrorCode &errorCode) const
219 {
220 if(U_FAILURE(errorCode)) {
221 if(destCapacity!=0) {
222 *dest=0;
223 }
224 return 0;
225 }
226
227 const UChar *src=fArray+start, *srcLimit=src+length;
228 char *originalDest=dest;
229 const char *destLimit;
230
231 if(destCapacity==0) {
232 destLimit=dest=0;
233 } else if(destCapacity==-1) {
234 // Pin the limit to U_MAX_PTR if the "magic" destCapacity is used.
235 destLimit=(char*)U_MAX_PTR(dest);
236 // for NUL-termination, translate into highest int32_t
237 destCapacity=0x7fffffff;
238 } else {
239 destLimit=dest+destCapacity;
240 }
241
242 // perform the conversion
243 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
244 length=(int32_t)(dest-originalDest);
245
246 // if an overflow occurs, then get the preflighting length
247 if(errorCode==U_BUFFER_OVERFLOW_ERROR) {
248 char buffer[1024];
249
250 destLimit=buffer+sizeof(buffer);
251 do {
252 dest=buffer;
253 errorCode=U_ZERO_ERROR;
254 ucnv_fromUnicode(cnv, &dest, destLimit, &src, srcLimit, 0, TRUE, &errorCode);
255 length+=(int32_t)(dest-buffer);
256 } while(errorCode==U_BUFFER_OVERFLOW_ERROR);
257 }
258
259 return u_terminateChars(originalDest, destCapacity, length, &errorCode);
260 }
261
262 void
doCodepageCreate(const char * codepageData,int32_t dataLength,const char * codepage)263 UnicodeString::doCodepageCreate(const char *codepageData,
264 int32_t dataLength,
265 const char *codepage)
266 {
267 // if there's nothing to convert, do nothing
268 if(codepageData == 0 || dataLength == 0 || dataLength < -1) {
269 return;
270 }
271 if(dataLength == -1) {
272 dataLength = (int32_t)uprv_strlen(codepageData);
273 }
274
275 UErrorCode status = U_ZERO_ERROR;
276
277 // create the converter
278 // if the codepage is the default, use our cache
279 // if it is an empty string, then use the "invariant character" conversion
280 UConverter *converter = (codepage == 0 ?
281 u_getDefaultConverter(&status) :
282 *codepage == 0 ?
283 0 :
284 ucnv_open(codepage, &status));
285
286 // if we failed, set the appropriate flags and return
287 if(U_FAILURE(status)) {
288 setToBogus();
289 return;
290 }
291
292 // perform the conversion
293 if(converter == 0) {
294 // use the "invariant characters" conversion
295 if(cloneArrayIfNeeded(dataLength, dataLength, FALSE)) {
296 u_charsToUChars(codepageData, getArrayStart(), dataLength);
297 fLength = dataLength;
298 } else {
299 setToBogus();
300 }
301 return;
302 }
303
304 // convert using the real converter
305 doCodepageCreate(codepageData, dataLength, converter, status);
306 if(U_FAILURE(status)) {
307 setToBogus();
308 }
309
310 // close the converter
311 if(codepage == 0) {
312 u_releaseDefaultConverter(converter);
313 } else {
314 ucnv_close(converter);
315 }
316 }
317
318 void
doCodepageCreate(const char * codepageData,int32_t dataLength,UConverter * converter,UErrorCode & status)319 UnicodeString::doCodepageCreate(const char *codepageData,
320 int32_t dataLength,
321 UConverter *converter,
322 UErrorCode &status)
323 {
324 if(U_FAILURE(status)) {
325 return;
326 }
327
328 // set up the conversion parameters
329 const char *mySource = codepageData;
330 const char *mySourceEnd = mySource + dataLength;
331 UChar *myTarget;
332
333 // estimate the size needed:
334 // 1.25 UChar's per source byte should cover most cases
335 int32_t arraySize = dataLength + (dataLength >> 2);
336
337 // we do not care about the current contents
338 UBool doCopyArray = FALSE;
339 for(;;) {
340 if(!cloneArrayIfNeeded(arraySize, arraySize, doCopyArray)) {
341 setToBogus();
342 break;
343 }
344
345 // perform the conversion
346 myTarget = fArray + fLength;
347 ucnv_toUnicode(converter, &myTarget, fArray + fCapacity,
348 &mySource, mySourceEnd, 0, TRUE, &status);
349
350 // update the conversion parameters
351 fLength = (int32_t)(myTarget - fArray);
352
353 // allocate more space and copy data, if needed
354 if(status == U_BUFFER_OVERFLOW_ERROR) {
355 // reset the error code
356 status = U_ZERO_ERROR;
357
358 // keep the previous conversion results
359 doCopyArray = TRUE;
360
361 // estimate the new size needed, larger than before
362 // try 2 UChar's per remaining source byte
363 arraySize = (int32_t)(fLength + 2 * (mySourceEnd - mySource));
364 } else {
365 break;
366 }
367 }
368 }
369
370 U_NAMESPACE_END
371
372 #endif
373