1 // Common/StringConvert.cpp
2
3 #include "StdAfx.h"
4
5 #include "StringConvert.h"
6
7 #ifndef _WIN32
8 // #include <stdio.h>
9 #include <stdlib.h>
10 #endif
11
12 #if !defined(_WIN32) || defined(ENV_HAVE_LOCALE)
13 #include "UTFConvert.h"
14 #endif
15
16 #ifdef ENV_HAVE_LOCALE
17 #include <locale.h>
18 #endif
19
// Replacement character that is passed as (defaultChar) by the
// UnicodeStringToMultiByte*() overloads that have no (defaultChar) parameter.
static const char k_DefultChar = '_';
21
22 #ifdef _WIN32
23
24 /*
25 MultiByteToWideChar(CodePage, DWORD dwFlags,
26 LPCSTR lpMultiByteStr, int cbMultiByte,
27 LPWSTR lpWideCharStr, int cchWideChar)
28
29 if (cbMultiByte == 0)
30 return: 0. ERR: ERROR_INVALID_PARAMETER
31
32 if (cchWideChar == 0)
33 return: the required buffer size in characters.
34
35 if (supplied buffer size was not large enough)
36 return: 0. ERR: ERROR_INSUFFICIENT_BUFFER
37 The number of filled characters in lpWideCharStr can be smaller than cchWideChar (if last character is complex)
38
39 If there are illegal characters:
40 if MB_ERR_INVALID_CHARS is set in dwFlags:
41 - the function stops conversion on illegal character.
42 - Return: 0. ERR: ERROR_NO_UNICODE_TRANSLATION.
43
44 if MB_ERR_INVALID_CHARS is NOT set in dwFlags:
45 before Vista: illegal character is dropped (skipped). WinXP-64: GetLastError() returns 0.
46 in Vista+: illegal character is not dropped (MSDN). Undocumented: illegal
47 character is converted to U+FFFD, which is REPLACEMENT CHARACTER.
48 */
49
50
MultiByteToUnicodeString2(UString & dest,const AString & src,UINT codePage)51 void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
52 {
53 dest.Empty();
54 if (src.IsEmpty())
55 return;
56 {
57 /*
58 wchar_t *d = dest.GetBuf(src.Len());
59 const char *s = (const char *)src;
60 unsigned i;
61
62 for (i = 0;;)
63 {
64 Byte c = (Byte)s[i];
65 if (c >= 0x80 || c == 0)
66 break;
67 d[i++] = (wchar_t)c;
68 }
69
70 if (i != src.Len())
71 {
72 unsigned len = MultiByteToWideChar(codePage, 0, s + i,
73 src.Len() - i, d + i,
74 src.Len() + 1 - i);
75 if (len == 0)
76 throw 282228;
77 i += len;
78 }
79
80 d[i] = 0;
81 dest.ReleaseBuf_SetLen(i);
82 */
83 unsigned len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), NULL, 0);
84 if (len == 0)
85 {
86 if (GetLastError() != 0)
87 throw 282228;
88 }
89 else
90 {
91 len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), dest.GetBuf(len), (int)len);
92 if (len == 0)
93 throw 282228;
94 dest.ReleaseBuf_SetEnd(len);
95 }
96 }
97 }
98
99 /*
100 int WideCharToMultiByte(
101 UINT CodePage, DWORD dwFlags,
102 LPCWSTR lpWideCharStr, int cchWideChar,
103 LPSTR lpMultiByteStr, int cbMultiByte,
104 LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar);
105
106 if (lpDefaultChar == NULL),
107 - it uses system default value.
108
109 if (CodePage == CP_UTF7 || CodePage == CP_UTF8)
110 if (lpDefaultChar != NULL || lpUsedDefaultChar != NULL)
111 return: 0. ERR: ERROR_INVALID_PARAMETER.
112
113 The function operates most efficiently, if (lpDefaultChar == NULL && lpUsedDefaultChar == NULL)
114
115 */
116
UnicodeStringToMultiByte2(AString & dest,const UString & src,UINT codePage,char defaultChar,bool & defaultCharWasUsed)117 static void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
118 {
119 dest.Empty();
120 defaultCharWasUsed = false;
121 if (src.IsEmpty())
122 return;
123 {
124 /*
125 unsigned numRequiredBytes = src.Len() * 2;
126 char *d = dest.GetBuf(numRequiredBytes);
127 const wchar_t *s = (const wchar_t *)src;
128 unsigned i;
129
130 for (i = 0;;)
131 {
132 wchar_t c = s[i];
133 if (c >= 0x80 || c == 0)
134 break;
135 d[i++] = (char)c;
136 }
137
138 if (i != src.Len())
139 {
140 BOOL defUsed = FALSE;
141 defaultChar = defaultChar;
142
143 bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7);
144 unsigned len = WideCharToMultiByte(codePage, 0, s + i, src.Len() - i,
145 d + i, numRequiredBytes + 1 - i,
146 (isUtf ? NULL : &defaultChar),
147 (isUtf ? NULL : &defUsed));
148 defaultCharWasUsed = (defUsed != FALSE);
149 if (len == 0)
150 throw 282229;
151 i += len;
152 }
153
154 d[i] = 0;
155 dest.ReleaseBuf_SetLen(i);
156 */
157
158 /*
159 if (codePage != CP_UTF7)
160 {
161 const wchar_t *s = (const wchar_t *)src;
162 unsigned i;
163 for (i = 0;; i++)
164 {
165 wchar_t c = s[i];
166 if (c >= 0x80 || c == 0)
167 break;
168 }
169
170 if (s[i] == 0)
171 {
172 char *d = dest.GetBuf(src.Len());
173 for (i = 0;;)
174 {
175 wchar_t c = s[i];
176 if (c == 0)
177 break;
178 d[i++] = (char)c;
179 }
180 d[i] = 0;
181 dest.ReleaseBuf_SetLen(i);
182 return;
183 }
184 }
185 */
186
187 unsigned len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(), NULL, 0, NULL, NULL);
188 if (len == 0)
189 {
190 if (GetLastError() != 0)
191 throw 282228;
192 }
193 else
194 {
195 BOOL defUsed = FALSE;
196 bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7);
197 // defaultChar = defaultChar;
198 len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(),
199 dest.GetBuf(len), (int)len,
200 (isUtf ? NULL : &defaultChar),
201 (isUtf ? NULL : &defUsed)
202 );
203 if (!isUtf)
204 defaultCharWasUsed = (defUsed != FALSE);
205 if (len == 0)
206 throw 282228;
207 dest.ReleaseBuf_SetEnd(len);
208 }
209 }
210 }
211
212 /*
213 #ifndef UNDER_CE
214 AString SystemStringToOemString(const CSysString &src)
215 {
216 AString dest;
217 const unsigned len = src.Len() * 2;
218 CharToOem(src, dest.GetBuf(len));
219 dest.ReleaseBuf_CalcLen(len);
220 return dest;
221 }
222 #endif
223 */
224
225 #else // _WIN32
226
227 // #include <stdio.h>
/*
  If wchar_t is 32-bit (#if WCHAR_MAX > 0xffff)
  and the utf-8 string contains a big unicode character (> 0xffff),
  then we still use a 16-bit surrogate pair in UString.
  It simplifies other code where utf-16 encoding is used.
  So we use surrogate-conversion code only in this file.
*/
235
/*
  mbstowcs() returns an error if there is an error in the utf-8 stream,
  mbstowcs() returns an error if there is a single surrogate code point (d800-dfff) in the utf-8 stream
*/
240
241 /*
242 static void MultiByteToUnicodeString2_Native(UString &dest, const AString &src)
243 {
244 dest.Empty();
245 if (src.IsEmpty())
246 return;
247
248 const size_t limit = ((size_t)src.Len() + 1) * 2;
249 wchar_t *d = dest.GetBuf((unsigned)limit);
250 const size_t len = mbstowcs(d, src, limit);
251 if (len != (size_t)-1)
252 {
253 dest.ReleaseBuf_SetEnd((unsigned)len);
254 return;
255 }
256 dest.ReleaseBuf_SetEnd(0);
257 }
258 */
259
260 bool g_ForceToUTF8 = true; // false;
261
MultiByteToUnicodeString2(UString & dest,const AString & src,UINT codePage)262 void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
263 {
264 dest.Empty();
265 if (src.IsEmpty())
266 return;
267
268 if (codePage == CP_UTF8 || g_ForceToUTF8)
269 {
270 ConvertUTF8ToUnicode(src, dest);
271 return;
272 }
273
274 const size_t limit = ((size_t)src.Len() + 1) * 2;
275 wchar_t *d = dest.GetBuf((unsigned)limit);
276 const size_t len = mbstowcs(d, src, limit);
277 if (len != (size_t)-1)
278 {
279 dest.ReleaseBuf_SetEnd((unsigned)len);
280
281 #if WCHAR_MAX > 0xffff
282 d = dest.GetBuf();
283 for (size_t i = 0;; i++)
284 {
285 // wchar_t c = dest[i];
286 wchar_t c = d[i];
287 if (c == 0)
288 break;
289 if (c >= 0x10000 && c < 0x110000)
290 {
291 /*
292 c -= 0x10000;
293 unsigned c0 = 0xd800 + ((c >> 10) & 0x3FF);
294 dest.ReplaceOneCharAtPos(i, c0);
295 i++;
296 c = 0xdc00 + (c & 0x3FF);
297 dest.Insert_wchar_t(i, c);
298 */
299 UString temp = d + i;
300
301 for (size_t t = 0;; t++)
302 {
303 wchar_t w = temp[t];
304 if (w == 0)
305 break;
306 if (i == limit)
307 break; // unexpected error
308 if (w >= 0x10000 && w < 0x110000)
309 {
310 if (i + 1 == limit)
311 break; // unexpected error
312 w -= 0x10000;
313 d[i++] = (unsigned)0xd800 + (((unsigned)w >> 10) & 0x3FF);
314 w = 0xdc00 + (w & 0x3FF);
315 }
316 d[i++] = w;
317 }
318 dest.ReleaseBuf_SetEnd((unsigned)i);
319 }
320 }
321
322 #endif
323
324 /*
325 printf("\nMultiByteToUnicodeString2 (%d) %s\n", (int)src.Len(), src.Ptr());
326 printf("char: ");
327 for (unsigned i = 0; i < src.Len(); i++)
328 printf (" %02x", (int)(Byte)src[i]);
329 printf("\n");
330 printf("\n-> (%d) %ls\n", (int)dest.Len(), dest.Ptr());
331 printf("wchar_t: ");
332 for (unsigned i = 0; i < dest.Len(); i++)
333 {
334 printf (" %02x", (int)dest[i]);
335 }
336 printf("\n");
337 */
338
339 return;
340 }
341
342 /* if there is mbstowcs() error, we have two ways:
343
344 1) change 0x80+ characters to some character: '_'
345 in that case we lose data, but we have correct UString()
346 and that scheme can show errors to user in early stages,
347 when file converted back to mbs() cannot be found
348
349 2) transfer bad characters in some UTF-16 range.
350 it can be non-original Unicode character.
351 but later we still can restore original character.
352 */
353
354
355 // printf("\nmbstowcs ERROR !!!!!! s=%s\n", src.Ptr());
356 {
357 unsigned i;
358 const char *s = (const char *)src;
359 for (i = 0;;)
360 {
361 Byte c = (Byte)s[i];
362 if (c == 0)
363 break;
364 // we can use ascii compatibilty character '_'
365 // if (c > 0x7F) c = '_'; // we replace "bad: character
366 d[i++] = (wchar_t)c;
367 }
368 d[i] = 0;
369 dest.ReleaseBuf_SetLen(i);
370 }
371 }
372
UnicodeStringToMultiByte2_Native(AString & dest,const UString & src)373 static void UnicodeStringToMultiByte2_Native(AString &dest, const UString &src)
374 {
375 dest.Empty();
376 if (src.IsEmpty())
377 return;
378
379 const size_t limit = ((size_t)src.Len() + 1) * 6;
380 char *d = dest.GetBuf((unsigned)limit);
381
382 const size_t len = wcstombs(d, src, limit);
383
384 if (len != (size_t)-1)
385 {
386 dest.ReleaseBuf_SetEnd((unsigned)len);
387 return;
388 }
389 dest.ReleaseBuf_SetEnd(0);
390 }
391
392
UnicodeStringToMultiByte2(AString & dest,const UString & src2,UINT codePage,char defaultChar,bool & defaultCharWasUsed)393 static void UnicodeStringToMultiByte2(AString &dest, const UString &src2, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
394 {
395 // if (codePage == 1234567) // for debug purposes
396 if (codePage == CP_UTF8 || g_ForceToUTF8)
397 {
398 defaultCharWasUsed = false;
399 ConvertUnicodeToUTF8(src2, dest);
400 return;
401 }
402
403 UString src = src2;
404 #if WCHAR_MAX > 0xffff
405 {
406 src.Empty();
407 for (unsigned i = 0; i < src2.Len();)
408 {
409 wchar_t c = src2[i];
410 if (c >= 0xd800 && c < 0xdc00 && i + 1 != src2.Len())
411 {
412 const wchar_t c2 = src2[i + 1];
413 if (c2 >= 0xdc00 && c2 < 0x10000)
414 {
415 // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2);
416 c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
417 // printf("%4x\n", (int)c);
418 i++;
419 }
420 }
421 src += c;
422 i++;
423 }
424 }
425 #endif
426
427 dest.Empty();
428 defaultCharWasUsed = false;
429 if (src.IsEmpty())
430 return;
431
432 const size_t len = wcstombs(NULL, src, 0);
433
434 if (len != (size_t)-1)
435 {
436 const unsigned limit = ((unsigned)len);
437 if (limit == len)
438 {
439 char *d = dest.GetBuf(limit);
440
441 /*
442 {
443 printf("\nwcstombs; len = %d %ls \n", (int)src.Len(), src.Ptr());
444 for (unsigned i = 0; i < src.Len(); i++)
445 printf (" %02x", (int)src[i]);
446 printf("\n");
447 printf("\ndest Limit = %d \n", limit);
448 }
449 */
450
451 const size_t len2 = wcstombs(d, src, len + 1);
452
453 if (len2 != (size_t)-1 && len2 <= limit)
454 {
455 /*
456 printf("\nOK : destLen = %d : %s\n", (int)len, dest.Ptr());
457 for (unsigned i = 0; i < len2; i++)
458 printf(" %02x", (int)(Byte)dest[i]);
459 printf("\n");
460 */
461 dest.ReleaseBuf_SetEnd((unsigned)len2);
462 return;
463 }
464 }
465 }
466
467 {
468 const wchar_t *s = (const wchar_t *)src;
469 char *d = dest.GetBuf(src.Len());
470
471 unsigned i;
472 for (i = 0;;)
473 {
474 wchar_t c = s[i];
475 if (c == 0)
476 break;
477 if (c >=
478 0x100
479 // 0x80
480 )
481 {
482 c = defaultChar;
483 defaultCharWasUsed = true;
484 }
485
486 d[i++] = (char)c;
487 }
488 d[i] = 0;
489 dest.ReleaseBuf_SetLen(i);
490 /*
491 printf("\nUnicodeStringToMultiByte2; len = %d \n", (int)src.Len());
492 printf("ERROR: %s\n", dest.Ptr());
493 */
494 }
495 }
496
497 #endif // _WIN32
498
499
MultiByteToUnicodeString(const AString & src,UINT codePage)500 UString MultiByteToUnicodeString(const AString &src, UINT codePage)
501 {
502 UString dest;
503 MultiByteToUnicodeString2(dest, src, codePage);
504 return dest;
505 }
506
MultiByteToUnicodeString(const char * src,UINT codePage)507 UString MultiByteToUnicodeString(const char *src, UINT codePage)
508 {
509 return MultiByteToUnicodeString(AString(src), codePage);
510 }
511
512
UnicodeStringToMultiByte2(AString & dest,const UString & src,UINT codePage)513 void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage)
514 {
515 bool defaultCharWasUsed;
516 UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed);
517 }
518
UnicodeStringToMultiByte(const UString & src,UINT codePage,char defaultChar,bool & defaultCharWasUsed)519 AString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
520 {
521 AString dest;
522 UnicodeStringToMultiByte2(dest, src, codePage, defaultChar, defaultCharWasUsed);
523 return dest;
524 }
525
UnicodeStringToMultiByte(const UString & src,UINT codePage)526 AString UnicodeStringToMultiByte(const UString &src, UINT codePage)
527 {
528 AString dest;
529 bool defaultCharWasUsed;
530 UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed);
531 return dest;
532 }
533
534
535
536
537 #if !defined(_WIN32) || defined(ENV_HAVE_LOCALE)
538
539 #ifdef _WIN32
540 #define U_to_A(a, b, c) UnicodeStringToMultiByte2
541 // #define A_to_U(a, b, c) MultiByteToUnicodeString2
542 #else
543 // void MultiByteToUnicodeString2_Native(UString &dest, const AString &src);
544 #define U_to_A(a, b, c) UnicodeStringToMultiByte2_Native(a, b)
545 // #define A_to_U(a, b, c) MultiByteToUnicodeString2_Native(a, b)
546 #endif
547
IsNativeUTF8()548 bool IsNativeUTF8()
549 {
550 UString u;
551 AString a, a2;
552 // for (unsigned c = 0x80; c < (UInt32)0x10000; c += (c >> 9) + 1)
553 for (unsigned c = 0x80; c < (UInt32)0xD000; c += (c >> 2) + 1)
554 {
555 u.Empty();
556 u += (wchar_t)c;
557 /*
558 if (Unicode_Is_There_Utf16SurrogateError(u))
559 continue;
560 #ifndef _WIN32
561 if (Unicode_Is_There_BmpEscape(u))
562 continue;
563 #endif
564 */
565 ConvertUnicodeToUTF8(u, a);
566 U_to_A(a2, u, CP_OEMCP);
567 if (a != a2)
568 return false;
569 }
570 return true;
571 }
572
573 #endif
574
575
576 #ifdef ENV_HAVE_LOCALE
577
// Returns the name of current LC_CTYPE locale of the program.
// The locale is only requested here, it is not changed.
// "C" is returned, if setlocale() returns NULL.
const char *GetLocale(void)
{
  #ifdef ENV_HAVE_LOCALE
  // setlocale() with NULL locale is a query.
  // ubuntu returns "C" after program start
  const char * const current = setlocale(LC_CTYPE, NULL);
  return current ? current : "C";
  #elif defined(LOCALE_IS_UTF8)
  return "utf8";
  #else
  return "C";
  #endif
}
600
601 #ifdef _WIN32
Set_ForceToUTF8(bool)602 static void Set_ForceToUTF8(bool) {}
603 #else
Set_ForceToUTF8(bool val)604 static void Set_ForceToUTF8(bool val) { g_ForceToUTF8 = val; }
605 #endif
606
Is_Default_Basic_Locale(const char * locale)607 static bool Is_Default_Basic_Locale(const char *locale)
608 {
609 const AString a (locale);
610 if (a.IsEqualTo_Ascii_NoCase("")
611 || a.IsEqualTo_Ascii_NoCase("C")
612 || a.IsEqualTo_Ascii_NoCase("POSIX"))
613 return true;
614 return false;
615 }
616
Is_Default_Basic_Locale()617 static bool Is_Default_Basic_Locale()
618 {
619 return Is_Default_Basic_Locale(GetLocale());
620 }
621
622
// Selects the locale of the program and sets g_ForceToUTF8 flag.
// There are up to 3 attempts of setlocale(LC_ALL, ...):
//   0) ""  : locale from the environment
//   1) UTF-8 locale name ("C.UTF-8", or "en_US.UTF-8" for __APPLE__)
//   2) ""  : locale from the environment again
// After each attempt:
//   - if IsNativeUTF8() is true, UTF-8 is forced, and the function returns.
//   - if current locale is not a basic locale ("", "C", "POSIX"),
//     we keep that locale and stop the attempts.
// After the attempts: UTF-8 is forced, if IsNativeUTF8() is true or
// current locale is a basic locale. Otherwise UTF-8 is not forced.
void MY_SetLocale()
{
  #ifdef ENV_HAVE_LOCALE

  #ifdef __APPLE__
  /* look also CFLocale
     there is no C.UTF-8 in macos
     macos has UTF-8 locale only with some language like en_US.UTF-8
     what is best way to set UTF-8 locale in macos?
     file open with non-utf8 sequences returns
       #define EILSEQ 92 // "Illegal byte sequence"
  */
  const char * const utf8LocaleName = "en_US.UTF-8";
  #else
  /* "C.UTF-8" is main UTF-8 locale in ubuntu.
     setlocale() in ubuntu allows locales with minor character changes in strings:
       "en_US.UTF-8" / "en_US.utf8"
     ".utf8" is supported in new Windows 10 build 17134 (April 2018 Update),
     the Universal C Runtime */
  const char * const utf8LocaleName = "C.UTF-8";
  #endif

  const unsigned kNumAttempts = 3;

  for (unsigned attempt = 0; attempt != kNumAttempts; attempt++)
  {
    /*
      man7: if locale is an empty string "", each part of the locale that
      should be modified is set according to the environment variables.
      glibc checks the environment variables in the following order:
        1) the environment variable LC_ALL,
        2) the environment variable with the same name as the category,
        3) the environment variable LANG
      The locale "C" or "POSIX" is a portable locale; it exists on all conforming systems.

      for WIN32 : MSDN :
        Sets the locale to the default, which is the user-default
        ANSI code page obtained from the operating system.
        The locale name is set to the value returned by GetUserDefaultLocaleName.
        The code page is set to the value returned by GetACP
    */
    const char * const newLocale = (attempt == 1) ? utf8LocaleName : "";

    // the result of setlocale() is not checked:
    // we request the current locale of the program after the call instead
    setlocale(LC_ALL, newLocale);

    const char * const locale = GetLocale();
    if (!locale)
      continue;

    {
      // lower-case copy of the name. It's not used now:
      // the check (a.Find("utf") >= 0) is disabled.
      AString a (locale);
      a.MakeLower_Ascii();
    }

    if (IsNativeUTF8())
    {
      Set_ForceToUTF8(true);
      return;
    }

    // if there is some non-default and non-utf locale, we want to use it
    if (!Is_Default_Basic_Locale(locale))
      break; // comment it for debug
  }

  // the order of calls is important: IsNativeUTF8() is checked first
  const bool forceUtf8 = IsNativeUTF8() || Is_Default_Basic_Locale();
  Set_ForceToUTF8(forceUtf8);

  #elif defined(LOCALE_IS_UTF8)
    // assume LC_CTYPE="utf8"
  #else
    // assume LC_CTYPE="C"
  #endif
}
756 #endif
757