1 // Common/StringConvert.cpp
2
3 #include "StdAfx.h"
4
5 #include "StringConvert.h"
6
7 #ifndef _WIN32
8 // #include <stdio.h>
9 #include <stdlib.h>
10 #endif
11
12 #if !defined(_WIN32) || defined(ENV_HAVE_LOCALE)
13 #include "UTFConvert.h"
14 #endif
15
16 #ifdef ENV_HAVE_LOCALE
17 #include <locale.h>
18 #endif
19
// Replacement character passed as defaultChar by the overloads in this file
// that do not take a defaultChar argument from the caller.
static const char k_DefultChar = '_';
21
22 #ifdef _WIN32
23
24 /*
25 MultiByteToWideChar(CodePage, DWORD dwFlags,
26 LPCSTR lpMultiByteStr, int cbMultiByte,
27 LPWSTR lpWideCharStr, int cchWideChar)
28
29 if (cbMultiByte == 0)
30 return: 0. ERR: ERROR_INVALID_PARAMETER
31
32 if (cchWideChar == 0)
33 return: the required buffer size in characters.
34
35 if (supplied buffer size was not large enough)
36 return: 0. ERR: ERROR_INSUFFICIENT_BUFFER
37 The number of filled characters in lpWideCharStr can be smaller than cchWideChar (if last character is complex)
38
39 If there are illegal characters:
40 if MB_ERR_INVALID_CHARS is set in dwFlags:
41 - the function stops conversion on illegal character.
42 - Return: 0. ERR: ERROR_NO_UNICODE_TRANSLATION.
43
44 if MB_ERR_INVALID_CHARS is NOT set in dwFlags:
45 before Vista: illegal character is dropped (skipped). WinXP-64: GetLastError() returns 0.
46 in Vista+: illegal character is not dropped (MSDN). Undocumented: illegal
47 character is converted to U+FFFD, which is REPLACEMENT CHARACTER.
48 */
49
50
MultiByteToUnicodeString2(UString & dest,const AString & src,UINT codePage)51 void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
52 {
53 dest.Empty();
54 if (src.IsEmpty())
55 return;
56 {
57 /*
58 wchar_t *d = dest.GetBuf(src.Len());
59 const char *s = (const char *)src;
60 unsigned i;
61
62 for (i = 0;;)
63 {
64 Byte c = (Byte)s[i];
65 if (c >= 0x80 || c == 0)
66 break;
67 d[i++] = (wchar_t)c;
68 }
69
70 if (i != src.Len())
71 {
72 unsigned len = MultiByteToWideChar(codePage, 0, s + i,
73 src.Len() - i, d + i,
74 src.Len() + 1 - i);
75 if (len == 0)
76 throw 282228;
77 i += len;
78 }
79
80 d[i] = 0;
81 dest.ReleaseBuf_SetLen(i);
82 */
83 unsigned len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), NULL, 0);
84 if (len == 0)
85 {
86 if (GetLastError() != 0)
87 throw 282228;
88 }
89 else
90 {
91 len = (unsigned)MultiByteToWideChar(codePage, 0, src, (int)src.Len(), dest.GetBuf(len), (int)len);
92 if (len == 0)
93 throw 282228;
94 dest.ReleaseBuf_SetEnd(len);
95 }
96 }
97 }
98
99 /*
100 int WideCharToMultiByte(
101 UINT CodePage, DWORD dwFlags,
102 LPCWSTR lpWideCharStr, int cchWideChar,
103 LPSTR lpMultiByteStr, int cbMultiByte,
104 LPCSTR lpDefaultChar, LPBOOL lpUsedDefaultChar);
105
106 if (lpDefaultChar == NULL),
107 - it uses system default value.
108
109 if (CodePage == CP_UTF7 || CodePage == CP_UTF8)
110 if (lpDefaultChar != NULL || lpUsedDefaultChar != NULL)
111 return: 0. ERR: ERROR_INVALID_PARAMETER.
112
113 The function operates most efficiently, if (lpDefaultChar == NULL && lpUsedDefaultChar == NULL)
114
115 */
116
UnicodeStringToMultiByte2(AString & dest,const UString & src,UINT codePage,char defaultChar,bool & defaultCharWasUsed)117 static void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
118 {
119 dest.Empty();
120 defaultCharWasUsed = false;
121 if (src.IsEmpty())
122 return;
123 {
124 /*
125 unsigned numRequiredBytes = src.Len() * 2;
126 char *d = dest.GetBuf(numRequiredBytes);
127 const wchar_t *s = (const wchar_t *)src;
128 unsigned i;
129
130 for (i = 0;;)
131 {
132 wchar_t c = s[i];
133 if (c >= 0x80 || c == 0)
134 break;
135 d[i++] = (char)c;
136 }
137
138 if (i != src.Len())
139 {
140 BOOL defUsed = FALSE;
141 defaultChar = defaultChar;
142
143 bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7);
144 unsigned len = WideCharToMultiByte(codePage, 0, s + i, src.Len() - i,
145 d + i, numRequiredBytes + 1 - i,
146 (isUtf ? NULL : &defaultChar),
147 (isUtf ? NULL : &defUsed));
148 defaultCharWasUsed = (defUsed != FALSE);
149 if (len == 0)
150 throw 282229;
151 i += len;
152 }
153
154 d[i] = 0;
155 dest.ReleaseBuf_SetLen(i);
156 */
157
158 /*
159 if (codePage != CP_UTF7)
160 {
161 const wchar_t *s = (const wchar_t *)src;
162 unsigned i;
163 for (i = 0;; i++)
164 {
165 wchar_t c = s[i];
166 if (c >= 0x80 || c == 0)
167 break;
168 }
169
170 if (s[i] == 0)
171 {
172 char *d = dest.GetBuf(src.Len());
173 for (i = 0;;)
174 {
175 wchar_t c = s[i];
176 if (c == 0)
177 break;
178 d[i++] = (char)c;
179 }
180 d[i] = 0;
181 dest.ReleaseBuf_SetLen(i);
182 return;
183 }
184 }
185 */
186
187 unsigned len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(), NULL, 0, NULL, NULL);
188 if (len == 0)
189 {
190 if (GetLastError() != 0)
191 throw 282228;
192 }
193 else
194 {
195 BOOL defUsed = FALSE;
196 bool isUtf = (codePage == CP_UTF8 || codePage == CP_UTF7);
197 // defaultChar = defaultChar;
198 len = (unsigned)WideCharToMultiByte(codePage, 0, src, (int)src.Len(),
199 dest.GetBuf(len), (int)len,
200 (isUtf ? NULL : &defaultChar),
201 (isUtf ? NULL : &defUsed)
202 );
203 if (!isUtf)
204 defaultCharWasUsed = (defUsed != FALSE);
205 if (len == 0)
206 throw 282228;
207 dest.ReleaseBuf_SetEnd(len);
208 }
209 }
210 }
211
212 /*
213 #ifndef UNDER_CE
214 AString SystemStringToOemString(const CSysString &src)
215 {
216 AString dest;
217 const unsigned len = src.Len() * 2;
218 CharToOem(src, dest.GetBuf(len));
219 dest.ReleaseBuf_CalcLen(len);
220 return dest;
221 }
222 #endif
223 */
224
225 #else // _WIN32
226
227 // #include <stdio.h>
/*
  If wchar_t is 32-bit (#if WCHAR_MAX > 0xffff)
  and the utf-8 string contains a big unicode character (> 0xffff),
  then we still use a 16-bit surrogate pair in UString.
  It simplifies other code where utf-16 encoding is used.
  So we use surrogate-conversion code only in this file.
*/
235
/*
  mbstowcs() returns an error if there is an error in the utf-8 stream,
  mbstowcs() returns an error if there is a single surrogate code point (d800-dfff) in the utf-8 stream
*/
240
241 /*
242 static void MultiByteToUnicodeString2_Native(UString &dest, const AString &src)
243 {
244 dest.Empty();
245 if (src.IsEmpty())
246 return;
247
248 const size_t limit = ((size_t)src.Len() + 1) * 2;
249 wchar_t *d = dest.GetBuf((unsigned)limit);
250 const size_t len = mbstowcs(d, src, limit);
251 if (len != (size_t)-1)
252 {
253 dest.ReleaseBuf_SetEnd((unsigned)len);
254 return;
255 }
256 dest.ReleaseBuf_SetEnd(0);
257 }
258 */
259
// If it is true, the non-Windows MultiByteToUnicodeString2() and
// UnicodeStringToMultiByte2() use the UTF-8 converters for any codePage.
bool g_ForceToUTF8 = true; // false;
261
MultiByteToUnicodeString2(UString & dest,const AString & src,UINT codePage)262 void MultiByteToUnicodeString2(UString &dest, const AString &src, UINT codePage)
263 {
264 dest.Empty();
265 if (src.IsEmpty())
266 return;
267
268 if (codePage == CP_UTF8 || g_ForceToUTF8)
269 {
270 ConvertUTF8ToUnicode(src, dest);
271 return;
272 }
273
274 const size_t limit = ((size_t)src.Len() + 1) * 2;
275 wchar_t *d = dest.GetBuf((unsigned)limit);
276 const size_t len = mbstowcs(d, src, limit);
277 if (len != (size_t)-1)
278 {
279 dest.ReleaseBuf_SetEnd((unsigned)len);
280
281 #if WCHAR_MAX > 0xffff
282 d = dest.GetBuf();
283 for (size_t i = 0;; i++)
284 {
285 // wchar_t c = dest[i];
286 wchar_t c = d[i];
287 if (c == 0)
288 break;
289 if (c >= 0x10000 && c < 0x110000)
290 {
291 /*
292 c -= 0x10000;
293 unsigned c0 = 0xd800 + ((c >> 10) & 0x3FF);
294 dest.ReplaceOneCharAtPos(i, c0);
295 i++;
296 c = 0xdc00 + (c & 0x3FF);
297 dest.Insert_wchar_t(i, c);
298 */
299 UString temp = d + i;
300
301 for (size_t t = 0;; t++)
302 {
303 wchar_t w = temp[t];
304 if (w == 0)
305 break;
306 if (i == limit)
307 break; // unexpected error
308 if (w >= 0x10000 && w < 0x110000)
309 {
310 if (i + 1 == limit)
311 break; // unexpected error
312 w -= 0x10000;
313 d[i++] = (unsigned)0xd800 + (((unsigned)w >> 10) & 0x3FF);
314 w = 0xdc00 + (w & 0x3FF);
315 }
316 d[i++] = w;
317 }
318 dest.ReleaseBuf_SetEnd((unsigned)i);
319 }
320 }
321
322 #endif
323
324 /*
325 printf("\nMultiByteToUnicodeString2 (%d) %s\n", (int)src.Len(), src.Ptr());
326 printf("char: ");
327 for (unsigned i = 0; i < src.Len(); i++)
328 printf (" %02x", (int)(Byte)src[i]);
329 printf("\n");
330 printf("\n-> (%d) %ls\n", (int)dest.Len(), dest.Ptr());
331 printf("wchar_t: ");
332 for (unsigned i = 0; i < dest.Len(); i++)
333 {
334 printf (" %02x", (int)dest[i]);
335 }
336 printf("\n");
337 */
338
339 return;
340 }
341
342 /* if there is mbstowcs() error, we have two ways:
343
344 1) change 0x80+ characters to some character: '_'
345 in that case we lose data, but we have correct UString()
346 and that scheme can show errors to user in early stages,
347 when file converted back to mbs() cannot be found
348
349 2) transfer bad characters in some UTF-16 range.
350 it can be non-original Unicode character.
351 but later we still can restore original character.
352 */
353
354
355 // printf("\nmbstowcs ERROR !!!!!! s=%s\n", src.Ptr());
356 {
357 unsigned i;
358 const char *s = (const char *)src;
359 for (i = 0;;)
360 {
361 Byte c = (Byte)s[i];
362 if (c == 0)
363 break;
364 // we can use ascii compatibilty character '_'
365 // if (c > 0x7F) c = '_'; // we replace "bad: character
366 d[i++] = (wchar_t)c;
367 }
368 d[i] = 0;
369 dest.ReleaseBuf_SetLen(i);
370 }
371 }
372
UnicodeStringToMultiByte2_Native(AString & dest,const UString & src)373 static void UnicodeStringToMultiByte2_Native(AString &dest, const UString &src)
374 {
375 dest.Empty();
376 if (src.IsEmpty())
377 return;
378
379 const size_t limit = ((size_t)src.Len() + 1) * 6;
380 char *d = dest.GetBuf((unsigned)limit);
381
382 const size_t len = wcstombs(d, src, limit);
383
384 if (len != (size_t)-1)
385 {
386 dest.ReleaseBuf_SetEnd((unsigned)len);
387 return;
388 }
389 dest.ReleaseBuf_SetEnd(0);
390 }
391
392
UnicodeStringToMultiByte2(AString & dest,const UString & src2,UINT codePage,char defaultChar,bool & defaultCharWasUsed)393 static void UnicodeStringToMultiByte2(AString &dest, const UString &src2, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
394 {
395 // if (codePage == 1234567) // for debug purposes
396 if (codePage == CP_UTF8 || g_ForceToUTF8)
397 {
398 defaultCharWasUsed = false;
399 ConvertUnicodeToUTF8(src2, dest);
400 return;
401 }
402
403 UString src = src2;
404 #if WCHAR_MAX > 0xffff
405 {
406 src.Empty();
407 for (unsigned i = 0; i < src2.Len();)
408 {
409 wchar_t c = src2[i];
410 if (c >= 0xd800 && c < 0xdc00 && i + 1 != src2.Len())
411 {
412 const wchar_t c2 = src2[i + 1];
413 if (c2 >= 0xdc00 && c2 < 0x10000)
414 {
415 // printf("\nSurragate [%d]: %4x %4x -> ", i, (int)c, (int)c2);
416 c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff);
417 // printf("%4x\n", (int)c);
418 i++;
419 }
420 }
421 src += c;
422 i++;
423 }
424 }
425 #endif
426
427 dest.Empty();
428 defaultCharWasUsed = false;
429 if (src.IsEmpty())
430 return;
431
432 const size_t len = wcstombs(NULL, src, 0);
433
434 if (len != (size_t)-1)
435 {
436 const unsigned limit = ((unsigned)len);
437 if (limit == len)
438 {
439 char *d = dest.GetBuf(limit);
440
441 /*
442 {
443 printf("\nwcstombs; len = %d %ls \n", (int)src.Len(), src.Ptr());
444 for (unsigned i = 0; i < src.Len(); i++)
445 printf (" %02x", (int)src[i]);
446 printf("\n");
447 printf("\ndest Limit = %d \n", limit);
448 }
449 */
450
451 const size_t len2 = wcstombs(d, src, len + 1);
452
453 if (len2 != (size_t)-1 && len2 <= limit)
454 {
455 /*
456 printf("\nOK : destLen = %d : %s\n", (int)len, dest.Ptr());
457 for (unsigned i = 0; i < len2; i++)
458 printf(" %02x", (int)(Byte)dest[i]);
459 printf("\n");
460 */
461 dest.ReleaseBuf_SetEnd((unsigned)len2);
462 return;
463 }
464 }
465 }
466
467 {
468 const wchar_t *s = (const wchar_t *)src;
469 char *d = dest.GetBuf(src.Len());
470
471 unsigned i;
472 for (i = 0;;)
473 {
474 wchar_t c = s[i];
475 if (c == 0)
476 break;
477 if (c >=
478 0x100
479 // 0x80
480 )
481 {
482 c = defaultChar;
483 defaultCharWasUsed = true;
484 }
485
486 d[i++] = (char)c;
487 }
488 d[i] = 0;
489 dest.ReleaseBuf_SetLen(i);
490 /*
491 printf("\nUnicodeStringToMultiByte2; len = %d \n", (int)src.Len());
492 printf("ERROR: %s\n", dest.Ptr());
493 */
494 }
495 }
496
497 #endif // _WIN32
498
499
MultiByteToUnicodeString(const AString & src,UINT codePage)500 UString MultiByteToUnicodeString(const AString &src, UINT codePage)
501 {
502 UString dest;
503 MultiByteToUnicodeString2(dest, src, codePage);
504 return dest;
505 }
506
MultiByteToUnicodeString(const char * src,UINT codePage)507 UString MultiByteToUnicodeString(const char *src, UINT codePage)
508 {
509 return MultiByteToUnicodeString(AString(src), codePage);
510 }
511
512
UnicodeStringToMultiByte2(AString & dest,const UString & src,UINT codePage)513 void UnicodeStringToMultiByte2(AString &dest, const UString &src, UINT codePage)
514 {
515 bool defaultCharWasUsed;
516 UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed);
517 }
518
UnicodeStringToMultiByte(const UString & src,UINT codePage,char defaultChar,bool & defaultCharWasUsed)519 AString UnicodeStringToMultiByte(const UString &src, UINT codePage, char defaultChar, bool &defaultCharWasUsed)
520 {
521 AString dest;
522 UnicodeStringToMultiByte2(dest, src, codePage, defaultChar, defaultCharWasUsed);
523 return dest;
524 }
525
UnicodeStringToMultiByte(const UString & src,UINT codePage)526 AString UnicodeStringToMultiByte(const UString &src, UINT codePage)
527 {
528 AString dest;
529 bool defaultCharWasUsed;
530 UnicodeStringToMultiByte2(dest, src, codePage, k_DefultChar, defaultCharWasUsed);
531 return dest;
532 }
533
534
535
536
537
/*
  U_to_A(a, b, c) : converts UString (b) to AString (a) with the conversion
  function of the system:
    _WIN32 : UnicodeStringToMultiByte2(a, b, c), where (c) is the code page.
    else   : UnicodeStringToMultiByte2_Native(a, b), (c) is not used.
  The _WIN32 variant must pass the arguments: a macro that expands to the
  function name only doesn't call the function.
*/
#ifdef _WIN32
#define U_to_A(a, b, c)  UnicodeStringToMultiByte2(a, b, c)
// #define A_to_U(a, b, c)  MultiByteToUnicodeString2
#else
// void MultiByteToUnicodeString2_Native(UString &dest, const AString &src);
#define U_to_A(a, b, c)  UnicodeStringToMultiByte2_Native(a, b)
// #define A_to_U(a, b, c)  MultiByteToUnicodeString2_Native(a, b)
#endif
546
547 #if !defined(_WIN32) || defined(ENV_HAVE_LOCALE)
548
IsNativeUTF8()549 bool IsNativeUTF8()
550 {
551 UString u;
552 AString a, a2;
553 // for (unsigned c = 0x80; c < (UInt32)0x10000; c += (c >> 9) + 1)
554 for (unsigned c = 0x80; c < (UInt32)0xD000; c += (c >> 2) + 1)
555 {
556 u.Empty();
557 u += (wchar_t)c;
558 /*
559 if (Unicode_Is_There_Utf16SurrogateError(u))
560 continue;
561 #ifndef _WIN32
562 if (Unicode_Is_There_BmpEscape(u))
563 continue;
564 #endif
565 */
566 ConvertUnicodeToUTF8(u, a);
567 U_to_A(a2, u, CP_OEMCP);
568 if (a != a2)
569 return false;
570 }
571 return true;
572 }
573
574 #endif
575
576
577 #ifdef ENV_HAVE_LOCALE
578
/*
  Returns the name of the current LC_CTYPE locale of the program.
  setlocale(LC_CTYPE, NULL) only requests the name and doesn't change the locale.
  "C" is returned, if setlocale() returns NULL.
*/
const char *GetLocale(void)
{
  #ifdef ENV_HAVE_LOCALE
  const char * const current = setlocale(LC_CTYPE, NULL);
  // ubuntu returns "C" after program start
  return current ? current : "C";
  #elif defined(LOCALE_IS_UTF8)
  return "utf8";
  #else
  return "C";
  #endif
}
601
602 #ifdef _WIN32
Set_ForceToUTF8(bool)603 static void Set_ForceToUTF8(bool) {}
604 #else
Set_ForceToUTF8(bool val)605 static void Set_ForceToUTF8(bool val) { g_ForceToUTF8 = val; }
606 #endif
607
Is_Default_Basic_Locale(const char * locale)608 static bool Is_Default_Basic_Locale(const char *locale)
609 {
610 const AString a (locale);
611 if (a.IsEqualTo_Ascii_NoCase("")
612 || a.IsEqualTo_Ascii_NoCase("C")
613 || a.IsEqualTo_Ascii_NoCase("POSIX"))
614 return true;
615 return false;
616 }
617
Is_Default_Basic_Locale()618 static bool Is_Default_Basic_Locale()
619 {
620 return Is_Default_Basic_Locale(GetLocale());
621 }
622
623
/*
  Sets the locale of the program and selects the conversion mode
  (Set_ForceToUTF8) for multi-byte <-> wide string functions.

  Up to 3 attempts of setlocale(LC_ALL, ...) are made:
    0) ""                : the locale from the environment
    1) some UTF-8 locale : "en_US.UTF-8" for __APPLE__, "C.UTF-8" for another systems
    2) ""                : the locale from the environment again
  After each attempt:
    if (IsNativeUTF8()), we use forced UTF-8 mode, and return.
    if the locale is not basic locale (not "", "C" or "POSIX"),
    we keep that locale and stop the attempts.

  After the attempts: forced UTF-8 mode is used,
  if (IsNativeUTF8() || the current locale is basic locale).
  Otherwise the conversion functions of the locale are used.
*/
void MY_SetLocale()
{
  #ifdef ENV_HAVE_LOCALE

  const unsigned kNumAttempts = 3;

  for (unsigned i = 0; i < kNumAttempts; i++)
  {
    /*
    man7: "If locale is an empty string, "", each part of the locale that
    should be modified is set according to the environment variables.
    for glibc: glibc, first from the user's environment variables:
      1) the environment variable LC_ALL,
      2) environment variable with the same name as the category
      3) the environment variable LANG
    The locale "C" or "POSIX" is a portable locale; it exists on all conforming systems.

    for WIN32 : MSDN :
      Sets the locale to the default, which is the user-default
      ANSI code page obtained from the operating system.
      The locale name is set to the value returned by GetUserDefaultLocaleName.
      The code page is set to the value returned by GetACP
    */
    const char *newLocale = "";

    if (i == 1)
    {
      #ifdef __APPLE__
      /* look also CFLocale
         there is no C.UTF-8 in macos
         macos has UTF-8 locale only with some language like en_US.UTF-8
         what is best way to set UTF-8 locale in macos?
         file open with non-utf8 sequences returns
           #define EILSEQ 92 // "Illegal byte sequence"
      */
      newLocale = "en_US.UTF-8";
      #else
      newLocale = "C.UTF-8"; // main UTF-8 locale in ubuntu
      // newLocale = ".utf8"; // supported in new Windows 10 build 17134 (April 2018 Update), the Universal C Runtime
      // newLocale = "en_US.utf8"; // supported by ubuntu ?
      // newLocale = "en_US.UTF-8";
      /* setlocale() in ubuntu allows locales with minor character changes in strings
         "en_US.UTF-8" / "en_US.utf8" */
      #endif
    }

    // the result of setlocale() is not checked here:
    // we request the real current locale with GetLocale() after the call
    setlocale(LC_ALL, newLocale);

    // request current locale of program
    const char *locale = GetLocale();
    if (locale)
    {
      // we don't check the name of locale for "utf" substring:
      // the real conversion is tested instead
      if (IsNativeUTF8())
      {
        Set_ForceToUTF8(true);
        return;
      }
      if (!Is_Default_Basic_Locale(locale))
      {
        // if there is some non-default and non-utf locale, we want to use it
        break; // comment it for debug
      }
    }
  }

  // (||) keeps the original order of checks: the name of locale
  // is checked only if IsNativeUTF8() returns false
  Set_ForceToUTF8(IsNativeUTF8() || Is_Default_Basic_Locale());

  #elif defined(LOCALE_IS_UTF8)
    // assume LC_CTYPE="utf8"
  #else
    // assume LC_CTYPE="C"
  #endif
}
757 #endif
758