1 // Common/UTFConvert.h 2 3 #ifndef ZIP7_INC_COMMON_UTF_CONVERT_H 4 #define ZIP7_INC_COMMON_UTF_CONVERT_H 5 6 #include "MyBuffer.h" 7 #include "MyString.h" 8 9 struct CUtf8Check 10 { 11 // Byte MaxByte; // in original src stream 12 bool NonUtf; 13 bool ZeroChar; 14 bool SingleSurrogate; 15 bool Escape; 16 bool Truncated; 17 UInt32 MaxHighPoint; // only for points >= 0x80 18 CUtf8CheckCUtf8Check19 CUtf8Check() { Clear(); } 20 ClearCUtf8Check21 void Clear() 22 { 23 // MaxByte = 0; 24 NonUtf = false; 25 ZeroChar = false; 26 SingleSurrogate = false; 27 Escape = false; 28 Truncated = false; 29 MaxHighPoint = 0; 30 } 31 UpdateCUtf8Check32 void Update(const CUtf8Check &c) 33 { 34 if (c.NonUtf) NonUtf = true; 35 if (c.ZeroChar) ZeroChar = true; 36 if (c.SingleSurrogate) SingleSurrogate = true; 37 if (c.Escape) Escape = true; 38 if (c.Truncated) Truncated = true; 39 if (MaxHighPoint < c.MaxHighPoint) MaxHighPoint = c.MaxHighPoint; 40 } 41 PrintStatusCUtf8Check42 void PrintStatus(AString &s) const 43 { 44 s.Empty(); 45 46 // s.Add_OptSpaced("MaxByte="); 47 // s.Add_UInt32(MaxByte); 48 49 if (NonUtf) s.Add_OptSpaced("non-UTF8"); 50 if (ZeroChar) s.Add_OptSpaced("ZeroChar"); 51 if (SingleSurrogate) s.Add_OptSpaced("SingleSurrogate"); 52 if (Escape) s.Add_OptSpaced("Escape"); 53 if (Truncated) s.Add_OptSpaced("Truncated"); 54 55 if (MaxHighPoint != 0) 56 { 57 s.Add_OptSpaced("MaxUnicode="); 58 s.Add_UInt32(MaxHighPoint); 59 } 60 } 61 62 63 bool IsOK(bool allowReduced = false) const 64 { 65 if (NonUtf || SingleSurrogate || ZeroChar) 66 return false; 67 if (MaxHighPoint >= 0x110000) 68 return false; 69 if (Truncated && !allowReduced) 70 return false; 71 return true; 72 } 73 74 // it checks full buffer as specified in (size) and it doesn't stop on zero char 75 void Check_Buf(const char *src, size_t size) throw(); 76 Check_AStringCUtf8Check77 void Check_AString(const AString &s) throw() 78 { 79 Check_Buf(s.Ptr(), s.Len()); 80 } 81 }; 82 83 /* 84 if (allowReduced == false) - all UTF-8 character sequences must be finished. 85 if (allowReduced == true) - it allows truncated last character-Utf8-sequence 86 */ 87 88 bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw(); 89 bool CheckUTF8_AString(const AString &s) throw(); 90 91 #define Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR (1 << 0) 92 #define Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE (1 << 1) 93 #define Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT (1 << 2) 94 95 /* 96 Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR 97 98 if (flag is NOT set) 99 { 100 it processes SINGLE-SURROGATE-8 as valid Unicode point. 101 it converts SINGLE-SURROGATE-8 to SINGLE-SURROGATE-16 102 Note: some sequencies of two SINGLE-SURROGATE-8 points 103 will generate correct SURROGATE-16-PAIR, and 104 that SURROGATE-16-PAIR later will be converted to correct 105 UTF8-SURROGATE-21 point. So we don't restore original 106 STR-8 sequence in that case. 107 } 108 109 if (flag is set) 110 { 111 if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is defined) 112 it generates ESCAPE for SINGLE-SURROGATE-8, 113 if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is not defined) 114 it generates U+fffd for SINGLE-SURROGATE-8, 115 } 116 117 118 Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE 119 120 if (flag is NOT set) 121 it generates (U+fffd) code for non-UTF-8 (invalid) characters 122 123 if (flag is set) 124 { 125 It generates (ESCAPE) codes for NON-UTF-8 (invalid) characters. 126 And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes. 127 } 128 129 Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT 130 131 if (flag is NOT set) 132 { 133 it process ESCAPE-8 points as another Unicode points. 134 In Linux: ESCAPE-16 will mean two different ESCAPE-8 seqences, 135 so we need HIGH-ESCAPE-PLANE-21 to restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW 136 } 137 138 if (flag is set) 139 { 140 it generates ESCAPE-16-21 for ESCAPE-8 points 141 so we can restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW without HIGH-ESCAPE-PLANE-21. 142 } 143 144 145 Main USE CASES with UTF-8 <-> UTF-16 conversions: 146 147 WIN32: UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW 148 { 149 set Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE 150 Do NOT set Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR 151 Do NOT set Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT 152 153 So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8. 154 } 155 156 Linux: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW 157 { 158 we want restore original UTF-8-RAW sequence later from that ESCAPE-16. 159 Set the flags: 160 Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR 161 Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE 162 Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT 163 } 164 165 MacOS: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW 166 { 167 we want to restore correct UTF-8 without any BMP processing: 168 Set the flags: 169 Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR 170 Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE 171 } 172 173 */ 174 175 // zero char is not allowed in (src) buf 176 bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags = 0); 177 178 bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0); 179 bool ConvertUTF8ToUnicode(const AString &src, UString &dest); 180 181 #define Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR (1 << 8) 182 #define Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE (1 << 9) 183 // #define Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE (1 << 10) 184 185 /* 186 Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR 187 188 if (flag is NOT set) 189 { 190 we extract SINGLE-SURROGATE as normal UTF-8 191 192 In Windows : for UTF-16-RAW <-> UTF-8 (archive) <-> UTF-16-RAW in . 193 194 In Linux : 195 use-case-1: UTF-8 -> UTF-16 -> UTF-8 doesn't generate UTF-16 SINGLE-SURROGATE, 196 if (Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR) is used. 197 use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux) 198 will generate SINGLE-SURROGATE-UTF-8 here. 199 } 200 201 if (flag is set) 202 { 203 we generate UTF_REPLACEMENT_CHAR (0xfffd) for SINGLE_SURROGATE 204 it can be used for compatibility mode with WIN32 UTF function 205 or if we want UTF-8 stream without any errors 206 } 207 208 209 Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE 210 211 if (flag is NOT set) it doesn't extract raw 8-bit symbol from Escape-Plane-16 212 if (flag is set) it extracts raw 8-bit symbol from Escape-Plane-16 213 214 in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive): 215 if (we use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane. 216 if (we don't use High-Escape-Plane), we must use Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE. 217 218 219 Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE 220 // that flag affects the code only if (wchar_t is 32-bit) 221 // that mode with high-escape can be disabled now in UTFConvert.cpp 222 if (flag is NOT set) 223 it doesn't extract raw 8-bit symbol from High-Escape-Plane 224 if (flag is set) 225 it extracts raw 8-bit symbol from High-Escape-Plane 226 227 Main use cases: 228 229 WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW 230 { 231 Do NOT set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE. 232 Do NOT set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR. 233 So we restore original UTF-16-RAW. 234 } 235 236 Linix : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes 237 set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive 238 set Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE for intermediate UTF-16. 239 Note: high esacape mode can be ignored now in UTFConvert.cpp 240 241 macOS: 242 the system doesn't support incorrect UTF-8 in file names. 243 set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR 244 */ 245 246 extern unsigned g_Unicode_To_UTF8_Flags; 247 248 void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags = 0); 249 void ConvertUnicodeToUTF8(const UString &src, AString &dest); 250 251 void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest); 252 253 /* 254 #ifndef _WIN32 255 void Convert_UTF16_To_UTF32(const UString &src, UString &dest); 256 void Convert_UTF32_To_UTF16(const UString &src, UString &dest); 257 bool UTF32_IsThere_BigPoint(const UString &src); 258 bool Unicode_IsThere_BmpEscape(const UString &src); 259 #endif 260 261 bool Unicode_IsThere_Utf16SurrogateError(const UString &src); 262 */ 263 264 #ifdef Z7_WCHART_IS_16BIT 265 #define Convert_UnicodeEsc16_To_UnicodeEscHigh(s) 266 #else 267 void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s); 268 #endif 269 270 /* 271 // #include "../../C/CpuArch.h" 272 273 // ---------- Utf16 Little endian functions ---------- 274 275 // We store 16-bit surrogates even in 32-bit WCHARs in Linux. 276 // So now we don't use the following code: 277 278 #if WCHAR_MAX > 0xffff 279 280 // void *p : pointer to src bytes stream 281 // size_t len : num Utf16 characters : it can include or not include NULL character 282 283 inline size_t Utf16LE__Get_Num_WCHARs(const void *p, size_t len) 284 { 285 #if WCHAR_MAX > 0xffff 286 size_t num_wchars = 0; 287 for (size_t i = 0; i < len; i++) 288 { 289 wchar_t c = GetUi16(p); 290 p = (const void *)((const Byte *)p + 2); 291 if (c >= 0xd800 && c < 0xdc00 && i + 1 != len) 292 { 293 wchar_t c2 = GetUi16(p); 294 if (c2 >= 0xdc00 && c2 < 0xe000) 295 { 296 c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); 297 p = (const void *)((const Byte *)p + 2); 298 i++; 299 } 300 } 301 num_wchars++; 302 } 303 return num_wchars; 304 #else 305 UNUSED_VAR(p) 306 return len; 307 #endif 308 } 309 310 // #include <stdio.h> 311 312 inline wchar_t *Utf16LE__To_WCHARs_Sep(const void *p, size_t len, wchar_t *dest) 313 { 314 for (size_t i = 0; i < len; i++) 315 { 316 wchar_t c = GetUi16(p); 317 p = (const void *)((const Byte *)p + 2); 318 319 #if WCHAR_PATH_SEPARATOR != L'/' 320 if (c == L'/') 321 c = WCHAR_PATH_SEPARATOR; 322 #endif 323 324 #if WCHAR_MAX > 0xffff 325 326 if (c >= 0xd800 && c < 0xdc00 && i + 1 != len) 327 { 328 wchar_t c2 = GetUi16(p); 329 if (c2 >= 0xdc00 && c2 < 0xe000) 330 { 331 // printf("\nSurragate : %4x %4x -> ", (int)c, (int)c2); 332 c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); 333 p = (const void *)((const Byte *)p + 2); 334 i++; 335 // printf("%4x\n", (int)c); 336 } 337 } 338 339 #endif 340 341 *dest++ = c; 342 } 343 return dest; 344 } 345 346 347 inline size_t Get_Num_Utf16_chars_from_wchar_string(const wchar_t *p) 348 { 349 size_t num = 0; 350 for (;;) 351 { 352 wchar_t c = *p++; 353 if (c == 0) 354 return num; 355 num += ((c >= 0x10000 && c < 0x110000) ? 2 : 1); 356 } 357 return num; 358 } 359 360 inline Byte *wchars_to_Utf16LE(const wchar_t *p, Byte *dest) 361 { 362 for (;;) 363 { 364 wchar_t c = *p++; 365 if (c == 0) 366 return dest; 367 if (c >= 0x10000 && c < 0x110000) 368 { 369 SetUi16(dest , (UInt16)(0xd800 + ((c >> 10) & 0x3FF))); 370 SetUi16(dest + 2, (UInt16)(0xdc00 + ( c & 0x3FF))); 371 dest += 4; 372 } 373 else 374 { 375 SetUi16(dest, c); 376 dest += 2; 377 } 378 } 379 } 380 381 #endif 382 */ 383 384 #endif 385