1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3//******************************************************************************* 4// 5// Copyright (C) 2003-2015, International Business Machines 6// Corporation and others. All Rights Reserved. 7// 8// file name: conversion.txt 9// encoding: US-ASCII 10// tab size: 8 (not used) 11// indentation:4 12// 13// created on: 2003jul15 14// created by: Markus W. Scherer 15// 16// ICU resource bundle source file with test data for data-driven conversion tests. 17// 18//******************************************************************************* 19 20conversion:table(nofallback) { 21 Info { 22 Description { "Test data for conversion" } 23 LongDescription { 24 "Test data for data-driven conversion tests in icu/source/test/intltest/convtest.cpp\n" 25 "Run intltest conversion\n" 26 27 "Charset names starting with '*' are for testdata names.\n" 28 "Charset names starting with '+' are for charsets currently not supported in ICU4J.\n" 29 30 "ICU callbacks are specified as strings with pairs of characters, each optional.\n" 31 "Callback function - '?'=Sub '0'=Skip '.'=Stop '&'=Escape\n" 32 "Callback option - a letter is passed in directly as const char * see ucnv_err.h\n" 33 "Empty string: Sub callback with NULL option\n" 34 35 "In order to specify a charset substitution character (for ucnv_setSubstChars()),\n" 36 "add a NUL (U+0000) to the callback string followed by the subchar bytes as Latin-1\n" 37 "characters. 
For example, for a Sub callback with no option and a subchar of FC FC,\n" 38 "use the string \"?\x00\xFC\xFC\"\n" 39 40 "In order to specify a substitution string (for ucnv_setSubstString()),\n" 41 "add an '=' to the callback string followed by the substitution string.\n" 42 "For example, for a Sub callback with no option and a substitution string\n" 43 "of \"ab\", use the string \"?=ab\"\n" 44 45 "fallbacks: per-direction boolean, currently only for fromUnicode; see Jitterbug 2401\n" 46 47 "errorCode: (empty)==zero | invalid | illegal | truncated | illesc | unsuppesc\n" 48 } 49 } 50 TestData { 51 toUnicode { 52 Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } 53 Cases { 54 // Test ticket 6789: implement Java-compatible Unicode, UnicodeBig and UnicodeLittle converters 55 // For details about these encodings see convrtrs.txt. 56 // Standard UTF-16 57 { "UTF-16", :bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 58 { "UTF-16", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 59 { "UTF-16", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 60 // Java "Unicode" requires a BOM 61 { "+UTF-16,version=1", :bin{ 00610062 }, "\\x00\\x61b", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 62 { "+UTF-16,version=1", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 63 { "+UTF-16,version=1", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 64 // Standard UTF-16BE 65 { "UTF-16BE", :bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 66 { "UTF-16BE", :bin{ feff0061 }, "\ufeffa", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 67 { "UTF-16BE", :bin{ fffe0061 }, "\ufffea", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 68 // Java "UnicodeBig" requires a BE BOM or no BOM; it consumes the 
BE BOM 69 { "UTF-16BE,version=1",:bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 70 { "UnicodeBig", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 71 { "UnicodeBig", :bin{ fffe0061 }, "\\xFF\\xFEa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 72 // Standard UTF-16LE 73 { "UTF-16LE", :bin{ 61006200 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 74 { "UTF-16LE", :bin{ fffe6100 }, "\ufeffa", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 75 { "UTF-16LE", :bin{ feff6100 }, "\ufffea", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 76 // Java "UnicodeLittle" requires an LE BOM or no BOM; it consumes the LE BOM 77 { "UTF-16LE,version=1",:bin{ 61006200 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 78 { "UnicodeLittle", :bin{ fffe6100 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 79 { "x-UTF-16LE-BOM", :bin{ feff6100 }, "\\xFE\\xFFa", :intvector{ 0,0,0,0,0,0,0,0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 80 81 // Test ticket 7704: implement Java-compatible "UTF-16" converter. 82 // Same as standard UTF-16 but fromUnicode always writes big-endian byte stream. 83 { "+UTF-16,version=2", :bin{ 00610062 }, "ab", :intvector{ 0,2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 84 { "+UTF-16,version=2", :bin{ feff0061 }, "a", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 85 { "+UTF-16,version=2", :bin{ fffe0061 }, "\u6100", :intvector{ 2 }, :int{1}, :int{0}, "", "&C", :bin{""} } 86 87 // Test ticket 5691: consistent illegal sequences 88 // The following test cases are for illegal character byte sequences. 89 // 90 // Unfortunately, we cannot use the Shift-JIS examples from the ticket 91 // comments because our Shift-JIS table is Windows-compatible and 92 // therefore has no illegal single bytes. Same for GBK. 93 // Instead, we use the stricter GB 18030 also for 2-byte examples. 
94 // The byte sequences are generally slightly different from the ticket 95 // comment, simply using assigned characters rather than just 96 // theoretically valid sequences. 97 { 98 "gb18030", 99 :bin{ 618140813c81ff7a }, 100 "a\u4e02\\x81<\\x81\\xFFz", 101 :intvector{ 0,1,3,3,3,3,4,5,5,5,5,5,5,5,5,7 }, 102 :int{1}, :int{0}, "", "&C", :bin{""} 103 } 104 { 105 "EUC-JP", 106 :bin{ 618fb0a98fb03c8f3cb0a97a }, 107 "a\u4e28\\x8F\\xB0<\\x8F<\u9022z", 108 :intvector{ 0,1,4,4,4,4,5,5,5,5,6,7,7,7,7,8,9,11 }, 109 :int{1}, :int{0}, "", "&C", :bin{""} 110 } 111 { 112 "gb18030", 113 :bin{ 618130fc318130fc8181303c3e813cfc817afe90a8bc }, 114 "a\u05ed\\x810\u9f07\\x810<>\\x81<\u9f07z\ue854\u1e3f", 115 :intvector{ 0,1,5,5,5,5,6,7,9,9,9,9,10,11,12,13,13,13,13,14,15,17,18,20 }, 116 :int{1}, :int{0}, "", "&C", :bin{""} 117 } 118 { 119 "UTF-8", 120 :bin{ 61f1808182f180813cf18081fff180ff3cf1ff3c3e7a }, 121 "a\U00040042\\xF1\\x80\\x81<\\xF1\\x80\\x81\\xFF\\xF1\\x80\\xFF<\\xF1\\xFF<>z", 122 :intvector{ 0,1,1,5,5,5,5,5,5,5,5,5,5,5,5,8,9,9,9,9,9,9,9,9,9,9,9,9,12,12,12,12,13,13,13,13,13,13,13,13,15,15,15,15,16,17,17,17,17,18,18,18,18,19,20,21 }, 123 :int{1}, :int{0}, "", "&C", :bin{""} 124 } 125 { 126 "ISO-2022-JP", 127 :bin{ 1b24424141af4142affe41431b2842 }, 128 "\u758f\\xAF\u758e\\xAF\\xFE\u790e", 129 :intvector{ 3,5,5,5,5,6,8,8,8,8,8,8,8,8,10 }, 130 :int{1}, :int{0}, "", "&C", :bin{""} 131 } 132 { 133 "ibm-25546", 134 :bin{ 411b242943420e4141af4142affe41430f5a }, 135 "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", 136 :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, 137 :int{1}, :int{0}, "", "&C", :bin{""} 138 } 139 { 140 "ISO-2022-KR", 141 :bin{ 411b242943420e4141af4142affe41430f5a }, 142 "AB\uc88b\\xAF\uc88c\\xAF\\xFE\uc88dZ", 143 :intvector{ 0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, 144 :int{1}, :int{0}, "", "&C", :bin{""} 145 } 146 { 147 "ISO-2022-CN", 148 :bin{ 411b242941420e4141af4142affe41430f5a }, 149 "AB\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", 150 :intvector{ 
0,5,7,9,9,9,9,10,12,12,12,12,12,12,12,12,14,17 }, 151 :int{1}, :int{0}, "", "&C", :bin{""} 152 } 153 { 154 "ISO-2022-CN-CNS", 155 :bin{ 411b2429470e21702541256f0f }, 156 "A\u00a7\u03c4\u02c7", 157 :intvector{ 0,6,8,10 }, 158 :int{1}, :int{0}, "", "&C", :bin{""} 159 } 160 { 161 "HZ", 162 :bin{ 417e7b4141af4142affe41437e7d5a }, 163 "A\u4eae\\xAF\u8c05\\xAF\\xFE\u64a9Z", 164 :intvector{ 0,3,5,5,5,5,6,8,8,8,8,8,8,8,8,10,14 }, 165 :int{1}, :int{0}, "", "&C", :bin{""} 166 } 167 // Test ticket 5691: consistent illegal sequences 168 // The following test cases are for illegal escape/designator/shift sequences. 169 // 170 // ISO-2022-JP and -CN with illegal escape sequences. 171 { 172 "ISO-2022-JP", 173 :bin{ 611b24201b244241411b283f1b28427a }, 174 "a\\x1B$ \u758f\\x1B\u2538z", 175 :intvector{ 0,1,1,1,1,2,3,7,9,9,9,9,10,15 }, 176 :int{1}, :int{0}, "", "&C", :bin{""} 177 } 178 { 179 "ISO-2022-CN", 180 :bin{ 611b2429201b2429410e41410f7a }, 181 "a\\x1B$) \u4eaez", 182 :intvector{ 0,1,1,1,1,2,3,4,10,13 }, 183 :int{1}, :int{0}, "", "&C", :bin{""} 184 } 185 // Test ticket 5691: ISO-2022-JP-2 with illegal single-shift SS2 and SS3 sequences. 186 // The first ESC N comes before its designator sequence, the last sequence is ESC+space. 
187 { 188 "ISO-2022-JP-2", 189 :bin{ 4e1b4e4e1b2e414e1b4e4e4e1b204e }, 190 "N\\x1BNNN\xceN\\x1B N", 191 :intvector{ 0,1,1,1,1,2,3,7,10,11,12,12,12,12,13,14 }, 192 :int{1}, :int{0}, "", "&C", :bin{""} 193 } 194 { 195 "ISO-2022-CN-EXT", 196 :bin{ 4e1b4e4e1b242a484e1b4e4e4e4e1b204e }, 197 "N\\x1BNNN\u8f0eN\\x1B N", 198 :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, 199 :int{1}, :int{0}, "", "&C", :bin{""} 200 } 201 /* 202 * ICU 4.4 (ticket #7314) removes mappings for CNS 11643 planes 3..7 203 { 204 "ISO-2022-CN-EXT", 205 :bin{ 4f1b4f4f1b242b494f1b4f4f4f4f1b204f }, 206 "O\\x1BOOO\u492bO\\x1B O", 207 :intvector{ 0,1,1,1,1,2,3,8,11,13,14,14,14,14,15,16 }, 208 :int{1}, :int{0}, "", "&C", :bin{""} 209 } 210 */ 211 // Test ticket 5691: HZ with illegal tilde sequences. 212 { 213 "HZ", 214 :bin{ 417e20427e21437e80447e7b41417e207e41427e7f41437e7d5a }, 215 "A\\x7E B\\x7E!C\\x7E\\x80D\u4eae\\x7E\\x20\\x7E\u8c05\\x7E\\x7F\u64a9Z", 216 :intvector{ 0,1,1,1,1,2,3,4,4,4,4,5,6,7,7,7,7,7,7,7,7,9, // SBCS 217 12,14,14,14,14,14,14,14,14,16,16,16,16,17,19,19,19,19,19,19,19,19,21, // DBCS 218 25 }, // SBCS 219 :int{1}, :int{0}, "", "&C", :bin{""} 220 } 221 // Test ticket 5691: Example from Peter Edberg. 222 { 223 "ISO-2022-JP", 224 :bin{ 1b244230212f7e742630801b284a621b2458631b2842648061 }, 225 "\u4e9c\ufffd\u7199\ufffdb\ufffd$Xcd\ufffda", 226 :intvector{ 3,5,7,9,14,15,16,17,18,22,23,24 }, 227 :int{1}, :int{0}, "", "?", :bin{""} 228 } 229 // Test bug 6071 (2:1 Unicode:charset SBCS mapping). 
230 { 231 "*test1bmp", 232 :bin{ 050008 }, 233 "e@uv", 234 :intvector{ 0,1,2,2 }, 235 :int{1}, :int{1}, "", "?", :bin{""} 236 } 237 // test that HZ limits its byte values to lead bytes 21..7d and trail bytes 21..7e 238 { 239 "HZ", 240 :bin{ 7e7b21212120217e217f772100007e217e7e7d207e7e807e0a2b }, 241 "\u3000\ufffd\u3013\ufffd\u9ccc\ufffd\ufffd\u3013 ~\ufffd+", 242 :intvector{ 2,4,6,8,10,12,14,15,19,20,22,25 }, 243 :int{1}, :int{1}, "", "?", :bin{""} 244 } 245 // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and 246 // using the Shift-JIS table for JIS X 0208 (ticket #5797) 247 { 248 "ISO-2022-JP", 249 :bin{ 1b284a7d7e801b2442306c20217f7e21202160217f22202225227f5f211b2842 }, 250 "}\u203e\ufffd\u4e00\ufffd\ufffd\ufffd\ufffd\xf7\ufffd\ufffd\u25b2\ufffd\u6f3e", 251 :intvector{ 3,4,5,9,11,12,14,16,17,19,21,23,25,27 }, 252 :int{1}, :int{1}, "", "?", :bin{""} 253 } 254 // improve coverage of ISO-2022-JP converter by simulating erroneous input 255 { 256 "ISO-2022-JP-2", 257 :bin{ 0f0ed11b2e41461b244141411b4e411b2e4147451b4ed31b2e4641411b4ed2 }, 258 "\u0046\u4eae\u00c1\u6865\u4eae", 259 :intvector{ 6, 10, 14, 18, 26 }, 260 :int{1}, :int{0}, "", "0", :bin{""} 261 } 262 // improve coverage of JIS7 converter by simulating incomplete shifted input 263 { 264 "JIS7", 265 :bin{ 0e11 }, 266 "", 267 :intvector{}, 268 :int{1}, :int{0}, "", "0", :bin{""} 269 } 270 // improve coverage of unrolled loops in ucnvmbcs.c/ucnv_MBCSSingleToBMPWithOffsets() 271 { 272 "ISO-8859-3", 273 :bin{ 000102030405060708090a0b0c0d0e0f101112131415161718191a1b1c1d1e1f2021222324252627 }, 274 "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\x22#$%&'", 275 :intvector{ 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39 }, 276 :int{1}, :int{0}, "", "?", :bin{""} 277 } 278 // test that ISO-2022-JP encodes ASCII as itself 279 { 280 "ISO-2022-JP", 281 
:bin{ 3f4041424344454647 }, 282 "?@ABCDEFG", 283 :intvector{ 0,1,2,3,4,5,6,7,8 }, 284 :int{1}, :int{1}, "", "?", :bin{""} 285 } 286 // test that ISO-2022-CN encodes ASCII as itself 287 { 288 "ISO-2022-CN", 289 :bin{ 3f4041424344454647 }, 290 "?@ABCDEFG", 291 :intvector{ 0,1,2,3,4,5,6,7,8 }, 292 :int{1}, :int{1}, "", "?", :bin{""} 293 } 294 295 // ISO-2022-KR 296 297 // truncated, partial escape sequence 298 { 299 "ibm-25546", 300 :bin{ 1b }, "", :intvector{}, 301 :int{1}, :int{1}, "truncated", ".", :bin{ 1b } 302 } 303 { 304 "ibm-25546", 305 :bin{ 1b24 }, "", :intvector{}, 306 :int{1}, :int{1}, "truncated", ".", :bin{ 1b24 } 307 } 308 { 309 "ibm-25546", 310 :bin{ 1b2429 }, "", :intvector{}, 311 :int{1}, :int{1}, "truncated", ".", :bin{ 1b2429 } 312 } 313 // complete escape sequence but nothing else 314 { 315 "ibm-25546", 316 :bin{ 1b242943 }, "", :intvector{}, 317 :int{1}, :int{1}, "", ".", :bin{""} 318 } 319 { 320 "ibm-25546", 321 :bin{ 1b2429430e }, "", :intvector{}, 322 :int{1}, :int{1}, "", ".", :bin{""} 323 } 324 // escape plus ASCII character 325 { 326 "ibm-25546", 327 :bin{ 1b24294341 }, "A", :intvector{ 4 }, 328 :int{1}, :int{1}, "", ".", :bin{""} 329 } 330 // escape plus incomplete DBCS character 331 { 332 "ibm-25546", 333 :bin{ 1b2429430e41 }, "", :intvector{}, 334 :int{1}, :int{1}, "truncated", ".", :bin{ 41 } 335 } 336 // all complete with DBCS character 337 { 338 "ibm-25546", 339 :bin{ 1b2429430e4141 }, "\uc88b", :intvector{ 5 }, 340 :int{1}, :int{1}, "", ".", :bin{""} 341 } 342 // more complicated example 343 { 344 "ibm-25546", 345 :bin{ 411b242943420e4141affe0f43 }, 346 "AB\uc88b%XAF%XFEC", 347 :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 }, 348 :int{1}, :int{1}, "", "&", :bin{""} 349 } 350 351 // truncated, partial escape sequence 352 { 353 "ISO-2022-KR", 354 :bin{ 1b }, "", :intvector{}, 355 :int{1}, :int{1}, "truncated", ".", :bin{ 1b } 356 } 357 { 358 "ISO-2022-KR", 359 :bin{ 1b24 }, "", :intvector{}, 360 :int{1}, :int{1}, "truncated", ".", 
:bin{ 1b24 } 361 } 362 { 363 "ISO-2022-KR", 364 :bin{ 1b2429 }, "", :intvector{}, 365 :int{1}, :int{1}, "truncated", ".", :bin{ 1b2429 } 366 } 367 // complete escape sequence but nothing else 368 { 369 "ISO-2022-KR", 370 :bin{ 1b242943 }, "", :intvector{}, 371 :int{1}, :int{1}, "", ".", :bin{""} 372 } 373 { 374 "ISO-2022-KR", 375 :bin{ 1b2429430e }, "", :intvector{}, 376 :int{1}, :int{1}, "", ".", :bin{""} 377 } 378 // escape plus ASCII character 379 { 380 "ISO-2022-KR", 381 :bin{ 1b24294341 }, "A", :intvector{ 4 }, 382 :int{1}, :int{1}, "", ".", :bin{""} 383 } 384 // escape plus incomplete DBCS character 385 { 386 "ISO-2022-KR", 387 :bin{ 1b2429430e41 }, "", :intvector{}, 388 :int{1}, :int{1}, "truncated", ".", :bin{ 41 } 389 } 390 // all complete with DBCS character 391 { 392 "ISO-2022-KR", 393 :bin{ 1b2429430e4141 }, "\uc88b", :intvector{ 5 }, 394 :int{1}, :int{1}, "", ".", :bin{""} 395 } 396 // more complicated example 397 { 398 "ISO-2022-KR", 399 :bin{ 411b242943420e4141affe0f43 }, 400 "AB\uc88b%XAF%XFEC", 401 :intvector{ 0, 5, 7, 9, 9, 9, 9, 9, 9, 9, 9, 12 }, 402 :int{1}, :int{1}, "", "&", :bin{""} 403 } 404 // empty segment (using substitution and stop) 405 { 406 "ISO-2022-KR", 407 :bin{ 1b242943610e0f620d0a }, 408 "a\uFFFDb\u000D\u000A", 409 :intvector{ 4, 6, 7, 8, 9 }, 410 :int{1}, :int{1}, "", "?", :bin{""} 411 } 412 { 413 "ISO-2022-KR", 414 :bin{ 1b242943610e0f620d0a }, 415 "a", 416 :intvector{ 4 }, 417 :int{1}, :int{1}, "illesc", ".", :bin{"0f"} 418 } 419 420 // ISO-2022-JP 421 422 // truncated, partial escape sequence 423 { 424 "ISO-2022-JP", 425 :bin{ 1b }, "", :intvector{}, 426 :int{1}, :int{1}, "truncated", ".", :bin{ 1b } 427 } 428 { 429 "ISO-2022-JP-2", 430 :bin{ 1b24 }, "", :intvector{}, 431 :int{1}, :int{1}, "truncated", ".", :bin{ 1b24 } 432 } 433 // complete escape sequence but nothing else 434 { 435 "ISO-2022-JP-2", 436 :bin{ 1b2442 }, "", :intvector{}, 437 :int{1}, :int{1}, "", ".", :bin{""} 438 } 439 // escape plus incomplete DBCS character 
440 { 441 "ISO-2022-JP-2", 442 :bin{ 1b244241 }, "", :intvector{}, 443 :int{1}, :int{1}, "truncated", ".", :bin{ 41 } 444 } 445 // all complete with DBCS character 446 { 447 "ISO-2022-JP-2", 448 :bin{ 1b24424141 }, "\u758f", :intvector{ 3 }, 449 :int{1}, :int{1}, "", ".", :bin{""} 450 } 451 // test the G2 designator & SS2 shift 452 { 453 "ISO-2022-JP-2", 454 :bin{ 431b2e46461b244241411b4e4e353f }, "CF\u758f\u039e\u7591", :intvector{ 0, 4, 8, 12, 13 }, 455 :int{1}, :int{1}, "", ".", :bin{""} 456 } 457 // JIS7 with Katakana 458 { 459 "JIS7", 460 :bin{ 41420e41420f4142 }, "AB\uff81\uff82AB", :intvector{ 0, 1, 3, 4, 6, 7 }, 461 :int{1}, :int{1}, "", ".", :bin{""} 462 } 463 // JIS8 with Katakana 464 { 465 "JIS8", 466 :bin{ 41c15c1b284a5cc242 }, "A\uff81\\\xa5\uff82B", :intvector{ 0, 1, 2, 6, 7, 8 }, 467 :int{1}, :int{1}, "", ".", :bin{""} 468 } 469 // empty segment (using substitution and stop) 470 { 471 "ISO-2022-JP", 472 :bin{ 61621b24421b284263640d0a }, 473 "ab\uFFFDcd\u000D\u000A", 474 :intvector{ 0, 1, 5, 8, 9, 10, 11 }, 475 :int{1}, :int{1}, "", "?", :bin{""} 476 } 477 { 478 "ISO-2022-JP", 479 :bin{ 61621b24421b284263640d0a }, 480 "ab", 481 :intvector{ 0, 1 }, 482 :int{1}, :int{1}, "illesc", ".", :bin{"1b2842"} 483 } 484 485 // ISO-2022-CN 486 487 // truncated, partial escape sequence 488 { 489 "ISO_2022,locale=zh,version=1", 490 :bin{ 1b }, "", :intvector{}, 491 :int{1}, :int{1}, "truncated", ".", :bin{ 1b } 492 } 493 { 494 "ISO_2022,locale=zh,version=1", 495 :bin{ 1b24 }, "", :intvector{}, 496 :int{1}, :int{1}, "truncated", ".", :bin{ 1b24 } 497 } 498 { 499 "ISO_2022,locale=zh,version=1", 500 :bin{ 1b2429 }, "", :intvector{}, 501 :int{1}, :int{1}, "truncated", ".", :bin{ 1b2429 } 502 } 503 // complete escape sequence but nothing else 504 { 505 "ISO_2022,locale=zh,version=1", 506 :bin{ 1b242941 }, "", :intvector{}, 507 :int{1}, :int{1}, "", ".", :bin{""} 508 } 509 { 510 "ISO_2022,locale=zh,version=1", 511 :bin{ 1b2429410e }, "", :intvector{}, 512 :int{1}, 
:int{1}, "", ".", :bin{""} 513 } 514 // escape plus ASCII character 515 { 516 "ISO_2022,locale=zh,version=1", 517 :bin{ 1b24294141 }, "\x41", :intvector{ 4 }, 518 :int{1}, :int{1}, "", ".", :bin{""} 519 } 520 // escape plus incomplete DBCS character 521 { 522 "ISO_2022,locale=zh,version=1", 523 :bin{ 1b2429410e41 }, "", :intvector{}, 524 :int{1}, :int{1}, "truncated", ".", :bin{ 41 } 525 } 526 // all complete with DBCS character 527 { 528 "ISO_2022,locale=zh,version=1", 529 :bin{ 1b2429410e4141 }, "\u4eae", :intvector{ 5 }, 530 :int{1}, :int{1}, "", ".", :bin{""} 531 } 532 /* 533 * ICU 4.4 (ticket #7314) removes mappings for CNS 11643 planes 3..7 534 // ISO-2022-CN-EXT with all subcharsets and shifts and with supplementary code points 535 { 536 "ISO-2022-CN-EXT", 537 :bin{ 1b2429411b242a480e41411b2429457e7c1b4e70341b242b4d1b2429477c341b4f664c2421 }, 538 "\u4eae\u9f82\u56cd\u56cc\U0002a6d6\x30", 539 :intvector{ 9, 15, 19, 29, 33, 33, 35 }, 540 :int{1}, :int{1}, "", ".", :bin{""} 541 } 542 */ 543 544 // illegal and unsupported escape sequences 545 // SS2 without designator: illegal 546 { 547 "ISO-2022-CN-EXT", 548 :bin{ 411b4e2121 }, "\x41", :intvector{ 0 }, 549 :int{1}, :int{1}, "illesc", ".", :bin{ 1b } 550 } 551 // G3 designator: recognized, but not supported for -CN (only for -CN-EXT) 552 { 553 "ISO-2022-CN", 554 :bin{ 411b242b491b4f2121 }, "\x41", :intvector{ 0 }, 555 :int{1}, :int{1}, "unsuppesc", ".", :bin{ 1b242b49 } 556 } 557 // empty segment 1 (using substitution and stop) 558 { 559 "ISO-2022-CN", 560 :bin{ 611b242941620e0f1b242a481b4e6a65630d0a }, 561 "ab\uFFFD\u994Cc\u000D\u000A", 562 :intvector{ 0, 5, 7, 14, 16, 17, 18 }, 563 :int{1}, :int{1}, "", "?", :bin{""} 564 } 565 { 566 "ISO-2022-CN", 567 :bin{ 611b242941620e0f1b242a481b4e6a65630d0a }, 568 "ab", 569 :intvector{ 0, 5 }, 570 :int{1}, :int{1}, "illesc", ".", :bin{"0f"} 571 } 572 // empty segment 2 (using substitution and stop) 573 { 574 "ISO-2022-CN", 575 :bin{ 611b242941620e1b24294768640f630d0a }, 
576 "ab\uFFFD\u5F70c\u000D\u000A", 577 :intvector{ 0, 5, 7, 11, 14, 15, 16 }, 578 :int{1}, :int{1}, "", "?", :bin{""} 579 } 580 { 581 "ISO-2022-CN", 582 :bin{ 611b242941620e1b24294768640f630d0a }, 583 "ab", 584 :intvector{ 0, 5 }, 585 :int{1}, :int{1}, "illesc", ".", :bin{"1b242947"} 586 } 587 588 // ISO-2022 SBCS 589 // [U_ENABLE_GENERIC_ISO_2022] 590 // The _generic_ ISO-2022 converter is disabled starting 2003-dec-03 (ICU 2.8). 591 // For details see the icu mailing list from 2003-dec-01 and the ucnv2022.c file. 592 // Language-specific variants of ISO-2022 continue to be available as listed below. 593 //{ 594 // "ISO_2022", 595 // :bin{ 0008090a0d1a1c1f203f415c7d7e7f }, 596 // "\x00\x08\t\n\r\x1a\x1c\x1f ?A\\}~\x7f", 597 // :intvector{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, 598 // :int{1}, :int{1}, "", ".", :bin{""} 599 //} 600 601 // HZ-GB-2312 602 603 // empty segment 1 (using substitution and stop) 604 { 605 "HZ-GB-2312", 606 :bin{ 61627e7b7e7d6364 }, 607 "ab\uFFFDcd", 608 :intvector{ 0, 1, 4, 6, 7 }, 609 :int{1}, :int{1}, "", "?", :bin{""} 610 } 611 { 612 "HZ-GB-2312", 613 :bin{ 61627e7b7e7d63640d0a }, 614 "ab", 615 :intvector{ 0, 1 }, 616 :int{1}, :int{1}, "illesc", ".", :bin{"7e7d"} 617 } 618 // empty segment 2 & legal redundant switches (using substitution and stop) 619 { 620 "HZ-GB-2312", 621 :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d }, 622 "ab\u4E0D\u7A7A\uFFFD\u4E00cdef\uFFFD", 623 :intvector{ 0, 1, 4, 6, 10, 12, 16, 17, 20, 21, 24 }, 624 :int{1}, :int{1}, "", "?", :bin{""} 625 } 626 { 627 "HZ-GB-2312", 628 :bin{ 61627e7b323b3f557e7b7e7b523b7e7d63647e7d65667e7d7e7d }, 629 "ab\u4E0D\u7A7A", 630 :intvector{ 0, 1, 4, 6 }, 631 :int{1}, :int{1}, "illesc", ".", :bin{"7e7b"} 632 } 633 634 // DBCS-only extensions 635 { 636 "ibm-970", 637 :bin{ 617eece9b2eb }, 638 "\x61\x7e\u4e00\ub000", 639 :intvector{ 0, 1, 2, 4 }, 640 :int{1}, :int{1}, "", "?", :bin{""} 641 } 642 643 { 644 "ibm-971", 645 :bin{ 617eece9b2eb }, 646 
"\ufffd\u4e00\ub000", 647 :intvector{ 0, 2, 4 }, 648 :int{1}, :int{1}, "", "?", :bin{""} 649 } 650 651 { 652 "ibm-16684", 653 :bin{ 430e4395ecc1404042e1 }, 654 "\ufffd\u30C8\u30C8\u309A\u3000\u20ac", 655 :intvector{ 0, 2, 4, 4, 6, 8 }, 656 :int{1}, :int{0}, "", "?", :bin{""} 657 } 658 659 { 660 "ibm-1399", 661 :bin{ 430e4395ecc140400fe1 }, 662 "\uff62\u30C8\u30C8\u309A\u3000\u20ac", 663 :intvector{ 0, 2, 4, 4, 6, 9 }, 664 :int{1}, :int{0}, "", "?", :bin{""} 665 } 666 667 // extensions 668 { 669 "ibm-1390", 670 :bin{ 430e4395ecc1 }, 671 "\uff63\u30C8\u30C8\u309A", 672 :intvector{ 0, 2, 4, 4 }, 673 :int{1}, :int{0}, "", "?", :bin{""} 674 } 675 676 { 677 "ibm-16684", 678 :bin{ ececec8bec8cec8d4386ecb5ecb6ecb7 }, 679 "\ufffd\u31f6\u31f7\u31f8\u30ab\u304b\u309a\u304d\u309a\u304f\u309a", 680 :intvector{ 0, 2, 4, 6, 8, 10, 10, 12, 12, 14, 14 }, 681 :int{1}, :int{0}, "", "?", :bin{""} 682 } 683 684 { 685 "ibm-1390", 686 :bin{ 43860eececec8bec8cec8d4386ecb5ecb6ecb7ecc10fec }, 687 "\uff63\uff76\ufffd\u31f6\u31f7\u31f8\u30ab\u304b\u309a\u304d\u309a\u304f\u309a\u30C8\u309A\x1a", 688 :intvector{ 0, 1, 3, 5, 7, 9, 11, 13, 13, 15, 15, 17, 17, 19, 19, 22 }, 689 :int{1}, :int{0}, "", "?", :bin{""} 690 } 691 692 { 693 "*test3", 694 :bin{ 00050601020b0701020a01020c }, 695 "\u20ac\x05\x06\x0b\U00101234\U00023456\ufffd", 696 :intvector{ 0, 1, 2, 3, 6, 6, 7, 7, 10 }, 697 :int{1}, :int{0}, "", "?", :bin{""} 698 } 699 700 // test mapping to sequence of multiple Unicode characters which includes nonBMP (ticket #9235) 701 { 702 "*test3", 703 :bin{ 05070001020e050501020c06 }, 704 "\x05\U00101234\U00050005\u00c4\u00c4\U00101234\x05\x06", 705 :intvector{ 0, 1, 1, 1, 1, 7, 7, 7, 7, 7, 11 }, 706 :int{1}, :int{0}, "", "?", :bin{""} 707 } 708 709 // normal conversions 710 { 711 "UTF-16LE", 712 :bin{ 310000d801dc00d902dc320000d8330001dc3400 }, 713 "1\U00010001\U000500022\ufffd3\ufffd4", 714 :intvector{ 0, 2, 2, 6, 6, 10, 12, 14, 16, 18 }, 715 :int{1}, :int{0}, "", "?", :bin{""} 716 } 717 { 
"UTF-16LE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } } 718 { "UTF-16LE", :bin{ 00d800 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00d800 } } 719 720 { 721 "UTF-16BE", 722 :bin{ 0031d800dc01d900dc020032d8000033dc010034 }, 723 "1\U00010001\U000500022\ufffd3\ufffd4", 724 :intvector{ 0, 2, 2, 6, 6, 10, 12, 14, 16, 18 }, 725 :int{1}, :int{0}, "", "?", :bin{""} 726 } 727 { "UTF-16BE", :bin{ 00 }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ 00 } } 728 { "UTF-16BE", :bin{ d800dc }, "", :intvector{}, :int{1}, :int{0}, "truncated", ".", :bin{ d800dc } } 729 730 // e4b8 is a partial sequence 731 { "UTF-8", :bin{ 31e4ba8ce4b8 }, "1\u4e8c", :intvector{ 0, 1 }, :int{1}, :int{0}, "truncated", ".", :bin{ e4b8 } } 732 { "UTF-8", :bin{ 31e4ba8ce4b8 }, "1\u4e8c\ufffd", :intvector{ 0, 1, 4 }, :int{1}, :int{0}, "", "?", :bin{""} } 733 734 // LMBCS with escape callback (1292a0 is unassigned) 735 { 736 "LMBCS", 737 :bin{ 12c9501292a01292a1 }, 738 "\u4e2e%X12%X92%XA0\ue5c4", 739 :intvector{ 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6 }, 740 :int{1}, :int{0}, "", "&", :bin{""} 741 } 742 { 743 "LMBCS", 744 :bin{ 61012981a00f270f91140a7414f60214d84d14dc561088a0 }, 745 "\u0061\u2013\u00fc\u00e1\u0007\u0091\u0a74\u0200\ud84d\udc56\u5516", 746 :intvector{ 0, 1, 3, 4, 5, 7, 9, 12, 15, 18, 21 }, 747 :int{1}, :int{0}, "", ".", :bin{""} 748 } 749 750 // IMAP-mailbox-name with SUB 751 // a<DEL> a&AB~ a&AB\x0c a&AB- a&AB. a&. 
752 { 753 "IMAP-mailbox-name", 754 :bin{ 617f612641427e612641420c612641422d612641422e61262e }, 755 "a\ufffda\ufffda\ufffda\ufffda\ufffda\ufffd", 756 :intvector{ 0, 1, 2, 4, 7, 9, 12, 14, 17, 19, 22, 23 }, 757 :int{1}, :int{0}, "", "?", :bin{""} 758 } 759 760 // using testdata_test1.cnv 761 { "*test1", :bin{ 000506070809 }, "\u20ac\x05\x06\U00101234\ufffd\ufffd", :intvector{ 0, 1, 2, 3, 3, 4, 5 }, :int{1}, :int{0}, "", "", :bin{""} } 762 763 // surrogates in CESU-8 764 { "CESU-8", :bin{ eda080eda081edb081 }, "\ud800\U00010401", :intvector{ 0, 3, 6 }, :int{1}, :int{0}, "", "", :bin{""} } 765 // e080 is a partial sequence 766 { "UTF-8", :bin{ 31ffe4ba8ce08061 }, "1\ufffd\u4e8c\ufffd\ufffda", :intvector{ 0, 1, 2, 5, 6, 7 }, :int{0}, :int{0}, "", "", :bin{ 80 } } 767 // fbbfbfbfbf exceeds U+10ffff 768 { "UTF-8", :bin{ 31fbbfbfbfbf61 }, "1\ufffd\ufffd\ufffd\ufffd\ufffda", :intvector{ 0, 1, 2, 3, 4, 5, 6 }, :int{0}, :int{0}, "", "", :bin{ bf } } 769 770 // lead byte a2 without trail byte 771 { "ibm-1363", :bin{ a2aea2 }, "\u00a1", :intvector{ 0 }, :int{1}, :int{0}, "truncated", ".", :bin{ a2 } } 772 { "ibm-1363", :bin{ a2aea2 }, "\u00a1\u001a", :intvector{ 0, 2 }, :int{1}, :int{0}, "", "?", :bin{""} } 773 774 // simple sample, no error handling 775 { "UTF-8", :bin{ 61F48FBFBF }, "a\U0010FFFF", :intvector{ 0, 1, 1 }, :int{1}, :int{0}, "", "", :bin{""} } 776 // Headers { "charset", "bytes", "unicode", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidChars" } 777 { 778 "iscii-dev", 779 :bin{ EF4BC0E9BFE9E8D80AEF4AC0D4BFD4E8D80AEF4838B30AEF4939B30AEF4A3AB30AEF4B3BB30A}, 780 "\u0A5C\u0A4D\u0A39\u0A5C\u0A4D\u0A39\u000A" /* Gurmukhi test */ 781 "\u0AA2\u0AB5\u0AA1\u0AB5\u0ACD\u0AB9\u000A" /* Gujarati test */ 782 "\u0038\u0C95\u000A" /* Kannada test */ 783 "\u0039\u0D15\u000A" /* Malayalam test */ 784 "\u003A\u0A95\u000A" /* Gujarati test */ 785 "\u003B\u0A15\u000A" /* Punjabi test */, 786 :intvector { 2, 2, 2, 4, 6, 7, 8, 11, 12, 13, 14, 15, 16, 17, 20, 21, 22, 
25, 26, 27, 30, 31, 32, 35, 36, 37 }, 787 :int{1}, :int{0}, "", "", :bin{ "" } 788 } 789 { 790 "iscii-gur", 791 :bin{3BB30AC0E9BFE9E8D80AEF43C0E9BFE9E8D80A3BB30AEF403BB30A}, 792 "\u003b\u0a15\u000a" /* Easy characters */ 793 "\u0a5c\u0a4d\u0a39\u0a5c\u0a4d\u0a39\u000a" /* Gurmukhi test */ 794 "\u09dd\u09dc\u09cd\u09b9\u000a" /* Switch script: to Bengali*/ 795 "\u003b\u0a15\u000a" /* Easy characters - new line, so should default!*/ 796 "\u003b\u0a15\u000a", /* Back to Gurmukhi*/ 797 :intvector { 0, 1, 2, 3, 3, 3, 5, 7, 8, 9, 13, 15, 16, 17, 18, 19, 20, 21, 24, 25, 26 }, 798 :int{1}, :int{0}, "", "", :bin{ "" } 799 } 800 // Test Gurmukhi (Bindi Tippi and Consonant Cluster) 801 { 802 "iscii-gur", 803 :bin{EF4BB3A2A2B3E8B3C0E9BFE9E8D8}, 804 "\u0a15\u0a70\u0a02\u0a71\u0a15\u0a5c\u0a4d\u0a39\u0a5c\u0a4d\u0a39", 805 :intvector { 2, 3, 4, 5, 5, 8, 8, 8, 10, 12, 13 }, 806 :int{1}, :int{0}, "", ".", :bin{ "" } 807 } 808 { // Verify Supplementary support 809 "Big5-HKSCS", 810 :bin{ fcfcfcfd }, 811 "\U000233E6\U00026DA0", 812 :intvector{ 0, 0, 2, 2 }, 813 :int{1}, :int{0}, "", "?", :bin{""} 814 } 815 { // Verify non-supplementary support 816 "big5-hkscs:unicode3.0", 817 :bin{ fcfcfcfd }, 818 "\uE1D4\uE1D5", 819 :intvector{ 0, 2 }, 820 :int{1}, :int{0}, "", "?", :bin{""} 821 } 822 { // Verify gb18030 enumeration 823 "gb18030", 824 :bin{ 8130D2398130D3308136A5318136A532 }, 825 "\u0450\u0452\u200F\u2011", 826 :intvector{ 0, 4, 8, 12 }, 827 :int{1}, :int{0}, "", "?", :bin{""} 828 } 829 { 830 "x11-compound-text", 831 :bin{ 1b242944b5ac1b2d41a5e31b2d43d5f51b2d4dd01b2d41411b2d43bc1b2d42ff1b2d54df1b2d44c0b31b2d46b41b2d47b01b2d48e01b2d4ca1 }, 832 "\u54A1\u00A5\u00E3\u0120\u0121\u011E\u0041\u0135\u02D9\u0E3F\u0100\u0157\u0384\u0660\u05D0\u0401", 833 :intvector{ }, 834 :int{1}, :int{0}, "", "?", :bin{""} 835 } 836 // Improve coverage of ISCII 837 { 838 "iscii-bng", 839 :bin{ f0a0f0b0b8 }, 840 "", 841 :intvector{}, 842 :int{1}, :int{0}, "", "0", :bin{""} 843 } 844 // Test iso-2022-jp-2 
miscellaneous symbols 845 { 846 "iso-2022-jp-2", 847 :bin{ 1b242843224f224e1b2842 }, 848 "\u260E\u260F", 849 :intvector{ 4, 6 }, 850 :int{1}, :int{0}, "", ".", :bin{""} 851 } 852 853 // Improve Code Coverage for BOCU-1 854 { 855 "BOCU-1", 856 :bin{ 91fbc555fd6349 }, 857 "\u0041\ud841\ud888\udc81", 858 :intvector{ 0, 1, 4, 4 }, 859 :int{1}, :int{0}, "", ".", :bin{""} 860 } 861 { 862 "BOCU-1", 863 :bin{ fbeda44ff0fe189bb821f05926 }, 864 "\ufe88\ufe70\udbff\udfff\u0061", 865 :intvector{}, 866 :int{1}, :int{0}, "", ".", :bin{""} 867 } 868 { 869 "BOCU-1", 870 :bin{ 5b4bccf9 }, 871 "\u000b", 872 :intvector{}, 873 :int{1}, :int{0}, "", "0", :bin{""} 874 } 875 { 876 "BOCU-1", 877 :bin{ fe0053c6 }, 878 "\u0003\u0076", 879 :intvector{}, 880 :int{1}, :int{0}, "", "0", :bin{""} 881 } 882 883 //Improve code coverage for SCSU 884 { 885 "SCSU", 886 :bin{ 0fd899dc7fd888dc99e041424361 }, 887 "\ud899\udc7f\ud888\udc99\u0041\u0042\u0043\u0061", 888 :intvector{}, 889 :int{1}, :int{0}, "", "0", :bin{""} 890 } 891 { 892 "SCSU", 893 :bin{ 41df1281035f10df1b03df1c88800bbfffff }, 894 "\u0041\u00df\u0401\u015f\u00df\u01df\uf000\udbff\udfff", 895 :intvector{}, 896 :int{1}, :int{0}, "", ".", :bin{""} 897 } 898 { 899 "SCSU", 900 :bin{ 1b9a1b541bb2411bfd1b0041 }, 901 "", 902 :intvector{}, 903 :int{1}, :int{0}, "", ".", :bin{""} 904 } 905 { 906 "SCSU", 907 :bin{ 0f6441b413a733f2 }, 908 "\u6441\ub413\ua733", 909 :intvector{}, 910 :int{1}, :int{0}, "illegal", ".", :bin{ f2 } 911 } 912 913 //Improve code coverage for MBCS 914 { 915 "*test5", 916 :bin{ 0506 }, 917 "\ufffd\x06", 918 :intvector{}, 919 :int{1}, :int{0}, "", "?", :bin{""} 920 } 921 { 922 "ibm-1390,swaplfnl", 923 :bin{ 430e4395ecc140400fc1e115 }, 924 "\uff63\u30C8\u30C8\u309A\u3000\x41\u20ac\x0a", 925 :intvector{ 0, 2, 4, 4, 6, 9, 10, 11 }, 926 :int{1}, :int{0}, "", "?", :bin{""} 927 } 928 /* 929 * ICU 4.4 (ticket #7314) removes mappings for CNS 11643 planes 3..7 930 { 931 "ISO-2022-CN-EXT", 932 :bin{ 1b242b4d1b4f66791b242b4d1b4f216a }, 
933 "\u3667", 934 :intvector{ 14 }, 935 :int{1}, :int{0}, "", "0", :bin{""} 936 } 937 */ 938 { 939 "*test1bmp", 940 :bin{ 060a05 }, 941 "\u0066\ufffd\u0065", 942 :intvector{ 0, 2, 2 }, 943 :int{0}, :int{1}, "", "?", :bin{""} 944 } 945 { 946 "*test1bmp", 947 :bin{ 060708 }, 948 "\u0066", 949 :intvector{}, 950 :int{1}, :int{0}, "invalid", ".", :bin{ 07 } 951 } 952 { 953 "*test5", 954 :bin{ 010304 }, 955 "\u0034\ufffd", 956 :intvector{}, 957 :int{1}, :int{0}, "", "?", :bin{""} 958 } 959 { 960 "*test1", 961 :bin{ 0a0b }, 962 "", 963 :intvector{}, 964 :int{1}, :int{0}, "", "0", :bin{""} 965 } 966 { 967 "*test1bmp", 968 :bin{ 0c06 }, 969 "\u0066", 970 :intvector{}, 971 :int{0}, :int{0}, "", ".", :bin{""} 972 } 973 { 974 "*test5", 975 :bin{ 0906 }, 976 "\udbc8\udf45\u0006", 977 :intvector{}, 978 :int{0}, :int{0}, "", ".", :bin{""} 979 } 980 { 981 "ibm-16684", 982 :bin{ 0e }, 983 "", 984 :intvector{}, 985 :int{0}, :int{0}, "illegal", ".", :bin{ 0e } 986 } 987 { 988 "UTF-7", 989 :bin{ 2b414b4d2d492b414b4d4170412d }, 990 "\u00a3I\u00a3\u00a4", 991 :intvector{ 1,5,7,9 }, 992 :int{0}, :int{0}, "", ".", :bin{""} 993 } 994 { 995 "x11-compound-text", 996 :bin{ 1e6ddc9b26bc10801bbcad50a040fc }, 997 "\u001e\u006d\u00dc\u009b\u0026\u00bc\u0010\u0080\ufffd\u00bc\u00ad\u0050\u00a0\u0040\u00fc", 998 :intvector{ }, 999 :int{1}, :int{0}, "", "?", :bin{""} 1000 } 1001 } 1002 } 1003 1004 // --------------------------------------------------------------------- *** 1005 1006 fromUnicode { 1007 Headers { "charset", "unicode", "bytes", "offsets", "flush", "fallbacks", "errorCode", "callback", "invalidUChars" } 1008 Cases { 1009 // Test ticket 9602: Add "good one-way" mapping type (|4). 1010 // Such mappings are used regardless of the fallback flag. 
1011 { 1012 "+*test3", "##\uFE0E#\uFE0F", 1013 :bin{ 010204010204010204 }, :intvector{ 0,0,0,1,1,1,3,3,3 }, 1014 :int{1}, :int{0}, // no fallbacks 1015 "", "?", "" 1016 } 1017 { 1018 "+*test3", "##\uFE0E#\uFE0F", 1019 :bin{ 010204010204010204 }, :intvector{ 0,0,0,1,1,1,3,3,3 }, 1020 :int{1}, :int{1}, // with fallbacks 1021 "", "?", "" 1022 } 1023 // Test ticket 6789: implement Java-compatible Unicode, UnicodeBig and UnicodeLittle converters 1024 // For details about these encodings see convrtrs.txt. 1025 // Standard UTF-16BE 1026 { "UTF-16BE", "a", :bin{ 0061 }, :intvector{ 0,0 }, :int{1}, :int{0}, "", "?", "" } 1027 // Java "UnicodeBig" writes a BOM 1028 { "UnicodeBig", "a", :bin{ feff0061 }, :intvector{ -1,-1,0,0 }, :int{1}, :int{0}, "", "?", "" } 1029 // Standard UTF-16LE 1030 { "UTF-16LE", "a", :bin{ 6100 }, :intvector{ 0,0 }, :int{1}, :int{0}, "", "?", "" } 1031 // Java "UnicodeLittle" writes a BOM 1032 { "UnicodeLittle", "a", :bin{ fffe6100 }, :intvector{ -1,-1,0,0 }, :int{1}, :int{0}, "", "?", "" } 1033 1034 // Test ticket 7704: implement Java-compatible "UTF-16" converter. 1035 // Same as standard UTF-16 but fromUnicode always writes big-endian byte stream. 1036 { "+UTF-16,version=2", "a", :bin{ feff0061 }, :intvector{ -1,-1,0,0 }, :int{1}, :int{0}, "", "?", "" } 1037 1038 // Test bug 6071 (1:2 Unicode:charset SBCS mapping). 
1039 { 1040 "*test1bmp", 1041 "e@t", 1042 :bin{ 05000709 }, 1043 :intvector{ 0,1,2,2 }, 1044 :int{1}, :int{0}, "", "?", "" 1045 } 1046 // improve coverage of ISO-2022-JP converter with hardcoded JIS X 0201 and 1047 // using the Shift-JIS table for JIS X 0208 (ticket #5797) 1048 { 1049 "ISO-2022-JP", 1050 "\u203e\xa5\u4e00\ufa10\u6f3e\u0391", 1051 :bin{ 1b284a7e5c1b2442306c222e5f2126211b2842 }, 1052 :intvector{ 0,0,0,0,1,2,2,2,2,2,3,3,4,4,5,5,5,5,5 }, 1053 :int{1}, :int{0}, "", "?=\u3013", "" // U+3013 Geta Mark converts to 222e 1054 } 1055 // Verify that mappings that would result in byte values outside 20..7F (for SBCS) 1056 // or 21..7E (for DBCS) are not used. 1057 // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46): 1058 // <U009F> \x9F |0 (also in ISO 8859-1) 1059 // <U0387> \xB7 |1 1060 // windows-949-2000 (KSC_5601, <ESC>$(C=1b242843): 1061 // <UC829> \xA0\xA1 |0 1062 // <UD4FE> \xC0\x41 |0 1063 // <UD79D> \xC8\xFE |0 1064 { 1065 "JIS8", // =ISO_2022,locale=ja,version=4 1066 "\u009f\u0387\uc829\ud4fe\ud79d", 1067 :bin{ 1a1b2e461b4e371a1a1b242843487e1b2842 }, 1068 :intvector{ 0,1,1,1,1,1,1,2,3,4,4,4,4,4,4,4,4,4 }, 1069 :int{1}, :int{1}, "", "?", "" 1070 } 1071 // Ticket 5483: ISO 2022 converter incorrectly using fallback mapping 1072 // Verify that a roundtrip mapping is used even when a fallback mapping is 1073 // available in the current state. 
1074 // U+FF61 is handled in code 1075 // jisx-208.ucm (<ESC>$B=1b2442): 1076 // <U30FE> \x21\x34 |0 1077 // <UFF5D> \x21\x51 |0 and 1078 // ibm-897_P100-1995.ucm (JIS X 0201, <ESC>(J=1b284a): 1079 // <UFF5D> \x7D |1 1080 // ibm-9005_X110-2007.ucm (ISO 8859-7, <ESC>.F=1b2e46): 1081 // <U03D5> \xF6 |1 1082 // <U2015> \xAF |0 1083 // <UFF5D> \x7D |1 (not legal for ISO 2022) 1084 // windows-949-2000 (KSC_5601, <ESC>$(C=1b242843): 1085 // <UAC00> \xB0\xA1 |0 1086 // <UFF5D> \xA3\xFD |0 1087 // <U223C> \xA1\xAD |0 (in extension table) 1088 { 1089 "JIS8", // =ISO_2022,locale=ja,version=4 1090 "a\uff61\u03d5\uff5d\uac00\u223c\uff5d\u30fe\uff5d", // Make it switch to ISO-8859-7, KSC 5601 and JIS X 0208. 1091 :bin{ 61a11b2e461b4e761b244221511b2428433021212d237d1b2442213421511b2842 }, 1092 :intvector{ 0,1,2,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,4,5,5,6,6,7,7,7,7,7,8,8,8,8,8 }, 1093 :int{1}, :int{1}, "", "?", "" 1094 } 1095 // Code coverage for UTF-8->SBCS conversion (ucnv_convertEx()). 1096 // Test code path for non-roundtripping ASCII characters 1097 // (try EBCDIC SBCS, and IBM PC SBCS with control code rotation). 1098 { 1099 "ibm-37", 1100 "a\x85c", 1101 :bin{ 811583 }, 1102 :intvector{ 0,1,2 }, 1103 :int{1}, :int{0}, "", "?", "" 1104 } 1105 { 1106 "ibm-850", 1107 "a\x1ac", 1108 :bin{ 617f63 }, 1109 :intvector{ 0,1,2 }, 1110 :int{1}, :int{0}, "", "?", "" 1111 } 1112 // Code coverage for UTF-8->DBCS conversion (ucnv_convertEx()). 1113 // Test code path for non-roundtripping ASCII characters 1114 // (try IBM PC DBCS with control code rotation). 1115 { 1116 "ibm-943", 1117 "a\x1ac\u30a1\x7ff", 1118 :bin{ 617f6383401c66 }, 1119 :intvector{ 0,1,2,3,3,4,5 }, 1120 :int{1}, :int{0}, "", "?", "" 1121 } 1122 // SCSU regression test. 
1123 { 1124 "SCSU", 1125 "1\U00010001\u0085\U000500022\ud8003\udc014\ue001", 1126 :bin{ 310be0008102050fd900dc02e7320efffd330efffd34186881 }, 1127 :intvector{ 0,1,1,1,1,3,3,4,4,4,4,4,6,6,7,7,7,8,9,9,9,10,11,11,11 }, 1128 :int{1}, :int{0}, "", "?", "" 1129 } 1130 // LMBCS 1131 { 1132 "lmbcs", 1133 "\u0061\u2013\u00fc\u00e1\u0007\u0091\u0a74\u0200\ud84d\udc56\u5516", 1134 :bin { 61039681a00f270f91140a7414f60214d84d14dc561088a0 }, 1135 :intvector{ 0, 1, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10 }, 1136 :int{1}, :int{0}, "", ".", "" 1137 } 1138 // Test substitution strings. 1139 { 1140 "windows-1252", // stateless MBCS with WriteSub(), should internally set char * 1141 "a\ufdd0c", 1142 :bin{ 61402421402463 }, 1143 :intvector{ 0,1,1,1,1,1,2 }, 1144 :int{1}, :int{0}, "", "?=@$!@$", "" 1145 } 1146 { 1147 "windows-1252", 1148 "1\U00010001\u0085\U000500022\ud8003\udc014\ue001", 1149 :bin{ 311a1a1a321a331a341a }, 1150 :intvector{ 0,1,3,4,6,7,8,9,10,11 }, 1151 :int{1}, :int{0}, "", "?", "" 1152 } 1153 { 1154 "windows-1252", 1155 "\uD87E", // lone surrogate can cause an offset overflow 1156 :bin{ 1a }, 1157 :intvector{ 0 }, 1158 :int{1}, :int{0}, "", "?", "" 1159 } 1160 { 1161 "windows-1252", 1162 "\uD87E", // lone surrogate can cause an offset overflow 1163 :bin{ 6875683f }, 1164 :intvector{ 0,0,0,0 }, 1165 :int{1}, :int{0}, "", "?=huh?", "" // Use a long substitution character 1166 } 1167 { 1168 "*test4", 1169 "\u30ab", // An incomplete multi-codepoint character 1170 :bin{ ff }, 1171 :intvector{ 0 }, 1172 :int{1}, :int{0}, "", "?", "" 1173 } 1174 { 1175 "ibm-930", // stateful MBCS 1176 "a\ufdd0\u4e00\ufdd0e", 1177 :bin{ 620e4bce0f400e45414bce0f4066 }, 1178 :intvector{ 0,1,1,1,1,1,2,2,2,3,3,3,3,4 }, 1179 :int{1}, :int{0}, "", "?=\u4e01 ", "" 1180 } 1181 { 1182 "iso-2022-jp", 1183 "a\x1bc", // Unicode ESC must not occur as a character 1184 :bin{ 6163 }, 1185 :intvector{ 0,2 }, 1186 :int{1}, :int{0}, "", "?=", "" // empty substitution string 1187 } 
1188 { 1189 "iso-2022-cn", 1190 "a\x1bc", // Unicode ESC must not occur as a character 1191 :bin{ 61202063 }, 1192 :intvector{ 0,1,1,2 }, 1193 :int{1}, :int{0}, "", "?= ", "" 1194 } 1195 { 1196 "iso-2022-cn", 1197 "a\x1bc", // Unicode ESC must not occur as a character 1198 :bin{ 611b2429410e523b0f2063 }, 1199 :intvector{ 0,1,1,1,1,1,1,1,1,1,2 }, 1200 :int{1}, :int{0}, "", "?=\u4e00 ", "" 1201 } 1202 { 1203 "us-ascii", 1204 "a\x85c", 1205 :bin{ 61402421402463 }, 1206 :intvector{ 0,1,1,1,1,1,2 }, 1207 :int{1}, :int{0}, "", "?=@$!@$", "" 1208 } 1209 // ISO 2022-CN: test a single-byte subchar, j5171 1210 { 1211 "iso-2022-cn", 1212 "a\x1bc", // Unicode ESC must not occur as a character 1213 :bin{ 612163 }, 1214 :intvector{ 0,1,2 }, 1215 :int{1}, :int{0}, "", "?\x00\x21", "" 1216 } 1217 // UTF-16/32: do not output a BOM if there is no data at all 1218 { 1219 "UTF-16", 1220 "", 1221 :bin{ "" }, 1222 :intvector{ }, 1223 :int{1}, :int{1}, "", "?", "" 1224 } 1225 { 1226 "UTF-32", 1227 "", 1228 :bin{ "" }, 1229 :intvector{ }, 1230 :int{1}, :int{1}, "", "?", "" 1231 } 1232 1233 // do not convert SO/SI/ESC 1234 { 1235 "iso-2022-jp", 1236 "A\x0eB\x0f\x09\x1bC", 1237 :bin{ 411a421a091a43 }, 1238 :intvector{ 0,1,2,3,4,5,6 }, 1239 :int{1}, :int{1}, "", "?", "" 1240 } 1241 { 1242 "iso-2022-cn", 1243 "A\x0eB\x0f\x09\x1bC", 1244 :bin{ 411a421a091a43 }, 1245 :intvector{ 0,1,2,3,4,5,6 }, 1246 :int{1}, :int{1}, "", "?", "" 1247 } 1248 { 1249 "iso-2022-kr", 1250 "A\x0eB\x0f\x09\x1bC", 1251 :bin{ 1b242943411a421a091a43 }, 1252 :intvector{ -1,-1,-1,-1,0,1,2,3,4,5,6 }, 1253 :int{1}, :int{1}, "", "?", "" 1254 } 1255 { 1256 "ibm-25546", 1257 "A\x0eB\x0f\x09\x1bC", 1258 :bin{ 1b242943411a421a091a43 }, 1259 :intvector{ -1,-1,-1,-1,0,1,2,3,4,5,6 }, 1260 :int{1}, :int{1}, "", "?", "" 1261 } 1262 1263 // test ISO 8859-1/7 vs. 
JIS X 0201 1264 { 1265 "ISO-2022-JP-2", 1266 "?@A\u00e4\u03b1\u203EB", 1267 :bin{ 3f40411B2E411B4E641b244226411b284a7e421b2842 }, 1268 :intvector{ 0,1,2,3,3,3,3,3,3,4,4,4,4,4,5,5,5,5,6,6,6,6 }, 1269 :int{1}, :int{1}, "", "?", "" 1270 } 1271 1272 // Improve ucnv_ext.c code coverage: 1273 // There will be a partial match up to the lead surrogate of U+603ff 1274 // which then results in one more unit in the prefetch buffer 1275 // than the match length when converting one code unit at a time. 1276 // See ucnv_extContinueMatchFromU() comment 1277 // "the match did not use all of preFromU[] - keep the rest for replay" 1278 { 1279 "*test3", 1280 "\U00101234\U00101234\U00050005\U000603ff", 1281 :bin{ 07070001020e05ff }, 1282 :intvector{ 0, 2, 2, 2, 2, 2, 2, 6 }, 1283 :int{1}, :int{0}, "", "?", "" 1284 } 1285 1286 // test that ISO-2022-JP encodes ASCII as itself 1287 { 1288 "ISO-2022-JP", 1289 "?@ABCDEFG", 1290 :bin{ 3f4041424344454647 }, 1291 :intvector{ 0,1,2,3,4,5,6,7,8 }, 1292 :int{1}, :int{1}, "", "?", "" 1293 } 1294 // test that ISO-2022-CN encodes ASCII as itself 1295 { 1296 "ISO-2022-CN", 1297 "?@ABCDEFG", 1298 :bin{ 3f4041424344454647 }, 1299 :intvector{ 0,1,2,3,4,5,6,7,8 }, 1300 :int{1}, :int{1}, "", "?", "" 1301 } 1302 1303 // moved from cintltst /tsconv/nccbtst/TestSkipCallBack 1304 { 1305 "iso-2022-jp", 1306 "\u3000\xe9\u3001", 1307 :bin{ 1b2442212121221b2842 }, 1308 :intvector{ 0,0,0,0,0,2,2,2,2,2 }, 1309 :int{1}, :int{1}, "", "0", "" 1310 } 1311 // moved from cintltst /tsconv/nccbtst/TestSubCallBack 1312 { 1313 "iso-2022-jp", 1314 "A\xe9B\xe9\u3000", 1315 :bin{ 411a421a1b244221211b2842 }, 1316 :intvector{ 0,1,2,3,4,4,4,4,4,4,4,4 }, 1317 :int{1}, :int{1}, "", "?", "" 1318 } 1319 // moved from cintltst /tsconv/nccbtst/TestSubWithValueCallBack 1320 { 1321 "iso-2022-jp", 1322 "A\xe9B\xe9\u3000", 1323 :bin{ 41255530304539422555303045391b244221211b2842 }, 1324 :intvector{ 0,1,1,1,1,1,1,2,3,3,3,3,3,3,4,4,4,4,4,4,4,4 }, 1325 :int{1}, :int{1}, "", "&", "" 1326 } 1327 
{ 1328 "iso-2022-cn", 1329 "\u4e00\u3712\u4e01", 1330 :bin{ 1b2429410e523b0f2555333731320e36210f }, 1331 :intvector{ 0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2 }, 1332 :int{1}, :int{1}, "", "&", "" 1333 } 1334 { 1335 "iso-2022-cn", 1336 "A\u3712\u4e00", 1337 :bin{ 412555333731321b2429410e523b0f }, 1338 :intvector{ 0,1,1,1,1,1,1,2,2,2,2,2,2,2,2 }, 1339 :int{1}, :int{1}, "", "&", "" 1340 } 1341 { 1342 "iso-2022-cn", 1343 "\u3000\u3712\u3001", 1344 :bin{ 1b2429410e21210f2555333731320e21220f }, 1345 :intvector{ 0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2 }, 1346 :int{1}, :int{1}, "", "&", "" 1347 } 1348 1349 // moved from cintltst /tsconv/nucnvtst/TestJIS 1350 { 1351 "JIS", 1352 "\uFF81\uFF82\u30EC\u30ED\u30EE\u30EF\uFF93\uFF94\uFF95\uFF96\uFF97\uFF98", 1353 :bin{ 1b244225412544256c256d256e256F25622564256625682569256a1b2842 }, 1354 :intvector{ 0,0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,11,11,11 }, 1355 :int{1}, :int{1}, "", "?", "" 1356 } 1357 { 1358 "JIS7", 1359 "\uFF81\uFF82\u30EC\u30ED\u30EE\u30EF\uFF93\uFF94\uFF95\uFF96\uFF97\uFF98", 1360 :bin{ 0e41420f1b2442256c256d256e256F0e5354555657580f1b2842 }, 1361 :intvector{ 0,0,1,2,2,2,2,2,2,3,3,4,4,5,5,6,6,7,8,9,10,11,11,11,11,11 }, 1362 :int{1}, :int{1}, "", "?", "" 1363 } 1364 { 1365 "JIS8", 1366 "\uFF81\uFF82\u30EC\u30ED\u30EE\u30EF\uFF93\uFF94\uFF95\uFF96\uFF97\uFF98", 1367 :bin{ C1C21b2442256c256d256e256F1b284AD3D4D5D6D7D81b2842 }, 1368 :intvector{ 0,1,2,2,2,2,2,3,3,4,4,5,5,6,6,6,6,7,8,9,10,11,11,11,11 }, 1369 :int{1}, :int{1}, "", "?", "" 1370 } 1371 { 1372 "JIS8", 1373 "\u2019Aaa10\u4ED5\u5165\u5148\u30B3", 1374 :bin{ 1b244221471b284241616131301b24423b45467e406825331b2842 }, 1375 :intvector{ 0,0,0,0,0,1,1,1,1,2,3,4,5,6,6,6,6,6,7,7,8,8,9,9,9,9,9 }, 1376 :int{1}, :int{1}, "", "?", "" 1377 } 1378 1379 // moved from cintltst /tsconv/ncnvtst/TestErrorBehaviour 1380 { 1381 "iso-2022-jp", 1382 "\u3000\x50\udc01\u3001", 1383 :bin{ 1B244221211B2842501A1B24422122 }, 1384 :intvector{ 0,0,0,0,0,1,1,1,1,2,3,3,3,3,3 }, 1385 
:int{0}, :int{1}, "", "?", "\udc01" 1386 } 1387 { 1388 "iso-2022-jp", 1389 "\u3000\x50\udc01\u3001", 1390 :bin{ 1B244221211B2842501A1B244221221b2842 }, 1391 :intvector{ 0,0,0,0,0,1,1,1,1,2,3,3,3,3,3,3,3,3 }, 1392 :int{1}, :int{1}, "", "?", "" 1393 } 1394 { 1395 "iso-2022-kr", 1396 "\x61\u4e00\udc01\u4e00", 1397 :bin{ 1b242943610e6c690f1a0e6c69 }, 1398 :intvector{ -1,-1,-1,-1,0,1,1,1,2,2,3,3,3 }, 1399 :int{0}, :int{1}, "", "?", "\udc01" 1400 } 1401 { 1402 "iso-2022-kr", 1403 "\x61\u4e00\udc01\u4e00", 1404 :bin{ 1b242943610e6c690f1a0e6c690f }, 1405 :intvector{ -1,-1,-1,-1,0,1,1,1,2,2,3,3,3,3 }, 1406 :int{1}, :int{1}, "", "?", "" 1407 } 1408 1409 // ISO-2022-KR 1410 { 1411 "ibm-25546", 1412 "AB\uc88b\U00050005\uacccC", 1413 :bin{ 1b24294341420e41410f7b552b35303030357d0e306a0f43 }, 1414 :intvector{ -1,-1,-1,-1,0,1,2,2,2,3,3,3,3,3,3,3,3,3,3,5,5,5,6,6 }, 1415 :int{1}, :int{1}, "", "&U", "" 1416 } 1417 { 1418 "ibm-25546", 1419 "AB\uc88b\U00050005\uacccC", 1420 :bin{ 1b24294341420e41410f1a0e306a0f43 }, 1421 :intvector{ -1,-1,-1,-1,0,1,2,2,2,3,3,5,5,5,6,6 }, 1422 :int{1}, :int{1}, "", "?\x00\x1a", "" 1423 } 1424 { 1425 "ibm-25546", 1426 "AB\uc88b\U00050005\uacccC", 1427 :bin{ 1b24294341420e41412f7e306a0f43 }, 1428 :intvector{ -1,-1,-1,-1,0,1,2,2,2,3,3,5,5,6,6 }, 1429 :int{1}, :int{1}, "", "?", "" 1430 } 1431 { 1432 "ibm-25546", 1433 "AB\uc88b\U00050005\uaccc", 1434 :bin{ 1b24294341420e41412f7e306a0f }, 1435 :intvector{ -1,-1,-1,-1,0,1,2,2,2,3,3,5,5,5 }, 1436 :int{1}, :int{1}, "", "?", "" 1437 } 1438 { 1439 "ISO-2022-KR", 1440 "AB\uc88b\U00050005\uacccC", 1441 :bin{ 1b24294341420e41410f7b552b35303030357d0e306a0f43 }, 1442 :intvector{ -1,-1,-1,-1,0,1,2,2,2,3,3,3,3,3,3,3,3,3,3,5,5,5,6,6 }, 1443 :int{1}, :int{1}, "", "&U", "" 1444 } 1445 { 1446 "ISO-2022-KR", 1447 "AB\uc88b\U00050005\uacccC", 1448 :bin{ 1b24294341420e41410f1a0e306a0f43 }, 1449 :intvector{ -1,-1,-1,-1,0,1,2,2,2,3,3,5,5,5,6,6 }, 1450 :int{1}, :int{1}, "", "?", "" 1451 } 1452 { 1453 "ISO-2022-KR", 1454 
"AB\uc88b\U00050005\uacccC", 1455 :bin{ 1b24294341420e41412f7e306a0f43 }, 1456 :intvector{ -1,-1,-1,-1,0,1,2,2,2,3,3,5,5,6,6 }, 1457 :int{1}, :int{1}, "", "?\x00\x2f\x7e", "" 1458 } 1459 { 1460 "ISO-2022-KR", 1461 "AB\uc88b\U00050005\uaccc", 1462 :bin{ 1b24294341420e41412f7e306a0f }, 1463 :intvector{ -1,-1,-1,-1,0,1,2,2,2,3,3,5,5,5 }, 1464 :int{1}, :int{1}, "", "?\x00\x2f\x7e", "" 1465 } 1466 // ISO-2022-KR 1467 { 1468 "ibm-25546", 1469 "AB\uc88b\U00050005\uacccC", 1470 :bin{ 1b24294341420e41410f5c3530303035200e306a0f43 }, 1471 :intvector{ -1,-1,-1,-1,0,1,2,2,2,3,3,3,3,3,3,3,3,5,5,5,6,6 }, 1472 :int{1}, :int{1}, "", "&S", "" 1473 } 1474 1475 // ISO-2022-JP-2 with G2 designator & SS2 shift 1476 { 1477 "ISO-2022-JP-2", 1478 "CF\u758f\u038f\u7591", 1479 :bin{ 43461b244241411b2e461b4e3f353f1b2842 }, 1480 :intvector{ 0,1,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4 }, 1481 :int{1}, :int{1}, "", ".", "" 1482 } 1483 // JIS7 with Katakana 1484 { 1485 "JIS7", 1486 "AB\uff81\uff82AB", 1487 :bin{ 41420e41420f4142 }, 1488 :intvector{ 0,1,2,2,3,4,4,5 }, 1489 :int{1}, :int{1}, "", ".", "" 1490 } 1491 // JIS7 with shift to ASCII at the very end 1492 { 1493 "JIS7", 1494 "AB\uff81\uff82", 1495 :bin{ 41420e41420f }, 1496 :intvector{ 0,1,2,2,3,3 }, 1497 :int{1}, :int{1}, "", ".", "" 1498 } 1499 // JIS8 with Katakana 1500 { 1501 "JIS8", 1502 "A\uff81\\\xa5\uff82B", 1503 :bin{ 41c15c1b284a5cc2421b2842 }, 1504 :intvector{ 0,1,2,3,3,3,3,4,5,5,5,5 }, 1505 :int{1}, :int{1}, "", ".", "" 1506 } 1507 1508 /* 1509 * ICU 4.4 (ticket #7314) removes mappings for CNS 11643 planes 3..7 1510 // ISO-2022-CN-EXT with all subcharsets and shifts and with supplementary code points 1511 { 1512 "ISO-2022-CN-EXT", 1513 "\u4eae\u9f82\u56cd\u56cc\U0002a6d6\x30", 1514 :bin{ 1b2429410e41411b2429457e7c1b242a481b4e70341b2429477c341b242b4d1b4f664c0f30 }, 1515 :intvector{ 0,0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,4,4,6,6 }, 1516 :int{1}, :int{1}, "", ".", "" 1517 } 1518 // ISO-2022-CN-EXT with shift to 
ASCII at the very end 1519 { 1520 "ISO-2022-CN-EXT", 1521 "\u4eae\u9f82\u56cd\u56cc\U0002a6d6", 1522 :bin{ 1b2429410e41411b2429457e7c1b242a481b4e70341b2429477c341b242b4d1b4f664c0f }, 1523 :intvector{ 0,0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,4,4,4 }, 1524 :int{1}, :int{1}, "", ".", "" 1525 } 1526 // ISO-2022-CN-EXT without flush so do not shift to ASCII at the very end 1527 { 1528 "ISO-2022-CN-EXT", 1529 "\u4eae\u9f82\u56cd\u56cc\U0002a6d6", 1530 :bin{ 1b2429410e41411b2429457e7c1b242a481b4e70341b2429477c341b242b4d1b4f664c }, 1531 :intvector{ 0,0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,4,4,4,4,4,4,4,4 }, 1532 :int{0}, :int{1}, "", ".", "" 1533 } 1534 */ 1535 1536 // windows-936 vs. ibm-1386 1537 { 1538 "ibm-1386", 1539 "\x1a\u20ac\u5555\x80\x81\U00055555", 1540 :bin{ 7fa2e3dffb7f7fa1a1 }, 1541 :intvector{ 0, 1, 1, 2, 2, 3, 4, 5, 5 }, 1542 :int{1}, :int{1}, "", "?", "" 1543 } 1544 { 1545 "windows-936", 1546 "\x1a\u20ac\u5555\x80\x81\U00055555", 1547 :bin{ 1a80dffb3f3f3f }, 1548 :intvector{ 0, 1, 2, 2, 3, 4, 5 }, 1549 :int{1}, :int{1}, "", "?", "" 1550 } 1551 1552 // verify that if a conversion table does not have any mapping for U+0000, 1553 // then there will not even be a phantom fallback to 00 1554 { 1555 "ibm-971", 1556 "\x00", 1557 :bin{ affe }, 1558 :intvector{ 0, 0 }, 1559 :int{1}, :int{1}, "", "?", "" 1560 } 1561 1562 { 1563 "*test4", 1564 "\x00", 1565 :bin{ ff }, 1566 :intvector{ 0 }, 1567 :int{1}, :int{1}, "", "?", "" 1568 } 1569 1570 // extension in testdata 1571 { 1572 "*test4x", 1573 "\u20ac\x09", 1574 :bin{ 0009 }, 1575 :intvector{ 0, 1 }, 1576 :int{1}, :int{1}, "", "?", "" 1577 } 1578 1579 // DBCS-only extensions 1580 { 1581 "ibm-970", 1582 "\x61\uffa1\u2015\ub000", 1583 :bin{ 611aa1aab2eb }, 1584 :intvector{ 0, 1, 2, 2, 3, 3 }, 1585 :int{1}, :int{1}, "", "?", "" 1586 } 1587 1588 { 1589 "ibm-971", 1590 "\x61\uffa1\u2015\ub000", 1591 :bin{ affeaffeaffeb2eb }, 1592 :intvector{ 0, 0, 1, 1, 2, 2, 3, 3 }, 1593 :int{1}, 
:int{1}, "", "?", "" 1594 } 1595 1596 { 1597 "ibm-1390,swaplfnl", 1598 "\uff63\u30C8\u30C8\u309A\u3000\x41\u20ac\x0a", 1599 :bin{ 430e4395ecc140400fc1e115 }, 1600 :intvector{ 0, 1, 1, 1, 2, 2, 4, 4, 5, 5, 6, 7 }, 1601 :int{1}, :int{0}, "", "?", "" 1602 } 1603 1604 { 1605 "ibm-16684", 1606 "\uff63\u30C8\u30C8\u309A\u3000\x41\u20ac\x0a", 1607 :bin{ fefe4395ecc14040fefe42e1fefe }, 1608 :intvector{ 0, 0, 1, 1, 2, 2, 4, 4, 5, 5, 6, 6, 7, 7 }, 1609 :int{1}, :int{0}, "", "?", "" 1610 } 1611 1612 { 1613 "ibm-1399", 1614 "\uff63\u30C8\u30C8\u309A\u3000\x41\u20ac\x0a", 1615 :bin{ 440e4395ecc140400fc1e125 }, 1616 :intvector{ 0, 1, 1, 1, 2, 2, 4, 4, 5, 5, 6, 7 }, 1617 :int{1}, :int{0}, "", "?", "" 1618 } 1619 1620 // <subchar1> from |2 mappings 1621 { 1622 "ibm-1390", 1623 "\x0e\x0f\u0901\U00050000\uffe8\uffee", 1624 :bin{ 3f3f0efefefefe0f3f3f }, 1625 :intvector{ 0, 1, 2, 2, 2, 3, 3, 5, 5, 6 }, 1626 :int{1}, :int{1}, "", "?", "" 1627 } 1628 1629 // <subchar1> from |2 mappings, and also contains a fallback to 00 1630 { 1631 "*test4", 1632 "\u20ac\u20ad\U00050005\U00023456\U0010ffff\x30", 1633 :bin{ 0000e10102030affff }, 1634 :intvector{ 0, 1, 2, 4, 4, 4, 4, 6, 8 }, 1635 :int{1}, :int{1}, "", "?", "" 1636 } 1637 1638 // setting a <subchar> resets the <subchar1> 1639 { 1640 "*test4", 1641 "\u20ac\u20ad\U00050005\U00023456\U0010ffff\x30", 1642 :bin{ 00000102030f0102030a0102030f0102030f }, 1643 :intvector{ 0, 1, 2, 2, 2, 2, 4, 4, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8 }, 1644 :int{1}, :int{1}, "", "?\x00\x01\x02\x03\x0f", "" 1645 } 1646 1647 // fallback to 00 with old single-byte data structure 1648 { 1649 "*test1", 1650 "\u20ac\u20ad\U00101234\U00050000", 1651 :bin{ 000007ff }, 1652 :intvector{ 0, 1, 2, 4 }, 1653 :int{1}, :int{1}, "", "?", "" 1654 } 1655 1656 // extensions 1657 { 1658 "ibm-1390", 1659 "\u025a\u025a\u0300\u025a\u0301\u025a\u0302\uffe8\U0002a0f9", 1660 :bin{ 0ed896eccaeccbd896ea530f3f0eb7c20f }, 1661 :intvector{ 0, 0, 0, 1, 1, 3, 3, 5, 5, 6, 6, 7, 7, 8, 8, 8, 8 }, 1662 
:int{1}, :int{0}, "", "?", "" 1663 } 1664 1665 { 1666 "*test3", 1667 "\xc4\xc4\xc4\U00101234\xc4\xc4\U00101234\x05", 1668 :bin{ ffffff070501020c }, 1669 :intvector{ 0, 1, 2, 3, 5, 5, 5, 5 }, 1670 :int{1}, :int{0}, "", "?", "" 1671 } 1672 1673 { 1674 "*test3", 1675 "\U00101234\U00101234\U00050005\U00101234\U00050005\U00060006", 1676 :bin{ 07070001020e05070001020f09 }, 1677 :intvector{ 0, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6, 6 }, 1678 :int{1}, :int{0}, "", "?", "" 1679 } 1680 1681 // normal conversions 1682 { 1683 "UTF-16LE", 1684 "1\U00010001\U000500022\ud8003\udc014", 1685 :bin{ 310000d801dc00d902dc3200fdff3300fdff3400 }, 1686 :intvector{ 0, 0, 1, 1, 1, 1, 3, 3, 3, 3, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9 }, 1687 :int{1}, :int{0}, "", "?", "" 1688 } 1689 { "UTF-16LE", "\ud800", :bin{""}, :intvector{}, :int{1}, :int{0}, "truncated", ".", "\ud800" } 1690 1691 { 1692 "UTF-16BE", 1693 "1\U00010001\U000500022\ud8003\udc014", 1694 :bin{ 0031d800dc01d900dc020032fffd0033fffd0034 }, 1695 :intvector{ 0, 0, 1, 1, 1, 1, 3, 3, 3, 3, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9 }, 1696 :int{1}, :int{0}, "", "?", "" 1697 } 1698 { "UTF-16BE", "\ud800", :bin{""}, :intvector{}, :int{1}, :int{0}, "truncated", ".", "\ud800" } 1699 1700 { 1701 "SCSU", 1702 "1\U00010001\u00082\ud8003\udc014\ue001", 1703 :bin{ 310be000810108320efffd330efffd34186881 }, 1704 :intvector{ 0, 1, 1, 1, 1, 3, 3, 4, 5, 5, 5, 6, 7, 7, 7, 8, 9, 9, 9 }, 1705 :int{1}, :int{0}, "", "?", "" 1706 } 1707 { 1708 "x11-compound-text", 1709 "\u54A1\u00A5\u00E3\u0120\u0121\u011E\u0041\u0135\u02D9\u0E3F\u0100\u0157\u0384\u0660\u05D0\u0401", 1710 :bin{ 1b242944b5ac1b2d41a5e31b2d43d5f51b2d4dd01b2d41411b2d43bc1b2d42ff1b2d54df1b2d44c0b31b2d46b41b2d47b01b2d48e01b2d4ca1 }, 1711 :intvector{ }, 1712 :int{1}, :int{0}, "", "?", "" 1713 } 1714 // Test Gurmukhi (Bindi Tippi and Consonant clusters) 1715 { 1716 "iscii-gur", 1717 "\u0a15\u0a70\u0a02\u0a71\u0a15\u0a5c\u0a4d\u0a39\u000a\u0043\u0041\u000a", 1718 :bin { ef4bb3a2a2b3e8b3bfe9e8d80aef4b43410a }, 1719 
:intvector{ 0, 0, 0, 1, 2, 3, 3, 3, 5, 5, 6, 7, 8, 8, 8, 9, 10, 11 }, 1720 :int{1}, :int{0}, "", ".", "" 1721 } 1722 // escape callback 1723 { 1724 "iscii-dev", 1725 "A\u0901\U00023456\u0902B\U00023456C", 1726 :bin{ 41ef42a1255544383444255544433536a24225554438344425554443353643 }, 1727 :intvector{ 1728 0, 1729 1,1,1, 1730 2,2,2,2,2,2, 1731 2,2,2,2,2,2, 1732 4, 1733 5, 1734 6,6,6,6,6,6, 1735 6,6,6,6,6,6, 1736 8 1737 }, 1738 :int{1}, :int{0}, "", "&", "" 1739 } 1740 1741 // escape callback (hex) 1742 { 1743 "iso-2022-jp", 1744 "\u3000\U00023456\u3001\U00023456B\u901c", 1745 :bin{ 1b244221211b284226237832333435363b1b244221221b284226237832333435363b42262378393031433b }, 1746 :intvector{ 1747 0,0,0,0,0, 1748 1,1,1,1,1,1,1,1,1,1,1,1, 1749 3,3,3,3,3, 1750 4,4,4,4,4,4,4,4,4,4,4,4, 1751 6, 1752 7,7,7,7,7,7,7,7 1753 }, 1754 :int{1}, :int{0}, "", "&X", "" 1755 } 1756 1757 // sub callback 1758 { 1759 "gb18030", 1760 "$\x7f\x80\u01f9\u20ac\u4e00\u9fa6\uffff\U00010000\U0010ffff", 1761 :bin{ 247f81308130a8bfa2e3d2bb82358f338431a43990308130e3329a35 }, 1762 :intvector{ 0, 1, 2, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 10, 10, 10, 10 }, 1763 :int{1}, :int{0}, "", "?", "" 1764 } 1765 1766 // skip callback 1767 { "ibm-930", "\u6D63\u6D64\u6D65\u6D66", :bin{ 0e5d5f5d63466b0f }, :intvector{ 0, 0, 0, 1, 1, 3, 3, 3 }, :int{1}, :int{0}, "", "0", "" } 1768 { "ibm-930", "\u6D63\u6D64\ud89a\u6D66", :bin{ 0e5d5f5d63466b0f }, :intvector{ 0, 0, 0, 1, 1, 3, 3, 3 }, :int{1}, :int{0}, "", "0", "" } 1769 { "ibm-930", "\u6D63\u6D64\ud89a\u6D66", :bin{ 0e5d5f5d63 }, :intvector{ 0, 0, 0, 1, 1 }, :int{1}, :int{0}, "illegal", "0i", "\ud89a" } 1770 1771 // sub callback for supplementary code point 1772 { "LATIN1", "1\U000104012", :bin{ 311a32 }, :intvector{ 0, 1, 3 }, :int{1}, :int{0}, "", "", "" } 1773 { "ibm-920", "1\U000104012", :bin{ 311a32 }, :intvector{ 0, 1, 3 }, :int{1}, :int{0}, "", "", "" } 1774 1775 // sub callback with AA as subchar 1776 { "ibm-920", "1\U000104012", :bin{ 
31AA32 }, :intvector{ 0, 1, 3 }, :int{1}, :int{0}, "", "?\x00\xAA", "" } 1777 1778 // same but not flushing 1779 { "LATIN1", "1\U000104012", :bin{ 311a32 }, :intvector{ 0, 1, 3 }, :int{0}, :int{0}, "", "", "\U00010401" } 1780 { "ibm-920", "1\U000104012", :bin{ 311a32 }, :intvector{ 0, 1, 3 }, :int{0}, :int{0}, "", "", "\U00010401" } 1781 1782 // simple sample, no error handling 1783 { "UTF-8", "a\U0010FFFF", :bin{ 61F48FBFBF }, :intvector{ 0, 1, 1, 1, 1 }, :int{1}, :int{0}, "", "", "" } 1784 1785 // Verify that incomplete surrogates are handled as an error 1786 { "UTF-8", "a\udc00", :bin{ 61efbfbd }, :intvector{ 0, 1, 1, 1 }, :int{1}, :int{0}, "", "", "" } 1787 { "UTF-8", "a\ud800", :bin{ 61efbfbd }, :intvector{ 0, 1, 1, 1 }, :int{1}, :int{0}, "", "", "" } 1788 { "UTF-8", "a\udc00b", :bin{ 61efbfbd62 }, :intvector{ 0, 1, 1, 1, 2 }, :int{1}, :int{0}, "", "", "" } 1789 { "UTF-8", "a\ud800b", :bin{ 61efbfbd62 }, :intvector{ 0, 1, 1, 1, 2 }, :int{1}, :int{0}, "", "", "" } 1790 1791 // Code coverage for the EUC variants. 
1792 { "IBM-eucJP", "\u0061\u4edd\u5bec\ud801\udc01\ud801\u0061\u00a2", :bin{ 61a1b88ff4ae618ee0 }, :intvector{ 0, 1, 1, 2, 2, 2, 6, 7, 7 }, :int{1}, :int{0}, "", "0", "" } 1793 { "IBM-eucJP", "\u0061\u4edd\u5bec\ud801\udc01\ud801\u0061\u00a2", :bin{ 61a1b88ff4aef4fef4fe618ee0 }, :intvector{ 0, 1, 1, 2, 2, 2, 3, 3, 5, 5, 6, 7, 7 }, :int{1}, :int{0}, "", "", "" } 1794 { "EUC-TW", "\u0061\u2295\u5BF2\ud801\udc01\ud801\u0061\u8706\u008a", :bin{ 61a2d38ea2dce561e6ca8a }, :intvector{ 0, 1, 1, 2, 2, 2, 2, 6, 7, 7, 8 }, :int{1}, :int{0}, "", "0", "" } 1795 { "EUC-TW", "\u0061\u2295\u5BF2\ud801\udc01\ud801\u0061\u8706\u008a", :bin{ 61a2d38ea2dce5fdfefdfe61e6ca8a }, :intvector{ 0, 1, 1, 2, 2, 2, 2, 3, 3, 5, 5, 6, 7, 7, 8 }, :int{1}, :int{0}, "", "", "" } 1796 1797 // Code Coverage for BOCU-1 1798 { 1799 "BOCU-1", 1800 "\u0041\ud841\ud888\udc81", 1801 :bin{ 91fbc555fd6349 }, 1802 :intvector{ 0, 1, 1, 1, 2, 2, 2 }, 1803 :int{1}, :int{0}, "", ".", "" 1804 } 1805 { 1806 "BOCU-1", 1807 "\ufe88\ufe70\udbff\udfff\u0061", 1808 :bin{ fbeda44ff0fe189bb821f05926 }, 1809 :intvector{}, 1810 :int{1}, :int{0}, "", ".", "" 1811 } 1812 // Improve code coverage for SCSU 1813 { 1814 "SCSU", 1815 "\ud899\udc7f\ud977\ud888\udc99\ud888\u0041", 1816 :bin{ 0fd899dc7fd888dc99e041 }, 1817 :intvector{}, 1818 :int{1}, :int{0}, "", "0", "" 1819 } 1820 { 1821 "gb18030", 1822 "\U00020087\ue790\ue78f\u1e3f", 1823 :bin{ 95329031a6dca6dba8bc }, 1824 :intvector{ 0,0,0,0,2,2,3,3,4,4 }, 1825 :int{1}, :int{0}, "", "0", "" 1826 } 1827 { 1828 "UTF-7", 1829 "\u00a3I\u00a3\u00a4", 1830 :bin{ 2b414b4d2d492b414b4d4170412d }, 1831 :intvector{ 0,0,0,0,0,1,2,2,2,3,3,3,3,3 }, 1832 :int{1}, :int{0}, "", "0", "" 1833 } 1834 // Bug #9601 direct-from-UTF-8 m:n Unicode:charset conversion. 
{
                    "*test1bmp",
                    "uv",
                    :bin{ 08 },
                    :intvector{ 0 },
                    :int{1}, :int{0}, "", "?", ""
                }
                {
                    "*test2",
                    "\U00101234\U00050005",
                    :bin{ 0700010e05 },
                    :intvector{ 0,0,0,0,0 },
                    :int{1}, :int{0}, "", "?", ""
                }
            }
        }

        // ---------------------------------------------------------------------

        // Data for the ucnv_getUnicodeSet() tests.
        // Each case names a converter plus two UnicodeSet patterns: one that must be
        // contained in the set reported for the converter, and one that must be
        // disjoint from it.
        getUnicodeSet {
            // charset - will be opened, and ucnv_getUnicodeSet() called on it
            // map     - set of code points and strings that must be in the returned set
            // mapnot  - set of code points and strings that must *not* be in the returned set
            // which   - numeric UConverterUnicodeSet value; the cases below use
            //           0 for the roundtrip set and 1 for the set that also includes fallbacks
            Headers { "charset", "map", "mapnot", "which" }
            Cases {
                // Test ticket 9602: Add "good one-way" mapping type (|4).
                // Excluded from roundtrip set, included in the set with fallbacks.
                {
                    "+*test3",
                    "[{#\uFE0F}]",
                    "[#{#\uFE0E}]",
                    :int{0}
                }
                {
                    "+*test3",
                    "[#{#\uFE0E}{#\uFE0F}]",
                    "[]",
                    :int{1}
                }
                // Unicode charsets that do not map surrogate code points
                {
                    "UTF-8",
                    "[\x00-\ud7ff\ue000-\U0010ffff]",
                    "[\ud800-\udfff]",
                    :int{0}
                }
                {
                    "UTF-16",
                    "[\x00-\ud7ff\ue000-\U0010ffff]",
                    "[\ud800-\udfff]",
                    :int{0}
                }
                {
                    "UTF-16BE",
                    "[\x00-\ud7ff\ue000-\U0010ffff]",
                    "[\ud800-\udfff]",
                    :int{0}
                }
                {
                    "UTF-16LE",
                    "[\x00-\ud7ff\ue000-\U0010ffff]",
                    "[\ud800-\udfff]",
                    :int{0}
                }
                {
                    "UTF-32",
                    "[\x00-\ud7ff\ue000-\U0010ffff]",
                    "[\ud800-\udfff]",
                    :int{0}
                }
                {
                    "UTF-32BE",
                    "[\x00-\ud7ff\ue000-\U0010ffff]",
                    "[\ud800-\udfff]",
                    :int{0}
                }
                {
                    "UTF-32LE",
                    "[\x00-\ud7ff\ue000-\U0010ffff]",
                    "[\ud800-\udfff]",
                    :int{0}
                }

                // Unicode charsets that do map surrogate code points
                {
                    "UTF-7",
                    "[\x00-\U0010ffff]",
                    "[]",
                    :int{0}
                }
                {
                    "SCSU",
                    "[\x00-\U0010ffff]",
                    "[]",
                    :int{0}
                }
                {
                    "BOCU-1",
                    "[\x00-\U0010ffff]",
                    "[]",
                    :int{0}
                }
                {
                    "CESU-8",
                    "[\x00-\U0010ffff]",
                    "[]",
                    :int{0}
                }

                // versions of ISO-2022-KR
                {
                    "ISO-2022-KR",
                    "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa1\xa4\xfe\u0111\u4e00\u4e01\uac00-\uac02\uffe6]",
                    "[\x0e\x0f\x1b\x80-\xa0\xa3\xa5\xff-\u0110\uac03\uffe7-\U0010ffff]",
                    :int{0}
                }
                {
                    "ibm-25546",
                    "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa1\xa4\xfe\u0111\u4e00\u4e01\uac00-\uac01\uffe6]",
                    "[\x0e\x0f\x1b\x80-\xa0\xa3\xa5\xff-\u0110\uac02\uffe7-\U0010ffff]",
                    :int{0}
                }

                // versions of ISO-2022-JP
                {
                    "ISO-2022-JP",
                    "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2015\u203e\u4e00\u4e01\uffe5]",
                    "[\x0e\x0f\x1b\u0100-\u0113\u0385-\u038a\u2014\u301c\u4e02\u4e27-\u4e29\u4fe0\u663b\u9eb5\ufa0e-\ufa2d\uff61-\uff9f\uffe4\uffe6-\U0010ffff]",
                    :int{0}
                }
                {
                    "ISO-2022-JP-2",
                    "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uffe6]",
                    "[\x0e\x0f\x1b\uff61-\uff9f\uffe4\uffe7-\U0010ffff]",
                    :int{0}
                }
                {
                    "JIS7",
                    "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa0-\u0113\u0384-\u0386\u0388-\u038a\u0390-\u03a1\u203e\u4e00-\u4e05\u4fe0\u663b\uff61-\uff9f\uffe6]",
                    "[\x0e\x0f\x1b\uffe4\uffe7-\U0010ffff]",
                    :int{0}
                }
                // with fallbacks
                {
                    "ISO-2022-JP",
                    "[\x00-\x0d\x10-\x1a\x1c-\x7f\xa5\u0391-\u03a1\u2014\u2015\u203e\u301c\u4e00\u4e01\u4fe0\u9eb5\uff61-\uff9f\uffe5]",
                    "[\x0e\x0f\x1b\xa6\u0100-\u0113\u0385-\u038a\u4e02\u4e27-\u4e29\u663b\ufa0e-\ufa2d\uffe4\uffe6-\U0010ffff]",
                    :int{1}
                }

                // versions of ISO-2022-CN
                {
                    "ISO-2022-CN",
                    "[\x00-\x0d\x10-\x1a\x1c-\x7f\u4e00\u4e01\u9f98\ufe6b]",
                    "[\x0e\x0f\x1b\u4e29\uffe6-\U0010ffff]",
                    :int{0}
                }
                /*
                 * ICU 4.4 (ticket #7314) removes mappings for CNS 11643 planes 3..7
                {
                    "ISO-2022-CN-EXT",
                    "[\x00-\x0d\x10-\x1a\x1c-\x7f\u4e00-\u4e05\u9f98\ufe6b\u4e28-\u4e2b\U00020000\U00020003-\U00020005\U00029664]",
                    "[\x0e\x0f\x1b\U00020001\U00020002\U0002a6d7-\U0010ffff]",
                    :int{0}
                }
                */

                // HZ
                {
                    "HZ",
                    "[\u0410-\u044f\u4e00\u4e01\u4e03]",
                    "[\u4e02\u4e04-\u4e06\uac00-\ud7ff]",
                    :int{0}
                }

                // LMBCS
                {
                    "LMBCS",
                    "[\x00-\U0010ffff]",
                    "[]",
                    :int{0}
                }

                // ISCII,version=0 (note: all versions of ISCII generate the same UnicodeSet)
                {
                    "iscii-dev",
                    "[\x00-\xa0\u0909-\u0939\u0993-\u09a8\u0a13-\u0a28\u0a93-\u0aa8\u0ae6-\u0aef\u0b05-\u0b0c\u0b13-\u0b28\u0bae-\u0bb5\u0c12-\u0c28\u0c92-\u0ca8\u0d12-\u0d28]",
                    "[\u0971-\u0975\u09e4\u0a4e-\u0a58\u0a80\u0b72-\u0b7a\u0bfb-\u0bff\u0c70-\u0c7f\u0c80\u0d00]",
                    :int{0}
                }

                {
                    "iso-8859-1",
                    "[\x00-\xff]",
                    "[\u0100-\u01ff]",
                    :int{0}
                }

                {
                    "us-ascii",
                    "[\x00-\x7f]",
                    "[\u0100-\u01ff]",
                    :int{0}
                }
                // DBCS-only
                {
                    "ibm-971",
                    "[\xa1\xa4\uac01\ub000]",
                    "[\x00-\x9f\u2015]",
                    :int{0}
                }

                // The mapnot range must end with the escape \x9f (not the text 0x9f),
                // so that all of U+0000..U+009F is checked, as in the ibm-971 case.
                {
                    "ibm-16684",
                    "[\xa0\xa1\xa4\xa6-\xab\xad-\u017f\u0254\u309b-\u30ff\u4e00-\u4e05\U00023d00\U000243bc\U0002a6b2"
                    "{\u0254\u0300}{\u0254\u0301}{\u304b\u309a}{\u30ad\u309a}{\u30af\u309a}]",
                    "[\x00-\x9f\xa2\xa3\xa5\xac\u0200-\u024f\U00010000-\U0001ffff\U0002a61b-\U0002a6b1]",
                    :int{0}
                }

                // extensions
                {
                    "ibm-1390",
                    "[\x00-\x0d\x10-\u017f\u0254\u309b-\u30ff\u4e00-\u4e05\U00023d00\U000243bc\U0002a6b2"
                    "{\u0254\u0300}{\u0254\u0301}{\u304b\u309a}{\u30ad\u309a}{\u30af\u309a}]",
                    "[\x0e\x0f\u0200-\u024f\U00010000-\U0001ffff\U0002a61b-\U0002a6b1]",
                    :int{0}
                }

                {
                    "*test3",
                    "[\x05\x0b\xc0\u20ac\U00023456\U00101234"
                    "{\U00101234\U00050005\U00060006}{\U00101234\U00050005}{\U00101234\U00060006}{\xc4\xc4\U00101234\x05}]",
                    "[\x06\x0e\U00034567\U000febcd{\U00101234\U00070007}]",
                    :int{0}
                }
            }
        }
    }
}