1<?xml version="1.0" encoding="UTF-8"?> 2 3<!-- Copyright (C) 2016 and later: Unicode, Inc. and others. License & terms of use: http://www.unicode.org/copyright.html --> 4<!-- Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved --> 5 6<!-- Test data file for string search --> 7<!DOCTYPE stringsearch-tests [ 8<!ELEMENT stringsearch-tests (test-case+)> 9<!ATTLIST stringsearch-tests debug IDREF #IMPLIED > 10<!ELEMENT test-case (pattern, pre?, m?, post?)> 11<!ATTLIST test-case 12 id ID #REQUIRED 13 locale CDATA "en" 14 strength (PRIMARY | SECONDARY | TERTIARY | QUATERNARY | IDENTICAL) "TERTIARY" 15 norm (ON | OFF) "OFF" 16 alternate_handling (NON_IGNORABLE | SHIFTED) "NON_IGNORABLE" 17 > 18 19<!ELEMENT pattern (#PCDATA)> 20<!ELEMENT pre (#PCDATA)> 21<!ELEMENT m (#PCDATA)> 22<!ELEMENT post (#PCDATA)> 23]> 24 25<stringsearch-tests> 26 <!-- debug="test11" (for copying into the above element) --> 27 28 <!-- Very simple match --> 29 <test-case id="test01" > 30 <pattern>abc</pattern> 31 <pre>xxx</pre><m>abc</m><post>yyy</post> 32 </test-case> 33 34 <!-- Very simple no-match --> 35 <test-case id="test02" > 36 <pattern>abc</pattern> 37 <pre>xxx</pre><post>yyy</post> 38 </test-case> 39 40 <!-- Match after several near-misses. 
--> 41 <test-case id="test03" > 42 <pattern>string</pattern> 43 <pre>silly spring stling strxng strilg strinx stri</pre><m>string</m><post> fling</post> 44 </test-case> 45 46 <test-case id="test04" strength="PRIMARY" > 47 <pattern>FUSS</pattern> 48 <pre>abc</pre><m>fuss</m><post>sss</post> 49 </test-case> 50 51 <test-case id="test05" strength="PRIMARY" > 52 <pattern>FUSS</pattern> 53 <pre>abc</pre><m>fuß</m><post>sss</post> 54 </test-case> 55 56 <test-case id="test05.5" strength="PRIMARY" > 57 <pattern>fuss</pattern> 58 <pre>a </pre> 59 <m>fuß</m> 60 <post>ball table</post> 61 </test-case> 62 63 <test-case id="test06" strength="PRIMARY" > 64 <pattern>fuß</pattern> 65 <pre>abc</pre><m>fuss</m><post>xyz</post> 66 </test-case> 67 68 <test-case id="test07" strength="SECONDARY" > 69 <pattern>fuß</pattern> 70 <pre>abcfussxyz</pre> 71 </test-case> 72 73 <test-case id="test08" strength="PRIMARY" > 74 <pattern>fus</pattern> 75 <pre>abcfuß</pre><post>xyz</post> 76 </test-case> 77 78 <!-- A good match following an initial match that failed because 79 of not ending on a character boundary --> 80 <test-case id="test09" strength="PRIMARY"> 81 <pattern>fus</pattern> 82 <pre>fuß </pre><m>fus</m><post>sss</post> 83 </test-case> 84 85 86 <!-- Test cases from usrchdat.c BREAKITERATOREXACT --> 87 88 <test-case id="test10" strength="TERTIARY"> 89 <pattern>fox</pattern> 90 <m>fox</m><post>y fox</post> 91 </test-case> 92 93 <test-case id="test11" strength="PRIMARY" locale="de_DE@collation=phonebook"> 94 <pattern>toe</pattern> 95 <pre>This is a </pre><m>Tö</m><post>ne</post> 96 </test-case> 97 98 <test-case id="test11a" strength="SECONDARY" locale="de_DE@collation=phonebook"> 99 <pattern>toe</pattern> 100 <pre>This is a </pre><post>Töne</post> 101 </test-case> 102 103 <test-case id="test12" strength="TERTIARY"> 104 <pattern>e</pattern> 105 <pre>tésting that é doés not match </pre><m>e</m><post></post> 106 </test-case> 107 108 <test-case id="test13" strength="PRIMARY" locale="fr"> 109 
<pattern>e</pattern> 110 <pre></pre><m>É</m><post>É</post> 111 </test-case> 112 113 <test-case id="test14" strength="PRIMARY" locale="fr"> 114 <pattern>O</pattern> 115 <pre>C</pre><m>O\u0302</m><post>TÉ</post> 116 </test-case> 117 118 119 <!-- Test cases from usrchdat.c STRENGTH --> 120 121 122 <test-case id="test15" strength="PRIMARY" locale="en"> 123 <pattern>fox</pattern> 124 <pre>The quick brown </pre><m>fox</m><post> jumps over the lazy foxes</post> 125 </test-case> 126 127 <test-case id="test16" strength="PRIMARY" locale="fr"> 128 <pattern>peche</pattern> 129 <pre>blackbirds pat </pre><m>p\u00E9ch\u00E9</m><post> </post> 130 </test-case> 131 132 <test-case id="test17" strength="PRIMARY" locale="fr"> 133 <pattern>peche</pattern> 134 <pre>blackbirds pat </pre><m>p\u00EAche</m><post> </post> 135 </test-case> 136 137 <test-case id="test18" strength="PRIMARY" locale="fr"> 138 <pattern>peche</pattern> 139 <pre>blackbirds pat </pre><m>p\u00E9che</m><post>r </post> 140 </test-case> 141 142 <test-case id="test19" strength="PRIMARY" locale="fr"> 143 <pattern>peche</pattern> 144 <pre>blackbirds pat </pre><m>p\u00EAche</m><post>r </post> 145 </test-case> 146 147 <test-case id="test20" strength="PRIMARY" locale="es"> 148 <pattern>channel</pattern> 149 <pre>A </pre><m>channel</m><post>, </post> 150 </test-case> 151 152 <test-case id="test21" strength="PRIMARY" locale="es"> 153 <pattern>channel</pattern> 154 <pre>A </pre><m>CHANNEL</m><post>, </post> 155 </test-case> 156 157 <test-case id="test22" strength="PRIMARY" locale="es"> 158 <pattern>channel</pattern> 159 <pre>A </pre><m>Channel</m><post>s, </post> 160 </test-case> 161 162 <test-case id="test23" strength="PRIMARY" locale="es"> 163 <pattern>channel</pattern> 164 <pre>A </pre><m>channel</m><post>... 
</post> 165 </test-case> 166 167 <test-case id="test24" strength="TERTIARY" locale="en"> 168 <pattern>A\u0300</pattern> 169 <pre>A miss, and then </pre><m>\u00c0</m><post> should match but not A"</post> 170 </test-case> 171 172 <!-- TODO: In the original test data, this test matched at IDENTICAL strength. 173 Doesn't seem right. The characters are different. 174 --> 175 <test-case id="test24a" strength="IDENTICAL" locale="en"> 176 <pattern>A\u0300</pattern> 177 <pre>At IDENTICAL, shoud this match? </pre><m>\u00c0</m><post></post> 178 </test-case> 179 180 <test-case id="test24b" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en"> 181 <pattern>A\u0300</pattern> 182 <pre>At IDENTICAL, shoud this match? </pre> 183 <m>\u00c0</m> 184 <post></post> 185 </test-case> 186 187 <test-case id="test25" strength="SECONDARY" locale="en"> 188 <pattern>Ű</pattern> 189 <pre>12</pre><m>ű</m><post> Ű</post> 190 </test-case> 191 192 <test-case id="test26" strength="SECONDARY" locale="en"> 193 <pattern>A</pattern> 194 <pre>12</pre><m>a</m><post>...</post> 195 </test-case> 196 197 198 <!-- Test Cases from usrchdat.c, VARIABLE --> 199 <test-case id="test27" strength="TERTIARY" locale="en"> 200 <pattern>blackbird</pattern> 201 <pre>black-bird </pre><m>blackbird</m><post>...</post> 202 </test-case> 203 204 <test-case id="test28" strength="TERTIARY" locale="en"> 205 <pattern>go</pattern> 206 <pre> on</pre> 207 </test-case> 208 209 <!-- TODO: this gives a U_ILLEGAL_ARGUMENT error when opening 210 the UStringSearch. How did the original test run? 
--> 211 <!-- 212 <test-case id="test29" strength="PRIMARY" locale="en"> 213 <pattern> </pattern> 214 <pre></pre><m></m><post>abc</post> 215 </test-case> 216 --> 217 218 <test-case id="test30" strength="SECONDARY" locale="en"> 219 <pattern>abc</pattern> 220 <pre> a bc ab c a bc ab c"</pre> 221 </test-case> 222 223 <test-case id="test31" strength="SECONDARY" locale="en"> 224 <pattern>abc</pattern> 225 <pre> ---------------</pre> 226 </test-case> 227 228 229 <!-- Normalization test cases from usrchdat.c --> 230 <test-case id="test32" strength="TERTIARY" norm="ON"> 231 <pattern>a\u0325\u0300</pattern> 232 <pre></pre><m>a\u0300\u0325</m> 233 </test-case> 234 235 236 <test-case id="test32a" strength="TERTIARY" norm="OFF"> 237 <pattern>a\u0325\u0300</pattern> 238 <pre>a\u0300\u0325</pre> 239 </test-case> 240 241 242 <!-- COMPOSITEBOUNDARIES from usrchdat.c 243 Boundaries are not identical to original test data because 244 of matching only full combining sequences 245 --> 246 <test-case id="test40" strength="TERTIARY"> 247 <pattern>A</pattern> 248 <pre>À</pre> <!-- \u00C0 --> 249 </test-case> 250 251 <test-case id="test41" strength="TERTIARY"> 252 <pattern>A</pattern> 253 <pre>À</pre><m>A</m><post>C</post> 254 </test-case> 255 256 <test-case id="test42" strength="TERTIARY"> 257 <pattern>A\u030A</pattern> 258 <pre>À\u01FA</pre> 259 </test-case> 260 261 262 263 <!-- SUPPLEMENTARYCANONICAL from usrchdat.c --> 264 <test-case id="test50" strength="TERTIARY"> 265 <pattern>\uD800\uDC00</pattern> 266 <pre>abc \uD802\uDC00 \uD800\uDC01 \uD801\uDC00 </pre><m>\uD800\uDC00</m> 267 <post>abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00</post> 268 </test-case> 269 270 <test-case id="test51" strength="TERTIARY"> 271 <pattern>\\uD834\\uDDB9</pattern> 272 <pre>and</pre><m>\\uD834\\uDDB9</m><post>this sentence</post> 273 </test-case> 274 275 <test-case id="test52" strength="TERTIARY"> 276 <pattern> \\uD834\\uDDB9 </pattern> 277 <pre>and</pre><m> \\uD834\\uDDB9 </m><post>this 
sentence</post> 278 </test-case> 279 280 <test-case id="test53" strength="TERTIARY"> 281 <pattern>-\\uD834\\uDDB9-</pattern> 282 <pre>and</pre><m>-\\uD834\\uDDB9-</m><post>this sentence</post> 283 </test-case> 284 285 <test-case id="test54" strength="TERTIARY"> 286 <pattern>,\\uD834\\uDDB9,</pattern> 287 <pre>and</pre><m>,\\uD834\\uDDB9,</m><post>this sentence</post> 288 </test-case> 289 290 <test-case id="test55" strength="TERTIARY"> 291 <pattern>?\\uD834\\uDDB9?</pattern> 292 <pre>and</pre><m>?\\uD834\\uDDB9?</m><post>this sentence</post> 293 </test-case> 294 295 296 <!-- Long combining sequences --> 297 <!-- Backwards search fails because the pattern ends with ignorables 298 <test-case id="test60" strength="PRIMARY"> 299 <pattern>A\u0301\u0301\u0301\u0301</pattern> 300 <m>A\u0301\u0301\u0301\u0301\u0301</m> 301 </test-case> 302 --> 303 304 <test-case id="test61" strength="TERTIARY"> 305 <pattern>A\u0301\u0301\u0301\u0301</pattern> 306 <pre>A\u0301\u0301\u0301\u0301\u0301</pre> 307 </test-case> 308 309 <test-case id="test62" strength="TERTIARY"> 310 <pattern>A\u0301\u0301\u0301\u0301</pattern> 311 <m>A\u0301\u0301\u0301\u0301</m> 312 </test-case> 313 314 <!-- stand-alone combining marks don't match attached marks --> 315 <test-case id="test63" strength="TERTIARY"> 316 <pattern>\u0301</pattern> 317 <pre>A\u0301\u0301\u0301\u0301</pre> 318 </test-case> 319 320 <test-case id="test64" strength="TERTIARY"> 321 <pattern>\u0301</pattern> 322 <post>\u0301\u0301\u0301\u0301</post> 323 </test-case> 324 325 <!-- stand-alone combining mark does match an un-attached combining mark --> 326 <test-case id="test65" strength="TERTIARY"> 327 <pattern>\u0301</pattern> 328 <m>\u0301</m><post>A\u0301\u0301</post> 329 </test-case> 330 331 <test-case id="test66" strength="TERTIARY"> 332 <pattern>\u0301</pattern> 333 <m>\u0301</m> 334 </test-case> 335 336 <!-- stand-alone combining marks at end of the target text --> 337 <test-case id="test67" strength="TERTIARY"> 338 
<pattern>\u0301</pattern> 339 <pre>abcd\r</pre><m>\u0301</m> 340 </test-case> 341 342 <!-- attached combining marks at end of the target text, no match --> 343 <test-case id="test68" strength="TERTIARY"> 344 <pattern>\u0301</pattern> 345 <pre>abcd\u0301</pre> 346 </test-case> 347 348 349 350 <!-- no match within expansions at the start --> 351 <test-case id="test70" strength="PRIMARY"> 352 <pattern>Eligature</pattern> 353 <pre>Æligature</pre> 354 </test-case> 355 356 <test-case id="test71" strength="PRIMARY"> 357 <pattern>AEligature</pattern> 358 <m>Æligature</m> 359 </test-case> 360 361 <test-case id="test72" strength="PRIMARY"> 362 <pattern>AEligature</pattern> 363 <m>Æligature</m> 364 </test-case> 365 366 <!-- unattached combining Tilde will not match a Tilde that is 367 part of a composed Ñ (\u00D1) --> 368 <test-case id="test73" strength="SECONDARY"> 369 <pattern>\u0303</pattern> <!-- combining tilde --> 370 <pre>Ñ
</pre><m>\u0303</m> 371 </test-case> 372 373 <test-case id="test74" strength="SECONDARY"> 374 <pattern>\u0303</pattern> <!-- combining tilde --> 375 <pre>Ñ 
</pre><m>\u0303</m><post>a</post> 376 </test-case> 377 378 <test-case id="test75" strength="TERTIARY" locale="fr"> 379 <pattern>\u00EA</pattern> 380 <pre>p</pre><m>\u00EA</m><post>che</post> 381 </test-case> 382 383 <test-case id="test76" strength="TERTIARY" locale="fr"> 384 <pattern>\u00EA</pattern> 385 <pre>p</pre><m>e\u0302</m><post>che</post> 386 </test-case> 387 388 <test-case id="test77" strength="TERTIARY" locale="fr"> 389 <pattern>e\u0302</pattern> 390 <pre>p</pre><m>\u00EA</m><post>che</post> 391 </test-case> 392 393 <!-- Test cases from ticket:5382 --> 394 <test-case id="test78" strength="SECONDARY" locale="hu_HU"> 395 <pattern>\u0170</pattern> 396 <m>\u0171</m> 397 <post>12</post> 398 </test-case> 399 400 <test-case id="test79" strength="SECONDARY" locale="hu_HU"> 401 <pattern>\u0170</pattern> 402 <pre>1</pre> 403 <m>\u0171</m> 404 <post>2</post> 405 </test-case> 406 407 <test-case id="test80" strength="SECONDARY" locale="hu_HU"> 408 <pattern>\u0170</pattern> 409 <pre>12</pre> 410 <m>\u0171</m> 411 </test-case> 412 413 <!-- Test cases from ticket:5959 --> 414 <test-case id="test81" strength="SECONDARY"> 415 <pattern>\u2166</pattern> 416 <m>VII</m> 417 </test-case> 418 419 <test-case id="test82" strength="SECONDARY"> 420 <pattern>VII</pattern> 421 <m>\u2166</m> 422 </test-case> 423 424 <test-case id="test83" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en"> 425 <pattern>Universal Declaration of Human Rights</pattern> 426 <pre>Proclaims this </pre><m>Universal Declaration of Human Rights</m><post> as a common standard of achievement for all peoples and all nations</post> 427 </test-case> 428 429 <test-case id="test83b" strength="TERTIARY" alternate_handling="SHIFTED" locale="en"> 430 <pattern>Universal Declaration of Human Rights</pattern> 431 <pre>Proclaims this </pre> 432 <m>Universal-Declaration-of-Human-Rights</m> 433 <post> as a common standard of achievement for all peoples and all nations</post> 434 </test-case> 435 436 <test-case 
id="test84" strength="TERTIARY" locale="en"> 437 <pattern>\u05E9\u0591\u05E9</pattern> 438 <m>\u05E9\u0592\u05E9</m> 439 </test-case> 440 441 <test-case id="test84b" strength="IDENTICAL" locale="en"> 442 <pattern>\u05E9\u0591\u05E9</pattern> 443 <pre>\u05E9\u0592\u05E9</pre> 444 </test-case> 445</stringsearch-tests> 446 447