1# Copyright (c) 2001-2011 International Business Machines 2# Corporation and others. All Rights Reserved. 3# 4# RBBI Test Data 5# 6# File: rbbitst.txt 7# 8# The format of this file looks vaguely like some kind of xml-ish markup, 9# but it is NOT. The syntax is this.. 10# 11# <word> any following data is for word break testing 12# <sent> any following data is for sentence break testing 13# <line> any following data is for line break testing 14# <char> any following data is for char break testing 15# <locale local_name> Switch to the named locale at the next occurence of <word>, <sent>, etc. 16# <data> ... </data> test data. May span multiple lines. 17# <> Break position, status == 0 18# • Break position, status == 0 (Bullet, \u2022) 19# <nnn> Break position, status == nnn 20# \ Escape. Normal ICU unescape applied. 21# \ at end of line -> Line Continuation. Remove both the backslash and the new line 22# 23# 24 25 26# Temp debugging tests 27<line> 28<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data> 29 30######################################################################################## 31# 32# 33# G r a p h e m e C l u s t e r T e s t s 34# 35# 36########################################################################################## 37<char> 38 39<data>•a•b•c• •,•\u0666•</data> # Quick Test 40<data>•\r•\r•\r\n•\r\n•\n•\r•</data> # don't break CR/LF 41 42# Always break after controls. Combining chars don't combine with them. 43<data>•\u0003•\N{COMBINING GRAVE ACCENT}•\r•\N{COMBINING GRAVE ACCENT}•</data> 44<data>•\u0085•\N{COMBINING MACRON}•A\N{COMBINING MACRON}•</data> 45 46# Surrogates 47<data>•\U00011000•\U00010020•\U00010000\N{COMBINING MACRON}•</data> 48<data>•\ud800\udc00•\udbff\udfff•a•</data> 49 50# Extend (Combining chars) combine. 
51<data>•A\N{COMBINING GRAVE ACCENT}•B•</data> 52<data>•\N{GREEK SMALL LETTER MU}\N{COMBINING LOW LINE}\N{COMBINING HORN}•</data> 53<data>•a\u0301•b\u0302•c\u0303•d\u0304•e\u0305•f\u0306•g\u0307•h\u0308•i\u0309•</data> 54 55<data>•a\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304\u0301\u0302\u0303\u0304•</data> 56 57# Don't break Hangul Syllables 58# L : \u1100 59# V : \u1161 60# T : \u11A8 61# LV : \uAC00 62# LVT : \uAC01 63 64<data>•\u1100\u1161\u11a8•\u1100\u1161\u11a8•</data> #LVT 65<data>•\u1100\u1161•\u1100\u1161•</data> 66<data>•\u1100\u1161\u11a8•\u1161•\u1100•\u11a8•\u1161\u1161\u1161\u11a8•</data> 67<data>•\u1100\u1100\uac01•\u1100\uac01•\u1100\uac01\u0301•\uac01•</data> 68<data>•\u1100\u0301•\u1161\u11a8\u0301•\u11a8•</data> 69 70 71 72# Hindi combining chars. (An old test) 73# TODO: Update these tests for Unicode 5.1 Extended Grapheme clusters 74#<data>•भ••ा•\u0930•\u0924• •\u0938\u0941\u0902•\u0926•\u0930• 75#•\u0939•\u094c•\u0964•</data> 76#<data>•\u0916\u0947•\u0938\u0941\u0902•\u0926•\u0930•\u0939•\u094c•\u0964•</data> 77 78 79# Bug 1587. Tamil. 
\u0baa\u0bc1 is an Extended Grapheme Cluster 80<data>•\u0baa\u0bc1•\u0baa\u0bc1•</data> 81 82# Regression test for bug 1889 83<data>•\u0f40\u0f7d•\u0000•\u0f7e•</data> 84 85 86# 0xffff is a legal character, and should not stop the break iterator early. 87# (Requires special casing in implementation, which is why it gets a test.) 88<data>•\uffff•\uffff• •a•</data> 89 90# Treat Japanese Half Width voicing marks as combining 91<data>•A\uff9e•B\uff9f\uff9e\uff9f•C•</data> 92 93######################################################################################## 94# 95# 96# E x t e n d e d G r a p h e m e C l u s t e r T e s t s 97# 98# 99########################################################################################## 100#<xgc> 101 102# Plain Vanilla grapheme clusters 103#<data>•a•b•c•</data> 104#<data>•a\u0301\u0302• •b\u0303\u0304•</data> 105 106# Assorted Hindi combining marks 107#<data>•\u0904\u0903• •\u0937\u093E• •\u0904\u093F• •\u0937\u0940• •\u0937\u0949• •\u0937\u094A• •\u0937\u094B• •\u0937\u094C•</data> 108 109# Thai Clusters 110# $Prepend $Extend* $PrependBase $Extend*; 111# 112#<data>•\u0e40\u0e01•\u0e44\u0301\u0e23\u0302\u0303•\u0e40•\u0e40\u0e02•\u0e02• •</data> 113 114 115######################################################################################## 116# 117# 118# W o r d B o u n d a r y T e s t s 119# 120# 121########################################################################################## 122 123<word> 124# 125# Quick sanity test 126# 127<data>•hello<200> •there<200> •goodbye<200></data> 128<data>•hello<200> •12345<100> •,•</data> 129 130 131# 132# Test data originally in RBBIAPITest::TestFirstNextFollowing() and TestLastPreviousPreceding() 133# 134 135<word> 136<data>•This<200> •is<200> •a<200> •word<200> •break<200>.• • •Isn't<200> •it<200>?• •2.25<100></data> 137 138 139 140# 141# Data originally from TestDefaultRuleBasedWordIteration() 142# 143<data>•Write<200> •wordrules<200>.• •123.456<100> 
•alpha\u00adbeta\u00adgamma<200> •\u092f\u0939<200> •</data> 144<data>• •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•</data> 145 146#Hindi Numbers 147<data>• •\u0968\u0966.\u0969\u096f<100> •\u0967\u0966\u0966.\u0966\u0966<100> •\N{RUPEE SIGN}•\u0967,\u0967\u0966\u0966.\u0966\u0966<100> • •\u0905\u092e\u091c<200>\n•</data> 148 149<data>•\u0938\u094d\u200d\u0935\u0924\u0902deadTA\u0930<200>\r•It's<200> •$•30.10<100> •12,34<100>¢•£•¤•¥•alpha\u05f3beta\u05f4gamma<200> •</data> 150 151<data>•Badges<200>?• •BADGES<200>!•?•!• •We<200> •don't<200> •need<200> •no<200> •STINKING<200> •BADGES<200>!•!•1000,233,456.000<100> •1,23.322<100>%•123.1222<100>$•123,000.20<100> •179.01<100>%•X<200> •Now<200>\r•is<200>\n•the<200>\r\n•time<200> •</data> 152 153#Hangul 154<data>•\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •Hello<200>,• •how<200> •are<200> •you<200> •</data> 155 156 157# Words containing non-BMP letters 158<data>•abc\U00010300<200> •abc\N{DESERET SMALL LETTER ENG}<200> •abc\N{MATHEMATICAL BOLD SMALL Z}<200> •abc\N{MATHEMATICAL SANS-SERIF BOLD ITALIC PI SYMBOL}<200> •</data> 159 160# Unassigned code points 161<data>•abc<200>\U0001D800•def<200>\U0001D3FF• •</data> 162 163# Hiragana & Katakana stay together, but separates from each other and Latin. 
164<data>•abc<200>\N{HIRAGANA LETTER SMALL A}<300>\N{HIRAGANA LETTER VU}\N{COMBINING ACUTE ACCENT}<300>\N{HIRAGANA ITERATION MARK}<300>\N{KATAKANA LETTER SMALL A}\N{KATAKANA ITERATION MARK}\N{HALFWIDTH KATAKANA LETTER WO}\N{HALFWIDTH KATAKANA LETTER N}<300>def<200>#•</data> 165 166# Words with interior formatting characters 167<data>•def\N{COMBINING ACUTE ACCENT}\N{SYRIAC ABBREVIATION MARK}ghi<200> •</data> 168 169# to test for bug #4097779 170<data>•aa\N{COMBINING GRAVE ACCENT}a<200> •</data> 171 172 173# to test for bug #4098467 174# What follows is a string of Korean characters (I found it in the Yellow Pages 175# ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed 176# it correctly), first as precomposed syllables, and then as conjoining jamo. 177# Both sequences should be semantically identical and break the same way. 178# precomposed syllables... 179<data>•\uc0c1\ud56d<200> •\ud55c\uc778<200> •\uc5f0\ud569<200> •\uc7a5\ub85c\uad50\ud68c<200> •\u1109\u1161\u11bc\u1112\u1161\u11bc<200> •\u1112\u1161\u11ab\u110b\u1175\u11ab<200> •\u110b\u1167\u11ab\u1112\u1161\u11b8<200> •\u110c\u1161\u11bc\u1105\u1169\u1100\u116d\u1112\u116c<200> •</data> 180 181<data>•abc<200>\u4e01<400>\u4e02<400>\u3005<200>\u4e03<400>\u4e03<400>abc<200> •</data> 182 183<data>•\u06c9\uc799\ufffa<200></data> 184 185# 186# Try some words from other scripts. 187# 188 189# Try some words from other scripts. 190# Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin 191# 192<data>•ΑΒΓ<200> •БВГ<200> •אבג֓<200> •ابت<200> •١٢٣<100> •\u10A0\u10A1\u10A2<200> •ABC<200> •</data> 193 194<data>•\u0301•A<200></data> 195 196 197# 198# Hindi word break tests, imported from the old RBBI tests. 199# An historical note: a much earlier version of ICU break iterators had a number 200# of special case rules for Hindi, which were tested by an earlier version of 201# this test data. 
The current RBBI rules do not special case Hindi in 202# any way, making this test data much less significant. 203# 204<data>•\u0917\u092a\u00ad\u0936\u092a<200>!•\u092f\u0939<200> •\u0939\u093f\u0928\u094d\u200d\u0926\u0940<200> •\u0939\u0948<200> •\u0905\u093e\u092a<200> •\u0938\u093f\u0916\u094b\u0917\u0947<200>?•\n•:•\u092a\u094d\u0930\u093e\u092f\u0903<200> 205•\u0935\u0930\u094d\u0937\u093e<200>\r\n•\u092a\u094d\u0930\u0915\u093e\u0936<200>,•\u0924\u0941\u092e\u093e\u0930\u094b<200> •\u092e\u093f\u0924\u094d\u0930<200> •\u0915\u093e<200> •\u092a\u0924\u094d\u0930<200> •\u092a\u095d\u094b<200> •\u0938\u094d\u0924\u094d\u0930\u093f<200>.• •\u0968\u0966.\u0969\u096f<100> •\u0967\u0966\u0966.\u0966\u0966<100>\u20a8•\u0967,\u0967\u0966\u0966.\u0966\u0966<100> •\u0905\u092e\u091c<200>\n•\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u0930<200>\r•</data> 206 207# 208# Failures from monkey tests 209# 210<data>•\u8527<400>\u02ba<200>\u0027\u0d42•\u00b7•\u09ea<100></data> 211 212# 213# Jitterbug 5276 - treat Japanese half width voicing marks as Grapheme Extend 214# 215<data>•A\uff9e\uff9fBC<200> •1\uff9e\uff9f23<100></data> 216 217######################################################################################## 218# 219# 220# S e n t e n c e B o u n d a r y T e s t s 221# 222# 223########################################################################################## 224 225 226# 227# Test data originally from RBBI RBBITest::TestDefaultRuleBasedSentenceIteration() 228# 229<sent> 230 231 232<sent> 233<data>•This\n<100></data> 234<data>•Hello! •how are you? •I'am fine. •Thankyou. •How are you \ 235doing? •This\n<100> costs $20,00,000. •</data> 236 237 238# Sentence ending in a quote. 239<data>•"Sentence ending with a quote." •Bye.•</data> 240 241# Sentence, and test data, ending without a period or other terminator. 242<data>•Here is a random sentence, no ending period<100></data> 243 244 245<data>• (This is it). •Testing the sentence iterator. 
•\ 246"This isn't it." •Hi! \ 247•This is a simple sample sentence. •(This is it.) •This is a simple sample sentence. •\ 248"This isn't it." •\ 249Hi! •This is a simple sample sentence. •It does not have to make any sense as you can see. •Nel mezzo del cammin di nostra vita, mi ritrovai in una selva oscura. •Che la dritta via aveo smarrita. •He said, that I said, that you said!! •Don't rock the boat.\u2029•Because I am the daddy, that is why. 250•Not on my time (el timo.)! •</data> 251 252<data>•Hello. •So what!!\u2029•"But now," he said, \ 253"I know!" •\ 254Harris thumbed down several, including "Away We Go" (which became the huge success Oklahoma!). •One species, B. anthracis, is highly virulent. 255•Wolf said about Sounder:\ 256"Beautifully thought-out and directed." •\ 257Have you ever said, "This is where\tI shall live"? •He answered, \ 258"You may not!" •Another popular saying is: "How do you do?". \n•\ 259Yet another popular saying is: \ 260'I'm fine thanks.' •\ 261What is the proper use of the abbreviation pp.? •Yes, I am definatelly 12" tall!!\ 262•Now\r<100>is\n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100></data> 263 264<data>•No breaks when . is surrounded by UPPER.Case letters. •</data> 265<data>•No breaks when . is followed by Numeric .4 a.4 C.4 3.1 .•</data> 266<data>•No breaks when . is followed by a lower, with possible intervening punct .,a .$a .)a. 
•</data> 267 268# 269# Sentence Breaks: no break at the boundary between CJK and other letters 270# 271<data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165:"JAVA\u821c\u8165\u7fc8\u51ce\u306d,\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46".\u2029•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u3002•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e4\u6470\u8790JAVA\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u2048•He said, "I can go there."\u2029•Bye, now.•</data> 272 273# 274# Treat fullwidth variants of .!? the same as their 275# normal counterparts 276# 277<data>•I know I'm right\uff0e •Right\uff1f •Right\uff01 •</data> 278 279 280# 281# Don't break sentences at boundary between CJK and digits 282# 283<data>•\u5487\u67ff\ue591\u5017\u61b3\u60a1\u9510\u8165\u9de8\u97e48888\u821c\u8165\u7fc8\u51ce\u306d\ue30b\u2494\u56d8\u4ec0\u60b1\u8560\u51ba\u611d\u57b6\u2510\u5d46\u97e5\u7751\u3002•Bye, now<100></data> 284 285# 286# Breaks around '(' following a sentence TERM. (Rule 9) 287# 288<data>•How do you do?(•Fine). •</data> 289<data>•How do you do? •(Fine). •</data> 290<data>•How do you do?(•fine). •</data> 291<data>•How do you do? •(fine). •</data> 292 293# 294<data>•Hello.123<100></data> # Rule 6 295<data>•Hello?•123<100></data> 296 297<data>•HELLO.Bye<100></data> # Rule 7 298<data>•HELLO?•Bye<100></data> 299 300<data>•Hello.goodbye<100></data> #Rule 8 301<data>•Hello. •Goodbye<100></data> 302<data>•Hello. goodbye<100></data> 303 304 305 306# 307# test for bug #4158381: No breaks when there are no terminators around 308# 309<data>•\<P>Provides a set of "lightweight" (all-java\<FONT SIZE="-2">\<SUP>TM\</SUP>\</FONT> language) components that, to the maximum degree possible, work the same on all platforms. 
•</data> 310<data>•Another test.\u2029•</data> 311 312# test for bug #4143071: Make sure sentences that end with digits 313# work right 314# 315<data>•Today is the 27th of May, 1998. •</data> 316<data>•Tomorrow with be 28 May 1998. •</data> 317<data>•The day after will be the 30th.\u2029•</data> 318 319# test for bug #4152416: Make sure sentences ending with a capital 320# letter are treated correctly 321# 322<data>•The type of all primitive \<code>boolean\</code> values accessed in the target VM. •Calls to xxx will return an implementor of this interface. \u2029•</data> 323 324# test for bug #4152117: Make sure sentence breaking is handling 325# punctuation correctly [COULD NOT REPRODUCE THIS BUG, BUT TEST IS 326# HERE TO MAKE SURE IT DOESN'T CROP UP] 327# 328<data>•Constructs a randomly generated BigInteger, uniformly distributed over the range \<tt>0\</tt> to \<tt>(2\<sup>numBits\</sup> - 1\)\</tt>, inclusive. •The uniformity of the distribution assumes that a fair source of random bits is provided in \<tt>rnd\</tt>. •Note that this constructor always constructs a non-negative biginteger. \n•Ahh abc. 329•</data> 330 331# sentence breaks for hindi which used Devanagari script 332# make sure there is sentence break after ?,danda(hindi phrase separator), 333# fullstop followed by space. (VERY old test) 334# 335<data>•\u0928\u092e\u0938\u094d\u200d\u0924\u0947 \u0930\u092e\u0947\u0936\u0905\u093e\u092a\u0915\u0948\u0938\u0947 \u0939\u0948?•\u092e\u0948 \u0905\u091a\u094d\u200d \u091b\u093e \u0939\u0942\u0901\u0964 •\u0905\u093e\u092a\r\n<100>\ 336\u0915\u0948\u0938\u0947 \u0939\u0948?•\u0935\u0939 \u0915\u094d\u200d\u092f\u093e\n\ 337<100>\u0939\u0948?•\u092f\u0939 \u0905\u093e\u092e \u0939\u0948. •\u092f\u0939 means "this". •"\u092a\u095d\u093e\u0908" meaning "education" or "studies". •\u0905\u093e\u091c(\u0938\u094d\u200d\u0935\u0924\u0902\u0924\u094d\u0930 \u0926\u093f\u0935\u093e\u0938) \u0939\u0948\u0964 •Let's end here. 
•</data> 338 339# Regression test for bug #1984, Sentence break in Arabic text. 340 341<data>\ 342•\u0623\u0633\u0627\u0633\u064b\u0627\u060c\u0020\u062a\u062a\u0639\u0627"\u0645\u0644\u0020\u0627\u0644\u062d\u0648\u0627\u0633\u064a\u0628\u0020"\u0641\u0642\u0637\u0020\u0645\u0639\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u060c\u0648\u062a\u0642\u0648\u0645\u0020\u0628\u062a\u062e\u0632\u064a\u0646\u0020\u0627\u0644\u0623\u062d\u0631\u0641\u0020\u0648\u0627\u0644\u0645\u062d\u0627\u0631\u0641\u0020\u0627\u0644\u0623\u062e\u0631\u0649\u0020\u0628\u0639\u062f\u0020\u0623\u0646\u062a\u064f\u0639\u0637\u064a\u0020\u0631\u0642\u0645\u0627\u0020\u0645\u0639\u064a\u0646\u0627\u0020\u0644\u0643\u0644\u0020\u0648\u0627\u062d\u062f\u0020\u0645\u0646\u0647\u0627\u002e\u0020•\u0648\u0642\u0628\u0644\u0020\u0627\u062e\u062a\u0631\u0627\u0639\u0022\u064a\u0648\u0646\u0650\u0643\u0648\u062f\u0022\u060c\u0020\u0643\u0627\u0646\u0020\u0647\u0646\u0627\u0643\u0020\u0645\u0626\u0627\u062a\u0020\u0627\u0644\u0623\u0646\u0638\u0645\u0629\u0020\u0644\u0644\u062a\u0634\u0641\u064a\u0631\u0648\u062a\u062e\u0635\u064a\u0635\u0020\u0647\u0630\u0647\u0020\u0627\u0644\u0623\u0631\u0642\u0627\u0645\u0020\u0644\u0644\u0645\u062d\u0627\u0631\u0641\u060c\u0020\u0648\u0644\u0645\u0020\u064a\u0648\u062c\u062f\u0020\u0646\u0638\u0627\u0645\u062a\u0634\u0641\u064a\u0020\u0639\u0644\u0649\u0020\u062c\u0645\u064a\u0639\u0020\u0627\u0644\u0645\u062d\u0627\u0631\u0641\u0020\u0627\u0644\u0636\u0631\u0648\u0631\u064a\u0629. •</data> 343 344# Try a few more of the less common sentence endings. 345<data>•Hello, world\u3002 •Hello, world\u1803 •Hello, world\u2048 •Hello, world\u203c •Let's end here. •</data> 346 347 348 349 350################################################################ 351# 352# 353# L I N E B R E A K 354# 355# 356################################################################ 357 358<line> 359# 360# Test Character for each of the line break classes. 
361# 362# 00A1;AI # INVERTED EXCLAMATION MARK ¡ 363# 0041;AL # LATIN CAPITAL LETTER A 364# 0009;BA # <control> 365# 00B4;BB # ACUTE ACCENT 366# 000C;BK # <control> 367# 2014;B2 # EM DASH 368# FFFC;CB # OBJECT REPLACEMENT CHARACTER 369# 0029;CL # RIGHT PARENTHESIS 370# 0301;CM # COMBINING ACUTE ACCENT 371# 0021;EX # EXCLAMATION MARK 372# 00A0;GL # NO-BREAK SPACE 373# 002D;HY # HYPHEN-MINUS 374# 4E00;ID # <CJK Ideograph, First> 375# 2024;IN # ONE DOT LEADER 376# 002C;IS # COMMA 377# 000A;LF # <control> 378# 0E5A;NS # THAI CHARACTER ANGKHANKHU 379# 0032;NU # DIGIT TWO 380# 0028;OP # LEFT PARENTHESIS 381# 0025;PO # PERCENT SIGN 382# 0024;PR # DOLLAR SIGN 383# 0022;QU # QUOTATION MARK 384# 0E01;SA # THAI CHARACTER KO KAI 385# DB7F;SG # Surrogate 386# 0020;SP # SPACE 387# 002F;SY # SOLIDUS / 388# F8FF;XX # Private Use 389# 200B;ZW # ZERO WIDTH SPACE 390 391 392# 2b Always break at end of text 393 394<data>• •\u00A1•</data> 395<data>• •\u0041•</data> 396<data>• •\u0009•</data> 397<data>• •\u00B4•</data> 398<data>• \u000C<100></data> # LB3C × BK 399<data>• •\u2014•</data> 400<data>• •\uFFFC•</data> 401<data>• \u0029•</data> # LB 8 × CL 402# <data>• • \u0301•</data> # LB 7a Treat SP CM* as if it were ID #TODO: SP CM 403<data>• \u0021•</data> # LB 8 × EX 404#<data>• \u00A0•</data> # LB 11b × GL TODO: fix. 405<data>• •\u002D•</data> 406<data>• •\u4E00•</data> 407<data>• •\u2024•</data> 408<data>• \u002C•</data> # LB 8 × IS 409<data>• \u000A<100></data> # LB3C × ( BK | CR | LF | NL ) 410<data>• •\u0E5A•</data> 411<data>• •\u0032•</data> 412<data>• •\u0028•</data> 413<data>• •\u0025•</data> 414<data>• •\u0024•</data> 415<data>• •\u0022•</data> 416<data>• •\u0E01•</data> 417<data>• •\uDB7F•</data> 418<data>• \u0020•</data> # LB4 - don't break before space. 419<data>• \u002F•</data> # LB 8 × SY 420<data>• •\uF8FF•</data> 421<data>• \u200B•</data> # LB4 - don't break before ZA 422 423 424# 3a Always break after hard line breaks. 425# 3c Never break before hard line breaks. 
426 427<data>• •\u00A1\u2028<100>\u00A1•</data> 428<data>• •\u0041\u2028<100>\u0041•</data> 429<data>• •\u0009\u2028<100>\u0009•</data> 430<data>• •\u00B4\u2028<100>\u00B4•</data> 431<data>• \u000C<100>\u2028<100>\u000C<100></data> 432<data>• •\u2014\u2028<100>\u2014•</data> 433<data>• •\uFFFC\u2028<100>\uFFFC•</data> 434<data>• \u0029\u2028<100>\u0029•</data> 435#<data>• \u0301\u2028<100>\u0301•</data> # TODO: fix. 436<data>• \u0021\u2028<100>\u0021•</data> 437#<data>• \u00A0\u2028<100>\u00A0•</data> # TODO: fix 438<data>• •\u002D\u2028<100>\u002D•</data> 439<data>• •\u4E00\u2028<100>\u4E00•</data> 440<data>• •\u2024\u2028<100>\u2024•</data> 441<data>• \u002C\u2028<100>\u002C•</data> 442<data>• \u000A<100>\u2028<100>\u000A<100></data> 443<data>• •\u0E5A\u2028<100>\u0E5A•</data> 444<data>• •\u0032\u2028<100>\u0032•</data> 445<data>• •\u0028\u2028<100>\u0028•</data> 446<data>• •\u0025\u2028<100>\u0025•</data> 447<data>• •\u0024\u2028<100>\u0024•</data> 448<data>• •\u0022\u2028<100>\u0022•</data> 449<data>• •\u0E01\u2028<100>\u0E01•</data> 450<data>• •\uDB7F\u2028<100>\uDB7F•</data> 451<data>• \u0020\u2028<100>\u0020•</data> 452<data>• \u002F\u2028<100>\u002F•</data> 453<data>• •\uF8FF\u2028<100>\uF8FF•</data> 454<data>• \u200B\u2028<100>\u200B•</data> 455 456 457# 458# Old Line Break Test data. Originally located in RBBITest::TestDefaultRuleBasedLineIteration() 459# 460 461<line> 462 463<data>•Multi-•Level •example •of •a •semi-•idiotic •non-•sensical •(non-•important) •sentence. 464<100>Hi •Hello •How\n<100>are\r<100>you\u2028<100>fine.\t•good. •Now\r<100>is\n<100>the\r\n<100>time\n<100>\r<100>for\r<100>\r<100>all•</data> 465 466<line> 467<data>•Hello! •how\r\n<100> •(are)\r<100> •you? •I'am •fine- •Thankyou. •foo\u00a0bar 468<100>How, •are, •you? 
•This, •costs •$20,00,000.•</data> 469 470# test for bug #4068133 471# 472<data>•\u96f6•\u4e00\u3002•\u4e8c\u3001•\u4e09\u3002\u3001•\u56db\u3001\u3002\u3001•\u4e94,•\u516d.•\u4e03.\u3001,\u3002•\u516b•</data> 473 474# to test for bug #4086052 475<data>•foo\u00a0bar•</data> 476 477# to test for bug #4097920 478<data>•dog,cat,mouse •(one)•(two)\n<100></data> 479 480# to test for bug #4035266 481<data>•The •balance •is •$-23,456.78, •not •-•$32,456.78!\n<100></data> 482 483 484# to test for bug #4098467 485# What follows is a string of Korean characters (I found it in the Yellow Pages 486# ad for the Korean Presbyterian Church of San Francisco, and I hope I transcribed 487# it correctly), first as precomposed syllables, and then as conjoining jamo. 488# Both sequences should be semantically identical and break the same way. 489# precomposed syllables... (I == Rich Gillam?) 490# 491<data>•\uc0c1•\ud56d •\ud55c•\uc778 •\uc5f0•\ud569 •\uc7a5•\ub85c•\uad50•\ud68c•</data> 492 493# conjoining jamo... 494# TODO: rules update needed 495#<data>•\u1109\u1161\u11bc•\u1112\u1161\u11bc •\u1112\u1161\u11ab•\u110b\u1175\u11ab #•\u110b\u1167\u11ab•\u1112\u1161\u11b8 •\u110c\u1161\u11bc•\u1105\u1169•\u1100\u116d•\u1112\u116c•</data> 496 497# to test for bug #4117554: Fullwidth .!? should be treated as postJwrd 498<data>•\u4e01\uff0e•\u4e02\uff01•\u4e03\uff1f•</data> 499 500# Surrogate line break tests. 501# 502<data>•\u4e01•\ud840\udc01•\u4e02•abc •\ue000 •\udb80\udc01•</data> 503 504# Regression for bug 836 505# Note: Unicode 5.1 changed this behavior 506# Unicode 5.2 changed it again, there is no break following the '(' 507<data>•AAA(AAA •</data> 508 509# Try some words from other scripts. 
510# Greek, Cyrillic, Hebrew, Arabic, Arabic, Georgian, Latin 511# 512<data>•ΑΒΓ •БВГ •אבג֓ •ابت •١٢٣ •\u10A0\u10A1\u10A2 •ABC •</data> 513 514# 515# ticket #4853: unpaired surrogates should behave like AL 516# 517<data>•abc\ud801xyz•</data> 518 519# 520# Regression tests for failures that originally came from the monkey test. 521# Monkey test failure lines can, with slight reformatting, be copied into this section 522# as test cases. The error display from here is more informative. 523# 524<data>•\ufffc•\u30e3\u000c<100>\u1b39\u300a\u002f\u203a\u200b•\ufffc•\uaf64•\udcfb•</data> 525<data>•\u114d\u31f3•\ube44\u002d•\u0362\u24e2\u276e\u2014\u205f\ufe16•\uc877•\u0fd0\u000a<100>\u20a3•</data> 526<data>•\u080a\u215b\U0001d7d3\u002c•\u2025\U000e012e•\u02df\u118d\u0029\ua8d6\u0085<100>\u6cc4\u2024\u202f\ufffc•</data> 527 528 529######################################################################################## 530# 531# 532# T i t l e B o u n d a r y T e s t s 533# 534# 535########################################################################################## 536<title> 537<data>•Here •is •a •short •sample •sentence. •And •another.•</data> 538<data>•HERE •IS •A •SHORT •SAMPLE •SENTENCE. 
•AND •ANOTHER.•</data> 539<data>• •Start •and •end •with •spaces •</data> 540<data>•Include 123 456 ^& •some 54332 •numbers 4445•abc123•abc •ending 1223 •</data> 541 542<data>•Combining\u0301 \u0301•ma\u0306rks •bye •</data> 543<data>•123 •Start •with •a •number.•</data> 544 545<data>•'•start •with •a •case-•ignorable •cha'r'a'cter•</data> 546 547 548########################################################################################## 549# 550# Thai Tests 551# 552########################################################################################## 553<locale th> 554<word> 555# 556# Test data originally from the test code source file 557# // @suwit -- Thai sample data from GVT Guideline 558# 559<data>•\u0E2B\u0E19\u0E36\u0E48\u0E07<200>\u0E04\u0E33<200>\u0E44\u0E17\u0E22<200>\ 560\u0E2A\u0E32\u0E21\u0E32\u0E23\u0E16<200>\u0E1B\u0E23\u0E30\u0E01\u0E2D\u0E1A<200>\ 561\u0E14\u0E49\u0E27\u0E22<200>\u0e2b\u0e25\u0e32\u0e22<200>\ 562\u0e1e\u0e22\u0e32\u0e07\u0e04\u0e4c<200></data> 563 564# 565# Jitterbug 3671 Test Case 566# 567<data>•สวัสดี<200>ครับ<200>สบาย<200>ดี<200>ไหม<200> •ครับ<200></data> 568 569# 570# Trac ticket 5595 Test Case 571<data>•บท<200>ที่๑พายุ<200>ไซโคลน<200>โด<200>โรธี<200>อาศัย<200>อยู่<200>ท่ามกลาง<200>\ 572ทุ่งใหญ่<200>ใน<200>แคนซัส<200>กับ<200>ลุง<200>เฮ<200>นรี<200>ชาวไร่<200>และ<200>ป้า<200>เอ็ม<200>\ 573ภรรยา<200>ชาวไร่<200>บ้าน<200>ของ<200>พวก<200>เขา<200>หลัง<200>เล็ก<200>เพราะ<200>ไม้<200>\ 574สร้าง<200>บ้าน<200>ต้อง<200>ขน<200>มา<200>ด้วย<200>เกวียน<200>เป็น<200>ระยะ<200>ทาง<200>หลาย<200>\ 575ไมล์<200></data> 576 577 578# Japanese line break tailoring test 579 580<locale ja> 581<line> 582<data>•\u3041•\u3043•\u3045•\u31f1•</data> 583<locale en> 584<line> 585<data>•\u3041\u3043\u3045\u31f1•</data> 586 587