1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /** 4 ******************************************************************************* 5 * Copyright (C) 1996-2016, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.dev.test.lang; 11 12 import java.util.BitSet; 13 14 import org.junit.Test; 15 import org.junit.runner.RunWith; 16 import org.junit.runners.JUnit4; 17 18 import com.ibm.icu.dev.test.TestFmwk; 19 import com.ibm.icu.lang.UProperty; 20 import com.ibm.icu.lang.UScript; 21 import com.ibm.icu.lang.UScript.ScriptUsage; 22 import com.ibm.icu.text.UnicodeSet; 23 24 @RunWith(JUnit4.class) 25 public class TestUScript extends TestFmwk { 26 27 /** 28 * Constructor 29 */ TestUScript()30 public TestUScript() 31 { 32 } 33 34 @Test TestGetScriptOfCharsWithScriptExtensions()35 public void TestGetScriptOfCharsWithScriptExtensions() { 36 /* test characters which have Script_Extensions */ 37 if(!( 38 UScript.COMMON==UScript.getScript(0x0640) && 39 UScript.INHERITED==UScript.getScript(0x0650) && 40 UScript.ARABIC==UScript.getScript(0xfdf2)) 41 ) { 42 errln("UScript.getScript(character with Script_Extensions) failed"); 43 } 44 } 45 46 @Test TestHasScript()47 public void TestHasScript() { 48 if(!( 49 !UScript.hasScript(0x063f, UScript.COMMON) && 50 UScript.hasScript(0x063f, UScript.ARABIC) && /* main Script value */ 51 !UScript.hasScript(0x063f, UScript.SYRIAC) && 52 !UScript.hasScript(0x063f, UScript.THAANA)) 53 ) { 54 errln("UScript.hasScript(U+063F, ...) is wrong"); 55 } 56 if(!( 57 !UScript.hasScript(0x0640, UScript.COMMON) && /* main Script value */ 58 UScript.hasScript(0x0640, UScript.ARABIC) && 59 UScript.hasScript(0x0640, UScript.SYRIAC) && 60 !UScript.hasScript(0x0640, UScript.THAANA)) 61 ) { 62 errln("UScript.hasScript(U+0640, ...) is wrong"); 63 } 64 if(!( 65 !UScript.hasScript(0x0650, UScript.INHERITED) && /* main Script value */ 66 UScript.hasScript(0x0650, UScript.ARABIC) && 67 UScript.hasScript(0x0650, UScript.SYRIAC) && 68 !UScript.hasScript(0x0650, UScript.THAANA)) 69 ) { 70 errln("UScript.hasScript(U+0650, ...) is wrong"); 71 } 72 if(!( 73 !UScript.hasScript(0x0660, UScript.COMMON) && /* main Script value */ 74 UScript.hasScript(0x0660, UScript.ARABIC) && 75 !UScript.hasScript(0x0660, UScript.SYRIAC) && 76 UScript.hasScript(0x0660, UScript.THAANA)) 77 ) { 78 errln("UScript.hasScript(U+0660, ...) is wrong"); 79 } 80 if(!( 81 !UScript.hasScript(0xfdf2, UScript.COMMON) && 82 UScript.hasScript(0xfdf2, UScript.ARABIC) && /* main Script value */ 83 !UScript.hasScript(0xfdf2, UScript.SYRIAC) && 84 UScript.hasScript(0xfdf2, UScript.THAANA)) 85 ) { 86 errln("UScript.hasScript(U+FDF2, ...) is wrong"); 87 } 88 if(UScript.hasScript(0x0640, 0xaffe)) { 89 // An unguarded implementation might go into an infinite loop. 90 errln("UScript.hasScript(U+0640, bogus 0xaffe) is wrong"); 91 } 92 } 93 94 @Test TestGetScriptExtensions()95 public void TestGetScriptExtensions() { 96 BitSet scripts=new BitSet(UScript.CODE_LIMIT); 97 98 /* invalid code points */ 99 if(UScript.getScriptExtensions(-1, scripts)!=UScript.UNKNOWN || scripts.cardinality()!=1 || 100 !scripts.get(UScript.UNKNOWN)) { 101 errln("UScript.getScriptExtensions(-1) is not {UNKNOWN}"); 102 } 103 if(UScript.getScriptExtensions(0x110000, scripts)!=UScript.UNKNOWN || scripts.cardinality()!=1 || 104 !scripts.get(UScript.UNKNOWN)) { 105 errln("UScript.getScriptExtensions(0x110000) is not {UNKNOWN}"); 106 } 107 108 /* normal usage */ 109 if(UScript.getScriptExtensions(0x063f, scripts)!=UScript.ARABIC || scripts.cardinality()!=1 || 110 !scripts.get(UScript.ARABIC)) { 111 errln("UScript.getScriptExtensions(U+063F) is not {ARABIC}"); 112 } 113 if(UScript.getScriptExtensions(0x0640, scripts)>-3 || scripts.cardinality()<3 || 114 !scripts.get(UScript.ARABIC) || !scripts.get(UScript.SYRIAC) || !scripts.get(UScript.MANDAIC) 115 ) { 116 errln("UScript.getScriptExtensions(U+0640) failed"); 117 } 118 if(UScript.getScriptExtensions(0xfdf2, scripts)!=-2 || scripts.cardinality()!=2 || 119 !scripts.get(UScript.ARABIC) || !scripts.get(UScript.THAANA)) { 120 errln("UScript.getScriptExtensions(U+FDF2) failed"); 121 } 122 if(UScript.getScriptExtensions(0xff65, scripts)!=-6 || scripts.cardinality()!=6 || 123 !scripts.get(UScript.BOPOMOFO) || !scripts.get(UScript.YI)) { 124 errln("UScript.getScriptExtensions(U+FF65) failed"); 125 } 126 } 127 128 @Test TestDefaultScriptExtensions()129 public void TestDefaultScriptExtensions() { 130 // Block 3000..303F CJK Symbols and Punctuation defaults to scx=Bopo Hang Hani Hira Kana Yiii 131 // but some of its characters revert to scx=<script> which is usually Common. 132 BitSet scx = new BitSet(); 133 assertEquals("U+3000 num scx", // IDEOGRAPHIC SPACE 134 UScript.COMMON, 135 UScript.getScriptExtensions(0x3000, scx)); 136 scx.clear(); 137 assertEquals("U+3012 num scx", // POSTAL MARK 138 UScript.COMMON, 139 UScript.getScriptExtensions(0x3012, scx)); 140 } 141 142 @Test TestScriptMetadataAPI()143 public void TestScriptMetadataAPI() { 144 /* API & code coverage. */ 145 String sample = UScript.getSampleString(UScript.LATIN); 146 if(sample.length()!=1 || UScript.getScript(sample.charAt(0))!=UScript.LATIN) { 147 errln("UScript.getSampleString(Latn) failed"); 148 } 149 sample = UScript.getSampleString(UScript.INVALID_CODE); 150 if(sample.length()!=0) { 151 errln("UScript.getSampleString(invalid) failed"); 152 } 153 154 if(UScript.getUsage(UScript.LATIN)!=ScriptUsage.RECOMMENDED || 155 // Unicode 10 gives up on "aspirational". 156 UScript.getUsage(UScript.YI)!=ScriptUsage.LIMITED_USE || 157 UScript.getUsage(UScript.CHEROKEE)!=ScriptUsage.LIMITED_USE || 158 UScript.getUsage(UScript.COPTIC)!=ScriptUsage.EXCLUDED || 159 UScript.getUsage(UScript.CIRTH)!=ScriptUsage.NOT_ENCODED || 160 UScript.getUsage(UScript.INVALID_CODE)!=ScriptUsage.NOT_ENCODED || 161 UScript.getUsage(UScript.CODE_LIMIT)!=ScriptUsage.NOT_ENCODED) { 162 errln("UScript.getUsage() failed"); 163 } 164 165 if(UScript.isRightToLeft(UScript.LATIN) || 166 UScript.isRightToLeft(UScript.CIRTH) || 167 !UScript.isRightToLeft(UScript.ARABIC) || 168 !UScript.isRightToLeft(UScript.HEBREW)) { 169 errln("UScript.isRightToLeft() failed"); 170 } 171 172 if(UScript.breaksBetweenLetters(UScript.LATIN) || 173 UScript.breaksBetweenLetters(UScript.CIRTH) || 174 !UScript.breaksBetweenLetters(UScript.HAN) || 175 !UScript.breaksBetweenLetters(UScript.THAI)) { 176 errln("UScript.breaksBetweenLetters() failed"); 177 } 178 179 if(UScript.isCased(UScript.CIRTH) || 180 UScript.isCased(UScript.HAN) || 181 !UScript.isCased(UScript.LATIN) || 182 !UScript.isCased(UScript.GREEK)) { 183 errln("UScript.isCased() failed"); 184 } 185 } 186 187 /** 188 * Maps a special script code to the most common script of its encoded characters. 189 */ getCharScript(int script)190 private static final int getCharScript(int script) { 191 switch(script) { 192 case UScript.HAN_WITH_BOPOMOFO: 193 case UScript.SIMPLIFIED_HAN: 194 case UScript.TRADITIONAL_HAN: 195 return UScript.HAN; 196 case UScript.JAPANESE: 197 return UScript.HIRAGANA; 198 case UScript.JAMO: 199 case UScript.KOREAN: 200 return UScript.HANGUL; 201 case UScript.SYMBOLS_EMOJI: 202 return UScript.SYMBOLS; 203 default: 204 return script; 205 } 206 } 207 208 @Test TestScriptMetadata()209 public void TestScriptMetadata() { 210 UnicodeSet rtl = new UnicodeSet("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]"); 211 // So far, sample characters are uppercase. 212 // Georgian is special. 213 UnicodeSet cased = new UnicodeSet("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]"); 214 for(int sc = 0; sc < UScript.CODE_LIMIT; ++sc) { 215 String sn = UScript.getShortName(sc); 216 ScriptUsage usage = UScript.getUsage(sc); 217 String sample = UScript.getSampleString(sc); 218 UnicodeSet scriptSet = new UnicodeSet(); 219 scriptSet.applyIntPropertyValue(UProperty.SCRIPT, sc); 220 if(usage == ScriptUsage.NOT_ENCODED) { 221 assertTrue(sn + " not encoded, no sample", sample.isEmpty()); 222 assertFalse(sn + " not encoded, not RTL", UScript.isRightToLeft(sc)); 223 assertFalse(sn + " not encoded, not LB letters", UScript.breaksBetweenLetters(sc)); 224 assertFalse(sn + " not encoded, not cased", UScript.isCased(sc)); 225 assertTrue(sn + " not encoded, no characters", scriptSet.isEmpty()); 226 } else { 227 assertFalse(sn + " encoded, has a sample character", sample.isEmpty()); 228 int firstChar = sample.codePointAt(0); 229 int charScript = getCharScript(sc); 230 assertEquals(sn + " script(sample(script))", 231 charScript, UScript.getScript(firstChar)); 232 assertEquals(sn + " RTL vs. set", rtl.contains(firstChar), UScript.isRightToLeft(sc)); 233 assertEquals(sn + " cased vs. set", cased.contains(firstChar), UScript.isCased(sc)); 234 assertEquals(sn + " encoded, has characters", sc == charScript, !scriptSet.isEmpty()); 235 if(UScript.isRightToLeft(sc)) { 236 rtl.removeAll(scriptSet); 237 } 238 if(UScript.isCased(sc)) { 239 cased.removeAll(scriptSet); 240 } 241 } 242 } 243 assertEquals("no remaining RTL characters", "[]", rtl.toPattern(true)); 244 assertEquals("no remaining cased characters", "[]", cased.toPattern(true)); 245 246 assertTrue("Hani breaks between letters", UScript.breaksBetweenLetters(UScript.HAN)); 247 assertTrue("Thai breaks between letters", UScript.breaksBetweenLetters(UScript.THAI)); 248 assertFalse("Latn does not break between letters", UScript.breaksBetweenLetters(UScript.LATIN)); 249 } 250 251 @Test TestScriptNames()252 public void TestScriptNames(){ 253 for(int i=0; i<UScript.CODE_LIMIT;i++){ 254 String name = UScript.getName(i); 255 if(name.equals("") ){ 256 errln("FAILED: getName for code : "+i); 257 } 258 String shortName= UScript.getShortName(i); 259 if(shortName.equals("")){ 260 errln("FAILED: getName for code : "+i); 261 } 262 } 263 } 264 @Test TestAllCodepoints()265 public void TestAllCodepoints(){ 266 int code; 267 //String oldId=""; 268 //String oldAbbrId=""; 269 for( int i =0; i <= 0x10ffff; i++){ 270 code =UScript.INVALID_CODE; 271 code = UScript.getScript(i); 272 if(code==UScript.INVALID_CODE){ 273 errln("UScript.getScript for codepoint 0x"+ hex(i)+" failed"); 274 } 275 String id =UScript.getName(code); 276 if(id.indexOf("INVALID")>=0){ 277 errln("UScript.getScript for codepoint 0x"+ hex(i)+" failed"); 278 } 279 String abbr = UScript.getShortName(code); 280 if(abbr.indexOf("INV")>=0){ 281 errln("UScript.getScript for codepoint 0x"+ hex(i)+" failed"); 282 } 283 } 284 } 285 @Test TestNewCode()286 public void TestNewCode(){ 287 /* 288 * These script codes were originally added to ICU pre-3.6, so that ICU would 289 * have all ISO 15924 script codes. ICU was then based on Unicode 4.1. 290 * These script codes were added with only short names because we don't 291 * want to invent long names ourselves. 292 * Unicode 5 and later encode some of these scripts and give them long names. 293 * Whenever this happens, the long script names here need to be updated. 294 */ 295 String[] expectedLong = new String[]{ 296 "Balinese", "Batak", "Blis", "Brahmi", "Cham", "Cirt", "Cyrs", 297 "Egyd", "Egyh", "Egyptian_Hieroglyphs", 298 "Geok", "Hans", "Hant", "Pahawh_Hmong", "Old_Hungarian", "Inds", 299 "Javanese", "Kayah_Li", "Latf", "Latg", 300 "Lepcha", "Linear_A", "Mandaic", "Maya", "Meroitic_Hieroglyphs", 301 "Nko", "Old_Turkic", "Old_Permic", "Phags_Pa", "Phoenician", 302 "Miao", "Roro", "Sara", "Syre", "Syrj", "Syrn", "Teng", "Vai", "Visp", "Cuneiform", 303 "Zxxx", "Unknown", 304 "Carian", "Jpan", "Tai_Tham", "Lycian", "Lydian", "Ol_Chiki", "Rejang", "Saurashtra", "SignWriting", "Sundanese", 305 "Moon", "Meetei_Mayek", 306 /* new in ICU 4.0 */ 307 "Imperial_Aramaic", "Avestan", "Chakma", "Kore", 308 "Kaithi", "Manichaean", "Inscriptional_Pahlavi", "Psalter_Pahlavi", "Phlv", 309 "Inscriptional_Parthian", "Samaritan", "Tai_Viet", 310 "Zmth", "Zsym", 311 /* new in ICU 4.4 */ 312 "Bamum", "Lisu", "Nkgb", "Old_South_Arabian", 313 /* new in ICU 4.6 */ 314 "Bassa_Vah", "Duployan", "Elbasan", "Grantha", "Kpel", 315 "Loma", "Mende_Kikakui", "Meroitic_Cursive", 316 "Old_North_Arabian", "Nabataean", "Palmyrene", "Khudawadi", "Warang_Citi", 317 /* new in ICU 4.8 */ 318 "Afak", "Jurc", "Mro", "Nushu", "Sharada", "Sora_Sompeng", "Takri", "Tangut", "Wole", 319 /* new in ICU 49 */ 320 "Anatolian_Hieroglyphs", "Khojki", "Tirhuta", 321 /* new in ICU 52 */ 322 "Caucasian_Albanian", "Mahajani", 323 /* new in ICU 54 */ 324 "Ahom", "Hatran", "Modi", "Multani", "Pau_Cin_Hau", "Siddham", 325 // new in ICU 58 326 "Adlam", "Bhaiksuki", "Marchen", "Newa", "Osage", "Hanb", "Jamo", "Zsye", 327 // new in ICU 60 328 "Masaram_Gondi", "Soyombo", "Zanabazar_Square", 329 // new in ICU 61 330 "Dogra", "Gunjala_Gondi", "Makasar", "Medefaidrin", 331 "Hanifi_Rohingya", "Sogdian", "Old_Sogdian", 332 }; 333 String[] expectedShort = new String[]{ 334 "Bali", "Batk", "Blis", "Brah", "Cham", "Cirt", "Cyrs", "Egyd", "Egyh", "Egyp", 335 "Geok", "Hans", "Hant", "Hmng", "Hung", "Inds", "Java", "Kali", "Latf", "Latg", 336 "Lepc", "Lina", "Mand", "Maya", "Mero", "Nkoo", "Orkh", "Perm", "Phag", "Phnx", 337 "Plrd", "Roro", "Sara", "Syre", "Syrj", "Syrn", "Teng", "Vaii", "Visp", "Xsux", 338 "Zxxx", "Zzzz", 339 "Cari", "Jpan", "Lana", "Lyci", "Lydi", "Olck", "Rjng", "Saur", "Sgnw", "Sund", 340 "Moon", "Mtei", 341 /* new in ICU 4.0 */ 342 "Armi", "Avst", "Cakm", "Kore", 343 "Kthi", "Mani", "Phli", "Phlp", "Phlv", "Prti", "Samr", "Tavt", 344 "Zmth", "Zsym", 345 /* new in ICU 4.4 */ 346 "Bamu", "Lisu", "Nkgb", "Sarb", 347 /* new in ICU 4.6 */ 348 "Bass", "Dupl", "Elba", "Gran", "Kpel", "Loma", "Mend", "Merc", 349 "Narb", "Nbat", "Palm", "Sind", "Wara", 350 /* new in ICU 4.8 */ 351 "Afak", "Jurc", "Mroo", "Nshu", "Shrd", "Sora", "Takr", "Tang", "Wole", 352 /* new in ICU 49 */ 353 "Hluw", "Khoj", "Tirh", 354 /* new in ICU 52 */ 355 "Aghb", "Mahj", 356 /* new in ICU 54 */ 357 "Ahom", "Hatr", "Modi", "Mult", "Pauc", "Sidd", 358 // new in ICU 58 359 "Adlm", "Bhks", "Marc", "Newa", "Osge", "Hanb", "Jamo", "Zsye", 360 // new in ICU 60 361 "Gonm", "Soyo", "Zanb", 362 // new in ICU 61 363 "Dogr", "Gong", "Maka", "Medf", "Rohg", "Sogd", "Sogo", 364 }; 365 if(expectedLong.length!=(UScript.CODE_LIMIT-UScript.BALINESE)) { 366 errln("need to add new script codes in lang.TestUScript.java!"); 367 return; 368 } 369 int j = 0; 370 int i = 0; 371 for(i=UScript.BALINESE; i<UScript.CODE_LIMIT; i++, j++){ 372 String name = UScript.getName(i); 373 if(name==null || !name.equals(expectedLong[j])){ 374 errln("UScript.getName failed for code"+ i + name +"!=" +expectedLong[j]); 375 } 376 name = UScript.getShortName(i); 377 if(name==null || !name.equals(expectedShort[j])){ 378 errln("UScript.getShortName failed for code"+ i + name +"!=" +expectedShort[j]); 379 } 380 } 381 for(i=0; i<expectedLong.length; i++){ 382 int[] ret = UScript.getCode(expectedShort[i]); 383 if(ret.length>1){ 384 errln("UScript.getCode did not return expected number of codes for script"+ expectedShort[i]+". EXPECTED: 1 GOT: "+ ret.length); 385 } 386 if(ret[0]!= (UScript.BALINESE+i)){ 387 errln("UScript.getCode did not return expected code for script"+ expectedShort[i]+". EXPECTED: "+ (UScript.BALINESE+i)+" GOT: %i\n"+ ret[0] ); 388 } 389 } 390 } 391 } 392