• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /**
4 *******************************************************************************
5 * Copyright (C) 1996-2016, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9 
10 package com.ibm.icu.dev.test.lang;
11 
12 import java.util.BitSet;
13 
14 import org.junit.Test;
15 import org.junit.runner.RunWith;
16 import org.junit.runners.JUnit4;
17 
18 import com.ibm.icu.dev.test.TestFmwk;
19 import com.ibm.icu.lang.UProperty;
20 import com.ibm.icu.lang.UScript;
21 import com.ibm.icu.lang.UScript.ScriptUsage;
22 import com.ibm.icu.text.UnicodeSet;
23 
24 @RunWith(JUnit4.class)
25 public class TestUScript extends TestFmwk {
26 
27     /**
28     * Constructor
29     */
TestUScript()30     public TestUScript()
31     {
32     }
33 
34     @Test
TestGetScriptOfCharsWithScriptExtensions()35     public void TestGetScriptOfCharsWithScriptExtensions() {
36         /* test characters which have Script_Extensions */
37         if(!(
38             UScript.COMMON==UScript.getScript(0x0640) &&
39             UScript.INHERITED==UScript.getScript(0x0650) &&
40             UScript.ARABIC==UScript.getScript(0xfdf2))
41         ) {
42             errln("UScript.getScript(character with Script_Extensions) failed");
43         }
44     }
45 
46     @Test
TestHasScript()47     public void TestHasScript() {
48         if(!(
49             !UScript.hasScript(0x063f, UScript.COMMON) &&
50             UScript.hasScript(0x063f, UScript.ARABIC) &&  /* main Script value */
51             !UScript.hasScript(0x063f, UScript.SYRIAC) &&
52             !UScript.hasScript(0x063f, UScript.THAANA))
53         ) {
54             errln("UScript.hasScript(U+063F, ...) is wrong");
55         }
56         if(!(
57             !UScript.hasScript(0x0640, UScript.COMMON) &&  /* main Script value */
58             UScript.hasScript(0x0640, UScript.ARABIC) &&
59             UScript.hasScript(0x0640, UScript.SYRIAC) &&
60             !UScript.hasScript(0x0640, UScript.THAANA))
61         ) {
62             errln("UScript.hasScript(U+0640, ...) is wrong");
63         }
64         if(!(
65             !UScript.hasScript(0x0650, UScript.INHERITED) &&  /* main Script value */
66             UScript.hasScript(0x0650, UScript.ARABIC) &&
67             UScript.hasScript(0x0650, UScript.SYRIAC) &&
68             !UScript.hasScript(0x0650, UScript.THAANA))
69         ) {
70             errln("UScript.hasScript(U+0650, ...) is wrong");
71         }
72         if(!(
73             !UScript.hasScript(0x0660, UScript.COMMON) &&  /* main Script value */
74             UScript.hasScript(0x0660, UScript.ARABIC) &&
75             !UScript.hasScript(0x0660, UScript.SYRIAC) &&
76             UScript.hasScript(0x0660, UScript.THAANA))
77         ) {
78             errln("UScript.hasScript(U+0660, ...) is wrong");
79         }
80         if(!(
81             !UScript.hasScript(0xfdf2, UScript.COMMON) &&
82             UScript.hasScript(0xfdf2, UScript.ARABIC) &&  /* main Script value */
83             !UScript.hasScript(0xfdf2, UScript.SYRIAC) &&
84             UScript.hasScript(0xfdf2, UScript.THAANA))
85         ) {
86             errln("UScript.hasScript(U+FDF2, ...) is wrong");
87         }
88         if(UScript.hasScript(0x0640, 0xaffe)) {
89             // An unguarded implementation might go into an infinite loop.
90             errln("UScript.hasScript(U+0640, bogus 0xaffe) is wrong");
91         }
92     }
93 
94     @Test
TestGetScriptExtensions()95     public void TestGetScriptExtensions() {
96         BitSet scripts=new BitSet(UScript.CODE_LIMIT);
97 
98         /* invalid code points */
99         if(UScript.getScriptExtensions(-1, scripts)!=UScript.UNKNOWN || scripts.cardinality()!=1 ||
100                 !scripts.get(UScript.UNKNOWN)) {
101             errln("UScript.getScriptExtensions(-1) is not {UNKNOWN}");
102         }
103         if(UScript.getScriptExtensions(0x110000, scripts)!=UScript.UNKNOWN || scripts.cardinality()!=1 ||
104                 !scripts.get(UScript.UNKNOWN)) {
105             errln("UScript.getScriptExtensions(0x110000) is not {UNKNOWN}");
106         }
107 
108         /* normal usage */
109         if(UScript.getScriptExtensions(0x063f, scripts)!=UScript.ARABIC || scripts.cardinality()!=1 ||
110                 !scripts.get(UScript.ARABIC)) {
111             errln("UScript.getScriptExtensions(U+063F) is not {ARABIC}");
112         }
113         if(UScript.getScriptExtensions(0x0640, scripts)>-3 || scripts.cardinality()<3 ||
114            !scripts.get(UScript.ARABIC) || !scripts.get(UScript.SYRIAC) || !scripts.get(UScript.MANDAIC)
115         ) {
116             errln("UScript.getScriptExtensions(U+0640) failed");
117         }
118         if(UScript.getScriptExtensions(0xfdf2, scripts)!=-2 || scripts.cardinality()!=2 ||
119                 !scripts.get(UScript.ARABIC) || !scripts.get(UScript.THAANA)) {
120             errln("UScript.getScriptExtensions(U+FDF2) failed");
121         }
122         if(UScript.getScriptExtensions(0xff65, scripts)!=-6 || scripts.cardinality()!=6 ||
123                 !scripts.get(UScript.BOPOMOFO) || !scripts.get(UScript.YI)) {
124             errln("UScript.getScriptExtensions(U+FF65) failed");
125         }
126     }
127 
128     @Test
TestDefaultScriptExtensions()129     public void TestDefaultScriptExtensions() {
130         // Block 3000..303F CJK Symbols and Punctuation defaults to scx=Bopo Hang Hani Hira Kana Yiii
131         // but some of its characters revert to scx=<script> which is usually Common.
132         BitSet scx = new BitSet();
133         assertEquals("U+3000 num scx",  // IDEOGRAPHIC SPACE
134                 UScript.COMMON,
135                 UScript.getScriptExtensions(0x3000, scx));
136         scx.clear();
137         assertEquals("U+3012 num scx",  // POSTAL MARK
138                 UScript.COMMON,
139                 UScript.getScriptExtensions(0x3012, scx));
140     }
141 
142     @Test
TestScriptMetadataAPI()143     public void TestScriptMetadataAPI() {
144         /* API & code coverage. */
145         String sample = UScript.getSampleString(UScript.LATIN);
146         if(sample.length()!=1 || UScript.getScript(sample.charAt(0))!=UScript.LATIN) {
147             errln("UScript.getSampleString(Latn) failed");
148         }
149         sample = UScript.getSampleString(UScript.INVALID_CODE);
150         if(sample.length()!=0) {
151             errln("UScript.getSampleString(invalid) failed");
152         }
153 
154         if(UScript.getUsage(UScript.LATIN)!=ScriptUsage.RECOMMENDED ||
155                 // Unicode 10 gives up on "aspirational".
156                 UScript.getUsage(UScript.YI)!=ScriptUsage.LIMITED_USE ||
157                 UScript.getUsage(UScript.CHEROKEE)!=ScriptUsage.LIMITED_USE ||
158                 UScript.getUsage(UScript.COPTIC)!=ScriptUsage.EXCLUDED ||
159                 UScript.getUsage(UScript.CIRTH)!=ScriptUsage.NOT_ENCODED ||
160                 UScript.getUsage(UScript.INVALID_CODE)!=ScriptUsage.NOT_ENCODED ||
161                 UScript.getUsage(UScript.CODE_LIMIT)!=ScriptUsage.NOT_ENCODED) {
162             errln("UScript.getUsage() failed");
163         }
164 
165         if(UScript.isRightToLeft(UScript.LATIN) ||
166                 UScript.isRightToLeft(UScript.CIRTH) ||
167                 !UScript.isRightToLeft(UScript.ARABIC) ||
168                 !UScript.isRightToLeft(UScript.HEBREW)) {
169             errln("UScript.isRightToLeft() failed");
170         }
171 
172         if(UScript.breaksBetweenLetters(UScript.LATIN) ||
173                 UScript.breaksBetweenLetters(UScript.CIRTH) ||
174                 !UScript.breaksBetweenLetters(UScript.HAN) ||
175                 !UScript.breaksBetweenLetters(UScript.THAI)) {
176             errln("UScript.breaksBetweenLetters() failed");
177         }
178 
179         if(UScript.isCased(UScript.CIRTH) ||
180                 UScript.isCased(UScript.HAN) ||
181                 !UScript.isCased(UScript.LATIN) ||
182                 !UScript.isCased(UScript.GREEK)) {
183             errln("UScript.isCased() failed");
184         }
185     }
186 
187     /**
188      * Maps a special script code to the most common script of its encoded characters.
189      */
getCharScript(int script)190     private static final int getCharScript(int script) {
191         switch(script) {
192         case UScript.HAN_WITH_BOPOMOFO:
193         case UScript.SIMPLIFIED_HAN:
194         case UScript.TRADITIONAL_HAN:
195             return UScript.HAN;
196         case UScript.JAPANESE:
197             return UScript.HIRAGANA;
198         case UScript.JAMO:
199         case UScript.KOREAN:
200             return UScript.HANGUL;
201         case UScript.SYMBOLS_EMOJI:
202             return UScript.SYMBOLS;
203         default:
204             return script;
205         }
206     }
207 
208     @Test
TestScriptMetadata()209     public void TestScriptMetadata() {
210         UnicodeSet rtl = new UnicodeSet("[[:bc=R:][:bc=AL:]-[:Cn:]-[:sc=Common:]]");
211         // So far, sample characters are uppercase.
212         // Georgian is special.
213         UnicodeSet cased = new UnicodeSet("[[:Lu:]-[:sc=Common:]-[:sc=Geor:]]");
214         for(int sc = 0; sc < UScript.CODE_LIMIT; ++sc) {
215             String sn = UScript.getShortName(sc);
216             ScriptUsage usage = UScript.getUsage(sc);
217             String sample = UScript.getSampleString(sc);
218             UnicodeSet scriptSet = new UnicodeSet();
219             scriptSet.applyIntPropertyValue(UProperty.SCRIPT, sc);
220             if(usage == ScriptUsage.NOT_ENCODED) {
221                 assertTrue(sn + " not encoded, no sample", sample.isEmpty());
222                 assertFalse(sn + " not encoded, not RTL", UScript.isRightToLeft(sc));
223                 assertFalse(sn + " not encoded, not LB letters", UScript.breaksBetweenLetters(sc));
224                 assertFalse(sn + " not encoded, not cased", UScript.isCased(sc));
225                 assertTrue(sn + " not encoded, no characters", scriptSet.isEmpty());
226             } else {
227                 assertFalse(sn + " encoded, has a sample character", sample.isEmpty());
228                 int firstChar = sample.codePointAt(0);
229                 int charScript = getCharScript(sc);
230                 assertEquals(sn + " script(sample(script))",
231                              charScript, UScript.getScript(firstChar));
232                 assertEquals(sn + " RTL vs. set", rtl.contains(firstChar), UScript.isRightToLeft(sc));
233                 assertEquals(sn + " cased vs. set", cased.contains(firstChar), UScript.isCased(sc));
234                 assertEquals(sn + " encoded, has characters", sc == charScript, !scriptSet.isEmpty());
235                 if(UScript.isRightToLeft(sc)) {
236                     rtl.removeAll(scriptSet);
237                 }
238                 if(UScript.isCased(sc)) {
239                     cased.removeAll(scriptSet);
240                 }
241             }
242         }
243         assertEquals("no remaining RTL characters", "[]", rtl.toPattern(true));
244         assertEquals("no remaining cased characters", "[]", cased.toPattern(true));
245 
246         assertTrue("Hani breaks between letters", UScript.breaksBetweenLetters(UScript.HAN));
247         assertTrue("Thai breaks between letters", UScript.breaksBetweenLetters(UScript.THAI));
248         assertFalse("Latn does not break between letters", UScript.breaksBetweenLetters(UScript.LATIN));
249     }
250 
251     @Test
TestScriptNames()252     public void TestScriptNames(){
253         for(int i=0; i<UScript.CODE_LIMIT;i++){
254             String name = UScript.getName(i);
255             if(name.equals("") ){
256                 errln("FAILED: getName for code : "+i);
257             }
258             String shortName= UScript.getShortName(i);
259             if(shortName.equals("")){
260                 errln("FAILED: getName for code : "+i);
261             }
262         }
263     }
264     @Test
TestAllCodepoints()265     public void TestAllCodepoints(){
266         int code;
267         //String oldId="";
268         //String oldAbbrId="";
269         for( int i =0; i <= 0x10ffff; i++){
270           code =UScript.INVALID_CODE;
271           code = UScript.getScript(i);
272           if(code==UScript.INVALID_CODE){
273                 errln("UScript.getScript for codepoint 0x"+ hex(i)+" failed");
274           }
275           String id =UScript.getName(code);
276           if(id.indexOf("INVALID")>=0){
277                  errln("UScript.getScript for codepoint 0x"+ hex(i)+" failed");
278           }
279           String abbr = UScript.getShortName(code);
280           if(abbr.indexOf("INV")>=0){
281                  errln("UScript.getScript for codepoint 0x"+ hex(i)+" failed");
282           }
283         }
284     }
285     @Test
TestNewCode()286     public void TestNewCode(){
287         /*
288          * These script codes were originally added to ICU pre-3.6, so that ICU would
289          * have all ISO 15924 script codes. ICU was then based on Unicode 4.1.
290          * These script codes were added with only short names because we don't
291          * want to invent long names ourselves.
292          * Unicode 5 and later encode some of these scripts and give them long names.
293          * Whenever this happens, the long script names here need to be updated.
294          */
295         String[] expectedLong = new String[]{
296             "Balinese", "Batak", "Blis", "Brahmi", "Cham", "Cirt", "Cyrs",
297             "Egyd", "Egyh", "Egyptian_Hieroglyphs",
298             "Geok", "Hans", "Hant", "Pahawh_Hmong", "Old_Hungarian", "Inds",
299             "Javanese", "Kayah_Li", "Latf", "Latg",
300             "Lepcha", "Linear_A", "Mandaic", "Maya", "Meroitic_Hieroglyphs",
301             "Nko", "Old_Turkic", "Old_Permic", "Phags_Pa", "Phoenician",
302             "Miao", "Roro", "Sara", "Syre", "Syrj", "Syrn", "Teng", "Vai", "Visp", "Cuneiform",
303             "Zxxx", "Unknown",
304             "Carian", "Jpan", "Tai_Tham", "Lycian", "Lydian", "Ol_Chiki", "Rejang", "Saurashtra", "SignWriting", "Sundanese",
305             "Moon", "Meetei_Mayek",
306             /* new in ICU 4.0 */
307             "Imperial_Aramaic", "Avestan", "Chakma", "Kore",
308             "Kaithi", "Manichaean", "Inscriptional_Pahlavi", "Psalter_Pahlavi", "Phlv",
309             "Inscriptional_Parthian", "Samaritan", "Tai_Viet",
310             "Zmth", "Zsym",
311             /* new in ICU 4.4 */
312             "Bamum", "Lisu", "Nkgb", "Old_South_Arabian",
313             /* new in ICU 4.6 */
314             "Bassa_Vah", "Duployan", "Elbasan", "Grantha", "Kpel",
315             "Loma", "Mende_Kikakui", "Meroitic_Cursive",
316             "Old_North_Arabian", "Nabataean", "Palmyrene", "Khudawadi", "Warang_Citi",
317             /* new in ICU 4.8 */
318             "Afak", "Jurc", "Mro", "Nushu", "Sharada", "Sora_Sompeng", "Takri", "Tangut", "Wole",
319             /* new in ICU 49 */
320             "Anatolian_Hieroglyphs", "Khojki", "Tirhuta",
321             /* new in ICU 52 */
322             "Caucasian_Albanian", "Mahajani",
323             /* new in ICU 54 */
324             "Ahom", "Hatran", "Modi", "Multani", "Pau_Cin_Hau", "Siddham",
325             // new in ICU 58
326             "Adlam", "Bhaiksuki", "Marchen", "Newa", "Osage", "Hanb", "Jamo", "Zsye",
327             // new in ICU 60
328             "Masaram_Gondi", "Soyombo", "Zanabazar_Square",
329             // new in ICU 61
330             "Dogra", "Gunjala_Gondi", "Makasar", "Medefaidrin",
331             "Hanifi_Rohingya", "Sogdian", "Old_Sogdian",
332         };
333         String[] expectedShort = new String[]{
334             "Bali", "Batk", "Blis", "Brah", "Cham", "Cirt", "Cyrs", "Egyd", "Egyh", "Egyp",
335             "Geok", "Hans", "Hant", "Hmng", "Hung", "Inds", "Java", "Kali", "Latf", "Latg",
336             "Lepc", "Lina", "Mand", "Maya", "Mero", "Nkoo", "Orkh", "Perm", "Phag", "Phnx",
337             "Plrd", "Roro", "Sara", "Syre", "Syrj", "Syrn", "Teng", "Vaii", "Visp", "Xsux",
338             "Zxxx", "Zzzz",
339             "Cari", "Jpan", "Lana", "Lyci", "Lydi", "Olck", "Rjng", "Saur", "Sgnw", "Sund",
340             "Moon", "Mtei",
341             /* new in ICU 4.0 */
342             "Armi", "Avst", "Cakm", "Kore",
343             "Kthi", "Mani", "Phli", "Phlp", "Phlv", "Prti", "Samr", "Tavt",
344             "Zmth", "Zsym",
345             /* new in ICU 4.4 */
346             "Bamu", "Lisu", "Nkgb", "Sarb",
347             /* new in ICU 4.6 */
348             "Bass", "Dupl", "Elba", "Gran", "Kpel", "Loma", "Mend", "Merc",
349             "Narb", "Nbat", "Palm", "Sind", "Wara",
350             /* new in ICU 4.8 */
351             "Afak", "Jurc", "Mroo", "Nshu", "Shrd", "Sora", "Takr", "Tang", "Wole",
352             /* new in ICU 49 */
353             "Hluw", "Khoj", "Tirh",
354             /* new in ICU 52 */
355             "Aghb", "Mahj",
356             /* new in ICU 54 */
357             "Ahom", "Hatr", "Modi", "Mult", "Pauc", "Sidd",
358             // new in ICU 58
359             "Adlm", "Bhks", "Marc", "Newa", "Osge", "Hanb", "Jamo", "Zsye",
360             // new in ICU 60
361             "Gonm", "Soyo", "Zanb",
362             // new in ICU 61
363             "Dogr", "Gong", "Maka", "Medf", "Rohg", "Sogd", "Sogo",
364         };
365         if(expectedLong.length!=(UScript.CODE_LIMIT-UScript.BALINESE)) {
366             errln("need to add new script codes in lang.TestUScript.java!");
367             return;
368         }
369         int j = 0;
370         int i = 0;
371         for(i=UScript.BALINESE; i<UScript.CODE_LIMIT; i++, j++){
372             String name = UScript.getName(i);
373             if(name==null || !name.equals(expectedLong[j])){
374                 errln("UScript.getName failed for code"+ i + name +"!=" +expectedLong[j]);
375             }
376             name = UScript.getShortName(i);
377             if(name==null || !name.equals(expectedShort[j])){
378                 errln("UScript.getShortName failed for code"+ i + name +"!=" +expectedShort[j]);
379             }
380         }
381         for(i=0; i<expectedLong.length; i++){
382             int[] ret = UScript.getCode(expectedShort[i]);
383             if(ret.length>1){
384                 errln("UScript.getCode did not return expected number of codes for script"+ expectedShort[i]+". EXPECTED: 1 GOT: "+ ret.length);
385             }
386             if(ret[0]!= (UScript.BALINESE+i)){
387                 errln("UScript.getCode did not return expected code for script"+ expectedShort[i]+". EXPECTED: "+ (UScript.BALINESE+i)+" GOT: %i\n"+ ret[0] );
388             }
389         }
390     }
391 }
392