1 /* GENERATED SOURCE. DO NOT MODIFY. */ 2 // © 2016 and later: Unicode, Inc. and others. 3 // License & terms of use: http://www.unicode.org/copyright.html#License 4 /* 5 ******************************************************************************* 6 * Copyright (C) 2003-2015, International Business Machines Corporation and 7 * others. All Rights Reserved. 8 ******************************************************************************* 9 */ 10 11 package ohos.global.icu.text; 12 13 import java.io.IOException; 14 import java.io.InputStream; 15 import java.lang.ref.WeakReference; 16 import java.nio.ByteBuffer; 17 18 import ohos.global.icu.impl.CharTrie; 19 import ohos.global.icu.impl.ICUBinary; 20 import ohos.global.icu.impl.StringPrepDataReader; 21 import ohos.global.icu.impl.UBiDiProps; 22 import ohos.global.icu.lang.UCharacter; 23 import ohos.global.icu.lang.UCharacterDirection; 24 import ohos.global.icu.util.ICUUncheckedIOException; 25 import ohos.global.icu.util.VersionInfo; 26 27 /** 28 * StringPrep API implements the StingPrep framework as described by 29 * <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>. 30 * StringPrep prepares Unicode strings for use in network protocols. 31 * Profiles of StingPrep are set of rules and data according to which the 32 * Unicode Strings are prepared. Each profiles contains tables which describe 33 * how a code point should be treated. The tables are broadly classied into 34 * <ul> 35 * <li> Unassigned Table: Contains code points that are unassigned 36 * in the Unicode Version supported by StringPrep. Currently 37 * RFC 3454 supports Unicode 3.2. </li> 38 * <li> Prohibited Table: Contains code points that are prohibted from 39 * the output of the StringPrep processing function. </li> 40 * <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. </li> 41 * </ul> 42 * 43 * The procedure for preparing Unicode strings: 44 * <ol> 45 * <li> Map: For each character in the input, check if it has a mapping 46 * and, if so, replace it with its mapping. </li> 47 * <li> Normalize: Possibly normalize the result of step 1 using Unicode 48 * normalization. </li> 49 * <li> Prohibit: Check for any characters that are not allowed in the 50 * output. If any are found, return an error.</li> 51 * <li> Check bidi: Possibly check for right-to-left characters, and if 52 * any are found, make sure that the whole string satisfies the 53 * requirements for bidirectional strings. If the string does not 54 * satisfy the requirements for bidirectional strings, return an 55 * error. </li> 56 * </ol> 57 * @author Ram Viswanadha 58 * @hide exposed on OHOS 59 */ 60 public final class StringPrep { 61 /** 62 * Option to prohibit processing of unassigned code points in the input 63 * 64 * @see #prepare 65 */ 66 public static final int DEFAULT = 0x0000; 67 68 /** 69 * Option to allow processing of unassigned code points in the input 70 * 71 * @see #prepare 72 */ 73 public static final int ALLOW_UNASSIGNED = 0x0001; 74 75 /** 76 * Profile type: RFC3491 Nameprep 77 * @see #getInstance(int) 78 */ 79 public static final int RFC3491_NAMEPREP = 0; 80 81 /** 82 * Profile type: RFC3530 nfs4_cs_prep 83 * @see #getInstance(int) 84 */ 85 public static final int RFC3530_NFS4_CS_PREP = 1; 86 87 /** 88 * Profile type: RFC3530 nfs4_cs_prep with case insensitive option 89 * @see #getInstance(int) 90 */ 91 public static final int RFC3530_NFS4_CS_PREP_CI = 2; 92 93 /** 94 * Profile type: RFC3530 nfs4_cis_prep 95 * @see #getInstance(int) 96 */ 97 public static final int RFC3530_NFS4_CIS_PREP = 3; 98 99 /** 100 * Profile type: RFC3530 nfs4_mixed_prep for prefix 101 * @see #getInstance(int) 102 */ 103 public static final int RFC3530_NFS4_MIXED_PREP_PREFIX = 4; 104 105 /** 106 * Profile type: RFC3530 nfs4_mixed_prep for suffix 107 * @see #getInstance(int) 108 */ 109 public static final int RFC3530_NFS4_MIXED_PREP_SUFFIX = 5; 110 111 /** 112 * Profile type: RFC3722 iSCSI 113 * @see #getInstance(int) 114 */ 115 public static final int RFC3722_ISCSI = 6; 116 117 /** 118 * Profile type: RFC3920 XMPP Nodeprep 119 * @see #getInstance(int) 120 */ 121 public static final int RFC3920_NODEPREP = 7; 122 123 /** 124 * Profile type: RFC3920 XMPP Resourceprep 125 * @see #getInstance(int) 126 */ 127 public static final int RFC3920_RESOURCEPREP = 8; 128 129 /** 130 * Profile type: RFC4011 Policy MIB Stringprep 131 * @see #getInstance(int) 132 */ 133 public static final int RFC4011_MIB = 9; 134 135 /** 136 * Profile type: RFC4013 SASLprep 137 * @see #getInstance(int) 138 */ 139 public static final int RFC4013_SASLPREP = 10; 140 141 /** 142 * Profile type: RFC4505 trace 143 * @see #getInstance(int) 144 */ 145 public static final int RFC4505_TRACE = 11; 146 147 /** 148 * Profile type: RFC4518 LDAP 149 * @see #getInstance(int) 150 */ 151 public static final int RFC4518_LDAP = 12; 152 153 /** 154 * Profile type: RFC4518 LDAP for case ignore, numeric and stored prefix 155 * matching rules 156 * @see #getInstance(int) 157 */ 158 public static final int RFC4518_LDAP_CI = 13; 159 160 // Last available profile 161 private static final int MAX_PROFILE = RFC4518_LDAP_CI; 162 163 // Profile names must be aligned to profile type definitions 164 private static final String[] PROFILE_NAMES = { 165 "rfc3491", /* RFC3491_NAMEPREP */ 166 "rfc3530cs", /* RFC3530_NFS4_CS_PREP */ 167 "rfc3530csci", /* RFC3530_NFS4_CS_PREP_CI */ 168 "rfc3491", /* RFC3530_NSF4_CIS_PREP */ 169 "rfc3530mixp", /* RFC3530_NSF4_MIXED_PREP_PREFIX */ 170 "rfc3491", /* RFC3530_NSF4_MIXED_PREP_SUFFIX */ 171 "rfc3722", /* RFC3722_ISCSI */ 172 "rfc3920node", /* RFC3920_NODEPREP */ 173 "rfc3920res", /* RFC3920_RESOURCEPREP */ 174 "rfc4011", /* RFC4011_MIB */ 175 "rfc4013", /* RFC4013_SASLPREP */ 176 "rfc4505", /* RFC4505_TRACE */ 177 "rfc4518", /* RFC4518_LDAP */ 178 "rfc4518ci", /* RFC4518_LDAP_CI */ 179 }; 180 181 @SuppressWarnings({"unchecked", "rawtypes"}) 182 private static final WeakReference<StringPrep>[] CACHE = (WeakReference<StringPrep>[])new WeakReference[MAX_PROFILE+1]; 183 184 private static final int UNASSIGNED = 0x0000; 185 private static final int MAP = 0x0001; 186 private static final int PROHIBITED = 0x0002; 187 private static final int DELETE = 0x0003; 188 private static final int TYPE_LIMIT = 0x0004; 189 190 private static final int NORMALIZATION_ON = 0x0001; 191 private static final int CHECK_BIDI_ON = 0x0002; 192 193 private static final int TYPE_THRESHOLD = 0xFFF0; 194 private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/ 195 //private static final int MAX_INDEX_TOP_LENGTH = 0x0003; 196 197 /* indexes[] value names */ 198 // private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */ 199 private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */ 200 private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */ 201 private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */ 202 private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */ 203 private static final int THREE_UCHARS_MAPPING_INDEX_START = 5; 204 private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6; 205 private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */ 206 private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */ 207 208 209 // CharTrie implmentation for reading the trie data 210 private CharTrie sprepTrie; 211 // Indexes read from the data file 212 private int[] indexes; 213 // mapping data read from the data file 214 private char[] mappingData; 215 // the version of Unicode supported by the data file 216 private VersionInfo sprepUniVer; 217 // the Unicode version of last entry in the 218 // NormalizationCorrections.txt file if normalization 219 // is turned on 220 private VersionInfo normCorrVer; 221 // Option to turn on Normalization 222 private boolean doNFKC; 223 // Option to turn on checking for BiDi rules 224 private boolean checkBiDi; 225 // bidi properties 226 private UBiDiProps bdp; 227 getCodePointValue(int ch)228 private char getCodePointValue(int ch){ 229 return sprepTrie.getCodePointValue(ch); 230 } 231 getVersionInfo(int comp)232 private static VersionInfo getVersionInfo(int comp){ 233 int micro = comp & 0xFF; 234 int milli =(comp >> 8) & 0xFF; 235 int minor =(comp >> 16) & 0xFF; 236 int major =(comp >> 24) & 0xFF; 237 return VersionInfo.getInstance(major,minor,milli,micro); 238 } 239 getVersionInfo(byte[] version)240 private static VersionInfo getVersionInfo(byte[] version){ 241 if(version.length != 4){ 242 return null; 243 } 244 return VersionInfo.getInstance((int)version[0],(int) version[1],(int) version[2],(int) version[3]); 245 } 246 247 /** 248 * Creates an StringPrep object after reading the input stream. 249 * The object does not hold a reference to the input steam, so the stream can be 250 * closed after the method returns. 251 * 252 * @param inputStream The stream for reading the StringPrep profile binarySun 253 * @throws IOException An exception occurs when I/O of the inputstream is invalid 254 */ StringPrep(InputStream inputStream)255 public StringPrep(InputStream inputStream) throws IOException{ 256 // TODO: Add a public constructor that takes ByteBuffer directly. 257 this(ICUBinary.getByteBufferFromInputStreamAndCloseStream(inputStream)); 258 } 259 StringPrep(ByteBuffer bytes)260 private StringPrep(ByteBuffer bytes) throws IOException { 261 StringPrepDataReader reader = new StringPrepDataReader(bytes); 262 263 // read the indexes 264 indexes = reader.readIndexes(INDEX_TOP); 265 266 sprepTrie = new CharTrie(bytes, null); 267 268 //indexes[INDEX_MAPPING_DATA_SIZE] store the size of mappingData in bytes 269 // load the rest of the data data and initialize the data members 270 mappingData = reader.read(indexes[INDEX_MAPPING_DATA_SIZE]/2); 271 272 // get the options 273 doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0); 274 checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0); 275 sprepUniVer = getVersionInfo(reader.getUnicodeVersion()); 276 normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]); 277 VersionInfo normUniVer = UCharacter.getUnicodeVersion(); 278 if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */ 279 normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */ 280 ((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/ 281 ){ 282 throw new IOException("Normalization Correction version not supported"); 283 } 284 285 if(checkBiDi) { 286 bdp=UBiDiProps.INSTANCE; 287 } 288 } 289 290 /** 291 * Gets a StringPrep instance for the specified profile 292 * 293 * @param profile The profile passed to find the StringPrep instance. 294 */ getInstance(int profile)295 public static StringPrep getInstance(int profile) { 296 if (profile < 0 || profile > MAX_PROFILE) { 297 throw new IllegalArgumentException("Bad profile type"); 298 } 299 300 StringPrep instance = null; 301 302 // A StringPrep instance is immutable. We use a single instance 303 // per type and store it in the internal cache. 304 synchronized (CACHE) { 305 WeakReference<StringPrep> ref = CACHE[profile]; 306 if (ref != null) { 307 instance = ref.get(); 308 } 309 310 if (instance == null) { 311 ByteBuffer bytes = ICUBinary.getRequiredData(PROFILE_NAMES[profile] + ".spp"); 312 if (bytes != null) { 313 try { 314 instance = new StringPrep(bytes); 315 } catch (IOException e) { 316 throw new ICUUncheckedIOException(e); 317 } 318 } 319 if (instance != null) { 320 CACHE[profile] = new WeakReference<StringPrep>(instance); 321 } 322 } 323 } 324 return instance; 325 } 326 327 private static final class Values{ 328 boolean isIndex; 329 int value; 330 int type; reset()331 public void reset(){ 332 isIndex = false; 333 value = 0; 334 type = -1; 335 } 336 } 337 getValues(char trieWord,Values values)338 private static final void getValues(char trieWord,Values values){ 339 values.reset(); 340 if(trieWord == 0){ 341 /* 342 * Initial value stored in the mapping table 343 * just return TYPE_LIMIT .. so that 344 * the source codepoint is copied to the destination 345 */ 346 values.type = TYPE_LIMIT; 347 }else if(trieWord >= TYPE_THRESHOLD){ 348 values.type = (trieWord - TYPE_THRESHOLD); 349 }else{ 350 /* get the type */ 351 values.type = MAP; 352 /* ascertain if the value is index or delta */ 353 if((trieWord & 0x02)>0){ 354 values.isIndex = true; 355 values.value = trieWord >> 2; //mask off the lower 2 bits and shift 356 357 }else{ 358 values.isIndex = false; 359 values.value = (trieWord<<16)>>16; 360 values.value = (values.value >> 2); 361 362 } 363 364 if((trieWord>>2) == MAX_INDEX_VALUE){ 365 values.type = DELETE; 366 values.isIndex = false; 367 values.value = 0; 368 } 369 } 370 } 371 372 373 map( UCharacterIterator iter, int options)374 private StringBuffer map( UCharacterIterator iter, int options) 375 throws StringPrepParseException{ 376 377 Values val = new Values(); 378 char result = 0; 379 int ch = UCharacterIterator.DONE; 380 StringBuffer dest = new StringBuffer(); 381 boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0); 382 383 while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ 384 385 result = getCodePointValue(ch); 386 getValues(result,val); 387 388 // check if the source codepoint is unassigned 389 if(val.type == UNASSIGNED && allowUnassigned == false){ 390 throw new StringPrepParseException("An unassigned code point was found in the input", 391 StringPrepParseException.UNASSIGNED_ERROR, 392 iter.getText(),iter.getIndex()); 393 }else if((val.type == MAP)){ 394 int index, length; 395 396 if(val.isIndex){ 397 index = val.value; 398 if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] && 399 index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){ 400 length = 1; 401 }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] && 402 index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){ 403 length = 2; 404 }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] && 405 index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){ 406 length = 3; 407 }else{ 408 length = mappingData[index++]; 409 } 410 /* copy mapping to destination */ 411 dest.append(mappingData,index,length); 412 continue; 413 414 }else{ 415 ch -= val.value; 416 } 417 }else if(val.type == DELETE){ 418 // just consume the codepoint and contine 419 continue; 420 } 421 //copy the source into destination 422 UTF16.append(dest,ch); 423 } 424 425 return dest; 426 } 427 428 normalize(StringBuffer src)429 private StringBuffer normalize(StringBuffer src){ 430 return new StringBuffer( 431 Normalizer.normalize( 432 src.toString(), 433 Normalizer.NFKC, 434 Normalizer.UNICODE_3_2)); 435 } 436 /* 437 boolean isLabelSeparator(int ch){ 438 int result = getCodePointValue(ch); 439 if( (result & 0x07) == LABEL_SEPARATOR){ 440 return true; 441 } 442 return false; 443 } 444 */ 445 /* 446 1) Map -- For each character in the input, check if it has a mapping 447 and, if so, replace it with its mapping. 448 449 2) Normalize -- Possibly normalize the result of step 1 using Unicode 450 normalization. 451 452 3) Prohibit -- Check for any characters that are not allowed in the 453 output. If any are found, return an error. 454 455 4) Check bidi -- Possibly check for right-to-left characters, and if 456 any are found, make sure that the whole string satisfies the 457 requirements for bidirectional strings. If the string does not 458 satisfy the requirements for bidirectional strings, return an 459 error. 460 [Unicode3.2] defines several bidirectional categories; each character 461 has one bidirectional category assigned to it. For the purposes of 462 the requirements below, an "RandALCat character" is a character that 463 has Unicode bidirectional categories "R" or "AL"; an "LCat character" 464 is a character that has Unicode bidirectional category "L". Note 465 466 467 that there are many characters which fall in neither of the above 468 definitions; Latin digits (<U+0030> through <U+0039>) are examples of 469 this because they have bidirectional category "EN". 470 471 In any profile that specifies bidirectional character handling, all 472 three of the following requirements MUST be met: 473 474 1) The characters in section 5.8 MUST be prohibited. 475 476 2) If a string contains any RandALCat character, the string MUST NOT 477 contain any LCat character. 478 479 3) If a string contains any RandALCat character, a RandALCat 480 character MUST be the first character of the string, and a 481 RandALCat character MUST be the last character of the string. 482 */ 483 /** 484 * Prepare the input buffer for use in applications with the given profile. This operation maps, normalizes(NFKC), 485 * checks for prohibited and BiDi characters in the order defined by RFC 3454 486 * depending on the options specified in the profile. 487 * 488 * @param src A UCharacterIterator object containing the source string 489 * @param options A bit set of options: 490 * <ul> 491 * <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li> 492 * <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points are in the input 493 * as normal Unicode code points.</li> 494 * </ul> 495 * @return StringBuffer A StringBuffer containing the output 496 * @throws StringPrepParseException An exception occurs when parsing a string is invalid. 497 */ prepare(UCharacterIterator src, int options)498 public StringBuffer prepare(UCharacterIterator src, int options) 499 throws StringPrepParseException{ 500 501 // map 502 StringBuffer mapOut = map(src,options); 503 StringBuffer normOut = mapOut;// initialize 504 505 if(doNFKC){ 506 // normalize 507 normOut = normalize(mapOut); 508 } 509 510 int ch; 511 char result; 512 UCharacterIterator iter = UCharacterIterator.getInstance(normOut); 513 Values val = new Values(); 514 int direction=UCharacterDirection.CHAR_DIRECTION_COUNT, 515 firstCharDir=UCharacterDirection.CHAR_DIRECTION_COUNT; 516 int rtlPos=-1, ltrPos=-1; 517 boolean rightToLeft=false, leftToRight=false; 518 519 while((ch=iter.nextCodePoint())!= UCharacterIterator.DONE){ 520 result = getCodePointValue(ch); 521 getValues(result,val); 522 523 if(val.type == PROHIBITED ){ 524 throw new StringPrepParseException("A prohibited code point was found in the input", 525 StringPrepParseException.PROHIBITED_ERROR,iter.getText(),val.value); 526 } 527 528 if(checkBiDi) { 529 direction = bdp.getClass(ch); 530 if(firstCharDir == UCharacterDirection.CHAR_DIRECTION_COUNT){ 531 firstCharDir = direction; 532 } 533 if(direction == UCharacterDirection.LEFT_TO_RIGHT){ 534 leftToRight = true; 535 ltrPos = iter.getIndex()-1; 536 } 537 if(direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){ 538 rightToLeft = true; 539 rtlPos = iter.getIndex()-1; 540 } 541 } 542 } 543 if(checkBiDi == true){ 544 // satisfy 2 545 if( leftToRight == true && rightToLeft == true){ 546 throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.", 547 StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(), 548 (rtlPos>ltrPos) ? rtlPos : ltrPos); 549 } 550 551 //satisfy 3 552 if( rightToLeft == true && 553 !((firstCharDir == UCharacterDirection.RIGHT_TO_LEFT || firstCharDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC) && 554 (direction == UCharacterDirection.RIGHT_TO_LEFT || direction == UCharacterDirection.RIGHT_TO_LEFT_ARABIC)) 555 ){ 556 throw new StringPrepParseException("The input does not conform to the rules for BiDi code points.", 557 StringPrepParseException.CHECK_BIDI_ERROR,iter.getText(), 558 (rtlPos>ltrPos) ? rtlPos : ltrPos); 559 } 560 } 561 return normOut; 562 563 } 564 565 /** 566 * Prepare the input String for use in applications with the given profile. This operation maps, normalizes(NFKC), 567 * checks for prohibited and BiDi characters in the order defined by RFC 3454 568 * depending on the options specified in the profile. 569 * 570 * @param src A string 571 * @param options A bit set of options: 572 * <ul> 573 * <li>{@link #DEFAULT} Prohibit processing of unassigned code points in the input</li> 574 * <li>{@link #ALLOW_UNASSIGNED} Treat the unassigned code points are in the input 575 * as normal Unicode code points.</li> 576 * </ul> 577 * @return String A String containing the output 578 * @throws StringPrepParseException An exception when parsing or preparing a string is invalid. 579 */ prepare(String src, int options)580 public String prepare(String src, int options) 581 throws StringPrepParseException{ 582 StringBuffer result = prepare(UCharacterIterator.getInstance(src), options); 583 return result.toString(); 584 } 585 } 586