1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 1996-2014, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.impl; 11 12 import java.io.IOException; 13 import java.nio.ByteBuffer; 14 import java.util.Locale; 15 import java.util.MissingResourceException; 16 17 import com.ibm.icu.lang.UCharacter; 18 import com.ibm.icu.lang.UCharacterCategory; 19 import com.ibm.icu.text.UTF16; 20 import com.ibm.icu.text.UnicodeSet; 21 22 /** 23 * Internal class to manage character names. 24 * Since data for names are stored 25 * in an array of char, by default indexes used in this class is refering to 26 * a 2 byte count, unless otherwise stated. Cases where the index is refering 27 * to a byte count, the index is halved and depending on whether the index is 28 * even or odd, the MSB or LSB of the result char at the halved index is 29 * returned. For indexes to an array of int, the index is multiplied by 2, 30 * result char at the multiplied index and its following char is returned as an 31 * int. 32 * <a href=../lang/UCharacter.html>UCharacter</a> acts as a public facade for this class 33 * Note : 0 - 0x1F are control characters without names in Unicode 3.0 34 * @author Syn Wee Quek 35 * @since nov0700 36 */ 37 38 public final class UCharacterName 39 { 40 // public data members ---------------------------------------------- 41 42 /* 43 * public singleton instance 44 */ 45 public static final UCharacterName INSTANCE; 46 47 static { 48 try { 49 INSTANCE = new UCharacterName(); 50 } catch (IOException e) { 51 ///CLOVER:OFF 52 throw new MissingResourceException("Could not construct UCharacterName. Missing unames.icu","",""); 53 ///CLOVER:ON 54 } 55 } 56 57 /** 58 * Number of lines per group 59 * 1 << GROUP_SHIFT_ 60 */ 61 public static final int LINES_PER_GROUP_ = 1 << 5; 62 /** 63 * Maximum number of groups 64 */ 65 public int m_groupcount_ = 0; 66 67 // public methods --------------------------------------------------- 68 69 /** 70 * Retrieve the name of a Unicode code point. 71 * Depending on <code>choice</code>, the character name written into the 72 * buffer is the "modern" name or the name that was defined in Unicode 73 * version 1.0. 74 * The name contains only "invariant" characters 75 * like A-Z, 0-9, space, and '-'. 76 * 77 * @param ch the code point for which to get the name. 78 * @param choice Selector for which name to get. 79 * @return if code point is above 0x1fff, null is returned 80 */ getName(int ch, int choice)81 public String getName(int ch, int choice) 82 { 83 if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE || 84 choice > UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT) { 85 return null; 86 } 87 88 String result = null; 89 90 result = getAlgName(ch, choice); 91 92 // getting normal character name 93 if (result == null || result.length() == 0) { 94 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) { 95 result = getExtendedName(ch); 96 } else { 97 result = getGroupName(ch, choice); 98 } 99 } 100 101 return result; 102 } 103 104 /** 105 * Find a character by its name and return its code point value 106 * @param choice selector to indicate if argument name is a Unicode 1.0 107 * or the most current version 108 * @param name the name to search for 109 * @return code point 110 */ getCharFromName(int choice, String name)111 public int getCharFromName(int choice, String name) 112 { 113 // checks for illegal arguments 114 if (choice >= UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT || 115 name == null || name.length() == 0) { 116 return -1; 117 } 118 119 // try extended names first 120 int result = getExtendedChar(name.toLowerCase(Locale.ENGLISH), choice); 121 if (result >= -1) { 122 return result; 123 } 124 125 String upperCaseName = name.toUpperCase(Locale.ENGLISH); 126 // try algorithmic names first, if fails then try group names 127 // int result = getAlgorithmChar(choice, uppercasename); 128 129 if (choice == UCharacterNameChoice.UNICODE_CHAR_NAME || 130 choice == UCharacterNameChoice.EXTENDED_CHAR_NAME 131 ) { 132 int count = 0; 133 if (m_algorithm_ != null) { 134 count = m_algorithm_.length; 135 } 136 for (count --; count >= 0; count --) { 137 result = m_algorithm_[count].getChar(upperCaseName); 138 if (result >= 0) { 139 return result; 140 } 141 } 142 } 143 144 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) { 145 result = getGroupChar(upperCaseName, 146 UCharacterNameChoice.UNICODE_CHAR_NAME); 147 if (result == -1) { 148 result = getGroupChar(upperCaseName, 149 UCharacterNameChoice.CHAR_NAME_ALIAS); 150 } 151 } 152 else { 153 result = getGroupChar(upperCaseName, choice); 154 } 155 return result; 156 } 157 158 // these are all UCharacterNameIterator use methods ------------------- 159 160 /** 161 * Reads a block of compressed lengths of 32 strings and expands them into 162 * offsets and lengths for each string. Lengths are stored with a 163 * variable-width encoding in consecutive nibbles: 164 * If a nibble<0xc, then it is the length itself (0 = empty string). 165 * If a nibble>=0xc, then it forms a length value with the following 166 * nibble. 167 * The offsets and lengths arrays must be at least 33 (one more) long 168 * because there is no check here at the end if the last nibble is still 169 * used. 170 * @param index of group string object in array 171 * @param offsets array to store the value of the string offsets 172 * @param lengths array to store the value of the string length 173 * @return next index of the data string immediately after the lengths 174 * in terms of byte address 175 */ getGroupLengths(int index, char offsets[], char lengths[])176 public int getGroupLengths(int index, char offsets[], char lengths[]) 177 { 178 char length = 0xffff; 179 byte b = 0, 180 n = 0; 181 int shift; 182 index = index * m_groupsize_; // byte count offsets of group strings 183 int stringoffset = UCharacterUtility.toInt( 184 m_groupinfo_[index + OFFSET_HIGH_OFFSET_], 185 m_groupinfo_[index + OFFSET_LOW_OFFSET_]); 186 187 offsets[0] = 0; 188 189 // all 32 lengths must be read to get the offset of the first group 190 // string 191 for (int i = 0; i < LINES_PER_GROUP_; stringoffset ++) { 192 b = m_groupstring_[stringoffset]; 193 shift = 4; 194 195 while (shift >= 0) { 196 // getting nibble 197 n = (byte)((b >> shift) & 0x0F); 198 if (length == 0xffff && n > SINGLE_NIBBLE_MAX_) { 199 length = (char)((n - 12) << 4); 200 } 201 else { 202 if (length != 0xffff) { 203 lengths[i] = (char)((length | n) + 12); 204 } 205 else { 206 lengths[i] = (char)n; 207 } 208 209 if (i < LINES_PER_GROUP_) { 210 offsets[i + 1] = (char)(offsets[i] + lengths[i]); 211 } 212 213 length = 0xffff; 214 i ++; 215 } 216 217 shift -= 4; 218 } 219 } 220 return stringoffset; 221 } 222 223 /** 224 * Gets the name of the argument group index. 225 * UnicodeData.txt uses ';' as a field separator, so no field can contain 226 * ';' as part of its contents. In unames.icu, it is marked as 227 * token[';'] == -1 only if the semicolon is used in the data file - which 228 * is iff we have Unicode 1.0 names or ISO comments or aliases. 229 * So, it will be token[';'] == -1 if we store U1.0 names/ISO comments/aliases 230 * although we know that it will never be part of a name. 231 * Equivalent to ICU4C's expandName. 232 * @param index of the group name string in byte count 233 * @param length of the group name string 234 * @param choice of Unicode 1.0 name or the most current name 235 * @return name of the group 236 */ getGroupName(int index, int length, int choice)237 public String getGroupName(int index, int length, int choice) 238 { 239 if (choice != UCharacterNameChoice.UNICODE_CHAR_NAME && 240 choice != UCharacterNameChoice.EXTENDED_CHAR_NAME 241 ) { 242 if (';' >= m_tokentable_.length || m_tokentable_[';'] == 0xFFFF) { 243 /* 244 * skip the modern name if it is not requested _and_ 245 * if the semicolon byte value is a character, not a token number 246 */ 247 int fieldIndex= choice==UCharacterNameChoice.ISO_COMMENT_ ? 2 : choice; 248 do { 249 int oldindex = index; 250 index += UCharacterUtility.skipByteSubString(m_groupstring_, 251 index, length, (byte)';'); 252 length -= (index - oldindex); 253 } while(--fieldIndex>0); 254 } 255 else { 256 // the semicolon byte is a token number, therefore only modern 257 // names are stored in unames.dat and there is no such 258 // requested alternate name here 259 length = 0; 260 } 261 } 262 263 synchronized (m_utilStringBuffer_) { 264 m_utilStringBuffer_.setLength(0); 265 byte b; 266 char token; 267 for (int i = 0; i < length;) { 268 b = m_groupstring_[index + i]; 269 i ++; 270 271 if (b >= m_tokentable_.length) { 272 if (b == ';') { 273 break; 274 } 275 m_utilStringBuffer_.append(b); // implicit letter 276 } 277 else { 278 token = m_tokentable_[b & 0x00ff]; 279 if (token == 0xFFFE) { 280 // this is a lead byte for a double-byte token 281 token = m_tokentable_[b << 8 | 282 (m_groupstring_[index + i] & 0x00ff)]; 283 i ++; 284 } 285 if (token == 0xFFFF) { 286 if (b == ';') { 287 // skip the semicolon if we are seeking extended 288 // names and there was no 2.0 name but there 289 // is a 1.0 name. 290 if (m_utilStringBuffer_.length() == 0 && choice == 291 UCharacterNameChoice.EXTENDED_CHAR_NAME) { 292 continue; 293 } 294 break; 295 } 296 // explicit letter 297 m_utilStringBuffer_.append((char)(b & 0x00ff)); 298 } 299 else { // write token word 300 UCharacterUtility.getNullTermByteSubString( 301 m_utilStringBuffer_, m_tokenstring_, token); 302 } 303 } 304 } 305 306 if (m_utilStringBuffer_.length() > 0) { 307 return m_utilStringBuffer_.toString(); 308 } 309 } 310 return null; 311 } 312 313 /** 314 * Retrieves the extended name 315 */ getExtendedName(int ch)316 public String getExtendedName(int ch) 317 { 318 String result = getName(ch, UCharacterNameChoice.UNICODE_CHAR_NAME); 319 if (result == null) { 320 // TODO: Return Name_Alias/control names for control codes 0..1F & 7F..9F. 321 result = getExtendedOr10Name(ch); 322 } 323 return result; 324 } 325 326 /** 327 * Gets the group index for the codepoint, or the group before it. 328 * @param codepoint The codepoint index. 329 * @return group index containing codepoint or the group before it. 330 */ getGroup(int codepoint)331 public int getGroup(int codepoint) 332 { 333 int endGroup = m_groupcount_; 334 int msb = getCodepointMSB(codepoint); 335 int result = 0; 336 // binary search for the group of names that contains the one for 337 // code 338 // find the group that contains codepoint, or the highest before it 339 while (result < endGroup - 1) { 340 int gindex = (result + endGroup) >> 1; 341 if (msb < getGroupMSB(gindex)) { 342 endGroup = gindex; 343 } 344 else { 345 result = gindex; 346 } 347 } 348 return result; 349 } 350 351 /** 352 * Gets the extended and 1.0 name when the most current unicode names 353 * fail 354 * @param ch codepoint 355 * @return name of codepoint extended or 1.0 356 */ getExtendedOr10Name(int ch)357 public String getExtendedOr10Name(int ch) 358 { 359 String result = null; 360 // TODO: Return Name_Alias/control names for control codes 0..1F & 7F..9F. 361 if (result == null) { 362 int type = getType(ch); 363 // Return unknown if the table of names above is not up to 364 // date. 365 if (type >= TYPE_NAMES_.length) { 366 result = UNKNOWN_TYPE_NAME_; 367 } 368 else { 369 result = TYPE_NAMES_[type]; 370 } 371 synchronized (m_utilStringBuffer_) { 372 m_utilStringBuffer_.setLength(0); 373 m_utilStringBuffer_.append('<'); 374 m_utilStringBuffer_.append(result); 375 m_utilStringBuffer_.append('-'); 376 String chStr = Integer.toHexString(ch).toUpperCase(Locale.ENGLISH); 377 int zeros = 4 - chStr.length(); 378 while (zeros > 0) { 379 m_utilStringBuffer_.append('0'); 380 zeros --; 381 } 382 m_utilStringBuffer_.append(chStr); 383 m_utilStringBuffer_.append('>'); 384 result = m_utilStringBuffer_.toString(); 385 } 386 } 387 return result; 388 } 389 390 /** 391 * Gets the MSB from the group index 392 * @param gindex group index 393 * @return the MSB of the group if gindex is valid, -1 otherwise 394 */ getGroupMSB(int gindex)395 public int getGroupMSB(int gindex) 396 { 397 if (gindex >= m_groupcount_) { 398 return -1; 399 } 400 return m_groupinfo_[gindex * m_groupsize_]; 401 } 402 403 /** 404 * Gets the MSB of the codepoint 405 * @param codepoint The codepoint value. 406 * @return the MSB of the codepoint 407 */ getCodepointMSB(int codepoint)408 public static int getCodepointMSB(int codepoint) 409 { 410 return codepoint >> GROUP_SHIFT_; 411 } 412 413 /** 414 * Gets the maximum codepoint + 1 of the group 415 * @param msb most significant byte of the group 416 * @return limit codepoint of the group 417 */ getGroupLimit(int msb)418 public static int getGroupLimit(int msb) 419 { 420 return (msb << GROUP_SHIFT_) + LINES_PER_GROUP_; 421 } 422 423 /** 424 * Gets the minimum codepoint of the group 425 * @param msb most significant byte of the group 426 * @return minimum codepoint of the group 427 */ getGroupMin(int msb)428 public static int getGroupMin(int msb) 429 { 430 return msb << GROUP_SHIFT_; 431 } 432 433 /** 434 * Gets the offset to a group 435 * @param codepoint The codepoint value. 436 * @return offset to a group 437 */ getGroupOffset(int codepoint)438 public static int getGroupOffset(int codepoint) 439 { 440 return codepoint & GROUP_MASK_; 441 } 442 443 /** 444 * Gets the minimum codepoint of a group 445 * @param codepoint The codepoint value. 446 * @return minimum codepoint in the group which codepoint belongs to 447 */ 448 ///CLOVER:OFF getGroupMinFromCodepoint(int codepoint)449 public static int getGroupMinFromCodepoint(int codepoint) 450 { 451 return codepoint & ~GROUP_MASK_; 452 } 453 ///CLOVER:ON 454 455 /** 456 * Get the Algorithm range length 457 * @return Algorithm range length 458 */ getAlgorithmLength()459 public int getAlgorithmLength() 460 { 461 return m_algorithm_.length; 462 } 463 464 /** 465 * Gets the start of the range 466 * @param index algorithm index 467 * @return algorithm range start 468 */ getAlgorithmStart(int index)469 public int getAlgorithmStart(int index) 470 { 471 return m_algorithm_[index].m_rangestart_; 472 } 473 474 /** 475 * Gets the end of the range 476 * @param index algorithm index 477 * @return algorithm range end 478 */ getAlgorithmEnd(int index)479 public int getAlgorithmEnd(int index) 480 { 481 return m_algorithm_[index].m_rangeend_; 482 } 483 484 /** 485 * Gets the Algorithmic name of the codepoint 486 * @param index algorithmic range index 487 * @param codepoint The codepoint value. 488 * @return algorithmic name of codepoint 489 */ getAlgorithmName(int index, int codepoint)490 public String getAlgorithmName(int index, int codepoint) 491 { 492 String result = null; 493 synchronized (m_utilStringBuffer_) { 494 m_utilStringBuffer_.setLength(0); 495 m_algorithm_[index].appendName(codepoint, m_utilStringBuffer_); 496 result = m_utilStringBuffer_.toString(); 497 } 498 return result; 499 } 500 501 /** 502 * Gets the group name of the character 503 * @param ch character to get the group name 504 * @param choice name choice selector to choose a unicode 1.0 or newer name 505 */ getGroupName(int ch, int choice)506 public synchronized String getGroupName(int ch, int choice) 507 { 508 // gets the msb 509 int msb = getCodepointMSB(ch); 510 int group = getGroup(ch); 511 512 // return this if it is an exact match 513 if (msb == m_groupinfo_[group * m_groupsize_]) { 514 int index = getGroupLengths(group, m_groupoffsets_, 515 m_grouplengths_); 516 int offset = ch & GROUP_MASK_; 517 return getGroupName(index + m_groupoffsets_[offset], 518 m_grouplengths_[offset], choice); 519 } 520 521 return null; 522 } 523 524 // these are transliterator use methods --------------------------------- 525 526 /** 527 * Gets the maximum length of any codepoint name. 528 * Equivalent to uprv_getMaxCharNameLength. 529 * @return the maximum length of any codepoint name 530 */ getMaxCharNameLength()531 public int getMaxCharNameLength() 532 { 533 if (initNameSetsLengths()) { 534 return m_maxNameLength_; 535 } 536 else { 537 return 0; 538 } 539 } 540 541 /** 542 * Gets the maximum length of any iso comments. 543 * Equivalent to uprv_getMaxISOCommentLength. 544 * @return the maximum length of any codepoint name 545 */ 546 ///CLOVER:OFF getMaxISOCommentLength()547 public int getMaxISOCommentLength() 548 { 549 if (initNameSetsLengths()) { 550 return m_maxISOCommentLength_; 551 } 552 else { 553 return 0; 554 } 555 } 556 ///CLOVER:ON 557 558 /** 559 * Fills set with characters that are used in Unicode character names. 560 * Equivalent to uprv_getCharNameCharacters. 561 * @param set USet to receive characters. Existing contents are deleted. 562 */ getCharNameCharacters(UnicodeSet set)563 public void getCharNameCharacters(UnicodeSet set) 564 { 565 convert(m_nameSet_, set); 566 } 567 568 /** 569 * Fills set with characters that are used in Unicode character names. 570 * Equivalent to uprv_getISOCommentCharacters. 571 * @param set USet to receive characters. Existing contents are deleted. 572 */ 573 ///CLOVER:OFF getISOCommentCharacters(UnicodeSet set)574 public void getISOCommentCharacters(UnicodeSet set) 575 { 576 convert(m_ISOCommentSet_, set); 577 } 578 ///CLOVER:ON 579 580 // package private inner class -------------------------------------- 581 582 /** 583 * Algorithmic name class 584 */ 585 static final class AlgorithmName 586 { 587 // package private data members ---------------------------------- 588 589 /** 590 * Constant type value of the different AlgorithmName 591 */ 592 static final int TYPE_0_ = 0; 593 static final int TYPE_1_ = 1; 594 595 // package private constructors ---------------------------------- 596 597 /** 598 * Constructor 599 */ AlgorithmName()600 AlgorithmName() 601 { 602 } 603 604 // package private methods --------------------------------------- 605 606 /** 607 * Sets the information for accessing the algorithmic names 608 * @param rangestart starting code point that lies within this name group 609 * @param rangeend end code point that lies within this name group 610 * @param type algorithm type. There's 2 kinds of algorithmic type. First 611 * which uses code point as part of its name and the other uses 612 * variant postfix strings 613 * @param variant algorithmic variant 614 * @return true if values are valid 615 */ setInfo(int rangestart, int rangeend, byte type, byte variant)616 boolean setInfo(int rangestart, int rangeend, byte type, byte variant) 617 { 618 if (rangestart >= UCharacter.MIN_VALUE && rangestart <= rangeend 619 && rangeend <= UCharacter.MAX_VALUE && 620 (type == TYPE_0_ || type == TYPE_1_)) { 621 m_rangestart_ = rangestart; 622 m_rangeend_ = rangeend; 623 m_type_ = type; 624 m_variant_ = variant; 625 return true; 626 } 627 return false; 628 } 629 630 /** 631 * Sets the factor data 632 * @param factor Array of factor 633 * @return true if factors are valid 634 */ setFactor(char factor[])635 boolean setFactor(char factor[]) 636 { 637 if (factor.length == m_variant_) { 638 m_factor_ = factor; 639 return true; 640 } 641 return false; 642 } 643 644 /** 645 * Sets the name prefix 646 * @param prefix 647 * @return true if prefix is set 648 */ setPrefix(String prefix)649 boolean setPrefix(String prefix) 650 { 651 if (prefix != null && prefix.length() > 0) { 652 m_prefix_ = prefix; 653 return true; 654 } 655 return false; 656 } 657 658 /** 659 * Sets the variant factorized name data 660 * @param string variant factorized name data 661 * @return true if values are set 662 */ setFactorString(byte string[])663 boolean setFactorString(byte string[]) 664 { 665 // factor and variant string can be empty for things like 666 // hanggul code points 667 m_factorstring_ = string; 668 return true; 669 } 670 671 /** 672 * Checks if code point lies in Algorithm object at index 673 * @param ch code point 674 */ contains(int ch)675 boolean contains(int ch) 676 { 677 return m_rangestart_ <= ch && ch <= m_rangeend_; 678 } 679 680 /** 681 * Appends algorithm name of code point into StringBuffer. 682 * Note this method does not check for validity of code point in Algorithm, 683 * result is undefined if code point does not belong in Algorithm. 684 * @param ch code point 685 * @param str StringBuffer to append to 686 */ appendName(int ch, StringBuffer str)687 void appendName(int ch, StringBuffer str) 688 { 689 str.append(m_prefix_); 690 switch (m_type_) 691 { 692 case TYPE_0_: 693 // prefix followed by hex digits indicating variants 694 str.append(Utility.hex(ch,m_variant_)); 695 break; 696 case TYPE_1_: 697 // prefix followed by factorized-elements 698 int offset = ch - m_rangestart_; 699 int indexes[] = m_utilIntBuffer_; 700 int factor; 701 702 // write elements according to the factors 703 // the factorized elements are determined by modulo 704 // arithmetic 705 synchronized (m_utilIntBuffer_) { 706 for (int i = m_variant_ - 1; i > 0; i --) 707 { 708 factor = m_factor_[i] & 0x00FF; 709 indexes[i] = offset % factor; 710 offset /= factor; 711 } 712 713 // we don't need to calculate the last modulus because 714 // start <= code <= end guarantees here that 715 // code <= factors[0] 716 indexes[0] = offset; 717 718 // joining up the factorized strings 719 str.append(getFactorString(indexes, m_variant_)); 720 } 721 break; 722 } 723 } 724 725 /** 726 * Gets the character for the argument algorithmic name 727 * @return the algorithmic char or -1 otherwise. 728 */ getChar(String name)729 int getChar(String name) 730 { 731 int prefixlen = m_prefix_.length(); 732 if (name.length() < prefixlen || 733 !m_prefix_.equals(name.substring(0, prefixlen))) { 734 return -1; 735 } 736 737 switch (m_type_) 738 { 739 case TYPE_0_ : 740 try 741 { 742 int result = Integer.parseInt(name.substring(prefixlen), 743 16); 744 // does it fit into the range? 745 if (m_rangestart_ <= result && result <= m_rangeend_) { 746 return result; 747 } 748 } 749 catch (NumberFormatException e) 750 { 751 return -1; 752 } 753 break; 754 case TYPE_1_ : 755 // repetitative suffix name comparison done here 756 // offset is the character code - start 757 for (int ch = m_rangestart_; ch <= m_rangeend_; ch ++) 758 { 759 int offset = ch - m_rangestart_; 760 int indexes[] = m_utilIntBuffer_; 761 int factor; 762 763 // write elements according to the factors 764 // the factorized elements are determined by modulo 765 // arithmetic 766 synchronized (m_utilIntBuffer_) { 767 for (int i = m_variant_ - 1; i > 0; i --) 768 { 769 factor = m_factor_[i] & 0x00FF; 770 indexes[i] = offset % factor; 771 offset /= factor; 772 } 773 774 // we don't need to calculate the last modulus 775 // because start <= code <= end guarantees here that 776 // code <= factors[0] 777 indexes[0] = offset; 778 779 // joining up the factorized strings 780 if (compareFactorString(indexes, m_variant_, name, 781 prefixlen)) { 782 return ch; 783 } 784 } 785 } 786 } 787 788 return -1; 789 } 790 791 /** 792 * Adds all chars in the set of algorithmic names into the set. 793 * Equivalent to part of calcAlgNameSetsLengths. 794 * @param set int set to add the chars of the algorithm names into 795 * @param maxlength maximum length to compare to 796 * @return the length that is either maxlength of the length of this 797 * algorithm name if it is longer than maxlength 798 */ add(int set[], int maxlength)799 int add(int set[], int maxlength) 800 { 801 // prefix length 802 int length = UCharacterName.add(set, m_prefix_); 803 switch (m_type_) { 804 case TYPE_0_ : { 805 // name = prefix + (range->variant times) hex-digits 806 // prefix 807 length += m_variant_; 808 /* synwee to check 809 * addString(set, (const char *)(range + 1)) 810 + range->variant;*/ 811 break; 812 } 813 case TYPE_1_ : { 814 // name = prefix factorized-elements 815 // get the set and maximum factor suffix length for each 816 // factor 817 for (int i = m_variant_ - 1; i > 0; i --) 818 { 819 int maxfactorlength = 0; 820 int count = 0; 821 for (int factor = m_factor_[i]; factor > 0; -- factor) { 822 synchronized (m_utilStringBuffer_) { 823 m_utilStringBuffer_.setLength(0); 824 count 825 = UCharacterUtility.getNullTermByteSubString( 826 m_utilStringBuffer_, 827 m_factorstring_, count); 828 UCharacterName.add(set, m_utilStringBuffer_); 829 if (m_utilStringBuffer_.length() 830 > maxfactorlength) 831 { 832 maxfactorlength 833 = m_utilStringBuffer_.length(); 834 } 835 } 836 } 837 length += maxfactorlength; 838 } 839 } 840 } 841 if (length > maxlength) { 842 return length; 843 } 844 return maxlength; 845 } 846 847 // private data members ------------------------------------------ 848 849 /** 850 * Algorithmic data information 851 */ 852 private int m_rangestart_; 853 private int m_rangeend_; 854 private byte m_type_; 855 private byte m_variant_; 856 private char m_factor_[]; 857 private String m_prefix_; 858 private byte m_factorstring_[]; 859 /** 860 * Utility StringBuffer 861 */ 862 private StringBuffer m_utilStringBuffer_ = new StringBuffer(); 863 /** 864 * Utility int buffer 865 */ 866 private int m_utilIntBuffer_[] = new int[256]; 867 868 // private methods ----------------------------------------------- 869 870 /** 871 * Gets the indexth string in each of the argument factor block 872 * @param index array with each index corresponding to each factor block 873 * @param length length of the array index 874 * @return the combined string of the array of indexth factor string in 875 * factor block 876 */ getFactorString(int index[], int length)877 private String getFactorString(int index[], int length) 878 { 879 int size = m_factor_.length; 880 if (index == null || length != size) { 881 return null; 882 } 883 884 synchronized (m_utilStringBuffer_) { 885 m_utilStringBuffer_.setLength(0); 886 int count = 0; 887 int factor; 888 size --; 889 for (int i = 0; i <= size; i ++) { 890 factor = m_factor_[i]; 891 count = UCharacterUtility.skipNullTermByteSubString( 892 m_factorstring_, count, index[i]); 893 count = UCharacterUtility.getNullTermByteSubString( 894 m_utilStringBuffer_, m_factorstring_, 895 count); 896 if (i != size) { 897 count = UCharacterUtility.skipNullTermByteSubString( 898 m_factorstring_, count, 899 factor - index[i] - 1); 900 } 901 } 902 return m_utilStringBuffer_.toString(); 903 } 904 } 905 906 /** 907 * Compares the indexth string in each of the argument factor block with 908 * the argument string 909 * @param index array with each index corresponding to each factor block 910 * @param length index array length 911 * @param str string to compare with 912 * @param offset of str to start comparison 913 * @return true if string matches 914 */ compareFactorString(int index[], int length, String str, int offset)915 private boolean compareFactorString(int index[], int length, String str, 916 int offset) 917 { 918 int size = m_factor_.length; 919 if (index == null || length != size) 920 return false; 921 922 int count = 0; 923 int strcount = offset; 924 int factor; 925 size --; 926 for (int i = 0; i <= size; i ++) 927 { 928 factor = m_factor_[i]; 929 count = UCharacterUtility.skipNullTermByteSubString( 930 m_factorstring_, count, index[i]); 931 strcount = UCharacterUtility.compareNullTermByteSubString(str, 932 m_factorstring_, strcount, count); 933 if (strcount < 0) { 934 return false; 935 } 936 937 if (i != size) { 938 count = UCharacterUtility.skipNullTermByteSubString( 939 m_factorstring_, count, factor - index[i]); 940 } 941 } 942 if (strcount != str.length()) { 943 return false; 944 } 945 return true; 946 } 947 } 948 949 // package private data members -------------------------------------- 950 951 /** 952 * Size of each groups 953 */ 954 int m_groupsize_ = 0; 955 956 // package private methods -------------------------------------------- 957 958 /** 959 * Sets the token data 960 * @param token array of tokens 961 * @param tokenstring array of string values of the tokens 962 * @return false if there is a data error 963 */ setToken(char token[], byte tokenstring[])964 boolean setToken(char token[], byte tokenstring[]) 965 { 966 if (token != null && tokenstring != null && token.length > 0 && 967 tokenstring.length > 0) { 968 m_tokentable_ = token; 969 m_tokenstring_ = tokenstring; 970 return true; 971 } 972 return false; 973 } 974 975 /** 976 * Set the algorithm name information array 977 * @param alg Algorithm information array 978 * @return true if the group string offset has been set correctly 979 */ setAlgorithm(AlgorithmName alg[])980 boolean setAlgorithm(AlgorithmName alg[]) 981 { 982 if (alg != null && alg.length != 0) { 983 m_algorithm_ = alg; 984 return true; 985 } 986 return false; 987 } 988 989 /** 990 * Sets the number of group and size of each group in number of char 991 * @param count number of groups 992 * @param size size of group in char 993 * @return true if group size is set correctly 994 */ setGroupCountSize(int count, int size)995 boolean setGroupCountSize(int count, int size) 996 { 997 if (count <= 0 || size <= 0) { 998 return false; 999 } 1000 m_groupcount_ = count; 1001 m_groupsize_ = size; 1002 return true; 1003 } 1004 1005 /** 1006 * Sets the group name data 1007 * @param group index information array 1008 * @param groupstring name information array 1009 * @return false if there is a data error 1010 */ setGroup(char group[], byte groupstring[])1011 boolean setGroup(char group[], byte groupstring[]) 1012 { 1013 if (group != null && groupstring != null && group.length > 0 && 1014 groupstring.length > 0) { 1015 m_groupinfo_ = group; 1016 m_groupstring_ = groupstring; 1017 return true; 1018 } 1019 return false; 1020 } 1021 1022 // private data members ---------------------------------------------- 1023 1024 /** 1025 * Data used in unames.icu 1026 */ 1027 private char m_tokentable_[]; 1028 private byte m_tokenstring_[]; 1029 private char m_groupinfo_[]; 1030 private byte m_groupstring_[]; 1031 private AlgorithmName m_algorithm_[]; 1032 1033 /** 1034 * Group use. Note - access must be synchronized. 1035 */ 1036 private char m_groupoffsets_[] = new char[LINES_PER_GROUP_ + 1]; 1037 private char m_grouplengths_[] = new char[LINES_PER_GROUP_ + 1]; 1038 1039 /** 1040 * Default name of the name datafile 1041 */ 1042 private static final String FILE_NAME_ = "unames.icu"; 1043 /** 1044 * Shift count to retrieve group information 1045 */ 1046 private static final int GROUP_SHIFT_ = 5; 1047 /** 1048 * Mask to retrieve the offset for a particular character within a group 1049 */ 1050 private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1; 1051 1052 /** 1053 * Position of offsethigh in group information array 1054 */ 1055 private static final int OFFSET_HIGH_OFFSET_ = 1; 1056 1057 /** 1058 * Position of offsetlow in group information array 1059 */ 1060 private static final int OFFSET_LOW_OFFSET_ = 2; 1061 /** 1062 * Double nibble indicator, any nibble > this number has to be combined 1063 * with its following nibble 1064 */ 1065 private static final int SINGLE_NIBBLE_MAX_ = 11; 1066 1067 /* 1068 * Maximum length of character names (regular & 1.0). 1069 */ 1070 //private static int MAX_NAME_LENGTH_ = 0; 1071 /* 1072 * Maximum length of ISO comments. 1073 */ 1074 //private static int MAX_ISO_COMMENT_LENGTH_ = 0; 1075 1076 /** 1077 * Set of chars used in character names (regular & 1.0). 1078 * Chars are platform-dependent (can be EBCDIC). 1079 */ 1080 private int m_nameSet_[] = new int[8]; 1081 /** 1082 * Set of chars used in ISO comments. (regular & 1.0). 1083 * Chars are platform-dependent (can be EBCDIC). 1084 */ 1085 private int m_ISOCommentSet_[] = new int[8]; 1086 /** 1087 * Utility StringBuffer 1088 */ 1089 private StringBuffer m_utilStringBuffer_ = new StringBuffer(); 1090 /** 1091 * Utility int buffer 1092 */ 1093 private int m_utilIntBuffer_[] = new int[2]; 1094 /** 1095 * Maximum ISO comment length 1096 */ 1097 private int m_maxISOCommentLength_; 1098 /** 1099 * Maximum name length 1100 */ 1101 private int m_maxNameLength_; 1102 /** 1103 * Type names used for extended names 1104 */ 1105 private static final String TYPE_NAMES_[] = {"unassigned", 1106 "uppercase letter", 1107 "lowercase letter", 1108 "titlecase letter", 1109 "modifier letter", 1110 "other letter", 1111 "non spacing mark", 1112 "enclosing mark", 1113 "combining spacing mark", 1114 "decimal digit number", 1115 "letter number", 1116 "other number", 1117 "space separator", 1118 "line separator", 1119 "paragraph separator", 1120 "control", 1121 "format", 1122 "private use area", 1123 "surrogate", 1124 "dash punctuation", 1125 "start punctuation", 1126 "end punctuation", 1127 "connector punctuation", 1128 "other punctuation", 1129 "math symbol", 1130 "currency symbol", 1131 "modifier symbol", 1132 "other symbol", 1133 "initial punctuation", 1134 "final punctuation", 1135 "noncharacter", 1136 "lead surrogate", 1137 "trail surrogate"}; 1138 /** 1139 * Unknown type name 1140 */ 1141 private static final String UNKNOWN_TYPE_NAME_ = "unknown"; 1142 /** 1143 * Not a character type 1144 */ 1145 private static final int NON_CHARACTER_ 1146 = UCharacterCategory.CHAR_CATEGORY_COUNT; 1147 /** 1148 * Lead surrogate type 1149 */ 1150 private static final int LEAD_SURROGATE_ 1151 = UCharacterCategory.CHAR_CATEGORY_COUNT + 1; 1152 /** 1153 * Trail surrogate type 1154 */ 1155 private static final int TRAIL_SURROGATE_ 1156 = UCharacterCategory.CHAR_CATEGORY_COUNT + 2; 1157 /** 1158 * Extended category count 1159 */ 1160 static final int EXTENDED_CATEGORY_ 1161 = UCharacterCategory.CHAR_CATEGORY_COUNT + 3; 1162 1163 // private constructor ------------------------------------------------ 1164 1165 /** 1166 * <p>Protected constructor for use in UCharacter.</p> 1167 * @exception IOException thrown when data reading fails 1168 */ UCharacterName()1169 private UCharacterName() throws IOException 1170 { 1171 ByteBuffer b = ICUBinary.getRequiredData(FILE_NAME_); 1172 UCharacterNameReader reader = new UCharacterNameReader(b); 1173 reader.read(this); 1174 } 1175 1176 // private methods --------------------------------------------------- 1177 1178 /** 1179 * Gets the algorithmic name for the argument character 1180 * @param ch character to determine name for 1181 * @param choice name choice 1182 * @return the algorithmic name or null if not found 1183 */ getAlgName(int ch, int choice)1184 private String getAlgName(int ch, int choice) 1185 { 1186 /* Only the normative character name can be algorithmic. */ 1187 if (choice == UCharacterNameChoice.UNICODE_CHAR_NAME || 1188 choice == UCharacterNameChoice.EXTENDED_CHAR_NAME 1189 ) { 1190 // index in terms integer index 1191 synchronized (m_utilStringBuffer_) { 1192 m_utilStringBuffer_.setLength(0); 1193 1194 for (int index = m_algorithm_.length - 1; index >= 0; index --) 1195 { 1196 if (m_algorithm_[index].contains(ch)) { 1197 m_algorithm_[index].appendName(ch, m_utilStringBuffer_); 1198 return m_utilStringBuffer_.toString(); 1199 } 1200 } 1201 } 1202 } 1203 return null; 1204 } 1205 1206 /** 1207 * Getting the character with the tokenized argument name 1208 * @param name of the character 1209 * @return character with the tokenized argument name or -1 if character 1210 * is not found 1211 */ getGroupChar(String name, int choice)1212 private synchronized int getGroupChar(String name, int choice) 1213 { 1214 for (int i = 0; i < m_groupcount_; i ++) { 1215 // populating the data set of grouptable 1216 1217 int startgpstrindex = getGroupLengths(i, m_groupoffsets_, 1218 m_grouplengths_); 1219 1220 // shift out to function 1221 int result = getGroupChar(startgpstrindex, m_grouplengths_, name, 1222 choice); 1223 if (result != -1) { 1224 return (m_groupinfo_[i * m_groupsize_] << GROUP_SHIFT_) 1225 | result; 1226 } 1227 } 1228 return -1; 1229 } 1230 1231 /** 1232 * Compares and retrieve character if name is found within the argument 1233 * group 1234 * @param index index where the set of names reside in the group block 1235 * @param length list of lengths of the strings 1236 * @param name character name to search for 1237 * @param choice of either 1.0 or the most current unicode name 1238 * @return relative character in the group which matches name, otherwise if 1239 * not found, -1 will be returned 1240 */ getGroupChar(int index, char length[], String name, int choice)1241 private int getGroupChar(int index, char length[], String name, 1242 int choice) 1243 { 1244 byte b = 0; 1245 char token; 1246 int len; 1247 int namelen = name.length(); 1248 int nindex; 1249 int count; 1250 1251 for (int result = 0; result <= LINES_PER_GROUP_; result ++) { 1252 nindex = 0; 1253 len = length[result]; 1254 1255 if (choice != UCharacterNameChoice.UNICODE_CHAR_NAME && 1256 choice != UCharacterNameChoice.EXTENDED_CHAR_NAME 1257 ) { 1258 /* 1259 * skip the modern name if it is not requested _and_ 1260 * if the semicolon byte value is a character, not a token number 1261 */ 1262 int fieldIndex= choice==UCharacterNameChoice.ISO_COMMENT_ ? 2 : choice; 1263 do { 1264 int oldindex = index; 1265 index += UCharacterUtility.skipByteSubString(m_groupstring_, 1266 index, len, (byte)';'); 1267 len -= (index - oldindex); 1268 } while(--fieldIndex>0); 1269 } 1270 1271 // number of tokens is > the length of the name 1272 // write each letter directly, and write a token word per token 1273 for (count = 0; count < len && nindex != -1 && nindex < namelen; 1274 ) { 1275 b = m_groupstring_[index + count]; 1276 count ++; 1277 1278 if (b >= m_tokentable_.length) { 1279 if (name.charAt(nindex ++) != (b & 0xFF)) { 1280 nindex = -1; 1281 } 1282 } 1283 else { 1284 token = m_tokentable_[b & 0xFF]; 1285 if (token == 0xFFFE) { 1286 // this is a lead byte for a double-byte token 1287 token = m_tokentable_[b << 8 | 1288 (m_groupstring_[index + count] & 0x00ff)]; 1289 count ++; 1290 } 1291 if (token == 0xFFFF) { 1292 if (name.charAt(nindex ++) != (b & 0xFF)) { 1293 nindex = -1; 1294 } 1295 } 1296 else { 1297 // compare token with name 1298 nindex = UCharacterUtility.compareNullTermByteSubString( 1299 name, m_tokenstring_, nindex, token); 1300 } 1301 } 1302 } 1303 1304 if (namelen == nindex && 1305 (count == len || m_groupstring_[index + count] == ';')) { 1306 return result; 1307 } 1308 1309 index += len; 1310 } 1311 return -1; 1312 } 1313 1314 /** 1315 * Gets the character extended type 1316 * @param ch character to be tested 1317 * @return extended type it is associated with 1318 */ getType(int ch)1319 private static int getType(int ch) 1320 { 1321 if (UCharacterUtility.isNonCharacter(ch)) { 1322 // not a character we return a invalid category count 1323 return NON_CHARACTER_; 1324 } 1325 int result = UCharacter.getType(ch); 1326 if (result == UCharacterCategory.SURROGATE) { 1327 if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 1328 result = LEAD_SURROGATE_; 1329 } 1330 else { 1331 result = TRAIL_SURROGATE_; 1332 } 1333 } 1334 return result; 1335 } 1336 1337 /** 1338 * Getting the character with extended name of the form <....>. 1339 * @param name of the character to be found 1340 * @param choice name choice 1341 * @return character associated with the name, -1 if such character is not 1342 * found and -2 if we should continue with the search. 1343 */ getExtendedChar(String name, int choice)1344 private static int getExtendedChar(String name, int choice) 1345 { 1346 if (name.charAt(0) == '<') { 1347 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) { 1348 int endIndex = name.length() - 1; 1349 if (name.charAt(endIndex) == '>') { 1350 int startIndex = name.lastIndexOf('-'); 1351 if (startIndex >= 0) { // We've got a category. 1352 startIndex ++; 1353 int result = -1; 1354 try { 1355 result = Integer.parseInt( 1356 name.substring(startIndex, endIndex), 1357 16); 1358 } 1359 catch (NumberFormatException e) { 1360 return -1; 1361 } 1362 // Now validate the category name. We could use a 1363 // binary search, or a trie, if we really wanted to. 1364 String type = name.substring(1, startIndex - 1); 1365 int length = TYPE_NAMES_.length; 1366 for (int i = 0; i < length; ++ i) { 1367 if (type.compareTo(TYPE_NAMES_[i]) == 0) { 1368 if (getType(result) == i) { 1369 return result; 1370 } 1371 break; 1372 } 1373 } 1374 } 1375 } 1376 } 1377 return -1; 1378 } 1379 return -2; 1380 } 1381 1382 // sets of name characters, maximum name lengths ----------------------- 1383 1384 /** 1385 * Adds a codepoint into a set of ints. 1386 * Equivalent to SET_ADD. 1387 * @param set set to add to 1388 * @param ch 16 bit char to add 1389 */ add(int set[], char ch)1390 private static void add(int set[], char ch) 1391 { 1392 set[ch >>> 5] |= 1 << (ch & 0x1f); 1393 } 1394 1395 /** 1396 * Checks if a codepoint is a part of a set of ints. 1397 * Equivalent to SET_CONTAINS. 1398 * @param set set to check in 1399 * @param ch 16 bit char to check 1400 * @return true if codepoint is part of the set, false otherwise 1401 */ contains(int set[], char ch)1402 private static boolean contains(int set[], char ch) 1403 { 1404 return (set[ch >>> 5] & (1 << (ch & 0x1f))) != 0; 1405 } 1406 1407 /** 1408 * Adds all characters of the argument str and gets the length 1409 * Equivalent to calcStringSetLength. 1410 * @param set set to add all chars of str to 1411 * @param str string to add 1412 */ add(int set[], String str)1413 private static int add(int set[], String str) 1414 { 1415 int result = str.length(); 1416 1417 for (int i = result - 1; i >= 0; i --) { 1418 add(set, str.charAt(i)); 1419 } 1420 return result; 1421 } 1422 1423 /** 1424 * Adds all characters of the argument str and gets the length 1425 * Equivalent to calcStringSetLength. 1426 * @param set set to add all chars of str to 1427 * @param str string to add 1428 */ add(int set[], StringBuffer str)1429 private static int add(int set[], StringBuffer str) 1430 { 1431 int result = str.length(); 1432 1433 for (int i = result - 1; i >= 0; i --) { 1434 add(set, str.charAt(i)); 1435 } 1436 return result; 1437 } 1438 1439 /** 1440 * Adds all algorithmic names into the name set. 1441 * Equivalent to part of calcAlgNameSetsLengths. 1442 * @param maxlength length to compare to 1443 * @return the maximum length of any possible algorithmic name if it is > 1444 * maxlength, otherwise maxlength is returned. 1445 */ addAlgorithmName(int maxlength)1446 private int addAlgorithmName(int maxlength) 1447 { 1448 int result = 0; 1449 for (int i = m_algorithm_.length - 1; i >= 0; i --) { 1450 result = m_algorithm_[i].add(m_nameSet_, maxlength); 1451 if (result > maxlength) { 1452 maxlength = result; 1453 } 1454 } 1455 return maxlength; 1456 } 1457 1458 /** 1459 * Adds all extended names into the name set. 1460 * Equivalent to part of calcExtNameSetsLengths. 1461 * @param maxlength length to compare to 1462 * @return the maxlength of any possible extended name. 1463 */ addExtendedName(int maxlength)1464 private int addExtendedName(int maxlength) 1465 { 1466 for (int i = TYPE_NAMES_.length - 1; i >= 0; i --) { 1467 // for each category, count the length of the category name 1468 // plus 9 = 1469 // 2 for <> 1470 // 1 for - 1471 // 6 for most hex digits per code point 1472 int length = 9 + add(m_nameSet_, TYPE_NAMES_[i]); 1473 if (length > maxlength) { 1474 maxlength = length; 1475 } 1476 } 1477 return maxlength; 1478 } 1479 1480 /** 1481 * Adds names of a group to the argument set. 1482 * Equivalent to calcNameSetLength. 1483 * @param offset of the group name string in byte count 1484 * @param length of the group name string 1485 * @param tokenlength array to store the length of each token 1486 * @param set to add to 1487 * @return the length of the name string and the length of the group 1488 * string parsed 1489 */ addGroupName(int offset, int length, byte tokenlength[], int set[])1490 private int[] addGroupName(int offset, int length, byte tokenlength[], 1491 int set[]) 1492 { 1493 int resultnlength = 0; 1494 int resultplength = 0; 1495 while (resultplength < length) { 1496 char b = (char)(m_groupstring_[offset + resultplength] & 0xff); 1497 resultplength ++; 1498 if (b == ';') { 1499 break; 1500 } 1501 1502 if (b >= m_tokentable_.length) { 1503 add(set, b); // implicit letter 1504 resultnlength ++; 1505 } 1506 else { 1507 char token = m_tokentable_[b & 0x00ff]; 1508 if (token == 0xFFFE) { 1509 // this is a lead byte for a double-byte token 1510 b = (char)(b << 8 | (m_groupstring_[offset + resultplength] 1511 & 0x00ff)); 1512 token = m_tokentable_[b]; 1513 resultplength ++; 1514 } 1515 if (token == 0xFFFF) { 1516 add(set, b); 1517 resultnlength ++; 1518 } 1519 else { 1520 // count token word 1521 // use cached token length 1522 byte tlength = tokenlength[b]; 1523 if (tlength == 0) { 1524 synchronized (m_utilStringBuffer_) { 1525 m_utilStringBuffer_.setLength(0); 1526 UCharacterUtility.getNullTermByteSubString( 1527 m_utilStringBuffer_, m_tokenstring_, 1528 token); 1529 tlength = (byte)add(set, m_utilStringBuffer_); 1530 } 1531 tokenlength[b] = tlength; 1532 } 1533 resultnlength += tlength; 1534 } 1535 } 1536 } 1537 m_utilIntBuffer_[0] = resultnlength; 1538 m_utilIntBuffer_[1] = resultplength; 1539 return m_utilIntBuffer_; 1540 } 1541 1542 /** 1543 * Adds names of all group to the argument set. 1544 * Sets the data member m_max*Length_. 1545 * Method called only once. 1546 * Equivalent to calcGroupNameSetsLength. 1547 * @param maxlength length to compare to 1548 */ addGroupName(int maxlength)1549 private void addGroupName(int maxlength) 1550 { 1551 int maxisolength = 0; 1552 char offsets[] = new char[LINES_PER_GROUP_ + 2]; 1553 char lengths[] = new char[LINES_PER_GROUP_ + 2]; 1554 byte tokenlengths[] = new byte[m_tokentable_.length]; 1555 1556 // enumerate all groups 1557 // for (int i = m_groupcount_ - 1; i >= 0; i --) { 1558 for (int i = 0; i < m_groupcount_ ; i ++) { 1559 int offset = getGroupLengths(i, offsets, lengths); 1560 // enumerate all lines in each group 1561 // for (int linenumber = LINES_PER_GROUP_ - 1; linenumber >= 0; 1562 // linenumber --) { 1563 for (int linenumber = 0; linenumber < LINES_PER_GROUP_; 1564 linenumber ++) { 1565 int lineoffset = offset + offsets[linenumber]; 1566 int length = lengths[linenumber]; 1567 if (length == 0) { 1568 continue; 1569 } 1570 1571 // read regular name 1572 int parsed[] = addGroupName(lineoffset, length, tokenlengths, 1573 m_nameSet_); 1574 if (parsed[0] > maxlength) { 1575 // 0 for name length 1576 maxlength = parsed[0]; 1577 } 1578 lineoffset += parsed[1]; 1579 if (parsed[1] >= length) { 1580 // 1 for parsed group string length 1581 continue; 1582 } 1583 length -= parsed[1]; 1584 // read Unicode 1.0 name 1585 parsed = addGroupName(lineoffset, length, tokenlengths, 1586 m_nameSet_); 1587 if (parsed[0] > maxlength) { 1588 // 0 for name length 1589 maxlength = parsed[0]; 1590 } 1591 lineoffset += parsed[1]; 1592 if (parsed[1] >= length) { 1593 // 1 for parsed group string length 1594 continue; 1595 } 1596 length -= parsed[1]; 1597 // read ISO comment 1598 parsed = addGroupName(lineoffset, length, tokenlengths, 1599 m_ISOCommentSet_); 1600 if (parsed[1] > maxisolength) { 1601 maxisolength = length; 1602 } 1603 } 1604 } 1605 1606 // set gMax... - name length last for threading 1607 m_maxISOCommentLength_ = maxisolength; 1608 m_maxNameLength_ = maxlength; 1609 } 1610 1611 /** 1612 * Sets up the name sets and the calculation of the maximum lengths. 1613 * Equivalent to calcNameSetsLengths. 1614 */ initNameSetsLengths()1615 private boolean initNameSetsLengths() 1616 { 1617 if (m_maxNameLength_ > 0) { 1618 return true; 1619 } 1620 1621 String extra = "0123456789ABCDEF<>-"; 1622 // set hex digits, used in various names, and <>-, used in extended 1623 // names 1624 for (int i = extra.length() - 1; i >= 0; i --) { 1625 add(m_nameSet_, extra.charAt(i)); 1626 } 1627 1628 // set sets and lengths from algorithmic names 1629 m_maxNameLength_ = addAlgorithmName(0); 1630 // set sets and lengths from extended names 1631 m_maxNameLength_ = addExtendedName(m_maxNameLength_); 1632 // set sets and lengths from group names, set global maximum values 1633 addGroupName(m_maxNameLength_); 1634 return true; 1635 } 1636 1637 /** 1638 * Converts the char set cset into a Unicode set uset. 1639 * Equivalent to charSetToUSet. 1640 * @param set Set of 256 bit flags corresponding to a set of chars. 1641 * @param uset USet to receive characters. Existing contents are deleted. 1642 */ convert(int set[], UnicodeSet uset)1643 private void convert(int set[], UnicodeSet uset) 1644 { 1645 uset.clear(); 1646 if (!initNameSetsLengths()) { 1647 return; 1648 } 1649 1650 // build a char string with all chars that are used in character names 1651 for (char c = 255; c > 0; c --) { 1652 if (contains(set, c)) { 1653 uset.add(c); 1654 } 1655 } 1656 } 1657 } 1658