1 /* 2 ******************************************************************************* 3 * Copyright (C) 1996-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 package com.ibm.icu.impl; 9 10 import java.io.IOException; 11 import java.nio.ByteBuffer; 12 import java.util.Locale; 13 import java.util.MissingResourceException; 14 15 import com.ibm.icu.lang.UCharacter; 16 import com.ibm.icu.lang.UCharacterCategory; 17 import com.ibm.icu.text.UTF16; 18 import com.ibm.icu.text.UnicodeSet; 19 20 /** 21 * Internal class to manage character names. 22 * Since data for names are stored 23 * in an array of char, by default indexes used in this class is refering to 24 * a 2 byte count, unless otherwise stated. Cases where the index is refering 25 * to a byte count, the index is halved and depending on whether the index is 26 * even or odd, the MSB or LSB of the result char at the halved index is 27 * returned. For indexes to an array of int, the index is multiplied by 2, 28 * result char at the multiplied index and its following char is returned as an 29 * int. 30 * <a href=../lang/UCharacter.html>UCharacter</a> acts as a public facade for this class 31 * Note : 0 - 0x1F are control characters without names in Unicode 3.0 32 * @author Syn Wee Quek 33 * @since nov0700 34 */ 35 36 public final class UCharacterName 37 { 38 // public data members ---------------------------------------------- 39 40 /* 41 * public singleton instance 42 */ 43 public static final UCharacterName INSTANCE; 44 45 static { 46 try { 47 INSTANCE = new UCharacterName(); 48 } catch (IOException e) { 49 ///CLOVER:OFF 50 throw new MissingResourceException("Could not construct UCharacterName. Missing unames.icu","",""); 51 ///CLOVER:ON 52 } 53 } 54 55 /** 56 * Number of lines per group 57 * 1 << GROUP_SHIFT_ 58 */ 59 public static final int LINES_PER_GROUP_ = 1 << 5; 60 /** 61 * Maximum number of groups 62 */ 63 public int m_groupcount_ = 0; 64 65 // public methods --------------------------------------------------- 66 67 /** 68 * Retrieve the name of a Unicode code point. 69 * Depending on <code>choice</code>, the character name written into the 70 * buffer is the "modern" name or the name that was defined in Unicode 71 * version 1.0. 72 * The name contains only "invariant" characters 73 * like A-Z, 0-9, space, and '-'. 74 * 75 * @param ch the code point for which to get the name. 76 * @param choice Selector for which name to get. 77 * @return if code point is above 0x1fff, null is returned 78 */ getName(int ch, int choice)79 public String getName(int ch, int choice) 80 { 81 if (ch < UCharacter.MIN_VALUE || ch > UCharacter.MAX_VALUE || 82 choice > UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT) { 83 return null; 84 } 85 86 String result = null; 87 88 result = getAlgName(ch, choice); 89 90 // getting normal character name 91 if (result == null || result.length() == 0) { 92 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) { 93 result = getExtendedName(ch); 94 } else { 95 result = getGroupName(ch, choice); 96 } 97 } 98 99 return result; 100 } 101 102 /** 103 * Find a character by its name and return its code point value 104 * @param choice selector to indicate if argument name is a Unicode 1.0 105 * or the most current version 106 * @param name the name to search for 107 * @return code point 108 */ getCharFromName(int choice, String name)109 public int getCharFromName(int choice, String name) 110 { 111 // checks for illegal arguments 112 if (choice >= UCharacterNameChoice.CHAR_NAME_CHOICE_COUNT || 113 name == null || name.length() == 0) { 114 return -1; 115 } 116 117 // try extended names first 118 int result = getExtendedChar(name.toLowerCase(Locale.ENGLISH), choice); 119 if (result >= -1) { 120 return result; 121 } 122 123 String upperCaseName = name.toUpperCase(Locale.ENGLISH); 124 // try algorithmic names first, if fails then try group names 125 // int result = getAlgorithmChar(choice, uppercasename); 126 127 if (choice == UCharacterNameChoice.UNICODE_CHAR_NAME || 128 choice == UCharacterNameChoice.EXTENDED_CHAR_NAME 129 ) { 130 int count = 0; 131 if (m_algorithm_ != null) { 132 count = m_algorithm_.length; 133 } 134 for (count --; count >= 0; count --) { 135 result = m_algorithm_[count].getChar(upperCaseName); 136 if (result >= 0) { 137 return result; 138 } 139 } 140 } 141 142 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) { 143 result = getGroupChar(upperCaseName, 144 UCharacterNameChoice.UNICODE_CHAR_NAME); 145 if (result == -1) { 146 result = getGroupChar(upperCaseName, 147 UCharacterNameChoice.CHAR_NAME_ALIAS); 148 } 149 } 150 else { 151 result = getGroupChar(upperCaseName, choice); 152 } 153 return result; 154 } 155 156 // these are all UCharacterNameIterator use methods ------------------- 157 158 /** 159 * Reads a block of compressed lengths of 32 strings and expands them into 160 * offsets and lengths for each string. Lengths are stored with a 161 * variable-width encoding in consecutive nibbles: 162 * If a nibble<0xc, then it is the length itself (0 = empty string). 163 * If a nibble>=0xc, then it forms a length value with the following 164 * nibble. 165 * The offsets and lengths arrays must be at least 33 (one more) long 166 * because there is no check here at the end if the last nibble is still 167 * used. 168 * @param index of group string object in array 169 * @param offsets array to store the value of the string offsets 170 * @param lengths array to store the value of the string length 171 * @return next index of the data string immediately after the lengths 172 * in terms of byte address 173 */ getGroupLengths(int index, char offsets[], char lengths[])174 public int getGroupLengths(int index, char offsets[], char lengths[]) 175 { 176 char length = 0xffff; 177 byte b = 0, 178 n = 0; 179 int shift; 180 index = index * m_groupsize_; // byte count offsets of group strings 181 int stringoffset = UCharacterUtility.toInt( 182 m_groupinfo_[index + OFFSET_HIGH_OFFSET_], 183 m_groupinfo_[index + OFFSET_LOW_OFFSET_]); 184 185 offsets[0] = 0; 186 187 // all 32 lengths must be read to get the offset of the first group 188 // string 189 for (int i = 0; i < LINES_PER_GROUP_; stringoffset ++) { 190 b = m_groupstring_[stringoffset]; 191 shift = 4; 192 193 while (shift >= 0) { 194 // getting nibble 195 n = (byte)((b >> shift) & 0x0F); 196 if (length == 0xffff && n > SINGLE_NIBBLE_MAX_) { 197 length = (char)((n - 12) << 4); 198 } 199 else { 200 if (length != 0xffff) { 201 lengths[i] = (char)((length | n) + 12); 202 } 203 else { 204 lengths[i] = (char)n; 205 } 206 207 if (i < LINES_PER_GROUP_) { 208 offsets[i + 1] = (char)(offsets[i] + lengths[i]); 209 } 210 211 length = 0xffff; 212 i ++; 213 } 214 215 shift -= 4; 216 } 217 } 218 return stringoffset; 219 } 220 221 /** 222 * Gets the name of the argument group index. 223 * UnicodeData.txt uses ';' as a field separator, so no field can contain 224 * ';' as part of its contents. In unames.icu, it is marked as 225 * token[';'] == -1 only if the semicolon is used in the data file - which 226 * is iff we have Unicode 1.0 names or ISO comments or aliases. 227 * So, it will be token[';'] == -1 if we store U1.0 names/ISO comments/aliases 228 * although we know that it will never be part of a name. 229 * Equivalent to ICU4C's expandName. 230 * @param index of the group name string in byte count 231 * @param length of the group name string 232 * @param choice of Unicode 1.0 name or the most current name 233 * @return name of the group 234 */ getGroupName(int index, int length, int choice)235 public String getGroupName(int index, int length, int choice) 236 { 237 if (choice != UCharacterNameChoice.UNICODE_CHAR_NAME && 238 choice != UCharacterNameChoice.EXTENDED_CHAR_NAME 239 ) { 240 if (';' >= m_tokentable_.length || m_tokentable_[';'] == 0xFFFF) { 241 /* 242 * skip the modern name if it is not requested _and_ 243 * if the semicolon byte value is a character, not a token number 244 */ 245 int fieldIndex= choice==UCharacterNameChoice.ISO_COMMENT_ ? 2 : choice; 246 do { 247 int oldindex = index; 248 index += UCharacterUtility.skipByteSubString(m_groupstring_, 249 index, length, (byte)';'); 250 length -= (index - oldindex); 251 } while(--fieldIndex>0); 252 } 253 else { 254 // the semicolon byte is a token number, therefore only modern 255 // names are stored in unames.dat and there is no such 256 // requested alternate name here 257 length = 0; 258 } 259 } 260 261 synchronized (m_utilStringBuffer_) { 262 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length()); 263 byte b; 264 char token; 265 for (int i = 0; i < length;) { 266 b = m_groupstring_[index + i]; 267 i ++; 268 269 if (b >= m_tokentable_.length) { 270 if (b == ';') { 271 break; 272 } 273 m_utilStringBuffer_.append(b); // implicit letter 274 } 275 else { 276 token = m_tokentable_[b & 0x00ff]; 277 if (token == 0xFFFE) { 278 // this is a lead byte for a double-byte token 279 token = m_tokentable_[b << 8 | 280 (m_groupstring_[index + i] & 0x00ff)]; 281 i ++; 282 } 283 if (token == 0xFFFF) { 284 if (b == ';') { 285 // skip the semicolon if we are seeking extended 286 // names and there was no 2.0 name but there 287 // is a 1.0 name. 288 if (m_utilStringBuffer_.length() == 0 && choice == 289 UCharacterNameChoice.EXTENDED_CHAR_NAME) { 290 continue; 291 } 292 break; 293 } 294 // explicit letter 295 m_utilStringBuffer_.append((char)(b & 0x00ff)); 296 } 297 else { // write token word 298 UCharacterUtility.getNullTermByteSubString( 299 m_utilStringBuffer_, m_tokenstring_, token); 300 } 301 } 302 } 303 304 if (m_utilStringBuffer_.length() > 0) { 305 return m_utilStringBuffer_.toString(); 306 } 307 } 308 return null; 309 } 310 311 /** 312 * Retrieves the extended name 313 */ getExtendedName(int ch)314 public String getExtendedName(int ch) 315 { 316 String result = getName(ch, UCharacterNameChoice.UNICODE_CHAR_NAME); 317 if (result == null) { 318 // TODO: Return Name_Alias/control names for control codes 0..1F & 7F..9F. 319 result = getExtendedOr10Name(ch); 320 } 321 return result; 322 } 323 324 /** 325 * Gets the group index for the codepoint, or the group before it. 326 * @param codepoint The codepoint index. 327 * @return group index containing codepoint or the group before it. 328 */ getGroup(int codepoint)329 public int getGroup(int codepoint) 330 { 331 int endGroup = m_groupcount_; 332 int msb = getCodepointMSB(codepoint); 333 int result = 0; 334 // binary search for the group of names that contains the one for 335 // code 336 // find the group that contains codepoint, or the highest before it 337 while (result < endGroup - 1) { 338 int gindex = (result + endGroup) >> 1; 339 if (msb < getGroupMSB(gindex)) { 340 endGroup = gindex; 341 } 342 else { 343 result = gindex; 344 } 345 } 346 return result; 347 } 348 349 /** 350 * Gets the extended and 1.0 name when the most current unicode names 351 * fail 352 * @param ch codepoint 353 * @return name of codepoint extended or 1.0 354 */ getExtendedOr10Name(int ch)355 public String getExtendedOr10Name(int ch) 356 { 357 String result = null; 358 // TODO: Return Name_Alias/control names for control codes 0..1F & 7F..9F. 359 if (result == null) { 360 int type = getType(ch); 361 // Return unknown if the table of names above is not up to 362 // date. 363 if (type >= TYPE_NAMES_.length) { 364 result = UNKNOWN_TYPE_NAME_; 365 } 366 else { 367 result = TYPE_NAMES_[type]; 368 } 369 synchronized (m_utilStringBuffer_) { 370 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length()); 371 m_utilStringBuffer_.append('<'); 372 m_utilStringBuffer_.append(result); 373 m_utilStringBuffer_.append('-'); 374 String chStr = Integer.toHexString(ch).toUpperCase(Locale.ENGLISH); 375 int zeros = 4 - chStr.length(); 376 while (zeros > 0) { 377 m_utilStringBuffer_.append('0'); 378 zeros --; 379 } 380 m_utilStringBuffer_.append(chStr); 381 m_utilStringBuffer_.append('>'); 382 result = m_utilStringBuffer_.toString(); 383 } 384 } 385 return result; 386 } 387 388 /** 389 * Gets the MSB from the group index 390 * @param gindex group index 391 * @return the MSB of the group if gindex is valid, -1 otherwise 392 */ getGroupMSB(int gindex)393 public int getGroupMSB(int gindex) 394 { 395 if (gindex >= m_groupcount_) { 396 return -1; 397 } 398 return m_groupinfo_[gindex * m_groupsize_]; 399 } 400 401 /** 402 * Gets the MSB of the codepoint 403 * @param codepoint The codepoint value. 404 * @return the MSB of the codepoint 405 */ getCodepointMSB(int codepoint)406 public static int getCodepointMSB(int codepoint) 407 { 408 return codepoint >> GROUP_SHIFT_; 409 } 410 411 /** 412 * Gets the maximum codepoint + 1 of the group 413 * @param msb most significant byte of the group 414 * @return limit codepoint of the group 415 */ getGroupLimit(int msb)416 public static int getGroupLimit(int msb) 417 { 418 return (msb << GROUP_SHIFT_) + LINES_PER_GROUP_; 419 } 420 421 /** 422 * Gets the minimum codepoint of the group 423 * @param msb most significant byte of the group 424 * @return minimum codepoint of the group 425 */ getGroupMin(int msb)426 public static int getGroupMin(int msb) 427 { 428 return msb << GROUP_SHIFT_; 429 } 430 431 /** 432 * Gets the offset to a group 433 * @param codepoint The codepoint value. 434 * @return offset to a group 435 */ getGroupOffset(int codepoint)436 public static int getGroupOffset(int codepoint) 437 { 438 return codepoint & GROUP_MASK_; 439 } 440 441 /** 442 * Gets the minimum codepoint of a group 443 * @param codepoint The codepoint value. 444 * @return minimum codepoint in the group which codepoint belongs to 445 */ 446 ///CLOVER:OFF getGroupMinFromCodepoint(int codepoint)447 public static int getGroupMinFromCodepoint(int codepoint) 448 { 449 return codepoint & ~GROUP_MASK_; 450 } 451 ///CLOVER:ON 452 453 /** 454 * Get the Algorithm range length 455 * @return Algorithm range length 456 */ getAlgorithmLength()457 public int getAlgorithmLength() 458 { 459 return m_algorithm_.length; 460 } 461 462 /** 463 * Gets the start of the range 464 * @param index algorithm index 465 * @return algorithm range start 466 */ getAlgorithmStart(int index)467 public int getAlgorithmStart(int index) 468 { 469 return m_algorithm_[index].m_rangestart_; 470 } 471 472 /** 473 * Gets the end of the range 474 * @param index algorithm index 475 * @return algorithm range end 476 */ getAlgorithmEnd(int index)477 public int getAlgorithmEnd(int index) 478 { 479 return m_algorithm_[index].m_rangeend_; 480 } 481 482 /** 483 * Gets the Algorithmic name of the codepoint 484 * @param index algorithmic range index 485 * @param codepoint The codepoint value. 486 * @return algorithmic name of codepoint 487 */ getAlgorithmName(int index, int codepoint)488 public String getAlgorithmName(int index, int codepoint) 489 { 490 String result = null; 491 synchronized (m_utilStringBuffer_) { 492 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length()); 493 m_algorithm_[index].appendName(codepoint, m_utilStringBuffer_); 494 result = m_utilStringBuffer_.toString(); 495 } 496 return result; 497 } 498 499 /** 500 * Gets the group name of the character 501 * @param ch character to get the group name 502 * @param choice name choice selector to choose a unicode 1.0 or newer name 503 */ getGroupName(int ch, int choice)504 public synchronized String getGroupName(int ch, int choice) 505 { 506 // gets the msb 507 int msb = getCodepointMSB(ch); 508 int group = getGroup(ch); 509 510 // return this if it is an exact match 511 if (msb == m_groupinfo_[group * m_groupsize_]) { 512 int index = getGroupLengths(group, m_groupoffsets_, 513 m_grouplengths_); 514 int offset = ch & GROUP_MASK_; 515 return getGroupName(index + m_groupoffsets_[offset], 516 m_grouplengths_[offset], choice); 517 } 518 519 return null; 520 } 521 522 // these are transliterator use methods --------------------------------- 523 524 /** 525 * Gets the maximum length of any codepoint name. 526 * Equivalent to uprv_getMaxCharNameLength. 527 * @return the maximum length of any codepoint name 528 */ getMaxCharNameLength()529 public int getMaxCharNameLength() 530 { 531 if (initNameSetsLengths()) { 532 return m_maxNameLength_; 533 } 534 else { 535 return 0; 536 } 537 } 538 539 /** 540 * Gets the maximum length of any iso comments. 541 * Equivalent to uprv_getMaxISOCommentLength. 542 * @return the maximum length of any codepoint name 543 */ 544 ///CLOVER:OFF getMaxISOCommentLength()545 public int getMaxISOCommentLength() 546 { 547 if (initNameSetsLengths()) { 548 return m_maxISOCommentLength_; 549 } 550 else { 551 return 0; 552 } 553 } 554 ///CLOVER:ON 555 556 /** 557 * Fills set with characters that are used in Unicode character names. 558 * Equivalent to uprv_getCharNameCharacters. 559 * @param set USet to receive characters. Existing contents are deleted. 560 */ getCharNameCharacters(UnicodeSet set)561 public void getCharNameCharacters(UnicodeSet set) 562 { 563 convert(m_nameSet_, set); 564 } 565 566 /** 567 * Fills set with characters that are used in Unicode character names. 568 * Equivalent to uprv_getISOCommentCharacters. 569 * @param set USet to receive characters. Existing contents are deleted. 570 */ 571 ///CLOVER:OFF getISOCommentCharacters(UnicodeSet set)572 public void getISOCommentCharacters(UnicodeSet set) 573 { 574 convert(m_ISOCommentSet_, set); 575 } 576 ///CLOVER:ON 577 578 // package private inner class -------------------------------------- 579 580 /** 581 * Algorithmic name class 582 */ 583 static final class AlgorithmName 584 { 585 // package private data members ---------------------------------- 586 587 /** 588 * Constant type value of the different AlgorithmName 589 */ 590 static final int TYPE_0_ = 0; 591 static final int TYPE_1_ = 1; 592 593 // package private constructors ---------------------------------- 594 595 /** 596 * Constructor 597 */ AlgorithmName()598 AlgorithmName() 599 { 600 } 601 602 // package private methods --------------------------------------- 603 604 /** 605 * Sets the information for accessing the algorithmic names 606 * @param rangestart starting code point that lies within this name group 607 * @param rangeend end code point that lies within this name group 608 * @param type algorithm type. There's 2 kinds of algorithmic type. First 609 * which uses code point as part of its name and the other uses 610 * variant postfix strings 611 * @param variant algorithmic variant 612 * @return true if values are valid 613 */ setInfo(int rangestart, int rangeend, byte type, byte variant)614 boolean setInfo(int rangestart, int rangeend, byte type, byte variant) 615 { 616 if (rangestart >= UCharacter.MIN_VALUE && rangestart <= rangeend 617 && rangeend <= UCharacter.MAX_VALUE && 618 (type == TYPE_0_ || type == TYPE_1_)) { 619 m_rangestart_ = rangestart; 620 m_rangeend_ = rangeend; 621 m_type_ = type; 622 m_variant_ = variant; 623 return true; 624 } 625 return false; 626 } 627 628 /** 629 * Sets the factor data 630 * @param factor Array of factor 631 * @return true if factors are valid 632 */ setFactor(char factor[])633 boolean setFactor(char factor[]) 634 { 635 if (factor.length == m_variant_) { 636 m_factor_ = factor; 637 return true; 638 } 639 return false; 640 } 641 642 /** 643 * Sets the name prefix 644 * @param prefix 645 * @return true if prefix is set 646 */ setPrefix(String prefix)647 boolean setPrefix(String prefix) 648 { 649 if (prefix != null && prefix.length() > 0) { 650 m_prefix_ = prefix; 651 return true; 652 } 653 return false; 654 } 655 656 /** 657 * Sets the variant factorized name data 658 * @param string variant factorized name data 659 * @return true if values are set 660 */ setFactorString(byte string[])661 boolean setFactorString(byte string[]) 662 { 663 // factor and variant string can be empty for things like 664 // hanggul code points 665 m_factorstring_ = string; 666 return true; 667 } 668 669 /** 670 * Checks if code point lies in Algorithm object at index 671 * @param ch code point 672 */ contains(int ch)673 boolean contains(int ch) 674 { 675 return m_rangestart_ <= ch && ch <= m_rangeend_; 676 } 677 678 /** 679 * Appends algorithm name of code point into StringBuffer. 680 * Note this method does not check for validity of code point in Algorithm, 681 * result is undefined if code point does not belong in Algorithm. 682 * @param ch code point 683 * @param str StringBuffer to append to 684 */ appendName(int ch, StringBuffer str)685 void appendName(int ch, StringBuffer str) 686 { 687 str.append(m_prefix_); 688 switch (m_type_) 689 { 690 case TYPE_0_: 691 // prefix followed by hex digits indicating variants 692 str.append(Utility.hex(ch,m_variant_)); 693 break; 694 case TYPE_1_: 695 // prefix followed by factorized-elements 696 int offset = ch - m_rangestart_; 697 int indexes[] = m_utilIntBuffer_; 698 int factor; 699 700 // write elements according to the factors 701 // the factorized elements are determined by modulo 702 // arithmetic 703 synchronized (m_utilIntBuffer_) { 704 for (int i = m_variant_ - 1; i > 0; i --) 705 { 706 factor = m_factor_[i] & 0x00FF; 707 indexes[i] = offset % factor; 708 offset /= factor; 709 } 710 711 // we don't need to calculate the last modulus because 712 // start <= code <= end guarantees here that 713 // code <= factors[0] 714 indexes[0] = offset; 715 716 // joining up the factorized strings 717 str.append(getFactorString(indexes, m_variant_)); 718 } 719 break; 720 } 721 } 722 723 /** 724 * Gets the character for the argument algorithmic name 725 * @return the algorithmic char or -1 otherwise. 726 */ getChar(String name)727 int getChar(String name) 728 { 729 int prefixlen = m_prefix_.length(); 730 if (name.length() < prefixlen || 731 !m_prefix_.equals(name.substring(0, prefixlen))) { 732 return -1; 733 } 734 735 switch (m_type_) 736 { 737 case TYPE_0_ : 738 try 739 { 740 int result = Integer.parseInt(name.substring(prefixlen), 741 16); 742 // does it fit into the range? 743 if (m_rangestart_ <= result && result <= m_rangeend_) { 744 return result; 745 } 746 } 747 catch (NumberFormatException e) 748 { 749 return -1; 750 } 751 break; 752 case TYPE_1_ : 753 // repetitative suffix name comparison done here 754 // offset is the character code - start 755 for (int ch = m_rangestart_; ch <= m_rangeend_; ch ++) 756 { 757 int offset = ch - m_rangestart_; 758 int indexes[] = m_utilIntBuffer_; 759 int factor; 760 761 // write elements according to the factors 762 // the factorized elements are determined by modulo 763 // arithmetic 764 synchronized (m_utilIntBuffer_) { 765 for (int i = m_variant_ - 1; i > 0; i --) 766 { 767 factor = m_factor_[i] & 0x00FF; 768 indexes[i] = offset % factor; 769 offset /= factor; 770 } 771 772 // we don't need to calculate the last modulus 773 // because start <= code <= end guarantees here that 774 // code <= factors[0] 775 indexes[0] = offset; 776 777 // joining up the factorized strings 778 if (compareFactorString(indexes, m_variant_, name, 779 prefixlen)) { 780 return ch; 781 } 782 } 783 } 784 } 785 786 return -1; 787 } 788 789 /** 790 * Adds all chars in the set of algorithmic names into the set. 791 * Equivalent to part of calcAlgNameSetsLengths. 792 * @param set int set to add the chars of the algorithm names into 793 * @param maxlength maximum length to compare to 794 * @return the length that is either maxlength of the length of this 795 * algorithm name if it is longer than maxlength 796 */ add(int set[], int maxlength)797 int add(int set[], int maxlength) 798 { 799 // prefix length 800 int length = UCharacterName.add(set, m_prefix_); 801 switch (m_type_) { 802 case TYPE_0_ : { 803 // name = prefix + (range->variant times) hex-digits 804 // prefix 805 length += m_variant_; 806 /* synwee to check 807 * addString(set, (const char *)(range + 1)) 808 + range->variant;*/ 809 break; 810 } 811 case TYPE_1_ : { 812 // name = prefix factorized-elements 813 // get the set and maximum factor suffix length for each 814 // factor 815 for (int i = m_variant_ - 1; i > 0; i --) 816 { 817 int maxfactorlength = 0; 818 int count = 0; 819 for (int factor = m_factor_[i]; factor > 0; -- factor) { 820 synchronized (m_utilStringBuffer_) { 821 m_utilStringBuffer_.delete(0, 822 m_utilStringBuffer_.length()); 823 count 824 = UCharacterUtility.getNullTermByteSubString( 825 m_utilStringBuffer_, 826 m_factorstring_, count); 827 UCharacterName.add(set, m_utilStringBuffer_); 828 if (m_utilStringBuffer_.length() 829 > maxfactorlength) 830 { 831 maxfactorlength 832 = m_utilStringBuffer_.length(); 833 } 834 } 835 } 836 length += maxfactorlength; 837 } 838 } 839 } 840 if (length > maxlength) { 841 return length; 842 } 843 return maxlength; 844 } 845 846 // private data members ------------------------------------------ 847 848 /** 849 * Algorithmic data information 850 */ 851 private int m_rangestart_; 852 private int m_rangeend_; 853 private byte m_type_; 854 private byte m_variant_; 855 private char m_factor_[]; 856 private String m_prefix_; 857 private byte m_factorstring_[]; 858 /** 859 * Utility StringBuffer 860 */ 861 private StringBuffer m_utilStringBuffer_ = new StringBuffer(); 862 /** 863 * Utility int buffer 864 */ 865 private int m_utilIntBuffer_[] = new int[256]; 866 867 // private methods ----------------------------------------------- 868 869 /** 870 * Gets the indexth string in each of the argument factor block 871 * @param index array with each index corresponding to each factor block 872 * @param length length of the array index 873 * @return the combined string of the array of indexth factor string in 874 * factor block 875 */ getFactorString(int index[], int length)876 private String getFactorString(int index[], int length) 877 { 878 int size = m_factor_.length; 879 if (index == null || length != size) { 880 return null; 881 } 882 883 synchronized (m_utilStringBuffer_) { 884 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length()); 885 int count = 0; 886 int factor; 887 size --; 888 for (int i = 0; i <= size; i ++) { 889 factor = m_factor_[i]; 890 count = UCharacterUtility.skipNullTermByteSubString( 891 m_factorstring_, count, index[i]); 892 count = UCharacterUtility.getNullTermByteSubString( 893 m_utilStringBuffer_, m_factorstring_, 894 count); 895 if (i != size) { 896 count = UCharacterUtility.skipNullTermByteSubString( 897 m_factorstring_, count, 898 factor - index[i] - 1); 899 } 900 } 901 return m_utilStringBuffer_.toString(); 902 } 903 } 904 905 /** 906 * Compares the indexth string in each of the argument factor block with 907 * the argument string 908 * @param index array with each index corresponding to each factor block 909 * @param length index array length 910 * @param str string to compare with 911 * @param offset of str to start comparison 912 * @return true if string matches 913 */ compareFactorString(int index[], int length, String str, int offset)914 private boolean compareFactorString(int index[], int length, String str, 915 int offset) 916 { 917 int size = m_factor_.length; 918 if (index == null || length != size) 919 return false; 920 921 int count = 0; 922 int strcount = offset; 923 int factor; 924 size --; 925 for (int i = 0; i <= size; i ++) 926 { 927 factor = m_factor_[i]; 928 count = UCharacterUtility.skipNullTermByteSubString( 929 m_factorstring_, count, index[i]); 930 strcount = UCharacterUtility.compareNullTermByteSubString(str, 931 m_factorstring_, strcount, count); 932 if (strcount < 0) { 933 return false; 934 } 935 936 if (i != size) { 937 count = UCharacterUtility.skipNullTermByteSubString( 938 m_factorstring_, count, factor - index[i]); 939 } 940 } 941 if (strcount != str.length()) { 942 return false; 943 } 944 return true; 945 } 946 } 947 948 // package private data members -------------------------------------- 949 950 /** 951 * Size of each groups 952 */ 953 int m_groupsize_ = 0; 954 955 // package private methods -------------------------------------------- 956 957 /** 958 * Sets the token data 959 * @param token array of tokens 960 * @param tokenstring array of string values of the tokens 961 * @return false if there is a data error 962 */ setToken(char token[], byte tokenstring[])963 boolean setToken(char token[], byte tokenstring[]) 964 { 965 if (token != null && tokenstring != null && token.length > 0 && 966 tokenstring.length > 0) { 967 m_tokentable_ = token; 968 m_tokenstring_ = tokenstring; 969 return true; 970 } 971 return false; 972 } 973 974 /** 975 * Set the algorithm name information array 976 * @param alg Algorithm information array 977 * @return true if the group string offset has been set correctly 978 */ setAlgorithm(AlgorithmName alg[])979 boolean setAlgorithm(AlgorithmName alg[]) 980 { 981 if (alg != null && alg.length != 0) { 982 m_algorithm_ = alg; 983 return true; 984 } 985 return false; 986 } 987 988 /** 989 * Sets the number of group and size of each group in number of char 990 * @param count number of groups 991 * @param size size of group in char 992 * @return true if group size is set correctly 993 */ setGroupCountSize(int count, int size)994 boolean setGroupCountSize(int count, int size) 995 { 996 if (count <= 0 || size <= 0) { 997 return false; 998 } 999 m_groupcount_ = count; 1000 m_groupsize_ = size; 1001 return true; 1002 } 1003 1004 /** 1005 * Sets the group name data 1006 * @param group index information array 1007 * @param groupstring name information array 1008 * @return false if there is a data error 1009 */ setGroup(char group[], byte groupstring[])1010 boolean setGroup(char group[], byte groupstring[]) 1011 { 1012 if (group != null && groupstring != null && group.length > 0 && 1013 groupstring.length > 0) { 1014 m_groupinfo_ = group; 1015 m_groupstring_ = groupstring; 1016 return true; 1017 } 1018 return false; 1019 } 1020 1021 // private data members ---------------------------------------------- 1022 1023 /** 1024 * Data used in unames.icu 1025 */ 1026 private char m_tokentable_[]; 1027 private byte m_tokenstring_[]; 1028 private char m_groupinfo_[]; 1029 private byte m_groupstring_[]; 1030 private AlgorithmName m_algorithm_[]; 1031 1032 /** 1033 * Group use. Note - access must be synchronized. 1034 */ 1035 private char m_groupoffsets_[] = new char[LINES_PER_GROUP_ + 1]; 1036 private char m_grouplengths_[] = new char[LINES_PER_GROUP_ + 1]; 1037 1038 /** 1039 * Default name of the name datafile 1040 */ 1041 private static final String FILE_NAME_ = "unames.icu"; 1042 /** 1043 * Shift count to retrieve group information 1044 */ 1045 private static final int GROUP_SHIFT_ = 5; 1046 /** 1047 * Mask to retrieve the offset for a particular character within a group 1048 */ 1049 private static final int GROUP_MASK_ = LINES_PER_GROUP_ - 1; 1050 1051 /** 1052 * Position of offsethigh in group information array 1053 */ 1054 private static final int OFFSET_HIGH_OFFSET_ = 1; 1055 1056 /** 1057 * Position of offsetlow in group information array 1058 */ 1059 private static final int OFFSET_LOW_OFFSET_ = 2; 1060 /** 1061 * Double nibble indicator, any nibble > this number has to be combined 1062 * with its following nibble 1063 */ 1064 private static final int SINGLE_NIBBLE_MAX_ = 11; 1065 1066 /* 1067 * Maximum length of character names (regular & 1.0). 1068 */ 1069 //private static int MAX_NAME_LENGTH_ = 0; 1070 /* 1071 * Maximum length of ISO comments. 1072 */ 1073 //private static int MAX_ISO_COMMENT_LENGTH_ = 0; 1074 1075 /** 1076 * Set of chars used in character names (regular & 1.0). 1077 * Chars are platform-dependent (can be EBCDIC). 1078 */ 1079 private int m_nameSet_[] = new int[8]; 1080 /** 1081 * Set of chars used in ISO comments. (regular & 1.0). 1082 * Chars are platform-dependent (can be EBCDIC). 1083 */ 1084 private int m_ISOCommentSet_[] = new int[8]; 1085 /** 1086 * Utility StringBuffer 1087 */ 1088 private StringBuffer m_utilStringBuffer_ = new StringBuffer(); 1089 /** 1090 * Utility int buffer 1091 */ 1092 private int m_utilIntBuffer_[] = new int[2]; 1093 /** 1094 * Maximum ISO comment length 1095 */ 1096 private int m_maxISOCommentLength_; 1097 /** 1098 * Maximum name length 1099 */ 1100 private int m_maxNameLength_; 1101 /** 1102 * Type names used for extended names 1103 */ 1104 private static final String TYPE_NAMES_[] = {"unassigned", 1105 "uppercase letter", 1106 "lowercase letter", 1107 "titlecase letter", 1108 "modifier letter", 1109 "other letter", 1110 "non spacing mark", 1111 "enclosing mark", 1112 "combining spacing mark", 1113 "decimal digit number", 1114 "letter number", 1115 "other number", 1116 "space separator", 1117 "line separator", 1118 "paragraph separator", 1119 "control", 1120 "format", 1121 "private use area", 1122 "surrogate", 1123 "dash punctuation", 1124 "start punctuation", 1125 "end punctuation", 1126 "connector punctuation", 1127 "other punctuation", 1128 "math symbol", 1129 "currency symbol", 1130 "modifier symbol", 1131 "other symbol", 1132 "initial punctuation", 1133 "final punctuation", 1134 "noncharacter", 1135 "lead surrogate", 1136 "trail surrogate"}; 1137 /** 1138 * Unknown type name 1139 */ 1140 private static final String UNKNOWN_TYPE_NAME_ = "unknown"; 1141 /** 1142 * Not a character type 1143 */ 1144 private static final int NON_CHARACTER_ 1145 = UCharacterCategory.CHAR_CATEGORY_COUNT; 1146 /** 1147 * Lead surrogate type 1148 */ 1149 private static final int LEAD_SURROGATE_ 1150 = UCharacterCategory.CHAR_CATEGORY_COUNT + 1; 1151 /** 1152 * Trail surrogate type 1153 */ 1154 private static final int TRAIL_SURROGATE_ 1155 = UCharacterCategory.CHAR_CATEGORY_COUNT + 2; 1156 /** 1157 * Extended category count 1158 */ 1159 static final int EXTENDED_CATEGORY_ 1160 = UCharacterCategory.CHAR_CATEGORY_COUNT + 3; 1161 1162 // private constructor ------------------------------------------------ 1163 1164 /** 1165 * <p>Protected constructor for use in UCharacter.</p> 1166 * @exception IOException thrown when data reading fails 1167 */ UCharacterName()1168 private UCharacterName() throws IOException 1169 { 1170 ByteBuffer b = ICUBinary.getRequiredData(FILE_NAME_); 1171 UCharacterNameReader reader = new UCharacterNameReader(b); 1172 reader.read(this); 1173 } 1174 1175 // private methods --------------------------------------------------- 1176 1177 /** 1178 * Gets the algorithmic name for the argument character 1179 * @param ch character to determine name for 1180 * @param choice name choice 1181 * @return the algorithmic name or null if not found 1182 */ getAlgName(int ch, int choice)1183 private String getAlgName(int ch, int choice) 1184 { 1185 /* Only the normative character name can be algorithmic. */ 1186 if (choice == UCharacterNameChoice.UNICODE_CHAR_NAME || 1187 choice == UCharacterNameChoice.EXTENDED_CHAR_NAME 1188 ) { 1189 // index in terms integer index 1190 synchronized (m_utilStringBuffer_) { 1191 m_utilStringBuffer_.delete(0, m_utilStringBuffer_.length()); 1192 1193 for (int index = m_algorithm_.length - 1; index >= 0; index --) 1194 { 1195 if (m_algorithm_[index].contains(ch)) { 1196 m_algorithm_[index].appendName(ch, m_utilStringBuffer_); 1197 return m_utilStringBuffer_.toString(); 1198 } 1199 } 1200 } 1201 } 1202 return null; 1203 } 1204 1205 /** 1206 * Getting the character with the tokenized argument name 1207 * @param name of the character 1208 * @return character with the tokenized argument name or -1 if character 1209 * is not found 1210 */ getGroupChar(String name, int choice)1211 private synchronized int getGroupChar(String name, int choice) 1212 { 1213 for (int i = 0; i < m_groupcount_; i ++) { 1214 // populating the data set of grouptable 1215 1216 int startgpstrindex = getGroupLengths(i, m_groupoffsets_, 1217 m_grouplengths_); 1218 1219 // shift out to function 1220 int result = getGroupChar(startgpstrindex, m_grouplengths_, name, 1221 choice); 1222 if (result != -1) { 1223 return (m_groupinfo_[i * m_groupsize_] << GROUP_SHIFT_) 1224 | result; 1225 } 1226 } 1227 return -1; 1228 } 1229 1230 /** 1231 * Compares and retrieve character if name is found within the argument 1232 * group 1233 * @param index index where the set of names reside in the group block 1234 * @param length list of lengths of the strings 1235 * @param name character name to search for 1236 * @param choice of either 1.0 or the most current unicode name 1237 * @return relative character in the group which matches name, otherwise if 1238 * not found, -1 will be returned 1239 */ getGroupChar(int index, char length[], String name, int choice)1240 private int getGroupChar(int index, char length[], String name, 1241 int choice) 1242 { 1243 byte b = 0; 1244 char token; 1245 int len; 1246 int namelen = name.length(); 1247 int nindex; 1248 int count; 1249 1250 for (int result = 0; result <= LINES_PER_GROUP_; result ++) { 1251 nindex = 0; 1252 len = length[result]; 1253 1254 if (choice != UCharacterNameChoice.UNICODE_CHAR_NAME && 1255 choice != UCharacterNameChoice.EXTENDED_CHAR_NAME 1256 ) { 1257 /* 1258 * skip the modern name if it is not requested _and_ 1259 * if the semicolon byte value is a character, not a token number 1260 */ 1261 int fieldIndex= choice==UCharacterNameChoice.ISO_COMMENT_ ? 2 : choice; 1262 do { 1263 int oldindex = index; 1264 index += UCharacterUtility.skipByteSubString(m_groupstring_, 1265 index, len, (byte)';'); 1266 len -= (index - oldindex); 1267 } while(--fieldIndex>0); 1268 } 1269 1270 // number of tokens is > the length of the name 1271 // write each letter directly, and write a token word per token 1272 for (count = 0; count < len && nindex != -1 && nindex < namelen; 1273 ) { 1274 b = m_groupstring_[index + count]; 1275 count ++; 1276 1277 if (b >= m_tokentable_.length) { 1278 if (name.charAt(nindex ++) != (b & 0xFF)) { 1279 nindex = -1; 1280 } 1281 } 1282 else { 1283 token = m_tokentable_[b & 0xFF]; 1284 if (token == 0xFFFE) { 1285 // this is a lead byte for a double-byte token 1286 token = m_tokentable_[b << 8 | 1287 (m_groupstring_[index + count] & 0x00ff)]; 1288 count ++; 1289 } 1290 if (token == 0xFFFF) { 1291 if (name.charAt(nindex ++) != (b & 0xFF)) { 1292 nindex = -1; 1293 } 1294 } 1295 else { 1296 // compare token with name 1297 nindex = UCharacterUtility.compareNullTermByteSubString( 1298 name, m_tokenstring_, nindex, token); 1299 } 1300 } 1301 } 1302 1303 if (namelen == nindex && 1304 (count == len || m_groupstring_[index + count] == ';')) { 1305 return result; 1306 } 1307 1308 index += len; 1309 } 1310 return -1; 1311 } 1312 1313 /** 1314 * Gets the character extended type 1315 * @param ch character to be tested 1316 * @return extended type it is associated with 1317 */ getType(int ch)1318 private static int getType(int ch) 1319 { 1320 if (UCharacterUtility.isNonCharacter(ch)) { 1321 // not a character we return a invalid category count 1322 return NON_CHARACTER_; 1323 } 1324 int result = UCharacter.getType(ch); 1325 if (result == UCharacterCategory.SURROGATE) { 1326 if (ch <= UTF16.LEAD_SURROGATE_MAX_VALUE) { 1327 result = LEAD_SURROGATE_; 1328 } 1329 else { 1330 result = TRAIL_SURROGATE_; 1331 } 1332 } 1333 return result; 1334 } 1335 1336 /** 1337 * Getting the character with extended name of the form <....>. 1338 * @param name of the character to be found 1339 * @param choice name choice 1340 * @return character associated with the name, -1 if such character is not 1341 * found and -2 if we should continue with the search. 1342 */ getExtendedChar(String name, int choice)1343 private static int getExtendedChar(String name, int choice) 1344 { 1345 if (name.charAt(0) == '<') { 1346 if (choice == UCharacterNameChoice.EXTENDED_CHAR_NAME) { 1347 int endIndex = name.length() - 1; 1348 if (name.charAt(endIndex) == '>') { 1349 int startIndex = name.lastIndexOf('-'); 1350 if (startIndex >= 0) { // We've got a category. 1351 startIndex ++; 1352 int result = -1; 1353 try { 1354 result = Integer.parseInt( 1355 name.substring(startIndex, endIndex), 1356 16); 1357 } 1358 catch (NumberFormatException e) { 1359 return -1; 1360 } 1361 // Now validate the category name. We could use a 1362 // binary search, or a trie, if we really wanted to. 1363 String type = name.substring(1, startIndex - 1); 1364 int length = TYPE_NAMES_.length; 1365 for (int i = 0; i < length; ++ i) { 1366 if (type.compareTo(TYPE_NAMES_[i]) == 0) { 1367 if (getType(result) == i) { 1368 return result; 1369 } 1370 break; 1371 } 1372 } 1373 } 1374 } 1375 } 1376 return -1; 1377 } 1378 return -2; 1379 } 1380 1381 // sets of name characters, maximum name lengths ----------------------- 1382 1383 /** 1384 * Adds a codepoint into a set of ints. 1385 * Equivalent to SET_ADD. 1386 * @param set set to add to 1387 * @param ch 16 bit char to add 1388 */ add(int set[], char ch)1389 private static void add(int set[], char ch) 1390 { 1391 set[ch >>> 5] |= 1 << (ch & 0x1f); 1392 } 1393 1394 /** 1395 * Checks if a codepoint is a part of a set of ints. 1396 * Equivalent to SET_CONTAINS. 1397 * @param set set to check in 1398 * @param ch 16 bit char to check 1399 * @return true if codepoint is part of the set, false otherwise 1400 */ contains(int set[], char ch)1401 private static boolean contains(int set[], char ch) 1402 { 1403 return (set[ch >>> 5] & (1 << (ch & 0x1f))) != 0; 1404 } 1405 1406 /** 1407 * Adds all characters of the argument str and gets the length 1408 * Equivalent to calcStringSetLength. 1409 * @param set set to add all chars of str to 1410 * @param str string to add 1411 */ add(int set[], String str)1412 private static int add(int set[], String str) 1413 { 1414 int result = str.length(); 1415 1416 for (int i = result - 1; i >= 0; i --) { 1417 add(set, str.charAt(i)); 1418 } 1419 return result; 1420 } 1421 1422 /** 1423 * Adds all characters of the argument str and gets the length 1424 * Equivalent to calcStringSetLength. 1425 * @param set set to add all chars of str to 1426 * @param str string to add 1427 */ add(int set[], StringBuffer str)1428 private static int add(int set[], StringBuffer str) 1429 { 1430 int result = str.length(); 1431 1432 for (int i = result - 1; i >= 0; i --) { 1433 add(set, str.charAt(i)); 1434 } 1435 return result; 1436 } 1437 1438 /** 1439 * Adds all algorithmic names into the name set. 1440 * Equivalent to part of calcAlgNameSetsLengths. 1441 * @param maxlength length to compare to 1442 * @return the maximum length of any possible algorithmic name if it is > 1443 * maxlength, otherwise maxlength is returned. 1444 */ addAlgorithmName(int maxlength)1445 private int addAlgorithmName(int maxlength) 1446 { 1447 int result = 0; 1448 for (int i = m_algorithm_.length - 1; i >= 0; i --) { 1449 result = m_algorithm_[i].add(m_nameSet_, maxlength); 1450 if (result > maxlength) { 1451 maxlength = result; 1452 } 1453 } 1454 return maxlength; 1455 } 1456 1457 /** 1458 * Adds all extended names into the name set. 1459 * Equivalent to part of calcExtNameSetsLengths. 1460 * @param maxlength length to compare to 1461 * @return the maxlength of any possible extended name. 1462 */ addExtendedName(int maxlength)1463 private int addExtendedName(int maxlength) 1464 { 1465 for (int i = TYPE_NAMES_.length - 1; i >= 0; i --) { 1466 // for each category, count the length of the category name 1467 // plus 9 = 1468 // 2 for <> 1469 // 1 for - 1470 // 6 for most hex digits per code point 1471 int length = 9 + add(m_nameSet_, TYPE_NAMES_[i]); 1472 if (length > maxlength) { 1473 maxlength = length; 1474 } 1475 } 1476 return maxlength; 1477 } 1478 1479 /** 1480 * Adds names of a group to the argument set. 1481 * Equivalent to calcNameSetLength. 1482 * @param offset of the group name string in byte count 1483 * @param length of the group name string 1484 * @param tokenlength array to store the length of each token 1485 * @param set to add to 1486 * @return the length of the name string and the length of the group 1487 * string parsed 1488 */ addGroupName(int offset, int length, byte tokenlength[], int set[])1489 private int[] addGroupName(int offset, int length, byte tokenlength[], 1490 int set[]) 1491 { 1492 int resultnlength = 0; 1493 int resultplength = 0; 1494 while (resultplength < length) { 1495 char b = (char)(m_groupstring_[offset + resultplength] & 0xff); 1496 resultplength ++; 1497 if (b == ';') { 1498 break; 1499 } 1500 1501 if (b >= m_tokentable_.length) { 1502 add(set, b); // implicit letter 1503 resultnlength ++; 1504 } 1505 else { 1506 char token = m_tokentable_[b & 0x00ff]; 1507 if (token == 0xFFFE) { 1508 // this is a lead byte for a double-byte token 1509 b = (char)(b << 8 | (m_groupstring_[offset + resultplength] 1510 & 0x00ff)); 1511 token = m_tokentable_[b]; 1512 resultplength ++; 1513 } 1514 if (token == 0xFFFF) { 1515 add(set, b); 1516 resultnlength ++; 1517 } 1518 else { 1519 // count token word 1520 // use cached token length 1521 byte tlength = tokenlength[b]; 1522 if (tlength == 0) { 1523 synchronized (m_utilStringBuffer_) { 1524 m_utilStringBuffer_.delete(0, 1525 m_utilStringBuffer_.length()); 1526 UCharacterUtility.getNullTermByteSubString( 1527 m_utilStringBuffer_, m_tokenstring_, 1528 token); 1529 tlength = (byte)add(set, m_utilStringBuffer_); 1530 } 1531 tokenlength[b] = tlength; 1532 } 1533 resultnlength += tlength; 1534 } 1535 } 1536 } 1537 m_utilIntBuffer_[0] = resultnlength; 1538 m_utilIntBuffer_[1] = resultplength; 1539 return m_utilIntBuffer_; 1540 } 1541 1542 /** 1543 * Adds names of all group to the argument set. 1544 * Sets the data member m_max*Length_. 1545 * Method called only once. 1546 * Equivalent to calcGroupNameSetsLength. 1547 * @param maxlength length to compare to 1548 */ addGroupName(int maxlength)1549 private void addGroupName(int maxlength) 1550 { 1551 int maxisolength = 0; 1552 char offsets[] = new char[LINES_PER_GROUP_ + 2]; 1553 char lengths[] = new char[LINES_PER_GROUP_ + 2]; 1554 byte tokenlengths[] = new byte[m_tokentable_.length]; 1555 1556 // enumerate all groups 1557 // for (int i = m_groupcount_ - 1; i >= 0; i --) { 1558 for (int i = 0; i < m_groupcount_ ; i ++) { 1559 int offset = getGroupLengths(i, offsets, lengths); 1560 // enumerate all lines in each group 1561 // for (int linenumber = LINES_PER_GROUP_ - 1; linenumber >= 0; 1562 // linenumber --) { 1563 for (int linenumber = 0; linenumber < LINES_PER_GROUP_; 1564 linenumber ++) { 1565 int lineoffset = offset + offsets[linenumber]; 1566 int length = lengths[linenumber]; 1567 if (length == 0) { 1568 continue; 1569 } 1570 1571 // read regular name 1572 int parsed[] = addGroupName(lineoffset, length, tokenlengths, 1573 m_nameSet_); 1574 if (parsed[0] > maxlength) { 1575 // 0 for name length 1576 maxlength = parsed[0]; 1577 } 1578 lineoffset += parsed[1]; 1579 if (parsed[1] >= length) { 1580 // 1 for parsed group string length 1581 continue; 1582 } 1583 length -= parsed[1]; 1584 // read Unicode 1.0 name 1585 parsed = addGroupName(lineoffset, length, tokenlengths, 1586 m_nameSet_); 1587 if (parsed[0] > maxlength) { 1588 // 0 for name length 1589 maxlength = parsed[0]; 1590 } 1591 lineoffset += parsed[1]; 1592 if (parsed[1] >= length) { 1593 // 1 for parsed group string length 1594 continue; 1595 } 1596 length -= parsed[1]; 1597 // read ISO comment 1598 parsed = addGroupName(lineoffset, length, tokenlengths, 1599 m_ISOCommentSet_); 1600 if (parsed[1] > maxisolength) { 1601 maxisolength = length; 1602 } 1603 } 1604 } 1605 1606 // set gMax... - name length last for threading 1607 m_maxISOCommentLength_ = maxisolength; 1608 m_maxNameLength_ = maxlength; 1609 } 1610 1611 /** 1612 * Sets up the name sets and the calculation of the maximum lengths. 1613 * Equivalent to calcNameSetsLengths. 1614 */ initNameSetsLengths()1615 private boolean initNameSetsLengths() 1616 { 1617 if (m_maxNameLength_ > 0) { 1618 return true; 1619 } 1620 1621 String extra = "0123456789ABCDEF<>-"; 1622 // set hex digits, used in various names, and <>-, used in extended 1623 // names 1624 for (int i = extra.length() - 1; i >= 0; i --) { 1625 add(m_nameSet_, extra.charAt(i)); 1626 } 1627 1628 // set sets and lengths from algorithmic names 1629 m_maxNameLength_ = addAlgorithmName(0); 1630 // set sets and lengths from extended names 1631 m_maxNameLength_ = addExtendedName(m_maxNameLength_); 1632 // set sets and lengths from group names, set global maximum values 1633 addGroupName(m_maxNameLength_); 1634 return true; 1635 } 1636 1637 /** 1638 * Converts the char set cset into a Unicode set uset. 1639 * Equivalent to charSetToUSet. 1640 * @param set Set of 256 bit flags corresponding to a set of chars. 1641 * @param uset USet to receive characters. Existing contents are deleted. 1642 */ convert(int set[], UnicodeSet uset)1643 private void convert(int set[], UnicodeSet uset) 1644 { 1645 uset.clear(); 1646 if (!initNameSetsLengths()) { 1647 return; 1648 } 1649 1650 // build a char string with all chars that are used in character names 1651 for (char c = 255; c > 0; c --) { 1652 if (contains(set, c)) { 1653 uset.add(c); 1654 } 1655 } 1656 } 1657 } 1658