1 /* 2 ********************************************************************** 3 * Copyright (c) 2002-2004, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * Author: Alan Liu 7 * Created: October 30 2002 8 * Since: ICU 2.4 9 ********************************************************************** 10 */ 11 #ifndef PROPNAME_H 12 #define PROPNAME_H 13 14 #include "unicode/utypes.h" 15 #include "unicode/uchar.h" 16 #include "udataswp.h" 17 #include "uprops.h" 18 19 /* 20 * This header defines the in-memory layout of the property names data 21 * structure representing the UCD data files PropertyAliases.txt and 22 * PropertyValueAliases.txt. It is used by: 23 * propname.cpp - reads data 24 * genpname - creates data 25 */ 26 27 /* low-level char * property name comparison -------------------------------- */ 28 29 U_CDECL_BEGIN 30 31 /** 32 * \var uprv_comparePropertyNames 33 * Unicode property names and property value names are compared "loosely". 34 * 35 * UCD.html 4.0.1 says: 36 * For all property names, property value names, and for property values for 37 * Enumerated, Binary, or Catalog properties, use the following 38 * loose matching rule: 39 * 40 * LM3. Ignore case, whitespace, underscore ('_'), and hyphens. 41 * 42 * This function does just that, for (char *) name strings. 43 * It is almost identical to ucnv_compareNames() but also ignores 44 * C0 White_Space characters (U+0009..U+000d, and U+0085 on EBCDIC). 45 * 46 * @internal 47 */ 48 49 U_CAPI int32_t U_EXPORT2 50 uprv_compareASCIIPropertyNames(const char *name1, const char *name2); 51 52 U_CAPI int32_t U_EXPORT2 53 uprv_compareEBCDICPropertyNames(const char *name1, const char *name2); 54 55 #if U_CHARSET_FAMILY==U_ASCII_FAMILY 56 # define uprv_comparePropertyNames uprv_compareASCIIPropertyNames 57 #elif U_CHARSET_FAMILY==U_EBCDIC_FAMILY 58 # define uprv_comparePropertyNames uprv_compareEBCDICPropertyNames 59 #else 60 # error U_CHARSET_FAMILY is not valid 61 #endif 62 63 U_CDECL_END 64 65 /* UDataMemory structure and signatures ------------------------------------- */ 66 67 #define PNAME_DATA_NAME "pnames" 68 #define PNAME_DATA_TYPE "icu" 69 70 /* Fields in UDataInfo: */ 71 72 /* PNAME_SIG[] is encoded as numeric literals for compatibility with the HP compiler */ 73 #define PNAME_SIG_0 ((uint8_t)0x70) /* p */ 74 #define PNAME_SIG_1 ((uint8_t)0x6E) /* n */ 75 #define PNAME_SIG_2 ((uint8_t)0x61) /* a */ 76 #define PNAME_SIG_3 ((uint8_t)0x6D) /* m */ 77 78 #define PNAME_FORMAT_VERSION ((int8_t)1) /* formatVersion[0] */ 79 80 /** 81 * Swap pnames.icu. See udataswp.h. 82 * @internal 83 */ 84 U_CAPI int32_t U_EXPORT2 85 upname_swap(const UDataSwapper *ds, 86 const void *inData, int32_t length, void *outData, 87 UErrorCode *pErrorCode); 88 89 90 #ifdef XP_CPLUSPLUS 91 92 class Builder; 93 94 U_NAMESPACE_BEGIN 95 96 /** 97 * An offset from the start of the pnames data to a contained entity. 98 * This must be a signed value, since negative offsets are used as an 99 * end-of-list marker. Offsets to actual objects are non-zero. A 100 * zero offset indicates an absent entry; this corresponds to aliases 101 * marked "n/a" in the original Unicode data files. 102 */ 103 typedef int16_t Offset; /* must be signed */ 104 105 #define MAX_OFFSET 0x7FFF 106 107 /** 108 * A generic value for a property or property value. Typically an 109 * enum from uchar.h, but sometimes a non-enum value. It must be 110 * large enough to accomodate the largest enum value, which as of this 111 * writing is the largest general category mask. Need not be signed 112 * but may be. Typically it doesn't matter, since the caller will 113 * cast it to the proper type before use. Takes the special value 114 * UCHAR_INVALID_CODE for invalid input. 115 */ 116 typedef int32_t EnumValue; 117 118 /* ---------------------------------------------------------------------- */ 119 /* ValueMap */ 120 121 /** 122 * For any top-level property that has named values (binary and 123 * enumerated properties), there is a ValueMap object. This object 124 * maps from enum values to two other maps. One goes from value enums 125 * to value names. The other goes from value names to value enums. 126 * 127 * The value enum values may be contiguous or disjoint. If they are 128 * contiguous then the enumToName_offset is nonzero, and the 129 * ncEnumToName_offset is zero. Vice versa if the value enums are 130 * disjoint. 131 * 132 * There are n of these objects, where n is the number of binary 133 * properties + the number of enumerated properties. 134 */ 135 struct ValueMap { 136 137 /* -- begin pnames data -- */ 138 /* Enum=>name EnumToOffset / NonContiguousEnumToOffset objects. */ 139 /* Exactly one of these will be nonzero. */ 140 Offset enumToName_offset; 141 Offset ncEnumToName_offset; 142 143 Offset nameToEnum_offset; /* Name=>enum data */ 144 /* -- end pnames data -- */ 145 }; 146 147 /* ---------------------------------------------------------------------- */ 148 /* PropertyAliases class */ 149 150 /** 151 * A class encapsulating access to the memory-mapped data representing 152 * property aliases and property value aliases (pnames). The class 153 * MUST have no v-table and declares certain methods inline -- small 154 * methods and methods that are called from only one point. 155 * 156 * The data members in this class correspond to the in-memory layout 157 * of the header of the pnames data. 158 */ 159 class PropertyAliases { 160 161 /* -- begin pnames data -- */ 162 /* Enum=>name EnumToOffset object for binary and enumerated */ 163 /* properties */ 164 Offset enumToName_offset; 165 166 /* Name=>enum data for binary & enumerated properties */ 167 Offset nameToEnum_offset; 168 169 /* Enum=>offset EnumToOffset object mapping enumerated properties */ 170 /* to ValueMap objects */ 171 Offset enumToValue_offset; 172 173 /* The following are needed by external readers of this data. */ 174 /* We don't use them ourselves. */ 175 int16_t total_size; /* size in bytes excluding the udata header */ 176 Offset valueMap_offset; /* offset to start of array */ 177 int16_t valueMap_count; /* number of entries */ 178 Offset nameGroupPool_offset; /* offset to start of array */ 179 int16_t nameGroupPool_count; /* number of entries (not groups) */ 180 Offset stringPool_offset; /* offset to start of pool */ 181 int16_t stringPool_count; /* number of strings (not size in bytes) */ 182 183 /* -- end pnames data -- */ 184 185 friend class ::Builder; 186 187 const ValueMap* getValueMap(EnumValue prop) const; 188 189 const char* chooseNameInGroup(Offset offset, 190 UPropertyNameChoice choice) const; 191 192 public: 193 getPointer(Offset o)194 inline const int8_t* getPointer(Offset o) const { 195 return ((const int8_t*) this) + o; 196 } 197 getPointerNull(Offset o)198 inline const int8_t* getPointerNull(Offset o) const { 199 return o ? getPointer(o) : NULL; 200 } 201 202 inline const char* getPropertyName(EnumValue prop, 203 UPropertyNameChoice choice) const; 204 205 inline EnumValue getPropertyEnum(const char* alias) const; 206 207 inline const char* getPropertyValueName(EnumValue prop, EnumValue value, 208 UPropertyNameChoice choice) const; 209 210 inline EnumValue getPropertyValueEnum(EnumValue prop, 211 const char* alias) const; 212 213 static int32_t 214 swap(const UDataSwapper *ds, 215 const uint8_t *inBytes, int32_t length, uint8_t *outBytes, 216 UErrorCode *pErrorCode); 217 }; 218 219 /* ---------------------------------------------------------------------- */ 220 /* EnumToOffset */ 221 222 /** 223 * A generic map from enum values to Offsets. The enum values must be 224 * contiguous, from enumStart to enumLimit. The Offset values may 225 * point to anything. 226 */ 227 class EnumToOffset { 228 229 /* -- begin pnames data -- */ 230 EnumValue enumStart; 231 EnumValue enumLimit; 232 Offset _offsetArray; /* [array of enumLimit-enumStart] */ 233 /* -- end pnames data -- */ 234 235 friend class ::Builder; 236 getOffsetArray()237 Offset* getOffsetArray() { 238 return &_offsetArray; 239 } 240 getOffsetArray()241 const Offset* getOffsetArray() const { 242 return &_offsetArray; 243 } 244 getSize(int32_t n)245 static int32_t getSize(int32_t n) { 246 return sizeof(EnumToOffset) + sizeof(Offset) * (n - 1); 247 } 248 getSize()249 int32_t getSize() { 250 return getSize(enumLimit - enumStart); 251 } 252 253 public: 254 getOffset(EnumValue enumProbe)255 Offset getOffset(EnumValue enumProbe) const { 256 if (enumProbe < enumStart || 257 enumProbe >= enumLimit) { 258 return 0; /* not found */ 259 } 260 const Offset* p = getOffsetArray(); 261 return p[enumProbe - enumStart]; 262 } 263 264 static int32_t 265 swap(const UDataSwapper *ds, 266 const uint8_t *inBytes, int32_t length, uint8_t *outBytes, 267 uint8_t *temp, int32_t pos, 268 UErrorCode *pErrorCode); 269 }; 270 271 /* ---------------------------------------------------------------------- */ 272 /* NonContiguousEnumToOffset */ 273 274 /** 275 * A generic map from enum values to Offsets. The enum values may be 276 * disjoint. If they are contiguous, an EnumToOffset should be used 277 * instead. The Offset values may point to anything. 278 */ 279 class NonContiguousEnumToOffset { 280 281 /* -- begin pnames data -- */ 282 int32_t count; 283 EnumValue _enumArray; /* [array of count] */ 284 /* Offset _offsetArray; // [array of count] after enumValue[count-1] */ 285 /* -- end pnames data -- */ 286 287 friend class ::Builder; 288 getEnumArray()289 EnumValue* getEnumArray() { 290 return &_enumArray; 291 } 292 getEnumArray()293 const EnumValue* getEnumArray() const { 294 return &_enumArray; 295 } 296 getOffsetArray()297 Offset* getOffsetArray() { 298 return (Offset*) (getEnumArray() + count); 299 } 300 getOffsetArray()301 const Offset* getOffsetArray() const { 302 return (Offset*) (getEnumArray() + count); 303 } 304 getSize(int32_t n)305 static int32_t getSize(int32_t n) { 306 return sizeof(int32_t) + (sizeof(EnumValue) + sizeof(Offset)) * n; 307 } 308 getSize()309 int32_t getSize() { 310 return getSize(count); 311 } 312 313 public: 314 getOffset(EnumValue enumProbe)315 Offset getOffset(EnumValue enumProbe) const { 316 const EnumValue* e = getEnumArray(); 317 const Offset* p = getOffsetArray(); 318 /* linear search; binary later if warranted */ 319 /* (binary is not faster for short lists) */ 320 for (int32_t i=0; i<count; ++i) { 321 if (e[i] < enumProbe) continue; 322 if (e[i] > enumProbe) break; 323 return p[i]; 324 } 325 return 0; /* not found */ 326 } 327 328 static int32_t 329 swap(const UDataSwapper *ds, 330 const uint8_t *inBytes, int32_t length, uint8_t *outBytes, 331 uint8_t *temp, int32_t pos, 332 UErrorCode *pErrorCode); 333 }; 334 335 /* ---------------------------------------------------------------------- */ 336 /* NameToEnum */ 337 338 /** 339 * A map from names to enum values. 340 */ 341 class NameToEnum { 342 343 /* -- begin pnames data -- */ 344 int32_t count; /* number of entries */ 345 EnumValue _enumArray; /* [array of count] EnumValues */ 346 /* Offset _nameArray; // [array of count] offsets to names */ 347 /* -- end pnames data -- */ 348 349 friend class ::Builder; 350 getEnumArray()351 EnumValue* getEnumArray() { 352 return &_enumArray; 353 } 354 getEnumArray()355 const EnumValue* getEnumArray() const { 356 return &_enumArray; 357 } 358 getNameArray()359 Offset* getNameArray() { 360 return (Offset*) (getEnumArray() + count); 361 } 362 getNameArray()363 const Offset* getNameArray() const { 364 return (Offset*) (getEnumArray() + count); 365 } 366 getSize(int32_t n)367 static int32_t getSize(int32_t n) { 368 return sizeof(int32_t) + (sizeof(Offset) + sizeof(EnumValue)) * n; 369 } 370 getSize()371 int32_t getSize() { 372 return getSize(count); 373 } 374 375 public: 376 getEnum(const char * alias,const PropertyAliases & data)377 EnumValue getEnum(const char* alias, const PropertyAliases& data) const { 378 379 const Offset* n = getNameArray(); 380 const EnumValue* e = getEnumArray(); 381 382 /* linear search; binary later if warranted */ 383 /* (binary is not faster for short lists) */ 384 for (int32_t i=0; i<count; ++i) { 385 const char* name = (const char*) data.getPointer(n[i]); 386 int32_t c = uprv_comparePropertyNames(alias, name); 387 if (c > 0) continue; 388 if (c < 0) break; 389 return e[i]; 390 } 391 392 return UCHAR_INVALID_CODE; 393 } 394 395 static int32_t 396 swap(const UDataSwapper *ds, 397 const uint8_t *inBytes, int32_t length, uint8_t *outBytes, 398 uint8_t *temp, int32_t pos, 399 UErrorCode *pErrorCode); 400 }; 401 402 /*---------------------------------------------------------------------- 403 * 404 * In-memory layout. THIS IS NOT A STANDALONE DOCUMENT. It goes 405 * together with above C++ declarations and gives an overview. 406 * 407 * See above for definitions of Offset and EnumValue. Also, refer to 408 * above class declarations for the "bottom line" on data layout. 409 * 410 * Sizes: 411 * '*_offset' is an Offset (see above) 412 * 'count' members are typically int32_t (see above declarations) 413 * 'enumArray' is an array of EnumValue (see above) 414 * 'offsetArray' is an array of Offset (see above) 415 * 'nameArray' is an array of Offset (see above) 416 * 'enum*' is an EnumValue (see above) 417 * '*Array [x n]' means that *Array has n elements 418 * 419 * References: 420 * Instead of pointers, this flat data structure contains offsets. 421 * All offsets are relative to the start of 'header'. A notation 422 * is used to indicate what structure each offset points to: 423 * 'foo (>x)' the offset(s) in foo point to structure x 424 * 425 * Structures: 426 * Each structure is assigned a number, except for the header, 427 * which is called 'header'. The numbers are not contiguous 428 * for historical reasons. Some structures have sub-parts 429 * that are denoted with a letter, e.g., "5a". 430 * 431 * BEGIN LAYOUT 432 * ============ 433 * header: 434 * enumToName_offset (>0) 435 * nameToEnum_offset (>2) 436 * enumToValue_offset (>3) 437 * (alignment padding build in to header) 438 * 439 * The header also contains the following, used by "external readers" 440 * like ICU4J and icuswap. 441 * 442 * // The following are needed by external readers of this data. 443 * // We don't use them ourselves. 444 * int16_t total_size; // size in bytes excluding the udata header 445 * Offset valueMap_offset; // offset to start of array 446 * int16_t valueMap_count; // number of entries 447 * Offset nameGroupPool_offset; // offset to start of array 448 * int16_t nameGroupPool_count; // number of entries (not groups) 449 * Offset stringPool_offset; // offset to start of pool 450 * int16_t stringPool_count; // number of strings (not size in bytes) 451 * 452 * 0: # NonContiguousEnumToOffset obj for props => name groups 453 * count 454 * enumArray [x count] 455 * offsetArray [x count] (>98) 456 * 457 * => pad to next 4-byte boundary 458 * 459 * (1: omitted -- no longer used) 460 * 461 * 2: # NameToEnum obj for binary & enumerated props 462 * count 463 * enumArray [x count] 464 * nameArray [x count] (>99) 465 * 466 * => pad to next 4-byte boundary 467 * 468 * 3: # NonContiguousEnumToOffset obj for enumerated props => ValueMaps 469 * count 470 * enumArray [x count] 471 * offsetArray [x count] (>4) 472 * 473 * => pad to next 4-byte boundary 474 * 475 * 4: # ValueMap array [x one for each enumerated prop i] 476 * enumToName_offset (>5a +2*i) one of these two is NULL, one is not 477 * ncEnumToName_offset (>5b +2*i) 478 * nameToEnums_offset (>6 +2*i) 479 * 480 * => pad to next 4-byte boundary 481 * 482 * for each enumerated prop (either 5a or 5b): 483 * 484 * 5a: # EnumToOffset for enumerated prop's values => name groups 485 * enumStart 486 * enumLimit 487 * offsetArray [x enumLimit - enumStart] (>98) 488 * 489 * => pad to next 4-byte boundary 490 * 491 * 5b: # NonContiguousEnumToOffset for enumerated prop's values => name groups 492 * count 493 * enumArray [x count] 494 * offsetArray [x count] (>98) 495 * 496 * => pad to next 4-byte boundary 497 * 498 * 6: # NameToEnum for enumerated prop's values 499 * count 500 * enumArray [x count] 501 * nameArray [x count] (>99) 502 * 503 * => pad to next 4-byte boundary 504 * 505 * 98: # name group pool {NGP} 506 * [array of Offset values] (>99) 507 * 508 * 99: # string pool {SP} 509 * [pool of nul-terminated char* strings] 510 */ 511 U_NAMESPACE_END 512 513 #endif /* C++ */ 514 515 #endif 516