1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2011-2013, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: ppucd.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2011dec11 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __PPUCD_H__ 18 #define __PPUCD_H__ 19 20 #include "unicode/utypes.h" 21 #include "unicode/uniset.h" 22 #include "unicode/unistr.h" 23 24 #include <stdio.h> 25 26 /** Additions to the uchar.h enum UProperty. */ 27 enum { 28 /** Name_Alias */ 29 PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT, 30 PPUCD_CONDITIONAL_CASE_MAPPINGS, 31 PPUCD_TURKIC_CASE_FOLDING 32 }; 33 34 U_NAMESPACE_BEGIN 35 36 class U_TOOLUTIL_API PropertyNames { 37 public: 38 virtual ~PropertyNames(); 39 virtual int32_t getPropertyEnum(const char *name) const; 40 virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const; 41 }; 42 43 struct U_TOOLUTIL_API UniProps { 44 UniProps(); 45 ~UniProps(); 46 getIntPropUniProps47 int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; } 48 49 UChar32 start, end; 50 UBool binProps[UCHAR_BINARY_LIMIT]; 51 int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]; 52 UVersionInfo age; 53 UChar32 bmg, bpb; 54 UChar32 scf, slc, stc, suc; 55 int32_t digitValue; 56 const char *numericValue; 57 const char *name; 58 const char *nameAlias; 59 UnicodeString cf, lc, tc, uc; 60 UnicodeSet scx; 61 }; 62 63 class U_TOOLUTIL_API PreparsedUCD { 64 public: 65 enum LineType { 66 /** No line, end of file. */ 67 NO_LINE, 68 /** Empty line. (Might contain a comment.) */ 69 EMPTY_LINE, 70 71 /** ucd;6.1.0 */ 72 UNICODE_VERSION_LINE, 73 74 /** property;Binary;Alpha;Alphabetic */ 75 PROPERTY_LINE, 76 /** binary;N;No;F;False */ 77 BINARY_LINE, 78 /** value;gc;Zs;Space_Separator */ 79 VALUE_LINE, 80 81 /** defaults;0000..10FFFF;age=NA;bc=L;... */ 82 DEFAULTS_LINE, 83 /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */ 84 BLOCK_LINE, 85 /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */ 86 CP_LINE, 87 88 /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */ 89 ALG_NAMES_RANGE_LINE, 90 91 LINE_TYPE_COUNT 92 }; 93 94 /** 95 * Constructor. 96 * Prepare this object for a new, empty package. 97 */ 98 PreparsedUCD(const char *filename, UErrorCode &errorCode); 99 100 /** Destructor. */ 101 ~PreparsedUCD(); 102 103 /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */ setPropertyNames(const PropertyNames * pn)104 void setPropertyNames(const PropertyNames *pn) { pnames=pn; } 105 106 /** 107 * Reads a line from the preparsed UCD file. 108 * Splits the line by replacing each ';' with a NUL. 109 */ 110 LineType readLine(UErrorCode &errorCode); 111 112 /** Returns the number of the line read by readLine(). */ getLineNumber()113 int32_t getLineNumber() const { return lineNumber; } 114 115 /** Returns the line's next field, or NULL. */ 116 const char *nextField(); 117 118 /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */ getUnicodeVersion()119 const UVersionInfo &getUnicodeVersion() const { return ucdVersion; } 120 121 /** Returns TRUE if the current line has property values. */ lineHasPropertyValues()122 UBool lineHasPropertyValues() const { return DEFAULTS_LINE<=lineType && lineType<=CP_LINE; } 123 124 /** 125 * Parses properties from the current line. 126 * Clears newValues and sets UProperty codes for property values mentioned 127 * on the current line (as opposed to being inherited). 128 * Returns a pointer to the filled-in UniProps, or NULL if something went wrong. 129 * The returned UniProps are usable until the next line of the same type is read. 130 */ 131 const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode); 132 133 /** 134 * Returns the code point range for the current algnamesrange line. 135 * Calls & parses nextField(). 136 * Further nextField() calls will yield the range's type & prefix string. 137 * Returns U_SUCCESS(errorCode). 138 */ 139 UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode); 140 141 private: isLineBufferAvailable(int32_t i)142 UBool isLineBufferAvailable(int32_t i) { 143 return defaultLineIndex!=i && blockLineIndex!=i; 144 } 145 146 /** Resets the field iterator and returns the line's first field (the line type field). */ 147 const char *firstField(); 148 149 UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, 150 UErrorCode &errorCode); 151 UChar32 parseCodePoint(const char *s, UErrorCode &errorCode); 152 UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode); 153 void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode); 154 void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode); 155 156 static const int32_t kNumLineBuffers=3; 157 158 PropertyNames *icuPnames; // owned 159 const PropertyNames *pnames; // aliased 160 FILE *file; 161 int32_t defaultLineIndex, blockLineIndex, lineIndex; 162 int32_t lineNumber; 163 LineType lineType; 164 char *fieldLimit; 165 char *lineLimit; 166 167 UVersionInfo ucdVersion; 168 UniProps defaultProps, blockProps, cpProps; 169 // Multiple lines so that default and block properties can maintain pointers 170 // into their line buffers. 171 char lines[kNumLineBuffers][4096]; 172 }; 173 174 U_NAMESPACE_END 175 176 #endif // __PPUCD_H__ 177