1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2011-2013, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: ppucd.h 9 * encoding: UTF-8 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2011dec11 14 * created by: Markus W. Scherer 15 */ 16 17 #ifndef __PPUCD_H__ 18 #define __PPUCD_H__ 19 20 #include "unicode/utypes.h" 21 #include "unicode/uniset.h" 22 #include "unicode/unistr.h" 23 24 #include <stdio.h> 25 26 /** Additions to the uchar.h enum UProperty. */ 27 enum { 28 /** Name_Alias */ 29 PPUCD_NAME_ALIAS=UCHAR_STRING_LIMIT, 30 PPUCD_CONDITIONAL_CASE_MAPPINGS, 31 PPUCD_TURKIC_CASE_FOLDING 32 }; 33 34 U_NAMESPACE_BEGIN 35 36 class U_TOOLUTIL_API PropertyNames { 37 public: 38 virtual ~PropertyNames(); 39 virtual int32_t getPropertyEnum(const char *name) const; 40 virtual int32_t getPropertyValueEnum(int32_t property, const char *name) const; 41 }; 42 43 struct U_TOOLUTIL_API UniProps { 44 UniProps(); 45 ~UniProps(); 46 getIntPropUniProps47 int32_t getIntProp(int32_t prop) const { return intProps[prop-UCHAR_INT_START]; } 48 49 UChar32 start, end; 50 UBool binProps[UCHAR_BINARY_LIMIT]; 51 int32_t intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]; 52 UVersionInfo age; 53 UChar32 bmg, bpb; 54 UChar32 scf, slc, stc, suc; 55 int32_t digitValue; 56 const char *numericValue; 57 const char *name; 58 const char *nameAlias; 59 UnicodeString cf, lc, tc, uc; 60 UnicodeSet scx; 61 }; 62 63 class U_TOOLUTIL_API PreparsedUCD { 64 public: 65 enum LineType { 66 /** No line, end of file. */ 67 NO_LINE, 68 /** Empty line. (Might contain a comment.) */ 69 EMPTY_LINE, 70 71 /** ucd;6.1.0 */ 72 UNICODE_VERSION_LINE, 73 74 /** property;Binary;Alpha;Alphabetic */ 75 PROPERTY_LINE, 76 /** binary;N;No;F;False */ 77 BINARY_LINE, 78 /** value;gc;Zs;Space_Separator */ 79 VALUE_LINE, 80 81 /** defaults;0000..10FFFF;age=NA;bc=L;... */ 82 DEFAULTS_LINE, 83 /** block;0000..007F;age=1.1;blk=ASCII;ea=Na;... */ 84 BLOCK_LINE, 85 /** cp;0030;AHex;bc=EN;gc=Nd;na=DIGIT ZERO;... */ 86 CP_LINE, 87 /** unassigned;E01F0..E0FFF;bc=BN;CWKCF;DI;GCB=CN;NFKC_CF= */ 88 UNASSIGNED_LINE, 89 90 /** algnamesrange;4E00..9FCC;han;CJK UNIFIED IDEOGRAPH- */ 91 ALG_NAMES_RANGE_LINE, 92 93 LINE_TYPE_COUNT 94 }; 95 96 /** 97 * Constructor. 98 * Prepare this object for a new, empty package. 99 */ 100 PreparsedUCD(const char *filename, UErrorCode &errorCode); 101 102 /** Destructor. */ 103 ~PreparsedUCD(); 104 105 /** Sets (aliases) a non-standard PropertyNames implementation. Caller retains ownership. */ setPropertyNames(const PropertyNames * pn)106 void setPropertyNames(const PropertyNames *pn) { pnames=pn; } 107 108 /** 109 * Reads a line from the preparsed UCD file. 110 * Splits the line by replacing each ';' with a NUL. 111 */ 112 LineType readLine(UErrorCode &errorCode); 113 114 /** Returns the number of the line read by readLine(). */ getLineNumber()115 int32_t getLineNumber() const { return lineNumber; } 116 117 /** Returns the line's next field, or NULL. */ 118 const char *nextField(); 119 120 /** Returns the Unicode version when or after the UNICODE_VERSION_LINE has been read. */ getUnicodeVersion()121 const UVersionInfo &getUnicodeVersion() const { return ucdVersion; } 122 123 /** Returns TRUE if the current line has property values. */ lineHasPropertyValues()124 UBool lineHasPropertyValues() const { 125 return DEFAULTS_LINE<=lineType && lineType<=UNASSIGNED_LINE; 126 } 127 128 /** 129 * Parses properties from the current line. 130 * Clears newValues and sets UProperty codes for property values mentioned 131 * on the current line (as opposed to being inherited). 132 * Returns a pointer to the filled-in UniProps, or NULL if something went wrong. 133 * The returned UniProps are usable until the next line of the same type is read. 134 */ 135 const UniProps *getProps(UnicodeSet &newValues, UErrorCode &errorCode); 136 137 /** 138 * Returns the code point range for the current algnamesrange line. 139 * Calls & parses nextField(). 140 * Further nextField() calls will yield the range's type & prefix string. 141 * Returns U_SUCCESS(errorCode). 142 */ 143 UBool getRangeForAlgNames(UChar32 &start, UChar32 &end, UErrorCode &errorCode); 144 145 private: isLineBufferAvailable(int32_t i)146 UBool isLineBufferAvailable(int32_t i) { 147 return defaultLineIndex!=i && blockLineIndex!=i; 148 } 149 150 /** Resets the field iterator and returns the line's first field (the line type field). */ 151 const char *firstField(); 152 153 UBool parseProperty(UniProps &props, const char *field, UnicodeSet &newValues, 154 UErrorCode &errorCode); 155 UChar32 parseCodePoint(const char *s, UErrorCode &errorCode); 156 UBool parseCodePointRange(const char *s, UChar32 &start, UChar32 &end, UErrorCode &errorCode); 157 void parseString(const char *s, UnicodeString &uni, UErrorCode &errorCode); 158 void parseScriptExtensions(const char *s, UnicodeSet &scx, UErrorCode &errorCode); 159 160 static const int32_t kNumLineBuffers=3; 161 162 PropertyNames *icuPnames; // owned 163 const PropertyNames *pnames; // aliased 164 FILE *file; 165 int32_t defaultLineIndex, blockLineIndex, lineIndex; 166 int32_t lineNumber; 167 LineType lineType; 168 char *fieldLimit; 169 char *lineLimit; 170 171 UVersionInfo ucdVersion; 172 UniProps defaultProps, blockProps, cpProps; 173 UnicodeSet blockValues; 174 // Multiple lines so that default and block properties can maintain pointers 175 // into their line buffers. 176 char lines[kNumLineBuffers][4096]; 177 }; 178 179 U_NAMESPACE_END 180 181 #endif // __PPUCD_H__ 182