/* ******************************************************************************* * * Copyright (C) 2009-2011, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: normalizer2impl.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2009nov22 * created by: Markus W. Scherer */ #ifndef __NORMALIZER2IMPL_H__ #define __NORMALIZER2IMPL_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_NORMALIZATION #include "unicode/normalizer2.h" #include "unicode/udata.h" #include "unicode/unistr.h" #include "unicode/unorm.h" #include "mutex.h" #include "uset_imp.h" #include "utrie2.h" U_NAMESPACE_BEGIN struct CanonIterData; class Hangul { public: /* Korean Hangul and Jamo constants */ enum { JAMO_L_BASE=0x1100, /* "lead" jamo */ JAMO_V_BASE=0x1161, /* "vowel" jamo */ JAMO_T_BASE=0x11a7, /* "trail" jamo */ HANGUL_BASE=0xac00, JAMO_L_COUNT=19, JAMO_V_COUNT=21, JAMO_T_COUNT=28, JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT }; static inline UBool isHangul(UChar32 c) { return HANGUL_BASE<=c && c=MIN_NORMAL_MAYBE_YES) { return (uint8_t)norm16; } if(norm16=MIN_NORMAL_MAYBE_YES ? 
(uint8_t)norm16 : 0; } uint16_t getFCD16(UChar32 c) const { return UTRIE2_GET16(fcdTrie(), c); } uint16_t getFCD16FromSingleLead(UChar c) const { return UTRIE2_GET16_FROM_U16_SINGLE_LEAD(fcdTrie(), c); } uint16_t getFCD16FromSupplementary(UChar32 c) const { return UTRIE2_GET16_FROM_SUPP(fcdTrie(), c); } uint16_t getFCD16FromSurrogatePair(UChar c, UChar c2) const { return getFCD16FromSupplementary(U16_GET_SUPPLEMENTARY(c, c2)); } void setFCD16FromNorm16(UChar32 start, UChar32 end, uint16_t norm16, UTrie2 *newFCDTrie, UErrorCode &errorCode) const; void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16, CanonIterData &newData, UErrorCode &errorCode) const; /** * Get the decomposition for one code point. * @param c code point * @param buffer out-only buffer for algorithmic decompositions * @param length out-only, takes the length of the decomposition, if any * @return pointer to the decomposition, or NULL if none */ const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const; UBool isCanonSegmentStarter(UChar32 c) const; UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const; enum { MIN_CCC_LCCC_CP=0x300 }; enum { MIN_YES_YES_WITH_CC=0xff01, JAMO_VT=0xff00, MIN_NORMAL_MAYBE_YES=0xfe00, JAMO_L=1, MAX_DELTA=0x40 }; enum { // Byte offsets from the start of the data, after the generic header. IX_NORM_TRIE_OFFSET, IX_EXTRA_DATA_OFFSET, IX_RESERVED2_OFFSET, IX_RESERVED3_OFFSET, IX_RESERVED4_OFFSET, IX_RESERVED5_OFFSET, IX_RESERVED6_OFFSET, IX_TOTAL_SIZE, // Code point thresholds for quick check codes. IX_MIN_DECOMP_NO_CP, IX_MIN_COMP_NO_MAYBE_CP, // Norm16 value thresholds for quick check combinations and types of extra data. 
IX_MIN_YES_NO, IX_MIN_NO_NO, IX_LIMIT_NO_NO, IX_MIN_MAYBE_YES, IX_RESERVED14, IX_RESERVED15, IX_COUNT }; enum { MAPPING_HAS_CCC_LCCC_WORD=0x80, MAPPING_PLUS_COMPOSITION_LIST=0x40, MAPPING_NO_COMP_BOUNDARY_AFTER=0x20, MAPPING_LENGTH_MASK=0x1f }; enum { COMP_1_LAST_TUPLE=0x8000, COMP_1_TRIPLE=1, COMP_1_TRAIL_LIMIT=0x3400, COMP_1_TRAIL_MASK=0x7ffe, COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit COMP_2_TRAIL_SHIFT=6, COMP_2_TRAIL_MASK=0xffc0 }; // higher-level functionality ------------------------------------------ *** const UChar *decompose(const UChar *src, const UChar *limit, ReorderingBuffer *buffer, UErrorCode &errorCode) const; void decomposeAndAppend(const UChar *src, const UChar *limit, UBool doDecompose, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; UBool compose(const UChar *src, const UChar *limit, UBool onlyContiguous, UBool doCompose, ReorderingBuffer &buffer, UErrorCode &errorCode) const; const UChar *composeQuickCheck(const UChar *src, const UChar *limit, UBool onlyContiguous, UNormalizationCheckResult *pQCResult) const; void composeAndAppend(const UChar *src, const UChar *limit, UBool doCompose, UBool onlyContiguous, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; const UChar *makeFCD(const UChar *src, const UChar *limit, ReorderingBuffer *buffer, UErrorCode &errorCode) const; void makeFCDAndAppend(const UChar *src, const UChar *limit, UBool doMakeFCD, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; UBool hasDecompBoundary(UChar32 c, UBool before) const; UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); } UBool hasCompBoundaryBefore(UChar32 c) const { return c=minMaybeYes; } static UBool isInert(uint16_t norm16) { return norm16==0; } // static UBool isJamoL(uint16_t norm16) const { return norm16==1; } static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } UBool isHangul(uint16_t norm16) const { 
return norm16==minYesNo; } UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } // For use with isCompYes(). // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. // static uint8_t getCCFromYes(uint16_t norm16) { // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0; // } uint8_t getCCFromNoNo(uint16_t norm16) const { const uint16_t *mapping=getMapping(norm16); if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) { return (uint8_t)mapping[1]; } else { return 0; } } // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const; // Requires algorithmic-NoNo. UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const { return c+norm16-(minMaybeYes-MAX_DELTA-1); } // Requires minYesNo>7)&1); // +1 if MAPPING_HAS_CCC_LCCC_WORD } /** * @param c code point must have compositions * @return compositions list pointer */ const uint16_t *getCompositionsList(uint16_t norm16) const { return isDecompYes(norm16) ? 
getCompositionsListForDecompYes(norm16) : getCompositionsListForComposite(norm16); } const UChar *copyLowPrefixFromNulTerminated(const UChar *src, UChar32 minNeedDataCP, ReorderingBuffer *buffer, UErrorCode &errorCode) const; UBool decomposeShort(const UChar *src, const UChar *limit, ReorderingBuffer &buffer, UErrorCode &errorCode) const; UBool decompose(UChar32 c, uint16_t norm16, ReorderingBuffer &buffer, UErrorCode &errorCode) const; static int32_t combine(const uint16_t *list, UChar32 trail); void addComposites(const uint16_t *list, UnicodeSet &set) const; void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, UBool onlyContiguous) const; UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const; const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const; const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const; const UTrie2 *fcdTrie() const { return (const UTrie2 *)fcdTrieSingleton.fInstance; } const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const; const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const; int32_t getCanonValue(UChar32 c) const; const UnicodeSet &getCanonStartSet(int32_t n) const; UDataMemory *memory; UVersionInfo dataVersion; // Code point thresholds for quick check codes. UChar32 minDecompNoCP; UChar32 minCompNoMaybeCP; // Norm16 value thresholds for quick check combinations and types of extra data. 
uint16_t minYesNo; uint16_t minNoNo; uint16_t limitNoNo; uint16_t minMaybeYes; UTrie2 *normTrie; const uint16_t *maybeYesCompositions; const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters SimpleSingleton fcdTrieSingleton; SimpleSingleton canonIterDataSingleton; }; // bits in canonIterData #define CANON_NOT_SEGMENT_STARTER 0x80000000 #define CANON_HAS_COMPOSITIONS 0x40000000 #define CANON_HAS_SET 0x200000 #define CANON_VALUE_MASK 0x1fffff /** * ICU-internal shortcut for quick access to standard Unicode normalization. */ class U_COMMON_API Normalizer2Factory { public: static const Normalizer2 *getNFCInstance(UErrorCode &errorCode); static const Normalizer2 *getNFDInstance(UErrorCode &errorCode); static const Normalizer2 *getFCDInstance(UErrorCode &errorCode); static const Normalizer2 *getFCCInstance(UErrorCode &errorCode); static const Normalizer2 *getNFKCInstance(UErrorCode &errorCode); static const Normalizer2 *getNFKDInstance(UErrorCode &errorCode); static const Normalizer2 *getNFKC_CFInstance(UErrorCode &errorCode); static const Normalizer2 *getNoopInstance(UErrorCode &errorCode); static const Normalizer2 *getInstance(UNormalizationMode mode, UErrorCode &errorCode); static const Normalizer2Impl *getNFCImpl(UErrorCode &errorCode); static const Normalizer2Impl *getNFKCImpl(UErrorCode &errorCode); static const Normalizer2Impl *getNFKC_CFImpl(UErrorCode &errorCode); // Get the Impl instance of the Normalizer2. // Must be used only when it is known that norm2 is a Normalizer2WithImpl instance. static const Normalizer2Impl *getImpl(const Normalizer2 *norm2); static const UTrie2 *getFCDTrie(UErrorCode &errorCode); private: Normalizer2Factory(); // No instantiation. }; U_NAMESPACE_END U_CAPI int32_t U_EXPORT2 unorm2_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode); /** * Get the NF*_QC property for a code point, for u_getIntPropertyValue(). 
 * @internal
 */
U_CFUNC UNormalizationCheckResult U_EXPORT2
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);

/**
 * Internal API, used by collation code.
 * Get access to the internal FCD trie table to be able to perform
 * incremental, per-code unit, FCD checks in collation.
 * One pointer is sufficient because the trie index values are offset
 * by the index size, so that the same pointer is used to access the trie data.
 * Code points at fcdHighStart and above have a zero FCD value.
 *
 * @param fcdHighStart output; see the sentence above for its meaning
 * @param pErrorCode in/out error code
 * @return pointer that callers index as shown in unorm_getFCD16()
 * @internal
 */
U_CAPI const uint16_t * U_EXPORT2
unorm_getFCDTrieIndex(UChar32 &fcdHighStart, UErrorCode *pErrorCode);

/**
 * Internal API, used by collation code.
 * Get the FCD value for a code unit, with
 * bits 15..8   lead combining class
 * bits  7..0   trail combining class
 *
 * If c is a lead surrogate and the value is not 0,
 * then some of c's associated supplementary code points have a non-zero FCD value.
 *
 * @param fcdTrieIndex table indexed with the result of
 *        _UTRIE2_INDEX_FROM_U16_SINGLE_LEAD; presumably the pointer returned
 *        by unorm_getFCDTrieIndex() -- confirm
 * @param c UTF-16 code unit to look up
 * @return the 16-bit table entry selected for c
 * @internal
 */
static inline uint16_t
unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) {
    return fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
}

/**
 * Internal API, used by collation code.
 * Get the FCD value of the next code point (post-increment), with
 * bits 15..8   lead combining class
 * bits  7..0   trail combining class
 *
 * @param fcdTrieIndex table indexed as in unorm_getFCD16()
 * @param fcdHighStart threshold code point (its use lies in the part of the
 *        body that is not visible here)
 * @param s in/out pointer to the current code unit; advanced past what is read
 * @param limit end pointer, compared against s before a trail unit is read
 * @internal
 */
static inline uint16_t
unorm_nextFCD16(const uint16_t *fcdTrieIndex, UChar32 fcdHighStart,
                const UChar *&s, const UChar *limit) {
    // The first unit is read without comparing s to limit, so the caller
    // presumably guarantees s!=limit on entry -- confirm.
    UChar32 c=*s++;
    uint16_t fcd=fcdTrieIndex[_UTRIE2_INDEX_FROM_U16_SINGLE_LEAD(fcdTrieIndex, c)];
    // A following trail unit is only examined when the lead's own entry is
    // non-zero and c is a lead surrogate.
    if(fcd!=0 && U16_IS_LEAD(c)) {
        UChar c2;
        if(s!=limit && U16_IS_TRAIL(c2=*s)) {
            // Consume the trail unit and form the supplementary code point.
            ++s;
            c=U16_GET_SUPPLEMENTARY(c, c2);
            // NOTE(review): the rest of this function lies beyond the visible text.
            if(c