1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #ifndef __CSRMBCS_H 9 #define __CSRMBCS_H 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_CONVERSION 14 15 #include "csrecog.h" 16 17 U_NAMESPACE_BEGIN 18 19 // "Character" iterated character class. 20 // Recognizers for specific mbcs encodings make their "characters" available 21 // by providing a nextChar() function that fills in an instance of IteratedChar 22 // with the next char from the input. 23 // The returned characters are not converted to Unicode, but remain as the raw 24 // bytes (concatenated into an int) from the codepage data. 25 // 26 // For Asian charsets, use the raw input rather than the input that has been 27 // stripped of markup. Detection only considers multi-byte chars, effectively 28 // stripping markup anyway, and double byte chars do occur in markup too. 29 // 30 class IteratedChar : public UMemory 31 { 32 public: 33 uint32_t charValue; // 1-4 bytes from the raw input data 34 int32_t index; 35 int32_t nextIndex; 36 UBool error; 37 UBool done; 38 39 public: 40 IteratedChar(); 41 //void reset(); 42 int32_t nextByte(InputText* det); 43 }; 44 45 46 class CharsetRecog_mbcs : public CharsetRecognizer { 47 48 protected: 49 /** 50 * Test the match of this charset with the input text data 51 * which is obtained via the CharsetDetector object. 52 * 53 * @param det The CharsetDetector, which contains the input text 54 * to be checked for being in this charset. 55 * @return Two values packed into one int (Damn java, anyhow) 56 * <br/> 57 * bits 0-7: the match confidence, ranging from 0-100 58 * <br/> 59 * bits 8-15: The match reason, an enum-like value. 60 */ 61 int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const; 62 63 public: 64 65 virtual ~CharsetRecog_mbcs(); 66 67 /** 68 * Get the IANA name of this charset. 69 * @return the charset name. 70 */ 71 72 const char *getName() const = 0; 73 const char *getLanguage() const = 0; 74 UBool match(InputText* input, CharsetMatch *results) const = 0; 75 76 /** 77 * Get the next character (however many bytes it is) from the input data 78 * Subclasses for specific charset encodings must implement this function 79 * to get characters according to the rules of their encoding scheme. 80 * 81 * This function is not a method of class IteratedChar only because 82 * that would require a lot of extra derived classes, which is awkward. 83 * @param it The IteratedChar "struct" into which the returned char is placed. 84 * @param det The charset detector, which is needed to get at the input byte data 85 * being iterated over. 86 * @return True if a character was returned, false at end of input. 87 */ 88 virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0; 89 90 }; 91 92 93 /** 94 * Shift-JIS charset recognizer. 95 * 96 */ 97 class CharsetRecog_sjis : public CharsetRecog_mbcs { 98 public: 99 virtual ~CharsetRecog_sjis(); 100 101 UBool nextChar(IteratedChar *it, InputText *det) const; 102 103 UBool match(InputText* input, CharsetMatch *results) const; 104 105 const char *getName() const; 106 const char *getLanguage() const; 107 108 }; 109 110 111 /** 112 * EUC charset recognizers. One abstract class that provides the common function 113 * for getting the next character according to the EUC encoding scheme, 114 * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. 115 * 116 */ 117 class CharsetRecog_euc : public CharsetRecog_mbcs 118 { 119 public: 120 virtual ~CharsetRecog_euc(); 121 122 const char *getName() const = 0; 123 const char *getLanguage() const = 0; 124 125 UBool match(InputText* input, CharsetMatch *results) const = 0; 126 /* 127 * (non-Javadoc) 128 * Get the next character value for EUC based encodings. 129 * Character "value" is simply the raw bytes that make up the character 130 * packed into an int. 131 */ 132 UBool nextChar(IteratedChar *it, InputText *det) const; 133 }; 134 135 /** 136 * The charset recognize for EUC-JP. A singleton instance of this class 137 * is created and kept by the public CharsetDetector class 138 */ 139 class CharsetRecog_euc_jp : public CharsetRecog_euc 140 { 141 public: 142 virtual ~CharsetRecog_euc_jp(); 143 144 const char *getName() const; 145 const char *getLanguage() const; 146 147 UBool match(InputText* input, CharsetMatch *results) const; 148 }; 149 150 /** 151 * The charset recognize for EUC-KR. A singleton instance of this class 152 * is created and kept by the public CharsetDetector class 153 */ 154 class CharsetRecog_euc_kr : public CharsetRecog_euc 155 { 156 public: 157 virtual ~CharsetRecog_euc_kr(); 158 159 const char *getName() const; 160 const char *getLanguage() const; 161 162 UBool match(InputText* input, CharsetMatch *results) const; 163 }; 164 165 /** 166 * 167 * Big5 charset recognizer. 168 * 169 */ 170 class CharsetRecog_big5 : public CharsetRecog_mbcs 171 { 172 public: 173 virtual ~CharsetRecog_big5(); 174 175 UBool nextChar(IteratedChar* it, InputText* det) const; 176 177 const char *getName() const; 178 const char *getLanguage() const; 179 180 UBool match(InputText* input, CharsetMatch *results) const; 181 }; 182 183 184 /** 185 * 186 * GB-18030 recognizer. Uses simplified Chinese statistics. 187 * 188 */ 189 class CharsetRecog_gb_18030 : public CharsetRecog_mbcs 190 { 191 public: 192 virtual ~CharsetRecog_gb_18030(); 193 194 UBool nextChar(IteratedChar* it, InputText* det) const; 195 196 const char *getName() const; 197 const char *getLanguage() const; 198 199 UBool match(InputText* input, CharsetMatch *results) const; 200 }; 201 202 U_NAMESPACE_END 203 204 #endif 205 #endif /* __CSRMBCS_H */ 206