1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2005-2012, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 10 #ifndef __CSRMBCS_H 11 #define __CSRMBCS_H 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_CONVERSION 16 17 #include "csrecog.h" 18 19 U_NAMESPACE_BEGIN 20 21 // "Character" iterated character class. 22 // Recognizers for specific mbcs encodings make their "characters" available 23 // by providing a nextChar() function that fills in an instance of IteratedChar 24 // with the next char from the input. 25 // The returned characters are not converted to Unicode, but remain as the raw 26 // bytes (concatenated into an int) from the codepage data. 27 // 28 // For Asian charsets, use the raw input rather than the input that has been 29 // stripped of markup. Detection only considers multi-byte chars, effectively 30 // stripping markup anyway, and double byte chars do occur in markup too. 31 // 32 class IteratedChar : public UMemory 33 { 34 public: 35 uint32_t charValue; // 1-4 bytes from the raw input data 36 int32_t index; 37 int32_t nextIndex; 38 UBool error; 39 UBool done; 40 41 public: 42 IteratedChar(); 43 //void reset(); 44 int32_t nextByte(InputText* det); 45 }; 46 47 48 class CharsetRecog_mbcs : public CharsetRecognizer { 49 50 protected: 51 /** 52 * Test the match of this charset with the input text data 53 * which is obtained via the CharsetDetector object. 54 * 55 * @param det The CharsetDetector, which contains the input text 56 * to be checked for being in this charset. 57 * @return Two values packed into one int (Damn java, anyhow) 58 * <br/> 59 * bits 0-7: the match confidence, ranging from 0-100 60 * <br/> 61 * bits 8-15: The match reason, an enum-like value. 62 */ 63 int32_t match_mbcs(InputText* det, const uint16_t commonChars[], int32_t commonCharsLen) const; 64 65 public: 66 67 virtual ~CharsetRecog_mbcs(); 68 69 /** 70 * Get the IANA name of this charset. 71 * @return the charset name. 72 */ 73 74 const char *getName() const = 0; 75 const char *getLanguage() const = 0; 76 UBool match(InputText* input, CharsetMatch *results) const = 0; 77 78 /** 79 * Get the next character (however many bytes it is) from the input data 80 * Subclasses for specific charset encodings must implement this function 81 * to get characters according to the rules of their encoding scheme. 82 * 83 * This function is not a method of class IteratedChar only because 84 * that would require a lot of extra derived classes, which is awkward. 85 * @param it The IteratedChar "struct" into which the returned char is placed. 86 * @param det The charset detector, which is needed to get at the input byte data 87 * being iterated over. 88 * @return True if a character was returned, false at end of input. 89 */ 90 virtual UBool nextChar(IteratedChar *it, InputText *textIn) const = 0; 91 92 }; 93 94 95 /** 96 * Shift-JIS charset recognizer. 97 * 98 */ 99 class CharsetRecog_sjis : public CharsetRecog_mbcs { 100 public: 101 virtual ~CharsetRecog_sjis(); 102 103 UBool nextChar(IteratedChar *it, InputText *det) const; 104 105 UBool match(InputText* input, CharsetMatch *results) const; 106 107 const char *getName() const; 108 const char *getLanguage() const; 109 110 }; 111 112 113 /** 114 * EUC charset recognizers. One abstract class that provides the common function 115 * for getting the next character according to the EUC encoding scheme, 116 * and nested derived classes for EUC_KR, EUC_JP, EUC_CN. 117 * 118 */ 119 class CharsetRecog_euc : public CharsetRecog_mbcs 120 { 121 public: 122 virtual ~CharsetRecog_euc(); 123 124 const char *getName() const = 0; 125 const char *getLanguage() const = 0; 126 127 UBool match(InputText* input, CharsetMatch *results) const = 0; 128 /* 129 * (non-Javadoc) 130 * Get the next character value for EUC based encodings. 131 * Character "value" is simply the raw bytes that make up the character 132 * packed into an int. 133 */ 134 UBool nextChar(IteratedChar *it, InputText *det) const; 135 }; 136 137 /** 138 * The charset recognize for EUC-JP. A singleton instance of this class 139 * is created and kept by the public CharsetDetector class 140 */ 141 class CharsetRecog_euc_jp : public CharsetRecog_euc 142 { 143 public: 144 virtual ~CharsetRecog_euc_jp(); 145 146 const char *getName() const; 147 const char *getLanguage() const; 148 149 UBool match(InputText* input, CharsetMatch *results) const; 150 }; 151 152 /** 153 * The charset recognize for EUC-KR. A singleton instance of this class 154 * is created and kept by the public CharsetDetector class 155 */ 156 class CharsetRecog_euc_kr : public CharsetRecog_euc 157 { 158 public: 159 virtual ~CharsetRecog_euc_kr(); 160 161 const char *getName() const; 162 const char *getLanguage() const; 163 164 UBool match(InputText* input, CharsetMatch *results) const; 165 }; 166 167 /** 168 * 169 * Big5 charset recognizer. 170 * 171 */ 172 class CharsetRecog_big5 : public CharsetRecog_mbcs 173 { 174 public: 175 virtual ~CharsetRecog_big5(); 176 177 UBool nextChar(IteratedChar* it, InputText* det) const; 178 179 const char *getName() const; 180 const char *getLanguage() const; 181 182 UBool match(InputText* input, CharsetMatch *results) const; 183 }; 184 185 186 /** 187 * 188 * GB-18030 recognizer. Uses simplified Chinese statistics. 189 * 190 */ 191 class CharsetRecog_gb_18030 : public CharsetRecog_mbcs 192 { 193 public: 194 virtual ~CharsetRecog_gb_18030(); 195 196 UBool nextChar(IteratedChar* it, InputText* det) const; 197 198 const char *getName() const; 199 const char *getLanguage() const; 200 201 UBool match(InputText* input, CharsetMatch *results) const; 202 }; 203 204 U_NAMESPACE_END 205 206 #endif 207 #endif /* __CSRMBCS_H */ 208