• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  **********************************************************************
5  *   Copyright (C) 2005-2015, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  **********************************************************************
8  */
9 
10 #ifndef __CSRSBCS_H
11 #define __CSRSBCS_H
12 
13 #include "unicode/uobject.h"
14 
15 #if !UCONFIG_NO_CONVERSION
16 
17 #include "csrecog.h"
18 
19 U_NAMESPACE_BEGIN
20 
21 class NGramParser : public UMemory
22 {
23 private:
24     int32_t ngram;
25     const int32_t *ngramList;
26 
27     int32_t ngramCount;
28     int32_t hitCount;
29 
30 protected:
31 	int32_t byteIndex;
32     const uint8_t *charMap;
33 
34 	void addByte(int32_t b);
35 
36 public:
37     NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap);
38     virtual ~NGramParser();
39 
40 private:
41     /*
42     * Binary search for value in table, which must have exactly 64 entries.
43     */
44     int32_t search(const int32_t *table, int32_t value);
45 
46     void lookup(int32_t thisNgram);
47 
48     virtual int32_t nextByte(InputText *det);
49 	virtual void parseCharacters(InputText *det);
50 
51 public:
52     int32_t parse(InputText *det);
53 
54 };
55 
56 #if !UCONFIG_ONLY_HTML_CONVERSION
57 class NGramParser_IBM420 : public NGramParser
58 {
59 public:
60     NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap);
61     ~NGramParser_IBM420();
62 
63 private:
64     int32_t alef;
65     int32_t isLamAlef(int32_t b);
66     int32_t nextByte(InputText *det) override;
67     void parseCharacters(InputText *det) override;
68 };
69 #endif
70 
71 
72 class CharsetRecog_sbcs : public CharsetRecognizer
73 {
74 public:
75     CharsetRecog_sbcs();
76     virtual ~CharsetRecog_sbcs();
77     virtual const char *getName() const override = 0;
78     virtual UBool match(InputText *det, CharsetMatch *results) const override = 0;
79     virtual int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const;
80 };
81 
82 class CharsetRecog_8859_1 : public CharsetRecog_sbcs
83 {
84 public:
85     virtual ~CharsetRecog_8859_1();
86     const char *getName() const override;
87     virtual UBool match(InputText *det, CharsetMatch *results) const override;
88 };
89 
90 class CharsetRecog_8859_2 : public CharsetRecog_sbcs
91 {
92 public:
93     virtual ~CharsetRecog_8859_2();
94     const char *getName() const override;
95     virtual UBool match(InputText *det, CharsetMatch *results) const override;
96 };
97 
98 class CharsetRecog_8859_5 : public CharsetRecog_sbcs
99 {
100 public:
101     virtual ~CharsetRecog_8859_5();
102     const char *getName() const override;
103 };
104 
105 class CharsetRecog_8859_6 : public CharsetRecog_sbcs
106 {
107 public:
108     virtual ~CharsetRecog_8859_6();
109 
110     const char *getName() const override;
111 };
112 
113 class CharsetRecog_8859_7 : public CharsetRecog_sbcs
114 {
115 public:
116     virtual ~CharsetRecog_8859_7();
117 
118     const char *getName() const override;
119 };
120 
121 class CharsetRecog_8859_8 : public CharsetRecog_sbcs
122 {
123 public:
124     virtual ~CharsetRecog_8859_8();
125 
126     virtual const char *getName() const override;
127 };
128 
129 class CharsetRecog_8859_9 : public CharsetRecog_sbcs
130 {
131 public:
132     virtual ~CharsetRecog_8859_9();
133 
134     const char *getName() const override;
135 };
136 
137 
138 
139 class CharsetRecog_8859_5_ru : public CharsetRecog_8859_5
140 {
141 public:
142     virtual ~CharsetRecog_8859_5_ru();
143 
144     const char *getLanguage() const override;
145 
146     virtual UBool match(InputText *det, CharsetMatch *results) const override;
147 };
148 
149 class CharsetRecog_8859_6_ar : public CharsetRecog_8859_6
150 {
151 public:
152     virtual ~CharsetRecog_8859_6_ar();
153 
154     const char *getLanguage() const override;
155 
156     virtual UBool match(InputText *det, CharsetMatch *results) const override;
157 };
158 
159 class CharsetRecog_8859_7_el : public CharsetRecog_8859_7
160 {
161 public:
162     virtual ~CharsetRecog_8859_7_el();
163 
164     const char *getLanguage() const override;
165 
166     virtual UBool match(InputText *det, CharsetMatch *results) const override;
167 };
168 
169 class CharsetRecog_8859_8_I_he : public CharsetRecog_8859_8
170 {
171 public:
172     virtual ~CharsetRecog_8859_8_I_he();
173 
174     const char *getName() const override;
175 
176     const char *getLanguage() const override;
177 
178     virtual UBool match(InputText *det, CharsetMatch *results) const override;
179 };
180 
181 class CharsetRecog_8859_8_he : public CharsetRecog_8859_8
182 {
183 public:
184     virtual ~CharsetRecog_8859_8_he ();
185 
186     const char *getLanguage() const override;
187 
188     virtual UBool match(InputText *det, CharsetMatch *results) const override;
189 };
190 
191 class CharsetRecog_8859_9_tr : public CharsetRecog_8859_9
192 {
193 public:
194     virtual ~CharsetRecog_8859_9_tr ();
195 
196     const char *getLanguage() const override;
197 
198     virtual UBool match(InputText *det, CharsetMatch *results) const override;
199 };
200 
201 class CharsetRecog_windows_1256 : public CharsetRecog_sbcs
202 {
203 public:
204     virtual ~CharsetRecog_windows_1256();
205 
206     const char *getName() const override;
207 
208     const char *getLanguage() const override;
209 
210     virtual UBool match(InputText *det, CharsetMatch *results) const override;
211 };
212 
213 class CharsetRecog_windows_1251 : public CharsetRecog_sbcs
214 {
215 public:
216     virtual ~CharsetRecog_windows_1251();
217 
218     const char *getName() const override;
219 
220     const char *getLanguage() const override;
221 
222     virtual UBool match(InputText *det, CharsetMatch *results) const override;
223 };
224 
225 
226 class CharsetRecog_KOI8_R : public CharsetRecog_sbcs
227 {
228 public:
229     virtual ~CharsetRecog_KOI8_R();
230 
231     const char *getName() const override;
232 
233     const char *getLanguage() const override;
234 
235     virtual UBool match(InputText *det, CharsetMatch *results) const override;
236 };
237 
238 #if !UCONFIG_ONLY_HTML_CONVERSION
239 class CharsetRecog_IBM424_he : public CharsetRecog_sbcs
240 {
241 public:
242     virtual ~CharsetRecog_IBM424_he();
243 
244     const char *getLanguage() const override;
245 };
246 
247 class CharsetRecog_IBM424_he_rtl : public CharsetRecog_IBM424_he {
248 public:
249     virtual ~CharsetRecog_IBM424_he_rtl();
250 
251     const char *getName() const override;
252 
253     virtual UBool match(InputText *det, CharsetMatch *results) const override;
254 };
255 
256 class CharsetRecog_IBM424_he_ltr : public CharsetRecog_IBM424_he {
257     virtual ~CharsetRecog_IBM424_he_ltr();
258 
259     const char *getName() const override;
260 
261     virtual UBool match(InputText *det, CharsetMatch *results) const override;
262 };
263 
264 class CharsetRecog_IBM420_ar : public CharsetRecog_sbcs
265 {
266 public:
267     virtual ~CharsetRecog_IBM420_ar();
268 
269     const char *getLanguage() const override;
270 	int32_t match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t charMap[]) const override;
271 
272 };
273 
274 class CharsetRecog_IBM420_ar_rtl : public CharsetRecog_IBM420_ar {
275 public:
276     virtual ~CharsetRecog_IBM420_ar_rtl();
277 
278     const char *getName() const override;
279 
280     virtual UBool match(InputText *det, CharsetMatch *results) const override;
281 };
282 
283 class CharsetRecog_IBM420_ar_ltr : public CharsetRecog_IBM420_ar {
284     virtual ~CharsetRecog_IBM420_ar_ltr();
285 
286     const char *getName() const override;
287 
288     virtual UBool match(InputText *det, CharsetMatch *results) const override;
289 };
290 #endif
291 
292 U_NAMESPACE_END
293 
294 #endif /* !UCONFIG_NO_CONVERSION */
295 #endif /* __CSRSBCS_H */
296