1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2015, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #include "cmemory.h"
11
12 #if !UCONFIG_NO_CONVERSION
13 #include "csrsbcs.h"
14 #include "csmatch.h"
15
// Width of one n-gram in bytes (presumably; the visible code only relies on
// N_GRAM_MASK, which keeps three bytes).
#define N_GRAM_SIZE 3

// Keeps the low 24 bits of the rolling n-gram value.
#define N_GRAM_MASK 0xFFFFFF

// Element count of a true array. Do not pass a pointer: the result would be
// sizeof(pointer) / sizeof(element).
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
19
20 U_NAMESPACE_BEGIN
21
NGramParser(const int32_t * theNgramList,const uint8_t * theCharMap)22 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
23 : ngram(0), byteIndex(0)
24 {
25 ngramList = theNgramList;
26 charMap = theCharMap;
27
28 ngramCount = hitCount = 0;
29 }
30
~NGramParser()31 NGramParser::~NGramParser()
32 {
33 }
34
35 /*
36 * Binary search for value in table, which must have exactly 64 entries.
37 */
38
search(const int32_t * table,int32_t value)39 int32_t NGramParser::search(const int32_t *table, int32_t value)
40 {
41 int32_t index = 0;
42
43 if (table[index + 32] <= value) {
44 index += 32;
45 }
46
47 if (table[index + 16] <= value) {
48 index += 16;
49 }
50
51 if (table[index + 8] <= value) {
52 index += 8;
53 }
54
55 if (table[index + 4] <= value) {
56 index += 4;
57 }
58
59 if (table[index + 2] <= value) {
60 index += 2;
61 }
62
63 if (table[index + 1] <= value) {
64 index += 1;
65 }
66
67 if (table[index] > value) {
68 index -= 1;
69 }
70
71 if (index < 0 || table[index] != value) {
72 return -1;
73 }
74
75 return index;
76 }
77
lookup(int32_t thisNgram)78 void NGramParser::lookup(int32_t thisNgram)
79 {
80 ngramCount += 1;
81
82 if (search(ngramList, thisNgram) >= 0) {
83 hitCount += 1;
84 }
85
86 }
87
addByte(int32_t b)88 void NGramParser::addByte(int32_t b)
89 {
90 ngram = ((ngram << 8) + b) & N_GRAM_MASK;
91 lookup(ngram);
92 }
93
nextByte(InputText * det)94 int32_t NGramParser::nextByte(InputText *det)
95 {
96 if (byteIndex >= det->fInputLen) {
97 return -1;
98 }
99
100 return det->fInputBytes[byteIndex++];
101 }
102
parseCharacters(InputText * det)103 void NGramParser::parseCharacters(InputText *det)
104 {
105 int32_t b;
106 bool ignoreSpace = FALSE;
107
108 while ((b = nextByte(det)) >= 0) {
109 uint8_t mb = charMap[b];
110
111 // TODO: 0x20 might not be a space in all character sets...
112 if (mb != 0) {
113 if (!(mb == 0x20 && ignoreSpace)) {
114 addByte(mb);
115 }
116
117 ignoreSpace = (mb == 0x20);
118 }
119 }
120 }
121
parse(InputText * det)122 int32_t NGramParser::parse(InputText *det)
123 {
124 parseCharacters(det);
125
126 // TODO: Is this OK? The buffer could have ended in the middle of a word...
127 addByte(0x20);
128
129 double rawPercent = (double) hitCount / (double) ngramCount;
130
131 // if (rawPercent <= 2.0) {
132 // return 0;
133 // }
134
135 // TODO - This is a bit of a hack to take care of a case
136 // were we were getting a confidence of 135...
137 if (rawPercent > 0.33) {
138 return 98;
139 }
140
141 return (int32_t) (rawPercent * 300.0);
142 }
143
144 #if !UCONFIG_ONLY_HTML_CONVERSION
/*
 * Byte-to-byte table consulted by NGramParser_IBM420::nextByte() for every
 * input byte that is not handled as a lam-alef. Judging by its name it folds
 * the shaped (positional) forms of IBM420 Arabic letters onto one byte per
 * letter. Bytes below 0x40 all map to 0x40. Indexed by the input byte value;
 * each row below holds the 16 entries starting at the labelled index.
 */
static const uint8_t unshapeMap_IBM420[] = {
    /* 0x00 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x10 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x20 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x30 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x40 */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
    /* 0x50 */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
    /* 0x60 */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x70 */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
    /* 0x80 */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
    /* 0x90 */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
    /* 0xA0 */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
    /* 0xB0 */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
    /* 0xC0 */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
    /* 0xD0 */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
    /* 0xE0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xF0 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
164
NGramParser_IBM420(const int32_t * theNgramList,const uint8_t * theCharMap)165 NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap):NGramParser(theNgramList, theCharMap)
166 {
167 alef = 0x00;
168 }
169
~NGramParser_IBM420()170 NGramParser_IBM420::~NGramParser_IBM420() {}
171
isLamAlef(int32_t b)172 int32_t NGramParser_IBM420::isLamAlef(int32_t b)
173 {
174 if(b == 0xB2 || b == 0xB3){
175 return 0x47;
176 }else if(b == 0xB4 || b == 0xB5){
177 return 0x49;
178 }else if(b == 0xB8 || b == 0xB9){
179 return 0x56;
180 }else
181 return 0x00;
182 }
183
184 /*
185 * Arabic shaping needs to be done manually. Cannot call ArabicShaping class
186 * because CharsetDetector is dealing with bytes not Unicode code points. We could
187 * convert the bytes to Unicode code points but that would leave us dependent
188 * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
189 * of JDK can produce different results and therefore is also avoided.
190 */
nextByte(InputText * det)191 int32_t NGramParser_IBM420::nextByte(InputText *det)
192 {
193
194 if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) {
195 return -1;
196 }
197 int next;
198
199 alef = isLamAlef(det->fInputBytes[byteIndex]);
200 if(alef != 0x00)
201 next = 0xB1 & 0xFF;
202 else
203 next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF;
204
205 byteIndex++;
206
207 return next;
208 }
209
parseCharacters(InputText * det)210 void NGramParser_IBM420::parseCharacters(InputText *det)
211 {
212 int32_t b;
213 bool ignoreSpace = FALSE;
214
215 while ((b = nextByte(det)) >= 0) {
216 uint8_t mb = charMap[b];
217
218 // TODO: 0x20 might not be a space in all character sets...
219 if (mb != 0) {
220 if (!(mb == 0x20 && ignoreSpace)) {
221 addByte(mb);
222 }
223 ignoreSpace = (mb == 0x20);
224 }
225
226 if(alef != 0x00){
227 mb = charMap[alef & 0xFF];
228
229 // TODO: 0x20 might not be a space in all character sets...
230 if (mb != 0) {
231 if (!(mb == 0x20 && ignoreSpace)) {
232 addByte(mb);
233 }
234
235 ignoreSpace = (mb == 0x20);
236 }
237
238 }
239 }
240 }
241 #endif
242
CharsetRecog_sbcs()243 CharsetRecog_sbcs::CharsetRecog_sbcs()
244 {
245 // nothing else to do
246 }
247
~CharsetRecog_sbcs()248 CharsetRecog_sbcs::~CharsetRecog_sbcs()
249 {
250 // nothing to do
251 }
252
match_sbcs(InputText * det,const int32_t ngrams[],const uint8_t byteMap[]) const253 int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
254 {
255 NGramParser parser(ngrams, byteMap);
256 int32_t result;
257
258 result = parser.parse(det);
259
260 return result;
261 }
262
/*
 * Byte mapping table for ISO-8859-1, indexed by input byte value (256
 * entries, 16 per row; the label is the index of the first entry in the row).
 * ASCII letters map to their lower-case byte, 0x27 maps to 0x00, and every
 * other byte below 0x80 maps to 0x20. In the upper half, 0xC0-0xDE map onto
 * 0xE0-0xFE, except 0xD7 and 0xF7 which map to 0x20.
 */
static const uint8_t charMap_8859_1[] = {
    /* 0x00 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x10 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x20 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x30 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x40 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x60 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x80 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x90 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xA0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xB0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xC0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xD0 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
    /* 0xE0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xF0 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
297
/*
 * Byte mapping table for ISO-8859-2, indexed by input byte value (256
 * entries, 16 per row; the label is the index of the first entry in the row).
 * ASCII letters map to their lower-case byte, 0x27 maps to 0x00, and every
 * other byte below 0x80 maps to 0x20. In the upper half, several 0xA0-0xAF
 * bytes map onto their 0xB0-0xBF counterparts and 0xC0-0xDE map onto
 * 0xE0-0xFE (0xD7/0xF7 and 0xFF map to 0x20).
 */
static const uint8_t charMap_8859_2[] = {
    /* 0x00 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x10 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x20 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x30 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x40 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x60 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x80 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x90 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xA0 */ 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20, 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
    /* 0xB0 */ 0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7, 0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
    /* 0xC0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xD0 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
    /* 0xE0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xF0 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
};
332
/*
 * Byte mapping table for ISO-8859-5, indexed by input byte value (256
 * entries, 16 per row; the label is the index of the first entry in the row).
 * ASCII letters map to their lower-case byte, 0x27 maps to 0x00, and every
 * other byte below 0x80 maps to 0x20. In the upper half, 0xA1-0xAF map onto
 * 0xF1-0xFF and 0xB0-0xCF map onto 0xD0-0xEF (0xA0, 0xAD, 0xF0 and 0xFD map
 * to 0x20).
 */
static const uint8_t charMap_8859_5[] = {
    /* 0x00 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x10 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x20 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x30 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x40 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x60 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x80 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x90 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xA0 */ 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
    /* 0xB0 */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    /* 0xC0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xD0 */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    /* 0xE0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xF0 */ 0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
};
367
/*
 * Byte mapping table for ISO-8859-6, indexed by input byte value (256
 * entries, 16 per row; the label is the index of the first entry in the row).
 * ASCII letters map to their lower-case byte, 0x27 maps to 0x00, and every
 * other byte below 0x80 maps to 0x20. In the upper half only 0xC1-0xDA and
 * 0xE0-0xEA are kept (mapped to themselves); everything else maps to 0x20.
 */
static const uint8_t charMap_8859_6[] = {
    /* 0x00 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x10 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x20 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x30 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x40 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x60 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x80 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x90 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xA0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xB0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xC0 */ 0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    /* 0xD0 */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xE0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xF0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
};
402
/*
 * Byte mapping table for ISO-8859-7, indexed by input byte value (256
 * entries, 16 per row; the label is the index of the first entry in the row).
 * ASCII letters map to their lower-case byte, 0x27 maps to 0x00, and every
 * other byte below 0x80 maps to 0x20. In the upper half most of 0xC1-0xDB
 * map onto 0xE1-0xFB, and a handful of 0xB6-0xBF bytes map onto bytes in the
 * 0xDC-0xFE range.
 */
static const uint8_t charMap_8859_7[] = {
    /* 0x00 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x10 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x20 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x30 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x40 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x60 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x80 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x90 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xA0 */ 0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xB0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20, 0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
    /* 0xC0 */ 0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xD0 */ 0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
    /* 0xE0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xF0 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
};
437
/*
 * Byte mapping table for ISO-8859-8, indexed by input byte value (256
 * entries, 16 per row; the label is the index of the first entry in the row).
 * ASCII letters map to their lower-case byte, 0x27 maps to 0x00, and every
 * other byte below 0x80 maps to 0x20. In the upper half only 0xB5 and
 * 0xE0-0xFA are kept (mapped to themselves); everything else maps to 0x20.
 */
static const uint8_t charMap_8859_8[] = {
    /* 0x00 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x10 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x20 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x30 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x40 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x60 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x80 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x90 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xA0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xB0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xC0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xD0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xE0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xF0 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
};
472
/*
 * Byte mapping table for ISO-8859-9, indexed by input byte value (256
 * entries, 16 per row; the label is the index of the first entry in the row).
 * ASCII letters map to their lower-case byte, 0x27 maps to 0x00, and every
 * other byte below 0x80 maps to 0x20. The upper half matches the pattern of
 * 0xC0-0xDE mapping onto 0xE0-0xFE, except that 0xDD maps to 0x69 (ASCII 'i')
 * and 0xD7/0xF7 map to 0x20.
 */
static const uint8_t charMap_8859_9[] = {
    /* 0x00 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x10 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x20 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x30 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x40 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x60 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x80 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x90 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xA0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xB0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xC0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xD0 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
    /* 0xE0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xF0 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
507
/*
 * N-gram table for windows-1251: 64 three-byte values, one per int32_t, in
 * ascending order (the 64-entry binary search in NGramParser::search()
 * depends on both the count and the ordering).
 */
static const int32_t ngrams_windows_1251[] = {
    0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0,
    0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
    0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2,
    0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
    0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF,
    0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
    0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2,
    0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
};
514
/*
 * Byte mapping table for windows-1251, indexed by input byte value (256
 * entries, 16 per row; the label is the index of the first entry in the row).
 * ASCII letters map to their lower-case byte, 0x27 maps to 0x00, and every
 * other byte below 0x80 maps to 0x20. In the upper half, 0xC0-0xDF map onto
 * 0xE0-0xFF, and a number of bytes in 0x80-0xBF are folded pairwise.
 */
static const uint8_t charMap_windows_1251[] = {
    /* 0x00 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x10 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x20 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x30 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x40 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x60 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x80 */ 0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
    /* 0x90 */ 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
    /* 0xA0 */ 0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20, 0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
    /* 0xB0 */ 0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20, 0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
    /* 0xC0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xD0 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
    /* 0xE0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xF0 */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
549
/*
 * N-gram table for windows-1256: 64 three-byte values, one per int32_t, in
 * ascending order (the 64-entry binary search in NGramParser::search()
 * depends on both the count and the ordering).
 */
static const int32_t ngrams_windows_1256[] = {
    0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7,
    0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
    0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3,
    0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
    0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920,
    0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
    0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1,
    0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
};
556
/*
 * Byte mapping table for windows-1256, indexed by input byte value (256
 * entries, 16 per row; the label is the index of the first entry in the row).
 * ASCII letters map to their lower-case byte, 0x27 maps to 0x00, and every
 * other byte below 0x80 maps to 0x20. In the upper half most of 0xC0-0xEF
 * map to themselves (0xD7 maps to 0x20) and 0x8C maps to 0x9C.
 */
static const uint8_t charMap_windows_1256[] = {
    /* 0x00 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x10 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x20 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x30 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x40 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x60 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x80 */ 0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20, 0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
    /* 0x90 */ 0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
    /* 0xA0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xB0 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xC0 */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    /* 0xD0 */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    /* 0xE0 */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    /* 0xF0 */ 0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20, 0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
};
591
/*
 * N-gram table for KOI8-R: 64 three-byte values, one per int32_t, in
 * ascending order (the 64-entry binary search in NGramParser::search()
 * depends on both the count and the ordering).
 */
static const int32_t ngrams_KOI8_R[] = {
    0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF,
    0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
    0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420,
    0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
    0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3,
    0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
    0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1,
    0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
};
598
/*
 * Byte mapping table for KOI8-R, indexed by input byte value (256 entries,
 * 16 per row; the label is the index of the first entry in the row).
 * ASCII letters map to their lower-case byte, 0x27 maps to 0x00, and every
 * other byte below 0x80 maps to 0x20. In the upper half, 0xE0-0xFF map onto
 * 0xC0-0xDF, 0xC0-0xDF map to themselves, and 0xA3/0xB3 both map to 0xA3.
 */
static const uint8_t charMap_KOI8_R[] = {
    /* 0x00 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x10 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x20 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x30 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x40 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x50 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x60 */ 0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x80 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0x90 */ 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xA0 */ 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xB0 */ 0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    /* 0xC0 */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    /* 0xD0 */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    /* 0xE0 */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    /* 0xF0 */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
};
633
634 #if !UCONFIG_ONLY_HTML_CONVERSION
/*
 * N-gram table for IBM424 Hebrew, right-to-left variant (per the name):
 * 64 three-byte values, one per int32_t, in ascending order (the 64-entry
 * binary search in NGramParser::search() depends on both the count and the
 * ordering).
 */
static const int32_t ngrams_IBM424_he_rtl[] = {
    0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546,
    0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
    0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056,
    0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
    0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041,
    0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
    0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045,
    0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
};
641
/*
 * N-gram table for IBM424 Hebrew, left-to-right variant (per the name):
 * 64 three-byte values, one per int32_t, in ascending order (the 64-entry
 * binary search in NGramParser::search() depends on both the count and the
 * ordering).
 */
static const int32_t ngrams_IBM424_he_ltr[] = {
    0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462,
    0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
    0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645,
    0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
    0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140,
    0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
    0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540,
    0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
};
648
/*
 * Byte mapping table for IBM424 Hebrew, indexed by input byte value (256
 * entries, 16 per row; the label is the index of the first entry in the row).
 * Most bytes map to 0x40; 0x7D maps to 0x00; 0xC1-0xC9 and 0xD1-0xD9 map onto
 * 0x81-0x89 and 0x91-0x99, and 0xE2-0xE9 map onto 0xA2-0xA9.
 */
static const uint8_t charMap_IBM424_he[] = {
    /* 0x00 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x10 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x20 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x30 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x40 */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x50 */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x60 */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x70 */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
    /* 0x80 */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x90 */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0xA0 */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0xB0 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0xC0 */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0xD0 */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0xE0 */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0xF0 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
};
668
/*
 * N-gram table for IBM420 Arabic, right-to-left variant (per the name):
 * 64 three-byte values, one per int32_t, in ascending order (the 64-entry
 * binary search in NGramParser::search() depends on both the count and the
 * ordering).
 */
static const int32_t ngrams_IBM420_ar_rtl[] = {
    0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56,
    0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
    0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB,
    0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
    0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240,
    0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
    0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1,
    0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
};
675
/*
 * N-gram table for IBM420 Arabic, left-to-right variant (per the name):
 * 64 three-byte values, one per int32_t, in ascending order (the 64-entry
 * binary search in NGramParser::search() depends on both the count and the
 * ordering).
 */
static const int32_t ngrams_IBM420_ar_ltr[] = {
    0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC,
    0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
    0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062,
    0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
    0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156,
    0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
    0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156,
    0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
};
682
/*
 * Byte mapping table for IBM420 Arabic, indexed by input byte value (256
 * entries, 16 per row; the label is the index of the first entry in the row).
 * Bytes below 0x42 map to 0x40; 0xC1-0xC9 and 0xD1-0xD9 map onto 0x81-0x89
 * and 0x91-0x99, and 0xE2-0xE9 map onto 0xA2-0xA9. Unlike the other tables
 * in this group, no entry maps to 0x00.
 */
static const uint8_t charMap_IBM420_ar[] = {
    /* 0x00 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x10 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x20 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x30 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x40 */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x50 */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x60 */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x70 */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
    /* 0x80 */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    /* 0x90 */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
    /* 0xA0 */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
    /* 0xB0 */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    /* 0xC0 */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
    /* 0xD0 */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    /* 0xE0 */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
    /* 0xF0 */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
};
702 #endif
703
704 //ISO-8859-1,2,5,6,7,8,9 Ngrams
705
/*
 * Pairs one 64-entry n-gram table with the language code that is
 * reported when that table produces the best match.
 */
struct NGramsPlusLang {
    const int32_t ngrams[64];   /* n-gram values for one language */
    const char   *lang;         /* language code, e.g. "en" */
};
710
711 static const NGramsPlusLang ngrams_8859_1[] = {
712 {
713 {
714 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
715 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
716 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
717 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
718 },
719 "en"
720 },
721 {
722 {
723 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
724 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
725 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
726 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
727 },
728 "da"
729 },
730 {
731 {
732 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
733 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
734 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
735 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
736 },
737 "de"
738 },
739 {
740 {
741 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
742 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
743 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
744 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
745 },
746 "es"
747 },
748 {
749 {
750 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
751 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
752 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
753 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
754 },
755 "fr"
756 },
757 {
758 {
759 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
760 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
761 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
762 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
763 },
764 "it"
765 },
766 {
767 {
768 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
769 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
770 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
771 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
772 },
773 "nl"
774 },
775 {
776 {
777 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
778 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
779 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
780 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
781 },
782 "no"
783 },
784 {
785 {
786 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
787 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
788 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
789 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
790 },
791 "pt"
792 },
793 {
794 {
795 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
796 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
797 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
798 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
799 },
800 "sv"
801 }
802 };
803
804
805 static const NGramsPlusLang ngrams_8859_2[] = {
806 {
807 {
808 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
809 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
810 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
811 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
812 },
813 "cs"
814 },
815 {
816 {
817 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
818 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
819 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
820 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
821 },
822 "hu"
823 },
824 {
825 {
826 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
827 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
828 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
829 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
830 },
831 "pl"
832 },
833 {
834 {
835 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
836 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
837 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
838 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
839 },
840 "ro"
841 }
842 };
843
/*
 * N-gram table that CharsetRecog_8859_5_ru::match() hands to match_sbcs().
 * 64 values in ascending order, none wider than 24 bits.
 */
static const int32_t ngrams_8859_5_ru[] = {
    0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0,
    0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
    0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2,
    0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
    0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF,
    0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
    0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2,
    0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
};
850
/*
 * N-gram table that CharsetRecog_8859_6_ar::match() hands to match_sbcs().
 * 64 values in ascending order, none wider than 24 bits.
 */
static const int32_t ngrams_8859_6_ar[] = {
    0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7,
    0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
    0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5,
    0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
    0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920,
    0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
    0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4,
    0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
};
857
/*
 * N-gram table that CharsetRecog_8859_7_el::match() hands to match_sbcs().
 * 64 values in ascending order, none wider than 24 bits.
 */
static const int32_t ngrams_8859_7_el[] = {
    0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5,
    0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
    0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220,
    0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
    0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0,
    0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
    0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9,
    0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
};
864
/*
 * N-gram table that CharsetRecog_8859_8_I_he::match() hands to match_sbcs().
 * 64 values in ascending order, none wider than 24 bits.
 */
static const int32_t ngrams_8859_8_I_he[] = {
    0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5,
    0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
    0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE,
    0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
    0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0,
    0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
    0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4,
    0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
};
871
/*
 * N-gram table that CharsetRecog_8859_8_he::match() hands to match_sbcs().
 * 64 values in ascending order, none wider than 24 bits.
 */
static const int32_t ngrams_8859_8_he[] = {
    0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2,
    0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
    0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4,
    0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
    0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020,
    0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
    0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420,
    0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
};
878
/*
 * N-gram table that CharsetRecog_8859_9_tr::match() hands to match_sbcs().
 * 64 values in ascending order, none wider than 24 bits.
 */
static const int32_t ngrams_8859_9_tr[] = {
    0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C,
    0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
    0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261,
    0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
    0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20,
    0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
    0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E,
    0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
};
885
~CharsetRecog_8859_1()886 CharsetRecog_8859_1::~CharsetRecog_8859_1()
887 {
888 // nothing to do
889 }
890
match(InputText * textIn,CharsetMatch * results) const891 UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const {
892 const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1";
893 uint32_t i;
894 int32_t bestConfidenceSoFar = -1;
895 for (i=0; i < ARRAY_SIZE(ngrams_8859_1) ; i++) {
896 const int32_t *ngrams = ngrams_8859_1[i].ngrams;
897 const char *lang = ngrams_8859_1[i].lang;
898 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1);
899 if (confidence > bestConfidenceSoFar) {
900 results->set(textIn, this, confidence, name, lang);
901 bestConfidenceSoFar = confidence;
902 }
903 }
904 return (bestConfidenceSoFar > 0);
905 }
906
getName() const907 const char *CharsetRecog_8859_1::getName() const
908 {
909 return "ISO-8859-1";
910 }
911
912
~CharsetRecog_8859_2()913 CharsetRecog_8859_2::~CharsetRecog_8859_2()
914 {
915 // nothing to do
916 }
917
match(InputText * textIn,CharsetMatch * results) const918 UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const {
919 const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2";
920 uint32_t i;
921 int32_t bestConfidenceSoFar = -1;
922 for (i=0; i < ARRAY_SIZE(ngrams_8859_2) ; i++) {
923 const int32_t *ngrams = ngrams_8859_2[i].ngrams;
924 const char *lang = ngrams_8859_2[i].lang;
925 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2);
926 if (confidence > bestConfidenceSoFar) {
927 results->set(textIn, this, confidence, name, lang);
928 bestConfidenceSoFar = confidence;
929 }
930 }
931 return (bestConfidenceSoFar > 0);
932 }
933
getName() const934 const char *CharsetRecog_8859_2::getName() const
935 {
936 return "ISO-8859-2";
937 }
938
939
~CharsetRecog_8859_5()940 CharsetRecog_8859_5::~CharsetRecog_8859_5()
941 {
942 // nothing to do
943 }
944
getName() const945 const char *CharsetRecog_8859_5::getName() const
946 {
947 return "ISO-8859-5";
948 }
949
~CharsetRecog_8859_5_ru()950 CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
951 {
952 // nothing to do
953 }
954
getLanguage() const955 const char *CharsetRecog_8859_5_ru::getLanguage() const
956 {
957 return "ru";
958 }
959
match(InputText * textIn,CharsetMatch * results) const960 UBool CharsetRecog_8859_5_ru::match(InputText *textIn, CharsetMatch *results) const
961 {
962 int32_t confidence = match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
963 results->set(textIn, this, confidence);
964 return (confidence > 0);
965 }
966
~CharsetRecog_8859_6()967 CharsetRecog_8859_6::~CharsetRecog_8859_6()
968 {
969 // nothing to do
970 }
971
getName() const972 const char *CharsetRecog_8859_6::getName() const
973 {
974 return "ISO-8859-6";
975 }
976
~CharsetRecog_8859_6_ar()977 CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
978 {
979 // nothing to do
980 }
981
getLanguage() const982 const char *CharsetRecog_8859_6_ar::getLanguage() const
983 {
984 return "ar";
985 }
986
match(InputText * textIn,CharsetMatch * results) const987 UBool CharsetRecog_8859_6_ar::match(InputText *textIn, CharsetMatch *results) const
988 {
989 int32_t confidence = match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
990 results->set(textIn, this, confidence);
991 return (confidence > 0);
992 }
993
~CharsetRecog_8859_7()994 CharsetRecog_8859_7::~CharsetRecog_8859_7()
995 {
996 // nothing to do
997 }
998
getName() const999 const char *CharsetRecog_8859_7::getName() const
1000 {
1001 return "ISO-8859-7";
1002 }
1003
~CharsetRecog_8859_7_el()1004 CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
1005 {
1006 // nothing to do
1007 }
1008
getLanguage() const1009 const char *CharsetRecog_8859_7_el::getLanguage() const
1010 {
1011 return "el";
1012 }
1013
match(InputText * textIn,CharsetMatch * results) const1014 UBool CharsetRecog_8859_7_el::match(InputText *textIn, CharsetMatch *results) const
1015 {
1016 const char *name = textIn->fC1Bytes? "windows-1253" : "ISO-8859-7";
1017 int32_t confidence = match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
1018 results->set(textIn, this, confidence, name, "el");
1019 return (confidence > 0);
1020 }
1021
~CharsetRecog_8859_8()1022 CharsetRecog_8859_8::~CharsetRecog_8859_8()
1023 {
1024 // nothing to do
1025 }
1026
getName() const1027 const char *CharsetRecog_8859_8::getName() const
1028 {
1029 return "ISO-8859-8";
1030 }
1031
~CharsetRecog_8859_8_I_he()1032 CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
1033 {
1034 // nothing to do
1035 }
1036
getName() const1037 const char *CharsetRecog_8859_8_I_he::getName() const
1038 {
1039 return "ISO-8859-8-I";
1040 }
1041
getLanguage() const1042 const char *CharsetRecog_8859_8_I_he::getLanguage() const
1043 {
1044 return "he";
1045 }
1046
match(InputText * textIn,CharsetMatch * results) const1047 UBool CharsetRecog_8859_8_I_he::match(InputText *textIn, CharsetMatch *results) const
1048 {
1049 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8-I";
1050 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
1051 results->set(textIn, this, confidence, name, "he");
1052 return (confidence > 0);
1053 }
1054
~CharsetRecog_8859_8_he()1055 CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
1056 {
1057 // od ot gnihton
1058 }
1059
getLanguage() const1060 const char *CharsetRecog_8859_8_he::getLanguage() const
1061 {
1062 return "he";
1063 }
1064
match(InputText * textIn,CharsetMatch * results) const1065 UBool CharsetRecog_8859_8_he::match(InputText *textIn, CharsetMatch *results) const
1066 {
1067 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8";
1068 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
1069 results->set(textIn, this, confidence, name, "he");
1070 return (confidence > 0);
1071 }
1072
~CharsetRecog_8859_9()1073 CharsetRecog_8859_9::~CharsetRecog_8859_9()
1074 {
1075 // nothing to do
1076 }
1077
getName() const1078 const char *CharsetRecog_8859_9::getName() const
1079 {
1080 return "ISO-8859-9";
1081 }
1082
~CharsetRecog_8859_9_tr()1083 CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
1084 {
1085 // nothing to do
1086 }
1087
getLanguage() const1088 const char *CharsetRecog_8859_9_tr::getLanguage() const
1089 {
1090 return "tr";
1091 }
1092
match(InputText * textIn,CharsetMatch * results) const1093 UBool CharsetRecog_8859_9_tr::match(InputText *textIn, CharsetMatch *results) const
1094 {
1095 const char *name = textIn->fC1Bytes? "windows-1254" : "ISO-8859-9";
1096 int32_t confidence = match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
1097 results->set(textIn, this, confidence, name, "tr");
1098 return (confidence > 0);
1099 }
1100
~CharsetRecog_windows_1256()1101 CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
1102 {
1103 // nothing to do
1104 }
1105
getName() const1106 const char *CharsetRecog_windows_1256::getName() const
1107 {
1108 return "windows-1256";
1109 }
1110
getLanguage() const1111 const char *CharsetRecog_windows_1256::getLanguage() const
1112 {
1113 return "ar";
1114 }
1115
match(InputText * textIn,CharsetMatch * results) const1116 UBool CharsetRecog_windows_1256::match(InputText *textIn, CharsetMatch *results) const
1117 {
1118 int32_t confidence = match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
1119 results->set(textIn, this, confidence);
1120 return (confidence > 0);
1121 }
1122
~CharsetRecog_windows_1251()1123 CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
1124 {
1125 // nothing to do
1126 }
1127
getName() const1128 const char *CharsetRecog_windows_1251::getName() const
1129 {
1130 return "windows-1251";
1131 }
1132
getLanguage() const1133 const char *CharsetRecog_windows_1251::getLanguage() const
1134 {
1135 return "ru";
1136 }
1137
match(InputText * textIn,CharsetMatch * results) const1138 UBool CharsetRecog_windows_1251::match(InputText *textIn, CharsetMatch *results) const
1139 {
1140 int32_t confidence = match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
1141 results->set(textIn, this, confidence);
1142 return (confidence > 0);
1143 }
1144
~CharsetRecog_KOI8_R()1145 CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
1146 {
1147 // nothing to do
1148 }
1149
getName() const1150 const char *CharsetRecog_KOI8_R::getName() const
1151 {
1152 return "KOI8-R";
1153 }
1154
getLanguage() const1155 const char *CharsetRecog_KOI8_R::getLanguage() const
1156 {
1157 return "ru";
1158 }
1159
match(InputText * textIn,CharsetMatch * results) const1160 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
1161 {
1162 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
1163 results->set(textIn, this, confidence);
1164 return (confidence > 0);
1165 }
1166
1167 #if !UCONFIG_ONLY_HTML_CONVERSION
~CharsetRecog_IBM424_he()1168 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
1169 {
1170 // nothing to do
1171 }
1172
getLanguage() const1173 const char *CharsetRecog_IBM424_he::getLanguage() const
1174 {
1175 return "he";
1176 }
1177
~CharsetRecog_IBM424_he_rtl()1178 CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
1179 {
1180 // nothing to do
1181 }
1182
getName() const1183 const char *CharsetRecog_IBM424_he_rtl::getName() const
1184 {
1185 return "IBM424_rtl";
1186 }
1187
match(InputText * textIn,CharsetMatch * results) const1188 UBool CharsetRecog_IBM424_he_rtl::match(InputText *textIn, CharsetMatch *results) const
1189 {
1190 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
1191 results->set(textIn, this, confidence);
1192 return (confidence > 0);
1193 }
1194
~CharsetRecog_IBM424_he_ltr()1195 CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
1196 {
1197 // nothing to do
1198 }
1199
getName() const1200 const char *CharsetRecog_IBM424_he_ltr::getName() const
1201 {
1202 return "IBM424_ltr";
1203 }
1204
match(InputText * textIn,CharsetMatch * results) const1205 UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results) const
1206 {
1207 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
1208 results->set(textIn, this, confidence);
1209 return (confidence > 0);
1210 }
1211
~CharsetRecog_IBM420_ar()1212 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
1213 {
1214 // nothing to do
1215 }
1216
getLanguage() const1217 const char *CharsetRecog_IBM420_ar::getLanguage() const
1218 {
1219 return "ar";
1220 }
1221
1222
match_sbcs(InputText * det,const int32_t ngrams[],const uint8_t byteMap[]) const1223 int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
1224 {
1225 NGramParser_IBM420 parser(ngrams, byteMap);
1226 int32_t result;
1227
1228 result = parser.parse(det);
1229
1230 return result;
1231 }
1232
~CharsetRecog_IBM420_ar_rtl()1233 CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
1234 {
1235 // nothing to do
1236 }
1237
getName() const1238 const char *CharsetRecog_IBM420_ar_rtl::getName() const
1239 {
1240 return "IBM420_rtl";
1241 }
1242
match(InputText * textIn,CharsetMatch * results) const1243 UBool CharsetRecog_IBM420_ar_rtl::match(InputText *textIn, CharsetMatch *results) const
1244 {
1245 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
1246 results->set(textIn, this, confidence);
1247 return (confidence > 0);
1248 }
1249
~CharsetRecog_IBM420_ar_ltr()1250 CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
1251 {
1252 // nothing to do
1253 }
1254
getName() const1255 const char *CharsetRecog_IBM420_ar_ltr::getName() const
1256 {
1257 return "IBM420_ltr";
1258 }
1259
match(InputText * textIn,CharsetMatch * results) const1260 UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results) const
1261 {
1262 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
1263 results->set(textIn, this, confidence);
1264 return (confidence > 0);
1265 }
1266 #endif
1267
1268 U_NAMESPACE_END
1269 #endif
1270
1271