• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  **********************************************************************
5  *   Copyright (C) 2005-2016, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  **********************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #include "cmemory.h"
13 
14 #if !UCONFIG_NO_CONVERSION
15 #include "csrsbcs.h"
16 #include "csmatch.h"
17 
18 #define N_GRAM_SIZE 3
19 #define N_GRAM_MASK 0xFFFFFF
20 
21 U_NAMESPACE_BEGIN
22 
NGramParser(const int32_t * theNgramList,const uint8_t * theCharMap)23 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
24  : ngram(0), byteIndex(0)
25 {
26     ngramList = theNgramList;
27     charMap   = theCharMap;
28 
29     ngramCount = hitCount = 0;
30 }
31 
~NGramParser()32 NGramParser::~NGramParser()
33 {
34 }
35 
36 /*
37  * Binary search for value in table, which must have exactly 64 entries.
38  */
39 
search(const int32_t * table,int32_t value)40 int32_t NGramParser::search(const int32_t *table, int32_t value)
41 {
42     int32_t index = 0;
43 
44     if (table[index + 32] <= value) {
45         index += 32;
46     }
47 
48     if (table[index + 16] <= value) {
49         index += 16;
50     }
51 
52     if (table[index + 8] <= value) {
53         index += 8;
54     }
55 
56     if (table[index + 4] <= value) {
57         index += 4;
58     }
59 
60     if (table[index + 2] <= value) {
61         index += 2;
62     }
63 
64     if (table[index + 1] <= value) {
65         index += 1;
66     }
67 
68     if (table[index] > value) {
69         index -= 1;
70     }
71 
72     if (index < 0 || table[index] != value) {
73         return -1;
74     }
75 
76     return index;
77 }
78 
lookup(int32_t thisNgram)79 void NGramParser::lookup(int32_t thisNgram)
80 {
81     ngramCount += 1;
82 
83     if (search(ngramList, thisNgram) >= 0) {
84         hitCount += 1;
85     }
86 
87 }
88 
addByte(int32_t b)89 void NGramParser::addByte(int32_t b)
90 {
91     ngram = ((ngram << 8) + b) & N_GRAM_MASK;
92     lookup(ngram);
93 }
94 
nextByte(InputText * det)95 int32_t NGramParser::nextByte(InputText *det)
96 {
97     if (byteIndex >= det->fInputLen) {
98         return -1;
99     }
100 
101     return det->fInputBytes[byteIndex++];
102 }
103 
parseCharacters(InputText * det)104 void NGramParser::parseCharacters(InputText *det)
105 {
106     int32_t b;
107     bool ignoreSpace = FALSE;
108 
109     while ((b = nextByte(det)) >= 0) {
110         uint8_t mb = charMap[b];
111 
112         // TODO: 0x20 might not be a space in all character sets...
113         if (mb != 0) {
114             if (!(mb == 0x20 && ignoreSpace)) {
115                 addByte(mb);
116             }
117 
118             ignoreSpace = (mb == 0x20);
119         }
120     }
121 }
122 
parse(InputText * det)123 int32_t NGramParser::parse(InputText *det)
124 {
125     parseCharacters(det);
126 
127     // TODO: Is this OK? The buffer could have ended in the middle of a word...
128     addByte(0x20);
129 
130     double rawPercent = (double) hitCount / (double) ngramCount;
131 
132     //            if (rawPercent <= 2.0) {
133     //                return 0;
134     //            }
135 
136     // TODO - This is a bit of a hack to take care of a case
137     // were we were getting a confidence of 135...
138     if (rawPercent > 0.33) {
139         return 98;
140     }
141 
142     return (int32_t) (rawPercent * 300.0);
143 }
144 
145 #if !UCONFIG_ONLY_HTML_CONVERSION
/*
 * 256-entry byte-to-byte map (16 rows of 16), indexed by the raw input
 * byte in NGramParser_IBM420::nextByte().
 *
 * Every index from 0x00 to 0x3F maps to 0x40. From 0x40 upwards most
 * bytes map to themselves, while some map to a different code
 * (for example 0x43 -> 0x42 and 0x48 -> 0x47).
 * NOTE(review): presumably the merged codes are positional (shaped)
 * forms of one Arabic letter - confirm against the IBM420 code page.
 */
146 static const uint8_t unshapeMap_IBM420[] = {
147 /*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
148 /* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
149 /* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
150 /* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
151 /* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
152 /* 4- */    0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
153 /* 5- */    0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
154 /* 6- */    0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
155 /* 7- */    0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
156 /* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
157 /* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
158 /* A- */    0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
159 /* B- */    0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
160 /* C- */    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
161 /* D- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
162 /* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
163 /* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
164 };
165 
NGramParser_IBM420(const int32_t * theNgramList,const uint8_t * theCharMap)166 NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap):NGramParser(theNgramList, theCharMap)
167 {
168 	alef = 0x00;
169 }
170 
~NGramParser_IBM420()171 NGramParser_IBM420::~NGramParser_IBM420() {}
172 
isLamAlef(int32_t b)173 int32_t NGramParser_IBM420::isLamAlef(int32_t b)
174 {
175 	if(b == 0xB2 || b == 0xB3){
176          	return 0x47;
177         }else if(b == 0xB4 || b == 0xB5){
178          	return 0x49;
179         }else if(b == 0xB8 || b == 0xB9){
180          	return 0x56;
181         }else
182          	return 0x00;
183 }
184 
185 /*
186 * Arabic shaping needs to be done manually. Cannot call ArabicShaping class
187 * because CharsetDetector is dealing with bytes not Unicode code points. We could
188 * convert the bytes to Unicode code points but that would leave us dependent
189 * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
190 * of JDK can produce different results and therefore is also avoided.
191 */
nextByte(InputText * det)192 int32_t NGramParser_IBM420::nextByte(InputText *det)
193 {
194 
195     if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) {
196         return -1;
197     }
198     int next;
199 
200     alef = isLamAlef(det->fInputBytes[byteIndex]);
201     if(alef != 0x00)
202         next = 0xB1 & 0xFF;
203     else
204         next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF;
205 
206     byteIndex++;
207 
208     return next;
209 }
210 
parseCharacters(InputText * det)211 void NGramParser_IBM420::parseCharacters(InputText *det)
212 {
213 	int32_t b;
214     bool ignoreSpace = FALSE;
215 
216     while ((b = nextByte(det)) >= 0) {
217         uint8_t mb = charMap[b];
218 
219         // TODO: 0x20 might not be a space in all character sets...
220         if (mb != 0) {
221             if (!(mb == 0x20 && ignoreSpace)) {
222                 addByte(mb);
223             }
224             ignoreSpace = (mb == 0x20);
225         }
226 
227 		if(alef != 0x00){
228             mb = charMap[alef & 0xFF];
229 
230             // TODO: 0x20 might not be a space in all character sets...
231             if (mb != 0) {
232                 if (!(mb == 0x20 && ignoreSpace)) {
233                     addByte(mb);
234                 }
235 
236                 ignoreSpace = (mb == 0x20);
237             }
238 
239         }
240     }
241 }
242 #endif
243 
CharsetRecog_sbcs()244 CharsetRecog_sbcs::CharsetRecog_sbcs()
245 {
246     // nothing else to do
247 }
248 
~CharsetRecog_sbcs()249 CharsetRecog_sbcs::~CharsetRecog_sbcs()
250 {
251     // nothing to do
252 }
253 
match_sbcs(InputText * det,const int32_t ngrams[],const uint8_t byteMap[]) const254 int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[],  const uint8_t byteMap[]) const
255 {
256     NGramParser parser(ngrams, byteMap);
257     int32_t result;
258 
259     result = parser.parse(det);
260 
261     return result;
262 }
263 
/*
 * 256-entry byte map (32 rows of 8), indexed by byte value.
 *
 * In the lower half, ASCII letters of either case map to the lower-case
 * codes 0x61-0x7A, index 0x27 maps to 0x00 and every other index below
 * 0x80 maps to 0x20. The upper half is specific to this table.
 * NOTE(review): presumably used as the byteMap for ISO-8859-1 matching,
 * where NGramParser drops 0x00 and treats 0x20 as a space - the use is
 * not visible in this part of the file.
 */
264 static const uint8_t charMap_8859_1[] = {
265     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
266     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
267     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
268     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
269     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
270     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
271     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
272     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
273     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
274     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
275     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
276     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
277     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
278     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
279     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
280     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
281     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
282     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
283     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
284     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
285     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
286     0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
287     0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
288     0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
289     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
290     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
291     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
292     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
293     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
294     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
295     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
296     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
297 };
298 
/*
 * 256-entry byte map (32 rows of 8), indexed by byte value.
 *
 * In the lower half, ASCII letters of either case map to the lower-case
 * codes 0x61-0x7A, index 0x27 maps to 0x00 and every other index below
 * 0x80 maps to 0x20. The upper half is specific to this table.
 * NOTE(review): presumably used as the byteMap for ISO-8859-2 matching,
 * where NGramParser drops 0x00 and treats 0x20 as a space - the use is
 * not visible in this part of the file.
 */
299 static const uint8_t charMap_8859_2[] = {
300     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
301     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
302     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
303     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
304     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
305     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
306     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
307     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
308     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
309     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
310     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
311     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
312     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
313     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
314     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
315     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
316     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
317     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
318     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
319     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
320     0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20,
321     0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
322     0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7,
323     0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
324     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
325     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
326     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
327     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
328     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
329     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
330     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
331     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
332 };
333 
/*
 * 256-entry byte map (32 rows of 8), indexed by byte value.
 *
 * In the lower half, ASCII letters of either case map to the lower-case
 * codes 0x61-0x7A, index 0x27 maps to 0x00 and every other index below
 * 0x80 maps to 0x20. The upper half is specific to this table.
 * NOTE(review): presumably used as the byteMap for ISO-8859-5 matching,
 * where NGramParser drops 0x00 and treats 0x20 as a space - the use is
 * not visible in this part of the file.
 */
334 static const uint8_t charMap_8859_5[] = {
335     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
336     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
337     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
338     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
339     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
340     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
341     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
342     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
343     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
344     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
345     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
346     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
347     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
348     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
349     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
350     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
351     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
352     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
353     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
354     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
355     0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
356     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
357     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
358     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
359     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
360     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
361     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
362     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
363     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
364     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
365     0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
366     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
367 };
368 
/*
 * 256-entry byte map (32 rows of 8), indexed by byte value.
 *
 * In the lower half, ASCII letters of either case map to the lower-case
 * codes 0x61-0x7A, index 0x27 maps to 0x00 and every other index below
 * 0x80 maps to 0x20. The upper half is specific to this table.
 * NOTE(review): presumably used as the byteMap for ISO-8859-6 matching,
 * where NGramParser drops 0x00 and treats 0x20 as a space - the use is
 * not visible in this part of the file.
 */
369 static const uint8_t charMap_8859_6[] = {
370     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
371     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
372     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
373     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
374     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
375     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
376     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
377     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
378     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
379     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
380     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
381     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
382     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
383     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
384     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
385     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
386     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
387     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
388     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
389     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
390     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
391     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
392     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
393     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
394     0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
395     0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
396     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
397     0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
398     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
399     0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
400     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
401     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
402 };
403 
/*
 * 256-entry byte map (32 rows of 8), indexed by byte value.
 *
 * In the lower half, ASCII letters of either case map to the lower-case
 * codes 0x61-0x7A, index 0x27 maps to 0x00 and every other index below
 * 0x80 maps to 0x20. The upper half is specific to this table.
 * NOTE(review): presumably used as the byteMap for ISO-8859-7 matching,
 * where NGramParser drops 0x00 and treats 0x20 as a space - the use is
 * not visible in this part of the file.
 */
404 static const uint8_t charMap_8859_7[] = {
405     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
406     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
407     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
408     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
409     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
410     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
411     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
412     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
413     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
414     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
415     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
416     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
417     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
418     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
419     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
420     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
421     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
422     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
423     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
424     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
425     0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
426     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
427     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
428     0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
429     0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
430     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
431     0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
432     0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
433     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
434     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
435     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
436     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
437 };
438 
/*
 * 256-entry byte map (32 rows of 8), indexed by byte value.
 *
 * In the lower half, ASCII letters of either case map to the lower-case
 * codes 0x61-0x7A, index 0x27 maps to 0x00 and every other index below
 * 0x80 maps to 0x20. The upper half is specific to this table.
 * NOTE(review): presumably used as the byteMap for ISO-8859-8 matching,
 * where NGramParser drops 0x00 and treats 0x20 as a space - the use is
 * not visible in this part of the file.
 */
439 static const uint8_t charMap_8859_8[] = {
440     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
441     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
442     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
443     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
444     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
445     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
446     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
447     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
448     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
449     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
450     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
451     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
452     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
453     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
454     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
455     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
456     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
457     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
458     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
459     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
460     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
461     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
462     0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
463     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
464     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
465     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
466     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
467     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
468     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
469     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
470     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
471     0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
472 };
473 
/*
 * 256-entry byte map (32 rows of 8), indexed by byte value.
 *
 * In the lower half, ASCII letters of either case map to the lower-case
 * codes 0x61-0x7A, index 0x27 maps to 0x00 and every other index below
 * 0x80 maps to 0x20. The upper half is specific to this table; note
 * that index 0xDD maps to 0x69, the code also used for ASCII 'i'.
 * NOTE(review): presumably used as the byteMap for ISO-8859-9 matching,
 * where NGramParser drops 0x00 and treats 0x20 as a space - the use is
 * not visible in this part of the file.
 */
474 static const uint8_t charMap_8859_9[] = {
475     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
476     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
477     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
478     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
479     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
480     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
481     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
482     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
483     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
484     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
485     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
486     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
487     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
488     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
489     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
490     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
491     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
492     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
493     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
494     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
495     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
496     0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
497     0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
498     0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
499     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
500     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
501     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
502     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
503     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
504     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
505     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
506     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
507 };
508 
/*
 * 64 three-byte n-gram values (4 rows of 16), each held in the low 24
 * bits of an int32_t and listed in ascending order - the layout that
 * NGramParser::search() needs for its fixed 64-entry binary search.
 * NOTE(review): presumably the n-gram list for windows-1251 matching;
 * the use is not visible in this part of the file.
 */
509 static const int32_t ngrams_windows_1251[] = {
510     0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
511     0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
512     0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
513     0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
514 };
515 
/*
 * 256-entry byte map (32 rows of 8), indexed by byte value.
 *
 * In the lower half, ASCII letters of either case map to the lower-case
 * codes 0x61-0x7A, index 0x27 maps to 0x00 and every other index below
 * 0x80 maps to 0x20. The upper half is specific to this table; its
 * last eight rows map 0xC0-0xDF and 0xE0-0xFF onto the same codes
 * 0xE0-0xFF.
 * NOTE(review): presumably used as the byteMap for windows-1251
 * matching, where NGramParser drops 0x00 and treats 0x20 as a space -
 * the use is not visible in this part of the file.
 */
516 static const uint8_t charMap_windows_1251[] = {
517     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
518     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
519     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
520     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
521     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
522     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
523     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
524     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
525     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
526     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
527     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
528     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
529     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
530     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
531     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
532     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
533     0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
534     0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
535     0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
536     0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
537     0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
538     0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
539     0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
540     0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
541     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
542     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
543     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
544     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
545     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
546     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
547     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
548     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
549 };
550 
/*
 * 64 three-byte n-gram values (4 rows of 16), each held in the low 24
 * bits of an int32_t and listed in ascending order - the layout that
 * NGramParser::search() needs for its fixed 64-entry binary search.
 * NOTE(review): presumably the n-gram list for windows-1256 matching;
 * the use is not visible in this part of the file.
 */
551 static const int32_t ngrams_windows_1256[] = {
552     0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
553     0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
554     0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
555     0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
556 };
557 
/*
 * 256-entry byte map (32 rows of 8), indexed by byte value.
 *
 * In the lower half, ASCII letters of either case map to the lower-case
 * codes 0x61-0x7A, index 0x27 maps to 0x00 and every other index below
 * 0x80 maps to 0x20. The upper half is specific to this table.
 * NOTE(review): presumably used as the byteMap for windows-1256
 * matching, where NGramParser drops 0x00 and treats 0x20 as a space -
 * the use is not visible in this part of the file.
 */
558 static const uint8_t charMap_windows_1256[] = {
559     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
560     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
561     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
562     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
563     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
564     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
565     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
566     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
567     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
568     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
569     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
570     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
571     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
572     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
573     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
574     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
575     0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
576     0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
577     0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
578     0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
579     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
580     0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
581     0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
582     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
583     0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
584     0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
585     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
586     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
587     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
588     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
589     0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
590     0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
591 };
592 
/*
 * 64 three-byte n-gram values (4 rows of 16), each held in the low 24
 * bits of an int32_t and listed in ascending order - the layout that
 * NGramParser::search() needs for its fixed 64-entry binary search.
 * NOTE(review): presumably the n-gram list for KOI8-R matching; the use
 * is not visible in this part of the file.
 */
593 static const int32_t ngrams_KOI8_R[] = {
594     0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
595     0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
596     0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
597     0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
598 };
599 
/*
 * 256-entry byte map (32 rows of 8), indexed by byte value.
 *
 * In the lower half, ASCII letters of either case map to the lower-case
 * codes 0x61-0x7A, index 0x27 maps to 0x00 and every other index below
 * 0x80 maps to 0x20. In the upper half, 0xA3 and 0xB3 both map to 0xA3,
 * and 0xC0-0xDF and 0xE0-0xFF both map onto 0xC0-0xDF; everything else
 * maps to 0x20.
 * NOTE(review): presumably used as the byteMap for KOI8-R matching,
 * where NGramParser drops 0x00 and treats 0x20 as a space - the use is
 * not visible in this part of the file.
 */
600 static const uint8_t charMap_KOI8_R[] = {
601     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
602     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
603     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
604     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
605     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
606     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
607     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
608     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
609     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
610     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
611     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
612     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
613     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
614     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
615     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
616     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
617     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
618     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
619     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
620     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
621     0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
622     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
623     0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
624     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
625     0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
626     0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
627     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
628     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
629     0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
630     0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
631     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
632     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
633 };
634 
635 #if !UCONFIG_ONLY_HTML_CONVERSION
/*
 * 64 three-byte n-gram values (4 rows of 16), each held in the low 24
 * bits of an int32_t and listed in ascending order - the layout that
 * NGramParser::search() needs for its fixed 64-entry binary search.
 * NOTE(review): going by the name, the n-gram list for right-to-left
 * Hebrew in IBM424; the use is not visible in this part of the file.
 */
636 static const int32_t ngrams_IBM424_he_rtl[] = {
637     0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
638     0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
639     0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
640     0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
641 };
642 
/*
 * 64 three-byte n-gram values (4 rows of 16), each held in the low 24
 * bits of an int32_t and listed in ascending order - the layout that
 * NGramParser::search() needs for its fixed 64-entry binary search.
 * NOTE(review): going by the name, the n-gram list for left-to-right
 * Hebrew in IBM424; the use is not visible in this part of the file.
 */
643 static const int32_t ngrams_IBM424_he_ltr[] = {
644     0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
645     0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
646     0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
647     0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
648 };
649 
/*
 * 256-entry byte map (16 rows of 16), indexed by byte value.
 *
 * Every index from 0x00 to 0x3F maps to 0x40, index 0x7D maps to 0x00,
 * and rows C-/D-/E- reuse the codes of rows 8-/9-/A- (0x81-0x89,
 * 0x91-0x99, 0xA2-0xA9), so each pair of rows folds onto one set of
 * codes. Most remaining non-letter positions map to 0x40.
 * NOTE(review): 0x40 is presumably the EBCDIC space. NGramParser only
 * collapses runs of 0x20, so consecutive 0x40 values from this map are
 * all added - confirm that is intended.
 */
650 static const uint8_t charMap_IBM424_he[] = {
651 /*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
652 /* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
653 /* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
654 /* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
655 /* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
656 /* 4- */    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
657 /* 5- */    0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
658 /* 6- */    0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
659 /* 7- */    0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
660 /* 8- */    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
661 /* 9- */    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
662 /* A- */    0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
663 /* B- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
664 /* C- */    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
665 /* D- */    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
666 /* E- */    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
667 /* F- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
668 };
669 
// Trigram table handed to match_sbcs() by CharsetRecog_IBM420_ar_rtl::match().
// Each entry is a 24-bit value packing three byte codes (see N_GRAM_SIZE and
// N_GRAM_MASK at the top of this file).
// NOTE(review): presumably binary-searched by the n-gram parser, which expects
// exactly 64 entries in ascending order -- keep it sorted if edited.
670 static const int32_t ngrams_IBM420_ar_rtl[] = {
671     0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
672     0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
673     0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
674     0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
675 };
676 
// Trigram table handed to match_sbcs() by CharsetRecog_IBM420_ar_ltr::match().
// Each entry is a 24-bit value packing three byte codes (see N_GRAM_SIZE and
// N_GRAM_MASK at the top of this file).
// NOTE(review): presumably binary-searched by the n-gram parser, which expects
// exactly 64 entries in ascending order -- keep it sorted if edited.
677 static const int32_t ngrams_IBM420_ar_ltr[] = {
678     0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
679     0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
680     0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
681     0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
682 };
683 
// 256-entry byte map shared by the IBM420 recognizers (passed to match_sbcs()
// together with ngrams_IBM420_ar_rtl / ngrams_IBM420_ar_ltr). The table is laid
// out as 16 rows of 16: the row label is the high nibble of the input byte and
// the column header is the low nibble.
// Rows 0- through 3- collapse entirely to 0x40; positions C1-C9, D1-D9 and
// E2-E9 reuse the values of 81-89, 91-99 and A2-A9.
// NOTE(review): 0x40 is presumably the EBCDIC space and the C/D/E reuse is
// presumably Latin upper-to-lower case folding -- confirm against the code page.
684 static const uint8_t charMap_IBM420_ar[]= {
685 /*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
686 /* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
687 /* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
688 /* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
689 /* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
690 /* 4- */    0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
691 /* 5- */    0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
692 /* 6- */    0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
693 /* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
694 /* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
695 /* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
696 /* A- */    0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
697 /* B- */    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
698 /* C- */    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
699 /* D- */    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
700 /* E- */    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
701 /* F- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
702 };
703 #endif
704 
705 //ISO-8859-1,2,5,6,7,8,9 Ngrams
706 
// Pairs one trigram table with the language code it was built for. Used as the
// element type of ngrams_8859_1 and ngrams_8859_2, where a single charset
// recognizer tries several languages in turn.
707 struct NGramsPlusLang {
    // Fixed-size trigram table; the size matches the 64-entry requirement
    // documented on NGramParser::search().
708     const int32_t ngrams[64];
    // Language code string reported when this table gives the best match.
709     const char *  lang;
710 };
711 
// Per-language trigram tables tried by CharsetRecog_8859_1::match(). Entries
// appear in the order en, da, de, es, fr, it, nl, no, pt, sv; match() walks all
// of them and keeps the language with the highest confidence.
// Each trigram is a 24-bit value packing three byte codes (see N_GRAM_SIZE and
// N_GRAM_MASK at the top of this file).
// NOTE(review): each 64-entry table is presumably binary-searched by the
// n-gram parser, so keep every table in ascending order if edited.
712 static const NGramsPlusLang ngrams_8859_1[] =  {
713   {
714     {
715     0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
716     0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
717     0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
718     0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
719     },
720     "en"
721   },
722   {
723     {
724     0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
725     0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
726     0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
727     0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
728     },
729     "da"
730   },
731   {
732     {
733     0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
734     0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
735     0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
736     0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
737     },
738     "de"
739   },
740   {
741     {
742     0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
743     0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
744     0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
745     0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
746     },
747     "es"
748   },
749   {
750     {
751     0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
752     0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
753     0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
754     0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
755     },
756     "fr"
757   },
758   {
759     {
760     0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
761     0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
762     0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
763     0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
764     },
765     "it"
766   },
767   {
768     {
769     0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
770     0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
771     0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
772     0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
773     },
774     "nl"
775   },
776   {
777     {
778     0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
779     0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
780     0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
781     0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
782     },
783     "no"
784   },
785   {
786     {
787     0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
788     0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
789     0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
790     0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
791     },
792     "pt"
793   },
794   {
795     {
796     0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
797     0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
798     0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
799     0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
800     },
801     "sv"
802   }
803 };
804 
805 
// Per-language trigram tables tried by CharsetRecog_8859_2::match(). Entries
// appear in the order cs, hu, pl, ro; match() walks all of them and keeps the
// language with the highest confidence.
// Each trigram is a 24-bit value packing three byte codes (see N_GRAM_SIZE and
// N_GRAM_MASK at the top of this file).
// NOTE(review): each 64-entry table is presumably binary-searched by the
// n-gram parser, so keep every table in ascending order if edited.
806 static const NGramsPlusLang ngrams_8859_2[] =  {
807   {
808     {
809     0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
810     0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
811     0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
812     0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
813     },
814     "cs"
815   },
816   {
817     {
818     0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
819     0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
820     0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
821     0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
822     },
823     "hu"
824   },
825   {
826     {
827     0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
828     0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
829     0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
830     0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
831     },
832     "pl"
833   },
834   {
835     {
836     0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
837     0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
838     0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
839     0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
840     },
841     "ro"
842   }
843 };
844 
// Trigram table handed to match_sbcs() by CharsetRecog_8859_5_ru::match().
// Each entry is a 24-bit value packing three byte codes (see N_GRAM_SIZE and
// N_GRAM_MASK at the top of this file).
// NOTE(review): presumably looked up through NGramParser::search(), which
// requires exactly 64 entries in ascending order -- keep it sorted if edited.
845 static const int32_t ngrams_8859_5_ru[] = {
846     0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
847     0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
848     0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
849     0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
850 };
851 
// Trigram table handed to match_sbcs() by CharsetRecog_8859_6_ar::match().
// Each entry is a 24-bit value packing three byte codes (see N_GRAM_SIZE and
// N_GRAM_MASK at the top of this file).
// NOTE(review): presumably looked up through NGramParser::search(), which
// requires exactly 64 entries in ascending order -- keep it sorted if edited.
852 static const int32_t ngrams_8859_6_ar[] = {
853     0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
854     0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
855     0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
856     0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
857 };
858 
// Trigram table handed to match_sbcs() by CharsetRecog_8859_7_el::match().
// Each entry is a 24-bit value packing three byte codes (see N_GRAM_SIZE and
// N_GRAM_MASK at the top of this file).
// NOTE(review): presumably looked up through NGramParser::search(), which
// requires exactly 64 entries in ascending order -- keep it sorted if edited.
859 static const int32_t ngrams_8859_7_el[] = {
860     0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
861     0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
862     0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
863     0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
864 };
865 
// Trigram table handed to match_sbcs() by CharsetRecog_8859_8_I_he::match().
// Each entry is a 24-bit value packing three byte codes (see N_GRAM_SIZE and
// N_GRAM_MASK at the top of this file).
// NOTE(review): presumably looked up through NGramParser::search(), which
// requires exactly 64 entries in ascending order -- keep it sorted if edited.
866 static const int32_t ngrams_8859_8_I_he[] = {
867     0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
868     0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
869     0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
870     0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
871 };
872 
// Trigram table handed to match_sbcs() by CharsetRecog_8859_8_he::match().
// Each entry is a 24-bit value packing three byte codes (see N_GRAM_SIZE and
// N_GRAM_MASK at the top of this file).
// NOTE(review): presumably looked up through NGramParser::search(), which
// requires exactly 64 entries in ascending order -- keep it sorted if edited.
873 static const int32_t ngrams_8859_8_he[] = {
874     0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
875     0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
876     0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
877     0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
878 };
879 
// Trigram table handed to match_sbcs() by CharsetRecog_8859_9_tr::match().
// Each entry is a 24-bit value packing three byte codes (see N_GRAM_SIZE and
// N_GRAM_MASK at the top of this file).
// NOTE(review): presumably looked up through NGramParser::search(), which
// requires exactly 64 entries in ascending order -- keep it sorted if edited.
880 static const int32_t ngrams_8859_9_tr[] = {
881     0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
882     0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
883     0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
884     0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
885 };
886 
~CharsetRecog_8859_1()887 CharsetRecog_8859_1::~CharsetRecog_8859_1()
888 {
889     // nothing to do
890 }
891 
match(InputText * textIn,CharsetMatch * results) const892 UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const {
893     const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1";
894     uint32_t i;
895     int32_t bestConfidenceSoFar = -1;
896     for (i=0; i < UPRV_LENGTHOF(ngrams_8859_1) ; i++) {
897         const int32_t *ngrams = ngrams_8859_1[i].ngrams;
898         const char    *lang   = ngrams_8859_1[i].lang;
899         int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1);
900         if (confidence > bestConfidenceSoFar) {
901             results->set(textIn, this, confidence, name, lang);
902             bestConfidenceSoFar = confidence;
903         }
904     }
905     return (bestConfidenceSoFar > 0);
906 }
907 
getName() const908 const char *CharsetRecog_8859_1::getName() const
909 {
910     return "ISO-8859-1";
911 }
912 
913 
~CharsetRecog_8859_2()914 CharsetRecog_8859_2::~CharsetRecog_8859_2()
915 {
916     // nothing to do
917 }
918 
match(InputText * textIn,CharsetMatch * results) const919 UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const {
920     const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2";
921     uint32_t i;
922     int32_t bestConfidenceSoFar = -1;
923     for (i=0; i < UPRV_LENGTHOF(ngrams_8859_2) ; i++) {
924         const int32_t *ngrams = ngrams_8859_2[i].ngrams;
925         const char    *lang   = ngrams_8859_2[i].lang;
926         int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2);
927         if (confidence > bestConfidenceSoFar) {
928             results->set(textIn, this, confidence, name, lang);
929             bestConfidenceSoFar = confidence;
930         }
931     }
932     return (bestConfidenceSoFar > 0);
933 }
934 
getName() const935 const char *CharsetRecog_8859_2::getName() const
936 {
937     return "ISO-8859-2";
938 }
939 
940 
~CharsetRecog_8859_5()941 CharsetRecog_8859_5::~CharsetRecog_8859_5()
942 {
943     // nothing to do
944 }
945 
getName() const946 const char *CharsetRecog_8859_5::getName() const
947 {
948     return "ISO-8859-5";
949 }
950 
~CharsetRecog_8859_5_ru()951 CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
952 {
953     // nothing to do
954 }
955 
getLanguage() const956 const char *CharsetRecog_8859_5_ru::getLanguage() const
957 {
958     return "ru";
959 }
960 
match(InputText * textIn,CharsetMatch * results) const961 UBool CharsetRecog_8859_5_ru::match(InputText *textIn, CharsetMatch *results) const
962 {
963     int32_t confidence = match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
964     results->set(textIn, this, confidence);
965     return (confidence > 0);
966 }
967 
~CharsetRecog_8859_6()968 CharsetRecog_8859_6::~CharsetRecog_8859_6()
969 {
970     // nothing to do
971 }
972 
getName() const973 const char *CharsetRecog_8859_6::getName() const
974 {
975     return "ISO-8859-6";
976 }
977 
~CharsetRecog_8859_6_ar()978 CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
979 {
980     // nothing to do
981 }
982 
getLanguage() const983 const char *CharsetRecog_8859_6_ar::getLanguage() const
984 {
985     return "ar";
986 }
987 
match(InputText * textIn,CharsetMatch * results) const988 UBool CharsetRecog_8859_6_ar::match(InputText *textIn, CharsetMatch *results) const
989 {
990     int32_t confidence = match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
991     results->set(textIn, this, confidence);
992     return (confidence > 0);
993 }
994 
~CharsetRecog_8859_7()995 CharsetRecog_8859_7::~CharsetRecog_8859_7()
996 {
997     // nothing to do
998 }
999 
getName() const1000 const char *CharsetRecog_8859_7::getName() const
1001 {
1002     return "ISO-8859-7";
1003 }
1004 
~CharsetRecog_8859_7_el()1005 CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
1006 {
1007     // nothing to do
1008 }
1009 
getLanguage() const1010 const char *CharsetRecog_8859_7_el::getLanguage() const
1011 {
1012     return "el";
1013 }
1014 
match(InputText * textIn,CharsetMatch * results) const1015 UBool CharsetRecog_8859_7_el::match(InputText *textIn, CharsetMatch *results) const
1016 {
1017     const char *name = textIn->fC1Bytes? "windows-1253" : "ISO-8859-7";
1018     int32_t confidence = match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
1019     results->set(textIn, this, confidence, name, "el");
1020     return (confidence > 0);
1021 }
1022 
~CharsetRecog_8859_8()1023 CharsetRecog_8859_8::~CharsetRecog_8859_8()
1024 {
1025     // nothing to do
1026 }
1027 
getName() const1028 const char *CharsetRecog_8859_8::getName() const
1029 {
1030     return "ISO-8859-8";
1031 }
1032 
~CharsetRecog_8859_8_I_he()1033 CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
1034 {
1035     // nothing to do
1036 }
1037 
getName() const1038 const char *CharsetRecog_8859_8_I_he::getName() const
1039 {
1040     return "ISO-8859-8-I";
1041 }
1042 
getLanguage() const1043 const char *CharsetRecog_8859_8_I_he::getLanguage() const
1044 {
1045     return "he";
1046 }
1047 
match(InputText * textIn,CharsetMatch * results) const1048 UBool CharsetRecog_8859_8_I_he::match(InputText *textIn, CharsetMatch *results) const
1049 {
1050     const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8-I";
1051     int32_t confidence = match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
1052     results->set(textIn, this, confidence, name, "he");
1053     return (confidence > 0);
1054 }
1055 
~CharsetRecog_8859_8_he()1056 CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
1057 {
1058     // od ot gnihton
1059 }
1060 
getLanguage() const1061 const char *CharsetRecog_8859_8_he::getLanguage() const
1062 {
1063     return "he";
1064 }
1065 
match(InputText * textIn,CharsetMatch * results) const1066 UBool CharsetRecog_8859_8_he::match(InputText *textIn, CharsetMatch *results) const
1067 {
1068     const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8";
1069     int32_t confidence = match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
1070     results->set(textIn, this, confidence, name, "he");
1071     return (confidence > 0);
1072 }
1073 
~CharsetRecog_8859_9()1074 CharsetRecog_8859_9::~CharsetRecog_8859_9()
1075 {
1076     // nothing to do
1077 }
1078 
getName() const1079 const char *CharsetRecog_8859_9::getName() const
1080 {
1081     return "ISO-8859-9";
1082 }
1083 
~CharsetRecog_8859_9_tr()1084 CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
1085 {
1086     // nothing to do
1087 }
1088 
getLanguage() const1089 const char *CharsetRecog_8859_9_tr::getLanguage() const
1090 {
1091     return "tr";
1092 }
1093 
match(InputText * textIn,CharsetMatch * results) const1094 UBool CharsetRecog_8859_9_tr::match(InputText *textIn, CharsetMatch *results) const
1095 {
1096     const char *name = textIn->fC1Bytes? "windows-1254" : "ISO-8859-9";
1097     int32_t confidence = match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
1098     results->set(textIn, this, confidence, name, "tr");
1099     return (confidence > 0);
1100 }
1101 
~CharsetRecog_windows_1256()1102 CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
1103 {
1104     // nothing to do
1105 }
1106 
getName() const1107 const char *CharsetRecog_windows_1256::getName() const
1108 {
1109     return  "windows-1256";
1110 }
1111 
getLanguage() const1112 const char *CharsetRecog_windows_1256::getLanguage() const
1113 {
1114     return "ar";
1115 }
1116 
match(InputText * textIn,CharsetMatch * results) const1117 UBool CharsetRecog_windows_1256::match(InputText *textIn, CharsetMatch *results) const
1118 {
1119     int32_t confidence = match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
1120     results->set(textIn, this, confidence);
1121     return (confidence > 0);
1122 }
1123 
~CharsetRecog_windows_1251()1124 CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
1125 {
1126     // nothing to do
1127 }
1128 
getName() const1129 const char *CharsetRecog_windows_1251::getName() const
1130 {
1131     return  "windows-1251";
1132 }
1133 
getLanguage() const1134 const char *CharsetRecog_windows_1251::getLanguage() const
1135 {
1136     return "ru";
1137 }
1138 
match(InputText * textIn,CharsetMatch * results) const1139 UBool CharsetRecog_windows_1251::match(InputText *textIn, CharsetMatch *results) const
1140 {
1141     int32_t confidence = match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
1142     results->set(textIn, this, confidence);
1143     return (confidence > 0);
1144 }
1145 
~CharsetRecog_KOI8_R()1146 CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
1147 {
1148     // nothing to do
1149 }
1150 
getName() const1151 const char *CharsetRecog_KOI8_R::getName() const
1152 {
1153     return  "KOI8-R";
1154 }
1155 
getLanguage() const1156 const char *CharsetRecog_KOI8_R::getLanguage() const
1157 {
1158     return "ru";
1159 }
1160 
match(InputText * textIn,CharsetMatch * results) const1161 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
1162 {
1163     int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
1164     results->set(textIn, this, confidence);
1165     return (confidence > 0);
1166 }
1167 
1168 #if !UCONFIG_ONLY_HTML_CONVERSION
~CharsetRecog_IBM424_he()1169 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
1170 {
1171     // nothing to do
1172 }
1173 
getLanguage() const1174 const char *CharsetRecog_IBM424_he::getLanguage() const
1175 {
1176     return "he";
1177 }
1178 
~CharsetRecog_IBM424_he_rtl()1179 CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
1180 {
1181     // nothing to do
1182 }
1183 
getName() const1184 const char *CharsetRecog_IBM424_he_rtl::getName() const
1185 {
1186     return  "IBM424_rtl";
1187 }
1188 
match(InputText * textIn,CharsetMatch * results) const1189 UBool CharsetRecog_IBM424_he_rtl::match(InputText *textIn, CharsetMatch *results) const
1190 {
1191     int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
1192     results->set(textIn, this, confidence);
1193     return (confidence > 0);
1194 }
1195 
~CharsetRecog_IBM424_he_ltr()1196 CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
1197 {
1198     // nothing to do
1199 }
1200 
getName() const1201 const char *CharsetRecog_IBM424_he_ltr::getName() const
1202 {
1203     return  "IBM424_ltr";
1204 }
1205 
match(InputText * textIn,CharsetMatch * results) const1206 UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results) const
1207 {
1208     int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
1209     results->set(textIn, this, confidence);
1210     return (confidence > 0);
1211 }
1212 
~CharsetRecog_IBM420_ar()1213 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
1214 {
1215     // nothing to do
1216 }
1217 
getLanguage() const1218 const char *CharsetRecog_IBM420_ar::getLanguage() const
1219 {
1220     return "ar";
1221 }
1222 
1223 
match_sbcs(InputText * det,const int32_t ngrams[],const uint8_t byteMap[]) const1224 int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[],  const uint8_t byteMap[]) const
1225 {
1226     NGramParser_IBM420 parser(ngrams, byteMap);
1227     int32_t result;
1228 
1229     result = parser.parse(det);
1230 
1231     return result;
1232 }
1233 
~CharsetRecog_IBM420_ar_rtl()1234 CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
1235 {
1236     // nothing to do
1237 }
1238 
getName() const1239 const char *CharsetRecog_IBM420_ar_rtl::getName() const
1240 {
1241     return  "IBM420_rtl";
1242 }
1243 
match(InputText * textIn,CharsetMatch * results) const1244 UBool CharsetRecog_IBM420_ar_rtl::match(InputText *textIn, CharsetMatch *results) const
1245 {
1246     int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
1247     results->set(textIn, this, confidence);
1248     return (confidence > 0);
1249 }
1250 
~CharsetRecog_IBM420_ar_ltr()1251 CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
1252 {
1253     // nothing to do
1254 }
1255 
getName() const1256 const char *CharsetRecog_IBM420_ar_ltr::getName() const
1257 {
1258     return  "IBM420_ltr";
1259 }
1260 
match(InputText * textIn,CharsetMatch * results) const1261 UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results) const
1262 {
1263     int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
1264     results->set(textIn, this, confidence);
1265     return (confidence > 0);
1266 }
1267 #endif
1268 
1269 U_NAMESPACE_END
1270 #endif
1271 
1272