1 /*
2 **********************************************************************
3 * Copyright (C) 2005-2013, International Business Machines
4 * Corporation and others. All Rights Reserved.
5 **********************************************************************
6 */
7
8 #include "unicode/utypes.h"
9
10 #include "cmemory.h"
11
12 #if !UCONFIG_NO_CONVERSION
13 #include "csrsbcs.h"
14 #include "csmatch.h"
15
/* Length of one n-gram in bytes; agrees with the 24-bit N_GRAM_MASK below. */
#define N_GRAM_SIZE 3
/* Keeps the low 24 bits, i.e. the three most recent bytes of the rolling n-gram. */
#define N_GRAM_MASK 0xFFFFFF
/*
 * Element count of a true array (never pass a pointer). The argument is fully
 * parenthesized so that any array-valued expression expands correctly.
 */
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
19
20 U_NAMESPACE_BEGIN
21
NGramParser(const int32_t * theNgramList,const uint8_t * theCharMap)22 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
23 : ngram(0), byteIndex(0)
24 {
25 ngramList = theNgramList;
26 charMap = theCharMap;
27
28 ngramCount = hitCount = 0;
29 }
30
31 /*
32 * Binary search for value in table, which must have exactly 64 entries.
33 */
34
search(const int32_t * table,int32_t value)35 int32_t NGramParser::search(const int32_t *table, int32_t value)
36 {
37 int32_t index = 0;
38
39 if (table[index + 32] <= value) {
40 index += 32;
41 }
42
43 if (table[index + 16] <= value) {
44 index += 16;
45 }
46
47 if (table[index + 8] <= value) {
48 index += 8;
49 }
50
51 if (table[index + 4] <= value) {
52 index += 4;
53 }
54
55 if (table[index + 2] <= value) {
56 index += 2;
57 }
58
59 if (table[index + 1] <= value) {
60 index += 1;
61 }
62
63 if (table[index] > value) {
64 index -= 1;
65 }
66
67 if (index < 0 || table[index] != value) {
68 return -1;
69 }
70
71 return index;
72 }
73
lookup(int32_t thisNgram)74 void NGramParser::lookup(int32_t thisNgram)
75 {
76 ngramCount += 1;
77
78 if (search(ngramList, thisNgram) >= 0) {
79 hitCount += 1;
80 }
81
82 }
83
addByte(int32_t b)84 void NGramParser::addByte(int32_t b)
85 {
86 ngram = ((ngram << 8) + b) & N_GRAM_MASK;
87 lookup(ngram);
88 }
89
nextByte(InputText * det)90 int32_t NGramParser::nextByte(InputText *det)
91 {
92 if (byteIndex >= det->fInputLen) {
93 return -1;
94 }
95
96 return det->fInputBytes[byteIndex++];
97 }
98
parseCharacters(InputText * det)99 void NGramParser::parseCharacters(InputText *det)
100 {
101 int32_t b;
102 bool ignoreSpace = FALSE;
103
104 while ((b = nextByte(det)) >= 0) {
105 uint8_t mb = charMap[b];
106
107 // TODO: 0x20 might not be a space in all character sets...
108 if (mb != 0) {
109 if (!(mb == 0x20 && ignoreSpace)) {
110 addByte(mb);
111 }
112
113 ignoreSpace = (mb == 0x20);
114 }
115 }
116 }
117
parse(InputText * det)118 int32_t NGramParser::parse(InputText *det)
119 {
120 parseCharacters(det);
121
122 // TODO: Is this OK? The buffer could have ended in the middle of a word...
123 addByte(0x20);
124
125 double rawPercent = (double) hitCount / (double) ngramCount;
126
127 // if (rawPercent <= 2.0) {
128 // return 0;
129 // }
130
131 // TODO - This is a bit of a hack to take care of a case
132 // were we were getting a confidence of 135...
133 if (rawPercent > 0.33) {
134 return 98;
135 }
136
137 return (int32_t) (rawPercent * 300.0);
138 }
139
/*
 * Byte-to-byte table applied by NGramParser_IBM420::nextByte() to every input
 * byte that is not a Lam-Alef ligature, before the byte reaches the character
 * map. Indexed by the raw byte value (256 entries).
 *
 * Bytes 0x00-0x41 all become 0x40, and several neighbouring values collapse
 * onto one representative (e.g. 0x42 and 0x43 both yield 0x42).
 * NOTE(review): presumably this folds the contextual (shaped) Arabic forms of
 * IBM420 onto one form per letter - confirm against the code page chart.
 */
static const uint8_t unshapeMap_IBM420[] = {
/*         -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 4- */ 0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
/* 5- */ 0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
/* 6- */ 0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
/* 7- */ 0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
/* A- */ 0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
/* B- */ 0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
/* C- */ 0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
/* D- */ 0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
/* E- */ 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
/* F- */ 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
159
NGramParser_IBM420(const int32_t * theNgramList,const uint8_t * theCharMap)160 NGramParser_IBM420::NGramParser_IBM420(const int32_t *theNgramList, const uint8_t *theCharMap):NGramParser(theNgramList, theCharMap)
161 {
162 alef = 0x00;
163 }
164
165
isLamAlef(int32_t b)166 int32_t NGramParser_IBM420::isLamAlef(int32_t b)
167 {
168 if(b == 0xB2 || b == 0xB3){
169 return 0x47;
170 }else if(b == 0xB4 || b == 0xB5){
171 return 0x49;
172 }else if(b == 0xB8 || b == 0xB9){
173 return 0x56;
174 }else
175 return 0x00;
176 }
177
178 /*
179 * Arabic shaping needs to be done manually. Cannot call ArabicShaping class
180 * because CharsetDetector is dealing with bytes not Unicode code points. We could
181 * convert the bytes to Unicode code points but that would leave us dependent
182 * on CharsetICU which we try to avoid. IBM420 converter amongst different versions
183 * of JDK can produce different results and therefore is also avoided.
184 */
nextByte(InputText * det)185 int32_t NGramParser_IBM420::nextByte(InputText *det)
186 {
187
188 if (byteIndex >= det->fInputLen || det->fInputBytes[byteIndex] == 0) {
189 return -1;
190 }
191 int next;
192
193 alef = isLamAlef(det->fInputBytes[byteIndex]);
194 if(alef != 0x00)
195 next = 0xB1 & 0xFF;
196 else
197 next = unshapeMap_IBM420[det->fInputBytes[byteIndex]& 0xFF] & 0xFF;
198
199 byteIndex++;
200
201 return next;
202 }
203
parseCharacters(InputText * det)204 void NGramParser_IBM420::parseCharacters(InputText *det)
205 {
206 int32_t b;
207 bool ignoreSpace = FALSE;
208
209 while ((b = nextByte(det)) >= 0) {
210 uint8_t mb = charMap[b];
211
212 // TODO: 0x20 might not be a space in all character sets...
213 if (mb != 0) {
214 if (!(mb == 0x20 && ignoreSpace)) {
215 addByte(mb);
216 }
217 ignoreSpace = (mb == 0x20);
218 }
219
220 if(alef != 0x00){
221 mb = charMap[alef & 0xFF];
222
223 // TODO: 0x20 might not be a space in all character sets...
224 if (mb != 0) {
225 if (!(mb == 0x20 && ignoreSpace)) {
226 addByte(mb);
227 }
228
229 ignoreSpace = (mb == 0x20);
230 }
231
232 }
233 }
234 }
235
CharsetRecog_sbcs()236 CharsetRecog_sbcs::CharsetRecog_sbcs()
237 {
238 // nothing else to do
239 }
240
~CharsetRecog_sbcs()241 CharsetRecog_sbcs::~CharsetRecog_sbcs()
242 {
243 // nothing to do
244 }
245
match_sbcs(InputText * det,const int32_t ngrams[],const uint8_t byteMap[]) const246 int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
247 {
248 NGramParser parser(ngrams, byteMap);
249 int32_t result;
250
251 result = parser.parse(det);
252
253 return result;
254 }
255
/*
 * Character map for ISO-8859-1, indexed by input byte value (256 entries,
 * eight per row). Meant to be used as an NGramParser charMap; the consumer is
 * outside this part of the file.
 *
 * An entry of 0x00 (only index 0x27 here) makes parseCharacters() drop the
 * byte; 0x20 is the space value whose runs the parser collapses. Indices
 * 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A.
 */
static const uint8_t charMap_8859_1[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
    0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
290
/*
 * Character map for ISO-8859-2, indexed by input byte value (256 entries,
 * eight per row). Meant to be used as an NGramParser charMap; the consumer is
 * outside this part of the file.
 *
 * An entry of 0x00 (only index 0x27 here) makes parseCharacters() drop the
 * byte; 0x20 is the space value whose runs the parser collapses. Indices
 * 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A.
 */
static const uint8_t charMap_8859_2[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20,
    0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
    0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7,
    0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
};
325
/*
 * Character map for ISO-8859-5, indexed by input byte value (256 entries,
 * eight per row). Meant to be used as an NGramParser charMap; the consumer is
 * outside this part of the file.
 *
 * An entry of 0x00 (only index 0x27 here) makes parseCharacters() drop the
 * byte; 0x20 is the space value whose runs the parser collapses. Indices
 * 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A; in the upper half, 0xB0-0xCF
 * and 0xD0-0xEF both yield 0xD0-0xEF.
 */
static const uint8_t charMap_8859_5[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
};
360
/*
 * Character map for ISO-8859-6, indexed by input byte value (256 entries,
 * eight per row). Meant to be used as an NGramParser charMap; the consumer is
 * outside this part of the file.
 *
 * An entry of 0x00 (only index 0x27 here) makes parseCharacters() drop the
 * byte; 0x20 is the space value whose runs the parser collapses. Indices
 * 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A; 0xC1-0xDA and 0xE0-0xEA are
 * kept as they are and every other upper-half byte becomes 0x20.
 */
static const uint8_t charMap_8859_6[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
};
395
/*
 * Character map for ISO-8859-7, indexed by input byte value (256 entries,
 * eight per row). Meant to be used as an NGramParser charMap; the consumer is
 * outside this part of the file.
 *
 * An entry of 0x00 (only index 0x27 here) makes parseCharacters() drop the
 * byte; 0x20 is the space value whose runs the parser collapses. Indices
 * 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A.
 */
static const uint8_t charMap_8859_7[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
    0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
    0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
};
430
/*
 * Character map for ISO-8859-8, indexed by input byte value (256 entries,
 * eight per row). Meant to be used as an NGramParser charMap; the consumer is
 * outside this part of the file.
 *
 * An entry of 0x00 (only index 0x27 here) makes parseCharacters() drop the
 * byte; 0x20 is the space value whose runs the parser collapses. Indices
 * 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A; in the upper half only 0xB5
 * and 0xE0-0xFA are kept, everything else becomes 0x20.
 */
static const uint8_t charMap_8859_8[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
};
465
/*
 * Character map for ISO-8859-9, indexed by input byte value (256 entries,
 * eight per row). Meant to be used as an NGramParser charMap; the consumer is
 * outside this part of the file.
 *
 * An entry of 0x00 (only index 0x27 here) makes parseCharacters() drop the
 * byte; 0x20 is the space value whose runs the parser collapses. Indices
 * 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A. Note that index 0xDD yields
 * 0x69 while index 0xFD yields 0xFD.
 */
static const uint8_t charMap_8859_9[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
    0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
500
/*
 * N-gram table for windows-1251: 64 three-byte sequences packed as 24-bit
 * values (first byte in the high bits), in strictly ascending order as
 * NGramParser::search() requires. The bytes are values produced by the
 * matching character map, not raw input bytes.
 */
static const int32_t ngrams_windows_1251[] = {
    0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
    0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
    0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
    0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
};
507
/*
 * Character map for windows-1251, indexed by input byte value (256 entries,
 * eight per row). Meant to be used as an NGramParser charMap; the consumer is
 * outside this part of the file.
 *
 * An entry of 0x00 (only index 0x27 here) makes parseCharacters() drop the
 * byte; 0x20 is the space value whose runs the parser collapses. Indices
 * 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A; 0xC0-0xDF and 0xE0-0xFF both
 * yield 0xE0-0xFF.
 */
static const uint8_t charMap_windows_1251[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
    0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
    0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
    0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
    0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
    0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
542
/*
 * N-gram table for windows-1256: 64 three-byte sequences packed as 24-bit
 * values (first byte in the high bits), in strictly ascending order as
 * NGramParser::search() requires. The bytes are values produced by the
 * matching character map, not raw input bytes.
 */
static const int32_t ngrams_windows_1256[] = {
    0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
    0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
    0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
    0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
};
549
/*
 * Character map for windows-1256, indexed by input byte value (256 entries,
 * eight per row). Meant to be used as an NGramParser charMap; the consumer is
 * outside this part of the file.
 *
 * An entry of 0x00 (only index 0x27 here) makes parseCharacters() drop the
 * byte; 0x20 is the space value whose runs the parser collapses. Indices
 * 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A.
 */
static const uint8_t charMap_windows_1256[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
    0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
    0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
    0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
};
584
/*
 * N-gram table for KOI8-R: 64 three-byte sequences packed as 24-bit values
 * (first byte in the high bits), in strictly ascending order as
 * NGramParser::search() requires. The bytes are values produced by the
 * matching character map, not raw input bytes.
 */
static const int32_t ngrams_KOI8_R[] = {
    0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
    0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
    0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
    0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
};
591
/*
 * Character map for KOI8-R, indexed by input byte value (256 entries, eight
 * per row). Meant to be used as an NGramParser charMap; the consumer is
 * outside this part of the file.
 *
 * An entry of 0x00 (only index 0x27 here) makes parseCharacters() drop the
 * byte; 0x20 is the space value whose runs the parser collapses. Indices
 * 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A; 0xC0-0xDF and 0xE0-0xFF both
 * yield 0xC0-0xDF, and 0xA3 and 0xB3 both yield 0xA3.
 */
static const uint8_t charMap_KOI8_R[] = {
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
    0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
    0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
    0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
};
626
/*
 * N-gram table for IBM424 (he, rtl variant): 64 three-byte sequences packed
 * as 24-bit values (first byte in the high bits), in strictly ascending order
 * as NGramParser::search() requires. The bytes are values produced by the
 * matching character map, not raw input bytes.
 */
static const int32_t ngrams_IBM424_he_rtl[] = {
    0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
    0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
    0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
    0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
};
633
/*
 * N-gram table for IBM424 (he, ltr variant): 64 three-byte sequences packed
 * as 24-bit values (first byte in the high bits), in strictly ascending order
 * as NGramParser::search() requires. The bytes are values produced by the
 * matching character map, not raw input bytes.
 */
static const int32_t ngrams_IBM424_he_ltr[] = {
    0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
    0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
    0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
    0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
};
640
/*
 * Character map for IBM424 (he), indexed by input byte value (256 entries,
 * sixteen per row; the row label is the high nibble, the column the low one).
 * Meant to be used as an NGramParser charMap; the consumer is outside this
 * part of the file.
 *
 * Most bytes become 0x40; an entry of 0x00 (only index 0x7D here) makes
 * parseCharacters() drop the byte. Rows C-, D- and E- yield the same values
 * as rows 8-, 9- and A- except at 0xA0.
 * NOTE(review): the fill value here is 0x40, while parseCharacters() collapses
 * runs of 0x20 only - so runs of 0x40 are not collapsed; confirm this is
 * intended.
 */
static const uint8_t charMap_IBM424_he[] = {
/*         -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 4- */ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 5- */ 0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 7- */ 0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
/* 8- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 9- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* B- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
};
660
/*
 * N-gram table for IBM420 (ar, rtl variant): 64 three-byte sequences packed
 * as 24-bit values (first byte in the high bits), in strictly ascending order
 * as NGramParser::search() requires. The bytes are values produced by the
 * matching character map, not raw input bytes.
 */
static const int32_t ngrams_IBM420_ar_rtl[] = {
    0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
    0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
    0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
    0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
};
667
/*
 * N-gram table for IBM420 (ar, ltr variant): 64 three-byte sequences packed
 * as 24-bit values (first byte in the high bits), in strictly ascending order
 * as NGramParser::search() requires. The bytes are values produced by the
 * matching character map, not raw input bytes.
 */
static const int32_t ngrams_IBM420_ar_ltr[] = {
    0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
    0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
    0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
    0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
};
674
/*
 * Character map for IBM420 (ar), indexed by byte value (256 entries, sixteen
 * per row; the row label is the high nibble, the column the low one). Meant
 * to be used as an NGramParser charMap; the consumer is outside this part of
 * the file.
 *
 * No entry is 0x00, so parseCharacters() never drops a byte through this
 * map; bytes that are not kept become 0x40. Indices 0xC1-0xC9, 0xD1-0xD9 and
 * 0xE2-0xE9 yield 0x81-0x89, 0x91-0x99 and 0xA2-0xA9 respectively.
 * NOTE(review): the fill value here is 0x40, while parseCharacters() collapses
 * runs of 0x20 only - so runs of 0x40 are not collapsed; confirm this is
 * intended.
 */
static const uint8_t charMap_IBM420_ar[] = {
/*         -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
/* 0- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 1- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 2- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 3- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 4- */ 0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 5- */ 0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 6- */ 0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 7- */ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
/* 8- */ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
/* 9- */ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
/* A- */ 0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
/* B- */ 0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
/* C- */ 0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
/* D- */ 0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
/* E- */ 0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
/* F- */ 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
};
694
// N-gram tables for ISO-8859-1, -2, -5, -6, -7, -8 and -9.
696
/*
 * Pairs one 64-entry n-gram table with the language code it belongs to.
 * Used for charsets whose recognizer tries several languages in turn.
 */
struct NGramsPlusLang {
    /* The 64 n-gram values for this language. */
    const int32_t  ngrams[64];

    /* Language code reported when this table gives the best match. */
    const char    *lang;
};
701
702 static const NGramsPlusLang ngrams_8859_1[] = {
703 {
704 {
705 0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
706 0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
707 0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
708 0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
709 },
710 "en"
711 },
712 {
713 {
714 0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
715 0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
716 0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
717 0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
718 },
719 "da"
720 },
721 {
722 {
723 0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
724 0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
725 0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
726 0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
727 },
728 "de"
729 },
730 {
731 {
732 0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
733 0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
734 0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
735 0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
736 },
737 "es"
738 },
739 {
740 {
741 0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
742 0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
743 0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
744 0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
745 },
746 "fr"
747 },
748 {
749 {
750 0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
751 0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
752 0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
753 0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
754 },
755 "it"
756 },
757 {
758 {
759 0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
760 0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
761 0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
762 0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
763 },
764 "nl"
765 },
766 {
767 {
768 0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
769 0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
770 0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
771 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
772 },
773 "no"
774 },
775 {
776 {
777 0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
778 0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
779 0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
780 0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
781 },
782 "pt"
783 },
784 {
785 {
786 0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
787 0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
788 0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
789 0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
790 },
791 "sv"
792 }
793 };
794
795
796 static const NGramsPlusLang ngrams_8859_2[] = {
797 {
798 {
799 0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
800 0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
801 0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
802 0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
803 },
804 "cs"
805 },
806 {
807 {
808 0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
809 0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
810 0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
811 0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
812 },
813 "hu"
814 },
815 {
816 {
817 0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
818 0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
819 0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
820 0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
821 },
822 "pl"
823 },
824 {
825 {
826 0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
827 0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
828 0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
829 0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
830 },
831 "ro"
832 }
833 };
834
/*
 * N-gram table that CharsetRecog_8859_5_ru::match() hands to match_sbcs().
 * 64 values in ascending order, each fitting in 24 bits (N_GRAM_MASK).
 */
static const int32_t ngrams_8859_5_ru[] = {
    0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0,
    0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
    0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2,
    0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
    0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF,
    0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
    0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2,
    0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
};
841
/*
 * N-gram table that CharsetRecog_8859_6_ar::match() hands to match_sbcs().
 * 64 values in ascending order, each fitting in 24 bits (N_GRAM_MASK).
 */
static const int32_t ngrams_8859_6_ar[] = {
    0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7,
    0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
    0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5,
    0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
    0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920,
    0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
    0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4,
    0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
};
848
/*
 * N-gram table that CharsetRecog_8859_7_el::match() hands to match_sbcs().
 * 64 values in ascending order, each fitting in 24 bits (N_GRAM_MASK).
 */
static const int32_t ngrams_8859_7_el[] = {
    0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5,
    0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
    0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220,
    0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
    0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0,
    0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
    0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9,
    0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
};
855
/*
 * N-gram table that CharsetRecog_8859_8_I_he::match() hands to match_sbcs().
 * 64 values in ascending order, each fitting in 24 bits (N_GRAM_MASK).
 */
static const int32_t ngrams_8859_8_I_he[] = {
    0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5,
    0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
    0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE,
    0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
    0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0,
    0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
    0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4,
    0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
};
862
/*
 * N-gram table that CharsetRecog_8859_8_he::match() hands to match_sbcs().
 * 64 values in ascending order, each fitting in 24 bits (N_GRAM_MASK).
 */
static const int32_t ngrams_8859_8_he[] = {
    0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2,
    0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
    0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4,
    0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
    0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020,
    0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
    0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420,
    0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
};
869
/*
 * N-gram table that CharsetRecog_8859_9_tr::match() hands to match_sbcs().
 * 64 values in ascending order, each fitting in 24 bits (N_GRAM_MASK).
 */
static const int32_t ngrams_8859_9_tr[] = {
    0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C,
    0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
    0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261,
    0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
    0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20,
    0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
    0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E,
    0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
};
876
~CharsetRecog_8859_1()877 CharsetRecog_8859_1::~CharsetRecog_8859_1()
878 {
879 // nothing to do
880 }
881
match(InputText * textIn,CharsetMatch * results) const882 UBool CharsetRecog_8859_1::match(InputText *textIn, CharsetMatch *results) const {
883 const char *name = textIn->fC1Bytes? "windows-1252" : "ISO-8859-1";
884 uint32_t i;
885 int32_t bestConfidenceSoFar = -1;
886 for (i=0; i < ARRAY_SIZE(ngrams_8859_1) ; i++) {
887 const int32_t *ngrams = ngrams_8859_1[i].ngrams;
888 const char *lang = ngrams_8859_1[i].lang;
889 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_1);
890 if (confidence > bestConfidenceSoFar) {
891 results->set(textIn, this, confidence, name, lang);
892 bestConfidenceSoFar = confidence;
893 }
894 }
895 return (bestConfidenceSoFar > 0);
896 }
897
getName() const898 const char *CharsetRecog_8859_1::getName() const
899 {
900 return "ISO-8859-1";
901 }
902
903
~CharsetRecog_8859_2()904 CharsetRecog_8859_2::~CharsetRecog_8859_2()
905 {
906 // nothing to do
907 }
908
match(InputText * textIn,CharsetMatch * results) const909 UBool CharsetRecog_8859_2::match(InputText *textIn, CharsetMatch *results) const {
910 const char *name = textIn->fC1Bytes? "windows-1250" : "ISO-8859-2";
911 uint32_t i;
912 int32_t bestConfidenceSoFar = -1;
913 for (i=0; i < ARRAY_SIZE(ngrams_8859_2) ; i++) {
914 const int32_t *ngrams = ngrams_8859_2[i].ngrams;
915 const char *lang = ngrams_8859_2[i].lang;
916 int32_t confidence = match_sbcs(textIn, ngrams, charMap_8859_2);
917 if (confidence > bestConfidenceSoFar) {
918 results->set(textIn, this, confidence, name, lang);
919 bestConfidenceSoFar = confidence;
920 }
921 }
922 return (bestConfidenceSoFar > 0);
923 }
924
getName() const925 const char *CharsetRecog_8859_2::getName() const
926 {
927 return "ISO-8859-2";
928 }
929
930
~CharsetRecog_8859_5()931 CharsetRecog_8859_5::~CharsetRecog_8859_5()
932 {
933 // nothing to do
934 }
935
getName() const936 const char *CharsetRecog_8859_5::getName() const
937 {
938 return "ISO-8859-5";
939 }
940
~CharsetRecog_8859_5_ru()941 CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
942 {
943 // nothing to do
944 }
945
getLanguage() const946 const char *CharsetRecog_8859_5_ru::getLanguage() const
947 {
948 return "ru";
949 }
950
match(InputText * textIn,CharsetMatch * results) const951 UBool CharsetRecog_8859_5_ru::match(InputText *textIn, CharsetMatch *results) const
952 {
953 int32_t confidence = match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
954 results->set(textIn, this, confidence);
955 return (confidence > 0);
956 }
957
~CharsetRecog_8859_6()958 CharsetRecog_8859_6::~CharsetRecog_8859_6()
959 {
960 // nothing to do
961 }
962
getName() const963 const char *CharsetRecog_8859_6::getName() const
964 {
965 return "ISO-8859-6";
966 }
967
~CharsetRecog_8859_6_ar()968 CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
969 {
970 // nothing to do
971 }
972
getLanguage() const973 const char *CharsetRecog_8859_6_ar::getLanguage() const
974 {
975 return "ar";
976 }
977
match(InputText * textIn,CharsetMatch * results) const978 UBool CharsetRecog_8859_6_ar::match(InputText *textIn, CharsetMatch *results) const
979 {
980 int32_t confidence = match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
981 results->set(textIn, this, confidence);
982 return (confidence > 0);
983 }
984
~CharsetRecog_8859_7()985 CharsetRecog_8859_7::~CharsetRecog_8859_7()
986 {
987 // nothing to do
988 }
989
getName() const990 const char *CharsetRecog_8859_7::getName() const
991 {
992 return "ISO-8859-7";
993 }
994
~CharsetRecog_8859_7_el()995 CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
996 {
997 // nothing to do
998 }
999
getLanguage() const1000 const char *CharsetRecog_8859_7_el::getLanguage() const
1001 {
1002 return "el";
1003 }
1004
match(InputText * textIn,CharsetMatch * results) const1005 UBool CharsetRecog_8859_7_el::match(InputText *textIn, CharsetMatch *results) const
1006 {
1007 const char *name = textIn->fC1Bytes? "windows-1253" : "ISO-8859-7";
1008 int32_t confidence = match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
1009 results->set(textIn, this, confidence, name, "el");
1010 return (confidence > 0);
1011 }
1012
~CharsetRecog_8859_8()1013 CharsetRecog_8859_8::~CharsetRecog_8859_8()
1014 {
1015 // nothing to do
1016 }
1017
getName() const1018 const char *CharsetRecog_8859_8::getName() const
1019 {
1020 return "ISO-8859-8";
1021 }
1022
~CharsetRecog_8859_8_I_he()1023 CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
1024 {
1025 // nothing to do
1026 }
1027
getName() const1028 const char *CharsetRecog_8859_8_I_he::getName() const
1029 {
1030 return "ISO-8859-8-I";
1031 }
1032
getLanguage() const1033 const char *CharsetRecog_8859_8_I_he::getLanguage() const
1034 {
1035 return "he";
1036 }
1037
match(InputText * textIn,CharsetMatch * results) const1038 UBool CharsetRecog_8859_8_I_he::match(InputText *textIn, CharsetMatch *results) const
1039 {
1040 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8-I";
1041 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
1042 results->set(textIn, this, confidence, name, "he");
1043 return (confidence > 0);
1044 }
1045
~CharsetRecog_8859_8_he()1046 CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
1047 {
1048 // od ot gnihton
1049 }
1050
getLanguage() const1051 const char *CharsetRecog_8859_8_he::getLanguage() const
1052 {
1053 return "he";
1054 }
1055
match(InputText * textIn,CharsetMatch * results) const1056 UBool CharsetRecog_8859_8_he::match(InputText *textIn, CharsetMatch *results) const
1057 {
1058 const char *name = textIn->fC1Bytes? "windows-1255" : "ISO-8859-8";
1059 int32_t confidence = match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
1060 results->set(textIn, this, confidence, name, "he");
1061 return (confidence > 0);
1062 }
1063
~CharsetRecog_8859_9()1064 CharsetRecog_8859_9::~CharsetRecog_8859_9()
1065 {
1066 // nothing to do
1067 }
1068
getName() const1069 const char *CharsetRecog_8859_9::getName() const
1070 {
1071 return "ISO-8859-9";
1072 }
1073
~CharsetRecog_8859_9_tr()1074 CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
1075 {
1076 // nothing to do
1077 }
1078
getLanguage() const1079 const char *CharsetRecog_8859_9_tr::getLanguage() const
1080 {
1081 return "tr";
1082 }
1083
match(InputText * textIn,CharsetMatch * results) const1084 UBool CharsetRecog_8859_9_tr::match(InputText *textIn, CharsetMatch *results) const
1085 {
1086 const char *name = textIn->fC1Bytes? "windows-1254" : "ISO-8859-9";
1087 int32_t confidence = match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
1088 results->set(textIn, this, confidence, name, "tr");
1089 return (confidence > 0);
1090 }
1091
~CharsetRecog_windows_1256()1092 CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
1093 {
1094 // nothing to do
1095 }
1096
getName() const1097 const char *CharsetRecog_windows_1256::getName() const
1098 {
1099 return "windows-1256";
1100 }
1101
getLanguage() const1102 const char *CharsetRecog_windows_1256::getLanguage() const
1103 {
1104 return "ar";
1105 }
1106
match(InputText * textIn,CharsetMatch * results) const1107 UBool CharsetRecog_windows_1256::match(InputText *textIn, CharsetMatch *results) const
1108 {
1109 int32_t confidence = match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
1110 results->set(textIn, this, confidence);
1111 return (confidence > 0);
1112 }
1113
~CharsetRecog_windows_1251()1114 CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
1115 {
1116 // nothing to do
1117 }
1118
getName() const1119 const char *CharsetRecog_windows_1251::getName() const
1120 {
1121 return "windows-1251";
1122 }
1123
getLanguage() const1124 const char *CharsetRecog_windows_1251::getLanguage() const
1125 {
1126 return "ru";
1127 }
1128
match(InputText * textIn,CharsetMatch * results) const1129 UBool CharsetRecog_windows_1251::match(InputText *textIn, CharsetMatch *results) const
1130 {
1131 int32_t confidence = match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
1132 results->set(textIn, this, confidence);
1133 return (confidence > 0);
1134 }
1135
~CharsetRecog_KOI8_R()1136 CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
1137 {
1138 // nothing to do
1139 }
1140
getName() const1141 const char *CharsetRecog_KOI8_R::getName() const
1142 {
1143 return "KOI8-R";
1144 }
1145
getLanguage() const1146 const char *CharsetRecog_KOI8_R::getLanguage() const
1147 {
1148 return "ru";
1149 }
1150
match(InputText * textIn,CharsetMatch * results) const1151 UBool CharsetRecog_KOI8_R::match(InputText *textIn, CharsetMatch *results) const
1152 {
1153 int32_t confidence = match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
1154 results->set(textIn, this, confidence);
1155 return (confidence > 0);
1156 }
1157
~CharsetRecog_IBM424_he()1158 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
1159 {
1160 // nothing to do
1161 }
1162
getLanguage() const1163 const char *CharsetRecog_IBM424_he::getLanguage() const
1164 {
1165 return "he";
1166 }
1167
~CharsetRecog_IBM424_he_rtl()1168 CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
1169 {
1170 // nothing to do
1171 }
1172
getName() const1173 const char *CharsetRecog_IBM424_he_rtl::getName() const
1174 {
1175 return "IBM424_rtl";
1176 }
1177
match(InputText * textIn,CharsetMatch * results) const1178 UBool CharsetRecog_IBM424_he_rtl::match(InputText *textIn, CharsetMatch *results) const
1179 {
1180 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
1181 results->set(textIn, this, confidence);
1182 return (confidence > 0);
1183 }
1184
~CharsetRecog_IBM424_he_ltr()1185 CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
1186 {
1187 // nothing to do
1188 }
1189
getName() const1190 const char *CharsetRecog_IBM424_he_ltr::getName() const
1191 {
1192 return "IBM424_ltr";
1193 }
1194
match(InputText * textIn,CharsetMatch * results) const1195 UBool CharsetRecog_IBM424_he_ltr::match(InputText *textIn, CharsetMatch *results) const
1196 {
1197 int32_t confidence = match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
1198 results->set(textIn, this, confidence);
1199 return (confidence > 0);
1200 }
1201
~CharsetRecog_IBM420_ar()1202 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
1203 {
1204 // nothing to do
1205 }
1206
getLanguage() const1207 const char *CharsetRecog_IBM420_ar::getLanguage() const
1208 {
1209 return "ar";
1210 }
1211
1212
match_sbcs(InputText * det,const int32_t ngrams[],const uint8_t byteMap[]) const1213 int32_t CharsetRecog_IBM420_ar::match_sbcs(InputText *det, const int32_t ngrams[], const uint8_t byteMap[]) const
1214 {
1215 NGramParser_IBM420 parser(ngrams, byteMap);
1216 int32_t result;
1217
1218 result = parser.parse(det);
1219
1220 return result;
1221 }
1222
~CharsetRecog_IBM420_ar_rtl()1223 CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
1224 {
1225 // nothing to do
1226 }
1227
getName() const1228 const char *CharsetRecog_IBM420_ar_rtl::getName() const
1229 {
1230 return "IBM420_rtl";
1231 }
1232
match(InputText * textIn,CharsetMatch * results) const1233 UBool CharsetRecog_IBM420_ar_rtl::match(InputText *textIn, CharsetMatch *results) const
1234 {
1235 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
1236 results->set(textIn, this, confidence);
1237 return (confidence > 0);
1238 }
1239
~CharsetRecog_IBM420_ar_ltr()1240 CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
1241 {
1242 // nothing to do
1243 }
1244
getName() const1245 const char *CharsetRecog_IBM420_ar_ltr::getName() const
1246 {
1247 return "IBM420_ltr";
1248 }
1249
match(InputText * textIn,CharsetMatch * results) const1250 UBool CharsetRecog_IBM420_ar_ltr::match(InputText *textIn, CharsetMatch *results) const
1251 {
1252 int32_t confidence = match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
1253 results->set(textIn, this, confidence);
1254 return (confidence > 0);
1255 }
1256
1257 U_NAMESPACE_END
1258 #endif
1259
1260