• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  **********************************************************************
3  *   Copyright (C) 2005-2010, International Business Machines
4  *   Corporation and others.  All Rights Reserved.
5  **********************************************************************
6  */
7 
8 #include "unicode/utypes.h"
9 
10 #include "cmemory.h"
11 
12 #if !UCONFIG_NO_CONVERSION
13 #include "csrsbcs.h"
14 
/* Presumably the number of bytes in one n-gram; not referenced in the code visible here. */
15 #define N_GRAM_SIZE 3
/* Mask applied in NGramParser::addByte() to keep only the low 24 bits (three bytes) of the rolling n-gram. */
16 #define N_GRAM_MASK 0xFFFFFF
17 
18 U_NAMESPACE_BEGIN
19 
NGramParser(const int32_t * theNgramList,const uint8_t * theCharMap)20 NGramParser::NGramParser(const int32_t *theNgramList, const uint8_t *theCharMap)
21   :byteIndex(0), ngram(0)
22 {
23     ngramList = theNgramList;
24     charMap   = theCharMap;
25 
26     ngramCount = hitCount = 0;
27 }
28 
29 /*
30  * Binary search for value in table, which must have exactly 64 entries.
31  */
32 
search(const int32_t * table,int32_t value)33 int32_t NGramParser::search(const int32_t *table, int32_t value)
34 {
35     int32_t index = 0;
36 
37     if (table[index + 32] <= value) {
38         index += 32;
39     }
40 
41     if (table[index + 16] <= value) {
42         index += 16;
43     }
44 
45     if (table[index + 8] <= value) {
46         index += 8;
47     }
48 
49     if (table[index + 4] <= value) {
50         index += 4;
51     }
52 
53     if (table[index + 2] <= value) {
54         index += 2;
55     }
56 
57     if (table[index + 1] <= value) {
58         index += 1;
59     }
60 
61     if (table[index] > value) {
62         index -= 1;
63     }
64 
65     if (index < 0 || table[index] != value) {
66         return -1;
67     }
68 
69     return index;
70 }
71 
lookup(int32_t thisNgram)72 void NGramParser::lookup(int32_t thisNgram)
73 {
74     ngramCount += 1;
75 
76     if (search(ngramList, thisNgram) >= 0) {
77         hitCount += 1;
78     }
79 
80 }
81 
addByte(int32_t b)82 void NGramParser::addByte(int32_t b)
83 {
84     ngram = ((ngram << 8) + b) & N_GRAM_MASK;
85     lookup(ngram);
86 }
87 
nextByte(InputText * det)88 int32_t NGramParser::nextByte(InputText *det)
89 {
90     if (byteIndex >= det->fInputLen) {
91         return -1;
92     }
93 
94     return det->fInputBytes[byteIndex++];
95 }
96 
parse(InputText * det)97 int32_t NGramParser::parse(InputText *det)
98 {
99     int32_t b;
100     bool ignoreSpace = FALSE;
101 
102     while ((b = nextByte(det)) >= 0) {
103         uint8_t mb = charMap[b];
104 
105         // TODO: 0x20 might not be a space in all character sets...
106         if (mb != 0) {
107             if (!(mb == 0x20 && ignoreSpace)) {
108                 addByte(mb);
109             }
110 
111             ignoreSpace = (mb == 0x20);
112         }
113     }
114 
115     // TODO: Is this OK? The buffer could have ended in the middle of a word...
116     addByte(0x20);
117 
118     double rawPercent = (double) hitCount / (double) ngramCount;
119 
120     //            if (rawPercent <= 2.0) {
121     //                return 0;
122     //            }
123 
124     // TODO - This is a bit of a hack to take care of a case
125     // were we were getting a confidence of 135...
126     if (rawPercent > 0.33) {
127         return 98;
128     }
129 
130     return (int32_t) (rawPercent * 300.0);
131 }
132 
CharsetRecog_sbcs()133 CharsetRecog_sbcs::CharsetRecog_sbcs()
134 : haveC1Bytes(FALSE)
135 {
136     // nothing else to do
137 }
138 
~CharsetRecog_sbcs()139 CharsetRecog_sbcs::~CharsetRecog_sbcs()
140 {
141     // nothing to do
142 }
143 
match_sbcs(InputText * det,const int32_t ngrams[],const uint8_t byteMap[])144 int32_t CharsetRecog_sbcs::match_sbcs(InputText *det, const int32_t ngrams[],  const uint8_t byteMap[])
145 {
146     NGramParser parser(ngrams, byteMap);
147     int32_t result;
148 
149     haveC1Bytes = det->fC1Bytes;
150     result = parser.parse(det);
151 
152     return result;
153 }
154 
/*
 * Byte map named for ISO-8859-1: 256 entries (32 rows of 8), indexed by raw input byte.
 * In the 0x00-0x7F half, 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields 0x00
 * and every other byte yields 0x20; the 0x80-0xFF half is specific to this table.
 * NGramParser::parse() skips bytes that map to 0x00 and collapses runs of 0x20.
 */
155 static const uint8_t charMap_8859_1[] = {
156     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
157     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
158     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
159     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
160     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
161     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
162     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
163     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
164     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
165     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
166     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
167     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
168     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
169     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
170     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
171     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
172     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
173     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
174     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
175     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
176     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
177     0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
178     0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
179     0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
180     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
181     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
182     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
183     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
184     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
185     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
186     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
187     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
188 };
189 
/*
 * Byte map named for ISO-8859-2: 256 entries (32 rows of 8), indexed by raw input byte.
 * In the 0x00-0x7F half, 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields 0x00
 * and every other byte yields 0x20; the 0x80-0xFF half is specific to this table.
 * NGramParser::parse() skips bytes that map to 0x00 and collapses runs of 0x20.
 */
190 static const uint8_t charMap_8859_2[] = {
191     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
192     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
193     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
194     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
195     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
196     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
197     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
198     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
199     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
200     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
201     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
202     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
203     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
204     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
205     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
206     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
207     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
208     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
209     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
210     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
211     0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0x20,
212     0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
213     0x20, 0xB1, 0x20, 0xB3, 0x20, 0xB5, 0xB6, 0xB7,
214     0x20, 0xB9, 0xBA, 0xBB, 0xBC, 0x20, 0xBE, 0xBF,
215     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
216     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
217     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
218     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
219     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
220     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
221     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
222     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
223 };
224 
/*
 * Byte map named for ISO-8859-5: 256 entries (32 rows of 8), indexed by raw input byte.
 * In the 0x00-0x7F half, 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields 0x00
 * and every other byte yields 0x20; the 0x80-0xFF half is specific to this table.
 * NGramParser::parse() skips bytes that map to 0x00 and collapses runs of 0x20.
 */
225 static const uint8_t charMap_8859_5[] = {
226     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
227     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
228     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
229     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
230     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
231     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
232     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
233     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
234     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
235     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
236     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
237     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
238     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
239     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
240     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
241     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
242     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
243     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
244     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
245     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
246     0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
247     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
248     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
249     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
250     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
251     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
252     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
253     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
254     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
255     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
256     0x20, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
257     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x20, 0xFE, 0xFF,
258 };
259 
/*
 * Byte map named for ISO-8859-6: 256 entries (32 rows of 8), indexed by raw input byte.
 * In the 0x00-0x7F half, 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields 0x00
 * and every other byte yields 0x20; the 0x80-0xFF half is specific to this table.
 * NGramParser::parse() skips bytes that map to 0x00 and collapses runs of 0x20.
 */
260 static const uint8_t charMap_8859_6[] = {
261     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
262     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
263     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
264     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
265     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
266     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
267     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
268     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
269     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
270     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
271     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
272     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
273     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
274     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
275     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
276     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
277     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
278     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
279     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
280     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
281     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
282     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
283     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
284     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
285     0x20, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
286     0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
287     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
288     0xD8, 0xD9, 0xDA, 0x20, 0x20, 0x20, 0x20, 0x20,
289     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
290     0xE8, 0xE9, 0xEA, 0x20, 0x20, 0x20, 0x20, 0x20,
291     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
292     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
293 };
294 
/*
 * Byte map named for ISO-8859-7: 256 entries (32 rows of 8), indexed by raw input byte.
 * In the 0x00-0x7F half, 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields 0x00
 * and every other byte yields 0x20; the 0x80-0xFF half is specific to this table.
 * NGramParser::parse() skips bytes that map to 0x00 and collapses runs of 0x20.
 */
295 static const uint8_t charMap_8859_7[] = {
296     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
297     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
298     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
299     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
300     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
301     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
302     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
303     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
304     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
305     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
306     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
307     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
308     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
309     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
310     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
311     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
312     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
313     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
314     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
315     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
316     0x20, 0xA1, 0xA2, 0x20, 0x20, 0x20, 0x20, 0x20,
317     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
318     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0xDC, 0x20,
319     0xDD, 0xDE, 0xDF, 0x20, 0xFC, 0x20, 0xFD, 0xFE,
320     0xC0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
321     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
322     0xF0, 0xF1, 0x20, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
323     0xF8, 0xF9, 0xFA, 0xFB, 0xDC, 0xDD, 0xDE, 0xDF,
324     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
325     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
326     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
327     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0x20,
328 };
329 
/*
 * Byte map named for ISO-8859-8: 256 entries (32 rows of 8), indexed by raw input byte.
 * In the 0x00-0x7F half, 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields 0x00
 * and every other byte yields 0x20; the 0x80-0xFF half is specific to this table.
 * NGramParser::parse() skips bytes that map to 0x00 and collapses runs of 0x20.
 */
330 static const uint8_t charMap_8859_8[] = {
331     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
332     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
333     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
334     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
335     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
336     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
337     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
338     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
339     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
340     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
341     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
342     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
343     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
344     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
345     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
346     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
347     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
348     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
349     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
350     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
351     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
352     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
353     0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
354     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
355     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
356     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
357     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
358     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
359     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
360     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
361     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
362     0xF8, 0xF9, 0xFA, 0x20, 0x20, 0x20, 0x20, 0x20,
363 };
364 
/*
 * Byte map named for ISO-8859-9: 256 entries (32 rows of 8), indexed by raw input byte.
 * In the 0x00-0x7F half, 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields 0x00
 * and every other byte yields 0x20; the 0x80-0xFF half is specific to this table.
 * NGramParser::parse() skips bytes that map to 0x00 and collapses runs of 0x20.
 */
365 static const uint8_t charMap_8859_9[] = {
366     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
367     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
368     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
369     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
370     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
371     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
372     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
373     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
374     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
375     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
376     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
377     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
378     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
379     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
380     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
381     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
382     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
383     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
384     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
385     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
386     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
387     0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
388     0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
389     0x20, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0x20,
390     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
391     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
392     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
393     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0x69, 0xFE, 0xDF,
394     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
395     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
396     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0x20,
397     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
398 };
399 
/*
 * N-gram table named for windows-1251: 64 values, each packing three mapped bytes
 * into the low 24 bits. NGramParser::search() reads exactly 64 entries and
 * binary-searches them, so the list has to stay in ascending order.
 */
400 static const int32_t ngrams_windows_1251[] = {
401     0x20E220, 0x20E2EE, 0x20E4EE, 0x20E7E0, 0x20E820, 0x20EAE0, 0x20EAEE, 0x20EDE0, 0x20EDE5, 0x20EEE1, 0x20EFEE, 0x20EFF0, 0x20F0E0, 0x20F1EE, 0x20F1F2, 0x20F2EE,
402     0x20F7F2, 0x20FDF2, 0xE0EDE8, 0xE0F2FC, 0xE3EE20, 0xE5EBFC, 0xE5EDE8, 0xE5F1F2, 0xE5F220, 0xE820EF, 0xE8E520, 0xE8E820, 0xE8FF20, 0xEBE5ED, 0xEBE820, 0xEBFCED,
403     0xEDE020, 0xEDE520, 0xEDE8E5, 0xEDE8FF, 0xEDEE20, 0xEDEEE2, 0xEE20E2, 0xEE20EF, 0xEE20F1, 0xEEE220, 0xEEE2E0, 0xEEE3EE, 0xEEE920, 0xEEEBFC, 0xEEEC20, 0xEEF1F2,
404     0xEFEEEB, 0xEFF0E5, 0xEFF0E8, 0xEFF0EE, 0xF0E0E2, 0xF0E5E4, 0xF1F2E0, 0xF1F2E2, 0xF1F2E8, 0xF1FF20, 0xF2E5EB, 0xF2EE20, 0xF2EEF0, 0xF2FC20, 0xF7F2EE, 0xFBF520,
405 };
406 
/*
 * Byte map named for windows-1251: 256 entries (32 rows of 8), indexed by raw input byte.
 * In the 0x00-0x7F half, 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields 0x00
 * and every other byte yields 0x20; the 0x80-0xFF half is specific to this table.
 * NGramParser::parse() skips bytes that map to 0x00 and collapses runs of 0x20.
 */
407 static const uint8_t charMap_windows_1251[] = {
408     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
409     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
410     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
411     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
412     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
413     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
414     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
415     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
416     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
417     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
418     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
419     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
420     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
421     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
422     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
423     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
424     0x90, 0x83, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
425     0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
426     0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
427     0x20, 0x20, 0x9A, 0x20, 0x9C, 0x9D, 0x9E, 0x9F,
428     0x20, 0xA2, 0xA2, 0xBC, 0x20, 0xB4, 0x20, 0x20,
429     0xB8, 0x20, 0xBA, 0x20, 0x20, 0x20, 0x20, 0xBF,
430     0x20, 0x20, 0xB3, 0xB3, 0xB4, 0xB5, 0x20, 0x20,
431     0xB8, 0x20, 0xBA, 0x20, 0xBC, 0xBE, 0xBE, 0xBF,
432     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
433     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
434     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
435     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
436     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
437     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
438     0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
439     0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
440 };
441 
/*
 * N-gram table named for windows-1256: 64 values, each packing three mapped bytes
 * into the low 24 bits. NGramParser::search() reads exactly 64 entries and
 * binary-searches them, so the list has to stay in ascending order.
 */
442 static const int32_t ngrams_windows_1256[] = {
443     0x20C7E1, 0x20C7E4, 0x20C8C7, 0x20DAE1, 0x20DDED, 0x20E1E1, 0x20E3E4, 0x20E6C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E120, 0xC7E1C3, 0xC7E1C7, 0xC7E1C8,
444     0xC7E1CA, 0xC7E1CC, 0xC7E1CD, 0xC7E1CF, 0xC7E1D3, 0xC7E1DA, 0xC7E1DE, 0xC7E1E3, 0xC7E1E6, 0xC7E1ED, 0xC7E320, 0xC7E420, 0xC7E4CA, 0xC820C7, 0xC920C7, 0xC920DD,
445     0xC920E1, 0xC920E3, 0xC920E6, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xDA20C7, 0xDAE1EC, 0xDDED20, 0xE120C7, 0xE1C920, 0xE1EC20, 0xE1ED20,
446     0xE320C7, 0xE3C720, 0xE3C920, 0xE3E420, 0xE420C7, 0xE520C7, 0xE5C720, 0xE6C7E1, 0xE6E420, 0xEC20C7, 0xED20C7, 0xED20E3, 0xED20E6, 0xEDC920, 0xEDD120, 0xEDE420,
447 };
448 
/*
 * Byte map named for windows-1256: 256 entries (32 rows of 8), indexed by raw input byte.
 * In the 0x00-0x7F half, 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields 0x00
 * and every other byte yields 0x20; the 0x80-0xFF half is specific to this table.
 * NGramParser::parse() skips bytes that map to 0x00 and collapses runs of 0x20.
 */
449 static const uint8_t charMap_windows_1256[] = {
450     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
451     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
452     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
453     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
454     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
455     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
456     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
457     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
458     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
459     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
460     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
461     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
462     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
463     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
464     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
465     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
466     0x20, 0x81, 0x20, 0x83, 0x20, 0x20, 0x20, 0x20,
467     0x88, 0x20, 0x8A, 0x20, 0x9C, 0x8D, 0x8E, 0x8F,
468     0x90, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
469     0x98, 0x20, 0x9A, 0x20, 0x9C, 0x20, 0x20, 0x9F,
470     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
471     0x20, 0x20, 0xAA, 0x20, 0x20, 0x20, 0x20, 0x20,
472     0x20, 0x20, 0x20, 0x20, 0x20, 0xB5, 0x20, 0x20,
473     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
474     0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
475     0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
476     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0x20,
477     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
478     0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
479     0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
480     0x20, 0x20, 0x20, 0x20, 0xF4, 0x20, 0x20, 0x20,
481     0x20, 0xF9, 0x20, 0xFB, 0xFC, 0x20, 0x20, 0xFF,
482 };
483 
/*
 * N-gram table named for KOI8-R: 64 values, each packing three mapped bytes
 * into the low 24 bits. NGramParser::search() reads exactly 64 entries and
 * binary-searches them, so the list has to stay in ascending order.
 */
484 static const int32_t ngrams_KOI8_R[] = {
485     0x20C4CF, 0x20C920, 0x20CBC1, 0x20CBCF, 0x20CEC1, 0x20CEC5, 0x20CFC2, 0x20D0CF, 0x20D0D2, 0x20D2C1, 0x20D3CF, 0x20D3D4, 0x20D4CF, 0x20D720, 0x20D7CF, 0x20DAC1,
486     0x20DCD4, 0x20DED4, 0xC1CEC9, 0xC1D4D8, 0xC5CCD8, 0xC5CEC9, 0xC5D3D4, 0xC5D420, 0xC7CF20, 0xC920D0, 0xC9C520, 0xC9C920, 0xC9D120, 0xCCC5CE, 0xCCC920, 0xCCD8CE,
487     0xCEC120, 0xCEC520, 0xCEC9C5, 0xCEC9D1, 0xCECF20, 0xCECFD7, 0xCF20D0, 0xCF20D3, 0xCF20D7, 0xCFC7CF, 0xCFCA20, 0xCFCCD8, 0xCFCD20, 0xCFD3D4, 0xCFD720, 0xCFD7C1,
488     0xD0CFCC, 0xD0D2C5, 0xD0D2C9, 0xD0D2CF, 0xD2C1D7, 0xD2C5C4, 0xD3D120, 0xD3D4C1, 0xD3D4C9, 0xD3D4D7, 0xD4C5CC, 0xD4CF20, 0xD4CFD2, 0xD4D820, 0xD9C820, 0xDED4CF,
489 };
490 
/*
 * Byte map named for KOI8-R: 256 entries (32 rows of 8), indexed by raw input byte.
 * In the 0x00-0x7F half, 0x41-0x5A and 0x61-0x7A both yield 0x61-0x7A, 0x27 yields 0x00
 * and every other byte yields 0x20; the 0x80-0xFF half is specific to this table.
 * NGramParser::parse() skips bytes that map to 0x00 and collapses runs of 0x20.
 */
491 static const uint8_t charMap_KOI8_R[] = {
492     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
493     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
494     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
495     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
496     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x00,
497     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
498     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
499     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
500     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
501     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
502     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
503     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
504     0x20, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
505     0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
506     0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
507     0x78, 0x79, 0x7A, 0x20, 0x20, 0x20, 0x20, 0x20,
508     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
509     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
510     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
511     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
512     0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
513     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
514     0x20, 0x20, 0x20, 0xA3, 0x20, 0x20, 0x20, 0x20,
515     0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,
516     0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
517     0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
518     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
519     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
520     0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7,
521     0xC8, 0xC9, 0xCA, 0xCB, 0xCC, 0xCD, 0xCE, 0xCF,
522     0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7,
523     0xD8, 0xD9, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
524 };
525 
/*
 * N-gram table named for IBM424 Hebrew, right-to-left variant: 64 values, each
 * packing three mapped bytes into the low 24 bits. NGramParser::search() reads
 * exactly 64 entries and binary-searches them, so the list has to stay in
 * ascending order.
 */
526 static const int32_t ngrams_IBM424_he_rtl[] = {
527     0x404146, 0x404148, 0x404151, 0x404171, 0x404251, 0x404256, 0x404541, 0x404546, 0x404551, 0x404556, 0x404562, 0x404569, 0x404571, 0x405441, 0x405445, 0x405641,
528     0x406254, 0x406954, 0x417140, 0x454041, 0x454042, 0x454045, 0x454054, 0x454056, 0x454069, 0x454641, 0x464140, 0x465540, 0x465740, 0x466840, 0x467140, 0x514045,
529     0x514540, 0x514671, 0x515155, 0x515540, 0x515740, 0x516840, 0x517140, 0x544041, 0x544045, 0x544140, 0x544540, 0x554041, 0x554042, 0x554045, 0x554054, 0x554056,
530     0x554069, 0x564540, 0x574045, 0x584540, 0x585140, 0x585155, 0x625440, 0x684045, 0x685155, 0x695440, 0x714041, 0x714042, 0x714045, 0x714054, 0x714056, 0x714069,
531 };
532 
/*
 * N-gram table named for IBM424 Hebrew, left-to-right variant: 64 values, each
 * packing three mapped bytes into the low 24 bits. NGramParser::search() reads
 * exactly 64 entries and binary-searches them, so the list has to stay in
 * ascending order.
 */
533 static const int32_t ngrams_IBM424_he_ltr[] = {
534     0x404146, 0x404154, 0x404551, 0x404554, 0x404556, 0x404558, 0x405158, 0x405462, 0x405469, 0x405546, 0x405551, 0x405746, 0x405751, 0x406846, 0x406851, 0x407141,
535     0x407146, 0x407151, 0x414045, 0x414054, 0x414055, 0x414071, 0x414540, 0x414645, 0x415440, 0x415640, 0x424045, 0x424055, 0x424071, 0x454045, 0x454051, 0x454054,
536     0x454055, 0x454057, 0x454068, 0x454071, 0x455440, 0x464140, 0x464540, 0x484140, 0x514140, 0x514240, 0x514540, 0x544045, 0x544055, 0x544071, 0x546240, 0x546940,
537     0x555151, 0x555158, 0x555168, 0x564045, 0x564055, 0x564071, 0x564240, 0x564540, 0x624540, 0x694045, 0x694055, 0x694071, 0x694540, 0x714140, 0x714540, 0x714651,
538 };
539 
/*
 * Byte map named for IBM424 Hebrew: 256 entries laid out as 16 rows of 16
 * (row = high nibble, column = low nibble of the input byte).
 * Most bytes map to 0x40, and 0x7D maps to 0x00 (bytes mapping to 0x00 are
 * skipped by NGramParser::parse()).
 * NOTE(review): the filler value here is 0x40, whereas NGramParser::parse()
 * special-cases 0x20 when collapsing runs and when appending its final byte -
 * confirm how this map is meant to be consumed.
 */
540 static const uint8_t charMap_IBM424_he[] = {
541 /*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
542 /* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
543 /* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
544 /* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
545 /* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
546 /* 4- */    0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
547 /* 5- */    0x40, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
548 /* 6- */    0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
549 /* 7- */    0x40, 0x71, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00, 0x40, 0x40,
550 /* 8- */    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
551 /* 9- */    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
552 /* A- */    0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
553 /* B- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
554 /* C- */    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
555 /* D- */    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
556 /* E- */    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
557 /* F- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
558 };
559 
/*
 * N-gram table named for IBM420 Arabic, right-to-left variant: 64 values, each
 * packing three mapped bytes into the low 24 bits. NGramParser::search() reads
 * exactly 64 entries and binary-searches them, so the list has to stay in
 * ascending order.
 */
560 static const int32_t ngrams_IBM420_ar_rtl[] = {
561     0x4056B1, 0x4056BD, 0x405856, 0x409AB1, 0x40ABDC, 0x40B1B1, 0x40BBBD, 0x40CF56, 0x564056, 0x564640, 0x566340, 0x567540, 0x56B140, 0x56B149, 0x56B156, 0x56B158,
562     0x56B163, 0x56B167, 0x56B169, 0x56B173, 0x56B178, 0x56B19A, 0x56B1AD, 0x56B1BB, 0x56B1CF, 0x56B1DC, 0x56BB40, 0x56BD40, 0x56BD63, 0x584056, 0x624056, 0x6240AB,
563     0x6240B1, 0x6240BB, 0x6240CF, 0x634056, 0x734056, 0x736240, 0x754056, 0x756240, 0x784056, 0x9A4056, 0x9AB1DA, 0xABDC40, 0xB14056, 0xB16240, 0xB1DA40, 0xB1DC40,
564     0xBB4056, 0xBB5640, 0xBB6240, 0xBBBD40, 0xBD4056, 0xBF4056, 0xBF5640, 0xCF56B1, 0xCFBD40, 0xDA4056, 0xDC4056, 0xDC40BB, 0xDC40CF, 0xDC6240, 0xDC7540, 0xDCBD40,
565 };
566 
/*
 * N-gram table named for IBM420 Arabic, left-to-right variant: 64 values, each
 * packing three mapped bytes into the low 24 bits. NGramParser::search() reads
 * exactly 64 entries and binary-searches them, so the list has to stay in
 * ascending order.
 */
567 static const int32_t ngrams_IBM420_ar_ltr[] = {
568     0x404656, 0x4056BB, 0x4056BF, 0x406273, 0x406275, 0x4062B1, 0x4062BB, 0x4062DC, 0x406356, 0x407556, 0x4075DC, 0x40B156, 0x40BB56, 0x40BD56, 0x40BDBB, 0x40BDCF,
569     0x40BDDC, 0x40DAB1, 0x40DCAB, 0x40DCB1, 0x49B156, 0x564056, 0x564058, 0x564062, 0x564063, 0x564073, 0x564075, 0x564078, 0x56409A, 0x5640B1, 0x5640BB, 0x5640BD,
570     0x5640BF, 0x5640DA, 0x5640DC, 0x565840, 0x56B156, 0x56CF40, 0x58B156, 0x63B156, 0x63BD56, 0x67B156, 0x69B156, 0x73B156, 0x78B156, 0x9AB156, 0xAB4062, 0xADB156,
571     0xB14062, 0xB15640, 0xB156CF, 0xB19A40, 0xB1B140, 0xBB4062, 0xBB40DC, 0xBBB156, 0xBD5640, 0xBDBB40, 0xCF4062, 0xCF40DC, 0xCFB156, 0xDAB19A, 0xDCAB40, 0xDCB156
572 };
573 
/*
 * Byte map named for IBM420 Arabic: 256 entries laid out as 16 rows of 16
 * (row = high nibble, column = low nibble of the input byte).
 * Bytes 0x00-0x3F all map to 0x40; no entry in this table is 0x00.
 * NOTE(review): the filler value here is 0x40, whereas NGramParser::parse()
 * special-cases 0x20 when collapsing runs and when appending its final byte -
 * confirm how this map is meant to be consumed.
 */
574 static const uint8_t charMap_IBM420_ar[]= {
575 /*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
576 /* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
577 /* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
578 /* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
579 /* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
580 /* 4- */    0x40, 0x40, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
581 /* 5- */    0x40, 0x51, 0x52, 0x40, 0x40, 0x55, 0x56, 0x57, 0x58, 0x59, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
582 /* 6- */    0x40, 0x40, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
583 /* 7- */    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
584 /* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
585 /* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
586 /* A- */    0xA0, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
587 /* B- */    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0x40, 0x40, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
588 /* C- */    0x40, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x40, 0xCB, 0x40, 0xCD, 0x40, 0xCF,
589 /* D- */    0x40, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0xDA, 0xDB, 0xDC, 0xDD, 0xDE, 0xDF,
590 /* E- */    0x40, 0x40, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xEA, 0xEB, 0x40, 0xED, 0xEE, 0xEF,
591 /* F- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0xFB, 0xFC, 0xFD, 0xFE, 0x40,
592 };
593 
594 //ISO-8859-1,2,5,6,7,8,9 Ngrams
/*
 * N-gram table named for ISO-8859-1 / "en": 64 values, each packing three
 * mapped bytes into the low 24 bits. NGramParser::search() reads exactly 64
 * entries and binary-searches them, so the list has to stay in ascending order.
 */
595 static const int32_t ngrams_8859_1_en[] = {
596     0x206120, 0x20616E, 0x206265, 0x20636F, 0x20666F, 0x206861, 0x206865, 0x20696E, 0x206D61, 0x206F66, 0x207072, 0x207265, 0x207361, 0x207374, 0x207468, 0x20746F,
597     0x207768, 0x616964, 0x616C20, 0x616E20, 0x616E64, 0x617320, 0x617420, 0x617465, 0x617469, 0x642061, 0x642074, 0x652061, 0x652073, 0x652074, 0x656420, 0x656E74,
598     0x657220, 0x657320, 0x666F72, 0x686174, 0x686520, 0x686572, 0x696420, 0x696E20, 0x696E67, 0x696F6E, 0x697320, 0x6E2061, 0x6E2074, 0x6E6420, 0x6E6720, 0x6E7420,
599     0x6F6620, 0x6F6E20, 0x6F7220, 0x726520, 0x727320, 0x732061, 0x732074, 0x736169, 0x737420, 0x742074, 0x746572, 0x746861, 0x746865, 0x74696F, 0x746F20, 0x747320,
600 };
601 
// N-gram table that CharsetRecog_8859_1_da::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
602 static const int32_t ngrams_8859_1_da[] = {
603     0x206166, 0x206174, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207369, 0x207374, 0x207469, 0x207669, 0x616620,
604     0x616E20, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646572, 0x646574, 0x652073, 0x656420, 0x656465, 0x656E20, 0x656E64, 0x657220, 0x657265, 0x657320,
605     0x657420, 0x666F72, 0x676520, 0x67656E, 0x676572, 0x696765, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6572, 0x6C6967, 0x6C6C65, 0x6D6564, 0x6E6465, 0x6E6520,
606     0x6E6720, 0x6E6765, 0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722064, 0x722065, 0x722073, 0x726520, 0x737465, 0x742073, 0x746520, 0x746572, 0x74696C, 0x766572,
607 };
608 
// N-gram table that CharsetRecog_8859_1_de::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
609 static const int32_t ngrams_8859_1_de[] = {
610     0x20616E, 0x206175, 0x206265, 0x206461, 0x206465, 0x206469, 0x206569, 0x206765, 0x206861, 0x20696E, 0x206D69, 0x207363, 0x207365, 0x20756E, 0x207665, 0x20766F,
611     0x207765, 0x207A75, 0x626572, 0x636820, 0x636865, 0x636874, 0x646173, 0x64656E, 0x646572, 0x646965, 0x652064, 0x652073, 0x65696E, 0x656974, 0x656E20, 0x657220,
612     0x657320, 0x67656E, 0x68656E, 0x687420, 0x696368, 0x696520, 0x696E20, 0x696E65, 0x697420, 0x6C6963, 0x6C6C65, 0x6E2061, 0x6E2064, 0x6E2073, 0x6E6420, 0x6E6465,
613     0x6E6520, 0x6E6720, 0x6E6765, 0x6E7465, 0x722064, 0x726465, 0x726569, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x756E64, 0x756E67, 0x766572,
614 };
615 
// N-gram table that CharsetRecog_8859_1_es::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
616 static const int32_t ngrams_8859_1_es[] = {
617     0x206120, 0x206361, 0x20636F, 0x206465, 0x20656C, 0x20656E, 0x206573, 0x20696E, 0x206C61, 0x206C6F, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
618     0x20756E, 0x207920, 0x612063, 0x612064, 0x612065, 0x61206C, 0x612070, 0x616369, 0x61646F, 0x616C20, 0x617220, 0x617320, 0x6369F3, 0x636F6E, 0x646520, 0x64656C,
619     0x646F20, 0x652064, 0x652065, 0x65206C, 0x656C20, 0x656E20, 0x656E74, 0x657320, 0x657374, 0x69656E, 0x69F36E, 0x6C6120, 0x6C6F73, 0x6E2065, 0x6E7465, 0x6F2064,
620     0x6F2065, 0x6F6E20, 0x6F7220, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732064, 0x732065, 0x732070, 0x736520, 0x746520, 0x746F20, 0x756520, 0xF36E20,
621 };
622 
// N-gram table that CharsetRecog_8859_1_fr::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
623 static const int32_t ngrams_8859_1_fr[] = {
624     0x206175, 0x20636F, 0x206461, 0x206465, 0x206475, 0x20656E, 0x206574, 0x206C61, 0x206C65, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207365, 0x20736F, 0x20756E,
625     0x20E020, 0x616E74, 0x617469, 0x636520, 0x636F6E, 0x646520, 0x646573, 0x647520, 0x652061, 0x652063, 0x652064, 0x652065, 0x65206C, 0x652070, 0x652073, 0x656E20,
626     0x656E74, 0x657220, 0x657320, 0x657420, 0x657572, 0x696F6E, 0x697320, 0x697420, 0x6C6120, 0x6C6520, 0x6C6573, 0x6D656E, 0x6E2064, 0x6E6520, 0x6E7320, 0x6E7420,
627     0x6F6E20, 0x6F6E74, 0x6F7572, 0x717565, 0x72206C, 0x726520, 0x732061, 0x732064, 0x732065, 0x73206C, 0x732070, 0x742064, 0x746520, 0x74696F, 0x756520, 0x757220,
628 };
629 
// N-gram table that CharsetRecog_8859_1_it::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
630 static const int32_t ngrams_8859_1_it[] = {
631     0x20616C, 0x206368, 0x20636F, 0x206465, 0x206469, 0x206520, 0x20696C, 0x20696E, 0x206C61, 0x207065, 0x207072, 0x20756E, 0x612063, 0x612064, 0x612070, 0x612073,
632     0x61746F, 0x636865, 0x636F6E, 0x64656C, 0x646920, 0x652061, 0x652063, 0x652064, 0x652069, 0x65206C, 0x652070, 0x652073, 0x656C20, 0x656C6C, 0x656E74, 0x657220,
633     0x686520, 0x692061, 0x692063, 0x692064, 0x692073, 0x696120, 0x696C20, 0x696E20, 0x696F6E, 0x6C6120, 0x6C6520, 0x6C6920, 0x6C6C61, 0x6E6520, 0x6E6920, 0x6E6F20,
634     0x6E7465, 0x6F2061, 0x6F2064, 0x6F2069, 0x6F2073, 0x6F6E20, 0x6F6E65, 0x706572, 0x726120, 0x726520, 0x736920, 0x746120, 0x746520, 0x746920, 0x746F20, 0x7A696F,
635 };
636 
// N-gram table that CharsetRecog_8859_1_nl::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
637 static const int32_t ngrams_8859_1_nl[] = {
638     0x20616C, 0x206265, 0x206461, 0x206465, 0x206469, 0x206565, 0x20656E, 0x206765, 0x206865, 0x20696E, 0x206D61, 0x206D65, 0x206F70, 0x207465, 0x207661, 0x207665,
639     0x20766F, 0x207765, 0x207A69, 0x61616E, 0x616172, 0x616E20, 0x616E64, 0x617220, 0x617420, 0x636874, 0x646520, 0x64656E, 0x646572, 0x652062, 0x652076, 0x65656E,
640     0x656572, 0x656E20, 0x657220, 0x657273, 0x657420, 0x67656E, 0x686574, 0x696520, 0x696E20, 0x696E67, 0x697320, 0x6E2062, 0x6E2064, 0x6E2065, 0x6E2068, 0x6E206F,
641     0x6E2076, 0x6E6465, 0x6E6720, 0x6F6E64, 0x6F6F72, 0x6F7020, 0x6F7220, 0x736368, 0x737465, 0x742064, 0x746520, 0x74656E, 0x746572, 0x76616E, 0x766572, 0x766F6F,
642 };
643 
// N-gram table that CharsetRecog_8859_1_no::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
644 static const int32_t ngrams_8859_1_no[] = {
645     0x206174, 0x206176, 0x206465, 0x20656E, 0x206572, 0x20666F, 0x206861, 0x206920, 0x206D65, 0x206F67, 0x2070E5, 0x207365, 0x20736B, 0x20736F, 0x207374, 0x207469,
646     0x207669, 0x20E520, 0x616E64, 0x617220, 0x617420, 0x646520, 0x64656E, 0x646574, 0x652073, 0x656420, 0x656E20, 0x656E65, 0x657220, 0x657265, 0x657420, 0x657474,
647     0x666F72, 0x67656E, 0x696B6B, 0x696C20, 0x696E67, 0x6B6520, 0x6B6B65, 0x6C6520, 0x6C6C65, 0x6D6564, 0x6D656E, 0x6E2073, 0x6E6520, 0x6E6720, 0x6E6765, 0x6E6E65,
648     0x6F6720, 0x6F6D20, 0x6F7220, 0x70E520, 0x722073, 0x726520, 0x736F6D, 0x737465, 0x742073, 0x746520, 0x74656E, 0x746572, 0x74696C, 0x747420, 0x747465, 0x766572,
649 };
650 
// N-gram table that CharsetRecog_8859_1_pt::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
651 static const int32_t ngrams_8859_1_pt[] = {
652     0x206120, 0x20636F, 0x206461, 0x206465, 0x20646F, 0x206520, 0x206573, 0x206D61, 0x206E6F, 0x206F20, 0x207061, 0x20706F, 0x207072, 0x207175, 0x207265, 0x207365,
653     0x20756D, 0x612061, 0x612063, 0x612064, 0x612070, 0x616465, 0x61646F, 0x616C20, 0x617220, 0x617261, 0x617320, 0x636F6D, 0x636F6E, 0x646120, 0x646520, 0x646F20,
654     0x646F73, 0x652061, 0x652064, 0x656D20, 0x656E74, 0x657320, 0x657374, 0x696120, 0x696361, 0x6D656E, 0x6E7465, 0x6E746F, 0x6F2061, 0x6F2063, 0x6F2064, 0x6F2065,
655     0x6F2070, 0x6F7320, 0x706172, 0x717565, 0x726120, 0x726573, 0x732061, 0x732064, 0x732065, 0x732070, 0x737461, 0x746520, 0x746F20, 0x756520, 0xE36F20, 0xE7E36F,
656 };
657 
// N-gram table that CharsetRecog_8859_1_sv::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
658 static const int32_t ngrams_8859_1_sv[] = {
659     0x206174, 0x206176, 0x206465, 0x20656E, 0x2066F6, 0x206861, 0x206920, 0x20696E, 0x206B6F, 0x206D65, 0x206F63, 0x2070E5, 0x20736B, 0x20736F, 0x207374, 0x207469,
660     0x207661, 0x207669, 0x20E472, 0x616465, 0x616E20, 0x616E64, 0x617220, 0x617474, 0x636820, 0x646520, 0x64656E, 0x646572, 0x646574, 0x656420, 0x656E20, 0x657220,
661     0x657420, 0x66F672, 0x67656E, 0x696C6C, 0x696E67, 0x6B6120, 0x6C6C20, 0x6D6564, 0x6E2073, 0x6E6120, 0x6E6465, 0x6E6720, 0x6E6765, 0x6E696E, 0x6F6368, 0x6F6D20,
662     0x6F6E20, 0x70E520, 0x722061, 0x722073, 0x726120, 0x736B61, 0x736F6D, 0x742073, 0x746120, 0x746520, 0x746572, 0x74696C, 0x747420, 0x766172, 0xE47220, 0xF67220,
663 };
664 
// N-gram table that CharsetRecog_8859_2_cs::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
665 static const int32_t ngrams_8859_2_cs[] = {
666     0x206120, 0x206279, 0x20646F, 0x206A65, 0x206E61, 0x206E65, 0x206F20, 0x206F64, 0x20706F, 0x207072, 0x2070F8, 0x20726F, 0x207365, 0x20736F, 0x207374, 0x20746F,
667     0x207620, 0x207679, 0x207A61, 0x612070, 0x636520, 0x636820, 0x652070, 0x652073, 0x652076, 0x656D20, 0x656EED, 0x686F20, 0x686F64, 0x697374, 0x6A6520, 0x6B7465,
668     0x6C6520, 0x6C6920, 0x6E6120, 0x6EE920, 0x6EEC20, 0x6EED20, 0x6F2070, 0x6F646E, 0x6F6A69, 0x6F7374, 0x6F7520, 0x6F7661, 0x706F64, 0x706F6A, 0x70726F, 0x70F865,
669     0x736520, 0x736F75, 0x737461, 0x737469, 0x73746E, 0x746572, 0x746EED, 0x746F20, 0x752070, 0xBE6520, 0xE16EED, 0xE9686F, 0xED2070, 0xED2073, 0xED6D20, 0xF86564,
670 };
671 
// N-gram table that CharsetRecog_8859_2_hu::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
672 static const int32_t ngrams_8859_2_hu[] = {
673     0x206120, 0x20617A, 0x206265, 0x206567, 0x20656C, 0x206665, 0x206861, 0x20686F, 0x206973, 0x206B65, 0x206B69, 0x206BF6, 0x206C65, 0x206D61, 0x206D65, 0x206D69,
674     0x206E65, 0x20737A, 0x207465, 0x20E973, 0x612061, 0x61206B, 0x61206D, 0x612073, 0x616B20, 0x616E20, 0x617A20, 0x62616E, 0x62656E, 0x656779, 0x656B20, 0x656C20,
675     0x656C65, 0x656D20, 0x656E20, 0x657265, 0x657420, 0x657465, 0x657474, 0x677920, 0x686F67, 0x696E74, 0x697320, 0x6B2061, 0x6BF67A, 0x6D6567, 0x6D696E, 0x6E2061,
676     0x6E616B, 0x6E656B, 0x6E656D, 0x6E7420, 0x6F6779, 0x732061, 0x737A65, 0x737A74, 0x737AE1, 0x73E967, 0x742061, 0x747420, 0x74E173, 0x7A6572, 0xE16E20, 0xE97320,
677 };
678 
// N-gram table that CharsetRecog_8859_2_pl::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
679 static const int32_t ngrams_8859_2_pl[] = {
680     0x20637A, 0x20646F, 0x206920, 0x206A65, 0x206B6F, 0x206D61, 0x206D69, 0x206E61, 0x206E69, 0x206F64, 0x20706F, 0x207072, 0x207369, 0x207720, 0x207769, 0x207779,
681     0x207A20, 0x207A61, 0x612070, 0x612077, 0x616E69, 0x636820, 0x637A65, 0x637A79, 0x646F20, 0x647A69, 0x652070, 0x652073, 0x652077, 0x65207A, 0x65676F, 0x656A20,
682     0x656D20, 0x656E69, 0x676F20, 0x696120, 0x696520, 0x69656A, 0x6B6120, 0x6B6920, 0x6B6965, 0x6D6965, 0x6E6120, 0x6E6961, 0x6E6965, 0x6F2070, 0x6F7761, 0x6F7769,
683     0x706F6C, 0x707261, 0x70726F, 0x70727A, 0x727A65, 0x727A79, 0x7369EA, 0x736B69, 0x737461, 0x776965, 0x796368, 0x796D20, 0x7A6520, 0x7A6965, 0x7A7920, 0xF37720,
684 };
685 
// N-gram table that CharsetRecog_8859_2_ro::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
686 static const int32_t ngrams_8859_2_ro[] = {
687     0x206120, 0x206163, 0x206361, 0x206365, 0x20636F, 0x206375, 0x206465, 0x206469, 0x206C61, 0x206D61, 0x207065, 0x207072, 0x207365, 0x2073E3, 0x20756E, 0x20BA69,
688     0x20EE6E, 0x612063, 0x612064, 0x617265, 0x617420, 0x617465, 0x617520, 0x636172, 0x636F6E, 0x637520, 0x63E320, 0x646520, 0x652061, 0x652063, 0x652064, 0x652070,
689     0x652073, 0x656120, 0x656920, 0x656C65, 0x656E74, 0x657374, 0x692061, 0x692063, 0x692064, 0x692070, 0x696520, 0x696920, 0x696E20, 0x6C6120, 0x6C6520, 0x6C6F72,
690     0x6C7569, 0x6E6520, 0x6E7472, 0x6F7220, 0x70656E, 0x726520, 0x726561, 0x727520, 0x73E320, 0x746520, 0x747275, 0x74E320, 0x756920, 0x756C20, 0xBA6920, 0xEE6E20,
691 };
692 
// N-gram table that CharsetRecog_8859_5_ru::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
693 static const int32_t ngrams_8859_5_ru[] = {
694     0x20D220, 0x20D2DE, 0x20D4DE, 0x20D7D0, 0x20D820, 0x20DAD0, 0x20DADE, 0x20DDD0, 0x20DDD5, 0x20DED1, 0x20DFDE, 0x20DFE0, 0x20E0D0, 0x20E1DE, 0x20E1E2, 0x20E2DE,
695     0x20E7E2, 0x20EDE2, 0xD0DDD8, 0xD0E2EC, 0xD3DE20, 0xD5DBEC, 0xD5DDD8, 0xD5E1E2, 0xD5E220, 0xD820DF, 0xD8D520, 0xD8D820, 0xD8EF20, 0xDBD5DD, 0xDBD820, 0xDBECDD,
696     0xDDD020, 0xDDD520, 0xDDD8D5, 0xDDD8EF, 0xDDDE20, 0xDDDED2, 0xDE20D2, 0xDE20DF, 0xDE20E1, 0xDED220, 0xDED2D0, 0xDED3DE, 0xDED920, 0xDEDBEC, 0xDEDC20, 0xDEE1E2,
697     0xDFDEDB, 0xDFE0D5, 0xDFE0D8, 0xDFE0DE, 0xE0D0D2, 0xE0D5D4, 0xE1E2D0, 0xE1E2D2, 0xE1E2D8, 0xE1EF20, 0xE2D5DB, 0xE2DE20, 0xE2DEE0, 0xE2EC20, 0xE7E2DE, 0xEBE520,
698 };
699 
// N-gram table that CharsetRecog_8859_6_ar::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
700 static const int32_t ngrams_8859_6_ar[] = {
701     0x20C7E4, 0x20C7E6, 0x20C8C7, 0x20D9E4, 0x20E1EA, 0x20E4E4, 0x20E5E6, 0x20E8C7, 0xC720C7, 0xC7C120, 0xC7CA20, 0xC7D120, 0xC7E420, 0xC7E4C3, 0xC7E4C7, 0xC7E4C8,
702     0xC7E4CA, 0xC7E4CC, 0xC7E4CD, 0xC7E4CF, 0xC7E4D3, 0xC7E4D9, 0xC7E4E2, 0xC7E4E5, 0xC7E4E8, 0xC7E4EA, 0xC7E520, 0xC7E620, 0xC7E6CA, 0xC820C7, 0xC920C7, 0xC920E1,
703     0xC920E4, 0xC920E5, 0xC920E8, 0xCA20C7, 0xCF20C7, 0xCFC920, 0xD120C7, 0xD1C920, 0xD320C7, 0xD920C7, 0xD9E4E9, 0xE1EA20, 0xE420C7, 0xE4C920, 0xE4E920, 0xE4EA20,
704     0xE520C7, 0xE5C720, 0xE5C920, 0xE5E620, 0xE620C7, 0xE720C7, 0xE7C720, 0xE8C7E4, 0xE8E620, 0xE920C7, 0xEA20C7, 0xEA20E5, 0xEA20E8, 0xEAC920, 0xEAD120, 0xEAE620,
705 };
706 
// N-gram table that CharsetRecog_8859_7_el::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
707 static const int32_t ngrams_8859_7_el[] = {
708     0x20E1ED, 0x20E1F0, 0x20E3E9, 0x20E4E9, 0x20E5F0, 0x20E720, 0x20EAE1, 0x20ECE5, 0x20EDE1, 0x20EF20, 0x20F0E1, 0x20F0EF, 0x20F0F1, 0x20F3F4, 0x20F3F5, 0x20F4E7,
709     0x20F4EF, 0xDFE120, 0xE120E1, 0xE120F4, 0xE1E920, 0xE1ED20, 0xE1F0FC, 0xE1F220, 0xE3E9E1, 0xE5E920, 0xE5F220, 0xE720F4, 0xE7ED20, 0xE7F220, 0xE920F4, 0xE9E120,
710     0xE9EADE, 0xE9F220, 0xEAE1E9, 0xEAE1F4, 0xECE520, 0xED20E1, 0xED20E5, 0xED20F0, 0xEDE120, 0xEFF220, 0xEFF520, 0xF0EFF5, 0xF0F1EF, 0xF0FC20, 0xF220E1, 0xF220E5,
711     0xF220EA, 0xF220F0, 0xF220F4, 0xF3E520, 0xF3E720, 0xF3F4EF, 0xF4E120, 0xF4E1E9, 0xF4E7ED, 0xF4E7F2, 0xF4E9EA, 0xF4EF20, 0xF4EFF5, 0xF4F9ED, 0xF9ED20, 0xFEED20,
712 };
713 
// N-gram table that CharsetRecog_8859_8_I_he::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
714 static const int32_t ngrams_8859_8_I_he[] = {
715     0x20E0E5, 0x20E0E7, 0x20E0E9, 0x20E0FA, 0x20E1E9, 0x20E1EE, 0x20E4E0, 0x20E4E5, 0x20E4E9, 0x20E4EE, 0x20E4F2, 0x20E4F9, 0x20E4FA, 0x20ECE0, 0x20ECE4, 0x20EEE0,
716     0x20F2EC, 0x20F9EC, 0xE0FA20, 0xE420E0, 0xE420E1, 0xE420E4, 0xE420EC, 0xE420EE, 0xE420F9, 0xE4E5E0, 0xE5E020, 0xE5ED20, 0xE5EF20, 0xE5F820, 0xE5FA20, 0xE920E4,
717     0xE9E420, 0xE9E5FA, 0xE9E9ED, 0xE9ED20, 0xE9EF20, 0xE9F820, 0xE9FA20, 0xEC20E0, 0xEC20E4, 0xECE020, 0xECE420, 0xED20E0, 0xED20E1, 0xED20E4, 0xED20EC, 0xED20EE,
718     0xED20F9, 0xEEE420, 0xEF20E4, 0xF0E420, 0xF0E920, 0xF0E9ED, 0xF2EC20, 0xF820E4, 0xF8E9ED, 0xF9EC20, 0xFA20E0, 0xFA20E1, 0xFA20E4, 0xFA20EC, 0xFA20EE, 0xFA20F9,
719 };
720 
// N-gram table that CharsetRecog_8859_8_he::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
721 static const int32_t ngrams_8859_8_he[] = {
722     0x20E0E5, 0x20E0EC, 0x20E4E9, 0x20E4EC, 0x20E4EE, 0x20E4F0, 0x20E9F0, 0x20ECF2, 0x20ECF9, 0x20EDE5, 0x20EDE9, 0x20EFE5, 0x20EFE9, 0x20F8E5, 0x20F8E9, 0x20FAE0,
723     0x20FAE5, 0x20FAE9, 0xE020E4, 0xE020EC, 0xE020ED, 0xE020FA, 0xE0E420, 0xE0E5E4, 0xE0EC20, 0xE0EE20, 0xE120E4, 0xE120ED, 0xE120FA, 0xE420E4, 0xE420E9, 0xE420EC,
724     0xE420ED, 0xE420EF, 0xE420F8, 0xE420FA, 0xE4EC20, 0xE5E020, 0xE5E420, 0xE7E020, 0xE9E020, 0xE9E120, 0xE9E420, 0xEC20E4, 0xEC20ED, 0xEC20FA, 0xECF220, 0xECF920,
725     0xEDE9E9, 0xEDE9F0, 0xEDE9F8, 0xEE20E4, 0xEE20ED, 0xEE20FA, 0xEEE120, 0xEEE420, 0xF2E420, 0xF920E4, 0xF920ED, 0xF920FA, 0xF9E420, 0xFAE020, 0xFAE420, 0xFAE5E9,
726 };
727 
// N-gram table that CharsetRecog_8859_9_tr::match() passes to match_sbcs().
// 64 three-byte values in ascending order. NGramParser::search() binary-searches
// a table of exactly 64 entries (presumably this one, reached through
// match_sbcs), so keep both the count and the ordering unchanged.
728 static const int32_t ngrams_8859_9_tr[] = {
729     0x206261, 0x206269, 0x206275, 0x206461, 0x206465, 0x206765, 0x206861, 0x20696C, 0x206B61, 0x206B6F, 0x206D61, 0x206F6C, 0x207361, 0x207461, 0x207665, 0x207961,
730     0x612062, 0x616B20, 0x616C61, 0x616D61, 0x616E20, 0x616EFD, 0x617220, 0x617261, 0x6172FD, 0x6173FD, 0x617961, 0x626972, 0x646120, 0x646520, 0x646920, 0x652062,
731     0x65206B, 0x656469, 0x656E20, 0x657220, 0x657269, 0x657369, 0x696C65, 0x696E20, 0x696E69, 0x697220, 0x6C616E, 0x6C6172, 0x6C6520, 0x6C6572, 0x6E2061, 0x6E2062,
732     0x6E206B, 0x6E6461, 0x6E6465, 0x6E6520, 0x6E6920, 0x6E696E, 0x6EFD20, 0x72696E, 0x72FD6E, 0x766520, 0x796120, 0x796F72, 0xFD6E20, 0xFD6E64, 0xFD6EFD, 0xFDF0FD,
733 };
734 
~CharsetRecog_8859_1()735 CharsetRecog_8859_1::~CharsetRecog_8859_1()
736 {
737     // nothing to do
738 }
739 
getName() const740 const char *CharsetRecog_8859_1::getName() const
741 {
742     return haveC1Bytes? "windows-1252" : "ISO-8859-1";
743 }
744 
getLanguage() const745 const char *CharsetRecog_8859_1_en::getLanguage() const
746 {
747     return "en";
748 }
749 
~CharsetRecog_8859_1_en()750 CharsetRecog_8859_1_en::~CharsetRecog_8859_1_en()
751 {
752     // nothing to do
753 }
754 
match(InputText * textIn)755 int32_t CharsetRecog_8859_1_en::match(InputText *textIn)
756 {
757     int32_t result = match_sbcs(textIn, ngrams_8859_1_en, charMap_8859_1);
758 
759    // printf("8859_1_en: result = %d\n", result);
760     return result; //match_sbcs(textIn, ngrams, charMap);
761 }
762 
~CharsetRecog_8859_1_da()763 CharsetRecog_8859_1_da::~CharsetRecog_8859_1_da()
764 {
765     // nothing to do
766 }
767 
getLanguage() const768 const char *CharsetRecog_8859_1_da::getLanguage() const
769 {
770     return "da";
771 }
772 
match(InputText * textIn)773 int32_t CharsetRecog_8859_1_da::match(InputText *textIn)
774 {
775     return match_sbcs(textIn, ngrams_8859_1_da, charMap_8859_1);
776 }
777 
~CharsetRecog_8859_1_de()778 CharsetRecog_8859_1_de::~CharsetRecog_8859_1_de() {}
779 
getLanguage() const780 const char *CharsetRecog_8859_1_de::getLanguage() const
781 {
782     return "de";
783 }
784 
match(InputText * textIn)785 int32_t CharsetRecog_8859_1_de::match(InputText *textIn)
786 {
787     return match_sbcs(textIn, ngrams_8859_1_de, charMap_8859_1);
788 }
789 
~CharsetRecog_8859_1_es()790 CharsetRecog_8859_1_es::~CharsetRecog_8859_1_es()
791 {
792     // nothing to do
793 }
794 
getLanguage() const795 const char *CharsetRecog_8859_1_es::getLanguage() const
796 {
797     return "es";
798 }
799 
match(InputText * textIn)800 int32_t CharsetRecog_8859_1_es::match(InputText *textIn)
801 {
802     return match_sbcs(textIn, ngrams_8859_1_es, charMap_8859_1);
803 }
804 
~CharsetRecog_8859_1_fr()805 CharsetRecog_8859_1_fr::~CharsetRecog_8859_1_fr()
806 {
807     // nothing to do
808 }
809 
getLanguage() const810 const char *CharsetRecog_8859_1_fr::getLanguage() const
811 {
812     return "fr";
813 }
814 
match(InputText * textIn)815 int32_t CharsetRecog_8859_1_fr::match(InputText *textIn)
816 {
817     return match_sbcs(textIn, ngrams_8859_1_fr, charMap_8859_1);
818 }
819 
~CharsetRecog_8859_1_it()820 CharsetRecog_8859_1_it::~CharsetRecog_8859_1_it()
821 {
822     // nothing to do
823 }
824 
getLanguage() const825 const char *CharsetRecog_8859_1_it::getLanguage() const
826 {
827     return "it";
828 }
829 
match(InputText * textIn)830 int32_t CharsetRecog_8859_1_it::match(InputText *textIn)
831 {
832     return match_sbcs(textIn, ngrams_8859_1_it, charMap_8859_1);
833 }
834 
~CharsetRecog_8859_1_nl()835 CharsetRecog_8859_1_nl::~CharsetRecog_8859_1_nl()
836 {
837     // nothing to do
838 }
839 
getLanguage() const840 const char *CharsetRecog_8859_1_nl::getLanguage() const
841 {
842     return "nl";
843 }
844 
match(InputText * textIn)845 int32_t CharsetRecog_8859_1_nl::match(InputText *textIn)
846 {
847     return match_sbcs(textIn, ngrams_8859_1_nl, charMap_8859_1);
848 }
849 
~CharsetRecog_8859_1_no()850 CharsetRecog_8859_1_no::~CharsetRecog_8859_1_no() {}
851 
getLanguage() const852 const char *CharsetRecog_8859_1_no::getLanguage() const
853 {
854     return "no";
855 }
856 
match(InputText * textIn)857 int32_t CharsetRecog_8859_1_no::match(InputText *textIn)
858 {
859     return match_sbcs(textIn, ngrams_8859_1_no, charMap_8859_1);
860 }
861 
~CharsetRecog_8859_1_pt()862 CharsetRecog_8859_1_pt::~CharsetRecog_8859_1_pt()
863 {
864     // nothing to do
865 }
866 
getLanguage() const867 const char *CharsetRecog_8859_1_pt::getLanguage() const
868 {
869     return "pt";
870 }
871 
match(InputText * textIn)872 int32_t CharsetRecog_8859_1_pt::match(InputText *textIn)
873 {
874     return match_sbcs(textIn, ngrams_8859_1_pt, charMap_8859_1);
875 }
876 
~CharsetRecog_8859_1_sv()877 CharsetRecog_8859_1_sv::~CharsetRecog_8859_1_sv() {}
878 
getLanguage() const879 const char *CharsetRecog_8859_1_sv::getLanguage() const
880 {
881     return "sv";
882 }
883 
match(InputText * textIn)884 int32_t CharsetRecog_8859_1_sv::match(InputText *textIn)
885 {
886     return match_sbcs(textIn, ngrams_8859_1_sv, charMap_8859_1);
887 }
888 
~CharsetRecog_8859_2()889 CharsetRecog_8859_2::~CharsetRecog_8859_2()
890 {
891     // nothing to do
892 }
893 
getName() const894 const char *CharsetRecog_8859_2::getName() const
895 {
896     return haveC1Bytes? "windows-1250" : "ISO-8859-2";
897 }
898 
~CharsetRecog_8859_2_cs()899 CharsetRecog_8859_2_cs::~CharsetRecog_8859_2_cs()
900 {
901     // nothing to do
902 }
903 
getLanguage() const904 const char *CharsetRecog_8859_2_cs::getLanguage() const
905 {
906     return "cs";
907 }
908 
match(InputText * textIn)909 int32_t CharsetRecog_8859_2_cs::match(InputText *textIn)
910 {
911     return match_sbcs(textIn, ngrams_8859_2_cs, charMap_8859_2);
912 }
913 
~CharsetRecog_8859_2_hu()914 CharsetRecog_8859_2_hu::~CharsetRecog_8859_2_hu()
915 {
916     // nothing to do
917 }
918 
getLanguage() const919 const char *CharsetRecog_8859_2_hu::getLanguage() const
920 {
921     return "hu";
922 }
923 
match(InputText * textIn)924 int32_t CharsetRecog_8859_2_hu::match(InputText *textIn)
925 {
926     return match_sbcs(textIn, ngrams_8859_2_hu, charMap_8859_2);
927 }
928 
~CharsetRecog_8859_2_pl()929 CharsetRecog_8859_2_pl::~CharsetRecog_8859_2_pl()
930 {
931     // nothing to do
932 }
933 
getLanguage() const934 const char *CharsetRecog_8859_2_pl::getLanguage() const
935 {
936     return "pl";
937 }
938 
match(InputText * textIn)939 int32_t CharsetRecog_8859_2_pl::match(InputText *textIn)
940 {
941     return match_sbcs(textIn, ngrams_8859_2_pl, charMap_8859_2);
942 }
943 
~CharsetRecog_8859_2_ro()944 CharsetRecog_8859_2_ro::~CharsetRecog_8859_2_ro()
945 {
946     // nothing to do
947 }
948 
getLanguage() const949 const char *CharsetRecog_8859_2_ro::getLanguage() const
950 {
951     return "ro";
952 }
953 
match(InputText * textIn)954 int32_t CharsetRecog_8859_2_ro::match(InputText *textIn)
955 {
956     return match_sbcs(textIn, ngrams_8859_2_ro, charMap_8859_2);
957 }
958 
~CharsetRecog_8859_5()959 CharsetRecog_8859_5::~CharsetRecog_8859_5()
960 {
961     // nothing to do
962 }
963 
getName() const964 const char *CharsetRecog_8859_5::getName() const
965 {
966     return "ISO-8859-5";
967 }
968 
~CharsetRecog_8859_5_ru()969 CharsetRecog_8859_5_ru::~CharsetRecog_8859_5_ru()
970 {
971     // nothing to do
972 }
973 
getLanguage() const974 const char *CharsetRecog_8859_5_ru::getLanguage() const
975 {
976     return "ru";
977 }
978 
match(InputText * textIn)979 int32_t CharsetRecog_8859_5_ru::match(InputText *textIn)
980 {
981     return match_sbcs(textIn, ngrams_8859_5_ru, charMap_8859_5);
982 }
983 
~CharsetRecog_8859_6()984 CharsetRecog_8859_6::~CharsetRecog_8859_6()
985 {
986     // nothing to do
987 }
988 
getName() const989 const char *CharsetRecog_8859_6::getName() const
990 {
991     return "ISO-8859-6";
992 }
993 
~CharsetRecog_8859_6_ar()994 CharsetRecog_8859_6_ar::~CharsetRecog_8859_6_ar()
995 {
996     // nothing to do
997 }
998 
getLanguage() const999 const char *CharsetRecog_8859_6_ar::getLanguage() const
1000 {
1001     return "ar";
1002 }
1003 
match(InputText * textIn)1004 int32_t CharsetRecog_8859_6_ar::match(InputText *textIn)
1005 {
1006     return match_sbcs(textIn, ngrams_8859_6_ar, charMap_8859_6);
1007 }
1008 
~CharsetRecog_8859_7()1009 CharsetRecog_8859_7::~CharsetRecog_8859_7()
1010 {
1011     // nothing to do
1012 }
1013 
getName() const1014 const char *CharsetRecog_8859_7::getName() const
1015 {
1016     return haveC1Bytes? "windows-1253" : "ISO-8859-7";
1017 }
1018 
~CharsetRecog_8859_7_el()1019 CharsetRecog_8859_7_el::~CharsetRecog_8859_7_el()
1020 {
1021     // nothing to do
1022 }
1023 
getLanguage() const1024 const char *CharsetRecog_8859_7_el::getLanguage() const
1025 {
1026     return "el";
1027 }
1028 
match(InputText * textIn)1029 int32_t CharsetRecog_8859_7_el::match(InputText *textIn)
1030 {
1031     return match_sbcs(textIn, ngrams_8859_7_el, charMap_8859_7);
1032 }
1033 
~CharsetRecog_8859_8()1034 CharsetRecog_8859_8::~CharsetRecog_8859_8()
1035 {
1036     // nothing to do
1037 }
1038 
getName() const1039 const char *CharsetRecog_8859_8::getName() const
1040 {
1041     return haveC1Bytes? "windows-1255" : "ISO-8859-8";
1042 }
1043 
~CharsetRecog_8859_8_I_he()1044 CharsetRecog_8859_8_I_he::~CharsetRecog_8859_8_I_he ()
1045 {
1046     // nothing to do
1047 }
1048 
getName() const1049 const char *CharsetRecog_8859_8_I_he::getName() const
1050 {
1051     return haveC1Bytes? "windows-1255" : "ISO-8859-8-I";
1052 }
1053 
getLanguage() const1054 const char *CharsetRecog_8859_8_I_he::getLanguage() const
1055 {
1056     return "he";
1057 }
1058 
match(InputText * textIn)1059 int32_t CharsetRecog_8859_8_I_he::match(InputText *textIn)
1060 {
1061     return match_sbcs(textIn, ngrams_8859_8_I_he, charMap_8859_8);
1062 }
1063 
~CharsetRecog_8859_8_he()1064 CharsetRecog_8859_8_he::~CharsetRecog_8859_8_he()
1065 {
1066     // od ot gnihton
1067 }
1068 
getLanguage() const1069 const char *CharsetRecog_8859_8_he::getLanguage() const
1070 {
1071     return "he";
1072 }
1073 
match(InputText * textIn)1074 int32_t CharsetRecog_8859_8_he::match(InputText *textIn)
1075 {
1076     return match_sbcs(textIn, ngrams_8859_8_he, charMap_8859_8);
1077 }
1078 
~CharsetRecog_8859_9()1079 CharsetRecog_8859_9::~CharsetRecog_8859_9()
1080 {
1081     // nothing to do
1082 }
1083 
getName() const1084 const char *CharsetRecog_8859_9::getName() const
1085 {
1086     return haveC1Bytes? "windows-1254" : "ISO-8859-9";
1087 }
1088 
~CharsetRecog_8859_9_tr()1089 CharsetRecog_8859_9_tr::~CharsetRecog_8859_9_tr ()
1090 {
1091     // nothing to do
1092 }
1093 
getLanguage() const1094 const char *CharsetRecog_8859_9_tr::getLanguage() const
1095 {
1096     return "tr";
1097 }
1098 
match(InputText * textIn)1099 int32_t CharsetRecog_8859_9_tr::match(InputText *textIn)
1100 {
1101     return match_sbcs(textIn, ngrams_8859_9_tr, charMap_8859_9);
1102 }
1103 
~CharsetRecog_windows_1256()1104 CharsetRecog_windows_1256::~CharsetRecog_windows_1256()
1105 {
1106     // nothing to do
1107 }
1108 
getName() const1109 const char *CharsetRecog_windows_1256::getName() const
1110 {
1111     return  "windows-1256";
1112 }
1113 
getLanguage() const1114 const char *CharsetRecog_windows_1256::getLanguage() const
1115 {
1116     return "ar";
1117 }
1118 
match(InputText * textIn)1119 int32_t CharsetRecog_windows_1256::match(InputText *textIn)
1120 {
1121     return match_sbcs(textIn, ngrams_windows_1256, charMap_windows_1256);
1122 }
1123 
~CharsetRecog_windows_1251()1124 CharsetRecog_windows_1251::~CharsetRecog_windows_1251()
1125 {
1126     // nothing to do
1127 }
1128 
getName() const1129 const char *CharsetRecog_windows_1251::getName() const
1130 {
1131     return  "windows-1251";
1132 }
1133 
getLanguage() const1134 const char *CharsetRecog_windows_1251::getLanguage() const
1135 {
1136     return "ru";
1137 }
1138 
match(InputText * textIn)1139 int32_t CharsetRecog_windows_1251::match(InputText *textIn)
1140 {
1141     return match_sbcs(textIn, ngrams_windows_1251, charMap_windows_1251);
1142 }
1143 
~CharsetRecog_KOI8_R()1144 CharsetRecog_KOI8_R::~CharsetRecog_KOI8_R()
1145 {
1146     // nothing to do
1147 }
1148 
getName() const1149 const char *CharsetRecog_KOI8_R::getName() const
1150 {
1151     return  "KOI8-R";
1152 }
1153 
getLanguage() const1154 const char *CharsetRecog_KOI8_R::getLanguage() const
1155 {
1156     return "ru";
1157 }
1158 
match(InputText * textIn)1159 int32_t CharsetRecog_KOI8_R::match(InputText *textIn)
1160 {
1161     return match_sbcs(textIn, ngrams_KOI8_R, charMap_KOI8_R);
1162 }
1163 
~CharsetRecog_IBM424_he()1164 CharsetRecog_IBM424_he::~CharsetRecog_IBM424_he()
1165 {
1166     // nothing to do
1167 }
1168 
getLanguage() const1169 const char *CharsetRecog_IBM424_he::getLanguage() const
1170 {
1171     return "he";
1172 }
1173 
~CharsetRecog_IBM424_he_rtl()1174 CharsetRecog_IBM424_he_rtl::~CharsetRecog_IBM424_he_rtl()
1175 {
1176     // nothing to do
1177 }
1178 
getName() const1179 const char *CharsetRecog_IBM424_he_rtl::getName() const
1180 {
1181     return  "IBM424_rtl";
1182 }
1183 
match(InputText * textIn)1184 int32_t CharsetRecog_IBM424_he_rtl::match(InputText *textIn)
1185 {
1186     return match_sbcs(textIn, ngrams_IBM424_he_rtl, charMap_IBM424_he);
1187 }
1188 
~CharsetRecog_IBM424_he_ltr()1189 CharsetRecog_IBM424_he_ltr::~CharsetRecog_IBM424_he_ltr()
1190 {
1191     // nothing to do
1192 }
1193 
getName() const1194 const char *CharsetRecog_IBM424_he_ltr::getName() const
1195 {
1196     return  "IBM424_ltr";
1197 }
1198 
match(InputText * textIn)1199 int32_t CharsetRecog_IBM424_he_ltr::match(InputText *textIn)
1200 {
1201     return match_sbcs(textIn, ngrams_IBM424_he_ltr, charMap_IBM424_he);
1202 }
1203 
// Byte-to-byte substitution table used by CharsetRecog_IBM420_ar::unshape():
// each byte of the buffer is replaced by unshapeMap_IBM420[byte].
// 256 entries, laid out as 16 rows (high nibble) by 16 columns (low nibble).
// Most bytes map to themselves; 0x00-0x3F and a few others map to 0x40, and
// some bytes are folded onto another code (for example 0x43 -> 0x42).
// NOTE(review): presumably the folded pairs are contextual shapes of the same
// Arabic letter in code page 420 - confirm against the code page chart.
1204 static const uint8_t unshapeMap_IBM420[] = {
1205 /*           -0    -1    -2    -3    -4    -5    -6    -7    -8    -9    -A    -B    -C    -D    -E    -F   */
1206 /* 0- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
1207 /* 1- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
1208 /* 2- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
1209 /* 3- */    0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x40,
1210 /* 4- */    0x40, 0x40, 0x42, 0x42, 0x44, 0x45, 0x46, 0x47, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F,
1211 /* 5- */    0x50, 0x49, 0x52, 0x53, 0x54, 0x55, 0x56, 0x56, 0x58, 0x58, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
1212 /* 6- */    0x60, 0x61, 0x62, 0x63, 0x63, 0x65, 0x65, 0x67, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
1213 /* 7- */    0x69, 0x71, 0x71, 0x73, 0x74, 0x75, 0x76, 0x77, 0x77, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
1214 /* 8- */    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x80, 0x8B, 0x8B, 0x8D, 0x8D, 0x8F,
1215 /* 9- */    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9A, 0x9A, 0x9A, 0x9E, 0x9E,
1216 /* A- */    0x9E, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0x9E, 0xAB, 0xAB, 0xAD, 0xAD, 0xAF,
1217 /* B- */    0xAF, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xB1, 0xBB, 0xBB, 0xBD, 0xBD, 0xBF,
1218 /* C- */    0xC0, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8, 0xC9, 0xCA, 0xBF, 0xCC, 0xBF, 0xCE, 0xCF,
1219 /* D- */    0xD0, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6, 0xD7, 0xD8, 0xD9, 0xDA, 0xDA, 0xDC, 0xDC, 0xDC, 0xDF,
1220 /* E- */    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
1221 /* F- */    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
1222 };
1223 
~CharsetRecog_IBM420_ar()1224 CharsetRecog_IBM420_ar::~CharsetRecog_IBM420_ar()
1225 {
1226     // nothing to do
1227 }
1228 
getLanguage() const1229 const char *CharsetRecog_IBM420_ar::getLanguage() const
1230 {
1231     return "ar";
1232 }
1233 
matchInit(InputText * textIn)1234 void CharsetRecog_IBM420_ar::matchInit(InputText *textIn) {
1235     prev_fInputBytesLength = textIn->fInputLen;
1236     prev_fInputBytes = textIn->fInputBytes;
1237 
1238     int32_t length = 0;
1239     uint8_t *bb = unshape(prev_fInputBytes, prev_fInputBytesLength, length);
1240 
1241     if (bb != NULL) {
1242         textIn->fInputBytes = bb;
1243         textIn->fInputLen = length;
1244 
1245         deleteBuffer = TRUE;
1246     } else {
1247         deleteBuffer = FALSE;
1248     }
1249 }
1250 
/*
 * Build an unshaped copy of inputBytes.
 *
 * The input is first expanded by unshapeLamAlef(), then every byte of the
 * expanded buffer is replaced through unshapeMap_IBM420.
 *
 * inputBytes        bytes to unshape
 * inputBytesLength  number of bytes in inputBytes
 * length            receives the number of bytes in the returned buffer;
 *                   only meaningful when the return value is not NULL
 *
 * Returns the buffer allocated by unshapeLamAlef(), or NULL if it returned
 * NULL.
 */
uint8_t *CharsetRecog_IBM420_ar::unshape(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
    uint8_t *resultArray = unshapeLamAlef(inputBytes, inputBytesLength, length);

    if (resultArray != NULL) {
        // The buffer holds `length` bytes, which exceeds inputBytesLength
        // whenever a Lam-Alef byte was expanded. Map all of them; stopping at
        // inputBytesLength would leave the tail of the buffer unmapped.
        for (int32_t i = 0; i < length; i++) {
            resultArray[i] = unshapeMap_IBM420[resultArray[i]];
        }
    }

    return resultArray;
}
1262 
/*
 * Copy inputBytes into a newly allocated buffer, writing the two bytes
 * 0xb1 0x56 in place of every byte for which isLamAlef() is true.
 *
 * inputBytes        bytes to copy
 * inputBytesLength  number of bytes in inputBytes
 * length            receives the number of bytes in the returned buffer;
 *                   only meaningful when the return value is not NULL
 *
 * Returns a buffer obtained from uprv_malloc(), or NULL if the allocation
 * fails. matchFinish() releases the buffer with uprv_free().
 */
uint8_t *CharsetRecog_IBM420_ar::unshapeLamAlef(const uint8_t *inputBytes, int32_t inputBytesLength, int32_t &length) {
    static const uint8_t unshapedLamAlef[] = { 0xb1, 0x56 };
    int32_t i;

    // Each Lam-Alef byte grows the output by exactly one byte, so counting
    // them first gives the exact size. That needs a single allocation
    // instead of a double-size scratch buffer plus a second buffer and a copy.
    int32_t lamAlefCount = 0;
    for (i = 0; i < inputBytesLength; i++) {
        if (isLamAlef(inputBytes[i])) {
            lamAlefCount += 1;
        }
    }

    length = inputBytesLength + lamAlefCount;

    uint8_t *resultBuffer = (uint8_t *)uprv_malloc(length);
    if (resultBuffer == NULL) {
        return NULL;
    }

    int32_t bufferIndex = 0;
    for (i = 0; i < inputBytesLength; i++) {
        if (isLamAlef(inputBytes[i])) {
            resultBuffer[bufferIndex++] = unshapedLamAlef[0];
            resultBuffer[bufferIndex++] = unshapedLamAlef[1];
        } else {
            resultBuffer[bufferIndex++] = inputBytes[i];
        }
    }

    return resultBuffer;
}
1294 
matchFinish(InputText * textIn)1295 void CharsetRecog_IBM420_ar::matchFinish(InputText *textIn) {
1296     if (deleteBuffer) {
1297         uprv_free(textIn->fInputBytes);
1298 
1299         textIn->fInputBytes = prev_fInputBytes;
1300         textIn->fInputLen = prev_fInputBytesLength;
1301     }
1302 }
1303 
isLamAlef(uint8_t b)1304 UBool CharsetRecog_IBM420_ar::isLamAlef(uint8_t b) {
1305     uint8_t shapedLamAlef[] = {
1306         0xb2, 0xb3, 0xb4, 0xb5, 0xb7, 0xb8
1307     };
1308 
1309     for (uint32_t i = 0; i < sizeof(shapedLamAlef); i++) {
1310         if (b == shapedLamAlef[i]) {
1311             return TRUE;
1312         }
1313     }
1314 
1315     return FALSE;
1316 }
1317 
~CharsetRecog_IBM420_ar_rtl()1318 CharsetRecog_IBM420_ar_rtl::~CharsetRecog_IBM420_ar_rtl()
1319 {
1320     // nothing to do
1321 }
1322 
getName() const1323 const char *CharsetRecog_IBM420_ar_rtl::getName() const
1324 {
1325     return  "IBM420_rtl";
1326 }
1327 
match(InputText * textIn)1328 int32_t CharsetRecog_IBM420_ar_rtl::match(InputText *textIn)
1329 {
1330     return match_sbcs(textIn, ngrams_IBM420_ar_rtl, charMap_IBM420_ar);
1331 }
1332 
~CharsetRecog_IBM420_ar_ltr()1333 CharsetRecog_IBM420_ar_ltr::~CharsetRecog_IBM420_ar_ltr()
1334 {
1335     // nothing to do
1336 }
1337 
getName() const1338 const char *CharsetRecog_IBM420_ar_ltr::getName() const
1339 {
1340     return  "IBM420_ltr";
1341 }
1342 
match(InputText * textIn)1343 int32_t CharsetRecog_IBM420_ar_ltr::match(InputText *textIn)
1344 {
1345     return match_sbcs(textIn, ngrams_IBM420_ar_ltr, charMap_IBM420_ar);
1346 }
1347 
1348 U_NAMESPACE_END
1349 #endif
1350 
1351