1 /* Recode Serbian text from Cyrillic to Latin script.
2 Copyright (C) 2006-2007, 2009 Free Software Foundation, Inc.
3 Written by Danilo Šegan <danilo@gnome.org>, 2006,
4 and Bruno Haible <bruno@clisp.org>, 2006.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include <config.h>
21 #endif
22
23 /* Specification. */
24 #include "filters.h"
25
26 #include <stdlib.h>
27
28 #include "xalloc.h"
29
30
/* Transcription table from Serbian Cyrillic to Latin script.
   The entry at index (uc - 0x0400) holds the UTF-8 encoded Latin replacement
   for the Unicode code point uc, for uc in the range U+0400..U+04EF.
   Only code points that have a replacement are listed; all other entries are
   implicitly zero-filled, i.e. they are empty strings.
   No replacement is longer than three bytes, hence each entry has room for
   three bytes plus the terminating NUL.  */
static const char table[240][3 + 1] =
{
  /* Uppercase letters.  */
  [0x0400 - 0x0400] = "\xC3\x88",   /* "È" */
  [0x0402 - 0x0400] = "\xC4\x90",   /* "Đ" */
  [0x0408 - 0x0400] = "J",
  [0x0409 - 0x0400] = "Lj",
  [0x040A - 0x0400] = "Nj",
  [0x040B - 0x0400] = "\xC4\x86",   /* "Ć" */
  [0x040D - 0x0400] = "\xC3\x8C",   /* "Ì" */
  [0x040F - 0x0400] = "D\xC5\xBE",  /* "Dž" */
  [0x0410 - 0x0400] = "A",
  [0x0411 - 0x0400] = "B",
  [0x0412 - 0x0400] = "V",
  [0x0413 - 0x0400] = "G",
  [0x0414 - 0x0400] = "D",
  [0x0415 - 0x0400] = "E",
  [0x0416 - 0x0400] = "\xC5\xBD",   /* "Ž" */
  [0x0417 - 0x0400] = "Z",
  [0x0418 - 0x0400] = "I",
  [0x041A - 0x0400] = "K",
  [0x041B - 0x0400] = "L",
  [0x041C - 0x0400] = "M",
  [0x041D - 0x0400] = "N",
  [0x041E - 0x0400] = "O",
  [0x041F - 0x0400] = "P",
  [0x0420 - 0x0400] = "R",
  [0x0421 - 0x0400] = "S",
  [0x0422 - 0x0400] = "T",
  [0x0423 - 0x0400] = "U",
  [0x0424 - 0x0400] = "F",
  [0x0425 - 0x0400] = "H",
  [0x0426 - 0x0400] = "C",
  [0x0427 - 0x0400] = "\xC4\x8C",   /* "Č" */
  [0x0428 - 0x0400] = "\xC5\xA0",   /* "Š" */

  /* Lowercase letters.  */
  [0x0430 - 0x0400] = "a",
  [0x0431 - 0x0400] = "b",
  [0x0432 - 0x0400] = "v",
  [0x0433 - 0x0400] = "g",
  [0x0434 - 0x0400] = "d",
  [0x0435 - 0x0400] = "e",
  [0x0436 - 0x0400] = "\xC5\xBE",   /* "ž" */
  [0x0437 - 0x0400] = "z",
  [0x0438 - 0x0400] = "i",
  [0x043A - 0x0400] = "k",
  [0x043B - 0x0400] = "l",
  [0x043C - 0x0400] = "m",
  [0x043D - 0x0400] = "n",
  [0x043E - 0x0400] = "o",
  [0x043F - 0x0400] = "p",
  [0x0440 - 0x0400] = "r",
  [0x0441 - 0x0400] = "s",
  [0x0442 - 0x0400] = "t",
  [0x0443 - 0x0400] = "u",
  [0x0444 - 0x0400] = "f",
  [0x0445 - 0x0400] = "h",
  [0x0446 - 0x0400] = "c",
  [0x0447 - 0x0400] = "\xC4\x8D",   /* "č" */
  [0x0448 - 0x0400] = "\xC5\xA1",   /* "š" */
  [0x0450 - 0x0400] = "\xC3\xA8",   /* "è" */
  [0x0452 - 0x0400] = "\xC4\x91",   /* "đ" */
  [0x0458 - 0x0400] = "j",
  [0x0459 - 0x0400] = "lj",
  [0x045A - 0x0400] = "nj",
  [0x045B - 0x0400] = "\xC4\x87",   /* "ć" */
  [0x045D - 0x0400] = "\xC3\xAC",   /* "ì" */
  [0x045F - 0x0400] = "d\xC5\xBE",  /* "dž" */

  /* Letters with macron.  */
  [0x04E2 - 0x0400] = "\xC4\xAA",   /* "Ī" */
  [0x04E3 - 0x0400] = "\xC4\xAB",   /* "ī" */
  [0x04EE - 0x0400] = "\xC5\xAA",   /* "Ū" */
  [0x04EF - 0x0400] = "\xC5\xAB"    /* "ū" */
};
277
/* Tells whether BYTE is an uppercase ASCII letter, U+0041..U+005A.
   BYTE must be a value in the range 0..UCHAR_MAX; it is evaluated once.
   Subtracting 'A' and truncating to unsigned char folds the lower and the
   upper bound check into a single comparison.  */
#define IS_UPPERCASE_LATIN(byte) \
  ((unsigned char) ((byte) - 'A') < 'Z' - 'A' + 1)
282
/* Tells whether the UTF-8 byte pair BYTE1 BYTE2 encodes an uppercase
   Cyrillic character, namely one in the range U+0400..U+042F
   (\xD0\x80..\xD0\xAF), or U+04E2 (\xD3\xA2), or U+04EE (\xD3\xAE).
   Both arguments must be values in the range 0..UCHAR_MAX.  They may be
   evaluated more than once, so they must not have side effects.  */
#define IS_UPPERCASE_CYRILLIC(byte1,byte2) \
  ((byte1) == 0xd0                                                  \
   ? (unsigned char) ((byte2) - 0x80) < 0x30                        \
   : ((byte1) == 0xd3 && ((byte2) == 0xa2 || (byte2) == 0xae)))
289
290 void
serbian_to_latin(const char * input,size_t input_len,char ** output_p,size_t * output_len_p)291 serbian_to_latin (const char *input, size_t input_len,
292 char **output_p, size_t *output_len_p)
293 {
294 /* Loop through the input string, producing a replacement for each character.
295 Only characters in the range U+0400..U+04EF (\xD0\x80..\xD3\xAF) need to
296 be handled, and more precisely only those for which a replacement exists
297 in the table. Other characters are copied without modification.
298 The characters U+0409, U+040A, U+040F are transliterated to uppercase or
299 mixed-case replacements ("LJ" / "Lj", "NJ" / "Nj", "DŽ" / "Dž"), depending
300 on the case of the surrounding characters.
301 Since we assume UTF-8 encoding, the bytes \xD0..\xD3 can only occur at the
302 beginning of a character; the second and further bytes of a character are
303 all in the range \x80..\xBF. */
304
305 /* Since sequences of 2 bytes are mapped to sequences of at most 3 bytes,
306 the size of the output will be at most 1.5 * input_len. */
307 size_t allocated = input_len + (input_len >> 1);
308 char *output = XNMALLOC (allocated, char);
309
310 const char *input_end = input + input_len;
311 const char *ip;
312 char *op;
313
314 for (ip = input, op = output; ip < input_end; )
315 {
316 unsigned char byte = (unsigned char) *ip;
317
318 /* Test for the first byte of a Cyrillic character. */
319 if ((byte >= 0xd0 && byte <= 0xd3) && (ip + 1 < input_end))
320 {
321 unsigned char second_byte = (unsigned char) ip[1];
322
323 /* Verify the second byte is valid. */
324 if (second_byte >= 0x80 && second_byte < 0xc0)
325 {
326 unsigned int uc = ((byte & 0x1f) << 6) | (second_byte & 0x3f);
327
328 if (uc >= 0x0400 && uc <= 0x04ef)
329 {
330 /* Look up replacement from the table. */
331 const char *repl = table[uc - 0x0400];
332
333 if (repl[0] != '\0')
334 {
335 /* Found a replacement.
336 Now handle the special cases. */
337 if (uc == 0x0409 || uc == 0x040a || uc == 0x040f)
338 if ((ip + 2 < input_end
339 && IS_UPPERCASE_LATIN ((unsigned char) ip[2]))
340 || (ip + 3 < input_end
341 && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[2],
342 (unsigned char) ip[3]))
343 || (ip >= input + 1
344 && IS_UPPERCASE_LATIN ((unsigned char) ip[-1]))
345 || (ip >= input + 2
346 && IS_UPPERCASE_CYRILLIC ((unsigned char) ip[-2],
347 (unsigned char) ip[-1])))
348 {
349 /* Use the upper-case replacement instead of
350 the mixed-case replacement. */
351 switch (uc)
352 {
353 case 0x0409:
354 repl = "LJ"; break;
355 case 0x040a:
356 repl = "NJ"; break;
357 case 0x040f:
358 repl = "D\xC5\xBD"/* "DŽ" */; break;
359 default:
360 abort ();
361 }
362 }
363
364 /* Use the replacement. */
365 *op++ = *repl++;
366 if (*repl != '\0')
367 {
368 *op++ = *repl++;
369 if (*repl != '\0')
370 {
371 *op++ = *repl++;
372 /* All replacements have at most 3 bytes. */
373 if (*repl != '\0')
374 abort ();
375 }
376 }
377 ip += 2;
378 continue;
379 }
380 }
381 }
382 }
383 *op++ = *ip++;
384 }
385
386 {
387 size_t output_len = op - output;
388
389 /* Verify that the allocated size was not exceeded. */
390 if (output_len > allocated)
391 abort ();
392 /* Shrink the result. */
393 if (output_len < allocated)
394 output = (char *) xrealloc (output, output_len);
395
396 /* Done. */
397 *output_p = output;
398 *output_len_p = output_len;
399 }
400 }
401