1 /* decomp.c - Character decomposition.
2 *
3 * Copyright (C) 1999, 2000 Tom Tromey
4 * Copyright 2000 Red Hat, Inc.
5 *
6 * The Gnome Library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * The Gnome Library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with the Gnome Library; see the file COPYING.LIB. If not,
18 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
19 * Boston, MA 02111-1307, USA.
20 */
21
22 #include "config.h"
23
24 #include <stdlib.h>
25
26 #include "glib.h"
27 #include "gunidecomp.h"
28 #include "gunicomp.h"
29 #include "gunicodeprivate.h"
30 #include "galias.h"
31
32
33 #define CC_PART1(Page, Char) \
34 ((combining_class_table_part1[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
35 ? (combining_class_table_part1[Page] - G_UNICODE_MAX_TABLE_INDEX) \
36 : (cclass_data[combining_class_table_part1[Page]][Char]))
37
38 #define CC_PART2(Page, Char) \
39 ((combining_class_table_part2[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
40 ? (combining_class_table_part2[Page] - G_UNICODE_MAX_TABLE_INDEX) \
41 : (cclass_data[combining_class_table_part2[Page]][Char]))
42
43 #define COMBINING_CLASS(Char) \
44 (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
45 ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
46 : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
47 ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
48 : 0))
49
50 /**
51 * g_unichar_combining_class:
52 * @uc: a Unicode character
53 *
54 * Determines the canonical combining class of a Unicode character.
55 *
56 * Return value: the combining class of the character
57 *
58 * Since: 2.14
59 **/
60 gint
g_unichar_combining_class(gunichar uc)61 g_unichar_combining_class (gunichar uc)
62 {
63 return COMBINING_CLASS (uc);
64 }
65
66 /* constants for hangul syllable [de]composition */
67 #define SBase 0xAC00
68 #define LBase 0x1100
69 #define VBase 0x1161
70 #define TBase 0x11A7
71 #define LCount 19
72 #define VCount 21
73 #define TCount 28
74 #define NCount (VCount * TCount)
75 #define SCount (LCount * NCount)
76
77 /**
78 * g_unicode_canonical_ordering:
79 * @string: a UCS-4 encoded string.
80 * @len: the maximum length of @string to use.
81 *
82 * Computes the canonical ordering of a string in-place.
83 * This rearranges decomposed characters in the string
84 * according to their combining classes. See the Unicode
85 * manual for more information.
86 **/
87 void
g_unicode_canonical_ordering(gunichar * string,gsize len)88 g_unicode_canonical_ordering (gunichar *string,
89 gsize len)
90 {
91 gsize i;
92 int swap = 1;
93
94 while (swap)
95 {
96 int last;
97 swap = 0;
98 last = COMBINING_CLASS (string[0]);
99 for (i = 0; i < len - 1; ++i)
100 {
101 int next = COMBINING_CLASS (string[i + 1]);
102 if (next != 0 && last > next)
103 {
104 gsize j;
105 /* Percolate item leftward through string. */
106 for (j = i + 1; j > 0; --j)
107 {
108 gunichar t;
109 if (COMBINING_CLASS (string[j - 1]) <= next)
110 break;
111 t = string[j];
112 string[j] = string[j - 1];
113 string[j - 1] = t;
114 swap = 1;
115 }
116 /* We're re-entering the loop looking at the old
117 character again. */
118 next = last;
119 }
120 last = next;
121 }
122 }
123 }
124
125 /* http://www.unicode.org/unicode/reports/tr15/#Hangul
126 * r should be null or have sufficient space. Calling with r == NULL will
127 * only calculate the result_len; however, a buffer with space for three
128 * characters will always be big enough. */
129 static void
decompose_hangul(gunichar s,gunichar * r,gsize * result_len)130 decompose_hangul (gunichar s,
131 gunichar *r,
132 gsize *result_len)
133 {
134 gint SIndex = s - SBase;
135
136 /* not a hangul syllable */
137 if (SIndex < 0 || SIndex >= SCount)
138 {
139 if (r)
140 r[0] = s;
141 *result_len = 1;
142 }
143 else
144 {
145 gunichar L = LBase + SIndex / NCount;
146 gunichar V = VBase + (SIndex % NCount) / TCount;
147 gunichar T = TBase + SIndex % TCount;
148
149 if (r)
150 {
151 r[0] = L;
152 r[1] = V;
153 }
154
155 if (T != TBase)
156 {
157 if (r)
158 r[2] = T;
159 *result_len = 3;
160 }
161 else
162 *result_len = 2;
163 }
164 }
165
166 /* returns a pointer to a null-terminated UTF-8 string */
167 static const gchar *
find_decomposition(gunichar ch,gboolean compat)168 find_decomposition (gunichar ch,
169 gboolean compat)
170 {
171 int start = 0;
172 int end = G_N_ELEMENTS (decomp_table);
173
174 if (ch >= decomp_table[start].ch &&
175 ch <= decomp_table[end - 1].ch)
176 {
177 while (TRUE)
178 {
179 int half = (start + end) / 2;
180 if (ch == decomp_table[half].ch)
181 {
182 int offset;
183
184 if (compat)
185 {
186 offset = decomp_table[half].compat_offset;
187 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
188 offset = decomp_table[half].canon_offset;
189 }
190 else
191 {
192 offset = decomp_table[half].canon_offset;
193 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
194 return NULL;
195 }
196
197 return &(decomp_expansion_string[offset]);
198 }
199 else if (half == start)
200 break;
201 else if (ch > decomp_table[half].ch)
202 start = half;
203 else
204 end = half;
205 }
206 }
207
208 return NULL;
209 }
210
211 /**
212 * g_unicode_canonical_decomposition:
213 * @ch: a Unicode character.
214 * @result_len: location to store the length of the return value.
215 *
216 * Computes the canonical decomposition of a Unicode character.
217 *
218 * Return value: a newly allocated string of Unicode characters.
219 * @result_len is set to the resulting length of the string.
220 **/
221 gunichar *
g_unicode_canonical_decomposition(gunichar ch,gsize * result_len)222 g_unicode_canonical_decomposition (gunichar ch,
223 gsize *result_len)
224 {
225 const gchar *decomp;
226 const gchar *p;
227 gunichar *r;
228
229 /* Hangul syllable */
230 if (ch >= 0xac00 && ch <= 0xd7a3)
231 {
232 decompose_hangul (ch, NULL, result_len);
233 r = g_malloc (*result_len * sizeof (gunichar));
234 decompose_hangul (ch, r, result_len);
235 }
236 else if ((decomp = find_decomposition (ch, FALSE)) != NULL)
237 {
238 /* Found it. */
239 int i;
240
241 *result_len = g_utf8_strlen (decomp, -1);
242 r = g_malloc (*result_len * sizeof (gunichar));
243
244 for (p = decomp, i = 0; *p != '\0'; p = g_utf8_next_char (p), i++)
245 r[i] = g_utf8_get_char (p);
246 }
247 else
248 {
249 /* Not in our table. */
250 r = g_malloc (sizeof (gunichar));
251 *r = ch;
252 *result_len = 1;
253 }
254
255 /* Supposedly following the Unicode 2.1.9 table means that the
256 decompositions come out in canonical order. I haven't tested
257 this, but we rely on it here. */
258 return r;
259 }
260
261 /* L,V => LV and LV,T => LVT */
262 static gboolean
combine_hangul(gunichar a,gunichar b,gunichar * result)263 combine_hangul (gunichar a,
264 gunichar b,
265 gunichar *result)
266 {
267 gint LIndex = a - LBase;
268 gint SIndex = a - SBase;
269
270 gint VIndex = b - VBase;
271 gint TIndex = b - TBase;
272
273 if (0 <= LIndex && LIndex < LCount
274 && 0 <= VIndex && VIndex < VCount)
275 {
276 *result = SBase + (LIndex * VCount + VIndex) * TCount;
277 return TRUE;
278 }
279 else if (0 <= SIndex && SIndex < SCount && (SIndex % TCount) == 0
280 && 0 < TIndex && TIndex < TCount)
281 {
282 *result = a + TIndex;
283 return TRUE;
284 }
285
286 return FALSE;
287 }
288
289 #define CI(Page, Char) \
290 ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
291 ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
292 : (compose_data[compose_table[Page]][Char]))
293
294 #define COMPOSE_INDEX(Char) \
295 (((Char >> 8) > (COMPOSE_TABLE_LAST)) ? 0 : CI((Char) >> 8, (Char) & 0xff))
296
297 static gboolean
combine(gunichar a,gunichar b,gunichar * result)298 combine (gunichar a,
299 gunichar b,
300 gunichar *result)
301 {
302 gushort index_a, index_b;
303
304 if (combine_hangul (a, b, result))
305 return TRUE;
306
307 index_a = COMPOSE_INDEX(a);
308
309 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
310 {
311 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
312 {
313 *result = compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
314 return TRUE;
315 }
316 else
317 return FALSE;
318 }
319
320 index_b = COMPOSE_INDEX(b);
321
322 if (index_b >= COMPOSE_SECOND_SINGLE_START)
323 {
324 if (a == compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
325 {
326 *result = compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
327 return TRUE;
328 }
329 else
330 return FALSE;
331 }
332
333 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START &&
334 index_b >= COMPOSE_SECOND_START && index_b < COMPOSE_SECOND_SINGLE_START)
335 {
336 gunichar res = compose_array[index_a - COMPOSE_FIRST_START][index_b - COMPOSE_SECOND_START];
337
338 if (res)
339 {
340 *result = res;
341 return TRUE;
342 }
343 }
344
345 return FALSE;
346 }
347
348 gunichar *
_g_utf8_normalize_wc(const gchar * str,gssize max_len,GNormalizeMode mode)349 _g_utf8_normalize_wc (const gchar *str,
350 gssize max_len,
351 GNormalizeMode mode)
352 {
353 gsize n_wc;
354 gunichar *wc_buffer;
355 const char *p;
356 gsize last_start;
357 gboolean do_compat = (mode == G_NORMALIZE_NFKC ||
358 mode == G_NORMALIZE_NFKD);
359 gboolean do_compose = (mode == G_NORMALIZE_NFC ||
360 mode == G_NORMALIZE_NFKC);
361
362 n_wc = 0;
363 p = str;
364 while ((max_len < 0 || p < str + max_len) && *p)
365 {
366 const gchar *decomp;
367 gunichar wc = g_utf8_get_char (p);
368
369 if (wc >= 0xac00 && wc <= 0xd7a3)
370 {
371 gsize result_len;
372 decompose_hangul (wc, NULL, &result_len);
373 n_wc += result_len;
374 }
375 else
376 {
377 decomp = find_decomposition (wc, do_compat);
378
379 if (decomp)
380 n_wc += g_utf8_strlen (decomp, -1);
381 else
382 n_wc++;
383 }
384
385 p = g_utf8_next_char (p);
386 }
387
388 wc_buffer = g_new (gunichar, n_wc + 1);
389
390 last_start = 0;
391 n_wc = 0;
392 p = str;
393 while ((max_len < 0 || p < str + max_len) && *p)
394 {
395 gunichar wc = g_utf8_get_char (p);
396 const gchar *decomp;
397 int cc;
398 gsize old_n_wc = n_wc;
399
400 if (wc >= 0xac00 && wc <= 0xd7a3)
401 {
402 gsize result_len;
403 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
404 n_wc += result_len;
405 }
406 else
407 {
408 decomp = find_decomposition (wc, do_compat);
409
410 if (decomp)
411 {
412 const char *pd;
413 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
414 wc_buffer[n_wc++] = g_utf8_get_char (pd);
415 }
416 else
417 wc_buffer[n_wc++] = wc;
418 }
419
420 if (n_wc > 0)
421 {
422 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
423
424 if (cc == 0)
425 {
426 g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
427 last_start = old_n_wc;
428 }
429 }
430
431 p = g_utf8_next_char (p);
432 }
433
434 if (n_wc > 0)
435 {
436 g_unicode_canonical_ordering (wc_buffer + last_start, n_wc - last_start);
437 last_start = n_wc;
438 }
439
440 wc_buffer[n_wc] = 0;
441
442 /* All decomposed and reordered */
443
444 if (do_compose && n_wc > 0)
445 {
446 gsize i, j;
447 int last_cc = 0;
448 last_start = 0;
449
450 for (i = 0; i < n_wc; i++)
451 {
452 int cc = COMBINING_CLASS (wc_buffer[i]);
453
454 if (i > 0 &&
455 (last_cc == 0 || last_cc < cc) &&
456 combine (wc_buffer[last_start], wc_buffer[i],
457 &wc_buffer[last_start]))
458 {
459 for (j = i + 1; j < n_wc; j++)
460 wc_buffer[j-1] = wc_buffer[j];
461 n_wc--;
462 i--;
463
464 if (i == last_start)
465 last_cc = 0;
466 else
467 last_cc = COMBINING_CLASS (wc_buffer[i-1]);
468
469 continue;
470 }
471
472 if (cc == 0)
473 last_start = i;
474
475 last_cc = cc;
476 }
477 }
478
479 wc_buffer[n_wc] = 0;
480
481 return wc_buffer;
482 }
483
484 /**
485 * g_utf8_normalize:
486 * @str: a UTF-8 encoded string.
487 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
488 * @mode: the type of normalization to perform.
489 *
490 * Converts a string into canonical form, standardizing
491 * such issues as whether a character with an accent
492 * is represented as a base character and combining
493 * accent or as a single precomposed character. The
494 * string has to be valid UTF-8, otherwise %NULL is
495 * returned. You should generally call g_utf8_normalize()
496 * before comparing two Unicode strings.
497 *
498 * The normalization mode %G_NORMALIZE_DEFAULT only
499 * standardizes differences that do not affect the
500 * text content, such as the above-mentioned accent
501 * representation. %G_NORMALIZE_ALL also standardizes
502 * the "compatibility" characters in Unicode, such
503 * as SUPERSCRIPT THREE to the standard forms
504 * (in this case DIGIT THREE). Formatting information
505 * may be lost but for most text operations such
506 * characters should be considered the same.
507 *
508 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
509 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
510 * but returned a result with composed forms rather
511 * than a maximally decomposed form. This is often
512 * useful if you intend to convert the string to
513 * a legacy encoding or pass it to a system with
514 * less capable Unicode handling.
515 *
516 * Return value: a newly allocated string, that is the
517 * normalized form of @str, or %NULL if @str is not
518 * valid UTF-8.
519 **/
520 gchar *
g_utf8_normalize(const gchar * str,gssize len,GNormalizeMode mode)521 g_utf8_normalize (const gchar *str,
522 gssize len,
523 GNormalizeMode mode)
524 {
525 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
526 gchar *result;
527
528 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL, NULL);
529 g_free (result_wc);
530
531 return result;
532 }
533
534 #define __G_UNIDECOMP_C__
535 #include "galiasdef.c"
536