• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net>
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <stdint.h>
20 #include "ucdn.h"
21 
/*
 * Property record shared by all code points that have the same property
 * values; one byte per property.  Field order must stay as is because the
 * generated table initialises records positionally (assumption -- the
 * initialisers live in unicodedata_db.h; confirm there).
 */
typedef struct {
    unsigned char category;             /* returned by ucdn_get_general_category() */
    unsigned char combining;            /* returned by ucdn_get_combining_class() */
    unsigned char bidi_class;           /* returned by ucdn_get_bidi_class() */
    unsigned char mirrored;             /* non-zero: ucdn_mirror() consults mirror_pairs */
    unsigned char east_asian_width;     /* returned by ucdn_get_east_asian_width() */
    unsigned char normalization_check;  /* not read by any function in this file */
    unsigned char script;               /* returned by ucdn_get_script() */
} UCDRecord;
31 
/*
 * One entry of the mirror_pairs table searched by ucdn_mirror().
 * Both members are 16-bit, so only values up to 0xFFFF can be represented.
 */
typedef struct {
    unsigned short from;    /* search key (compare_mp() orders entries by it) */
    unsigned short to;      /* value ucdn_mirror() returns for a match */
} MirrorPair;
35 
/*
 * One range of the composition re-indexing tables scanned by
 * get_comp_index().  A range covers code points start .. start + count
 * (inclusive); a table is terminated by an entry whose start is 0.
 */
typedef struct {
    int start;      /* first code point of the range */
    short count;    /* offset of the last code point from start */
    short index;    /* compact index assigned to start */
} Reindex;
40 
41 #include "unicodedata_db.h"
42 
/*
 * Constants required for Hangul (de)composition.
 *
 * xBASE is the code point an index of that kind is added to, xCOUNT the
 * number of valid indices.  Note that TBASE + 0 is never produced as a
 * trailing jamo: trailing index 0 stands for "no trailing consonant".
 */
#define SBASE  0xAC00   /* first precomposed syllable */
#define LBASE  0x1100   /* first leading consonant */
#define VBASE  0x1161   /* first vowel */
#define TBASE  0x11A7   /* one below the first trailing consonant */
#define LCOUNT 19
#define VCOUNT 21
#define TCOUNT 28
#define NCOUNT (VCOUNT * TCOUNT)    /* syllables per leading consonant: 588 */
#define SCOUNT (LCOUNT * NCOUNT)    /* total precomposed syllables: 11172 */
53 
get_ucd_record(uint32_t code)54 static const UCDRecord *get_ucd_record(uint32_t code)
55 {
56     int index, offset;
57 
58     if (code >= 0x110000)
59         index = 0;
60     else {
61         index  = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1;
62         offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1);
63         index  = index1[index + offset] << SHIFT2;
64         offset = code & ((1<<SHIFT2) - 1);
65         index  = index2[index + offset];
66     }
67 
68     return &ucd_records[index];
69 }
70 
get_decomp_record(uint32_t code)71 static const unsigned short *get_decomp_record(uint32_t code)
72 {
73     int index, offset;
74 
75     if (code >= 0x110000)
76         index = 0;
77     else {
78         index  = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)]
79             << DECOMP_SHIFT1;
80         offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1);
81         index  = decomp_index1[index + offset] << DECOMP_SHIFT2;
82         offset = code & ((1<<DECOMP_SHIFT2) - 1);
83         index  = decomp_index2[index + offset];
84     }
85 
86     return &decomp_data[index];
87 }
88 
get_comp_index(uint32_t code,const Reindex * idx)89 static int get_comp_index(uint32_t code, const Reindex *idx)
90 {
91     int i;
92 
93     for (i = 0; idx[i].start; i++) {
94         const Reindex *cur = &idx[i];
95         if (code < cur->start)
96             return -1;
97         if (code <= cur->start + cur->count) {
98             return cur->index + (code - cur->start);
99         }
100     }
101 
102     return -1;
103 }
104 
compare_mp(const void * a,const void * b)105 static int compare_mp(const void *a, const void *b)
106 {
107     MirrorPair *mpa = (MirrorPair *)a;
108     MirrorPair *mpb = (MirrorPair *)b;
109     return mpa->from - mpb->from;
110 }
111 
hangul_pair_decompose(uint32_t code,uint32_t * a,uint32_t * b)112 static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b)
113 {
114     int si = code - SBASE;
115 
116     if (si < 0 || si >= SCOUNT)
117         return 0;
118 
119     if (si % TCOUNT) {
120         /* LV,T */
121         *a = SBASE + (si / TCOUNT) * TCOUNT;
122         *b = TBASE + (si % TCOUNT);
123         return 3;
124     } else {
125         /* L,V */
126         *a = LBASE + (si / NCOUNT);
127         *b = VBASE + (si % NCOUNT) / TCOUNT;
128         return 2;
129     }
130 }
131 
hangul_pair_compose(uint32_t * code,uint32_t a,uint32_t b)132 static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b)
133 {
134     if (b < VBASE || b >= (TBASE + TCOUNT))
135         return 0;
136 
137     if ((a < LBASE || a >= (LBASE + LCOUNT))
138             && (a < SBASE || a >= (SBASE + SCOUNT)))
139         return 0;
140 
141     if (a >= SBASE) {
142         /* LV,T */
143         *code = a + (b - TBASE);
144         return 3;
145     } else {
146         /* L,V */
147         int li = a - LBASE;
148         int vi = b - VBASE;
149         *code = SBASE + li * NCOUNT + vi * TCOUNT;
150         return 2;
151     }
152 }
153 
/*
 * Decode one code point from a sequence of UTF-16 units and advance
 * *code_ptr past the units consumed.
 *
 * A high (leading) surrogate, 0xD800..0xDBFF, is combined with the unit
 * that follows it into a supplementary code point and consumes two units.
 * Any other unit is returned as is and consumes one.  The second unit of a
 * pair is not validated.
 */
static uint32_t decode_utf16(const unsigned short **code_ptr)
{
    const unsigned short *code = *code_ptr;

    /*
     * Compare the top six bits.  Masking with 0xd800 alone would also match
     * ordinary units that merely have those bits set (0xF800..0xFFFF, e.g.
     * 0xFB49) as well as low surrogates, and mis-decode them as pairs.
     */
    if ((code[0] & 0xfc00) != 0xd800) {
        *code_ptr += 1;
        return (uint32_t)code[0];
    } else {
        *code_ptr += 2;
        return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
            (((uint32_t)code[0] - 0xd800) << 10);
    }
}
167 
ucdn_get_unicode_version(void)168 const char *ucdn_get_unicode_version(void)
169 {
170     return UNIDATA_VERSION;
171 }
172 
ucdn_get_combining_class(uint32_t code)173 int ucdn_get_combining_class(uint32_t code)
174 {
175     return get_ucd_record(code)->combining;
176 }
177 
ucdn_get_east_asian_width(uint32_t code)178 int ucdn_get_east_asian_width(uint32_t code)
179 {
180     return get_ucd_record(code)->east_asian_width;
181 }
182 
ucdn_get_general_category(uint32_t code)183 int ucdn_get_general_category(uint32_t code)
184 {
185     return get_ucd_record(code)->category;
186 }
187 
ucdn_get_bidi_class(uint32_t code)188 int ucdn_get_bidi_class(uint32_t code)
189 {
190     return get_ucd_record(code)->bidi_class;
191 }
192 
ucdn_get_mirrored(uint32_t code)193 int ucdn_get_mirrored(uint32_t code)
194 {
195     return get_ucd_record(code)->mirrored;
196 }
197 
ucdn_get_script(uint32_t code)198 int ucdn_get_script(uint32_t code)
199 {
200     return get_ucd_record(code)->script;
201 }
202 
ucdn_mirror(uint32_t code)203 uint32_t ucdn_mirror(uint32_t code)
204 {
205     MirrorPair mp = {0};
206     MirrorPair *res;
207 
208     if (get_ucd_record(code)->mirrored == 0)
209         return code;
210 
211     mp.from = code;
212     res = bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, sizeof(MirrorPair),
213             compare_mp);
214 
215     if (res == NULL)
216         return code;
217     else
218         return res->to;
219 }
220 
/*
 * Decompose a code point into a pair of code points.
 *
 * Hangul syllables are split arithmetically; everything else is looked up
 * in the decomposition table.  A table entry is used only when the low
 * byte of its header unit is zero (presumably the marker for a canonical
 * mapping -- confirm against the table generator) and its length, held in
 * the high byte, is non-zero.
 *
 * On success returns 1 with *a set to the first code point and *b to the
 * second, or to 0 when the entry has a single code point.  Returns 0 and
 * leaves *a and *b untouched otherwise.
 */
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
    const unsigned short *entry;
    int count;

    if (hangul_pair_decompose(code, a, b) != 0)
        return 1;

    entry = get_decomp_record(code);
    count = entry[0] >> 8;

    if (count == 0 || (entry[0] & 0xff) != 0)
        return 0;

    entry++;    /* step over the header unit */
    *a = decode_utf16(&entry);
    *b = (count > 1) ? decode_utf16(&entry) : 0;

    return 1;
}
244 
ucdn_compose(uint32_t * code,uint32_t a,uint32_t b)245 int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b)
246 {
247     int l, r, index, indexi, offset;
248 
249     if (hangul_pair_compose(code, a, b))
250         return 1;
251 
252     l = get_comp_index(a, nfc_first);
253     r = get_comp_index(b, nfc_last);
254 
255     if (l < 0 || r < 0)
256         return 0;
257 
258     indexi = l * TOTAL_LAST + r;
259     index  = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1;
260     offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1);
261     index  = comp_index1[index + offset] << COMP_SHIFT2;
262     offset = indexi & ((1<<COMP_SHIFT2) - 1);
263     *code  = comp_data[index + offset];
264 
265     return *code != 0;
266 }
267 
/*
 * Write the table decomposition of a code point to decomposed[].
 *
 * Unlike ucdn_decompose(), the low byte of the entry header is ignored, so
 * every entry with a non-zero length is expanded, and Hangul syllables get
 * no arithmetic handling here.
 *
 * Returns the number of code points written, or 0 if the code point has no
 * entry.  The caller must supply a buffer large enough for the result; the
 * required capacity is not defined in this file.
 */
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
{
    const unsigned short *entry = get_decomp_record(code);
    int count = entry[0] >> 8;
    int n = 0;

    if (count != 0) {
        entry++;    /* step over the header unit */
        while (n < count) {
            decomposed[n] = decode_utf16(&entry);
            n++;
        }
    }

    return count;
}
283