1 /*
2 * Copyright (C) 2012 Grigori Goronzy <greg@kinoho.net>
3 *
4 * Permission to use, copy, modify, and/or distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
16
17 #include <stdio.h>
18 #include <stdlib.h>
19 #include <stdint.h>
20 #include "ucdn.h"
21
/*
 * Per-code-point property record.  get_ucd_record() returns a pointer to one
 * of these inside ucd_records[], and each public ucdn_get_*() accessor reads
 * exactly one field from it.
 */
typedef struct {
    unsigned char category;            /* returned by ucdn_get_general_category() */
    unsigned char combining;           /* returned by ucdn_get_combining_class() */
    unsigned char bidi_class;          /* returned by ucdn_get_bidi_class() */
    unsigned char mirrored;            /* returned by ucdn_get_mirrored(); zero makes ucdn_mirror() return its input */
    unsigned char east_asian_width;    /* returned by ucdn_get_east_asian_width() */
    unsigned char normalization_check; /* not read by any code visible in this part of the file */
    unsigned char script;              /* returned by ucdn_get_script() */
} UCDRecord;
31
/*
 * One entry of mirror_pairs[].  ucdn_mirror() binary-searches the table on
 * `from` (ordering defined by compare_mp()) and returns the matching `to`.
 */
typedef struct {
    unsigned short from, to;
} MirrorPair;
35
/*
 * One run in a table scanned by get_comp_index(): a code point in
 * [start, start + count] maps to index + (code - start).  An entry whose
 * start is 0 terminates the table.
 */
typedef struct {
    int start;
    short count, index;
} Reindex;
40
41 #include "unicodedata_db.h"
42
/*
 * Constants required for Hangul (de)composition.  The Hangul helpers below
 * work on a syllable index si = code - SBASE and split it into a leading
 * (L), vowel (V) and trailing (T) part.
 */
#define SBASE 0xAC00            /* first code point of the syllable range */
#define LBASE 0x1100            /* base added to the L index */
#define VBASE 0x1161            /* base added to the V index */
#define TBASE 0x11A7            /* base added to the T index; T index 0 means "no trailing part" */
#define SCOUNT 11172            /* number of syllables starting at SBASE */
#define LCOUNT 19               /* number of L values accepted when composing */
#define VCOUNT 21               /* number of V values per L */
#define TCOUNT 28               /* number of T slots per L,V pair, including the empty slot */
#define NCOUNT (VCOUNT * TCOUNT) /* number of syllables sharing one L */
53
get_ucd_record(uint32_t code)54 static const UCDRecord *get_ucd_record(uint32_t code)
55 {
56 int index, offset;
57
58 if (code >= 0x110000)
59 index = 0;
60 else {
61 index = index0[code >> (SHIFT1+SHIFT2)] << SHIFT1;
62 offset = (code >> SHIFT2) & ((1<<SHIFT1) - 1);
63 index = index1[index + offset] << SHIFT2;
64 offset = code & ((1<<SHIFT2) - 1);
65 index = index2[index + offset];
66 }
67
68 return &ucd_records[index];
69 }
70
get_decomp_record(uint32_t code)71 static const unsigned short *get_decomp_record(uint32_t code)
72 {
73 int index, offset;
74
75 if (code >= 0x110000)
76 index = 0;
77 else {
78 index = decomp_index0[code >> (DECOMP_SHIFT1+DECOMP_SHIFT2)]
79 << DECOMP_SHIFT1;
80 offset = (code >> DECOMP_SHIFT2) & ((1<<DECOMP_SHIFT1) - 1);
81 index = decomp_index1[index + offset] << DECOMP_SHIFT2;
82 offset = code & ((1<<DECOMP_SHIFT2) - 1);
83 index = decomp_index2[index + offset];
84 }
85
86 return &decomp_data[index];
87 }
88
get_comp_index(uint32_t code,const Reindex * idx)89 static int get_comp_index(uint32_t code, const Reindex *idx)
90 {
91 int i;
92
93 for (i = 0; idx[i].start; i++) {
94 const Reindex *cur = &idx[i];
95 if (code < cur->start)
96 return -1;
97 if (code <= cur->start + cur->count) {
98 return cur->index + (code - cur->start);
99 }
100 }
101
102 return -1;
103 }
104
compare_mp(const void * a,const void * b)105 static int compare_mp(const void *a, const void *b)
106 {
107 MirrorPair *mpa = (MirrorPair *)a;
108 MirrorPair *mpb = (MirrorPair *)b;
109 return mpa->from - mpb->from;
110 }
111
hangul_pair_decompose(uint32_t code,uint32_t * a,uint32_t * b)112 static int hangul_pair_decompose(uint32_t code, uint32_t *a, uint32_t *b)
113 {
114 int si = code - SBASE;
115
116 if (si < 0 || si >= SCOUNT)
117 return 0;
118
119 if (si % TCOUNT) {
120 /* LV,T */
121 *a = SBASE + (si / TCOUNT) * TCOUNT;
122 *b = TBASE + (si % TCOUNT);
123 return 3;
124 } else {
125 /* L,V */
126 *a = LBASE + (si / NCOUNT);
127 *b = VBASE + (si % NCOUNT) / TCOUNT;
128 return 2;
129 }
130 }
131
hangul_pair_compose(uint32_t * code,uint32_t a,uint32_t b)132 static int hangul_pair_compose(uint32_t *code, uint32_t a, uint32_t b)
133 {
134 if (b < VBASE || b >= (TBASE + TCOUNT))
135 return 0;
136
137 if ((a < LBASE || a >= (LBASE + LCOUNT))
138 && (a < SBASE || a >= (SBASE + SCOUNT)))
139 return 0;
140
141 if (a >= SBASE) {
142 /* LV,T */
143 *code = a + (b - TBASE);
144 return 3;
145 } else {
146 /* L,V */
147 int li = a - LBASE;
148 int vi = b - VBASE;
149 *code = SBASE + li * NCOUNT + vi * TCOUNT;
150 return 2;
151 }
152 }
153
/*
 * Decode one code point from a UTF-16 sequence and advance the cursor.
 *
 * code_ptr points at the read cursor; it is moved forward by one unit for a
 * plain unit and by two units for a surrogate pair.  Only a high surrogate
 * (0xD800..0xDBFF) starts a pair; every other unit, including the BMP range
 * 0xE000..0xFFFF, is returned as is.  The caller must make sure a second
 * unit is readable whenever the first one is a high surrogate.
 *
 * Returns the decoded code point.
 */
static uint32_t decode_utf16(const unsigned short **code_ptr)
{
    const unsigned short *code = *code_ptr;
    uint32_t lead = code[0];

    /* the top six bits 110110 identify a high surrogate */
    if ((lead & 0xfc00) != 0xd800) {
        *code_ptr += 1;
        return lead;
    }

    *code_ptr += 2;
    return 0x10000 + ((uint32_t)code[1] - 0xdc00) +
        ((lead - 0xd800) << 10);
}
167
ucdn_get_unicode_version(void)168 const char *ucdn_get_unicode_version(void)
169 {
170 return UNIDATA_VERSION;
171 }
172
ucdn_get_combining_class(uint32_t code)173 int ucdn_get_combining_class(uint32_t code)
174 {
175 return get_ucd_record(code)->combining;
176 }
177
ucdn_get_east_asian_width(uint32_t code)178 int ucdn_get_east_asian_width(uint32_t code)
179 {
180 return get_ucd_record(code)->east_asian_width;
181 }
182
ucdn_get_general_category(uint32_t code)183 int ucdn_get_general_category(uint32_t code)
184 {
185 return get_ucd_record(code)->category;
186 }
187
ucdn_get_bidi_class(uint32_t code)188 int ucdn_get_bidi_class(uint32_t code)
189 {
190 return get_ucd_record(code)->bidi_class;
191 }
192
ucdn_get_mirrored(uint32_t code)193 int ucdn_get_mirrored(uint32_t code)
194 {
195 return get_ucd_record(code)->mirrored;
196 }
197
ucdn_get_script(uint32_t code)198 int ucdn_get_script(uint32_t code)
199 {
200 return get_ucd_record(code)->script;
201 }
202
ucdn_mirror(uint32_t code)203 uint32_t ucdn_mirror(uint32_t code)
204 {
205 MirrorPair mp = {0};
206 MirrorPair *res;
207
208 if (get_ucd_record(code)->mirrored == 0)
209 return code;
210
211 mp.from = code;
212 res = bsearch(&mp, mirror_pairs, BIDI_MIRROR_LEN, sizeof(MirrorPair),
213 compare_mp);
214
215 if (res == NULL)
216 return code;
217 else
218 return res->to;
219 }
220
/*
 * Decompose a code point into a pair.
 *
 * Hangul syllables are handled algorithmically first.  Otherwise the
 * decomposition record is consulted: its first element holds the mapping
 * length in the high byte and a tag in the low byte.  Records with a non-zero
 * tag are rejected here (ucdn_compat_decompose() accepts them regardless;
 * presumably the tag marks compatibility mappings -- confirm against the
 * data generator).
 *
 * On success stores the first code point in *a and the second in *b (0 when
 * the mapping has a single element) and returns 1.  Returns 0 when there is
 * no usable decomposition.
 */
int ucdn_decompose(uint32_t code, uint32_t *a, uint32_t *b)
{
    const unsigned short *cursor;
    unsigned short header;
    int count;

    if (hangul_pair_decompose(code, a, b))
        return 1;

    cursor = get_decomp_record(code);
    header = *cursor++;
    count = header >> 8;

    if (count == 0 || (header & 0xff) != 0)
        return 0;

    *a = decode_utf16(&cursor);
    *b = (count > 1) ? decode_utf16(&cursor) : 0;

    return 1;
}
244
ucdn_compose(uint32_t * code,uint32_t a,uint32_t b)245 int ucdn_compose(uint32_t *code, uint32_t a, uint32_t b)
246 {
247 int l, r, index, indexi, offset;
248
249 if (hangul_pair_compose(code, a, b))
250 return 1;
251
252 l = get_comp_index(a, nfc_first);
253 r = get_comp_index(b, nfc_last);
254
255 if (l < 0 || r < 0)
256 return 0;
257
258 indexi = l * TOTAL_LAST + r;
259 index = comp_index0[indexi >> (COMP_SHIFT1+COMP_SHIFT2)] << COMP_SHIFT1;
260 offset = (indexi >> COMP_SHIFT2) & ((1<<COMP_SHIFT1) - 1);
261 index = comp_index1[index + offset] << COMP_SHIFT2;
262 offset = indexi & ((1<<COMP_SHIFT2) - 1);
263 *code = comp_data[index + offset];
264
265 return *code != 0;
266 }
267
/*
 * Write the full stored decomposition of a code point to `decomposed`.
 *
 * The mapping length comes from the high byte of the record header; the low
 * byte is ignored, so every record with a non-zero length is accepted.  The
 * caller must provide room for as many elements as the longest stored
 * mapping; that bound is not visible in this file.
 *
 * Returns the number of code points written, or 0 when there is no mapping.
 */
int ucdn_compat_decompose(uint32_t code, uint32_t *decomposed)
{
    const unsigned short *cursor = get_decomp_record(code);
    int count = *cursor++ >> 8;
    int written = 0;

    if (count == 0)
        return 0;

    while (written < count)
        decomposed[written++] = decode_utf16(&cursor);

    return count;
}
283