1 /* ------------------------------------------------------------------------
2
3 unicodedata -- Provides access to the Unicode database.
4
5 Data was extracted from the UnicodeData.txt file.
6 The current version number is reported in the unidata_version constant.
7
8 Written by Marc-Andre Lemburg (mal@lemburg.com).
9 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
10 Modified by Martin v. Löwis (martin@v.loewis.de)
11
12 Copyright (c) Corporation for National Research Initiatives.
13
14 ------------------------------------------------------------------------ */
15
16 #define PY_SSIZE_T_CLEAN
17
18 #include "Python.h"
19 #include "ucnhash.h"
20 #include "structmember.h"
21
22 /*[clinic input]
23 module unicodedata
24 class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type'
25 [clinic start generated code]*/
26 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=6dac153082d150bc]*/
27
28 /* character properties */
29
/* Property record for a code point.  Records are stored in
   _PyUnicode_Database_Records and located with _getrecord_ex(). */
typedef struct {
    const unsigned char category;       /* index into
                                           _PyUnicode_CategoryNames */
    const unsigned char combining;      /* combining class value 0 - 255 */
    const unsigned char bidirectional;  /* index into
                                           _PyUnicode_BidirectionalNames */
    const unsigned char mirrored;       /* true if mirrored in bidir mode */
    const unsigned char east_asian_width;       /* index into
                                                   _PyUnicode_EastAsianWidthNames */
    const unsigned char normalization_quick_check; /* see is_normalized() */
} _PyUnicode_DatabaseRecord;
41
/* Differences of one code point between a previous database version and
   the current one.  The accessors in this file only apply one of the
   unsigned char fields when it differs from 0xFF, and they treat
   category_changed == 0 as "unassigned in the previous version". */
typedef struct change_record {
    /* sequence of fields should be the same as in merge_old_version */
    const unsigned char bidir_changed;
    const unsigned char category_changed;
    const unsigned char decimal_changed;
    const unsigned char mirrored_changed;
    const unsigned char east_asian_width_changed;
    const double numeric_changed;
} change_record;
51
52 /* data file generated by Tools/unicode/makeunicodedata.py */
53 #include "unicodedata_db.h"
54
55 static const _PyUnicode_DatabaseRecord*
_getrecord_ex(Py_UCS4 code)56 _getrecord_ex(Py_UCS4 code)
57 {
58 int index;
59 if (code >= 0x110000)
60 index = 0;
61 else {
62 index = index1[(code>>SHIFT)];
63 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
64 }
65
66 return &_PyUnicode_Database_Records[index];
67 }
68
69 /* ------------- Previous-version API ------------------------------------- */
/* Object that gives access to a previous version of the database. */
typedef struct previous_version {
    PyObject_HEAD
    /* version string; exposed as "unidata_version" through DB_members */
    const char *name;
    /* per-code-point differences; invoked through get_old_record() */
    const change_record* (*getrecord)(Py_UCS4);
    /* consulted by nfd_nfkd(); a non-zero result replaces the code point
       being decomposed */
    Py_UCS4 (*normalization)(Py_UCS4);
} PreviousDBVersion;
76
77 #include "clinic/unicodedata.c.h"
78
/* Fetch the change_record for code point `v` from the previous-version
   object `self`.  `self` is cast without any check, so callers must test
   it with UCD_Check() first.  The argument is parenthesized so that any
   expression can be passed safely. */
#define get_old_record(self, v)    ((((PreviousDBVersion*)(self))->getrecord)(v))
80
/* Member table: exposes PreviousDBVersion.name as the read-only string
   attribute "unidata_version".  The {NULL} entry terminates the table. */
static PyMemberDef DB_members[] = {
        {"unidata_version", T_STRING, offsetof(PreviousDBVersion, name), READONLY},
        {NULL}
};
85
/* forward declaration */
static PyTypeObject UCD_Type;
/* True when the type of `o` is exactly UCD_Type (pointer comparison, so
   a subtype would not match). */
#define UCD_Check(o) (Py_TYPE(o)==&UCD_Type)
89
90 static PyObject*
new_previous_version(const char * name,const change_record * (* getrecord)(Py_UCS4),Py_UCS4 (* normalization)(Py_UCS4))91 new_previous_version(const char*name, const change_record* (*getrecord)(Py_UCS4),
92 Py_UCS4 (*normalization)(Py_UCS4))
93 {
94 PreviousDBVersion *self;
95 self = PyObject_New(PreviousDBVersion, &UCD_Type);
96 if (self == NULL)
97 return NULL;
98 self->name = name;
99 self->getrecord = getrecord;
100 self->normalization = normalization;
101 return (PyObject*)self;
102 }
103
104
105 /* --- Module API --------------------------------------------------------- */
106
107 /*[clinic input]
108 unicodedata.UCD.decimal
109
110 self: self
111 chr: int(accept={str})
112 default: object=NULL
113 /
114
115 Converts a Unicode character into its equivalent decimal value.
116
117 Returns the decimal value assigned to the character chr as integer.
118 If no such value is defined, default is returned, or, if not given,
119 ValueError is raised.
120 [clinic start generated code]*/
121
122 static PyObject *
unicodedata_UCD_decimal_impl(PyObject * self,int chr,PyObject * default_value)123 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
124 PyObject *default_value)
125 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
126 {
127 int have_old = 0;
128 long rc;
129 Py_UCS4 c = (Py_UCS4)chr;
130
131 if (self && UCD_Check(self)) {
132 const change_record *old = get_old_record(self, c);
133 if (old->category_changed == 0) {
134 /* unassigned */
135 have_old = 1;
136 rc = -1;
137 }
138 else if (old->decimal_changed != 0xFF) {
139 have_old = 1;
140 rc = old->decimal_changed;
141 }
142 }
143
144 if (!have_old)
145 rc = Py_UNICODE_TODECIMAL(c);
146 if (rc < 0) {
147 if (default_value == NULL) {
148 PyErr_SetString(PyExc_ValueError,
149 "not a decimal");
150 return NULL;
151 }
152 else {
153 Py_INCREF(default_value);
154 return default_value;
155 }
156 }
157 return PyLong_FromLong(rc);
158 }
159
160 /*[clinic input]
161 unicodedata.UCD.digit
162
163 self: self
164 chr: int(accept={str})
165 default: object=NULL
166 /
167
168 Converts a Unicode character into its equivalent digit value.
169
170 Returns the digit value assigned to the character chr as integer.
171 If no such value is defined, default is returned, or, if not given,
172 ValueError is raised.
173 [clinic start generated code]*/
174
175 static PyObject *
unicodedata_UCD_digit_impl(PyObject * self,int chr,PyObject * default_value)176 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
177 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
178 {
179 long rc;
180 Py_UCS4 c = (Py_UCS4)chr;
181 rc = Py_UNICODE_TODIGIT(c);
182 if (rc < 0) {
183 if (default_value == NULL) {
184 PyErr_SetString(PyExc_ValueError, "not a digit");
185 return NULL;
186 }
187 else {
188 Py_INCREF(default_value);
189 return default_value;
190 }
191 }
192 return PyLong_FromLong(rc);
193 }
194
195 /*[clinic input]
196 unicodedata.UCD.numeric
197
198 self: self
199 chr: int(accept={str})
200 default: object=NULL
201 /
202
203 Converts a Unicode character into its equivalent numeric value.
204
205 Returns the numeric value assigned to the character chr as float.
206 If no such value is defined, default is returned, or, if not given,
207 ValueError is raised.
208 [clinic start generated code]*/
209
210 static PyObject *
unicodedata_UCD_numeric_impl(PyObject * self,int chr,PyObject * default_value)211 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
212 PyObject *default_value)
213 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
214 {
215 int have_old = 0;
216 double rc;
217 Py_UCS4 c = (Py_UCS4)chr;
218
219 if (self && UCD_Check(self)) {
220 const change_record *old = get_old_record(self, c);
221 if (old->category_changed == 0) {
222 /* unassigned */
223 have_old = 1;
224 rc = -1.0;
225 }
226 else if (old->decimal_changed != 0xFF) {
227 have_old = 1;
228 rc = old->decimal_changed;
229 }
230 }
231
232 if (!have_old)
233 rc = Py_UNICODE_TONUMERIC(c);
234 if (rc == -1.0) {
235 if (default_value == NULL) {
236 PyErr_SetString(PyExc_ValueError, "not a numeric character");
237 return NULL;
238 }
239 else {
240 Py_INCREF(default_value);
241 return default_value;
242 }
243 }
244 return PyFloat_FromDouble(rc);
245 }
246
247 /*[clinic input]
248 unicodedata.UCD.category
249
250 self: self
251 chr: int(accept={str})
252 /
253
254 Returns the general category assigned to the character chr as string.
255 [clinic start generated code]*/
256
257 static PyObject *
unicodedata_UCD_category_impl(PyObject * self,int chr)258 unicodedata_UCD_category_impl(PyObject *self, int chr)
259 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
260 {
261 int index;
262 Py_UCS4 c = (Py_UCS4)chr;
263 index = (int) _getrecord_ex(c)->category;
264 if (self && UCD_Check(self)) {
265 const change_record *old = get_old_record(self, c);
266 if (old->category_changed != 0xFF)
267 index = old->category_changed;
268 }
269 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
270 }
271
272 /*[clinic input]
273 unicodedata.UCD.bidirectional
274
275 self: self
276 chr: int(accept={str})
277 /
278
279 Returns the bidirectional class assigned to the character chr as string.
280
281 If no such value is defined, an empty string is returned.
282 [clinic start generated code]*/
283
284 static PyObject *
unicodedata_UCD_bidirectional_impl(PyObject * self,int chr)285 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
286 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
287 {
288 int index;
289 Py_UCS4 c = (Py_UCS4)chr;
290 index = (int) _getrecord_ex(c)->bidirectional;
291 if (self && UCD_Check(self)) {
292 const change_record *old = get_old_record(self, c);
293 if (old->category_changed == 0)
294 index = 0; /* unassigned */
295 else if (old->bidir_changed != 0xFF)
296 index = old->bidir_changed;
297 }
298 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
299 }
300
301 /*[clinic input]
302 unicodedata.UCD.combining -> int
303
304 self: self
305 chr: int(accept={str})
306 /
307
308 Returns the canonical combining class assigned to the character chr as integer.
309
310 Returns 0 if no combining class is defined.
311 [clinic start generated code]*/
312
313 static int
unicodedata_UCD_combining_impl(PyObject * self,int chr)314 unicodedata_UCD_combining_impl(PyObject *self, int chr)
315 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
316 {
317 int index;
318 Py_UCS4 c = (Py_UCS4)chr;
319 index = (int) _getrecord_ex(c)->combining;
320 if (self && UCD_Check(self)) {
321 const change_record *old = get_old_record(self, c);
322 if (old->category_changed == 0)
323 index = 0; /* unassigned */
324 }
325 return index;
326 }
327
328 /*[clinic input]
329 unicodedata.UCD.mirrored -> int
330
331 self: self
332 chr: int(accept={str})
333 /
334
335 Returns the mirrored property assigned to the character chr as integer.
336
337 Returns 1 if the character has been identified as a "mirrored"
338 character in bidirectional text, 0 otherwise.
339 [clinic start generated code]*/
340
341 static int
unicodedata_UCD_mirrored_impl(PyObject * self,int chr)342 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
343 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
344 {
345 int index;
346 Py_UCS4 c = (Py_UCS4)chr;
347 index = (int) _getrecord_ex(c)->mirrored;
348 if (self && UCD_Check(self)) {
349 const change_record *old = get_old_record(self, c);
350 if (old->category_changed == 0)
351 index = 0; /* unassigned */
352 else if (old->mirrored_changed != 0xFF)
353 index = old->mirrored_changed;
354 }
355 return index;
356 }
357
358 /*[clinic input]
359 unicodedata.UCD.east_asian_width
360
361 self: self
362 chr: int(accept={str})
363 /
364
365 Returns the east asian width assigned to the character chr as string.
366 [clinic start generated code]*/
367
368 static PyObject *
unicodedata_UCD_east_asian_width_impl(PyObject * self,int chr)369 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
370 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
371 {
372 int index;
373 Py_UCS4 c = (Py_UCS4)chr;
374 index = (int) _getrecord_ex(c)->east_asian_width;
375 if (self && UCD_Check(self)) {
376 const change_record *old = get_old_record(self, c);
377 if (old->category_changed == 0)
378 index = 0; /* unassigned */
379 else if (old->east_asian_width_changed != 0xFF)
380 index = old->east_asian_width_changed;
381 }
382 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
383 }
384
385 /*[clinic input]
386 unicodedata.UCD.decomposition
387
388 self: self
389 chr: int(accept={str})
390 /
391
392 Returns the character decomposition mapping assigned to the character chr as string.
393
394 An empty string is returned in case no such mapping is defined.
395 [clinic start generated code]*/
396
397 static PyObject *
unicodedata_UCD_decomposition_impl(PyObject * self,int chr)398 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
399 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
400 {
401 char decomp[256];
402 int code, index, count;
403 size_t i;
404 unsigned int prefix_index;
405 Py_UCS4 c = (Py_UCS4)chr;
406
407 code = (int)c;
408
409 if (self && UCD_Check(self)) {
410 const change_record *old = get_old_record(self, c);
411 if (old->category_changed == 0)
412 return PyUnicode_FromString(""); /* unassigned */
413 }
414
415 if (code < 0 || code >= 0x110000)
416 index = 0;
417 else {
418 index = decomp_index1[(code>>DECOMP_SHIFT)];
419 index = decomp_index2[(index<<DECOMP_SHIFT)+
420 (code&((1<<DECOMP_SHIFT)-1))];
421 }
422
423 /* high byte is number of hex bytes (usually one or two), low byte
424 is prefix code (from*/
425 count = decomp_data[index] >> 8;
426
427 /* XXX: could allocate the PyString up front instead
428 (strlen(prefix) + 5 * count + 1 bytes) */
429
430 /* Based on how index is calculated above and decomp_data is generated
431 from Tools/unicode/makeunicodedata.py, it should not be possible
432 to overflow decomp_prefix. */
433 prefix_index = decomp_data[index] & 255;
434 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
435
436 /* copy prefix */
437 i = strlen(decomp_prefix[prefix_index]);
438 memcpy(decomp, decomp_prefix[prefix_index], i);
439
440 while (count-- > 0) {
441 if (i)
442 decomp[i++] = ' ';
443 assert(i < sizeof(decomp));
444 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
445 decomp_data[++index]);
446 i += strlen(decomp + i);
447 }
448 return PyUnicode_FromStringAndSize(decomp, i);
449 }
450
451 static void
get_decomp_record(PyObject * self,Py_UCS4 code,int * index,int * prefix,int * count)452 get_decomp_record(PyObject *self, Py_UCS4 code, int *index, int *prefix, int *count)
453 {
454 if (code >= 0x110000) {
455 *index = 0;
456 } else if (self && UCD_Check(self) &&
457 get_old_record(self, code)->category_changed==0) {
458 /* unassigned in old version */
459 *index = 0;
460 }
461 else {
462 *index = decomp_index1[(code>>DECOMP_SHIFT)];
463 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
464 (code&((1<<DECOMP_SHIFT)-1))];
465 }
466
467 /* high byte is number of hex bytes (usually one or two), low byte
468 is prefix code (from*/
469 *count = decomp_data[*index] >> 8;
470 *prefix = decomp_data[*index] & 255;
471
472 (*index)++;
473 }
474
/* Constants for the arithmetic handling of Hangul syllables, used by
   nfd_nfkd(), nfc_nfkc(), _getucname() and _getcode().
   SBase is the first syllable code point; LBase and VBase are the first
   leading consonant and vowel.  TBase is one below the first trailing
   consonant (0x11A8): a trailing index of 0 means "no trailing
   consonant".  The *Count values are the sizes of the ranges. */
#define SBase   0xAC00
#define LBase   0x1100
#define VBase   0x1161
#define TBase   0x11A7
#define LCount  19
#define VCount  21
#define TCount  28
#define NCount  (VCount*TCount)
#define SCount  (LCount*NCount)
484
/* Decompose `input` and return the result as a new string object.

   k == 0 applies only decompositions without a prefix code (NFD);
   k != 0 also applies the prefixed, i.e. compatibility, ones (NFKD).
   When `self` is a previous-version object, its normalization callback
   is consulted for every code point and get_decomp_record() honours its
   "unassigned" information.

   Returns NULL on failure; allocation failures raise MemoryError via
   PyErr_NoMemory().  The canonical sort below reads position 0 of the
   result unconditionally; unicodedata_UCD_normalize_impl() handles empty
   strings before calling this function. */
static PyObject*
nfd_nfkd(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    Py_UCS4 *output;
    Py_ssize_t i, o, osize;
    int kind;
    void *data;
    /* Longest decomposition in Unicode 3.2: U+FDFA */
    Py_UCS4 stack[20];
    Py_ssize_t space, isize;
    int index, prefix, count, stackptr;
    unsigned char prev, cur;

    stackptr = 0;
    isize = PyUnicode_GET_LENGTH(input);
    space = isize;
    /* Overallocate at most 10 characters. */
    if (space > 10) {
        if (space <= PY_SSIZE_T_MAX - 10)
            space += 10;
    }
    else {
        space *= 2;
    }
    /* osize is the allocated capacity; space counts the free slots left */
    osize = space;
    output = PyMem_NEW(Py_UCS4, space);
    if (!output) {
        PyErr_NoMemory();
        return NULL;
    }
    i = o = 0;
    kind = PyUnicode_KIND(input);
    data = PyUnicode_DATA(input);

    while (i < isize) {
        /* Each input character is expanded with an explicit stack so that
           decompositions of decompositions are handled without recursion. */
        stack[stackptr++] = PyUnicode_READ(kind, data, i++);
        while(stackptr) {
            Py_UCS4 code = stack[--stackptr];
            /* Hangul Decomposition adds three characters in
               a single step, so we need at least that much room. */
            if (space < 3) {
                Py_UCS4 *new_output;
                osize += 10;
                space += 10;
                new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
                if (new_output == NULL) {
                    PyMem_Free(output);
                    PyErr_NoMemory();
                    return NULL;
                }
                output = new_output;
            }
            /* Hangul Decomposition. */
            if (SBase <= code && code < (SBase+SCount)) {
                int SIndex = code - SBase;
                int L = LBase + SIndex / NCount;
                int V = VBase + (SIndex % NCount) / TCount;
                int T = TBase + SIndex % TCount;
                output[o++] = L;
                output[o++] = V;
                space -= 2;
                /* T == TBase means there is no trailing consonant */
                if (T != TBase) {
                    output[o++] = T;
                    space --;
                }
                continue;
            }
            /* normalization changes */
            if (self && UCD_Check(self)) {
                Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
                if (value != 0) {
                    /* process the replacement instead of `code` */
                    stack[stackptr++] = value;
                    continue;
                }
            }

            /* Other decompositions. */
            get_decomp_record(self, code, &index, &prefix, &count);

            /* Copy character if it is not decomposable, or has a
               compatibility decomposition, but we do NFD. */
            if (!count || (prefix && !k)) {
                output[o++] = code;
                space--;
                continue;
            }
            /* Copy decomposition onto the stack, in reverse
               order. */
            while(count) {
                code = decomp_data[index + (--count)];
                stack[stackptr++] = code;
            }
        }
    }

    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    if (!result)
        return NULL;
    /* result is guaranteed to be ready, as it is compact. */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);

    /* Sort canonically. */
    /* In-place pass over the result: a character with a non-zero
       combining class is swapped backwards past preceding characters
       whose class is non-zero and greater than its own. */
    i = 0;
    prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
        cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
        if (prev == 0 || cur == 0 || prev <= cur) {
            prev = cur;
            continue;
        }
        /* Non-canonical order. Need to switch *i with previous. */
        o = i - 1;
        while (1) {
            Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
            PyUnicode_WRITE(kind, data, o+1,
                            PyUnicode_READ(kind, data, o));
            PyUnicode_WRITE(kind, data, o, tmp);
            o--;
            if (o < 0)
                break;
            prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
            if (prev == 0 || prev <= cur)
                break;
        }
        /* position i now holds a different character; re-read its class */
        prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
    }
    return result;
}
617
618 static int
find_nfc_index(PyObject * self,struct reindex * nfc,Py_UCS4 code)619 find_nfc_index(PyObject *self, struct reindex* nfc, Py_UCS4 code)
620 {
621 unsigned int index;
622 for (index = 0; nfc[index].start; index++) {
623 unsigned int start = nfc[index].start;
624 if (code < start)
625 return -1;
626 if (code <= start + nfc[index].count) {
627 unsigned int delta = code - start;
628 return nfc[index].index + delta;
629 }
630 }
631 return -1;
632 }
633
/* Compose `input`: it is first decomposed with nfd_nfkd(self, input, k)
   and the decomposed string is then recomposed (NFC for k == 0, NFKC
   otherwise).

   Returns a new string, or the decomposed string itself when composing
   changed nothing.  Returns NULL on failure; a failed buffer allocation
   raises MemoryError via PyErr_NoMemory().

   Positions that were merged into an earlier character are remembered in
   `skipped` and dropped when the main loop reaches them.  The list holds
   20 entries, and only an assert() guards against overrunning it. */
static PyObject*
nfc_nfkc(PyObject *self, PyObject *input, int k)
{
    PyObject *result;
    int kind;
    void *data;
    Py_UCS4 *output;
    Py_ssize_t i, i1, o, len;
    int f,l,index,index1,comb;
    Py_UCS4 code;
    Py_ssize_t skipped[20];
    int cskipped = 0;

    result = nfd_nfkd(self, input, k);
    if (!result)
        return NULL;
    /* result will be "ready". */
    kind = PyUnicode_KIND(result);
    data = PyUnicode_DATA(result);
    len = PyUnicode_GET_LENGTH(result);

    /* We allocate a buffer for the output.
       If we find that we made no changes, we still return
       the NFD result. */
    output = PyMem_NEW(Py_UCS4, len);
    if (!output) {
        PyErr_NoMemory();
        Py_DECREF(result);
        return 0;
    }
    i = o = 0;

  again:
    while (i < len) {
        for (index = 0; index < cskipped; index++) {
            if (skipped[index] == i) {
                /* *i character is skipped.
                   Remove from list. */
                skipped[index] = skipped[cskipped-1];
                cskipped--;
                i++;
                goto again; /* continue while */
            }
        }
        /* Hangul Composition. We don't need to check for <LV,T>
           pairs, since we always have decomposed data. */
        code = PyUnicode_READ(kind, data, i);
        if (LBase <= code && code < (LBase+LCount) &&
            i + 1 < len &&
            VBase <= PyUnicode_READ(kind, data, i+1) &&
            PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
            /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
               and V character is a modern vowel (0x1161 ~ 0x1175). */
            int LIndex, VIndex;
            LIndex = code - LBase;
            VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
            code = SBase + (LIndex*VCount+VIndex)*TCount;
            i+=2;
            if (i < len &&
                TBase < PyUnicode_READ(kind, data, i) &&
                PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
                /* check T character is a modern trailing consonant
                   (0x11A8 ~ 0x11C2). */
                code += PyUnicode_READ(kind, data, i)-TBase;
                i++;
            }
            output[o++] = code;
            continue;
        }

        /* code is still input[i] here */
        /* f is the index of `code` as the first half of a composition
           pair, or -1 if it never starts one */
        f = find_nfc_index(self, nfc_first, code);
        if (f == -1) {
            output[o++] = code;
            i++;
            continue;
        }
        /* Find next unblocked character. */
        i1 = i+1;
        /* comb holds the combining class of the last following character
           that was left uncombined (0 while there is none) */
        comb = 0;
        /* output base character for now; might be updated later. */
        output[o] = PyUnicode_READ(kind, data, i);
        while (i1 < len) {
            Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
            int comb1 = _getrecord_ex(code1)->combining;
            if (comb) {
                if (comb1 == 0)
                    break;
                if (comb >= comb1) {
                    /* Character is blocked. */
                    i1++;
                    continue;
                }
            }
            l = find_nfc_index(self, nfc_last, code1);
            /* i1 cannot be combined with i. If i1
               is a starter, we don't need to look further.
               Otherwise, record the combining class. */
            if (l == -1) {
              not_combinable:
                if (comb1 == 0)
                    break;
                comb = comb1;
                i1++;
                continue;
            }
            /* two-stage lookup of the composed character for the pair
               (f, l); 0 means the pair does not compose */
            index = f*TOTAL_LAST + l;
            index1 = comp_index[index >> COMP_SHIFT];
            code = comp_data[(index1<<COMP_SHIFT)+
                             (index&((1<<COMP_SHIFT)-1))];
            if (code == 0)
                goto not_combinable;

            /* Replace the original character. */
            output[o] = code;
            /* Mark the second character unused. */
            assert(cskipped < 20);
            skipped[cskipped++] = i1;
            i1++;
            /* the composed character may itself start another pair */
            f = find_nfc_index(self, nfc_first, output[o]);
            if (f == -1)
                break;
        }
        /* Output character was already written.
           Just advance the indices. */
        o++; i++;
    }
    if (o == len) {
        /* No changes. Return original string. */
        PyMem_Free(output);
        return result;
    }
    Py_DECREF(result);
    result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
                                       output, o);
    PyMem_Free(output);
    return result;
}
772
773 /* Return 1 if the input is certainly normalized, 0 if it might not be. */
774 static int
is_normalized(PyObject * self,PyObject * input,int nfc,int k)775 is_normalized(PyObject *self, PyObject *input, int nfc, int k)
776 {
777 Py_ssize_t i, len;
778 int kind;
779 void *data;
780 unsigned char prev_combining = 0, quickcheck_mask;
781
782 /* An older version of the database is requested, quickchecks must be
783 disabled. */
784 if (self && UCD_Check(self))
785 return 0;
786
787 /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
788 as described in http://unicode.org/reports/tr15/#Annex8. */
789 quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
790
791 i = 0;
792 kind = PyUnicode_KIND(input);
793 data = PyUnicode_DATA(input);
794 len = PyUnicode_GET_LENGTH(input);
795 while (i < len) {
796 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
797 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
798 unsigned char combining = record->combining;
799 unsigned char quickcheck = record->normalization_quick_check;
800
801 if (quickcheck & quickcheck_mask)
802 return 0; /* this string might need normalization */
803 if (combining && prev_combining > combining)
804 return 0; /* non-canonical sort order, not normalized */
805 prev_combining = combining;
806 }
807 return 1; /* certainly normalized */
808 }
809
810 /*[clinic input]
811 unicodedata.UCD.normalize
812
813 self: self
814 form: str
815 unistr as input: unicode
816 /
817
818 Return the normal form 'form' for the Unicode string unistr.
819
820 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
821 [clinic start generated code]*/
822
823 static PyObject *
unicodedata_UCD_normalize_impl(PyObject * self,const char * form,PyObject * input)824 unicodedata_UCD_normalize_impl(PyObject *self, const char *form,
825 PyObject *input)
826 /*[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0]*/
827 {
828 if (PyUnicode_GET_LENGTH(input) == 0) {
829 /* Special case empty input strings, since resizing
830 them later would cause internal errors. */
831 Py_INCREF(input);
832 return input;
833 }
834
835 if (strcmp(form, "NFC") == 0) {
836 if (is_normalized(self, input, 1, 0)) {
837 Py_INCREF(input);
838 return input;
839 }
840 return nfc_nfkc(self, input, 0);
841 }
842 if (strcmp(form, "NFKC") == 0) {
843 if (is_normalized(self, input, 1, 1)) {
844 Py_INCREF(input);
845 return input;
846 }
847 return nfc_nfkc(self, input, 1);
848 }
849 if (strcmp(form, "NFD") == 0) {
850 if (is_normalized(self, input, 0, 0)) {
851 Py_INCREF(input);
852 return input;
853 }
854 return nfd_nfkd(self, input, 0);
855 }
856 if (strcmp(form, "NFKD") == 0) {
857 if (is_normalized(self, input, 0, 1)) {
858 Py_INCREF(input);
859 return input;
860 }
861 return nfd_nfkd(self, input, 1);
862 }
863 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
864 return NULL;
865 }
866
867 /* -------------------------------------------------------------------- */
868 /* unicode character name tables */
869
870 /* data file generated by Tools/unicode/makeunicodedata.py */
871 #include "unicodename_db.h"
872
873 /* -------------------------------------------------------------------- */
874 /* database code (cut and pasted from the unidb package) */
875
/* Case-insensitive hash of the first `len` bytes of `s`.  Each byte is
   upper-cased before it is mixed in with multiplier `scale`, and the
   value is folded back into 24 bits whenever bits 24-31 become set. */
static unsigned long
_gethash(const char *s, int len, int scale)
{
    unsigned long hash = 0;
    int pos = 0;

    while (pos < len) {
        unsigned long top;

        hash = (hash * scale) + (unsigned char) Py_TOUPPER(Py_CHARMASK(s[pos]));
        top = hash & 0xff000000;
        if (top)
            hash = (hash ^ ((top>>24) & 0xff)) & 0x00ffffff;
        pos++;
    }
    return hash;
}
890
/* Name fragments used to build and parse "HANGUL SYLLABLE ..." names.
   Column 0 is indexed by the leading-consonant index, column 1 by the
   vowel index and column 2 by the trailing-consonant index (see
   _getucname()).  The columns hold LCount (19), VCount (21) and TCount
   (28) strings respectively; the remaining cells are 0. */
static const char * const hangul_syllables[][3] = {
    { "G",  "A",   ""   },
    { "GG", "AE",  "G"  },
    { "N",  "YA",  "GG" },
    { "D",  "YAE", "GS" },
    { "DD", "EO",  "N", },
    { "R",  "E",   "NJ" },
    { "M",  "YEO", "NH" },
    { "B",  "YE",  "D"  },
    { "BB", "O",   "L"  },
    { "S",  "WA",  "LG" },
    { "SS", "WAE", "LM" },
    { "",   "OE",  "LB" },
    { "J",  "YO",  "LS" },
    { "JJ", "U",   "LT" },
    { "C",  "WEO", "LP" },
    { "K",  "WE",  "LH" },
    { "T",  "WI",  "M"  },
    { "P",  "YU",  "B"  },
    { "H",  "EU",  "BS" },
    { 0,    "YI",  "S"  },
    { 0,    "I",   "SS" },
    { 0,    0,     "NG" },
    { 0,    0,     "J"  },
    { 0,    0,     "C"  },
    { 0,    0,     "K"  },
    { 0,    0,     "T"  },
    { 0,    0,     "P"  },
    { 0,    0,     "H"  }
};
921
922 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
923 static int
is_unified_ideograph(Py_UCS4 code)924 is_unified_ideograph(Py_UCS4 code)
925 {
926 return
927 (0x3400 <= code && code <= 0x4DB5) || /* CJK Ideograph Extension A */
928 (0x4E00 <= code && code <= 0x9FEF) || /* CJK Ideograph */
929 (0x20000 <= code && code <= 0x2A6D6) || /* CJK Ideograph Extension B */
930 (0x2A700 <= code && code <= 0x2B734) || /* CJK Ideograph Extension C */
931 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
932 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
933 (0x2CEB0 <= code && code <= 0x2EBEF); /* CJK Ideograph Extension F */
934 }
935
/* macros used to determine if the given code point is in the PUA range that
 * we are using to store aliases and named sequences.  Both ranges are
 * half-open: [start, end).  The argument is evaluated twice, so it must
 * not have side effects; it is parenthesized so that any expression can
 * be passed safely. */
#define IS_ALIAS(cp) (((cp) >= aliases_start) && ((cp) < aliases_end))
#define IS_NAMED_SEQ(cp) (((cp) >= named_sequences_start) && \
                          ((cp) < named_sequences_end))
941
942 static int
_getucname(PyObject * self,Py_UCS4 code,char * buffer,int buflen,int with_alias_and_seq)943 _getucname(PyObject *self, Py_UCS4 code, char* buffer, int buflen,
944 int with_alias_and_seq)
945 {
946 /* Find the name associated with the given code point.
947 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
948 * that we are using for aliases and named sequences. */
949 int offset;
950 int i;
951 int word;
952 unsigned char* w;
953
954 if (code >= 0x110000)
955 return 0;
956
957 /* XXX should we just skip all the code points in the PUAs here? */
958 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
959 return 0;
960
961 if (self && UCD_Check(self)) {
962 /* in 3.2.0 there are no aliases and named sequences */
963 const change_record *old;
964 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
965 return 0;
966 old = get_old_record(self, code);
967 if (old->category_changed == 0) {
968 /* unassigned */
969 return 0;
970 }
971 }
972
973 if (SBase <= code && code < SBase+SCount) {
974 /* Hangul syllable. */
975 int SIndex = code - SBase;
976 int L = SIndex / NCount;
977 int V = (SIndex % NCount) / TCount;
978 int T = SIndex % TCount;
979
980 if (buflen < 27)
981 /* Worst case: HANGUL SYLLABLE <10chars>. */
982 return 0;
983 strcpy(buffer, "HANGUL SYLLABLE ");
984 buffer += 16;
985 strcpy(buffer, hangul_syllables[L][0]);
986 buffer += strlen(hangul_syllables[L][0]);
987 strcpy(buffer, hangul_syllables[V][1]);
988 buffer += strlen(hangul_syllables[V][1]);
989 strcpy(buffer, hangul_syllables[T][2]);
990 buffer += strlen(hangul_syllables[T][2]);
991 *buffer = '\0';
992 return 1;
993 }
994
995 if (is_unified_ideograph(code)) {
996 if (buflen < 28)
997 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
998 return 0;
999 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1000 return 1;
1001 }
1002
1003 /* get offset into phrasebook */
1004 offset = phrasebook_offset1[(code>>phrasebook_shift)];
1005 offset = phrasebook_offset2[(offset<<phrasebook_shift) +
1006 (code&((1<<phrasebook_shift)-1))];
1007 if (!offset)
1008 return 0;
1009
1010 i = 0;
1011
1012 for (;;) {
1013 /* get word index */
1014 word = phrasebook[offset] - phrasebook_short;
1015 if (word >= 0) {
1016 word = (word << 8) + phrasebook[offset+1];
1017 offset += 2;
1018 } else
1019 word = phrasebook[offset++];
1020 if (i) {
1021 if (i > buflen)
1022 return 0; /* buffer overflow */
1023 buffer[i++] = ' ';
1024 }
1025 /* copy word string from lexicon. the last character in the
1026 word has bit 7 set. the last word in a string ends with
1027 0x80 */
1028 w = lexicon + lexicon_offset[word];
1029 while (*w < 128) {
1030 if (i >= buflen)
1031 return 0; /* buffer overflow */
1032 buffer[i++] = *w++;
1033 }
1034 if (i >= buflen)
1035 return 0; /* buffer overflow */
1036 buffer[i++] = *w & 127;
1037 if (*w == 128)
1038 break; /* end of word */
1039 }
1040
1041 return 1;
1042 }
1043
1044 static int
_cmpname(PyObject * self,int code,const char * name,int namelen)1045 _cmpname(PyObject *self, int code, const char* name, int namelen)
1046 {
1047 /* check if code corresponds to the given name */
1048 int i;
1049 char buffer[NAME_MAXLEN+1];
1050 if (!_getucname(self, code, buffer, NAME_MAXLEN, 1))
1051 return 0;
1052 for (i = 0; i < namelen; i++) {
1053 if (Py_TOUPPER(Py_CHARMASK(name[i])) != buffer[i])
1054 return 0;
1055 }
1056 return buffer[namelen] == '\0';
1057 }
1058
1059 static void
find_syllable(const char * str,int * len,int * pos,int count,int column)1060 find_syllable(const char *str, int *len, int *pos, int count, int column)
1061 {
1062 int i, len1;
1063 *len = -1;
1064 for (i = 0; i < count; i++) {
1065 const char *s = hangul_syllables[i][column];
1066 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1067 if (len1 <= *len)
1068 continue;
1069 if (strncmp(str, s, len1) == 0) {
1070 *len = len1;
1071 *pos = i;
1072 }
1073 }
1074 if (*len == -1) {
1075 *len = 0;
1076 }
1077 }
1078
1079 static int
_check_alias_and_seq(unsigned int cp,Py_UCS4 * code,int with_named_seq)1080 _check_alias_and_seq(unsigned int cp, Py_UCS4* code, int with_named_seq)
1081 {
1082 /* check if named sequences are allowed */
1083 if (!with_named_seq && IS_NAMED_SEQ(cp))
1084 return 0;
1085 /* if the code point is in the PUA range that we use for aliases,
1086 * convert it to obtain the right code point */
1087 if (IS_ALIAS(cp))
1088 *code = name_aliases[cp-aliases_start];
1089 else
1090 *code = cp;
1091 return 1;
1092 }
1093
1094 static int
_getcode(PyObject * self,const char * name,int namelen,Py_UCS4 * code,int with_named_seq)1095 _getcode(PyObject* self, const char* name, int namelen, Py_UCS4* code,
1096 int with_named_seq)
1097 {
1098 /* Return the code point associated with the given name.
1099 * Named aliases are resolved too (unless self != NULL (i.e. we are using
1100 * 3.2.0)). If with_named_seq is 1, returns the PUA code point that we are
1101 * using for the named sequence, and the caller must then convert it. */
1102 unsigned int h, v;
1103 unsigned int mask = code_size-1;
1104 unsigned int i, incr;
1105
1106 /* Check for hangul syllables. */
1107 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1108 int len, L = -1, V = -1, T = -1;
1109 const char *pos = name + 16;
1110 find_syllable(pos, &len, &L, LCount, 0);
1111 pos += len;
1112 find_syllable(pos, &len, &V, VCount, 1);
1113 pos += len;
1114 find_syllable(pos, &len, &T, TCount, 2);
1115 pos += len;
1116 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1117 *code = SBase + (L*VCount+V)*TCount + T;
1118 return 1;
1119 }
1120 /* Otherwise, it's an illegal syllable name. */
1121 return 0;
1122 }
1123
1124 /* Check for unified ideographs. */
1125 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1126 /* Four or five hexdigits must follow. */
1127 v = 0;
1128 name += 22;
1129 namelen -= 22;
1130 if (namelen != 4 && namelen != 5)
1131 return 0;
1132 while (namelen--) {
1133 v *= 16;
1134 if (*name >= '0' && *name <= '9')
1135 v += *name - '0';
1136 else if (*name >= 'A' && *name <= 'F')
1137 v += *name - 'A' + 10;
1138 else
1139 return 0;
1140 name++;
1141 }
1142 if (!is_unified_ideograph(v))
1143 return 0;
1144 *code = v;
1145 return 1;
1146 }
1147
1148 /* the following is the same as python's dictionary lookup, with
1149 only minor changes. see the makeunicodedata script for more
1150 details */
1151
1152 h = (unsigned int) _gethash(name, namelen, code_magic);
1153 i = (~h) & mask;
1154 v = code_hash[i];
1155 if (!v)
1156 return 0;
1157 if (_cmpname(self, v, name, namelen))
1158 return _check_alias_and_seq(v, code, with_named_seq);
1159 incr = (h ^ (h >> 3)) & mask;
1160 if (!incr)
1161 incr = mask;
1162 for (;;) {
1163 i = (i + incr) & mask;
1164 v = code_hash[i];
1165 if (!v)
1166 return 0;
1167 if (_cmpname(self, v, name, namelen))
1168 return _check_alias_and_seq(v, code, with_named_seq);
1169 incr = incr << 1;
1170 if (incr > mask)
1171 incr = incr ^ code_poly;
1172 }
1173 }
1174
1175 static const _PyUnicode_Name_CAPI hashAPI =
1176 {
1177 sizeof(_PyUnicode_Name_CAPI),
1178 _getucname,
1179 _getcode
1180 };
1181
1182 /* -------------------------------------------------------------------- */
1183 /* Python bindings */
1184
1185 /*[clinic input]
1186 unicodedata.UCD.name
1187
1188 self: self
1189 chr: int(accept={str})
1190 default: object=NULL
1191 /
1192
1193 Returns the name assigned to the character chr as a string.
1194
1195 If no name is defined, default is returned, or, if not given,
1196 ValueError is raised.
1197 [clinic start generated code]*/
1198
1199 static PyObject *
unicodedata_UCD_name_impl(PyObject * self,int chr,PyObject * default_value)1200 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1201 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1202 {
1203 char name[NAME_MAXLEN+1];
1204 Py_UCS4 c = (Py_UCS4)chr;
1205
1206 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1207 if (default_value == NULL) {
1208 PyErr_SetString(PyExc_ValueError, "no such name");
1209 return NULL;
1210 }
1211 else {
1212 Py_INCREF(default_value);
1213 return default_value;
1214 }
1215 }
1216
1217 return PyUnicode_FromString(name);
1218 }
1219
1220 /*[clinic input]
1221 unicodedata.UCD.lookup
1222
1223 self: self
1224 name: str(accept={str, robuffer}, zeroes=True)
1225 /
1226
1227 Look up character by name.
1228
1229 If a character with the given name is found, return the
1230 corresponding character. If not found, KeyError is raised.
1231 [clinic start generated code]*/
1232
1233 static PyObject *
unicodedata_UCD_lookup_impl(PyObject * self,const char * name,Py_ssize_clean_t name_length)1234 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1235 Py_ssize_clean_t name_length)
1236 /*[clinic end generated code: output=765cb8186788e6be input=a557be0f8607a0d6]*/
1237 {
1238 Py_UCS4 code;
1239 unsigned int index;
1240 if (name_length > NAME_MAXLEN) {
1241 PyErr_SetString(PyExc_KeyError, "name too long");
1242 return NULL;
1243 }
1244
1245 if (!_getcode(self, name, (int)name_length, &code, 1)) {
1246 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1247 return NULL;
1248 }
1249 /* check if code is in the PUA range that we use for named sequences
1250 and convert it */
1251 if (IS_NAMED_SEQ(code)) {
1252 index = code-named_sequences_start;
1253 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1254 named_sequences[index].seq,
1255 named_sequences[index].seqlen);
1256 }
1257 return PyUnicode_FromOrdinal(code);
1258 }
1259
1260 /* XXX Add doc strings. */
1261
1262 static PyMethodDef unicodedata_functions[] = {
1263 UNICODEDATA_UCD_DECIMAL_METHODDEF
1264 UNICODEDATA_UCD_DIGIT_METHODDEF
1265 UNICODEDATA_UCD_NUMERIC_METHODDEF
1266 UNICODEDATA_UCD_CATEGORY_METHODDEF
1267 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1268 UNICODEDATA_UCD_COMBINING_METHODDEF
1269 UNICODEDATA_UCD_MIRRORED_METHODDEF
1270 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1271 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1272 UNICODEDATA_UCD_NAME_METHODDEF
1273 UNICODEDATA_UCD_LOOKUP_METHODDEF
1274 UNICODEDATA_UCD_NORMALIZE_METHODDEF
1275 {NULL, NULL} /* sentinel */
1276 };
1277
1278 static PyTypeObject UCD_Type = {
1279 /* The ob_type field must be initialized in the module init function
1280 * to be portable to Windows without using C++. */
1281 PyVarObject_HEAD_INIT(NULL, 0)
1282 "unicodedata.UCD", /*tp_name*/
1283 sizeof(PreviousDBVersion), /*tp_basicsize*/
1284 0, /*tp_itemsize*/
1285 /* methods */
1286 (destructor)PyObject_Del, /*tp_dealloc*/
1287 0, /*tp_print*/
1288 0, /*tp_getattr*/
1289 0, /*tp_setattr*/
1290 0, /*tp_reserved*/
1291 0, /*tp_repr*/
1292 0, /*tp_as_number*/
1293 0, /*tp_as_sequence*/
1294 0, /*tp_as_mapping*/
1295 0, /*tp_hash*/
1296 0, /*tp_call*/
1297 0, /*tp_str*/
1298 PyObject_GenericGetAttr,/*tp_getattro*/
1299 0, /*tp_setattro*/
1300 0, /*tp_as_buffer*/
1301 Py_TPFLAGS_DEFAULT, /*tp_flags*/
1302 0, /*tp_doc*/
1303 0, /*tp_traverse*/
1304 0, /*tp_clear*/
1305 0, /*tp_richcompare*/
1306 0, /*tp_weaklistoffset*/
1307 0, /*tp_iter*/
1308 0, /*tp_iternext*/
1309 unicodedata_functions, /*tp_methods*/
1310 DB_members, /*tp_members*/
1311 0, /*tp_getset*/
1312 0, /*tp_base*/
1313 0, /*tp_dict*/
1314 0, /*tp_descr_get*/
1315 0, /*tp_descr_set*/
1316 0, /*tp_dictoffset*/
1317 0, /*tp_init*/
1318 0, /*tp_alloc*/
1319 0, /*tp_new*/
1320 0, /*tp_free*/
1321 0, /*tp_is_gc*/
1322 };
1323
1324 PyDoc_STRVAR(unicodedata_docstring,
1325 "This module provides access to the Unicode Character Database which\n\
1326 defines character properties for all Unicode characters. The data in\n\
1327 this database is based on the UnicodeData.txt file version\n\
1328 " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
1329 \n\
1330 The module uses the same names and symbols as defined by the\n\
1331 UnicodeData File Format " UNIDATA_VERSION ".");
1332
1333 static struct PyModuleDef unicodedatamodule = {
1334 PyModuleDef_HEAD_INIT,
1335 "unicodedata",
1336 unicodedata_docstring,
1337 -1,
1338 unicodedata_functions,
1339 NULL,
1340 NULL,
1341 NULL,
1342 NULL
1343 };
1344
1345 PyMODINIT_FUNC
PyInit_unicodedata(void)1346 PyInit_unicodedata(void)
1347 {
1348 PyObject *m, *v;
1349
1350 Py_TYPE(&UCD_Type) = &PyType_Type;
1351
1352 m = PyModule_Create(&unicodedatamodule);
1353 if (!m)
1354 return NULL;
1355
1356 PyModule_AddStringConstant(m, "unidata_version", UNIDATA_VERSION);
1357 Py_INCREF(&UCD_Type);
1358 PyModule_AddObject(m, "UCD", (PyObject*)&UCD_Type);
1359
1360 /* Previous versions */
1361 v = new_previous_version("3.2.0", get_change_3_2_0, normalization_3_2_0);
1362 if (v != NULL)
1363 PyModule_AddObject(m, "ucd_3_2_0", v);
1364
1365 /* Export C API */
1366 v = PyCapsule_New((void *)&hashAPI, PyUnicodeData_CAPSULE_NAME, NULL);
1367 if (v != NULL)
1368 PyModule_AddObject(m, "ucnhash_CAPI", v);
1369 return m;
1370 }
1371
1372 /*
1373 Local variables:
1374 c-basic-offset: 4
1375 indent-tabs-mode: nil
1376 End:
1377 */
1378