1 /* ------------------------------------------------------------------------
2
3 unicodedata -- Provides access to the Unicode database.
4
5 The current version number is reported in the unidata_version constant.
6
7 Written by Marc-Andre Lemburg (mal@lemburg.com).
8 Modified for Python 2.0 by Fredrik Lundh (fredrik@pythonware.com)
9 Modified by Martin v. Löwis (martin@v.loewis.de)
10
11 Copyright (c) Corporation for National Research Initiatives.
12
13 ------------------------------------------------------------------------ */
14
15 #ifndef Py_BUILD_CORE_BUILTIN
16 # define Py_BUILD_CORE_MODULE 1
17 #endif
18
19 #include "Python.h"
20 #include "pycore_ucnhash.h" // _PyUnicode_Name_CAPI
21
22 #include <stdbool.h>
23 #include <stddef.h> // offsetof()
24
25 /*[clinic input]
26 module unicodedata
27 class unicodedata.UCD 'PreviousDBVersion *' '<not used>'
28 [clinic start generated code]*/
29 /*[clinic end generated code: output=da39a3ee5e6b4b0d input=e47113e05924be43]*/
30
31 /* character properties */
32
33 typedef struct {
34 const unsigned char category; /* index into
35 _PyUnicode_CategoryNames */
36 const unsigned char combining; /* combining class value 0 - 255 */
37 const unsigned char bidirectional; /* index into
38 _PyUnicode_BidirectionalNames */
39 const unsigned char mirrored; /* true if mirrored in bidir mode */
40 const unsigned char east_asian_width; /* index into
41 _PyUnicode_EastAsianWidth */
42 const unsigned char normalization_quick_check; /* see is_normalized() */
43 } _PyUnicode_DatabaseRecord;
44
45 typedef struct change_record {
46 /* sequence of fields should be the same as in merge_old_version */
47 const unsigned char bidir_changed;
48 const unsigned char category_changed;
49 const unsigned char decimal_changed;
50 const unsigned char mirrored_changed;
51 const unsigned char east_asian_width_changed;
52 const double numeric_changed;
53 } change_record;
54
55 /* data file generated by Tools/unicode/makeunicodedata.py */
56 #include "unicodedata_db.h"
57
58 static const _PyUnicode_DatabaseRecord*
59 _getrecord_ex(Py_UCS4 code)
60 {
61 int index;
62 if (code >= 0x110000)
63 index = 0;
64 else {
65 index = index1[(code>>SHIFT)];
66 index = index2[(index<<SHIFT)+(code&((1<<SHIFT)-1))];
67 }
68
69 return &_PyUnicode_Database_Records[index];
70 }
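/* Illustrative sketch, comment only (the real SHIFT value and tables come from
   the generated unicodedata_db.h; SHIFT == 7 is an assumption here): the
   lookup above is a two-level trie.  For U+00E9 it would run roughly as

       int page   = index1[0xE9 >> 7];               // which block of 128 code points
       int offset = 0xE9 & ((1 << 7) - 1);           // position inside that block
       int rec    = index2[(page << 7) + offset];    // index into the records table
       const _PyUnicode_DatabaseRecord *r = &_PyUnicode_Database_Records[rec];

   Identical blocks are shared between index1 entries, which keeps the tables
   compact. */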
71
72 /* ------------- Previous-version API ------------------------------------- */
73 typedef struct previous_version {
74 PyObject_HEAD
75 const char *name;
76 const change_record* (*getrecord)(Py_UCS4);
77 Py_UCS4 (*normalization)(Py_UCS4);
78 } PreviousDBVersion;
79
80 #include "clinic/unicodedata.c.h"
81
82 #define get_old_record(self, v) ((((PreviousDBVersion*)self)->getrecord)(v))
83
84 static PyMemberDef DB_members[] = {
85 {"unidata_version", Py_T_STRING, offsetof(PreviousDBVersion, name), Py_READONLY},
86 {NULL}
87 };
88
89 // Check if self is a unicodedata.UCD instance.
90 // If self is NULL (when the PyCapsule C API is used), return 0.
91 // PyModule_Check() is used to avoid having to retrieve the ucd_type.
92 // See the unicodedata_functions comment for the rationale behind this macro.
93 #define UCD_Check(self) (self != NULL && !PyModule_Check(self))
94
95 static PyObject*
96 new_previous_version(PyTypeObject *ucd_type,
97 const char*name, const change_record* (*getrecord)(Py_UCS4),
98 Py_UCS4 (*normalization)(Py_UCS4))
99 {
100 PreviousDBVersion *self;
101 self = PyObject_GC_New(PreviousDBVersion, ucd_type);
102 if (self == NULL)
103 return NULL;
104 self->name = name;
105 self->getrecord = getrecord;
106 self->normalization = normalization;
107 PyObject_GC_Track(self);
108 return (PyObject*)self;
109 }
110
111
112 /* --- Module API --------------------------------------------------------- */
113
114 /*[clinic input]
115 unicodedata.UCD.decimal
116
117 self: self
118 chr: int(accept={str})
119 default: object=NULL
120 /
121
122 Converts a Unicode character into its equivalent decimal value.
123
124 Returns the decimal value assigned to the character chr as integer.
125 If no such value is defined, default is returned, or, if not given,
126 ValueError is raised.
127 [clinic start generated code]*/
128
129 static PyObject *
130 unicodedata_UCD_decimal_impl(PyObject *self, int chr,
131 PyObject *default_value)
132 /*[clinic end generated code: output=be23376e1a185231 input=933f8107993f23d0]*/
133 {
134 int have_old = 0;
135 long rc;
136 Py_UCS4 c = (Py_UCS4)chr;
137
138 if (UCD_Check(self)) {
139 const change_record *old = get_old_record(self, c);
140 if (old->category_changed == 0) {
141 /* unassigned */
142 have_old = 1;
143 rc = -1;
144 }
145 else if (old->decimal_changed != 0xFF) {
146 have_old = 1;
147 rc = old->decimal_changed;
148 }
149 }
150
151 if (!have_old)
152 rc = Py_UNICODE_TODECIMAL(c);
153 if (rc < 0) {
154 if (default_value == NULL) {
155 PyErr_SetString(PyExc_ValueError,
156 "not a decimal");
157 return NULL;
158 }
159 else {
160 return Py_NewRef(default_value);
161 }
162 }
163 return PyLong_FromLong(rc);
164 }
165
166 /*[clinic input]
167 unicodedata.UCD.digit
168
169 self: self
170 chr: int(accept={str})
171 default: object=NULL
172 /
173
174 Converts a Unicode character into its equivalent digit value.
175
176 Returns the digit value assigned to the character chr as integer.
177 If no such value is defined, default is returned, or, if not given,
178 ValueError is raised.
179 [clinic start generated code]*/
180
181 static PyObject *
182 unicodedata_UCD_digit_impl(PyObject *self, int chr, PyObject *default_value)
183 /*[clinic end generated code: output=96e18c950171fd2f input=e27d6e4565cd29f2]*/
184 {
185 long rc;
186 Py_UCS4 c = (Py_UCS4)chr;
187 rc = Py_UNICODE_TODIGIT(c);
188 if (rc < 0) {
189 if (default_value == NULL) {
190 PyErr_SetString(PyExc_ValueError, "not a digit");
191 return NULL;
192 }
193 else {
194 return Py_NewRef(default_value);
195 }
196 }
197 return PyLong_FromLong(rc);
198 }
199
200 /*[clinic input]
201 unicodedata.UCD.numeric
202
203 self: self
204 chr: int(accept={str})
205 default: object=NULL
206 /
207
208 Converts a Unicode character into its equivalent numeric value.
209
210 Returns the numeric value assigned to the character chr as float.
211 If no such value is defined, default is returned, or, if not given,
212 ValueError is raised.
213 [clinic start generated code]*/
214
215 static PyObject *
216 unicodedata_UCD_numeric_impl(PyObject *self, int chr,
217 PyObject *default_value)
218 /*[clinic end generated code: output=53ce281fe85b10c4 input=fdf5871a5542893c]*/
219 {
220 int have_old = 0;
221 double rc;
222 Py_UCS4 c = (Py_UCS4)chr;
223
224 if (UCD_Check(self)) {
225 const change_record *old = get_old_record(self, c);
226 if (old->category_changed == 0) {
227 /* unassigned */
228 have_old = 1;
229 rc = -1.0;
230 }
231 else if (old->decimal_changed != 0xFF) {
232 have_old = 1;
233 rc = old->decimal_changed;
234 }
235 }
236
237 if (!have_old)
238 rc = Py_UNICODE_TONUMERIC(c);
239 if (rc == -1.0) {
240 if (default_value == NULL) {
241 PyErr_SetString(PyExc_ValueError, "not a numeric character");
242 return NULL;
243 }
244 else {
245 return Py_NewRef(default_value);
246 }
247 }
248 return PyFloat_FromDouble(rc);
249 }
250
251 /*[clinic input]
252 unicodedata.UCD.category
253
254 self: self
255 chr: int(accept={str})
256 /
257
258 Returns the general category assigned to the character chr as string.
259 [clinic start generated code]*/
260
261 static PyObject *
262 unicodedata_UCD_category_impl(PyObject *self, int chr)
263 /*[clinic end generated code: output=8571539ee2e6783a input=27d6f3d85050bc06]*/
264 {
265 int index;
266 Py_UCS4 c = (Py_UCS4)chr;
267 index = (int) _getrecord_ex(c)->category;
268 if (UCD_Check(self)) {
269 const change_record *old = get_old_record(self, c);
270 if (old->category_changed != 0xFF)
271 index = old->category_changed;
272 }
273 return PyUnicode_FromString(_PyUnicode_CategoryNames[index]);
274 }
275
276 /*[clinic input]
277 unicodedata.UCD.bidirectional
278
279 self: self
280 chr: int(accept={str})
281 /
282
283 Returns the bidirectional class assigned to the character chr as string.
284
285 If no such value is defined, an empty string is returned.
286 [clinic start generated code]*/
287
288 static PyObject *
289 unicodedata_UCD_bidirectional_impl(PyObject *self, int chr)
290 /*[clinic end generated code: output=d36310ce2039bb92 input=b3d8f42cebfcf475]*/
291 {
292 int index;
293 Py_UCS4 c = (Py_UCS4)chr;
294 index = (int) _getrecord_ex(c)->bidirectional;
295 if (UCD_Check(self)) {
296 const change_record *old = get_old_record(self, c);
297 if (old->category_changed == 0)
298 index = 0; /* unassigned */
299 else if (old->bidir_changed != 0xFF)
300 index = old->bidir_changed;
301 }
302 return PyUnicode_FromString(_PyUnicode_BidirectionalNames[index]);
303 }
304
305 /*[clinic input]
306 unicodedata.UCD.combining -> int
307
308 self: self
309 chr: int(accept={str})
310 /
311
312 Returns the canonical combining class assigned to the character chr as integer.
313
314 Returns 0 if no combining class is defined.
315 [clinic start generated code]*/
316
317 static int
318 unicodedata_UCD_combining_impl(PyObject *self, int chr)
319 /*[clinic end generated code: output=cad056d0cb6a5920 input=9f2d6b2a95d0a22a]*/
320 {
321 int index;
322 Py_UCS4 c = (Py_UCS4)chr;
323 index = (int) _getrecord_ex(c)->combining;
324 if (UCD_Check(self)) {
325 const change_record *old = get_old_record(self, c);
326 if (old->category_changed == 0)
327 index = 0; /* unassigned */
328 }
329 return index;
330 }
331
332 /*[clinic input]
333 unicodedata.UCD.mirrored -> int
334
335 self: self
336 chr: int(accept={str})
337 /
338
339 Returns the mirrored property assigned to the character chr as integer.
340
341 Returns 1 if the character has been identified as a "mirrored"
342 character in bidirectional text, 0 otherwise.
343 [clinic start generated code]*/
344
345 static int
346 unicodedata_UCD_mirrored_impl(PyObject *self, int chr)
347 /*[clinic end generated code: output=2532dbf8121b50e6 input=5dd400d351ae6f3b]*/
348 {
349 int index;
350 Py_UCS4 c = (Py_UCS4)chr;
351 index = (int) _getrecord_ex(c)->mirrored;
352 if (UCD_Check(self)) {
353 const change_record *old = get_old_record(self, c);
354 if (old->category_changed == 0)
355 index = 0; /* unassigned */
356 else if (old->mirrored_changed != 0xFF)
357 index = old->mirrored_changed;
358 }
359 return index;
360 }
361
362 /*[clinic input]
363 unicodedata.UCD.east_asian_width
364
365 self: self
366 chr: int(accept={str})
367 /
368
369 Returns the east asian width assigned to the character chr as string.
370 [clinic start generated code]*/
371
372 static PyObject *
373 unicodedata_UCD_east_asian_width_impl(PyObject *self, int chr)
374 /*[clinic end generated code: output=484e8537d9ee8197 input=c4854798aab026e0]*/
375 {
376 int index;
377 Py_UCS4 c = (Py_UCS4)chr;
378 index = (int) _getrecord_ex(c)->east_asian_width;
379 if (UCD_Check(self)) {
380 const change_record *old = get_old_record(self, c);
381 if (old->category_changed == 0)
382 index = 0; /* unassigned */
383 else if (old->east_asian_width_changed != 0xFF)
384 index = old->east_asian_width_changed;
385 }
386 return PyUnicode_FromString(_PyUnicode_EastAsianWidthNames[index]);
387 }
388
389 /*[clinic input]
390 unicodedata.UCD.decomposition
391
392 self: self
393 chr: int(accept={str})
394 /
395
396 Returns the character decomposition mapping assigned to the character chr as string.
397
398 An empty string is returned in case no such mapping is defined.
399 [clinic start generated code]*/
400
401 static PyObject *
402 unicodedata_UCD_decomposition_impl(PyObject *self, int chr)
403 /*[clinic end generated code: output=7d699f3ec7565d27 input=e4c12459ad68507b]*/
404 {
405 char decomp[256];
406 int code, index, count;
407 size_t i;
408 unsigned int prefix_index;
409 Py_UCS4 c = (Py_UCS4)chr;
410
411 code = (int)c;
412
413 if (UCD_Check(self)) {
414 const change_record *old = get_old_record(self, c);
415 if (old->category_changed == 0)
416 return PyUnicode_FromString(""); /* unassigned */
417 }
418
419 if (code < 0 || code >= 0x110000)
420 index = 0;
421 else {
422 index = decomp_index1[(code>>DECOMP_SHIFT)];
423 index = decomp_index2[(index<<DECOMP_SHIFT)+
424 (code&((1<<DECOMP_SHIFT)-1))];
425 }
426
427 /* high byte is number of hex bytes (usually one or two), low byte
428 is prefix code (an index into decomp_prefix). */
429 count = decomp_data[index] >> 8;
430
431 /* XXX: could allocate the PyString up front instead
432 (strlen(prefix) + 5 * count + 1 bytes) */
433
434 /* Based on how index is calculated above and decomp_data is generated
435 from Tools/unicode/makeunicodedata.py, it should not be possible
436 to overflow decomp_prefix. */
437 prefix_index = decomp_data[index] & 255;
438 assert(prefix_index < Py_ARRAY_LENGTH(decomp_prefix));
439
440 /* copy prefix */
441 i = strlen(decomp_prefix[prefix_index]);
442 memcpy(decomp, decomp_prefix[prefix_index], i);
443
444 while (count-- > 0) {
445 if (i)
446 decomp[i++] = ' ';
447 assert(i < sizeof(decomp));
448 PyOS_snprintf(decomp + i, sizeof(decomp) - i, "%04X",
449 decomp_data[++index]);
450 i += strlen(decomp + i);
451 }
452 return PyUnicode_FromStringAndSize(decomp, i);
453 }
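/* Worked example, comment only: decomp_data packs one 16-bit value per entry,
   high byte = number of code points in the mapping, low byte = index into
   decomp_prefix.  For U+00BC VULGAR FRACTION ONE QUARTER, whose UnicodeData.txt
   decomposition is "<fraction> 0031 2044 0034", the decode above gives

       count        = decomp_data[index] >> 8;    // 3
       prefix_index = decomp_data[index] & 255;   // index of "<fraction>"

   and the loop builds the string "<fraction> 0031 2044 0034". */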
454
455 static void
456 get_decomp_record(PyObject *self, Py_UCS4 code,
457 int *index, int *prefix, int *count)
458 {
459 if (code >= 0x110000) {
460 *index = 0;
461 }
462 else if (UCD_Check(self)
463 && get_old_record(self, code)->category_changed==0) {
464 /* unassigned in old version */
465 *index = 0;
466 }
467 else {
468 *index = decomp_index1[(code>>DECOMP_SHIFT)];
469 *index = decomp_index2[(*index<<DECOMP_SHIFT)+
470 (code&((1<<DECOMP_SHIFT)-1))];
471 }
472
473 /* high byte is number of hex bytes (usually one or two), low byte
474 is prefix code (an index into decomp_prefix). */
475 *count = decomp_data[*index] >> 8;
476 *prefix = decomp_data[*index] & 255;
477
478 (*index)++;
479 }
480
481 #define SBase 0xAC00
482 #define LBase 0x1100
483 #define VBase 0x1161
484 #define TBase 0x11A7
485 #define LCount 19
486 #define VCount 21
487 #define TCount 28
488 #define NCount (VCount*TCount)
489 #define SCount (LCount*NCount)
490
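/* Worked example of the Hangul arithmetic used below, comment only.  The
   constants give NCount == 21*28 == 588 and SCount == 19*588 == 11172.
   For U+AC01 HANGUL SYLLABLE GAG:

       SIndex = 0xAC01 - SBase                     // 1
       L = LBase + SIndex / NCount                 // 0x1100  CHOSEONG KIYEOK
       V = VBase + (SIndex % NCount) / TCount      // 0x1161  JUNGSEONG A
       T = TBase + SIndex % TCount                 // 0x11A8  JONGSEONG KIYEOK

   so the NFD of U+AC01 is <U+1100, U+1161, U+11A8>.  T == TBase means "no
   trailing consonant" and is not emitted. */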
491 static PyObject*
492 nfd_nfkd(PyObject *self, PyObject *input, int k)
493 {
494 PyObject *result;
495 Py_UCS4 *output;
496 Py_ssize_t i, o, osize;
497 int kind;
498 const void *data;
499 /* Longest decomposition in Unicode 3.2: U+FDFA */
500 Py_UCS4 stack[20];
501 Py_ssize_t space, isize;
502 int index, prefix, count, stackptr;
503 unsigned char prev, cur;
504
505 stackptr = 0;
506 isize = PyUnicode_GET_LENGTH(input);
507 space = isize;
508 /* Overallocate at most 10 characters. */
509 if (space > 10) {
510 if (space <= PY_SSIZE_T_MAX - 10)
511 space += 10;
512 }
513 else {
514 space *= 2;
515 }
516 osize = space;
517 output = PyMem_NEW(Py_UCS4, space);
518 if (!output) {
519 PyErr_NoMemory();
520 return NULL;
521 }
522 i = o = 0;
523 kind = PyUnicode_KIND(input);
524 data = PyUnicode_DATA(input);
525
526 while (i < isize) {
527 stack[stackptr++] = PyUnicode_READ(kind, data, i++);
528 while(stackptr) {
529 Py_UCS4 code = stack[--stackptr];
530 /* Hangul Decomposition adds three characters in
531 a single step, so we need at least that much room. */
532 if (space < 3) {
533 Py_UCS4 *new_output;
534 osize += 10;
535 space += 10;
536 new_output = PyMem_Realloc(output, osize*sizeof(Py_UCS4));
537 if (new_output == NULL) {
538 PyMem_Free(output);
539 PyErr_NoMemory();
540 return NULL;
541 }
542 output = new_output;
543 }
544 /* Hangul Decomposition. */
545 if (SBase <= code && code < (SBase+SCount)) {
546 int SIndex = code - SBase;
547 int L = LBase + SIndex / NCount;
548 int V = VBase + (SIndex % NCount) / TCount;
549 int T = TBase + SIndex % TCount;
550 output[o++] = L;
551 output[o++] = V;
552 space -= 2;
553 if (T != TBase) {
554 output[o++] = T;
555 space --;
556 }
557 continue;
558 }
559 /* normalization changes */
560 if (UCD_Check(self)) {
561 Py_UCS4 value = ((PreviousDBVersion*)self)->normalization(code);
562 if (value != 0) {
563 stack[stackptr++] = value;
564 continue;
565 }
566 }
567
568 /* Other decompositions. */
569 get_decomp_record(self, code, &index, &prefix, &count);
570
571 /* Copy character if it is not decomposable, or has a
572 compatibility decomposition, but we do NFD. */
573 if (!count || (prefix && !k)) {
574 output[o++] = code;
575 space--;
576 continue;
577 }
578 /* Copy decomposition onto the stack, in reverse
579 order. */
580 while(count) {
581 code = decomp_data[index + (--count)];
582 stack[stackptr++] = code;
583 }
584 }
585 }
586
587 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
588 output, o);
589 PyMem_Free(output);
590 if (!result)
591 return NULL;
592 /* result is guaranteed to be ready, as it is compact. */
593 kind = PyUnicode_KIND(result);
594 data = PyUnicode_DATA(result);
595
596 /* Sort canonically. */
597 i = 0;
598 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
599 for (i++; i < PyUnicode_GET_LENGTH(result); i++) {
600 cur = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
601 if (prev == 0 || cur == 0 || prev <= cur) {
602 prev = cur;
603 continue;
604 }
605 /* Non-canonical order. Need to switch *i with previous. */
606 o = i - 1;
607 while (1) {
608 Py_UCS4 tmp = PyUnicode_READ(kind, data, o+1);
609 PyUnicode_WRITE(kind, data, o+1,
610 PyUnicode_READ(kind, data, o));
611 PyUnicode_WRITE(kind, data, o, tmp);
612 o--;
613 if (o < 0)
614 break;
615 prev = _getrecord_ex(PyUnicode_READ(kind, data, o))->combining;
616 if (prev == 0 || prev <= cur)
617 break;
618 }
619 prev = _getrecord_ex(PyUnicode_READ(kind, data, i))->combining;
620 }
621 return result;
622 }
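/* Example of the canonical sort above, comment only: combining marks must end
   up in non-decreasing combining-class order.  Given the input
   <a, U+0301 COMBINING ACUTE ACCENT (class 230), U+0316 COMBINING GRAVE ACCENT
   BELOW (class 220)>, the bubble pass swaps the two marks to produce
   <a, U+0316, U+0301>; starters (class 0) act as barriers and are never
   reordered across. */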
623
624 static int
625 find_nfc_index(const struct reindex* nfc, Py_UCS4 code)
626 {
627 unsigned int index;
628 for (index = 0; nfc[index].start; index++) {
629 unsigned int start = nfc[index].start;
630 if (code < start)
631 return -1;
632 if (code <= start + nfc[index].count) {
633 unsigned int delta = code - start;
634 return nfc[index].index + delta;
635 }
636 }
637 return -1;
638 }
639
640 static PyObject*
641 nfc_nfkc(PyObject *self, PyObject *input, int k)
642 {
643 PyObject *result;
644 int kind;
645 const void *data;
646 Py_UCS4 *output;
647 Py_ssize_t i, i1, o, len;
648 int f,l,index,index1,comb;
649 Py_UCS4 code;
650 Py_ssize_t skipped[20];
651 int cskipped = 0;
652
653 result = nfd_nfkd(self, input, k);
654 if (!result)
655 return NULL;
656 /* result will be "ready". */
657 kind = PyUnicode_KIND(result);
658 data = PyUnicode_DATA(result);
659 len = PyUnicode_GET_LENGTH(result);
660
661 /* We allocate a buffer for the output.
662 If we find that we made no changes, we still return
663 the NFD result. */
664 output = PyMem_NEW(Py_UCS4, len);
665 if (!output) {
666 PyErr_NoMemory();
667 Py_DECREF(result);
668 return 0;
669 }
670 i = o = 0;
671
672 again:
673 while (i < len) {
674 for (index = 0; index < cskipped; index++) {
675 if (skipped[index] == i) {
676 /* *i character is skipped.
677 Remove from list. */
678 skipped[index] = skipped[cskipped-1];
679 cskipped--;
680 i++;
681 goto again; /* continue while */
682 }
683 }
684 /* Hangul Composition. We don't need to check for <LV,T>
685 pairs, since we always have decomposed data. */
686 code = PyUnicode_READ(kind, data, i);
687 if (LBase <= code && code < (LBase+LCount) &&
688 i + 1 < len &&
689 VBase <= PyUnicode_READ(kind, data, i+1) &&
690 PyUnicode_READ(kind, data, i+1) < (VBase+VCount)) {
691 /* check L character is a modern leading consonant (0x1100 ~ 0x1112)
692 and V character is a modern vowel (0x1161 ~ 0x1175). */
693 int LIndex, VIndex;
694 LIndex = code - LBase;
695 VIndex = PyUnicode_READ(kind, data, i+1) - VBase;
696 code = SBase + (LIndex*VCount+VIndex)*TCount;
697 i+=2;
698 if (i < len &&
699 TBase < PyUnicode_READ(kind, data, i) &&
700 PyUnicode_READ(kind, data, i) < (TBase+TCount)) {
701 /* check T character is a modern trailing consonant
702 (0x11A8 ~ 0x11C2). */
703 code += PyUnicode_READ(kind, data, i)-TBase;
704 i++;
705 }
706 output[o++] = code;
707 continue;
708 }
709
710 /* code is still input[i] here */
711 f = find_nfc_index(nfc_first, code);
712 if (f == -1) {
713 output[o++] = code;
714 i++;
715 continue;
716 }
717 /* Find next unblocked character. */
718 i1 = i+1;
719 comb = 0;
720 /* output base character for now; might be updated later. */
721 output[o] = PyUnicode_READ(kind, data, i);
722 while (i1 < len) {
723 Py_UCS4 code1 = PyUnicode_READ(kind, data, i1);
724 int comb1 = _getrecord_ex(code1)->combining;
725 if (comb) {
726 if (comb1 == 0)
727 break;
728 if (comb >= comb1) {
729 /* Character is blocked. */
730 i1++;
731 continue;
732 }
733 }
734 l = find_nfc_index(nfc_last, code1);
735 /* i1 cannot be combined with i. If i1
736 is a starter, we don't need to look further.
737 Otherwise, record the combining class. */
738 if (l == -1) {
739 not_combinable:
740 if (comb1 == 0)
741 break;
742 comb = comb1;
743 i1++;
744 continue;
745 }
746 index = f*TOTAL_LAST + l;
747 index1 = comp_index[index >> COMP_SHIFT];
748 code = comp_data[(index1<<COMP_SHIFT)+
749 (index&((1<<COMP_SHIFT)-1))];
750 if (code == 0)
751 goto not_combinable;
752
753 /* Replace the original character. */
754 output[o] = code;
755 /* Mark the second character unused. */
756 assert(cskipped < 20);
757 skipped[cskipped++] = i1;
758 i1++;
759 f = find_nfc_index(nfc_first, output[o]);
760 if (f == -1)
761 break;
762 }
763 /* Output character was already written.
764 Just advance the indices. */
765 o++; i++;
766 }
767 if (o == len) {
768 /* No changes. Return original string. */
769 PyMem_Free(output);
770 return result;
771 }
772 Py_DECREF(result);
773 result = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
774 output, o);
775 PyMem_Free(output);
776 return result;
777 }
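/* Example of the composition pass above, comment only: after NFD, the input
   <e, U+0301 COMBINING ACUTE ACCENT> has a starter with an nfc_first entry and
   an unblocked follower with an nfc_last entry, so the comp_data lookup
   replaces the starter with U+00E9 LATIN SMALL LETTER E WITH ACUTE and records
   the mark's index in skipped[].  A mark preceded by another mark of greater
   or equal combining class is "blocked" and is left in place. */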
778
779 // This needs to match the logic in makeunicodedata.py
780 // which constructs the quickcheck data.
781 typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;
782
783 /* Run the Unicode normalization "quickcheck" algorithm.
784 *
785 * Return YES or NO if quickcheck determines the input is certainly
786 * normalized or certainly not, and MAYBE if quickcheck is unable to
787 * tell.
788 *
789 * If `yes_only` is true, then return MAYBE as soon as we determine
790 * the answer is not YES.
791 *
792 * For background and details on the algorithm, see UAX #15:
793 * https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
794 */
795 static QuickcheckResult
796 is_normalized_quickcheck(PyObject *self, PyObject *input, bool nfc, bool k,
797 bool yes_only)
798 {
799 /* If the old UCD 3.2.0 database is requested, quickchecks must be disabled. */
800 if (UCD_Check(self)) {
801 return MAYBE;
802 }
803
804 if (PyUnicode_IS_ASCII(input)) {
805 return YES;
806 }
807
808 Py_ssize_t i, len;
809 int kind;
810 const void *data;
811 unsigned char prev_combining = 0;
812
813 /* The two quickcheck bits at this shift have type QuickcheckResult. */
814 int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);
815
816 QuickcheckResult result = YES; /* certainly normalized, unless we find something */
817
818 i = 0;
819 kind = PyUnicode_KIND(input);
820 data = PyUnicode_DATA(input);
821 len = PyUnicode_GET_LENGTH(input);
822 while (i < len) {
823 Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
824 const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
825
826 unsigned char combining = record->combining;
827 if (combining && prev_combining > combining)
828 return NO; /* non-canonical sort order, not normalized */
829 prev_combining = combining;
830
831 unsigned char quickcheck_whole = record->normalization_quick_check;
832 if (yes_only) {
833 if (quickcheck_whole & (3 << quickcheck_shift))
834 return MAYBE;
835 } else {
836 switch ((quickcheck_whole >> quickcheck_shift) & 3) {
837 case NO:
838 return NO;
839 case MAYBE:
840 result = MAYBE; /* this string might need normalization */
841 }
842 }
843 }
844 return result;
845 }
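/* Bit layout implied by quickcheck_shift above (produced by
   makeunicodedata.py): normalization_quick_check holds four 2-bit
   QuickcheckResult fields, one per normalization form:

       NFD  -> bits 0-1        NFKD -> bits 2-3
       NFC  -> bits 4-5        NFKC -> bits 6-7

   e.g. the NFC check reads ((record->normalization_quick_check >> 4) & 3)
   and interprets it as YES, MAYBE or NO. */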
846
847 /*[clinic input]
848 unicodedata.UCD.is_normalized
849
850 self: self
851 form: unicode
852 unistr as input: unicode
853 /
854
855 Return whether the Unicode string unistr is in the normal form 'form'.
856
857 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
858 [clinic start generated code]*/
859
860 static PyObject *
861 unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
862 PyObject *input)
863 /*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/
864 {
865 if (PyUnicode_GET_LENGTH(input) == 0) {
866 /* special case empty input strings. */
867 Py_RETURN_TRUE;
868 }
869
870 PyObject *result;
871 bool nfc = false;
872 bool k = false;
873 QuickcheckResult m;
874
875 PyObject *cmp;
876 int match = 0;
877
878 if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
879 nfc = true;
880 }
881 else if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
882 nfc = true;
883 k = true;
884 }
885 else if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
886 /* matches default values for `nfc` and `k` */
887 }
888 else if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
889 k = true;
890 }
891 else {
892 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
893 return NULL;
894 }
895
896 m = is_normalized_quickcheck(self, input, nfc, k, false);
897
898 if (m == MAYBE) {
899 cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
900 if (cmp == NULL) {
901 return NULL;
902 }
903 match = PyUnicode_Compare(input, cmp);
904 Py_DECREF(cmp);
905 result = (match == 0) ? Py_True : Py_False;
906 }
907 else {
908 result = (m == YES) ? Py_True : Py_False;
909 }
910
911 return Py_NewRef(result);
912 }
913
914
915 /*[clinic input]
916 unicodedata.UCD.normalize
917
918 self: self
919 form: unicode
920 unistr as input: unicode
921 /
922
923 Return the normal form 'form' for the Unicode string unistr.
924
925 Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'.
926 [clinic start generated code]*/
927
928 static PyObject *
929 unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
930 PyObject *input)
931 /*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/
932 {
933 if (PyUnicode_GET_LENGTH(input) == 0) {
934 /* Special case empty input strings, since resizing
935 them later would cause internal errors. */
936 return Py_NewRef(input);
937 }
938
939 if (PyUnicode_CompareWithASCIIString(form, "NFC") == 0) {
940 if (is_normalized_quickcheck(self, input,
941 true, false, true) == YES) {
942 return Py_NewRef(input);
943 }
944 return nfc_nfkc(self, input, 0);
945 }
946 if (PyUnicode_CompareWithASCIIString(form, "NFKC") == 0) {
947 if (is_normalized_quickcheck(self, input,
948 true, true, true) == YES) {
949 return Py_NewRef(input);
950 }
951 return nfc_nfkc(self, input, 1);
952 }
953 if (PyUnicode_CompareWithASCIIString(form, "NFD") == 0) {
954 if (is_normalized_quickcheck(self, input,
955 false, false, true) == YES) {
956 return Py_NewRef(input);
957 }
958 return nfd_nfkd(self, input, 0);
959 }
960 if (PyUnicode_CompareWithASCIIString(form, "NFKD") == 0) {
961 if (is_normalized_quickcheck(self, input,
962 false, true, true) == YES) {
963 return Py_NewRef(input);
964 }
965 return nfd_nfkd(self, input, 1);
966 }
967 PyErr_SetString(PyExc_ValueError, "invalid normalization form");
968 return NULL;
969 }
970
971 /* -------------------------------------------------------------------- */
972 /* unicode character name tables */
973
974 /* data file generated by Tools/unicode/makeunicodedata.py */
975 #include "unicodename_db.h"
976
977 /* -------------------------------------------------------------------- */
978 /* database code (cut and pasted from the unidb package) */
979
980 static const char * const hangul_syllables[][3] = {
981 { "G", "A", "" },
982 { "GG", "AE", "G" },
983 { "N", "YA", "GG" },
984 { "D", "YAE", "GS" },
985 { "DD", "EO", "N", },
986 { "R", "E", "NJ" },
987 { "M", "YEO", "NH" },
988 { "B", "YE", "D" },
989 { "BB", "O", "L" },
990 { "S", "WA", "LG" },
991 { "SS", "WAE", "LM" },
992 { "", "OE", "LB" },
993 { "J", "YO", "LS" },
994 { "JJ", "U", "LT" },
995 { "C", "WEO", "LP" },
996 { "K", "WE", "LH" },
997 { "T", "WI", "M" },
998 { "P", "YU", "B" },
999 { "H", "EU", "BS" },
1000 { 0, "YI", "S" },
1001 { 0, "I", "SS" },
1002 { 0, 0, "NG" },
1003 { 0, 0, "J" },
1004 { 0, 0, "C" },
1005 { 0, 0, "K" },
1006 { 0, 0, "T" },
1007 { 0, 0, "P" },
1008 { 0, 0, "H" }
1009 };
1010
1011 /* These ranges need to match makeunicodedata.py:cjk_ranges. */
1012 static int
1013 is_unified_ideograph(Py_UCS4 code)
1014 {
1015 return
1016 (0x3400 <= code && code <= 0x4DBF) || /* CJK Ideograph Extension A */
1017 (0x4E00 <= code && code <= 0x9FFF) || /* CJK Ideograph */
1018 (0x20000 <= code && code <= 0x2A6DF) || /* CJK Ideograph Extension B */
1019 (0x2A700 <= code && code <= 0x2B739) || /* CJK Ideograph Extension C */
1020 (0x2B740 <= code && code <= 0x2B81D) || /* CJK Ideograph Extension D */
1021 (0x2B820 <= code && code <= 0x2CEA1) || /* CJK Ideograph Extension E */
1022 (0x2CEB0 <= code && code <= 0x2EBE0) || /* CJK Ideograph Extension F */
1023 (0x2EBF0 <= code && code <= 0x2EE5D) || /* CJK Ideograph Extension I */
1024 (0x30000 <= code && code <= 0x3134A) || /* CJK Ideograph Extension G */
1025 (0x31350 <= code && code <= 0x323AF); /* CJK Ideograph Extension H */
1026 }
1027
1028 /* macros used to determine if the given code point is in the PUA range that
1029 * we are using to store aliases and named sequences */
1030 #define IS_ALIAS(cp) ((cp >= aliases_start) && (cp < aliases_end))
1031 #define IS_NAMED_SEQ(cp) ((cp >= named_sequences_start) && \
1032 (cp < named_sequences_end))
1033
1034
1035 // DAWG decoding functions
1036
1037 static unsigned int
1038 _dawg_decode_varint_unsigned(unsigned int index, unsigned int* result)
1039 {
1040 unsigned int res = 0;
1041 unsigned int shift = 0;
1042 for (;;) {
1043 unsigned char byte = packed_name_dawg[index];
1044 res |= (byte & 0x7f) << shift;
1045 index++;
1046 shift += 7;
1047 if (!(byte & 0x80)) {
1048 *result = res;
1049 return index;
1050 }
1051 }
1052 }
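/* Worked example, comment only: the varint is little-endian base 128, seven
   data bits per byte, high bit set on every byte except the last.  For the
   byte sequence 0x85 0x01:

       0x85 -> res = 0x05, high bit set, keep going
       0x01 -> res |= 0x01 << 7 (= 128), high bit clear, stop

   so *result == 133 and the returned index has advanced by two bytes. */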
1053
1054 static int
1055 _dawg_match_edge(const char* name, unsigned int namelen, unsigned int size,
1056 unsigned int label_offset, unsigned int namepos)
1057 {
1058 // This returns 1 if the edge matched, 0 if it didn't (but further edges
1059 // could match) and -1 if the name cannot match at all.
1060 if (size > 1 && namepos + size > namelen) {
1061 return 0;
1062 }
1063 for (unsigned int i = 0; i < size; i++) {
1064 if (packed_name_dawg[label_offset + i] != Py_TOUPPER(name[namepos + i])) {
1065 if (i > 0) {
1066 return -1; // cannot match at all
1067 }
1068 return 0;
1069 }
1070 }
1071 return 1;
1072 }
1073
1074 // reading DAWG node information:
1075 // a node is encoded by a varint. The lowest bit of that int is set if the node
1076 // is a final, accepting state. The higher bits of that int represent the
1077 // number of names that are encoded by the sub-DAWG started by this node. It's
1078 // used to compute the position of a name.
1079 //
1080 // the starting node of the DAWG is at position 0.
1081 //
1082 // the varint representing a node is followed by the node's edges, the encoding
1083 // is described below
1084
1085
1086 static unsigned int
1087 _dawg_decode_node(unsigned int node_offset, bool* final)
1088 {
1089 unsigned int num;
1090 node_offset = _dawg_decode_varint_unsigned(node_offset, &num);
1091 *final = num & 1;
1092 return node_offset;
1093 }
1094
1095 static bool
1096 _dawg_node_is_final(unsigned int node_offset)
1097 {
1098 unsigned int num;
1099 _dawg_decode_varint_unsigned(node_offset, &num);
1100 return num & 1;
1101 }
1102
1103 static unsigned int
1104 _dawg_node_descendant_count(unsigned int node_offset)
1105 {
1106 unsigned int num;
1107 _dawg_decode_varint_unsigned(node_offset, &num);
1108 return num >> 1;
1109 }
1110
1111
1112 // reading DAWG edge information:
1113 // a DAWG edge is comprised of the following information:
1114 // (1) the size of the label of the string attached to the edge
1115 // (2) the characters of that edge
1116 // (3) the target node
1117 // (4) whether the edge is the last edge in the list of edges following a node
1118 //
1119 // this information is encoded in a compact form as follows:
1120 //
1121 // +---------+-----------------+--------------+--------------------
1122 // | varint | size (if != 1) | label chars | ... next edge ...
1123 // +---------+-----------------+--------------+--------------------
1124 //
1125 // - first comes a varint
1126 // - the lowest bit of that varint is set if the edge is the last edge of its node (4)
1127 // - the second lowest bit of that varint is true if the length of
1128 // the label is 1 (1)
1129 // - the rest of the varint is an offset that can be used to compute
1130 // the offset of the target node of that edge (3)
1131 // - if the length is not 1, the first varint is followed by a
1132 // byte encoding the number of characters of the label (1)
1133 // (unicode character names aren't larger than 256 bytes, therefore each
1134 // edge label can be at most 256 chars, but is usually smaller)
1135 // - the next size bytes are the characters of the label (2)
1136 //
1137 // the offset of the target node is computed as follows: the number in the
1138 // upper bits of the varint needs to be added to the offset of the target node
1139 // of the previous edge. For the first edge, where there is no previous target
1140 // node, the offset of the first edge is used.
1141 // The intuition here is that edges going out from a node often lead to nodes
1142 // that are close by, leading to small offsets from the current node and thus
1143 // fewer bytes.
1144 //
1145 // There is a special case: if a final node has no outgoing edges, it has to be
1146 // followed by a 0 byte to indicate that there are no edges (because the end of
1147 // the edge list is normally indicated in a bit in the edge encoding). This is
1148 // indicated by _dawg_decode_edge returning -1
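//
// Worked example, comment only: suppose the first varint of an edge decodes to
// num == 11 (binary 1011).  Then, following _dawg_decode_edge below:
//
//     last_edge  = num & 1        -> 1   (this is the node's last edge)
//     num >>= 1                   -> 5
//     len_is_one = num & 1        -> 1   (the label is a single character)
//     num >>= 1                   -> 2   (offset delta)
//     target_node_offset = prev_target_node_offset + 2
//
// and because len_is_one is set, no size byte follows; the next byte is the
// label character itself.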
1149
1150
1151 static int
1152 _dawg_decode_edge(bool is_first_edge, unsigned int prev_target_node_offset,
1153 unsigned int edge_offset, unsigned int* size,
1154 unsigned int* label_offset, unsigned int* target_node_offset)
1155 {
1156 unsigned int num;
1157 edge_offset = _dawg_decode_varint_unsigned(edge_offset, &num);
1158 if (num == 0 && is_first_edge) {
1159 return -1; // trying to decode past a final node without outgoing edges
1160 }
1161 bool last_edge = num & 1;
1162 num >>= 1;
1163 bool len_is_one = num & 1;
1164 num >>= 1;
1165 *target_node_offset = prev_target_node_offset + num;
1166 if (len_is_one) {
1167 *size = 1;
1168 } else {
1169 *size = packed_name_dawg[edge_offset++];
1170 }
1171 *label_offset = edge_offset;
1172 return last_edge;
1173 }
1174
1175 static int
1176 _lookup_dawg_packed(const char* name, unsigned int namelen)
1177 {
1178 unsigned int stringpos = 0;
1179 unsigned int node_offset = 0;
1180 unsigned int result = 0; // this is the number of final nodes that we skipped to match name
1181 while (stringpos < namelen) {
1182 bool final;
1183 unsigned int edge_offset = _dawg_decode_node(node_offset, &final);
1184 unsigned int prev_target_node_offset = edge_offset;
1185 bool is_first_edge = true;
1186 for (;;) {
1187 unsigned int size;
1188 unsigned int label_offset, target_node_offset;
1189 int last_edge = _dawg_decode_edge(
1190 is_first_edge, prev_target_node_offset, edge_offset,
1191 &size, &label_offset, &target_node_offset);
1192 if (last_edge == -1) {
1193 return -1;
1194 }
1195 is_first_edge = false;
1196 prev_target_node_offset = target_node_offset;
1197 int matched = _dawg_match_edge(name, namelen, size, label_offset, stringpos);
1198 if (matched == -1) {
1199 return -1;
1200 }
1201 if (matched) {
1202 if (final)
1203 result += 1;
1204 stringpos += size;
1205 node_offset = target_node_offset;
1206 break;
1207 }
1208 if (last_edge) {
1209 return -1;
1210 }
1211 result += _dawg_node_descendant_count(target_node_offset);
1212 edge_offset = label_offset + size;
1213 }
1214 }
1215 if (_dawg_node_is_final(node_offset)) {
1216 return result;
1217 }
1218 return -1;
1219 }
1220
1221 static int
1222 _inverse_dawg_lookup(char* buffer, unsigned int buflen, unsigned int pos)
1223 {
1224 unsigned int node_offset = 0;
1225 unsigned int bufpos = 0;
1226 for (;;) {
1227 bool final;
1228 unsigned int edge_offset = _dawg_decode_node(node_offset, &final);
1229
1230 if (final) {
1231 if (pos == 0) {
1232 if (bufpos + 1 == buflen) {
1233 return 0;
1234 }
1235 buffer[bufpos] = '\0';
1236 return 1;
1237 }
1238 pos--;
1239 }
1240 unsigned int prev_target_node_offset = edge_offset;
1241 bool is_first_edge = true;
1242 for (;;) {
1243 unsigned int size;
1244 unsigned int label_offset, target_node_offset;
1245 int last_edge = _dawg_decode_edge(
1246 is_first_edge, prev_target_node_offset, edge_offset,
1247 &size, &label_offset, &target_node_offset);
1248 if (last_edge == -1) {
1249 return 0;
1250 }
1251 is_first_edge = false;
1252 prev_target_node_offset = target_node_offset;
1253
1254 unsigned int descendant_count = _dawg_node_descendant_count(target_node_offset);
1255 if (pos < descendant_count) {
1256 if (bufpos + size >= buflen) {
1257 return 0; // buffer overflow
1258 }
1259 for (unsigned int i = 0; i < size; i++) {
1260 buffer[bufpos++] = packed_name_dawg[label_offset++];
1261 }
1262 node_offset = target_node_offset;
1263 break;
1264 } else if (!last_edge) {
1265 pos -= descendant_count;
1266 edge_offset = label_offset + size;
1267 } else {
1268 return 0;
1269 }
1270 }
1271 }
1272 }
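// Sketch of the round trip between the two lookups, comment only and purely
// illustrative (uses _getcode()/_getucname() defined further below):
//
//     Py_UCS4 cp;
//     char buf[NAME_MAXLEN + 1];
//     if (_getcode("LATIN SMALL LETTER A", 20, &cp)) {
//         // cp == 0x61: _lookup_dawg_packed() returned the name's rank, and
//         // dawg_pos_to_codepoint[] mapped that rank to the code point.
//         _getucname(NULL, cp, buf, NAME_MAXLEN, 0);
//         // buf == "LATIN SMALL LETTER A": dawg_codepoint_to_pos_index1/2
//         // recover the rank and _inverse_dawg_lookup() rebuilds the name.
//     }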
1273
1274
1275 static int
1276 _getucname(PyObject *self,
1277 Py_UCS4 code, char* buffer, int buflen, int with_alias_and_seq)
1278 {
1279 /* Find the name associated with the given code point.
1280 * If with_alias_and_seq is 1, check for names in the Private Use Area 15
1281 * that we are using for aliases and named sequences. */
1282 int offset;
1283
1284 if (code >= 0x110000)
1285 return 0;
1286
1287 /* XXX should we just skip all the code points in the PUAs here? */
1288 if (!with_alias_and_seq && (IS_ALIAS(code) || IS_NAMED_SEQ(code)))
1289 return 0;
1290
1291 if (UCD_Check(self)) {
1292 /* in 3.2.0 there are no aliases and named sequences */
1293 const change_record *old;
1294 if (IS_ALIAS(code) || IS_NAMED_SEQ(code))
1295 return 0;
1296 old = get_old_record(self, code);
1297 if (old->category_changed == 0) {
1298 /* unassigned */
1299 return 0;
1300 }
1301 }
1302
1303 if (SBase <= code && code < SBase+SCount) {
1304 /* Hangul syllable. */
1305 int SIndex = code - SBase;
1306 int L = SIndex / NCount;
1307 int V = (SIndex % NCount) / TCount;
1308 int T = SIndex % TCount;
1309
1310 if (buflen < 27)
1311 /* Worst case: HANGUL SYLLABLE <10chars>. */
1312 return 0;
1313 strcpy(buffer, "HANGUL SYLLABLE ");
1314 buffer += 16;
1315 strcpy(buffer, hangul_syllables[L][0]);
1316 buffer += strlen(hangul_syllables[L][0]);
1317 strcpy(buffer, hangul_syllables[V][1]);
1318 buffer += strlen(hangul_syllables[V][1]);
1319 strcpy(buffer, hangul_syllables[T][2]);
1320 buffer += strlen(hangul_syllables[T][2]);
1321 *buffer = '\0';
1322 return 1;
1323 }
1324
1325 if (is_unified_ideograph(code)) {
1326 if (buflen < 28)
1327 /* Worst case: CJK UNIFIED IDEOGRAPH-20000 */
1328 return 0;
1329 sprintf(buffer, "CJK UNIFIED IDEOGRAPH-%X", code);
1330 return 1;
1331 }
1332
1333 /* get position of codepoint in order of names in the dawg */
1334 offset = dawg_codepoint_to_pos_index1[(code>>DAWG_CODEPOINT_TO_POS_SHIFT)];
1335 offset = dawg_codepoint_to_pos_index2[(offset<<DAWG_CODEPOINT_TO_POS_SHIFT) +
1336 (code&((1<<DAWG_CODEPOINT_TO_POS_SHIFT)-1))];
1337 if (offset == DAWG_CODEPOINT_TO_POS_NOTFOUND)
1338 return 0;
1339
1340 assert(buflen >= 0);
1341 return _inverse_dawg_lookup(buffer, Py_SAFE_DOWNCAST(buflen, int, unsigned int), offset);
1342 }
1343
1344 static int
1345 capi_getucname(Py_UCS4 code,
1346 char* buffer, int buflen,
1347 int with_alias_and_seq)
1348 {
1349 return _getucname(NULL, code, buffer, buflen, with_alias_and_seq);
1350
1351 }
1352
1353 static void
1354 find_syllable(const char *str, int *len, int *pos, int count, int column)
1355 {
1356 int i, len1;
1357 *len = -1;
1358 for (i = 0; i < count; i++) {
1359 const char *s = hangul_syllables[i][column];
1360 len1 = Py_SAFE_DOWNCAST(strlen(s), size_t, int);
1361 if (len1 <= *len)
1362 continue;
1363 if (strncmp(str, s, len1) == 0) {
1364 *len = len1;
1365 *pos = i;
1366 }
1367 }
1368 if (*len == -1) {
1369 *len = 0;
1370 }
1371 }
1372
1373 static int
1374 _check_alias_and_seq(Py_UCS4* code, int with_named_seq)
1375 {
1376 /* check if named sequences are allowed */
1377 if (!with_named_seq && IS_NAMED_SEQ(*code))
1378 return 0;
1379 /* if the code point is in the PUA range that we use for aliases,
1380 * convert it to obtain the right code point */
1381 if (IS_ALIAS(*code))
1382 *code = name_aliases[*code-aliases_start];
1383 return 1;
1384 }
1385
1386
1387 static int
1388 _getcode(const char* name, int namelen, Py_UCS4* code)
1389 {
1390 /* Return the code point associated with the given name.
1391 * Named aliases are not resolved; they are returned as a code point in the
1392 * PUA. */
1393
1394 /* Check for hangul syllables. */
1395 if (strncmp(name, "HANGUL SYLLABLE ", 16) == 0) {
1396 int len, L = -1, V = -1, T = -1;
1397 const char *pos = name + 16;
1398 find_syllable(pos, &len, &L, LCount, 0);
1399 pos += len;
1400 find_syllable(pos, &len, &V, VCount, 1);
1401 pos += len;
1402 find_syllable(pos, &len, &T, TCount, 2);
1403 pos += len;
1404 if (L != -1 && V != -1 && T != -1 && pos-name == namelen) {
1405 *code = SBase + (L*VCount+V)*TCount + T;
1406 return 1;
1407 }
1408 /* Otherwise, it's an illegal syllable name. */
1409 return 0;
1410 }
1411
1412 /* Check for unified ideographs. */
1413 if (strncmp(name, "CJK UNIFIED IDEOGRAPH-", 22) == 0) {
1414 /* Four or five hexdigits must follow. */
1415 unsigned int v;
1416 v = 0;
1417 name += 22;
1418 namelen -= 22;
1419 if (namelen != 4 && namelen != 5)
1420 return 0;
1421 while (namelen--) {
1422 v *= 16;
1423 if (*name >= '0' && *name <= '9')
1424 v += *name - '0';
1425 else if (*name >= 'A' && *name <= 'F')
1426 v += *name - 'A' + 10;
1427 else
1428 return 0;
1429 name++;
1430 }
1431 if (!is_unified_ideograph(v))
1432 return 0;
1433 *code = v;
1434 return 1;
1435 }
1436
1437 assert(namelen >= 0);
1438 int position = _lookup_dawg_packed(name, Py_SAFE_DOWNCAST(namelen, int, unsigned int));
1439 if (position < 0) {
1440 return 0;
1441 }
1442 *code = dawg_pos_to_codepoint[position];
1443 return 1;
1444 }
1445
1446
1447 static int
1448 capi_getcode(const char* name, int namelen, Py_UCS4* code,
1449 int with_named_seq)
1450 {
1451 if (!_getcode(name, namelen, code)) {
1452 return 0;
1453 }
1454 return _check_alias_and_seq(code, with_named_seq);
1455 }
1456
1457 static void
1458 unicodedata_destroy_capi(PyObject *capsule)
1459 {
1460 void *capi = PyCapsule_GetPointer(capsule, PyUnicodeData_CAPSULE_NAME);
1461 PyMem_Free(capi);
1462 }
1463
1464 static PyObject *
1465 unicodedata_create_capi(void)
1466 {
1467 _PyUnicode_Name_CAPI *capi = PyMem_Malloc(sizeof(_PyUnicode_Name_CAPI));
1468 if (capi == NULL) {
1469 PyErr_NoMemory();
1470 return NULL;
1471 }
1472 capi->getname = capi_getucname;
1473 capi->getcode = capi_getcode;
1474
1475 PyObject *capsule = PyCapsule_New(capi,
1476 PyUnicodeData_CAPSULE_NAME,
1477 unicodedata_destroy_capi);
1478 if (capsule == NULL) {
1479 PyMem_Free(capi);
1480 }
1481 return capsule;
1482 };
1483
1484
1485 /* -------------------------------------------------------------------- */
1486 /* Python bindings */
1487
1488 /*[clinic input]
1489 unicodedata.UCD.name
1490
1491 self: self
1492 chr: int(accept={str})
1493 default: object=NULL
1494 /
1495
1496 Returns the name assigned to the character chr as a string.
1497
1498 If no name is defined, default is returned, or, if not given,
1499 ValueError is raised.
1500 [clinic start generated code]*/
1501
1502 static PyObject *
1503 unicodedata_UCD_name_impl(PyObject *self, int chr, PyObject *default_value)
1504 /*[clinic end generated code: output=6bbb37a326407707 input=3e0367f534de56d9]*/
1505 {
1506 char name[NAME_MAXLEN+1];
1507 Py_UCS4 c = (Py_UCS4)chr;
1508
1509 if (!_getucname(self, c, name, NAME_MAXLEN, 0)) {
1510 if (default_value == NULL) {
1511 PyErr_SetString(PyExc_ValueError, "no such name");
1512 return NULL;
1513 }
1514 else {
1515 return Py_NewRef(default_value);
1516 }
1517 }
1518
1519 return PyUnicode_FromString(name);
1520 }
1521
1522 /*[clinic input]
1523 unicodedata.UCD.lookup
1524
1525 self: self
1526 name: str(accept={str, robuffer}, zeroes=True)
1527 /
1528
1529 Look up character by name.
1530
1531 If a character with the given name is found, return the
1532 corresponding character. If not found, KeyError is raised.
1533 [clinic start generated code]*/
1534
1535 static PyObject *
1536 unicodedata_UCD_lookup_impl(PyObject *self, const char *name,
1537 Py_ssize_t name_length)
1538 /*[clinic end generated code: output=7f03fc4959b242f6 input=a557be0f8607a0d6]*/
1539 {
1540 Py_UCS4 code;
1541 unsigned int index;
1542 if (name_length > NAME_MAXLEN) {
1543 PyErr_SetString(PyExc_KeyError, "name too long");
1544 return NULL;
1545 }
1546
1547 if (!_getcode(name, (int)name_length, &code)) {
1548 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1549 return NULL;
1550 }
1551 if (UCD_Check(self)) {
1552 /* in 3.2.0 there are no aliases and named sequences */
1553 if (IS_ALIAS(code) || IS_NAMED_SEQ(code)) {
1554 PyErr_Format(PyExc_KeyError, "undefined character name '%s'", name);
1555 return 0;
1556 }
1557 }
1558 /* check if code is in the PUA range that we use for named sequences
1559 and convert it */
1560 if (IS_NAMED_SEQ(code)) {
1561 index = code-named_sequences_start;
1562 return PyUnicode_FromKindAndData(PyUnicode_2BYTE_KIND,
1563 named_sequences[index].seq,
1564 named_sequences[index].seqlen);
1565 }
1566 if (IS_ALIAS(code)) {
1567 code = name_aliases[code-aliases_start];
1568 }
1569 return PyUnicode_FromOrdinal(code);
1570 }
1571
1572 // List of functions used to define module functions *AND* unicodedata.UCD
1573 // methods. For module functions, self is the module. For UCD methods, self
1574 // is a UCD instance. The UCD_Check() macro is used to check whether self is
1575 // a UCD instance.
1576 static PyMethodDef unicodedata_functions[] = {
1577 UNICODEDATA_UCD_DECIMAL_METHODDEF
1578 UNICODEDATA_UCD_DIGIT_METHODDEF
1579 UNICODEDATA_UCD_NUMERIC_METHODDEF
1580 UNICODEDATA_UCD_CATEGORY_METHODDEF
1581 UNICODEDATA_UCD_BIDIRECTIONAL_METHODDEF
1582 UNICODEDATA_UCD_COMBINING_METHODDEF
1583 UNICODEDATA_UCD_MIRRORED_METHODDEF
1584 UNICODEDATA_UCD_EAST_ASIAN_WIDTH_METHODDEF
1585 UNICODEDATA_UCD_DECOMPOSITION_METHODDEF
1586 UNICODEDATA_UCD_NAME_METHODDEF
1587 UNICODEDATA_UCD_LOOKUP_METHODDEF
1588 UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF
1589 UNICODEDATA_UCD_NORMALIZE_METHODDEF
1590 {NULL, NULL} /* sentinel */
1591 };
1592
1593 static int
1594 ucd_traverse(PreviousDBVersion *self, visitproc visit, void *arg)
1595 {
1596 Py_VISIT(Py_TYPE(self));
1597 return 0;
1598 }
1599
1600 static void
1601 ucd_dealloc(PreviousDBVersion *self)
1602 {
1603 PyTypeObject *tp = Py_TYPE(self);
1604 PyObject_GC_UnTrack(self);
1605 PyObject_GC_Del(self);
1606 Py_DECREF(tp);
1607 }
1608
1609 static PyType_Slot ucd_type_slots[] = {
1610 {Py_tp_dealloc, ucd_dealloc},
1611 {Py_tp_traverse, ucd_traverse},
1612 {Py_tp_getattro, PyObject_GenericGetAttr},
1613 {Py_tp_methods, unicodedata_functions},
1614 {Py_tp_members, DB_members},
1615 {0, 0}
1616 };
1617
1618 static PyType_Spec ucd_type_spec = {
1619 .name = "unicodedata.UCD",
1620 .basicsize = sizeof(PreviousDBVersion),
1621 .flags = (Py_TPFLAGS_DEFAULT | Py_TPFLAGS_DISALLOW_INSTANTIATION |
1622 Py_TPFLAGS_HAVE_GC | Py_TPFLAGS_IMMUTABLETYPE),
1623 .slots = ucd_type_slots
1624 };
1625
1626 PyDoc_STRVAR(unicodedata_docstring,
1627 "This module provides access to the Unicode Character Database which\n\
1628 defines character properties for all Unicode characters. The data in\n\
1629 this database is based on the UnicodeData.txt file version\n\
1630 " UNIDATA_VERSION " which is publicly available from ftp://ftp.unicode.org/.\n\
1631 \n\
1632 The module uses the same names and symbols as defined by the\n\
1633 UnicodeData File Format " UNIDATA_VERSION ".");
1634
1635 static int
1636 unicodedata_exec(PyObject *module)
1637 {
1638 if (PyModule_AddStringConstant(module, "unidata_version", UNIDATA_VERSION) < 0) {
1639 return -1;
1640 }
1641
1642 PyTypeObject *ucd_type = (PyTypeObject *)PyType_FromSpec(&ucd_type_spec);
1643 if (ucd_type == NULL) {
1644 return -1;
1645 }
1646
1647 if (PyModule_AddType(module, ucd_type) < 0) {
1648 Py_DECREF(ucd_type);
1649 return -1;
1650 }
1651
1652 // Unicode database version 3.2.0 used by the IDNA encoding
1653 PyObject *v;
1654 v = new_previous_version(ucd_type, "3.2.0",
1655 get_change_3_2_0, normalization_3_2_0);
1656 Py_DECREF(ucd_type);
1657 if (PyModule_Add(module, "ucd_3_2_0", v) < 0) {
1658 return -1;
1659 }
1660
1661 /* Export C API */
1662 if (PyModule_Add(module, "_ucnhash_CAPI", unicodedata_create_capi()) < 0) {
1663 return -1;
1664 }
1665 return 0;
1666 }
1667
1668 static PyModuleDef_Slot unicodedata_slots[] = {
1669 {Py_mod_exec, unicodedata_exec},
1670 {Py_mod_multiple_interpreters, Py_MOD_PER_INTERPRETER_GIL_SUPPORTED},
1671 {Py_mod_gil, Py_MOD_GIL_NOT_USED},
1672 {0, NULL}
1673 };
1674
1675 static struct PyModuleDef unicodedata_module = {
1676 PyModuleDef_HEAD_INIT,
1677 .m_name = "unicodedata",
1678 .m_doc = unicodedata_docstring,
1679 .m_size = 0,
1680 .m_methods = unicodedata_functions,
1681 .m_slots = unicodedata_slots,
1682 };
1683
1684 PyMODINIT_FUNC
1685 PyInit_unicodedata(void)
1686 {
1687 return PyModuleDef_Init(&unicodedata_module);
1688 }
1689
1690
1691 /*
1692 Local variables:
1693 c-basic-offset: 4
1694 indent-tabs-mode: nil
1695 End:
1696 */
1697