/* Source-browser navigation links captured with this file (Home, Line#,
   Scopes#, Navigate#, Raw, Download); they are not part of the code. */
1 /* stringlib: codec implementations */
2 
3 #if !STRINGLIB_IS_UNICODE
4 # error "codecs.h is specific to Unicode"
5 #endif
6 
7 #include "pycore_byteswap.h"      // _Py_bswap32()
8 
9 /* Mask to quickly check whether a C 'long' contains a
10    non-ASCII, UTF8-encoded char. */
11 #if (SIZEOF_LONG == 8)
12 # define ASCII_CHAR_MASK 0x8080808080808080UL
13 #elif (SIZEOF_LONG == 4)
14 # define ASCII_CHAR_MASK 0x80808080UL
15 #else
16 # error C 'long' size should be either 4 or 8!
17 #endif
18 
19 /* 10xxxxxx */
20 #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
21 
22 Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf8_decode)23 STRINGLIB(utf8_decode)(const char **inptr, const char *end,
24                        STRINGLIB_CHAR *dest,
25                        Py_ssize_t *outpos)
26 {
27     Py_UCS4 ch;
28     const char *s = *inptr;
29     const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
30     STRINGLIB_CHAR *p = dest + *outpos;
31 
32     while (s < end) {
33         ch = (unsigned char)*s;
34 
35         if (ch < 0x80) {
36             /* Fast path for runs of ASCII characters. Given that common UTF-8
37                input will consist of an overwhelming majority of ASCII
38                characters, we try to optimize for this case by checking
39                as many characters as a C 'long' can contain.
40                First, check if we can do an aligned read, as most CPUs have
41                a penalty for unaligned reads.
42             */
43             if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
44                 /* Help register allocation */
45                 const char *_s = s;
46                 STRINGLIB_CHAR *_p = p;
47                 while (_s < aligned_end) {
48                     /* Read a whole long at a time (either 4 or 8 bytes),
49                        and do a fast unrolled copy if it only contains ASCII
50                        characters. */
51                     unsigned long value = *(const unsigned long *) _s;
52                     if (value & ASCII_CHAR_MASK)
53                         break;
54 #if PY_LITTLE_ENDIAN
55                     _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
56                     _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
57                     _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
58                     _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
59 # if SIZEOF_LONG == 8
60                     _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
61                     _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
62                     _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
63                     _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
64 # endif
65 #else
66 # if SIZEOF_LONG == 8
67                     _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
68                     _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
69                     _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
70                     _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
71                     _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
72                     _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
73                     _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
74                     _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
75 # else
76                     _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
77                     _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
78                     _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
79                     _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
80 # endif
81 #endif
82                     _s += SIZEOF_LONG;
83                     _p += SIZEOF_LONG;
84                 }
85                 s = _s;
86                 p = _p;
87                 if (s == end)
88                     break;
89                 ch = (unsigned char)*s;
90             }
91             if (ch < 0x80) {
92                 s++;
93                 *p++ = ch;
94                 continue;
95             }
96         }
97 
98         if (ch < 0xE0) {
99             /* \xC2\x80-\xDF\xBF -- 0080-07FF */
100             Py_UCS4 ch2;
101             if (ch < 0xC2) {
102                 /* invalid sequence
103                 \x80-\xBF -- continuation byte
104                 \xC0-\xC1 -- fake 0000-007F */
105                 goto InvalidStart;
106             }
107             if (end - s < 2) {
108                 /* unexpected end of data: the caller will decide whether
109                    it's an error or not */
110                 break;
111             }
112             ch2 = (unsigned char)s[1];
113             if (!IS_CONTINUATION_BYTE(ch2))
114                 /* invalid continuation byte */
115                 goto InvalidContinuation1;
116             ch = (ch << 6) + ch2 -
117                  ((0xC0 << 6) + 0x80);
118             assert ((ch > 0x007F) && (ch <= 0x07FF));
119             s += 2;
120             if (STRINGLIB_MAX_CHAR <= 0x007F ||
121                 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
122                 /* Out-of-range */
123                 goto Return;
124             *p++ = ch;
125             continue;
126         }
127 
128         if (ch < 0xF0) {
129             /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
130             Py_UCS4 ch2, ch3;
131             if (end - s < 3) {
132                 /* unexpected end of data: the caller will decide whether
133                    it's an error or not */
134                 if (end - s < 2)
135                     break;
136                 ch2 = (unsigned char)s[1];
137                 if (!IS_CONTINUATION_BYTE(ch2) ||
138                     (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
139                     /* for clarification see comments below */
140                     goto InvalidContinuation1;
141                 break;
142             }
143             ch2 = (unsigned char)s[1];
144             ch3 = (unsigned char)s[2];
145             if (!IS_CONTINUATION_BYTE(ch2)) {
146                 /* invalid continuation byte */
147                 goto InvalidContinuation1;
148             }
149             if (ch == 0xE0) {
150                 if (ch2 < 0xA0)
151                     /* invalid sequence
152                        \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
153                     goto InvalidContinuation1;
154             } else if (ch == 0xED && ch2 >= 0xA0) {
155                 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
156                    will result in surrogates in range D800-DFFF. Surrogates are
157                    not valid UTF-8 so they are rejected.
158                    See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
159                    (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
160                 goto InvalidContinuation1;
161             }
162             if (!IS_CONTINUATION_BYTE(ch3)) {
163                 /* invalid continuation byte */
164                 goto InvalidContinuation2;
165             }
166             ch = (ch << 12) + (ch2 << 6) + ch3 -
167                  ((0xE0 << 12) + (0x80 << 6) + 0x80);
168             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
169             s += 3;
170             if (STRINGLIB_MAX_CHAR <= 0x07FF ||
171                 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
172                 /* Out-of-range */
173                 goto Return;
174             *p++ = ch;
175             continue;
176         }
177 
178         if (ch < 0xF5) {
179             /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
180             Py_UCS4 ch2, ch3, ch4;
181             if (end - s < 4) {
182                 /* unexpected end of data: the caller will decide whether
183                    it's an error or not */
184                 if (end - s < 2)
185                     break;
186                 ch2 = (unsigned char)s[1];
187                 if (!IS_CONTINUATION_BYTE(ch2) ||
188                     (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
189                     /* for clarification see comments below */
190                     goto InvalidContinuation1;
191                 if (end - s < 3)
192                     break;
193                 ch3 = (unsigned char)s[2];
194                 if (!IS_CONTINUATION_BYTE(ch3))
195                     goto InvalidContinuation2;
196                 break;
197             }
198             ch2 = (unsigned char)s[1];
199             ch3 = (unsigned char)s[2];
200             ch4 = (unsigned char)s[3];
201             if (!IS_CONTINUATION_BYTE(ch2)) {
202                 /* invalid continuation byte */
203                 goto InvalidContinuation1;
204             }
205             if (ch == 0xF0) {
206                 if (ch2 < 0x90)
207                     /* invalid sequence
208                        \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
209                     goto InvalidContinuation1;
210             } else if (ch == 0xF4 && ch2 >= 0x90) {
211                 /* invalid sequence
212                    \xF4\x90\x80\x80- -- 110000- overflow */
213                 goto InvalidContinuation1;
214             }
215             if (!IS_CONTINUATION_BYTE(ch3)) {
216                 /* invalid continuation byte */
217                 goto InvalidContinuation2;
218             }
219             if (!IS_CONTINUATION_BYTE(ch4)) {
220                 /* invalid continuation byte */
221                 goto InvalidContinuation3;
222             }
223             ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
224                  ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
225             assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
226             s += 4;
227             if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
228                 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
229                 /* Out-of-range */
230                 goto Return;
231             *p++ = ch;
232             continue;
233         }
234         goto InvalidStart;
235     }
236     ch = 0;
237 Return:
238     *inptr = s;
239     *outpos = p - dest;
240     return ch;
241 InvalidStart:
242     ch = 1;
243     goto Return;
244 InvalidContinuation1:
245     ch = 2;
246     goto Return;
247 InvalidContinuation2:
248     ch = 3;
249     goto Return;
250 InvalidContinuation3:
251     ch = 4;
252     goto Return;
253 }
254 
255 #undef ASCII_CHAR_MASK
256 
257 
258 /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
259    PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
260    UCS-1 strings don't need to handle surrogates for example. */
261 Py_LOCAL_INLINE(char *)
STRINGLIB(utf8_encoder)262 STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
263                         PyObject *unicode,
264                         const STRINGLIB_CHAR *data,
265                         Py_ssize_t size,
266                         _Py_error_handler error_handler,
267                         const char *errors)
268 {
269     Py_ssize_t i;                /* index into data of next input character */
270     char *p;                     /* next free byte in output buffer */
271 #if STRINGLIB_SIZEOF_CHAR > 1
272     PyObject *error_handler_obj = NULL;
273     PyObject *exc = NULL;
274     PyObject *rep = NULL;
275 #endif
276 #if STRINGLIB_SIZEOF_CHAR == 1
277     const Py_ssize_t max_char_size = 2;
278 #elif STRINGLIB_SIZEOF_CHAR == 2
279     const Py_ssize_t max_char_size = 3;
280 #else /*  STRINGLIB_SIZEOF_CHAR == 4 */
281     const Py_ssize_t max_char_size = 4;
282 #endif
283 
284     assert(size >= 0);
285     if (size > PY_SSIZE_T_MAX / max_char_size) {
286         /* integer overflow */
287         PyErr_NoMemory();
288         return NULL;
289     }
290 
291     _PyBytesWriter_Init(writer);
292     p = _PyBytesWriter_Alloc(writer, size * max_char_size);
293     if (p == NULL)
294         return NULL;
295 
296     for (i = 0; i < size;) {
297         Py_UCS4 ch = data[i++];
298 
299         if (ch < 0x80) {
300             /* Encode ASCII */
301             *p++ = (char) ch;
302 
303         }
304         else
305 #if STRINGLIB_SIZEOF_CHAR > 1
306         if (ch < 0x0800)
307 #endif
308         {
309             /* Encode Latin-1 */
310             *p++ = (char)(0xc0 | (ch >> 6));
311             *p++ = (char)(0x80 | (ch & 0x3f));
312         }
313 #if STRINGLIB_SIZEOF_CHAR > 1
314         else if (Py_UNICODE_IS_SURROGATE(ch)) {
315             Py_ssize_t startpos, endpos, newpos;
316             Py_ssize_t k;
317             if (error_handler == _Py_ERROR_UNKNOWN) {
318                 error_handler = _Py_GetErrorHandler(errors);
319             }
320 
321             startpos = i-1;
322             endpos = startpos+1;
323 
324             while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
325                 endpos++;
326 
327             /* Only overallocate the buffer if it's not the last write */
328             writer->overallocate = (endpos < size);
329 
330             switch (error_handler)
331             {
332             case _Py_ERROR_REPLACE:
333                 memset(p, '?', endpos - startpos);
334                 p += (endpos - startpos);
335                 /* fall through */
336             case _Py_ERROR_IGNORE:
337                 i += (endpos - startpos - 1);
338                 break;
339 
340             case _Py_ERROR_SURROGATEPASS:
341                 for (k=startpos; k<endpos; k++) {
342                     ch = data[k];
343                     *p++ = (char)(0xe0 | (ch >> 12));
344                     *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
345                     *p++ = (char)(0x80 | (ch & 0x3f));
346                 }
347                 i += (endpos - startpos - 1);
348                 break;
349 
350             case _Py_ERROR_BACKSLASHREPLACE:
351                 /* subtract preallocated bytes */
352                 writer->min_size -= max_char_size * (endpos - startpos);
353                 p = backslashreplace(writer, p,
354                                      unicode, startpos, endpos);
355                 if (p == NULL)
356                     goto error;
357                 i += (endpos - startpos - 1);
358                 break;
359 
360             case _Py_ERROR_XMLCHARREFREPLACE:
361                 /* subtract preallocated bytes */
362                 writer->min_size -= max_char_size * (endpos - startpos);
363                 p = xmlcharrefreplace(writer, p,
364                                       unicode, startpos, endpos);
365                 if (p == NULL)
366                     goto error;
367                 i += (endpos - startpos - 1);
368                 break;
369 
370             case _Py_ERROR_SURROGATEESCAPE:
371                 for (k=startpos; k<endpos; k++) {
372                     ch = data[k];
373                     if (!(0xDC80 <= ch && ch <= 0xDCFF))
374                         break;
375                     *p++ = (char)(ch & 0xff);
376                 }
377                 if (k >= endpos) {
378                     i += (endpos - startpos - 1);
379                     break;
380                 }
381                 startpos = k;
382                 assert(startpos < endpos);
383                 /* fall through */
384             default:
385                 rep = unicode_encode_call_errorhandler(
386                       errors, &error_handler_obj, "utf-8", "surrogates not allowed",
387                       unicode, &exc, startpos, endpos, &newpos);
388                 if (!rep)
389                     goto error;
390 
391                 /* subtract preallocated bytes */
392                 writer->min_size -= max_char_size * (newpos - startpos);
393 
394                 if (PyBytes_Check(rep)) {
395                     p = _PyBytesWriter_WriteBytes(writer, p,
396                                                   PyBytes_AS_STRING(rep),
397                                                   PyBytes_GET_SIZE(rep));
398                 }
399                 else {
400                     /* rep is unicode */
401                     if (PyUnicode_READY(rep) < 0)
402                         goto error;
403 
404                     if (!PyUnicode_IS_ASCII(rep)) {
405                         raise_encode_exception(&exc, "utf-8", unicode,
406                                                startpos, endpos,
407                                                "surrogates not allowed");
408                         goto error;
409                     }
410 
411                     p = _PyBytesWriter_WriteBytes(writer, p,
412                                                   PyUnicode_DATA(rep),
413                                                   PyUnicode_GET_LENGTH(rep));
414                 }
415 
416                 if (p == NULL)
417                     goto error;
418                 Py_CLEAR(rep);
419 
420                 i = newpos;
421             }
422 
423             /* If overallocation was disabled, ensure that it was the last
424                write. Otherwise, we missed an optimization */
425             assert(writer->overallocate || i == size);
426         }
427         else
428 #if STRINGLIB_SIZEOF_CHAR > 2
429         if (ch < 0x10000)
430 #endif
431         {
432             *p++ = (char)(0xe0 | (ch >> 12));
433             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
434             *p++ = (char)(0x80 | (ch & 0x3f));
435         }
436 #if STRINGLIB_SIZEOF_CHAR > 2
437         else /* ch >= 0x10000 */
438         {
439             assert(ch <= MAX_UNICODE);
440             /* Encode UCS4 Unicode ordinals */
441             *p++ = (char)(0xf0 | (ch >> 18));
442             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
443             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
444             *p++ = (char)(0x80 | (ch & 0x3f));
445         }
446 #endif /* STRINGLIB_SIZEOF_CHAR > 2 */
447 #endif /* STRINGLIB_SIZEOF_CHAR > 1 */
448     }
449 
450 #if STRINGLIB_SIZEOF_CHAR > 1
451     Py_XDECREF(error_handler_obj);
452     Py_XDECREF(exc);
453 #endif
454     return p;
455 
456 #if STRINGLIB_SIZEOF_CHAR > 1
457  error:
458     Py_XDECREF(rep);
459     Py_XDECREF(error_handler_obj);
460     Py_XDECREF(exc);
461     return NULL;
462 #endif
463 }
464 
465 /* The pattern for constructing UCS2-repeated masks. */
466 #if SIZEOF_LONG == 8
467 # define UCS2_REPEAT_MASK 0x0001000100010001ul
468 #elif SIZEOF_LONG == 4
469 # define UCS2_REPEAT_MASK 0x00010001ul
470 #else
471 # error C 'long' size should be either 4 or 8!
472 #endif
473 
474 /* The mask for fast checking. */
475 #if STRINGLIB_SIZEOF_CHAR == 1
476 /* The mask for fast checking of whether a C 'long' contains a
477    non-ASCII or non-Latin1 UTF16-encoded characters. */
478 # define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
479 #else
480 /* The mask for fast checking of whether a C 'long' may contain
481    UTF16-encoded surrogate characters. This is an efficient heuristic,
482    assuming that non-surrogate characters with a code point >= 0x8000 are
483    rare in most input.
484 */
485 # define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
486 #endif
487 /* The mask for fast byte-swapping. */
488 #define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)
489 /* Swap bytes. */
490 #define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \
491                                  (((value) & STRIPPED_MASK) << 8))
492 
493 Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf16_decode)494 STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
495                         STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
496                         int native_ordering)
497 {
498     Py_UCS4 ch;
499     const unsigned char *aligned_end =
500             (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
501     const unsigned char *q = *inptr;
502     STRINGLIB_CHAR *p = dest + *outpos;
503     /* Offsets from q for retrieving byte pairs in the right order. */
504 #if PY_LITTLE_ENDIAN
505     int ihi = !!native_ordering, ilo = !native_ordering;
506 #else
507     int ihi = !native_ordering, ilo = !!native_ordering;
508 #endif
509     --e;
510 
511     while (q < e) {
512         Py_UCS4 ch2;
513         /* First check for possible aligned read of a C 'long'. Unaligned
514            reads are more expensive, better to defer to another iteration. */
515         if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
516             /* Fast path for runs of in-range non-surrogate chars. */
517             const unsigned char *_q = q;
518             while (_q < aligned_end) {
519                 unsigned long block = * (const unsigned long *) _q;
520                 if (native_ordering) {
521                     /* Can use buffer directly */
522                     if (block & FAST_CHAR_MASK)
523                         break;
524                 }
525                 else {
526                     /* Need to byte-swap */
527                     if (block & SWAB(FAST_CHAR_MASK))
528                         break;
529 #if STRINGLIB_SIZEOF_CHAR == 1
530                     block >>= 8;
531 #else
532                     block = SWAB(block);
533 #endif
534                 }
535 #if PY_LITTLE_ENDIAN
536 # if SIZEOF_LONG == 4
537                 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
538                 p[1] = (STRINGLIB_CHAR)(block >> 16);
539 # elif SIZEOF_LONG == 8
540                 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
541                 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
542                 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
543                 p[3] = (STRINGLIB_CHAR)(block >> 48);
544 # endif
545 #else
546 # if SIZEOF_LONG == 4
547                 p[0] = (STRINGLIB_CHAR)(block >> 16);
548                 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
549 # elif SIZEOF_LONG == 8
550                 p[0] = (STRINGLIB_CHAR)(block >> 48);
551                 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
552                 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
553                 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
554 # endif
555 #endif
556                 _q += SIZEOF_LONG;
557                 p += SIZEOF_LONG / 2;
558             }
559             q = _q;
560             if (q >= e)
561                 break;
562         }
563 
564         ch = (q[ihi] << 8) | q[ilo];
565         q += 2;
566         if (!Py_UNICODE_IS_SURROGATE(ch)) {
567 #if STRINGLIB_SIZEOF_CHAR < 2
568             if (ch > STRINGLIB_MAX_CHAR)
569                 /* Out-of-range */
570                 goto Return;
571 #endif
572             *p++ = (STRINGLIB_CHAR)ch;
573             continue;
574         }
575 
576         /* UTF-16 code pair: */
577         if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
578             goto IllegalEncoding;
579         if (q >= e)
580             goto UnexpectedEnd;
581         ch2 = (q[ihi] << 8) | q[ilo];
582         q += 2;
583         if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
584             goto IllegalSurrogate;
585         ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
586 #if STRINGLIB_SIZEOF_CHAR < 4
587         /* Out-of-range */
588         goto Return;
589 #else
590         *p++ = (STRINGLIB_CHAR)ch;
591 #endif
592     }
593     ch = 0;
594 Return:
595     *inptr = q;
596     *outpos = p - dest;
597     return ch;
598 UnexpectedEnd:
599     ch = 1;
600     goto Return;
601 IllegalEncoding:
602     ch = 2;
603     goto Return;
604 IllegalSurrogate:
605     ch = 3;
606     goto Return;
607 }
608 #undef UCS2_REPEAT_MASK
609 #undef FAST_CHAR_MASK
610 #undef STRIPPED_MASK
611 #undef SWAB
612 
613 
#if STRINGLIB_MAX_CHAR >= 0x80
/* Encode `len` characters from `in` as UTF-16 code units written through
   *outptr.  native_ordering is non-zero when the output byte order is the
   host's; otherwise each unit is byte-swapped.

   *outptr is advanced past the last unit written.  Returns `len` when all
   input was encoded.  If a surrogate (U+D800-U+DFFF) is found, encoding
   stops and the index of that character in `in` is returned instead; this
   cannot happen for 1-byte strings.  The function itself performs no bounds
   checking on the output buffer. */
Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
                        Py_ssize_t len,
                        unsigned short **outptr,
                        int native_ordering)
{
    unsigned short *out = *outptr;
    const STRINGLIB_CHAR *end = in + len;
#if STRINGLIB_SIZEOF_CHAR == 1
    /* 1-byte characters: every character is exactly one code unit and no
       surrogate check is needed. */
    if (native_ordering) {
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
            out[0] = in[0];
            out[1] = in[1];
            out[2] = in[2];
            out[3] = in[3];
            in += 4; out += 4;
        }
        while (in < end) {
            *out++ = *in++;
        }
    } else {
# define SWAB2(CH)  ((CH) << 8) /* high byte is zero */
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
            out[0] = SWAB2(in[0]);
            out[1] = SWAB2(in[1]);
            out[2] = SWAB2(in[2]);
            out[3] = SWAB2(in[3]);
            in += 4; out += 4;
        }
        while (in < end) {
            Py_UCS4 ch = *in++;
            *out++ = SWAB2((Py_UCS2)ch);
        }
#undef SWAB2
    }
    *outptr = out;
    return len;
#else
    if (native_ordering) {
#if STRINGLIB_MAX_CHAR < 0x10000
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
            /* check if any character is a surrogate character */
            /* A surrogate x has (x ^ 0xd800) & 0xf800 == 0, so it forces
               the combined AND to zero.  The test can also trip when no
               surrogate is present; the scalar loop below then simply
               processes those characters one at a time. */
            if (((in[0] ^ 0xd800) &
                 (in[1] ^ 0xd800) &
                 (in[2] ^ 0xd800) &
                 (in[3] ^ 0xd800) & 0xf800) == 0)
                break;
            out[0] = in[0];
            out[1] = in[1];
            out[2] = in[2];
            out[3] = in[3];
            in += 4; out += 4;
        }
#endif
        while (in < end) {
            Py_UCS4 ch;
            ch = *in++;
            if (ch < 0xd800)
                *out++ = ch;
            else if (ch < 0xe000)
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
#if STRINGLIB_MAX_CHAR >= 0x10000
            else if (ch >= 0x10000) {
                /* non-BMP: emit a surrogate pair */
                out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
                out[1] = Py_UNICODE_LOW_SURROGATE(ch);
                out += 2;
            }
#endif
            else
                *out++ = ch;
        }
    } else {
#define SWAB2(CH)  (((CH) << 8) | ((CH) >> 8))
#if STRINGLIB_MAX_CHAR < 0x10000
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
            /* check if any character is a surrogate character */
            /* (same conservative test as in the native-order branch) */
            if (((in[0] ^ 0xd800) &
                 (in[1] ^ 0xd800) &
                 (in[2] ^ 0xd800) &
                 (in[3] ^ 0xd800) & 0xf800) == 0)
                break;
            out[0] = SWAB2(in[0]);
            out[1] = SWAB2(in[1]);
            out[2] = SWAB2(in[2]);
            out[3] = SWAB2(in[3]);
            in += 4; out += 4;
        }
#endif
        while (in < end) {
            Py_UCS4 ch = *in++;
            if (ch < 0xd800)
                *out++ = SWAB2((Py_UCS2)ch);
            else if (ch < 0xe000)
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
#if STRINGLIB_MAX_CHAR >= 0x10000
            else if (ch >= 0x10000) {
                Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
                Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
                out[0] = SWAB2(ch1);
                out[1] = SWAB2(ch2);
                out += 2;
            }
#endif
            else
                *out++ = SWAB2((Py_UCS2)ch);
        }
#undef SWAB2
    }
    *outptr = out;
    return len;
  fail:
    *outptr = out;
    /* `in` was already advanced past the offending character, hence the
       +1: the result is that character's index. */
    return len - (end - in + 1);
#endif
}

/* Return `ch` widened to 32 bits with its byte order reversed.  For 1- and
   2-byte characters the upper bytes of the widened value are zero, so only
   the low byte(s) need to be moved. */
static inline uint32_t
STRINGLIB(SWAB4)(STRINGLIB_CHAR ch)
{
    uint32_t word = ch;
#if STRINGLIB_SIZEOF_CHAR == 1
    /* high bytes are zero */
    return (word << 24);
#elif STRINGLIB_SIZEOF_CHAR == 2
    /* high bytes are zero */
    return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8);
#else
    return _Py_bswap32(word);
#endif
}

/* Encode `len` characters from `in` as UTF-32 code units written through
   *outptr.  native_ordering is non-zero when the output byte order is the
   host's; otherwise each unit is byte-swapped with STRINGLIB(SWAB4).

   *outptr is advanced past the last unit written.  Returns `len` when all
   input was encoded.  For character widths above one byte, a surrogate
   (U+D800-U+DFFF) stops encoding and the index of that character in `in`
   is returned instead.  The function itself performs no bounds checking on
   the output buffer. */
Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
                        Py_ssize_t len,
                        uint32_t **outptr,
                        int native_ordering)
{
    uint32_t *out = *outptr;
    const STRINGLIB_CHAR *end = in + len;
    if (native_ordering) {
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
#if STRINGLIB_SIZEOF_CHAR > 1
            /* check if any character is a surrogate character */
            /* A surrogate x has (x ^ 0xd800) & 0xf800 == 0, so it forces
               the combined AND to zero.  The test can also trip when no
               surrogate is present; the scalar loop below then simply
               processes those characters one at a time. */
            if (((in[0] ^ 0xd800) &
                 (in[1] ^ 0xd800) &
                 (in[2] ^ 0xd800) &
                 (in[3] ^ 0xd800) & 0xf800) == 0)
                break;
#endif
            out[0] = in[0];
            out[1] = in[1];
            out[2] = in[2];
            out[3] = in[3];
            in += 4; out += 4;
        }
        while (in < end) {
            Py_UCS4 ch;
            ch = *in++;
#if STRINGLIB_SIZEOF_CHAR > 1
            if (Py_UNICODE_IS_SURROGATE(ch)) {
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
            }
#endif
            *out++ = ch;
        }
    } else {
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
#if STRINGLIB_SIZEOF_CHAR > 1
            /* check if any character is a surrogate character */
            /* (same conservative test as in the native-order branch) */
            if (((in[0] ^ 0xd800) &
                 (in[1] ^ 0xd800) &
                 (in[2] ^ 0xd800) &
                 (in[3] ^ 0xd800) & 0xf800) == 0)
                break;
#endif
            out[0] = STRINGLIB(SWAB4)(in[0]);
            out[1] = STRINGLIB(SWAB4)(in[1]);
            out[2] = STRINGLIB(SWAB4)(in[2]);
            out[3] = STRINGLIB(SWAB4)(in[3]);
            in += 4; out += 4;
        }
        while (in < end) {
            Py_UCS4 ch = *in++;
#if STRINGLIB_SIZEOF_CHAR > 1
            if (Py_UNICODE_IS_SURROGATE(ch)) {
                /* reject surrogate characters (U+D800-U+DFFF) */
                goto fail;
            }
#endif
            *out++ = STRINGLIB(SWAB4)(ch);
        }
    }
    *outptr = out;
    return len;
#if STRINGLIB_SIZEOF_CHAR > 1
  fail:
    *outptr = out;
    /* `in` was already advanced past the offending character, hence the
       +1: the result is that character's index. */
    return len - (end - in + 1);
#endif
}

#endif /* STRINGLIB_MAX_CHAR >= 0x80 */
