• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* stringlib: codec implementations */
2 
3 #if !STRINGLIB_IS_UNICODE
4 # error "codecs.h is specific to Unicode"
5 #endif
6 
7 #include "pycore_bitutils.h"      // _Py_bswap32()
8 
9 /* Mask to quickly check whether a C 'size_t' contains a
10    non-ASCII, UTF8-encoded char. */
11 #if (SIZEOF_SIZE_T == 8)
12 # define ASCII_CHAR_MASK 0x8080808080808080ULL
13 #elif (SIZEOF_SIZE_T == 4)
14 # define ASCII_CHAR_MASK 0x80808080U
15 #else
16 # error C 'size_t' size should be either 4 or 8!
17 #endif
18 
19 /* 10xxxxxx */
20 #define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)
21 
22 Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf8_decode)23 STRINGLIB(utf8_decode)(const char **inptr, const char *end,
24                        STRINGLIB_CHAR *dest,
25                        Py_ssize_t *outpos)
26 {
27     Py_UCS4 ch;
28     const char *s = *inptr;
29     STRINGLIB_CHAR *p = dest + *outpos;
30 
31     while (s < end) {
32         ch = (unsigned char)*s;
33 
34         if (ch < 0x80) {
35             /* Fast path for runs of ASCII characters. Given that common UTF-8
36                input will consist of an overwhelming majority of ASCII
37                characters, we try to optimize for this case by checking
38                as many characters as a C 'size_t' can contain.
39                First, check if we can do an aligned read, as most CPUs have
40                a penalty for unaligned reads.
41             */
42             if (_Py_IS_ALIGNED(s, ALIGNOF_SIZE_T)) {
43                 /* Help register allocation */
44                 const char *_s = s;
45                 STRINGLIB_CHAR *_p = p;
46                 while (_s + SIZEOF_SIZE_T <= end) {
47                     /* Read a whole size_t at a time (either 4 or 8 bytes),
48                        and do a fast unrolled copy if it only contains ASCII
49                        characters. */
50                     size_t value = *(const size_t *) _s;
51                     if (value & ASCII_CHAR_MASK)
52                         break;
53 #if PY_LITTLE_ENDIAN
54                     _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
55                     _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
56                     _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
57                     _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
58 # if SIZEOF_SIZE_T == 8
59                     _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
60                     _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
61                     _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
62                     _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
63 # endif
64 #else
65 # if SIZEOF_SIZE_T == 8
66                     _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
67                     _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
68                     _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
69                     _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
70                     _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
71                     _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
72                     _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
73                     _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
74 # else
75                     _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
76                     _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
77                     _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
78                     _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
79 # endif
80 #endif
81                     _s += SIZEOF_SIZE_T;
82                     _p += SIZEOF_SIZE_T;
83                 }
84                 s = _s;
85                 p = _p;
86                 if (s == end)
87                     break;
88                 ch = (unsigned char)*s;
89             }
90             if (ch < 0x80) {
91                 s++;
92                 *p++ = ch;
93                 continue;
94             }
95         }
96 
97         if (ch < 0xE0) {
98             /* \xC2\x80-\xDF\xBF -- 0080-07FF */
99             Py_UCS4 ch2;
100             if (ch < 0xC2) {
101                 /* invalid sequence
102                 \x80-\xBF -- continuation byte
103                 \xC0-\xC1 -- fake 0000-007F */
104                 goto InvalidStart;
105             }
106             if (end - s < 2) {
107                 /* unexpected end of data: the caller will decide whether
108                    it's an error or not */
109                 break;
110             }
111             ch2 = (unsigned char)s[1];
112             if (!IS_CONTINUATION_BYTE(ch2))
113                 /* invalid continuation byte */
114                 goto InvalidContinuation1;
115             ch = (ch << 6) + ch2 -
116                  ((0xC0 << 6) + 0x80);
117             assert ((ch > 0x007F) && (ch <= 0x07FF));
118             s += 2;
119             if (STRINGLIB_MAX_CHAR <= 0x007F ||
120                 (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
121                 /* Out-of-range */
122                 goto Return;
123             *p++ = ch;
124             continue;
125         }
126 
127         if (ch < 0xF0) {
128             /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
129             Py_UCS4 ch2, ch3;
130             if (end - s < 3) {
131                 /* unexpected end of data: the caller will decide whether
132                    it's an error or not */
133                 if (end - s < 2)
134                     break;
135                 ch2 = (unsigned char)s[1];
136                 if (!IS_CONTINUATION_BYTE(ch2) ||
137                     (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
138                     /* for clarification see comments below */
139                     goto InvalidContinuation1;
140                 break;
141             }
142             ch2 = (unsigned char)s[1];
143             ch3 = (unsigned char)s[2];
144             if (!IS_CONTINUATION_BYTE(ch2)) {
145                 /* invalid continuation byte */
146                 goto InvalidContinuation1;
147             }
148             if (ch == 0xE0) {
149                 if (ch2 < 0xA0)
150                     /* invalid sequence
151                        \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
152                     goto InvalidContinuation1;
153             } else if (ch == 0xED && ch2 >= 0xA0) {
154                 /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
155                    will result in surrogates in range D800-DFFF. Surrogates are
156                    not valid UTF-8 so they are rejected.
157                    See https://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
158                    (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
159                 goto InvalidContinuation1;
160             }
161             if (!IS_CONTINUATION_BYTE(ch3)) {
162                 /* invalid continuation byte */
163                 goto InvalidContinuation2;
164             }
165             ch = (ch << 12) + (ch2 << 6) + ch3 -
166                  ((0xE0 << 12) + (0x80 << 6) + 0x80);
167             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
168             s += 3;
169             if (STRINGLIB_MAX_CHAR <= 0x07FF ||
170                 (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
171                 /* Out-of-range */
172                 goto Return;
173             *p++ = ch;
174             continue;
175         }
176 
177         if (ch < 0xF5) {
178             /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
179             Py_UCS4 ch2, ch3, ch4;
180             if (end - s < 4) {
181                 /* unexpected end of data: the caller will decide whether
182                    it's an error or not */
183                 if (end - s < 2)
184                     break;
185                 ch2 = (unsigned char)s[1];
186                 if (!IS_CONTINUATION_BYTE(ch2) ||
187                     (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
188                     /* for clarification see comments below */
189                     goto InvalidContinuation1;
190                 if (end - s < 3)
191                     break;
192                 ch3 = (unsigned char)s[2];
193                 if (!IS_CONTINUATION_BYTE(ch3))
194                     goto InvalidContinuation2;
195                 break;
196             }
197             ch2 = (unsigned char)s[1];
198             ch3 = (unsigned char)s[2];
199             ch4 = (unsigned char)s[3];
200             if (!IS_CONTINUATION_BYTE(ch2)) {
201                 /* invalid continuation byte */
202                 goto InvalidContinuation1;
203             }
204             if (ch == 0xF0) {
205                 if (ch2 < 0x90)
206                     /* invalid sequence
207                        \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
208                     goto InvalidContinuation1;
209             } else if (ch == 0xF4 && ch2 >= 0x90) {
210                 /* invalid sequence
211                    \xF4\x90\x80\x80- -- 110000- overflow */
212                 goto InvalidContinuation1;
213             }
214             if (!IS_CONTINUATION_BYTE(ch3)) {
215                 /* invalid continuation byte */
216                 goto InvalidContinuation2;
217             }
218             if (!IS_CONTINUATION_BYTE(ch4)) {
219                 /* invalid continuation byte */
220                 goto InvalidContinuation3;
221             }
222             ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
223                  ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
224             assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
225             s += 4;
226             if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
227                 (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
228                 /* Out-of-range */
229                 goto Return;
230             *p++ = ch;
231             continue;
232         }
233         goto InvalidStart;
234     }
235     ch = 0;
236 Return:
237     *inptr = s;
238     *outpos = p - dest;
239     return ch;
240 InvalidStart:
241     ch = 1;
242     goto Return;
243 InvalidContinuation1:
244     ch = 2;
245     goto Return;
246 InvalidContinuation2:
247     ch = 3;
248     goto Return;
249 InvalidContinuation3:
250     ch = 4;
251     goto Return;
252 }
253 
254 #undef ASCII_CHAR_MASK
255 
256 
257 /* UTF-8 encoder specialized for a Unicode kind to avoid the slow
258    PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
259    UCS-1 strings don't need to handle surrogates for example. */
260 Py_LOCAL_INLINE(char *)
STRINGLIB(utf8_encoder)261 STRINGLIB(utf8_encoder)(_PyBytesWriter *writer,
262                         PyObject *unicode,
263                         const STRINGLIB_CHAR *data,
264                         Py_ssize_t size,
265                         _Py_error_handler error_handler,
266                         const char *errors)
267 {
268     Py_ssize_t i;                /* index into data of next input character */
269     char *p;                     /* next free byte in output buffer */
270 #if STRINGLIB_SIZEOF_CHAR > 1
271     PyObject *error_handler_obj = NULL;
272     PyObject *exc = NULL;
273     PyObject *rep = NULL;
274 #endif
275 #if STRINGLIB_SIZEOF_CHAR == 1
276     const Py_ssize_t max_char_size = 2;
277 #elif STRINGLIB_SIZEOF_CHAR == 2
278     const Py_ssize_t max_char_size = 3;
279 #else /*  STRINGLIB_SIZEOF_CHAR == 4 */
280     const Py_ssize_t max_char_size = 4;
281 #endif
282 
283     assert(size >= 0);
284     if (size > PY_SSIZE_T_MAX / max_char_size) {
285         /* integer overflow */
286         PyErr_NoMemory();
287         return NULL;
288     }
289 
290     _PyBytesWriter_Init(writer);
291     p = _PyBytesWriter_Alloc(writer, size * max_char_size);
292     if (p == NULL)
293         return NULL;
294 
295     for (i = 0; i < size;) {
296         Py_UCS4 ch = data[i++];
297 
298         if (ch < 0x80) {
299             /* Encode ASCII */
300             *p++ = (char) ch;
301 
302         }
303         else
304 #if STRINGLIB_SIZEOF_CHAR > 1
305         if (ch < 0x0800)
306 #endif
307         {
308             /* Encode Latin-1 */
309             *p++ = (char)(0xc0 | (ch >> 6));
310             *p++ = (char)(0x80 | (ch & 0x3f));
311         }
312 #if STRINGLIB_SIZEOF_CHAR > 1
313         else if (Py_UNICODE_IS_SURROGATE(ch)) {
314             Py_ssize_t startpos, endpos, newpos;
315             Py_ssize_t k;
316             if (error_handler == _Py_ERROR_UNKNOWN) {
317                 error_handler = _Py_GetErrorHandler(errors);
318             }
319 
320             startpos = i-1;
321             endpos = startpos+1;
322 
323             while ((endpos < size) && Py_UNICODE_IS_SURROGATE(data[endpos]))
324                 endpos++;
325 
326             /* Only overallocate the buffer if it's not the last write */
327             writer->overallocate = (endpos < size);
328 
329             switch (error_handler)
330             {
331             case _Py_ERROR_REPLACE:
332                 memset(p, '?', endpos - startpos);
333                 p += (endpos - startpos);
334                 /* fall through */
335             case _Py_ERROR_IGNORE:
336                 i += (endpos - startpos - 1);
337                 break;
338 
339             case _Py_ERROR_SURROGATEPASS:
340                 for (k=startpos; k<endpos; k++) {
341                     ch = data[k];
342                     *p++ = (char)(0xe0 | (ch >> 12));
343                     *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
344                     *p++ = (char)(0x80 | (ch & 0x3f));
345                 }
346                 i += (endpos - startpos - 1);
347                 break;
348 
349             case _Py_ERROR_BACKSLASHREPLACE:
350                 /* subtract preallocated bytes */
351                 writer->min_size -= max_char_size * (endpos - startpos);
352                 p = backslashreplace(writer, p,
353                                      unicode, startpos, endpos);
354                 if (p == NULL)
355                     goto error;
356                 i += (endpos - startpos - 1);
357                 break;
358 
359             case _Py_ERROR_XMLCHARREFREPLACE:
360                 /* subtract preallocated bytes */
361                 writer->min_size -= max_char_size * (endpos - startpos);
362                 p = xmlcharrefreplace(writer, p,
363                                       unicode, startpos, endpos);
364                 if (p == NULL)
365                     goto error;
366                 i += (endpos - startpos - 1);
367                 break;
368 
369             case _Py_ERROR_SURROGATEESCAPE:
370                 for (k=startpos; k<endpos; k++) {
371                     ch = data[k];
372                     if (!(0xDC80 <= ch && ch <= 0xDCFF))
373                         break;
374                     *p++ = (char)(ch & 0xff);
375                 }
376                 if (k >= endpos) {
377                     i += (endpos - startpos - 1);
378                     break;
379                 }
380                 startpos = k;
381                 assert(startpos < endpos);
382                 /* fall through */
383             default:
384                 rep = unicode_encode_call_errorhandler(
385                       errors, &error_handler_obj, "utf-8", "surrogates not allowed",
386                       unicode, &exc, startpos, endpos, &newpos);
387                 if (!rep)
388                     goto error;
389 
390                 /* subtract preallocated bytes */
391                 writer->min_size -= max_char_size * (newpos - startpos);
392 
393                 if (PyBytes_Check(rep)) {
394                     p = _PyBytesWriter_WriteBytes(writer, p,
395                                                   PyBytes_AS_STRING(rep),
396                                                   PyBytes_GET_SIZE(rep));
397                 }
398                 else {
399                     /* rep is unicode */
400                     if (PyUnicode_READY(rep) < 0)
401                         goto error;
402 
403                     if (!PyUnicode_IS_ASCII(rep)) {
404                         raise_encode_exception(&exc, "utf-8", unicode,
405                                                startpos, endpos,
406                                                "surrogates not allowed");
407                         goto error;
408                     }
409 
410                     p = _PyBytesWriter_WriteBytes(writer, p,
411                                                   PyUnicode_DATA(rep),
412                                                   PyUnicode_GET_LENGTH(rep));
413                 }
414 
415                 if (p == NULL)
416                     goto error;
417                 Py_CLEAR(rep);
418 
419                 i = newpos;
420             }
421 
422             /* If overallocation was disabled, ensure that it was the last
423                write. Otherwise, we missed an optimization */
424             assert(writer->overallocate || i == size);
425         }
426         else
427 #if STRINGLIB_SIZEOF_CHAR > 2
428         if (ch < 0x10000)
429 #endif
430         {
431             *p++ = (char)(0xe0 | (ch >> 12));
432             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
433             *p++ = (char)(0x80 | (ch & 0x3f));
434         }
435 #if STRINGLIB_SIZEOF_CHAR > 2
436         else /* ch >= 0x10000 */
437         {
438             assert(ch <= MAX_UNICODE);
439             /* Encode UCS4 Unicode ordinals */
440             *p++ = (char)(0xf0 | (ch >> 18));
441             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
442             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
443             *p++ = (char)(0x80 | (ch & 0x3f));
444         }
445 #endif /* STRINGLIB_SIZEOF_CHAR > 2 */
446 #endif /* STRINGLIB_SIZEOF_CHAR > 1 */
447     }
448 
449 #if STRINGLIB_SIZEOF_CHAR > 1
450     Py_XDECREF(error_handler_obj);
451     Py_XDECREF(exc);
452 #endif
453     return p;
454 
455 #if STRINGLIB_SIZEOF_CHAR > 1
456  error:
457     Py_XDECREF(rep);
458     Py_XDECREF(error_handler_obj);
459     Py_XDECREF(exc);
460     return NULL;
461 #endif
462 }
463 
464 /* The pattern for constructing UCS2-repeated masks. */
465 #if SIZEOF_LONG == 8
466 # define UCS2_REPEAT_MASK 0x0001000100010001ul
467 #elif SIZEOF_LONG == 4
468 # define UCS2_REPEAT_MASK 0x00010001ul
469 #else
470 # error C 'long' size should be either 4 or 8!
471 #endif
472 
473 /* The mask for fast checking. */
474 #if STRINGLIB_SIZEOF_CHAR == 1
475 /* The mask for fast checking of whether a C 'long' contains a
476    non-ASCII or non-Latin1 UTF16-encoded characters. */
477 # define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
478 #else
479 /* The mask for fast checking of whether a C 'long' may contain
480    UTF16-encoded surrogate characters. This is an efficient heuristic,
481    assuming that non-surrogate characters with a code point >= 0x8000 are
482    rare in most input.
483 */
484 # define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
485 #endif
486 /* The mask for fast byte-swapping. */
487 #define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)
488 /* Swap bytes. */
489 #define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \
490                                  (((value) & STRIPPED_MASK) << 8))
491 
492 Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf16_decode)493 STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
494                         STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
495                         int native_ordering)
496 {
497     Py_UCS4 ch;
498     const unsigned char *q = *inptr;
499     STRINGLIB_CHAR *p = dest + *outpos;
500     /* Offsets from q for retrieving byte pairs in the right order. */
501 #if PY_LITTLE_ENDIAN
502     int ihi = !!native_ordering, ilo = !native_ordering;
503 #else
504     int ihi = !native_ordering, ilo = !!native_ordering;
505 #endif
506     --e;
507 
508     while (q < e) {
509         Py_UCS4 ch2;
510         /* First check for possible aligned read of a C 'long'. Unaligned
511            reads are more expensive, better to defer to another iteration. */
512         if (_Py_IS_ALIGNED(q, ALIGNOF_LONG)) {
513             /* Fast path for runs of in-range non-surrogate chars. */
514             const unsigned char *_q = q;
515             while (_q + SIZEOF_LONG <= e) {
516                 unsigned long block = * (const unsigned long *) _q;
517                 if (native_ordering) {
518                     /* Can use buffer directly */
519                     if (block & FAST_CHAR_MASK)
520                         break;
521                 }
522                 else {
523                     /* Need to byte-swap */
524                     if (block & SWAB(FAST_CHAR_MASK))
525                         break;
526 #if STRINGLIB_SIZEOF_CHAR == 1
527                     block >>= 8;
528 #else
529                     block = SWAB(block);
530 #endif
531                 }
532 #if PY_LITTLE_ENDIAN
533 # if SIZEOF_LONG == 4
534                 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
535                 p[1] = (STRINGLIB_CHAR)(block >> 16);
536 # elif SIZEOF_LONG == 8
537                 p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
538                 p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
539                 p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
540                 p[3] = (STRINGLIB_CHAR)(block >> 48);
541 # endif
542 #else
543 # if SIZEOF_LONG == 4
544                 p[0] = (STRINGLIB_CHAR)(block >> 16);
545                 p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
546 # elif SIZEOF_LONG == 8
547                 p[0] = (STRINGLIB_CHAR)(block >> 48);
548                 p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
549                 p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
550                 p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
551 # endif
552 #endif
553                 _q += SIZEOF_LONG;
554                 p += SIZEOF_LONG / 2;
555             }
556             q = _q;
557             if (q >= e)
558                 break;
559         }
560 
561         ch = (q[ihi] << 8) | q[ilo];
562         q += 2;
563         if (!Py_UNICODE_IS_SURROGATE(ch)) {
564 #if STRINGLIB_SIZEOF_CHAR < 2
565             if (ch > STRINGLIB_MAX_CHAR)
566                 /* Out-of-range */
567                 goto Return;
568 #endif
569             *p++ = (STRINGLIB_CHAR)ch;
570             continue;
571         }
572 
573         /* UTF-16 code pair: */
574         if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
575             goto IllegalEncoding;
576         if (q >= e)
577             goto UnexpectedEnd;
578         ch2 = (q[ihi] << 8) | q[ilo];
579         q += 2;
580         if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
581             goto IllegalSurrogate;
582         ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
583 #if STRINGLIB_SIZEOF_CHAR < 4
584         /* Out-of-range */
585         goto Return;
586 #else
587         *p++ = (STRINGLIB_CHAR)ch;
588 #endif
589     }
590     ch = 0;
591 Return:
592     *inptr = q;
593     *outpos = p - dest;
594     return ch;
595 UnexpectedEnd:
596     ch = 1;
597     goto Return;
598 IllegalEncoding:
599     ch = 2;
600     goto Return;
601 IllegalSurrogate:
602     ch = 3;
603     goto Return;
604 }
605 #undef UCS2_REPEAT_MASK
606 #undef FAST_CHAR_MASK
607 #undef STRIPPED_MASK
608 #undef SWAB
609 
610 
611 #if STRINGLIB_MAX_CHAR >= 0x80
612 Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf16_encode)613 STRINGLIB(utf16_encode)(const STRINGLIB_CHAR *in,
614                         Py_ssize_t len,
615                         unsigned short **outptr,
616                         int native_ordering)
617 {
618     unsigned short *out = *outptr;
619     const STRINGLIB_CHAR *end = in + len;
620 #if STRINGLIB_SIZEOF_CHAR == 1
621     if (native_ordering) {
622         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
623         while (in < unrolled_end) {
624             out[0] = in[0];
625             out[1] = in[1];
626             out[2] = in[2];
627             out[3] = in[3];
628             in += 4; out += 4;
629         }
630         while (in < end) {
631             *out++ = *in++;
632         }
633     } else {
634 # define SWAB2(CH)  ((CH) << 8) /* high byte is zero */
635         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
636         while (in < unrolled_end) {
637             out[0] = SWAB2(in[0]);
638             out[1] = SWAB2(in[1]);
639             out[2] = SWAB2(in[2]);
640             out[3] = SWAB2(in[3]);
641             in += 4; out += 4;
642         }
643         while (in < end) {
644             Py_UCS4 ch = *in++;
645             *out++ = SWAB2((Py_UCS2)ch);
646         }
647 #undef SWAB2
648     }
649     *outptr = out;
650     return len;
651 #else
652     if (native_ordering) {
653 #if STRINGLIB_MAX_CHAR < 0x10000
654         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
655         while (in < unrolled_end) {
656             /* check if any character is a surrogate character */
657             if (((in[0] ^ 0xd800) &
658                  (in[1] ^ 0xd800) &
659                  (in[2] ^ 0xd800) &
660                  (in[3] ^ 0xd800) & 0xf800) == 0)
661                 break;
662             out[0] = in[0];
663             out[1] = in[1];
664             out[2] = in[2];
665             out[3] = in[3];
666             in += 4; out += 4;
667         }
668 #endif
669         while (in < end) {
670             Py_UCS4 ch;
671             ch = *in++;
672             if (ch < 0xd800)
673                 *out++ = ch;
674             else if (ch < 0xe000)
675                 /* reject surrogate characters (U+D800-U+DFFF) */
676                 goto fail;
677 #if STRINGLIB_MAX_CHAR >= 0x10000
678             else if (ch >= 0x10000) {
679                 out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
680                 out[1] = Py_UNICODE_LOW_SURROGATE(ch);
681                 out += 2;
682             }
683 #endif
684             else
685                 *out++ = ch;
686         }
687     } else {
688 #define SWAB2(CH)  (((CH) << 8) | ((CH) >> 8))
689 #if STRINGLIB_MAX_CHAR < 0x10000
690         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
691         while (in < unrolled_end) {
692             /* check if any character is a surrogate character */
693             if (((in[0] ^ 0xd800) &
694                  (in[1] ^ 0xd800) &
695                  (in[2] ^ 0xd800) &
696                  (in[3] ^ 0xd800) & 0xf800) == 0)
697                 break;
698             out[0] = SWAB2(in[0]);
699             out[1] = SWAB2(in[1]);
700             out[2] = SWAB2(in[2]);
701             out[3] = SWAB2(in[3]);
702             in += 4; out += 4;
703         }
704 #endif
705         while (in < end) {
706             Py_UCS4 ch = *in++;
707             if (ch < 0xd800)
708                 *out++ = SWAB2((Py_UCS2)ch);
709             else if (ch < 0xe000)
710                 /* reject surrogate characters (U+D800-U+DFFF) */
711                 goto fail;
712 #if STRINGLIB_MAX_CHAR >= 0x10000
713             else if (ch >= 0x10000) {
714                 Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
715                 Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
716                 out[0] = SWAB2(ch1);
717                 out[1] = SWAB2(ch2);
718                 out += 2;
719             }
720 #endif
721             else
722                 *out++ = SWAB2((Py_UCS2)ch);
723         }
724 #undef SWAB2
725     }
726     *outptr = out;
727     return len;
728   fail:
729     *outptr = out;
730     return len - (end - in + 1);
731 #endif
732 }
733 
734 static inline uint32_t
STRINGLIB(SWAB4)735 STRINGLIB(SWAB4)(STRINGLIB_CHAR ch)
736 {
737     uint32_t word = ch;
738 #if STRINGLIB_SIZEOF_CHAR == 1
739     /* high bytes are zero */
740     return (word << 24);
741 #elif STRINGLIB_SIZEOF_CHAR == 2
742     /* high bytes are zero */
743     return ((word & 0x00FFu) << 24) | ((word & 0xFF00u) << 8);
744 #else
745     return _Py_bswap32(word);
746 #endif
747 }
748 
749 Py_LOCAL_INLINE(Py_ssize_t)
STRINGLIB(utf32_encode)750 STRINGLIB(utf32_encode)(const STRINGLIB_CHAR *in,
751                         Py_ssize_t len,
752                         uint32_t **outptr,
753                         int native_ordering)
754 {
755     uint32_t *out = *outptr;
756     const STRINGLIB_CHAR *end = in + len;
757     if (native_ordering) {
758         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
759         while (in < unrolled_end) {
760 #if STRINGLIB_SIZEOF_CHAR > 1
761             /* check if any character is a surrogate character */
762             if (((in[0] ^ 0xd800) &
763                  (in[1] ^ 0xd800) &
764                  (in[2] ^ 0xd800) &
765                  (in[3] ^ 0xd800) & 0xf800) == 0)
766                 break;
767 #endif
768             out[0] = in[0];
769             out[1] = in[1];
770             out[2] = in[2];
771             out[3] = in[3];
772             in += 4; out += 4;
773         }
774         while (in < end) {
775             Py_UCS4 ch;
776             ch = *in++;
777 #if STRINGLIB_SIZEOF_CHAR > 1
778             if (Py_UNICODE_IS_SURROGATE(ch)) {
779                 /* reject surrogate characters (U+D800-U+DFFF) */
780                 goto fail;
781             }
782 #endif
783             *out++ = ch;
784         }
785     } else {
786         const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
787         while (in < unrolled_end) {
788 #if STRINGLIB_SIZEOF_CHAR > 1
789             /* check if any character is a surrogate character */
790             if (((in[0] ^ 0xd800) &
791                  (in[1] ^ 0xd800) &
792                  (in[2] ^ 0xd800) &
793                  (in[3] ^ 0xd800) & 0xf800) == 0)
794                 break;
795 #endif
796             out[0] = STRINGLIB(SWAB4)(in[0]);
797             out[1] = STRINGLIB(SWAB4)(in[1]);
798             out[2] = STRINGLIB(SWAB4)(in[2]);
799             out[3] = STRINGLIB(SWAB4)(in[3]);
800             in += 4; out += 4;
801         }
802         while (in < end) {
803             Py_UCS4 ch = *in++;
804 #if STRINGLIB_SIZEOF_CHAR > 1
805             if (Py_UNICODE_IS_SURROGATE(ch)) {
806                 /* reject surrogate characters (U+D800-U+DFFF) */
807                 goto fail;
808             }
809 #endif
810             *out++ = STRINGLIB(SWAB4)(ch);
811         }
812     }
813     *outptr = out;
814     return len;
815 #if STRINGLIB_SIZEOF_CHAR > 1
816   fail:
817     *outptr = out;
818     return len - (end - in + 1);
819 #endif
820 }
821 
822 #endif
823