• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2     unicode_format.h -- implementation of str.format().
3 */
4 
5 /************************************************************************/
6 /***********   Global data structures and forward declarations  *********/
7 /************************************************************************/
8 
9 /*
10    A SubString consists of the characters between two string or
11    unicode pointers.
12 */
13 typedef struct {
14     PyObject *str; /* borrowed reference */
15     Py_ssize_t start, end;
16 } SubString;
17 
18 
/* Keep track if we're auto-numbering fields */
typedef enum {
    ANS_INIT = 0,    /* no numerically indexed field has been seen yet */
    ANS_AUTO = 1,    /* fields are numbered automatically ("{}") */
    ANS_MANUAL = 2   /* fields carry explicit numbers ("{0}") */
} AutoNumberState;

/* Auto-numbering bookkeeping: the numbering mode currently in effect and
   the index that the next automatically numbered field will receive. */
typedef struct {
    AutoNumberState an_state;
    int an_field_number;
} AutoNumber;
30 
31 
32 /* forward declaration for recursion */
33 static PyObject *
34 build_string(SubString *input, PyObject *args, PyObject *kwargs,
35              int recursion_depth, AutoNumber *auto_number);
36 
37 
38 
39 /************************************************************************/
40 /**************************  Utility  functions  ************************/
41 /************************************************************************/
42 
43 static void
AutoNumber_Init(AutoNumber * auto_number)44 AutoNumber_Init(AutoNumber *auto_number)
45 {
46     auto_number->an_state = ANS_INIT;
47     auto_number->an_field_number = 0;
48 }
49 
50 /* fill in a SubString from a pointer and length */
51 Py_LOCAL_INLINE(void)
SubString_init(SubString * str,PyObject * s,Py_ssize_t start,Py_ssize_t end)52 SubString_init(SubString *str, PyObject *s, Py_ssize_t start, Py_ssize_t end)
53 {
54     str->str = s;
55     str->start = start;
56     str->end = end;
57 }
58 
59 /* return a new string.  if str->str is NULL, return None */
60 Py_LOCAL_INLINE(PyObject *)
SubString_new_object(SubString * str)61 SubString_new_object(SubString *str)
62 {
63     if (str->str == NULL) {
64         Py_INCREF(Py_None);
65         return Py_None;
66     }
67     return PyUnicode_Substring(str->str, str->start, str->end);
68 }
69 
70 /* return a new string.  if str->str is NULL, return a new empty string */
71 Py_LOCAL_INLINE(PyObject *)
SubString_new_object_or_empty(SubString * str)72 SubString_new_object_or_empty(SubString *str)
73 {
74     if (str->str == NULL) {
75         return PyUnicode_New(0, 0);
76     }
77     return SubString_new_object(str);
78 }
79 
80 /* Return 1 if an error has been detected switching between automatic
81    field numbering and manual field specification, else return 0. Set
82    ValueError on error. */
83 static int
autonumber_state_error(AutoNumberState state,int field_name_is_empty)84 autonumber_state_error(AutoNumberState state, int field_name_is_empty)
85 {
86     if (state == ANS_MANUAL) {
87         if (field_name_is_empty) {
88             PyErr_SetString(PyExc_ValueError, "cannot switch from "
89                             "manual field specification to "
90                             "automatic field numbering");
91             return 1;
92         }
93     }
94     else {
95         if (!field_name_is_empty) {
96             PyErr_SetString(PyExc_ValueError, "cannot switch from "
97                             "automatic field numbering to "
98                             "manual field specification");
99             return 1;
100         }
101     }
102     return 0;
103 }
104 
105 
106 /************************************************************************/
107 /***********  Format string parsing -- integers and identifiers *********/
108 /************************************************************************/
109 
110 static Py_ssize_t
get_integer(const SubString * str)111 get_integer(const SubString *str)
112 {
113     Py_ssize_t accumulator = 0;
114     Py_ssize_t digitval;
115     Py_ssize_t i;
116 
117     /* empty string is an error */
118     if (str->start >= str->end)
119         return -1;
120 
121     for (i = str->start; i < str->end; i++) {
122         digitval = Py_UNICODE_TODECIMAL(PyUnicode_READ_CHAR(str->str, i));
123         if (digitval < 0)
124             return -1;
125         /*
126            Detect possible overflow before it happens:
127 
128               accumulator * 10 + digitval > PY_SSIZE_T_MAX if and only if
129               accumulator > (PY_SSIZE_T_MAX - digitval) / 10.
130         */
131         if (accumulator > (PY_SSIZE_T_MAX - digitval) / 10) {
132             PyErr_Format(PyExc_ValueError,
133                          "Too many decimal digits in format string");
134             return -1;
135         }
136         accumulator = accumulator * 10 + digitval;
137     }
138     return accumulator;
139 }
140 
141 /************************************************************************/
142 /******** Functions to get field objects and specification strings ******/
143 /************************************************************************/
144 
145 /* do the equivalent of obj.name */
146 static PyObject *
getattr(PyObject * obj,SubString * name)147 getattr(PyObject *obj, SubString *name)
148 {
149     PyObject *newobj;
150     PyObject *str = SubString_new_object(name);
151     if (str == NULL)
152         return NULL;
153     newobj = PyObject_GetAttr(obj, str);
154     Py_DECREF(str);
155     return newobj;
156 }
157 
158 /* do the equivalent of obj[idx], where obj is a sequence */
159 static PyObject *
getitem_sequence(PyObject * obj,Py_ssize_t idx)160 getitem_sequence(PyObject *obj, Py_ssize_t idx)
161 {
162     return PySequence_GetItem(obj, idx);
163 }
164 
165 /* do the equivalent of obj[idx], where obj is not a sequence */
166 static PyObject *
getitem_idx(PyObject * obj,Py_ssize_t idx)167 getitem_idx(PyObject *obj, Py_ssize_t idx)
168 {
169     PyObject *newobj;
170     PyObject *idx_obj = PyLong_FromSsize_t(idx);
171     if (idx_obj == NULL)
172         return NULL;
173     newobj = PyObject_GetItem(obj, idx_obj);
174     Py_DECREF(idx_obj);
175     return newobj;
176 }
177 
178 /* do the equivalent of obj[name] */
179 static PyObject *
getitem_str(PyObject * obj,SubString * name)180 getitem_str(PyObject *obj, SubString *name)
181 {
182     PyObject *newobj;
183     PyObject *str = SubString_new_object(name);
184     if (str == NULL)
185         return NULL;
186     newobj = PyObject_GetItem(obj, str);
187     Py_DECREF(str);
188     return newobj;
189 }
190 
191 typedef struct {
192     /* the entire string we're parsing.  we assume that someone else
193        is managing its lifetime, and that it will exist for the
194        lifetime of the iterator.  can be empty */
195     SubString str;
196 
197     /* index to where we are inside field_name */
198     Py_ssize_t index;
199 } FieldNameIterator;
200 
201 
202 static int
FieldNameIterator_init(FieldNameIterator * self,PyObject * s,Py_ssize_t start,Py_ssize_t end)203 FieldNameIterator_init(FieldNameIterator *self, PyObject *s,
204                        Py_ssize_t start, Py_ssize_t end)
205 {
206     SubString_init(&self->str, s, start, end);
207     self->index = start;
208     return 1;
209 }
210 
211 static int
_FieldNameIterator_attr(FieldNameIterator * self,SubString * name)212 _FieldNameIterator_attr(FieldNameIterator *self, SubString *name)
213 {
214     Py_UCS4 c;
215 
216     name->str = self->str.str;
217     name->start = self->index;
218 
219     /* return everything until '.' or '[' */
220     while (self->index < self->str.end) {
221         c = PyUnicode_READ_CHAR(self->str.str, self->index++);
222         switch (c) {
223         case '[':
224         case '.':
225             /* backup so that we this character will be seen next time */
226             self->index--;
227             break;
228         default:
229             continue;
230         }
231         break;
232     }
233     /* end of string is okay */
234     name->end = self->index;
235     return 1;
236 }
237 
238 static int
_FieldNameIterator_item(FieldNameIterator * self,SubString * name)239 _FieldNameIterator_item(FieldNameIterator *self, SubString *name)
240 {
241     int bracket_seen = 0;
242     Py_UCS4 c;
243 
244     name->str = self->str.str;
245     name->start = self->index;
246 
247     /* return everything until ']' */
248     while (self->index < self->str.end) {
249         c = PyUnicode_READ_CHAR(self->str.str, self->index++);
250         switch (c) {
251         case ']':
252             bracket_seen = 1;
253             break;
254         default:
255             continue;
256         }
257         break;
258     }
259     /* make sure we ended with a ']' */
260     if (!bracket_seen) {
261         PyErr_SetString(PyExc_ValueError, "Missing ']' in format string");
262         return 0;
263     }
264 
265     /* end of string is okay */
266     /* don't include the ']' */
267     name->end = self->index-1;
268     return 1;
269 }
270 
271 /* returns 0 on error, 1 on non-error termination, and 2 if it returns a value */
272 static int
FieldNameIterator_next(FieldNameIterator * self,int * is_attribute,Py_ssize_t * name_idx,SubString * name)273 FieldNameIterator_next(FieldNameIterator *self, int *is_attribute,
274                        Py_ssize_t *name_idx, SubString *name)
275 {
276     /* check at end of input */
277     if (self->index >= self->str.end)
278         return 1;
279 
280     switch (PyUnicode_READ_CHAR(self->str.str, self->index++)) {
281     case '.':
282         *is_attribute = 1;
283         if (_FieldNameIterator_attr(self, name) == 0)
284             return 0;
285         *name_idx = -1;
286         break;
287     case '[':
288         *is_attribute = 0;
289         if (_FieldNameIterator_item(self, name) == 0)
290             return 0;
291         *name_idx = get_integer(name);
292         if (*name_idx == -1 && PyErr_Occurred())
293             return 0;
294         break;
295     default:
296         /* Invalid character follows ']' */
297         PyErr_SetString(PyExc_ValueError, "Only '.' or '[' may "
298                         "follow ']' in format field specifier");
299         return 0;
300     }
301 
302     /* empty string is an error */
303     if (name->start == name->end) {
304         PyErr_SetString(PyExc_ValueError, "Empty attribute in format string");
305         return 0;
306     }
307 
308     return 2;
309 }
310 
311 
312 /* input: field_name
313    output: 'first' points to the part before the first '[' or '.'
314            'first_idx' is -1 if 'first' is not an integer, otherwise
315                        it's the value of first converted to an integer
316            'rest' is an iterator to return the rest
317 */
318 static int
field_name_split(PyObject * str,Py_ssize_t start,Py_ssize_t end,SubString * first,Py_ssize_t * first_idx,FieldNameIterator * rest,AutoNumber * auto_number)319 field_name_split(PyObject *str, Py_ssize_t start, Py_ssize_t end, SubString *first,
320                  Py_ssize_t *first_idx, FieldNameIterator *rest,
321                  AutoNumber *auto_number)
322 {
323     Py_UCS4 c;
324     Py_ssize_t i = start;
325     int field_name_is_empty;
326     int using_numeric_index;
327 
328     /* find the part up until the first '.' or '[' */
329     while (i < end) {
330         switch (c = PyUnicode_READ_CHAR(str, i++)) {
331         case '[':
332         case '.':
333             /* backup so that we this character is available to the
334                "rest" iterator */
335             i--;
336             break;
337         default:
338             continue;
339         }
340         break;
341     }
342 
343     /* set up the return values */
344     SubString_init(first, str, start, i);
345     FieldNameIterator_init(rest, str, i, end);
346 
347     /* see if "first" is an integer, in which case it's used as an index */
348     *first_idx = get_integer(first);
349     if (*first_idx == -1 && PyErr_Occurred())
350         return 0;
351 
352     field_name_is_empty = first->start >= first->end;
353 
354     /* If the field name is omitted or if we have a numeric index
355        specified, then we're doing numeric indexing into args. */
356     using_numeric_index = field_name_is_empty || *first_idx != -1;
357 
358     /* We always get here exactly one time for each field we're
359        processing. And we get here in field order (counting by left
360        braces). So this is the perfect place to handle automatic field
361        numbering if the field name is omitted. */
362 
363     /* Check if we need to do the auto-numbering. It's not needed if
364        we're called from string.Format routines, because it's handled
365        in that class by itself. */
366     if (auto_number) {
367         /* Initialize our auto numbering state if this is the first
368            time we're either auto-numbering or manually numbering. */
369         if (auto_number->an_state == ANS_INIT && using_numeric_index)
370             auto_number->an_state = field_name_is_empty ?
371                 ANS_AUTO : ANS_MANUAL;
372 
373         /* Make sure our state is consistent with what we're doing
374            this time through. Only check if we're using a numeric
375            index. */
376         if (using_numeric_index)
377             if (autonumber_state_error(auto_number->an_state,
378                                        field_name_is_empty))
379                 return 0;
380         /* Zero length field means we want to do auto-numbering of the
381            fields. */
382         if (field_name_is_empty)
383             *first_idx = (auto_number->an_field_number)++;
384     }
385 
386     return 1;
387 }
388 
389 
390 /*
391     get_field_object returns the object inside {}, before the
392     format_spec.  It handles getindex and getattr lookups and consumes
393     the entire input string.
394 */
395 static PyObject *
get_field_object(SubString * input,PyObject * args,PyObject * kwargs,AutoNumber * auto_number)396 get_field_object(SubString *input, PyObject *args, PyObject *kwargs,
397                  AutoNumber *auto_number)
398 {
399     PyObject *obj = NULL;
400     int ok;
401     int is_attribute;
402     SubString name;
403     SubString first;
404     Py_ssize_t index;
405     FieldNameIterator rest;
406 
407     if (!field_name_split(input->str, input->start, input->end, &first,
408                           &index, &rest, auto_number)) {
409         goto error;
410     }
411 
412     if (index == -1) {
413         /* look up in kwargs */
414         PyObject *key = SubString_new_object(&first);
415         if (key == NULL)
416             goto error;
417 
418         /* Use PyObject_GetItem instead of PyDict_GetItem because this
419            code is no longer just used with kwargs. It might be passed
420            a non-dict when called through format_map. */
421         if ((kwargs == NULL) || (obj = PyObject_GetItem(kwargs, key)) == NULL) {
422             PyErr_SetObject(PyExc_KeyError, key);
423             Py_DECREF(key);
424             goto error;
425         }
426         Py_DECREF(key);
427     }
428     else {
429         /* If args is NULL, we have a format string with a positional field
430            with only kwargs to retrieve it from. This can only happen when
431            used with format_map(), where positional arguments are not
432            allowed. */
433         if (args == NULL) {
434             PyErr_SetString(PyExc_ValueError, "Format string contains "
435                             "positional fields");
436             goto error;
437         }
438 
439         /* look up in args */
440         obj = PySequence_GetItem(args, index);
441         if (obj == NULL)
442             goto error;
443     }
444 
445     /* iterate over the rest of the field_name */
446     while ((ok = FieldNameIterator_next(&rest, &is_attribute, &index,
447                                         &name)) == 2) {
448         PyObject *tmp;
449 
450         if (is_attribute)
451             /* getattr lookup "." */
452             tmp = getattr(obj, &name);
453         else
454             /* getitem lookup "[]" */
455             if (index == -1)
456                 tmp = getitem_str(obj, &name);
457             else
458                 if (PySequence_Check(obj))
459                     tmp = getitem_sequence(obj, index);
460                 else
461                     /* not a sequence */
462                     tmp = getitem_idx(obj, index);
463         if (tmp == NULL)
464             goto error;
465 
466         /* assign to obj */
467         Py_DECREF(obj);
468         obj = tmp;
469     }
470     /* end of iterator, this is the non-error case */
471     if (ok == 1)
472         return obj;
473 error:
474     Py_XDECREF(obj);
475     return NULL;
476 }
477 
478 /************************************************************************/
479 /*****************  Field rendering functions  **************************/
480 /************************************************************************/
481 
482 /*
483     render_field() is the main function in this section.  It takes the
484     field object and field specification string generated by
485     get_field_and_spec, and renders the field into the output string.
486 
487     render_field calls fieldobj.__format__(format_spec) method, and
488     appends to the output.
489 */
490 static int
render_field(PyObject * fieldobj,SubString * format_spec,_PyUnicodeWriter * writer)491 render_field(PyObject *fieldobj, SubString *format_spec, _PyUnicodeWriter *writer)
492 {
493     int ok = 0;
494     PyObject *result = NULL;
495     PyObject *format_spec_object = NULL;
496     int (*formatter) (_PyUnicodeWriter*, PyObject *, PyObject *, Py_ssize_t, Py_ssize_t) = NULL;
497     int err;
498 
499     /* If we know the type exactly, skip the lookup of __format__ and just
500        call the formatter directly. */
501     if (PyUnicode_CheckExact(fieldobj))
502         formatter = _PyUnicode_FormatAdvancedWriter;
503     else if (PyLong_CheckExact(fieldobj))
504         formatter = _PyLong_FormatAdvancedWriter;
505     else if (PyFloat_CheckExact(fieldobj))
506         formatter = _PyFloat_FormatAdvancedWriter;
507     else if (PyComplex_CheckExact(fieldobj))
508         formatter = _PyComplex_FormatAdvancedWriter;
509 
510     if (formatter) {
511         /* we know exactly which formatter will be called when __format__ is
512            looked up, so call it directly, instead. */
513         err = formatter(writer, fieldobj, format_spec->str,
514                         format_spec->start, format_spec->end);
515         return (err == 0);
516     }
517     else {
518         /* We need to create an object out of the pointers we have, because
519            __format__ takes a string/unicode object for format_spec. */
520         if (format_spec->str)
521             format_spec_object = PyUnicode_Substring(format_spec->str,
522                                                      format_spec->start,
523                                                      format_spec->end);
524         else
525             format_spec_object = PyUnicode_New(0, 0);
526         if (format_spec_object == NULL)
527             goto done;
528 
529         result = PyObject_Format(fieldobj, format_spec_object);
530     }
531     if (result == NULL)
532         goto done;
533 
534     if (_PyUnicodeWriter_WriteStr(writer, result) == -1)
535         goto done;
536     ok = 1;
537 
538 done:
539     Py_XDECREF(format_spec_object);
540     Py_XDECREF(result);
541     return ok;
542 }
543 
544 static int
parse_field(SubString * str,SubString * field_name,SubString * format_spec,int * format_spec_needs_expanding,Py_UCS4 * conversion)545 parse_field(SubString *str, SubString *field_name, SubString *format_spec,
546             int *format_spec_needs_expanding, Py_UCS4 *conversion)
547 {
548     /* Note this function works if the field name is zero length,
549        which is good.  Zero length field names are handled later, in
550        field_name_split. */
551 
552     Py_UCS4 c = 0;
553 
554     /* initialize these, as they may be empty */
555     *conversion = '\0';
556     SubString_init(format_spec, NULL, 0, 0);
557 
558     /* Search for the field name.  it's terminated by the end of
559        the string, or a ':' or '!' */
560     field_name->str = str->str;
561     field_name->start = str->start;
562     while (str->start < str->end) {
563         switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
564         case '{':
565             PyErr_SetString(PyExc_ValueError, "unexpected '{' in field name");
566             return 0;
567         case '[':
568             for (; str->start < str->end; str->start++)
569                 if (PyUnicode_READ_CHAR(str->str, str->start) == ']')
570                     break;
571             continue;
572         case '}':
573         case ':':
574         case '!':
575             break;
576         default:
577             continue;
578         }
579         break;
580     }
581 
582     field_name->end = str->start - 1;
583     if (c == '!' || c == ':') {
584         Py_ssize_t count;
585         /* we have a format specifier and/or a conversion */
586         /* don't include the last character */
587 
588         /* see if there's a conversion specifier */
589         if (c == '!') {
590             /* there must be another character present */
591             if (str->start >= str->end) {
592                 PyErr_SetString(PyExc_ValueError,
593                                 "end of string while looking for conversion "
594                                 "specifier");
595                 return 0;
596             }
597             *conversion = PyUnicode_READ_CHAR(str->str, str->start++);
598 
599             if (str->start < str->end) {
600                 c = PyUnicode_READ_CHAR(str->str, str->start++);
601                 if (c == '}')
602                     return 1;
603                 if (c != ':') {
604                     PyErr_SetString(PyExc_ValueError,
605                                     "expected ':' after conversion specifier");
606                     return 0;
607                 }
608             }
609         }
610         format_spec->str = str->str;
611         format_spec->start = str->start;
612         count = 1;
613         while (str->start < str->end) {
614             switch ((c = PyUnicode_READ_CHAR(str->str, str->start++))) {
615             case '{':
616                 *format_spec_needs_expanding = 1;
617                 count++;
618                 break;
619             case '}':
620                 count--;
621                 if (count == 0) {
622                     format_spec->end = str->start - 1;
623                     return 1;
624                 }
625                 break;
626             default:
627                 break;
628             }
629         }
630 
631         PyErr_SetString(PyExc_ValueError, "unmatched '{' in format spec");
632         return 0;
633     }
634     else if (c != '}') {
635         PyErr_SetString(PyExc_ValueError, "expected '}' before end of string");
636         return 0;
637     }
638 
639     return 1;
640 }
641 
642 /************************************************************************/
643 /******* Output string allocation and escape-to-markup processing  ******/
644 /************************************************************************/
645 
646 /* MarkupIterator breaks the string into pieces of either literal
647    text, or things inside {} that need to be marked up.  it is
648    designed to make it easy to wrap a Python iterator around it, for
649    use with the Formatter class */
650 
651 typedef struct {
652     SubString str;
653 } MarkupIterator;
654 
655 static int
MarkupIterator_init(MarkupIterator * self,PyObject * str,Py_ssize_t start,Py_ssize_t end)656 MarkupIterator_init(MarkupIterator *self, PyObject *str,
657                     Py_ssize_t start, Py_ssize_t end)
658 {
659     SubString_init(&self->str, str, start, end);
660     return 1;
661 }
662 
663 /* returns 0 on error, 1 on non-error termination, and 2 if it got a
664    string (or something to be expanded) */
665 static int
MarkupIterator_next(MarkupIterator * self,SubString * literal,int * field_present,SubString * field_name,SubString * format_spec,Py_UCS4 * conversion,int * format_spec_needs_expanding)666 MarkupIterator_next(MarkupIterator *self, SubString *literal,
667                     int *field_present, SubString *field_name,
668                     SubString *format_spec, Py_UCS4 *conversion,
669                     int *format_spec_needs_expanding)
670 {
671     int at_end;
672     Py_UCS4 c = 0;
673     Py_ssize_t start;
674     Py_ssize_t len;
675     int markup_follows = 0;
676 
677     /* initialize all of the output variables */
678     SubString_init(literal, NULL, 0, 0);
679     SubString_init(field_name, NULL, 0, 0);
680     SubString_init(format_spec, NULL, 0, 0);
681     *conversion = '\0';
682     *format_spec_needs_expanding = 0;
683     *field_present = 0;
684 
685     /* No more input, end of iterator.  This is the normal exit
686        path. */
687     if (self->str.start >= self->str.end)
688         return 1;
689 
690     start = self->str.start;
691 
692     /* First read any literal text. Read until the end of string, an
693        escaped '{' or '}', or an unescaped '{'.  In order to never
694        allocate memory and so I can just pass pointers around, if
695        there's an escaped '{' or '}' then we'll return the literal
696        including the brace, but no format object.  The next time
697        through, we'll return the rest of the literal, skipping past
698        the second consecutive brace. */
699     while (self->str.start < self->str.end) {
700         switch (c = PyUnicode_READ_CHAR(self->str.str, self->str.start++)) {
701         case '{':
702         case '}':
703             markup_follows = 1;
704             break;
705         default:
706             continue;
707         }
708         break;
709     }
710 
711     at_end = self->str.start >= self->str.end;
712     len = self->str.start - start;
713 
714     if ((c == '}') && (at_end ||
715                        (c != PyUnicode_READ_CHAR(self->str.str,
716                                                  self->str.start)))) {
717         PyErr_SetString(PyExc_ValueError, "Single '}' encountered "
718                         "in format string");
719         return 0;
720     }
721     if (at_end && c == '{') {
722         PyErr_SetString(PyExc_ValueError, "Single '{' encountered "
723                         "in format string");
724         return 0;
725     }
726     if (!at_end) {
727         if (c == PyUnicode_READ_CHAR(self->str.str, self->str.start)) {
728             /* escaped } or {, skip it in the input.  there is no
729                markup object following us, just this literal text */
730             self->str.start++;
731             markup_follows = 0;
732         }
733         else
734             len--;
735     }
736 
737     /* record the literal text */
738     literal->str = self->str.str;
739     literal->start = start;
740     literal->end = start + len;
741 
742     if (!markup_follows)
743         return 2;
744 
745     /* this is markup; parse the field */
746     *field_present = 1;
747     if (!parse_field(&self->str, field_name, format_spec,
748                      format_spec_needs_expanding, conversion))
749         return 0;
750     return 2;
751 }
752 
753 
754 /* do the !r or !s conversion on obj */
755 static PyObject *
do_conversion(PyObject * obj,Py_UCS4 conversion)756 do_conversion(PyObject *obj, Py_UCS4 conversion)
757 {
758     /* XXX in pre-3.0, do we need to convert this to unicode, since it
759        might have returned a string? */
760     switch (conversion) {
761     case 'r':
762         return PyObject_Repr(obj);
763     case 's':
764         return PyObject_Str(obj);
765     case 'a':
766         return PyObject_ASCII(obj);
767     default:
768         if (conversion > 32 && conversion < 127) {
769                 /* It's the ASCII subrange; casting to char is safe
770                    (assuming the execution character set is an ASCII
771                    superset). */
772                 PyErr_Format(PyExc_ValueError,
773                      "Unknown conversion specifier %c",
774                      (char)conversion);
775         } else
776                 PyErr_Format(PyExc_ValueError,
777                      "Unknown conversion specifier \\x%x",
778                      (unsigned int)conversion);
779         return NULL;
780     }
781 }
782 
783 /* given:
784 
785    {field_name!conversion:format_spec}
786 
787    compute the result and write it to output.
788    format_spec_needs_expanding is an optimization.  if it's false,
789    just output the string directly, otherwise recursively expand the
790    format_spec string.
791 
792    field_name is allowed to be zero length, in which case we
793    are doing auto field numbering.
794 */
795 
796 static int
output_markup(SubString * field_name,SubString * format_spec,int format_spec_needs_expanding,Py_UCS4 conversion,_PyUnicodeWriter * writer,PyObject * args,PyObject * kwargs,int recursion_depth,AutoNumber * auto_number)797 output_markup(SubString *field_name, SubString *format_spec,
798               int format_spec_needs_expanding, Py_UCS4 conversion,
799               _PyUnicodeWriter *writer, PyObject *args, PyObject *kwargs,
800               int recursion_depth, AutoNumber *auto_number)
801 {
802     PyObject *tmp = NULL;
803     PyObject *fieldobj = NULL;
804     SubString expanded_format_spec;
805     SubString *actual_format_spec;
806     int result = 0;
807 
808     /* convert field_name to an object */
809     fieldobj = get_field_object(field_name, args, kwargs, auto_number);
810     if (fieldobj == NULL)
811         goto done;
812 
813     if (conversion != '\0') {
814         tmp = do_conversion(fieldobj, conversion);
815         if (tmp == NULL || PyUnicode_READY(tmp) == -1)
816             goto done;
817 
818         /* do the assignment, transferring ownership: fieldobj = tmp */
819         Py_DECREF(fieldobj);
820         fieldobj = tmp;
821         tmp = NULL;
822     }
823 
824     /* if needed, recurively compute the format_spec */
825     if (format_spec_needs_expanding) {
826         tmp = build_string(format_spec, args, kwargs, recursion_depth-1,
827                            auto_number);
828         if (tmp == NULL || PyUnicode_READY(tmp) == -1)
829             goto done;
830 
831         /* note that in the case we're expanding the format string,
832            tmp must be kept around until after the call to
833            render_field. */
834         SubString_init(&expanded_format_spec, tmp, 0, PyUnicode_GET_LENGTH(tmp));
835         actual_format_spec = &expanded_format_spec;
836     }
837     else
838         actual_format_spec = format_spec;
839 
840     if (render_field(fieldobj, actual_format_spec, writer) == 0)
841         goto done;
842 
843     result = 1;
844 
845 done:
846     Py_XDECREF(fieldobj);
847     Py_XDECREF(tmp);
848 
849     return result;
850 }
851 
852 /*
853     do_markup is the top-level loop for the format() method.  It
854     searches through the format string for escapes to markup codes, and
855     calls other functions to move non-markup text to the output,
856     and to perform the markup to the output.
857 */
858 static int
do_markup(SubString * input,PyObject * args,PyObject * kwargs,_PyUnicodeWriter * writer,int recursion_depth,AutoNumber * auto_number)859 do_markup(SubString *input, PyObject *args, PyObject *kwargs,
860           _PyUnicodeWriter *writer, int recursion_depth, AutoNumber *auto_number)
861 {
862     MarkupIterator iter;
863     int format_spec_needs_expanding;
864     int result;
865     int field_present;
866     SubString literal;
867     SubString field_name;
868     SubString format_spec;
869     Py_UCS4 conversion;
870 
871     MarkupIterator_init(&iter, input->str, input->start, input->end);
872     while ((result = MarkupIterator_next(&iter, &literal, &field_present,
873                                          &field_name, &format_spec,
874                                          &conversion,
875                                          &format_spec_needs_expanding)) == 2) {
876         if (literal.end != literal.start) {
877             if (!field_present && iter.str.start == iter.str.end)
878                 writer->overallocate = 0;
879             if (_PyUnicodeWriter_WriteSubstring(writer, literal.str,
880                                                 literal.start, literal.end) < 0)
881                 return 0;
882         }
883 
884         if (field_present) {
885             if (iter.str.start == iter.str.end)
886                 writer->overallocate = 0;
887             if (!output_markup(&field_name, &format_spec,
888                                format_spec_needs_expanding, conversion, writer,
889                                args, kwargs, recursion_depth, auto_number))
890                 return 0;
891         }
892     }
893     return result;
894 }
895 
896 
897 /*
898     build_string allocates the output string and then
899     calls do_markup to do the heavy lifting.
900 */
901 static PyObject *
build_string(SubString * input,PyObject * args,PyObject * kwargs,int recursion_depth,AutoNumber * auto_number)902 build_string(SubString *input, PyObject *args, PyObject *kwargs,
903              int recursion_depth, AutoNumber *auto_number)
904 {
905     _PyUnicodeWriter writer;
906 
907     /* check the recursion level */
908     if (recursion_depth <= 0) {
909         PyErr_SetString(PyExc_ValueError,
910                         "Max string recursion exceeded");
911         return NULL;
912     }
913 
914     _PyUnicodeWriter_Init(&writer);
915     writer.overallocate = 1;
916     writer.min_length = PyUnicode_GET_LENGTH(input->str) + 100;
917 
918     if (!do_markup(input, args, kwargs, &writer, recursion_depth,
919                    auto_number)) {
920         _PyUnicodeWriter_Dealloc(&writer);
921         return NULL;
922     }
923 
924     return _PyUnicodeWriter_Finish(&writer);
925 }
926 
927 /************************************************************************/
928 /*********** main routine ***********************************************/
929 /************************************************************************/
930 
931 /* this is the main entry point */
932 static PyObject *
do_string_format(PyObject * self,PyObject * args,PyObject * kwargs)933 do_string_format(PyObject *self, PyObject *args, PyObject *kwargs)
934 {
935     SubString input;
936 
937     /* PEP 3101 says only 2 levels, so that
938        "{0:{1}}".format('abc', 's')            # works
939        "{0:{1:{2}}}".format('abc', 's', '')    # fails
940     */
941     int recursion_depth = 2;
942 
943     AutoNumber auto_number;
944 
945     if (PyUnicode_READY(self) == -1)
946         return NULL;
947 
948     AutoNumber_Init(&auto_number);
949     SubString_init(&input, self, 0, PyUnicode_GET_LENGTH(self));
950     return build_string(&input, args, kwargs, recursion_depth, &auto_number);
951 }
952 
953 static PyObject *
do_string_format_map(PyObject * self,PyObject * obj)954 do_string_format_map(PyObject *self, PyObject *obj)
955 {
956     return do_string_format(self, NULL, obj);
957 }
958 
959 
960 /************************************************************************/
961 /*********** formatteriterator ******************************************/
962 /************************************************************************/
963 
/* Python-callable iterator over the parsed pieces of a format string.
   It exists so string.Formatter can share code with the built-in
   str.format() method; it is really just a wrapper around
   MarkupIterator. */

typedef struct {
    PyObject_HEAD
    PyObject *str;              /* string being parsed; a reference is taken
                                   in formatter_parser and dropped in
                                   formatteriter_dealloc */
    MarkupIterator it_markup;   /* parse state, set up over `str` by
                                   formatter_parser */
} formatteriterobject;
974 
975 static void
formatteriter_dealloc(formatteriterobject * it)976 formatteriter_dealloc(formatteriterobject *it)
977 {
978     Py_XDECREF(it->str);
979     PyObject_FREE(it);
980 }
981 
982 /* returns a tuple:
983    (literal, field_name, format_spec, conversion)
984 
985    literal is any literal text to output.  might be zero length
986    field_name is the string before the ':'.  might be None
987    format_spec is the string after the ':'.  mibht be None
988    conversion is either None, or the string after the '!'
989 */
990 static PyObject *
formatteriter_next(formatteriterobject * it)991 formatteriter_next(formatteriterobject *it)
992 {
993     SubString literal;
994     SubString field_name;
995     SubString format_spec;
996     Py_UCS4 conversion;
997     int format_spec_needs_expanding;
998     int field_present;
999     int result = MarkupIterator_next(&it->it_markup, &literal, &field_present,
1000                                      &field_name, &format_spec, &conversion,
1001                                      &format_spec_needs_expanding);
1002 
1003     /* all of the SubString objects point into it->str, so no
1004        memory management needs to be done on them */
1005     assert(0 <= result && result <= 2);
1006     if (result == 0 || result == 1)
1007         /* if 0, error has already been set, if 1, iterator is empty */
1008         return NULL;
1009     else {
1010         PyObject *literal_str = NULL;
1011         PyObject *field_name_str = NULL;
1012         PyObject *format_spec_str = NULL;
1013         PyObject *conversion_str = NULL;
1014         PyObject *tuple = NULL;
1015 
1016         literal_str = SubString_new_object(&literal);
1017         if (literal_str == NULL)
1018             goto done;
1019 
1020         field_name_str = SubString_new_object(&field_name);
1021         if (field_name_str == NULL)
1022             goto done;
1023 
1024         /* if field_name is non-zero length, return a string for
1025            format_spec (even if zero length), else return None */
1026         format_spec_str = (field_present ?
1027                            SubString_new_object_or_empty :
1028                            SubString_new_object)(&format_spec);
1029         if (format_spec_str == NULL)
1030             goto done;
1031 
1032         /* if the conversion is not specified, return a None,
1033            otherwise create a one length string with the conversion
1034            character */
1035         if (conversion == '\0') {
1036             conversion_str = Py_None;
1037             Py_INCREF(conversion_str);
1038         }
1039         else
1040             conversion_str = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND,
1041                                                        &conversion, 1);
1042         if (conversion_str == NULL)
1043             goto done;
1044 
1045         tuple = PyTuple_Pack(4, literal_str, field_name_str, format_spec_str,
1046                              conversion_str);
1047     done:
1048         Py_XDECREF(literal_str);
1049         Py_XDECREF(field_name_str);
1050         Py_XDECREF(format_spec_str);
1051         Py_XDECREF(conversion_str);
1052         return tuple;
1053     }
1054 }
1055 
1056 static PyMethodDef formatteriter_methods[] = {
1057     {NULL,              NULL}           /* sentinel */
1058 };
1059 
/* Type object for formatteriterobject.  Only the destructor, generic
   attribute lookup, the default flags, the two iterator slots and the
   (empty) method table are filled in; every other slot is 0.  Since
   tp_iter is PyObject_SelfIter, the object serves as its own iterator
   and each step is produced by formatteriter_next. */
static PyTypeObject PyFormatterIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "formatteriterator",                /* tp_name */
    sizeof(formatteriterobject),        /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* slots, in declaration order */
    (destructor)formatteriter_dealloc,  /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    0,                                  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)formatteriter_next,   /* tp_iternext */
    formatteriter_methods,              /* tp_methods */
    0,                                  /* slot after tp_methods (presumably
                                           tp_members -- confirm against
                                           the PyTypeObject declaration) */
};
1092 
1093 /* unicode_formatter_parser is used to implement
1094    string.Formatter.vformat.  it parses a string and returns tuples
1095    describing the parsed elements.  It's a wrapper around
1096    stringlib/string_format.h's MarkupIterator */
1097 static PyObject *
formatter_parser(PyObject * ignored,PyObject * self)1098 formatter_parser(PyObject *ignored, PyObject *self)
1099 {
1100     formatteriterobject *it;
1101 
1102     if (!PyUnicode_Check(self)) {
1103         PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1104         return NULL;
1105     }
1106 
1107     if (PyUnicode_READY(self) == -1)
1108         return NULL;
1109 
1110     it = PyObject_New(formatteriterobject, &PyFormatterIter_Type);
1111     if (it == NULL)
1112         return NULL;
1113 
1114     /* take ownership, give the object to the iterator */
1115     Py_INCREF(self);
1116     it->str = self;
1117 
1118     /* initialize the contained MarkupIterator */
1119     MarkupIterator_init(&it->it_markup, (PyObject*)self, 0, PyUnicode_GET_LENGTH(self));
1120     return (PyObject *)it;
1121 }
1122 
1123 
1124 /************************************************************************/
1125 /*********** fieldnameiterator ******************************************/
1126 /************************************************************************/
1127 
1128 
/* Python-callable wrapper around FieldNameIterator, provided for
   string.Formatter.  It walks the remainder of a field name and
   reports each attribute or item access found in it. */

typedef struct {
    PyObject_HEAD
    PyObject *str;                  /* the field name; a reference is taken
                                       in formatter_field_name_split and
                                       dropped in fieldnameiter_dealloc */
    FieldNameIterator it_field;     /* iteration state, filled in by
                                       field_name_split */
} fieldnameiterobject;
1138 
1139 static void
fieldnameiter_dealloc(fieldnameiterobject * it)1140 fieldnameiter_dealloc(fieldnameiterobject *it)
1141 {
1142     Py_XDECREF(it->str);
1143     PyObject_FREE(it);
1144 }
1145 
1146 /* returns a tuple:
1147    (is_attr, value)
1148    is_attr is true if we used attribute syntax (e.g., '.foo')
1149               false if we used index syntax (e.g., '[foo]')
1150    value is an integer or string
1151 */
1152 static PyObject *
fieldnameiter_next(fieldnameiterobject * it)1153 fieldnameiter_next(fieldnameiterobject *it)
1154 {
1155     int result;
1156     int is_attr;
1157     Py_ssize_t idx;
1158     SubString name;
1159 
1160     result = FieldNameIterator_next(&it->it_field, &is_attr,
1161                                     &idx, &name);
1162     if (result == 0 || result == 1)
1163         /* if 0, error has already been set, if 1, iterator is empty */
1164         return NULL;
1165     else {
1166         PyObject* result = NULL;
1167         PyObject* is_attr_obj = NULL;
1168         PyObject* obj = NULL;
1169 
1170         is_attr_obj = PyBool_FromLong(is_attr);
1171         if (is_attr_obj == NULL)
1172             goto done;
1173 
1174         /* either an integer or a string */
1175         if (idx != -1)
1176             obj = PyLong_FromSsize_t(idx);
1177         else
1178             obj = SubString_new_object(&name);
1179         if (obj == NULL)
1180             goto done;
1181 
1182         /* return a tuple of values */
1183         result = PyTuple_Pack(2, is_attr_obj, obj);
1184 
1185     done:
1186         Py_XDECREF(is_attr_obj);
1187         Py_XDECREF(obj);
1188         return result;
1189     }
1190 }
1191 
1192 static PyMethodDef fieldnameiter_methods[] = {
1193     {NULL,              NULL}           /* sentinel */
1194 };
1195 
/* Type object for fieldnameiterobject.  Only the destructor, generic
   attribute lookup, the default flags, the two iterator slots and the
   (empty) method table are filled in; every other slot is 0.  Since
   tp_iter is PyObject_SelfIter, the object serves as its own iterator
   and each step is produced by fieldnameiter_next. */
static PyTypeObject PyFieldNameIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "fieldnameiterator",                /* tp_name */
    sizeof(fieldnameiterobject),        /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* slots, in declaration order */
    (destructor)fieldnameiter_dealloc,  /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    0,                                  /* tp_reserved */
    0,                                  /* tp_repr */
    0,                                  /* tp_as_number */
    0,                                  /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    0,                                  /* tp_hash */
    0,                                  /* tp_call */
    0,                                  /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    0,                                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
    0,                                  /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    PyObject_SelfIter,                  /* tp_iter */
    (iternextfunc)fieldnameiter_next,   /* tp_iternext */
    fieldnameiter_methods,              /* tp_methods */
    /* slot after tp_methods (presumably tp_members -- confirm against
       the PyTypeObject declaration) */
    0};
1227 
1228 /* unicode_formatter_field_name_split is used to implement
1229    string.Formatter.vformat.  it takes a PEP 3101 "field name", and
1230    returns a tuple of (first, rest): "first", the part before the
1231    first '.' or '['; and "rest", an iterator for the rest of the field
1232    name.  it's a wrapper around stringlib/string_format.h's
1233    field_name_split.  The iterator it returns is a
1234    FieldNameIterator */
1235 static PyObject *
formatter_field_name_split(PyObject * ignored,PyObject * self)1236 formatter_field_name_split(PyObject *ignored, PyObject *self)
1237 {
1238     SubString first;
1239     Py_ssize_t first_idx;
1240     fieldnameiterobject *it;
1241 
1242     PyObject *first_obj = NULL;
1243     PyObject *result = NULL;
1244 
1245     if (!PyUnicode_Check(self)) {
1246         PyErr_Format(PyExc_TypeError, "expected str, got %s", Py_TYPE(self)->tp_name);
1247         return NULL;
1248     }
1249 
1250     if (PyUnicode_READY(self) == -1)
1251         return NULL;
1252 
1253     it = PyObject_New(fieldnameiterobject, &PyFieldNameIter_Type);
1254     if (it == NULL)
1255         return NULL;
1256 
1257     /* take ownership, give the object to the iterator.  this is
1258        just to keep the field_name alive */
1259     Py_INCREF(self);
1260     it->str = self;
1261 
1262     /* Pass in auto_number = NULL. We'll return an empty string for
1263        first_obj in that case. */
1264     if (!field_name_split((PyObject*)self, 0, PyUnicode_GET_LENGTH(self),
1265                           &first, &first_idx, &it->it_field, NULL))
1266         goto done;
1267 
1268     /* first becomes an integer, if possible; else a string */
1269     if (first_idx != -1)
1270         first_obj = PyLong_FromSsize_t(first_idx);
1271     else
1272         /* convert "first" into a string object */
1273         first_obj = SubString_new_object(&first);
1274     if (first_obj == NULL)
1275         goto done;
1276 
1277     /* return a tuple of values */
1278     result = PyTuple_Pack(2, first_obj, it);
1279 
1280 done:
1281     Py_XDECREF(it);
1282     Py_XDECREF(first_obj);
1283     return result;
1284 }
1285