• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* xgettext Python backend.
2    Copyright (C) 2002-2003, 2005-2011, 2013-2014, 2018-2020 Free Software Foundation, Inc.
3 
4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 
23 /* Specification.  */
24 #include "x-python.h"
25 
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32 
33 #include "message.h"
34 #include "rc-str-list.h"
35 #include "xgettext.h"
36 #include "xg-pos.h"
37 #include "xg-encoding.h"
38 #include "xg-mixed-string.h"
39 #include "xg-arglist-context.h"
40 #include "xg-arglist-callshape.h"
41 #include "xg-arglist-parser.h"
42 #include "xg-message.h"
43 #include "error.h"
44 #include "error-progname.h"
45 #include "progname.h"
46 #include "basename-lgpl.h"
47 #include "xerror.h"
48 #include "xvasprintf.h"
49 #include "xalloc.h"
50 #include "c-strstr.h"
51 #include "c-ctype.h"
52 #include "po-charset.h"
53 #include "uniname.h"
54 #include "unistr.h"
55 #include "gettext.h"
56 
57 #define _(s) gettext(s)
58 
/* Larger of A and B.  Each argument is evaluated twice, so do not pass
   expressions with side effects.  */
#define max(a,b) ((a) > (b) ? (a) : (b))

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized so that any array-valued
   expression can be passed.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
62 
63 
64 /* The Python syntax is defined in the Python Reference Manual
65    /usr/share/doc/packages/python/html/ref/index.html.
66    See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
67    Python-2.0/Objects/unicodeobject.c.  */
68 
69 
70 /* ====================== Keyword set customization.  ====================== */
71 
/* If true extract all strings.  */
static bool extract_all = false;

/* Table of keyword call shapes.  It is initialized lazily by
   x_python_keyword on the first insertion.  */
static hash_table keywords;
/* True until the built-in keyword set has either been installed by
   init_keywords or been disabled through x_python_keyword (NULL).  */
static bool default_keywords = true;
77 
78 
79 void
x_python_extract_all()80 x_python_extract_all ()
81 {
82   extract_all = true;
83 }
84 
85 
86 void
x_python_keyword(const char * name)87 x_python_keyword (const char *name)
88 {
89   if (name == NULL)
90     default_keywords = false;
91   else
92     {
93       const char *end;
94       struct callshape shape;
95       const char *colon;
96 
97       if (keywords.table == NULL)
98         hash_init (&keywords, 100);
99 
100       split_keywordspec (name, &end, &shape);
101 
102       /* The characters between name and end should form a valid C identifier.
103          A colon means an invalid parse in split_keywordspec().  */
104       colon = strchr (name, ':');
105       if (colon == NULL || colon >= end)
106         insert_keyword_callshape (&keywords, name, end - name, &shape);
107     }
108 }
109 
110 /* Finish initializing the keywords hash table.
111    Called after argument processing, before each file is processed.  */
112 static void
init_keywords()113 init_keywords ()
114 {
115   if (default_keywords)
116     {
117       /* When adding new keywords here, also update the documentation in
118          xgettext.texi!  */
119       x_python_keyword ("gettext");
120       x_python_keyword ("ugettext");
121       x_python_keyword ("dgettext:2");
122       x_python_keyword ("ngettext:1,2");
123       x_python_keyword ("ungettext:1,2");
124       x_python_keyword ("dngettext:2,3");
125       x_python_keyword ("_");
126       default_keywords = false;
127     }
128 }
129 
/* Record the format-string flags of the built-in keywords: first the
   pass-python-format flags, then the pass-python-brace-format flags.  */
void
init_flag_table_python ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-python-format",
      "ugettext:1:pass-python-format",
      "dgettext:2:pass-python-format",
      "ngettext:1:pass-python-format",
      "ngettext:2:pass-python-format",
      "ungettext:1:pass-python-format",
      "ungettext:2:pass-python-format",
      "dngettext:2:pass-python-format",
      "dngettext:3:pass-python-format",
      "_:1:pass-python-format",
      /* Not recorded: "%:1:python-format" -- % is an infix operator!  */

      "gettext:1:pass-python-brace-format",
      "ugettext:1:pass-python-brace-format",
      "dgettext:2:pass-python-brace-format",
      "ngettext:1:pass-python-brace-format",
      "ngettext:2:pass-python-brace-format",
      "ungettext:1:pass-python-brace-format",
      "ungettext:2:pass-python-brace-format",
      "dngettext:2:pass-python-brace-format",
      "dngettext:3:pass-python-brace-format",
      "_:1:pass-python-brace-format"
      /* Not recorded: "format:1:python-brace-format".  */
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
157 
158 
159 /* ======================== Reading of characters.  ======================== */
160 
161 /* The input file stream.  */
162 static FILE *fp;
163 
164 
165 /* 0. Terminate line by \n, regardless whether the external
166    representation of a line terminator is CR (Mac), and CR/LF
167    (DOS/Windows), as Python treats them equally.  */
168 static int
phase0_getc()169 phase0_getc ()
170 {
171   int c;
172 
173   c = getc (fp);
174   if (c == EOF)
175     {
176       if (ferror (fp))
177         error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
178                real_file_name);
179       return EOF;
180     }
181 
182   if (c == '\r')
183     {
184       int c1 = getc (fp);
185 
186       if (c1 != EOF && c1 != '\n')
187         ungetc (c1, fp);
188 
189       /* Seen line terminator CR or CR/LF.  */
190       return '\n';
191     }
192 
193   return c;
194 }
195 
196 /* Supports only one pushback character, and not '\n'.  */
197 static inline void
phase0_ungetc(int c)198 phase0_ungetc (int c)
199 {
200   if (c != EOF)
201     ungetc (c, fp);
202 }
203 
204 
205 /* 1. line_number handling.  */
206 
/* Maximum used, roughly a safer MB_LEN_MAX.  */
#define MAX_PHASE1_PUSHBACK 16
/* Stack of pushed-back bytes: phase1_ungetc stores at index
   phase1_pushback_length, phase1_getc returns the most recently stored
   byte first.  */
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
/* Number of bytes currently held in phase1_pushback.  */
static int phase1_pushback_length;
211 
212 /* Read the next single byte from the input file.  */
213 static int
phase1_getc()214 phase1_getc ()
215 {
216   int c;
217 
218   if (phase1_pushback_length)
219     c = phase1_pushback[--phase1_pushback_length];
220   else
221     c = phase0_getc ();
222 
223   if (c == '\n')
224     ++line_number;
225 
226   return c;
227 }
228 
229 /* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
230 static void
phase1_ungetc(int c)231 phase1_ungetc (int c)
232 {
233   if (c != EOF)
234     {
235       if (c == '\n')
236         --line_number;
237 
238       if (phase1_pushback_length == SIZEOF (phase1_pushback))
239         abort ();
240       phase1_pushback[phase1_pushback_length++] = c;
241     }
242 }
243 
244 
245 /* Phase 2: Conversion to Unicode.
246    This is done early because PEP 0263 specifies that conversion to Unicode
247    conceptually occurs before tokenization.  A test case where it matters
248    is with encodings like BIG5: when a double-byte character ending in 0x5C
249    is followed by '\' or 'u0021', the tokenizer must not treat the second
250    half of the double-byte character as a backslash.  */
251 
/* End-of-file indicator for functions returning an UCS-4 character.
   Parenthesized, like every negative macro constant should be, so that
   it expands safely inside any expression.  */
#define UEOF (-1)
254 
/* Lexical context passed to non_ascii_error_message by phase2_getc;
   comment_line_end resets it to lc_outside.  */
static lexical_context_ty lexical_context;

/* Stack of pushed-back UCS-4 characters; the most recently pushed one
   is read first.  */
static int phase2_pushback[max (9, UNINAME_MAX + 3)];
/* Number of characters currently held in phase2_pushback.  */
static int phase2_pushback_length;
259 
/* Read the next Unicode UCS-4 character from the input file.
   Characters pushed back with phase2_ungetc are returned first.
   Otherwise bytes are read through phase1_getc and decoded according to
   xgettext_current_source_encoding: ASCII bytes are passed through,
   UTF-8 is decoded inline, and every other encoding is converted with
   iconv.  Returns UEOF at end of input.  Input that cannot be decoded
   is reported through multiline_error and terminates the program.  */
static int
phase2_getc ()
{
  if (phase2_pushback_length)
    return phase2_pushback[--phase2_pushback_length];

  if (xgettext_current_source_encoding == po_charset_ascii)
    {
      /* ASCII: every byte is a character; a non-ASCII byte is fatal.  */
      int c = phase1_getc ();
      if (c == EOF)
        return UEOF;
      if (!c_isascii (c))
        {
          multiline_error (xstrdup (""),
                           xasprintf ("%s\n%s\n",
                                      non_ascii_error_message (lexical_context,
                                                               real_file_name,
                                                               line_number),
                                      _("\
Please specify the source encoding through --from-code or through a comment\n\
as specified in https://www.python.org/peps/pep-0263.html.\n")));
          exit (EXIT_FAILURE);
        }
      return c;
    }
  else if (xgettext_current_source_encoding != po_charset_utf8)
    {
#if HAVE_ICONV
      /* Use iconv on an increasing number of bytes.  Read only as many bytes
         through phase1_getc as needed.  This is needed to give reasonable
         interactive behaviour when fp is connected to an interactive tty.  */
      unsigned char buf[MAX_PHASE1_PUSHBACK];
      size_t bufcount;
      int c = phase1_getc ();
      if (c == EOF)
        return UEOF;
      buf[0] = (unsigned char) c;
      bufcount = 1;

      /* Each iteration retries the conversion of buf[0..bufcount-1]; the
         loop is left by returning a character or by a fatal error.  */
      for (;;)
        {
          unsigned char scratchbuf[6];
          const char *inptr = (const char *) &buf[0];
          size_t insize = bufcount;
          char *outptr = (char *) &scratchbuf[0];
          size_t outsize = sizeof (scratchbuf);

          size_t res = iconv (xgettext_current_source_iconv,
                              (ICONV_CONST char **) &inptr, &insize,
                              &outptr, &outsize);
          /* We expect that a character has been produced if and only if
             some input bytes have been consumed.  */
          if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
            abort ();
          if (outsize == sizeof (scratchbuf))
            {
              /* No character has been produced.  Must be an error.  */
              if (res != (size_t)(-1))
                abort ();

              if (errno == EILSEQ)
                {
                  /* An invalid multibyte sequence was encountered.  */
                  goto invalid;
                }
              else if (errno == EINVAL)
                {
                  /* An incomplete multibyte character.  */
                  int c;

                  if (bufcount == MAX_PHASE1_PUSHBACK)
                    {
                      /* An overlong incomplete multibyte sequence was
                         encountered.  */
                      multiline_error (xstrdup (""),
                                       xasprintf (_("\
%s:%d: Long incomplete multibyte sequence.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                                       real_file_name, line_number));
                      exit (EXIT_FAILURE);
                    }

                  /* Read one more byte and retry iconv.  */
                  c = phase1_getc ();
                  if (c == EOF)
                    goto incomplete_at_eof;
                  if (c == '\n')
                    goto incomplete_at_eol;
                  buf[bufcount++] = (unsigned char) c;
                }
              else
                error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
                       real_file_name, line_number);
            }
          else
            {
              size_t outbytes = sizeof (scratchbuf) - outsize;
              size_t bytes = bufcount - insize;
              ucs4_t uc;

              /* We expect that one character has been produced.  */
              if (bytes == 0)
                abort ();
              if (outbytes == 0)
                abort ();
              /* Push back the unused bytes.  */
              while (insize > 0)
                phase1_ungetc (buf[--insize]);
              /* Convert the character from UTF-8 to UCS-4.  */
              if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes)
                {
                  /* scratchbuf contains an out-of-range Unicode character
                     (> 0x10ffff).  */
                  goto invalid;
                }
              return uc;
            }
        }
#else
      /* If we don't have iconv(), the only supported values for
         xgettext_global_source_encoding and thus also for
         xgettext_current_source_encoding are ASCII and UTF-8.  */
      abort ();
#endif
    }
  else
    {
      /* Read an UTF-8 encoded character.
         Reject invalid input, like u8_mbtouc does.  */
      int c;
      ucs4_t uc;

      c = phase1_getc ();
      if (c == EOF)
        return UEOF;
      if (c < 0x80)
        {
          /* Single byte.  */
          uc = c;
        }
      else if (c < 0xc2)
        /* A continuation byte or an overlong lead byte (0xc0, 0xc1).  */
        goto invalid;
      else if (c < 0xe0)
        {
          /* Two-byte sequence.  */
          int c1 = phase1_getc ();
          if (c1 == EOF)
            goto incomplete_at_eof;
          if (c1 == '\n')
            goto incomplete_at_eol;
          if ((c1 ^ 0x80) < 0x40)
            uc = ((unsigned int) (c & 0x1f) << 6)
                 | (unsigned int) (c1 ^ 0x80);
          else
            goto invalid;
        }
      else if (c < 0xf0)
        {
          /* Three-byte sequence.  The second condition rejects overlong
             encodings (0xe0 followed by less than 0xa0), the third one
             rejects the surrogate range (0xed followed by 0xa0 or more).  */
          int c1 = phase1_getc ();
          if (c1 == EOF)
            goto incomplete_at_eof;
          if (c1 == '\n')
            goto incomplete_at_eol;
          if ((c1 ^ 0x80) < 0x40
              && (c >= 0xe1 || c1 >= 0xa0)
              && (c != 0xed || c1 < 0xa0))
            {
              int c2 = phase1_getc ();
              if (c2 == EOF)
                goto incomplete_at_eof;
              if (c2 == '\n')
                goto incomplete_at_eol;
              if ((c2 ^ 0x80) < 0x40)
                uc = ((unsigned int) (c & 0x0f) << 12)
                     | ((unsigned int) (c1 ^ 0x80) << 6)
                     | (unsigned int) (c2 ^ 0x80);
              else
                goto invalid;
            }
          else
            goto invalid;
        }
      else if (c < 0xf8)
        {
          /* Four-byte sequence.  The second condition rejects overlong
             encodings (0xf0 followed by less than 0x90), the third one
             rejects code points above 0x10ffff.  */
          int c1 = phase1_getc ();
          if (c1 == EOF)
            goto incomplete_at_eof;
          if (c1 == '\n')
            goto incomplete_at_eol;
          if ((c1 ^ 0x80) < 0x40
              && (c >= 0xf1 || c1 >= 0x90)
              && (c < 0xf4 || (c == 0xf4 && c1 < 0x90)))
            {
              int c2 = phase1_getc ();
              if (c2 == EOF)
                goto incomplete_at_eof;
              if (c2 == '\n')
                goto incomplete_at_eol;
              if ((c2 ^ 0x80) < 0x40)
                {
                  int c3 = phase1_getc ();
                  if (c3 == EOF)
                    goto incomplete_at_eof;
                  if (c3 == '\n')
                    goto incomplete_at_eol;
                  if ((c3 ^ 0x80) < 0x40)
                    uc = ((unsigned int) (c & 0x07) << 18)
                         | ((unsigned int) (c1 ^ 0x80) << 12)
                         | ((unsigned int) (c2 ^ 0x80) << 6)
                         | (unsigned int) (c3 ^ 0x80);
                  else
                    goto invalid;
                }
              else
                goto invalid;
            }
          else
            goto invalid;
        }
      else
        goto invalid;

      return uc;
    }

  /* Shared fatal exits of the iconv and UTF-8 decoders.  */

 invalid:
  /* An invalid multibyte sequence was encountered.  */
  multiline_error (xstrdup (""),
                   xasprintf (_("\
%s:%d: Invalid multibyte sequence.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                   real_file_name, line_number));
  exit (EXIT_FAILURE);

 incomplete_at_eof:
  multiline_error (xstrdup (""),
                   xasprintf (_("\
%s:%d: Incomplete multibyte sequence at end of file.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                   real_file_name, line_number));
  exit (EXIT_FAILURE);

 incomplete_at_eol:
  /* The newline that was just read has already been counted by
     phase1_getc, hence line_number - 1.  */
  multiline_error (xstrdup (""),
                   xasprintf (_("\
%s:%d: Incomplete multibyte sequence at end of line.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                   real_file_name, line_number - 1));
  exit (EXIT_FAILURE);
}
513 
514 /* Supports max (9, UNINAME_MAX + 3) pushback characters.  */
515 static void
phase2_ungetc(int c)516 phase2_ungetc (int c)
517 {
518   if (c != UEOF)
519     {
520       if (phase2_pushback_length == SIZEOF (phase2_pushback))
521         abort ();
522       phase2_pushback[phase2_pushback_length++] = c;
523     }
524 }
525 
526 
527 /* ========================= Accumulating strings.  ======================== */
528 
529 /* See xg-mixed-string.h for the API.  */
530 
531 
532 /* ======================== Accumulating comments.  ======================== */
533 
534 
535 /* Accumulating a single comment line.  */
536 
537 static struct mixed_string_buffer comment_buffer;
538 
539 static inline void
comment_start()540 comment_start ()
541 {
542   mixed_string_buffer_init (&comment_buffer, lc_comment,
543                             logical_file_name, line_number);
544 }
545 
546 static inline bool
comment_at_start()547 comment_at_start ()
548 {
549   return mixed_string_buffer_is_empty (&comment_buffer);
550 }
551 
552 static inline void
comment_add(int c)553 comment_add (int c)
554 {
555   mixed_string_buffer_append_unicode (&comment_buffer, c);
556 }
557 
558 static inline const char *
comment_line_end()559 comment_line_end ()
560 {
561   char *buffer =
562     mixed_string_contents_free1 (mixed_string_buffer_result (&comment_buffer));
563   size_t buflen = strlen (buffer);
564 
565   while (buflen >= 1
566          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
567     --buflen;
568   buffer[buflen] = '\0';
569   savable_comment_add (buffer);
570   lexical_context = lc_outside;
571   return buffer;
572 }
573 
574 
575 /* These are for tracking whether comments count as immediately before
576    keyword.  */
577 static int last_comment_line;
578 static int last_non_comment_line;
579 
580 
581 /* ======================== Recognizing comments.  ======================== */
582 
583 
/* Recognizing the "coding" comment.
   As specified in PEP 0263, it takes the form
     "coding" [":"|"="] {alphanumeric or "-" or "_" or "."}*
   or
     "set" "fileencoding" "=" {alphanumeric or "-" or "_" or "."}*
   and is located in a comment in a line that
     - is either the first or second line,
     - is not a continuation line,
     - in the first form, contains no other tokens except this comment.  */
593 
594 /* Canonicalized encoding name for the current input file.  */
595 static const char *xgettext_current_file_source_encoding;
596 
597 #if HAVE_ICONV
598 /* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
599    ASCII or UTF-8, when this conversion is a no-op).  */
600 static iconv_t xgettext_current_file_source_iconv;
601 #endif
602 
603 static inline void
set_current_file_source_encoding(const char * canon_encoding)604 set_current_file_source_encoding (const char *canon_encoding)
605 {
606   xgettext_current_file_source_encoding = canon_encoding;
607 
608   if (xgettext_current_file_source_encoding != po_charset_ascii
609       && xgettext_current_file_source_encoding != po_charset_utf8)
610     {
611 #if HAVE_ICONV
612       iconv_t cd;
613 
614       /* Avoid glibc-2.1 bug with EUC-KR.  */
615 # if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
616      && !defined _LIBICONV_VERSION
617       if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
618         cd = (iconv_t)(-1);
619       else
620 # endif
621       cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
622       if (cd == (iconv_t)(-1))
623         error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1,
624                        _("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), and iconv() does not support this conversion."),
625                        xgettext_current_file_source_encoding, po_charset_utf8,
626                        last_component (program_name));
627       xgettext_current_file_source_iconv = cd;
628 #else
629       error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1,
630                      _("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). This version was built without iconv()."),
631                      xgettext_current_file_source_encoding, po_charset_utf8,
632                      last_component (program_name));
633 #endif
634     }
635 
636   xgettext_current_source_encoding = xgettext_current_file_source_encoding;
637 #if HAVE_ICONV
638   xgettext_current_source_iconv = xgettext_current_file_source_iconv;
639 #endif
640 }
641 
642 static inline void
try_to_extract_coding(const char * comment)643 try_to_extract_coding (const char *comment)
644 {
645   const char *p = c_strstr (comment, "coding");
646 
647   if (p != NULL)
648     {
649       p += 6;
650       if (*p == ':' || *p == '=')
651         {
652           p++;
653           while (*p == ' ' || *p == '\t')
654             p++;
655           {
656             const char *encoding_start = p;
657 
658             while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
659               p++;
660             {
661               const char *encoding_end = p;
662 
663               if (encoding_end > encoding_start)
664                 {
665                   /* Extract the encoding string.  */
666                   size_t encoding_len = encoding_end - encoding_start;
667                   char *encoding = XNMALLOC (encoding_len + 1, char);
668 
669                   memcpy (encoding, encoding_start, encoding_len);
670                   encoding[encoding_len] = '\0';
671 
672                   {
673                     /* Canonicalize it.  */
674                     const char *canon_encoding = po_charset_canonicalize (encoding);
675                     if (canon_encoding == NULL)
676                       {
677                         error_at_line (0, 0,
678                                        logical_file_name, line_number - 1,
679                                        _("Unknown encoding \"%s\". Proceeding with ASCII instead."),
680                                        encoding);
681                         canon_encoding = po_charset_ascii;
682                       }
683 
684                     /* Activate it.  */
685                     set_current_file_source_encoding (canon_encoding);
686                   }
687 
688                   free (encoding);
689                 }
690             }
691           }
692         }
693     }
694 }
695 
696 /* Tracking whether the current line is a continuation line or contains a
697    non-blank character.  */
698 static bool continuation_or_nonblank_line;
699 
700 
701 /* Phase 3: Outside strings, replace backslash-newline with nothing and a
702    comment with nothing.  */
703 
704 static int
phase3_getc()705 phase3_getc ()
706 {
707   int c;
708 
709   for (;;)
710     {
711       c = phase2_getc ();
712       if (c == '\\')
713         {
714           c = phase2_getc ();
715           if (c != '\n')
716             {
717               phase2_ungetc (c);
718               /* This shouldn't happen usually, because "A backslash is
719                  illegal elsewhere on a line outside a string literal."  */
720               return '\\';
721             }
722           /* Eat backslash-newline.  */
723           continuation_or_nonblank_line = true;
724         }
725       else if (c == '#')
726         {
727           /* Eat a comment.  */
728           const char *comment;
729 
730           last_comment_line = line_number;
731           comment_start ();
732           for (;;)
733             {
734               c = phase2_getc ();
735               if (c == UEOF || c == '\n')
736                 break;
737               /* We skip all leading white space, but not EOLs.  */
738               if (!(comment_at_start () && (c == ' ' || c == '\t')))
739                 comment_add (c);
740             }
741           comment = comment_line_end ();
742           if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
743             try_to_extract_coding (comment);
744           continuation_or_nonblank_line = false;
745           return c;
746         }
747       else
748         {
749           if (c == '\n')
750             continuation_or_nonblank_line = false;
751           else if (!(c == ' ' || c == '\t' || c == '\f'))
752             continuation_or_nonblank_line = true;
753           return c;
754         }
755     }
756 }
757 
/* Push C back for the next phase3_getc; this simply forwards to the
   phase 2 pushback.  Supports only one pushback character.  */
static void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
764 
765 
766 /* ========================= Accumulating strings.  ======================== */
767 
768 /* Return value of phase7_getuc when EOF is reached.  */
769 #define P7_EOF (-1)
770 #define P7_STRING_END (-2)
771 
772 /* Convert an UTF-16 or UTF-32 code point to a return value that can be
773    distinguished from a single-byte return value.  */
774 #define UNICODE(code) (0x100 + (code))
775 
776 /* Test a return value of phase7_getuc whether it designates an UTF-16 or
777    UTF-32 code point.  */
778 #define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
779 
780 /* Extract the UTF-16 or UTF-32 code of a return value that satisfies
781    IS_UNICODE.  */
782 #define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
783 
784 
785 /* ========================== Reading of tokens.  ========================== */
786 
787 
/* The kinds of token.  */
typedef enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_lbracket,          /* [ */
  token_type_rbracket,          /* ] */
  token_type_string,            /* "abc", 'abc', """abc""", '''abc''' */
  token_type_symbol,            /* symbol, number */
  token_type_plus,              /* + */
  token_type_other              /* misc. operator */
} token_type_ty;
802 
803 typedef struct token_ty token_ty;
804 struct token_ty
805 {
806   token_type_ty type;
807   char *string;                         /* for token_type_symbol */
808   mixed_string_ty *mixed_string;        /* for token_type_string */
809   refcounted_string_list_ty *comment;   /* for token_type_string */
810   int line_number;
811 };
812 
813 /* Free the memory pointed to by a 'struct token_ty'.  */
814 static inline void
free_token(token_ty * tp)815 free_token (token_ty *tp)
816 {
817   if (tp->type == token_type_symbol)
818     free (tp->string);
819   if (tp->type == token_type_string)
820     {
821       mixed_string_free (tp->mixed_string);
822       drop_reference (tp->comment);
823     }
824 }
825 
826 
827 /* There are two different input syntaxes for strings, "abc" and r"abc",
828    and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
829    Which escape sequences are understood, i.e. what is interpreted specially
830    after backslash?
831     "abc"     \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
832     r"abc"
833     u"abc"    \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
834     ur"abc"                                           \unnnn
835    The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
836    \unnnn items.  The \ooo and \xnn values are in the current source encoding
837    for byte strings, and Unicode code points for Unicode strings.
838  */
839 
840 static int
phase7_getuc(int quote_char,bool triple,bool interpret_ansic,bool interpret_unicode,unsigned int * backslash_counter)841 phase7_getuc (int quote_char,
842               bool triple, bool interpret_ansic, bool interpret_unicode,
843               unsigned int *backslash_counter)
844 {
845   int c;
846 
847   for (;;)
848     {
849       /* Use phase 2, because phase 3 elides comments.  */
850       c = phase2_getc ();
851 
852       if (c == UEOF)
853         return P7_EOF;
854 
855       if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
856         {
857           if (triple)
858             {
859               int c1 = phase2_getc ();
860               if (c1 == quote_char)
861                 {
862                   int c2 = phase2_getc ();
863                   if (c2 == quote_char)
864                     return P7_STRING_END;
865                   phase2_ungetc (c2);
866                 }
867               phase2_ungetc (c1);
868               return UNICODE (c);
869             }
870           else
871             return P7_STRING_END;
872         }
873 
874       if (c == '\n')
875         {
876           if (triple)
877             {
878               *backslash_counter = 0;
879               return UNICODE ('\n');
880             }
881           /* In r"..." and ur"..." strings, newline is only allowed
882              immediately after an odd number of backslashes (although the
883              backslashes are not interpreted!).  */
884           if (!(interpret_ansic || (*backslash_counter & 1) == 0))
885             {
886               *backslash_counter = 0;
887               return UNICODE ('\n');
888             }
889           phase2_ungetc (c);
890           error_with_progname = false;
891           error (0, 0, _("%s:%d: warning: unterminated string"),
892                  logical_file_name, line_number);
893           error_with_progname = true;
894           return P7_STRING_END;
895         }
896 
897       if (c != '\\')
898         {
899           *backslash_counter = 0;
900           return UNICODE (c);
901         }
902 
903       /* Backslash handling.  */
904 
905       if (!interpret_ansic && !interpret_unicode)
906         {
907           ++*backslash_counter;
908           return UNICODE ('\\');
909         }
910 
911       /* Dispatch according to the character following the backslash.  */
912       c = phase2_getc ();
913       if (c == UEOF)
914         {
915           ++*backslash_counter;
916           return UNICODE ('\\');
917         }
918 
919       if (interpret_ansic)
920         switch (c)
921           {
922           case '\n':
923             continue;
924           case '\\':
925             ++*backslash_counter;
926             return UNICODE (c);
927           case '\'': case '"':
928             *backslash_counter = 0;
929             return UNICODE (c);
930           case 'a':
931             *backslash_counter = 0;
932             return UNICODE ('\a');
933           case 'b':
934             *backslash_counter = 0;
935             return UNICODE ('\b');
936           case 'f':
937             *backslash_counter = 0;
938             return UNICODE ('\f');
939           case 'n':
940             *backslash_counter = 0;
941             return UNICODE ('\n');
942           case 'r':
943             *backslash_counter = 0;
944             return UNICODE ('\r');
945           case 't':
946             *backslash_counter = 0;
947             return UNICODE ('\t');
948           case 'v':
949             *backslash_counter = 0;
950             return UNICODE ('\v');
951           case '0': case '1': case '2': case '3': case '4':
952           case '5': case '6': case '7':
953             {
954               int n = c - '0';
955 
956               c = phase2_getc ();
957               if (c != UEOF)
958                 {
959                   if (c >= '0' && c <= '7')
960                     {
961                       n = (n << 3) + (c - '0');
962                       c = phase2_getc ();
963                       if (c != UEOF)
964                         {
965                           if (c >= '0' && c <= '7')
966                             n = (n << 3) + (c - '0');
967                           else
968                             phase2_ungetc (c);
969                         }
970                     }
971                   else
972                     phase2_ungetc (c);
973                 }
974               *backslash_counter = 0;
975               if (interpret_unicode)
976                 return UNICODE (n);
977               else
978                 return (unsigned char) n;
979             }
980           case 'x':
981             {
982               int c1 = phase2_getc ();
983               int n1;
984 
985               if (c1 >= '0' && c1 <= '9')
986                 n1 = c1 - '0';
987               else if (c1 >= 'A' && c1 <= 'F')
988                 n1 = c1 - 'A' + 10;
989               else if (c1 >= 'a' && c1 <= 'f')
990                 n1 = c1 - 'a' + 10;
991               else
992                 n1 = -1;
993 
994               if (n1 >= 0)
995                 {
996                   int c2 = phase2_getc ();
997                   int n2;
998 
999                   if (c2 >= '0' && c2 <= '9')
1000                     n2 = c2 - '0';
1001                   else if (c2 >= 'A' && c2 <= 'F')
1002                     n2 = c2 - 'A' + 10;
1003                   else if (c2 >= 'a' && c2 <= 'f')
1004                     n2 = c2 - 'a' + 10;
1005                   else
1006                     n2 = -1;
1007 
1008                   if (n2 >= 0)
1009                     {
1010                       int n = (n1 << 4) + n2;
1011                       *backslash_counter = 0;
1012                       if (interpret_unicode)
1013                         return UNICODE (n);
1014                       else
1015                         return (unsigned char) n;
1016                     }
1017 
1018                   phase2_ungetc (c2);
1019                 }
1020               phase2_ungetc (c1);
1021               phase2_ungetc (c);
1022               ++*backslash_counter;
1023               return UNICODE ('\\');
1024             }
1025           }
1026 
1027       if (interpret_unicode)
1028         {
1029           if (c == 'u')
1030             {
1031               unsigned char buf[4];
1032               unsigned int n = 0;
1033               int i;
1034 
1035               for (i = 0; i < 4; i++)
1036                 {
1037                   int c1 = phase2_getc ();
1038 
1039                   if (c1 >= '0' && c1 <= '9')
1040                     n = (n << 4) + (c1 - '0');
1041                   else if (c1 >= 'A' && c1 <= 'F')
1042                     n = (n << 4) + (c1 - 'A' + 10);
1043                   else if (c1 >= 'a' && c1 <= 'f')
1044                     n = (n << 4) + (c1 - 'a' + 10);
1045                   else
1046                     {
1047                       phase2_ungetc (c1);
1048                       while (--i >= 0)
1049                         phase2_ungetc (buf[i]);
1050                       phase2_ungetc (c);
1051                       ++*backslash_counter;
1052                       return UNICODE ('\\');
1053                     }
1054 
1055                   buf[i] = c1;
1056                 }
1057               *backslash_counter = 0;
1058               return UNICODE (n);
1059             }
1060 
1061           if (interpret_ansic)
1062             {
1063               if (c == 'U')
1064                 {
1065                   unsigned char buf[8];
1066                   unsigned int n = 0;
1067                   int i;
1068 
1069                   for (i = 0; i < 8; i++)
1070                     {
1071                       int c1 = phase2_getc ();
1072 
1073                       if (c1 >= '0' && c1 <= '9')
1074                         n = (n << 4) + (c1 - '0');
1075                       else if (c1 >= 'A' && c1 <= 'F')
1076                         n = (n << 4) + (c1 - 'A' + 10);
1077                       else if (c1 >= 'a' && c1 <= 'f')
1078                         n = (n << 4) + (c1 - 'a' + 10);
1079                       else
1080                         {
1081                           phase2_ungetc (c1);
1082                           while (--i >= 0)
1083                             phase2_ungetc (buf[i]);
1084                           phase2_ungetc (c);
1085                           ++*backslash_counter;
1086                           return UNICODE ('\\');
1087                         }
1088 
1089                       buf[i] = c1;
1090                     }
1091                   if (n < 0x110000)
1092                     {
1093                       *backslash_counter = 0;
1094                       return UNICODE (n);
1095                     }
1096 
1097                   error_with_progname = false;
1098                   error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1099                          logical_file_name, line_number);
1100                   error_with_progname = true;
1101 
1102                   while (--i >= 0)
1103                     phase2_ungetc (buf[i]);
1104                   phase2_ungetc (c);
1105                   ++*backslash_counter;
1106                   return UNICODE ('\\');
1107                 }
1108 
1109               if (c == 'N')
1110                 {
1111                   int c1 = phase2_getc ();
1112                   if (c1 == '{')
1113                     {
1114                       unsigned char buf[UNINAME_MAX + 1];
1115                       int i;
1116                       unsigned int n;
1117 
1118                       for (i = 0; i < UNINAME_MAX; i++)
1119                         {
1120                           int c2 = phase2_getc ();
1121                           if (!(c2 >= ' ' && c2 <= '~'))
1122                             {
1123                               phase2_ungetc (c2);
1124                               while (--i >= 0)
1125                                 phase2_ungetc (buf[i]);
1126                               phase2_ungetc (c1);
1127                               phase2_ungetc (c);
1128                               ++*backslash_counter;
1129                               return UNICODE ('\\');
1130                             }
1131                           if (c2 == '}')
1132                             break;
1133                           buf[i] = c2;
1134                         }
1135                       buf[i] = '\0';
1136 
1137                       n = unicode_name_character ((char *) buf);
1138                       if (n != UNINAME_INVALID)
1139                         {
1140                           *backslash_counter = 0;
1141                           return UNICODE (n);
1142                         }
1143 
1144                       phase2_ungetc ('}');
1145                       while (--i >= 0)
1146                         phase2_ungetc (buf[i]);
1147                     }
1148                   phase2_ungetc (c1);
1149                   phase2_ungetc (c);
1150                   ++*backslash_counter;
1151                   return UNICODE ('\\');
1152                 }
1153             }
1154         }
1155 
1156       phase2_ungetc (c);
1157       ++*backslash_counter;
1158       return UNICODE ('\\');
1159     }
1160 }
1161 
1162 
1163 /* Combine characters into tokens.  Discard whitespace except newlines at
1164    the end of logical lines.  */
1165 
/* Number of pending open parentheses/braces/brackets.  phase5_get counts it
   up on '(' '[' '{' and down (never below zero) on ')' ']' '}', and drops
   newlines while it is positive.  extract_python resets it to 0.  */
static int open_pbb;

/* Tokens pushed back by phase5_unget.  phase5_get hands them out again in
   last-in-first-out order.  There are two slots because x_python_lex may
   push back two tokens in a row: the token that followed a '+', and then
   the '+' token itself.  */
static token_ty phase5_pushback[2];
static int phase5_pushback_length;
1171 
1172 static void
phase5_get(token_ty * tp)1173 phase5_get (token_ty *tp)
1174 {
1175   int c;
1176 
1177   if (phase5_pushback_length)
1178     {
1179       *tp = phase5_pushback[--phase5_pushback_length];
1180       return;
1181     }
1182 
1183   for (;;)
1184     {
1185       tp->line_number = line_number;
1186       c = phase3_getc ();
1187 
1188       switch (c)
1189         {
1190         case UEOF:
1191           tp->type = token_type_eof;
1192           return;
1193 
1194         case ' ':
1195         case '\t':
1196         case '\f':
1197           /* Ignore whitespace and comments.  */
1198           continue;
1199 
1200         case '\n':
1201           if (last_non_comment_line > last_comment_line)
1202             savable_comment_reset ();
1203           /* Ignore newline if and only if it is used for implicit line
1204              joining.  */
1205           if (open_pbb > 0)
1206             continue;
1207           tp->type = token_type_other;
1208           return;
1209         }
1210 
1211       last_non_comment_line = tp->line_number;
1212 
1213       switch (c)
1214         {
1215         case '.':
1216           {
1217             int c1 = phase3_getc ();
1218             phase3_ungetc (c1);
1219             if (!(c1 >= '0' && c1 <= '9'))
1220               {
1221 
1222                 tp->type = token_type_other;
1223                 return;
1224               }
1225           }
1226           /* FALLTHROUGH */
1227         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1228         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1229         case 'M': case 'N': case 'O': case 'P': case 'Q':
1230         case 'S': case 'T':           case 'V': case 'W': case 'X':
1231         case 'Y': case 'Z':
1232         case '_':
1233         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1234         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1235         case 'm': case 'n': case 'o': case 'p': case 'q':
1236         case 's': case 't':           case 'v': case 'w': case 'x':
1237         case 'y': case 'z':
1238         case '0': case '1': case '2': case '3': case '4':
1239         case '5': case '6': case '7': case '8': case '9':
1240         symbol:
1241           /* Symbol, or part of a number.  */
1242           {
1243             static char *buffer;
1244             static int bufmax;
1245             int bufpos;
1246 
1247             bufpos = 0;
1248             for (;;)
1249               {
1250                 if (bufpos >= bufmax)
1251                   {
1252                     bufmax = 2 * bufmax + 10;
1253                     buffer = xrealloc (buffer, bufmax);
1254                   }
1255                 buffer[bufpos++] = c;
1256                 c = phase3_getc ();
1257                 switch (c)
1258                   {
1259                   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1260                   case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1261                   case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1262                   case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1263                   case 'Y': case 'Z':
1264                   case '_':
1265                   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1266                   case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1267                   case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1268                   case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1269                   case 'y': case 'z':
1270                   case '0': case '1': case '2': case '3': case '4':
1271                   case '5': case '6': case '7': case '8': case '9':
1272                     continue;
1273                   default:
1274                     phase3_ungetc (c);
1275                     break;
1276                   }
1277                 break;
1278               }
1279             if (bufpos >= bufmax)
1280               {
1281                 bufmax = 2 * bufmax + 10;
1282                 buffer = xrealloc (buffer, bufmax);
1283               }
1284             buffer[bufpos] = '\0';
1285             tp->string = xstrdup (buffer);
1286             tp->type = token_type_symbol;
1287             return;
1288           }
1289 
1290         /* Strings.  */
1291           {
1292             int quote_char;
1293             bool interpret_ansic;
1294             bool interpret_unicode;
1295             bool triple;
1296             unsigned int backslash_counter;
1297 
1298             case 'R': case 'r':
1299               {
1300                 int c1 = phase2_getc ();
1301                 if (c1 == '"' || c1 == '\'')
1302                   {
1303                     quote_char = c1;
1304                     interpret_ansic = false;
1305                     interpret_unicode = false;
1306                     goto string;
1307                   }
1308                 phase2_ungetc (c1);
1309                 goto symbol;
1310               }
1311 
1312             case 'U': case 'u':
1313               {
1314                 int c1 = phase2_getc ();
1315                 if (c1 == '"' || c1 == '\'')
1316                   {
1317                     quote_char = c1;
1318                     interpret_ansic = true;
1319                     interpret_unicode = true;
1320                     goto string;
1321                   }
1322                 if (c1 == 'R' || c1 == 'r')
1323                   {
1324                     int c2 = phase2_getc ();
1325                     if (c2 == '"' || c2 == '\'')
1326                       {
1327                         quote_char = c2;
1328                         interpret_ansic = false;
1329                         interpret_unicode = true;
1330                         goto string;
1331                       }
1332                     phase2_ungetc (c2);
1333                   }
1334                 phase2_ungetc (c1);
1335                 goto symbol;
1336               }
1337 
1338             case '"': case '\'':
1339               quote_char = c;
1340               interpret_ansic = true;
1341               interpret_unicode = false;
1342             string:
1343               triple = false;
1344               lexical_context = lc_string;
1345               {
1346                 int c1 = phase2_getc ();
1347                 if (c1 == quote_char)
1348                   {
1349                     int c2 = phase2_getc ();
1350                     if (c2 == quote_char)
1351                       triple = true;
1352                     else
1353                       {
1354                         phase2_ungetc (c2);
1355                         phase2_ungetc (c1);
1356                       }
1357                   }
1358                 else
1359                   phase2_ungetc (c1);
1360               }
1361               backslash_counter = 0;
1362               {
1363                 struct mixed_string_buffer msb;
1364 
1365                 /* Start accumulating the string.  */
1366                 mixed_string_buffer_init (&msb, lexical_context,
1367                                           logical_file_name, line_number);
1368                 for (;;)
1369                   {
1370                     int uc = phase7_getuc (quote_char, triple, interpret_ansic,
1371                                            interpret_unicode, &backslash_counter);
1372 
1373                     /* Keep line_number in sync.  */
1374                     msb.line_number = line_number;
1375 
1376                     if (uc == P7_EOF || uc == P7_STRING_END)
1377                       break;
1378 
1379                     if (IS_UNICODE (uc))
1380                       {
1381                         assert (UNICODE_VALUE (uc) >= 0
1382                                 && UNICODE_VALUE (uc) < 0x110000);
1383                         mixed_string_buffer_append_unicode (&msb,
1384                                                             UNICODE_VALUE (uc));
1385                       }
1386                     else
1387                       mixed_string_buffer_append_char (&msb, uc);
1388                   }
1389                 tp->mixed_string = mixed_string_buffer_result (&msb);
1390                 tp->comment = add_reference (savable_comment);
1391                 lexical_context = lc_outside;
1392                 tp->type = token_type_string;
1393               }
1394               return;
1395           }
1396 
1397         case '(':
1398           open_pbb++;
1399           tp->type = token_type_lparen;
1400           return;
1401 
1402         case ')':
1403           if (open_pbb > 0)
1404             open_pbb--;
1405           tp->type = token_type_rparen;
1406           return;
1407 
1408         case ',':
1409           tp->type = token_type_comma;
1410           return;
1411 
1412         case '[': case '{':
1413           open_pbb++;
1414           tp->type = (c == '[' ? token_type_lbracket : token_type_other);
1415           return;
1416 
1417         case ']': case '}':
1418           if (open_pbb > 0)
1419             open_pbb--;
1420           tp->type = (c == ']' ? token_type_rbracket : token_type_other);
1421           return;
1422 
1423         case '+':
1424           tp->type = token_type_plus;
1425           return;
1426 
1427         default:
1428           /* We could carefully recognize each of the 2 and 3 character
1429              operators, but it is not necessary, as we only need to recognize
1430              gettext invocations.  Don't bother.  */
1431           tp->type = token_type_other;
1432           return;
1433         }
1434     }
1435 }
1436 
1437 /* Supports only one pushback token.  */
1438 static void
phase5_unget(token_ty * tp)1439 phase5_unget (token_ty *tp)
1440 {
1441   if (tp->type != token_type_eof)
1442     {
1443       if (phase5_pushback_length == SIZEOF (phase5_pushback))
1444         abort ();
1445       phase5_pushback[phase5_pushback_length++] = *tp;
1446     }
1447 }
1448 
1449 
1450 /* Combine adjacent strings to form a single string.  Note that the end
1451    of a logical line appears as a token of its own, therefore strings that
1452    belong to different logical lines will not be concatenated.  */
1453 
1454 static void
x_python_lex(token_ty * tp)1455 x_python_lex (token_ty *tp)
1456 {
1457   phase5_get (tp);
1458   if (tp->type == token_type_string)
1459     {
1460       mixed_string_ty *sum = tp->mixed_string;
1461 
1462       for (;;)
1463         {
1464           token_ty token2;
1465           token_ty token3;
1466           token_ty *tp2 = NULL;
1467 
1468           phase5_get (&token2);
1469           switch (token2.type)
1470             {
1471             case token_type_plus:
1472               {
1473                 phase5_get (&token3);
1474                 if (token3.type == token_type_string)
1475                   {
1476                     free_token (&token2);
1477                     tp2 = &token3;
1478                   }
1479                 else
1480                   phase5_unget (&token3);
1481               }
1482               break;
1483             case token_type_string:
1484               tp2 = &token2;
1485               break;
1486             default:
1487               break;
1488             }
1489 
1490           if (tp2)
1491             {
1492               sum = mixed_string_concat_free1 (sum, tp2->mixed_string);
1493 
1494               free_token (tp2);
1495               continue;
1496             }
1497           phase5_unget (&token2);
1498           break;
1499         }
1500       tp->mixed_string = sum;
1501     }
1502 }
1503 
1504 
1505 /* ========================= Extracting strings.  ========================== */
1506 
1507 
/* Context lookup table.  extract_python sets it from its flag_table
   argument.  extract_balanced looks up each symbol in it to obtain the
   context iterator used if the next token is a '('.  */
static flag_context_list_table_ty *flag_context_list_table;
1510 
1511 
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Python program, and leave the complaints about
   the grammar to the Python interpreter.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */
1525 
1526 
1527 /* Extract messages until the next balanced closing parenthesis or bracket.
1528    Extracted messages are added to MLP.
1529    DELIM can be either token_type_rparen or token_type_rbracket, or
1530    token_type_eof to accept both.
1531    Return true upon eof, false upon closing parenthesis or bracket.  */
1532 static bool
extract_balanced(message_list_ty * mlp,token_type_ty delim,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1533 extract_balanced (message_list_ty *mlp,
1534                   token_type_ty delim,
1535                   flag_context_ty outer_context,
1536                   flag_context_list_iterator_ty context_iter,
1537                   struct arglist_parser *argparser)
1538 {
1539   /* Current argument number.  */
1540   int arg = 1;
1541   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1542   int state;
1543   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1544   const struct callshapes *next_shapes = NULL;
1545   /* Context iterator that will be used if the next token is a '('.  */
1546   flag_context_list_iterator_ty next_context_iter =
1547     passthrough_context_list_iterator;
1548   /* Current context.  */
1549   flag_context_ty inner_context =
1550     inherited_context (outer_context,
1551                        flag_context_list_iterator_advance (&context_iter));
1552 
1553   /* Start state is 0.  */
1554   state = 0;
1555 
1556   for (;;)
1557     {
1558       token_ty token;
1559 
1560       x_python_lex (&token);
1561       switch (token.type)
1562         {
1563         case token_type_symbol:
1564           {
1565             void *keyword_value;
1566 
1567             if (hash_find_entry (&keywords, token.string, strlen (token.string),
1568                                  &keyword_value)
1569                 == 0)
1570               {
1571                 next_shapes = (const struct callshapes *) keyword_value;
1572                 state = 1;
1573               }
1574             else
1575               state = 0;
1576           }
1577           next_context_iter =
1578             flag_context_list_iterator (
1579               flag_context_list_table_lookup (
1580                 flag_context_list_table,
1581                 token.string, strlen (token.string)));
1582           free (token.string);
1583           continue;
1584 
1585         case token_type_lparen:
1586           if (extract_balanced (mlp, token_type_rparen,
1587                                 inner_context, next_context_iter,
1588                                 arglist_parser_alloc (mlp,
1589                                                       state ? next_shapes : NULL)))
1590             {
1591               arglist_parser_done (argparser, arg);
1592               return true;
1593             }
1594           next_context_iter = null_context_list_iterator;
1595           state = 0;
1596           continue;
1597 
1598         case token_type_rparen:
1599           if (delim == token_type_rparen || delim == token_type_eof)
1600             {
1601               arglist_parser_done (argparser, arg);
1602               return false;
1603             }
1604           next_context_iter = null_context_list_iterator;
1605           state = 0;
1606           continue;
1607 
1608         case token_type_comma:
1609           arg++;
1610           inner_context =
1611             inherited_context (outer_context,
1612                                flag_context_list_iterator_advance (
1613                                  &context_iter));
1614           next_context_iter = passthrough_context_list_iterator;
1615           state = 0;
1616           continue;
1617 
1618         case token_type_lbracket:
1619           if (extract_balanced (mlp, token_type_rbracket,
1620                                 null_context, null_context_list_iterator,
1621                                 arglist_parser_alloc (mlp, NULL)))
1622             {
1623               arglist_parser_done (argparser, arg);
1624               return true;
1625             }
1626           next_context_iter = null_context_list_iterator;
1627           state = 0;
1628           continue;
1629 
1630         case token_type_rbracket:
1631           if (delim == token_type_rbracket || delim == token_type_eof)
1632             {
1633               arglist_parser_done (argparser, arg);
1634               return false;
1635             }
1636           next_context_iter = null_context_list_iterator;
1637           state = 0;
1638           continue;
1639 
1640         case token_type_string:
1641           {
1642             lex_pos_ty pos;
1643 
1644             pos.file_name = logical_file_name;
1645             pos.line_number = token.line_number;
1646 
1647             if (extract_all)
1648               {
1649                 char *string = mixed_string_contents (token.mixed_string);
1650                 mixed_string_free (token.mixed_string);
1651                 remember_a_message (mlp, NULL, string, true, false,
1652                                     inner_context, &pos,
1653                                     NULL, token.comment, true);
1654               }
1655             else
1656               arglist_parser_remember (argparser, arg, token.mixed_string,
1657                                        inner_context,
1658                                        pos.file_name, pos.line_number,
1659                                        token.comment, true);
1660           }
1661           drop_reference (token.comment);
1662           next_context_iter = null_context_list_iterator;
1663           state = 0;
1664           continue;
1665 
1666         case token_type_eof:
1667           arglist_parser_done (argparser, arg);
1668           return true;
1669 
1670         case token_type_plus:
1671         case token_type_other:
1672           next_context_iter = null_context_list_iterator;
1673           state = 0;
1674           continue;
1675 
1676         default:
1677           abort ();
1678         }
1679     }
1680 }
1681 
1682 
1683 void
extract_python(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1684 extract_python (FILE *f,
1685                 const char *real_filename, const char *logical_filename,
1686                 flag_context_list_table_ty *flag_table,
1687                 msgdomain_list_ty *mdlp)
1688 {
1689   message_list_ty *mlp = mdlp->item[0]->messages;
1690 
1691   fp = f;
1692   real_file_name = real_filename;
1693   logical_file_name = xstrdup (logical_filename);
1694   line_number = 1;
1695 
1696   phase1_pushback_length = 0;
1697 
1698   lexical_context = lc_outside;
1699 
1700   phase2_pushback_length = 0;
1701 
1702   last_comment_line = -1;
1703   last_non_comment_line = -1;
1704 
1705   /* For Python, the default source file encoding is UTF-8.  This is specified
1706      in PEP 3120.  */
1707   xgettext_current_file_source_encoding =
1708    (xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding :
1709     po_charset_utf8);
1710 #if HAVE_ICONV
1711   xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1712 #endif
1713 
1714   xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1715 #if HAVE_ICONV
1716   xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1717 #endif
1718 
1719   continuation_or_nonblank_line = false;
1720 
1721   open_pbb = 0;
1722 
1723   phase5_pushback_length = 0;
1724 
1725   flag_context_list_table = flag_table;
1726 
1727   init_keywords ();
1728 
1729   /* Eat tokens until eof is seen.  When extract_balanced returns
1730      due to an unbalanced closing parenthesis, just restart it.  */
1731   while (!extract_balanced (mlp, token_type_eof,
1732                             null_context, null_context_list_iterator,
1733                             arglist_parser_alloc (mlp, NULL)))
1734     ;
1735 
1736   fp = NULL;
1737   real_file_name = NULL;
1738   logical_file_name = NULL;
1739   line_number = 0;
1740 }
1741