1 /* xgettext Python backend.
2 Copyright (C) 2002-2003, 2005-2011, 2013-2014, 2018-2020 Free Software Foundation, Inc.
3
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-python.h"
25
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32
33 #include "message.h"
34 #include "rc-str-list.h"
35 #include "xgettext.h"
36 #include "xg-pos.h"
37 #include "xg-encoding.h"
38 #include "xg-mixed-string.h"
39 #include "xg-arglist-context.h"
40 #include "xg-arglist-callshape.h"
41 #include "xg-arglist-parser.h"
42 #include "xg-message.h"
43 #include "error.h"
44 #include "error-progname.h"
45 #include "progname.h"
46 #include "basename-lgpl.h"
47 #include "xerror.h"
48 #include "xvasprintf.h"
49 #include "xalloc.h"
50 #include "c-strstr.h"
51 #include "c-ctype.h"
52 #include "po-charset.h"
53 #include "uniname.h"
54 #include "unistr.h"
55 #include "gettext.h"
56
/* Shorthand for translating a user-visible message through gettext.  */
#define _(s) gettext(s)

/* Larger of A and B.  Each argument may be evaluated twice, so do not pass
   expressions with side effects.  */
#define max(a,b) ((a) > (b) ? (a) : (b))

/* Number of elements of the array A.  A must be a real array, not a pointer.
   The argument is parenthesized before it is subscripted, so that the
   expansion does not depend on the precedence of operators inside A.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
62
63
64 /* The Python syntax is defined in the Python Reference Manual
65 /usr/share/doc/packages/python/html/ref/index.html.
66 See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
67 Python-2.0/Objects/unicodeobject.c. */
68
69
70 /* ====================== Keyword set customization. ====================== */
71
/* If true extract all strings.  Set by x_python_extract_all.  */
static bool extract_all = false;

/* Table of registered keywords.  It is created lazily by x_python_keyword
   on the first registration.  */
static hash_table keywords;
/* True as long as the built-in keywords still have to be installed.
   Cleared by x_python_keyword (NULL) and by init_keywords once the built-in
   keywords have been registered.  */
static bool default_keywords = true;
77
78
79 void
x_python_extract_all()80 x_python_extract_all ()
81 {
82 extract_all = true;
83 }
84
85
86 void
x_python_keyword(const char * name)87 x_python_keyword (const char *name)
88 {
89 if (name == NULL)
90 default_keywords = false;
91 else
92 {
93 const char *end;
94 struct callshape shape;
95 const char *colon;
96
97 if (keywords.table == NULL)
98 hash_init (&keywords, 100);
99
100 split_keywordspec (name, &end, &shape);
101
102 /* The characters between name and end should form a valid C identifier.
103 A colon means an invalid parse in split_keywordspec(). */
104 colon = strchr (name, ':');
105 if (colon == NULL || colon >= end)
106 insert_keyword_callshape (&keywords, name, end - name, &shape);
107 }
108 }
109
110 /* Finish initializing the keywords hash table.
111 Called after argument processing, before each file is processed. */
112 static void
init_keywords()113 init_keywords ()
114 {
115 if (default_keywords)
116 {
117 /* When adding new keywords here, also update the documentation in
118 xgettext.texi! */
119 x_python_keyword ("gettext");
120 x_python_keyword ("ugettext");
121 x_python_keyword ("dgettext:2");
122 x_python_keyword ("ngettext:1,2");
123 x_python_keyword ("ungettext:1,2");
124 x_python_keyword ("dngettext:2,3");
125 x_python_keyword ("_");
126 default_keywords = false;
127 }
128 }
129
/* Record the format-string flags of the built-in keywords: first all
   pass-python-format entries, then all pass-python-brace-format entries.  */
void
init_flag_table_python ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-python-format",
      "ugettext:1:pass-python-format",
      "dgettext:2:pass-python-format",
      "ngettext:1:pass-python-format",
      "ngettext:2:pass-python-format",
      "ungettext:1:pass-python-format",
      "ungettext:2:pass-python-format",
      "dngettext:2:pass-python-format",
      "dngettext:3:pass-python-format",
      "_:1:pass-python-format",
      /* No "%:1:python-format" entry: % is an infix operator!  */

      "gettext:1:pass-python-brace-format",
      "ugettext:1:pass-python-brace-format",
      "dgettext:2:pass-python-brace-format",
      "ngettext:1:pass-python-brace-format",
      "ngettext:2:pass-python-brace-format",
      "ungettext:1:pass-python-brace-format",
      "ungettext:2:pass-python-brace-format",
      "dngettext:2:pass-python-brace-format",
      "dngettext:3:pass-python-brace-format",
      "_:1:pass-python-brace-format"
      /* "format:1:python-brace-format" is not recorded either.  */
    };
  size_t k;

  for (k = 0; k < sizeof flag_specs / sizeof flag_specs[0]; k++)
    xgettext_record_flag (flag_specs[k]);
}
157
158
159 /* ======================== Reading of characters. ======================== */
160
/* The input file stream.  phase0_getc and phase0_ungetc access it through
   getc and ungetc.  */
static FILE *fp;
163
164
165 /* 0. Terminate line by \n, regardless whether the external
166 representation of a line terminator is CR (Mac), and CR/LF
167 (DOS/Windows), as Python treats them equally. */
168 static int
phase0_getc()169 phase0_getc ()
170 {
171 int c;
172
173 c = getc (fp);
174 if (c == EOF)
175 {
176 if (ferror (fp))
177 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
178 real_file_name);
179 return EOF;
180 }
181
182 if (c == '\r')
183 {
184 int c1 = getc (fp);
185
186 if (c1 != EOF && c1 != '\n')
187 ungetc (c1, fp);
188
189 /* Seen line terminator CR or CR/LF. */
190 return '\n';
191 }
192
193 return c;
194 }
195
196 /* Supports only one pushback character, and not '\n'. */
197 static inline void
phase0_ungetc(int c)198 phase0_ungetc (int c)
199 {
200 if (c != EOF)
201 ungetc (c, fp);
202 }
203
204
205 /* 1. line_number handling. */
206
/* Maximum used, roughly a safer MB_LEN_MAX.  */
#define MAX_PHASE1_PUSHBACK 16
/* Bytes pushed back by phase1_ungetc.  Used as a stack: phase1_getc hands
   out the most recently pushed byte first.  */
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
/* Number of bytes currently stored in phase1_pushback.  */
static int phase1_pushback_length;
211
212 /* Read the next single byte from the input file. */
213 static int
phase1_getc()214 phase1_getc ()
215 {
216 int c;
217
218 if (phase1_pushback_length)
219 c = phase1_pushback[--phase1_pushback_length];
220 else
221 c = phase0_getc ();
222
223 if (c == '\n')
224 ++line_number;
225
226 return c;
227 }
228
229 /* Supports MAX_PHASE1_PUSHBACK characters of pushback. */
230 static void
phase1_ungetc(int c)231 phase1_ungetc (int c)
232 {
233 if (c != EOF)
234 {
235 if (c == '\n')
236 --line_number;
237
238 if (phase1_pushback_length == SIZEOF (phase1_pushback))
239 abort ();
240 phase1_pushback[phase1_pushback_length++] = c;
241 }
242 }
243
244
245 /* Phase 2: Conversion to Unicode.
246 This is done early because PEP 0263 specifies that conversion to Unicode
247 conceptually occurs before tokenization. A test case where it matters
248 is with encodings like BIG5: when a double-byte character ending in 0x5C
249 is followed by '\' or 'u0021', the tokenizer must not treat the second
250 half of the double-byte character as a backslash. */
251
/* End-of-file indicator for functions returning an UCS-4 character.  */
#define UEOF -1

/* Lexical context that phase2_getc passes to non_ascii_error_message when
   it reports a non-ASCII byte.  comment_line_end resets it to lc_outside.  */
static lexical_context_ty lexical_context;

/* Characters pushed back by phase2_ungetc.  Used as a stack: phase2_getc
   hands out the most recently pushed character first.  */
static int phase2_pushback[max (9, UNINAME_MAX + 3)];
/* Number of characters currently stored in phase2_pushback.  */
static int phase2_pushback_length;
259
/* Read the next Unicode UCS-4 character from the input file.
   Pushed-back characters are delivered first.  Otherwise bytes are taken
   from phase1_getc and decoded according to
   xgettext_current_source_encoding: ASCII, UTF-8, or any other encoding via
   iconv.  Returns UEOF at end of input.  Input that cannot be decoded is
   reported as an error and terminates the program.  */
static int
phase2_getc ()
{
  if (phase2_pushback_length)
    return phase2_pushback[--phase2_pushback_length];

  if (xgettext_current_source_encoding == po_charset_ascii)
    {
      /* ASCII: every byte is a character; a non-ASCII byte is fatal.  */
      int c = phase1_getc ();
      if (c == EOF)
        return UEOF;
      if (!c_isascii (c))
        {
          multiline_error (xstrdup (""),
                           xasprintf ("%s\n%s\n",
                                      non_ascii_error_message (lexical_context,
                                                               real_file_name,
                                                               line_number),
                                      _("\
Please specify the source encoding through --from-code or through a comment\n\
as specified in https://www.python.org/peps/pep-0263.html.\n")));
          exit (EXIT_FAILURE);
        }
      return c;
    }
  else if (xgettext_current_source_encoding != po_charset_utf8)
    {
#if HAVE_ICONV
      /* Use iconv on an increasing number of bytes.  Read only as many bytes
         through phase1_getc as needed.  This is needed to give reasonable
         interactive behaviour when fp is connected to an interactive tty.  */
      unsigned char buf[MAX_PHASE1_PUSHBACK];
      size_t bufcount;
      int c = phase1_getc ();
      if (c == EOF)
        return UEOF;
      buf[0] = (unsigned char) c;
      bufcount = 1;

      for (;;)
        {
          /* Each round converts the bytes collected so far into
             scratchbuf, which is later decoded as UTF-8.  */
          unsigned char scratchbuf[6];
          const char *inptr = (const char *) &buf[0];
          size_t insize = bufcount;
          char *outptr = (char *) &scratchbuf[0];
          size_t outsize = sizeof (scratchbuf);

          size_t res = iconv (xgettext_current_source_iconv,
                              (ICONV_CONST char **) &inptr, &insize,
                              &outptr, &outsize);
          /* We expect that a character has been produced if and only if
             some input bytes have been consumed.  */
          if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
            abort ();
          if (outsize == sizeof (scratchbuf))
            {
              /* No character has been produced.  Must be an error.  */
              if (res != (size_t)(-1))
                abort ();

              if (errno == EILSEQ)
                {
                  /* An invalid multibyte sequence was encountered.  */
                  goto invalid;
                }
              else if (errno == EINVAL)
                {
                  /* An incomplete multibyte character.  */
                  int c;

                  if (bufcount == MAX_PHASE1_PUSHBACK)
                    {
                      /* An overlong incomplete multibyte sequence was
                         encountered.  */
                      multiline_error (xstrdup (""),
                                       xasprintf (_("\
%s:%d: Long incomplete multibyte sequence.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                                       real_file_name, line_number));
                      exit (EXIT_FAILURE);
                    }

                  /* Read one more byte and retry iconv.  */
                  c = phase1_getc ();
                  if (c == EOF)
                    goto incomplete_at_eof;
                  if (c == '\n')
                    goto incomplete_at_eol;
                  buf[bufcount++] = (unsigned char) c;
                }
              else
                error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
                       real_file_name, line_number);
            }
          else
            {
              size_t outbytes = sizeof (scratchbuf) - outsize;
              size_t bytes = bufcount - insize;
              ucs4_t uc;

              /* We expect that one character has been produced.  */
              if (bytes == 0)
                abort ();
              if (outbytes == 0)
                abort ();
              /* Push back the unused bytes.  */
              while (insize > 0)
                phase1_ungetc (buf[--insize]);
              /* Convert the character from UTF-8 to UCS-4.  */
              if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes)
                {
                  /* scratchbuf contains an out-of-range Unicode character
                     (> 0x10ffff).  */
                  goto invalid;
                }
              return uc;
            }
        }
#else
      /* If we don't have iconv(), the only supported values for
         xgettext_global_source_encoding and thus also for
         xgettext_current_source_encoding are ASCII and UTF-8.  */
      abort ();
#endif
    }
  else
    {
      /* Read an UTF-8 encoded character.
         Reject invalid input, like u8_mbtouc does.  */
      int c;
      ucs4_t uc;

      c = phase1_getc ();
      if (c == EOF)
        return UEOF;
      if (c < 0x80)
        {
          /* Single byte.  */
          uc = c;
        }
      else if (c < 0xc2)
        /* A continuation byte, or a lead byte of an overlong 2-byte form.  */
        goto invalid;
      else if (c < 0xe0)
        {
          /* Lead byte of a 2-byte sequence.  */
          int c1 = phase1_getc ();
          if (c1 == EOF)
            goto incomplete_at_eof;
          if (c1 == '\n')
            goto incomplete_at_eol;
          if ((c1 ^ 0x80) < 0x40)
            uc = ((unsigned int) (c & 0x1f) << 6)
                 | (unsigned int) (c1 ^ 0x80);
          else
            goto invalid;
        }
      else if (c < 0xf0)
        {
          /* Lead byte of a 3-byte sequence.  The conditions on c1 reject
             overlong forms (lead 0xe0) and surrogates (lead 0xed).  */
          int c1 = phase1_getc ();
          if (c1 == EOF)
            goto incomplete_at_eof;
          if (c1 == '\n')
            goto incomplete_at_eol;
          if ((c1 ^ 0x80) < 0x40
              && (c >= 0xe1 || c1 >= 0xa0)
              && (c != 0xed || c1 < 0xa0))
            {
              int c2 = phase1_getc ();
              if (c2 == EOF)
                goto incomplete_at_eof;
              if (c2 == '\n')
                goto incomplete_at_eol;
              if ((c2 ^ 0x80) < 0x40)
                uc = ((unsigned int) (c & 0x0f) << 12)
                     | ((unsigned int) (c1 ^ 0x80) << 6)
                     | (unsigned int) (c2 ^ 0x80);
              else
                goto invalid;
            }
          else
            goto invalid;
        }
      else if (c < 0xf8)
        {
          /* Lead byte of a 4-byte sequence.  The conditions on c1 reject
             overlong forms (lead 0xf0) and values above 0x10ffff.  */
          int c1 = phase1_getc ();
          if (c1 == EOF)
            goto incomplete_at_eof;
          if (c1 == '\n')
            goto incomplete_at_eol;
          if ((c1 ^ 0x80) < 0x40
              && (c >= 0xf1 || c1 >= 0x90)
              && (c < 0xf4 || (c == 0xf4 && c1 < 0x90)))
            {
              int c2 = phase1_getc ();
              if (c2 == EOF)
                goto incomplete_at_eof;
              if (c2 == '\n')
                goto incomplete_at_eol;
              if ((c2 ^ 0x80) < 0x40)
                {
                  int c3 = phase1_getc ();
                  if (c3 == EOF)
                    goto incomplete_at_eof;
                  if (c3 == '\n')
                    goto incomplete_at_eol;
                  if ((c3 ^ 0x80) < 0x40)
                    uc = ((unsigned int) (c & 0x07) << 18)
                         | ((unsigned int) (c1 ^ 0x80) << 12)
                         | ((unsigned int) (c2 ^ 0x80) << 6)
                         | (unsigned int) (c3 ^ 0x80);
                  else
                    goto invalid;
                }
              else
                goto invalid;
            }
          else
            goto invalid;
        }
      else
        goto invalid;

      return uc;
    }

 invalid:
  /* An invalid multibyte sequence was encountered.  */
  multiline_error (xstrdup (""),
                   xasprintf (_("\
%s:%d: Invalid multibyte sequence.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                   real_file_name, line_number));
  exit (EXIT_FAILURE);

 incomplete_at_eof:
  /* The input ended in the middle of a multibyte character.  */
  multiline_error (xstrdup (""),
                   xasprintf (_("\
%s:%d: Incomplete multibyte sequence at end of file.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                   real_file_name, line_number));
  exit (EXIT_FAILURE);

 incomplete_at_eol:
  /* A newline was read in the middle of a multibyte character.  phase1_getc
     has already counted that newline, hence line_number - 1.  */
  multiline_error (xstrdup (""),
                   xasprintf (_("\
%s:%d: Incomplete multibyte sequence at end of line.\n\
Please specify the correct source encoding through --from-code or through a\n\
comment as specified in https://www.python.org/peps/pep-0263.html.\n"),
                   real_file_name, line_number - 1));
  exit (EXIT_FAILURE);
}
513
514 /* Supports max (9, UNINAME_MAX + 3) pushback characters. */
515 static void
phase2_ungetc(int c)516 phase2_ungetc (int c)
517 {
518 if (c != UEOF)
519 {
520 if (phase2_pushback_length == SIZEOF (phase2_pushback))
521 abort ();
522 phase2_pushback[phase2_pushback_length++] = c;
523 }
524 }
525
526
527 /* ========================= Accumulating strings. ======================== */
528
529 /* See xg-mixed-string.h for the API. */
530
531
532 /* ======================== Accumulating comments. ======================== */
533
534
/* Accumulating a single comment line.  */

/* Buffer holding the text of the comment line that is currently being
   read.  It is initialized by comment_start, filled by comment_add and
   consumed by comment_line_end.  */
static struct mixed_string_buffer comment_buffer;
538
539 static inline void
comment_start()540 comment_start ()
541 {
542 mixed_string_buffer_init (&comment_buffer, lc_comment,
543 logical_file_name, line_number);
544 }
545
546 static inline bool
comment_at_start()547 comment_at_start ()
548 {
549 return mixed_string_buffer_is_empty (&comment_buffer);
550 }
551
552 static inline void
comment_add(int c)553 comment_add (int c)
554 {
555 mixed_string_buffer_append_unicode (&comment_buffer, c);
556 }
557
558 static inline const char *
comment_line_end()559 comment_line_end ()
560 {
561 char *buffer =
562 mixed_string_contents_free1 (mixed_string_buffer_result (&comment_buffer));
563 size_t buflen = strlen (buffer);
564
565 while (buflen >= 1
566 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
567 --buflen;
568 buffer[buflen] = '\0';
569 savable_comment_add (buffer);
570 lexical_context = lc_outside;
571 return buffer;
572 }
573
574
/* These are for tracking whether comments count as immediately before
   keyword.  */
/* Line on which the most recent '#' comment started; set by phase3_getc.  */
static int last_comment_line;
/* Line of the most recent token that is neither whitespace nor a newline;
   set by phase5_get.  */
static int last_non_comment_line;
579
580
581 /* ======================== Recognizing comments. ======================== */
582
583
/* Recognizing the "coding" comment.
   As specified in PEP 0263, it takes the form
     "coding" [":"|"="] {alphanumeric or "-" or "_" or "."}*
   or
     "set" "fileencoding" "=" {alphanumeric or "-" or "_" or "."}*
   and is located in a comment in a line that
     - is either the first or second line,
     - is not a continuation line,
     - in the first form, contains no other tokens except this comment.  */
593
/* Canonicalized encoding name for the current input file.
   Assigned by set_current_file_source_encoding.  */
static const char *xgettext_current_file_source_encoding;

#if HAVE_ICONV
/* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
   ASCII or UTF-8, when this conversion is a no-op).
   Only assigned by set_current_file_source_encoding when the encoding is
   neither ASCII nor UTF-8.  */
static iconv_t xgettext_current_file_source_iconv;
#endif
602
/* Make CANON_ENCODING the source encoding of the current input file and
   activate it for the reader.
   CANON_ENCODING is compared by pointer against po_charset_ascii and
   po_charset_utf8, so it must be a canonical name (the visible caller
   obtains it from po_charset_canonicalize).  For any other encoding a
   converter to UTF-8 is opened with iconv_open; if that is not possible,
   the program terminates with an error.  */
static inline void
set_current_file_source_encoding (const char *canon_encoding)
{
  xgettext_current_file_source_encoding = canon_encoding;

  if (xgettext_current_file_source_encoding != po_charset_ascii
      && xgettext_current_file_source_encoding != po_charset_utf8)
    {
#if HAVE_ICONV
      iconv_t cd;

      /* Avoid glibc-2.1 bug with EUC-KR.  */
# if ((__GLIBC__ == 2 && __GLIBC_MINOR__ <= 1) && !defined __UCLIBC__) \
     && !defined _LIBICONV_VERSION
      if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
        cd = (iconv_t)(-1);
      else
# endif
      cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
      /* The error position is line_number - 1, the same line the caller
         uses when it reports an unknown encoding.  */
      if (cd == (iconv_t)(-1))
        error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1,
                       _("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), and iconv() does not support this conversion."),
                       xgettext_current_file_source_encoding, po_charset_utf8,
                       last_component (program_name));
      xgettext_current_file_source_iconv = cd;
#else
      error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1,
                     _("Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). This version was built without iconv()."),
                     xgettext_current_file_source_encoding, po_charset_utf8,
                     last_component (program_name));
#endif
    }

  /* Publish the per-file settings as the current source encoding, which is
     what phase2_getc consults.  */
  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
#if HAVE_ICONV
  xgettext_current_source_iconv = xgettext_current_file_source_iconv;
#endif
}
641
642 static inline void
try_to_extract_coding(const char * comment)643 try_to_extract_coding (const char *comment)
644 {
645 const char *p = c_strstr (comment, "coding");
646
647 if (p != NULL)
648 {
649 p += 6;
650 if (*p == ':' || *p == '=')
651 {
652 p++;
653 while (*p == ' ' || *p == '\t')
654 p++;
655 {
656 const char *encoding_start = p;
657
658 while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
659 p++;
660 {
661 const char *encoding_end = p;
662
663 if (encoding_end > encoding_start)
664 {
665 /* Extract the encoding string. */
666 size_t encoding_len = encoding_end - encoding_start;
667 char *encoding = XNMALLOC (encoding_len + 1, char);
668
669 memcpy (encoding, encoding_start, encoding_len);
670 encoding[encoding_len] = '\0';
671
672 {
673 /* Canonicalize it. */
674 const char *canon_encoding = po_charset_canonicalize (encoding);
675 if (canon_encoding == NULL)
676 {
677 error_at_line (0, 0,
678 logical_file_name, line_number - 1,
679 _("Unknown encoding \"%s\". Proceeding with ASCII instead."),
680 encoding);
681 canon_encoding = po_charset_ascii;
682 }
683
684 /* Activate it. */
685 set_current_file_source_encoding (canon_encoding);
686 }
687
688 free (encoding);
689 }
690 }
691 }
692 }
693 }
694 }
695
/* Tracking whether the current line is a continuation line or contains a
   non-blank character.  Maintained by phase3_getc, which only looks for a
   "coding" declaration in a comment while this flag is false.  */
static bool continuation_or_nonblank_line;
699
700
701 /* Phase 3: Outside strings, replace backslash-newline with nothing and a
702 comment with nothing. */
703
704 static int
phase3_getc()705 phase3_getc ()
706 {
707 int c;
708
709 for (;;)
710 {
711 c = phase2_getc ();
712 if (c == '\\')
713 {
714 c = phase2_getc ();
715 if (c != '\n')
716 {
717 phase2_ungetc (c);
718 /* This shouldn't happen usually, because "A backslash is
719 illegal elsewhere on a line outside a string literal." */
720 return '\\';
721 }
722 /* Eat backslash-newline. */
723 continuation_or_nonblank_line = true;
724 }
725 else if (c == '#')
726 {
727 /* Eat a comment. */
728 const char *comment;
729
730 last_comment_line = line_number;
731 comment_start ();
732 for (;;)
733 {
734 c = phase2_getc ();
735 if (c == UEOF || c == '\n')
736 break;
737 /* We skip all leading white space, but not EOLs. */
738 if (!(comment_at_start () && (c == ' ' || c == '\t')))
739 comment_add (c);
740 }
741 comment = comment_line_end ();
742 if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
743 try_to_extract_coding (comment);
744 continuation_or_nonblank_line = false;
745 return c;
746 }
747 else
748 {
749 if (c == '\n')
750 continuation_or_nonblank_line = false;
751 else if (!(c == ' ' || c == '\t' || c == '\f'))
752 continuation_or_nonblank_line = true;
753 return c;
754 }
755 }
756 }
757
/* Supports only one pushback character.
   The character is handed to the phase 2 pushback buffer.  */
static void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
764
765
766 /* ========================= Accumulating strings. ======================== */
767
/* Return value of phase7_getuc when EOF is reached.  */
#define P7_EOF (-1)
/* Return value of phase7_getuc when the string literal ends: at the closing
   quote(s), or at a newline that leaves the string unterminated.  */
#define P7_STRING_END (-2)

/* Convert an UTF-16 or UTF-32 code point to a return value that can be
   distinguished from a single-byte return value.  */
#define UNICODE(code) (0x100 + (code))

/* Test a return value of phase7_getuc whether it designates an UTF-16 or
   UTF-32 code point.  */
#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)

/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
   IS_UNICODE.  */
#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
783
784
785 /* ========================== Reading of tokens. ========================== */
786
787
/* The kinds of tokens.  */
enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_lbracket,          /* [ */
  token_type_rbracket,          /* ] */
  token_type_string,            /* "abc", 'abc', """abc""", '''abc''' */
  token_type_symbol,            /* symbol, number */
  token_type_plus,              /* + */
  token_type_other              /* misc. operator */
};
typedef enum token_type_ty token_type_ty;

/* A token.  Which of the members string, mixed_string and comment are
   meaningful depends on type; free_token releases them accordingly.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;                         /* for token_type_symbol */
  mixed_string_ty *mixed_string;        /* for token_type_string */
  refcounted_string_list_ty *comment;   /* for token_type_string */
  int line_number;                      /* line on which the token was read */
};
812
813 /* Free the memory pointed to by a 'struct token_ty'. */
814 static inline void
free_token(token_ty * tp)815 free_token (token_ty *tp)
816 {
817 if (tp->type == token_type_symbol)
818 free (tp->string);
819 if (tp->type == token_type_string)
820 {
821 mixed_string_free (tp->mixed_string);
822 drop_reference (tp->comment);
823 }
824 }
825
826
827 /* There are two different input syntaxes for strings, "abc" and r"abc",
828 and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
829 Which escape sequences are understood, i.e. what is interpreted specially
830 after backslash?
831 "abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
832 r"abc"
833 u"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
834 ur"abc" \unnnn
835 The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
836 \unnnn items. The \ooo and \xnn values are in the current source encoding
837 for byte strings, and Unicode code points for Unicode strings.
838 */
839
840 static int
phase7_getuc(int quote_char,bool triple,bool interpret_ansic,bool interpret_unicode,unsigned int * backslash_counter)841 phase7_getuc (int quote_char,
842 bool triple, bool interpret_ansic, bool interpret_unicode,
843 unsigned int *backslash_counter)
844 {
845 int c;
846
847 for (;;)
848 {
849 /* Use phase 2, because phase 3 elides comments. */
850 c = phase2_getc ();
851
852 if (c == UEOF)
853 return P7_EOF;
854
855 if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
856 {
857 if (triple)
858 {
859 int c1 = phase2_getc ();
860 if (c1 == quote_char)
861 {
862 int c2 = phase2_getc ();
863 if (c2 == quote_char)
864 return P7_STRING_END;
865 phase2_ungetc (c2);
866 }
867 phase2_ungetc (c1);
868 return UNICODE (c);
869 }
870 else
871 return P7_STRING_END;
872 }
873
874 if (c == '\n')
875 {
876 if (triple)
877 {
878 *backslash_counter = 0;
879 return UNICODE ('\n');
880 }
881 /* In r"..." and ur"..." strings, newline is only allowed
882 immediately after an odd number of backslashes (although the
883 backslashes are not interpreted!). */
884 if (!(interpret_ansic || (*backslash_counter & 1) == 0))
885 {
886 *backslash_counter = 0;
887 return UNICODE ('\n');
888 }
889 phase2_ungetc (c);
890 error_with_progname = false;
891 error (0, 0, _("%s:%d: warning: unterminated string"),
892 logical_file_name, line_number);
893 error_with_progname = true;
894 return P7_STRING_END;
895 }
896
897 if (c != '\\')
898 {
899 *backslash_counter = 0;
900 return UNICODE (c);
901 }
902
903 /* Backslash handling. */
904
905 if (!interpret_ansic && !interpret_unicode)
906 {
907 ++*backslash_counter;
908 return UNICODE ('\\');
909 }
910
911 /* Dispatch according to the character following the backslash. */
912 c = phase2_getc ();
913 if (c == UEOF)
914 {
915 ++*backslash_counter;
916 return UNICODE ('\\');
917 }
918
919 if (interpret_ansic)
920 switch (c)
921 {
922 case '\n':
923 continue;
924 case '\\':
925 ++*backslash_counter;
926 return UNICODE (c);
927 case '\'': case '"':
928 *backslash_counter = 0;
929 return UNICODE (c);
930 case 'a':
931 *backslash_counter = 0;
932 return UNICODE ('\a');
933 case 'b':
934 *backslash_counter = 0;
935 return UNICODE ('\b');
936 case 'f':
937 *backslash_counter = 0;
938 return UNICODE ('\f');
939 case 'n':
940 *backslash_counter = 0;
941 return UNICODE ('\n');
942 case 'r':
943 *backslash_counter = 0;
944 return UNICODE ('\r');
945 case 't':
946 *backslash_counter = 0;
947 return UNICODE ('\t');
948 case 'v':
949 *backslash_counter = 0;
950 return UNICODE ('\v');
951 case '0': case '1': case '2': case '3': case '4':
952 case '5': case '6': case '7':
953 {
954 int n = c - '0';
955
956 c = phase2_getc ();
957 if (c != UEOF)
958 {
959 if (c >= '0' && c <= '7')
960 {
961 n = (n << 3) + (c - '0');
962 c = phase2_getc ();
963 if (c != UEOF)
964 {
965 if (c >= '0' && c <= '7')
966 n = (n << 3) + (c - '0');
967 else
968 phase2_ungetc (c);
969 }
970 }
971 else
972 phase2_ungetc (c);
973 }
974 *backslash_counter = 0;
975 if (interpret_unicode)
976 return UNICODE (n);
977 else
978 return (unsigned char) n;
979 }
980 case 'x':
981 {
982 int c1 = phase2_getc ();
983 int n1;
984
985 if (c1 >= '0' && c1 <= '9')
986 n1 = c1 - '0';
987 else if (c1 >= 'A' && c1 <= 'F')
988 n1 = c1 - 'A' + 10;
989 else if (c1 >= 'a' && c1 <= 'f')
990 n1 = c1 - 'a' + 10;
991 else
992 n1 = -1;
993
994 if (n1 >= 0)
995 {
996 int c2 = phase2_getc ();
997 int n2;
998
999 if (c2 >= '0' && c2 <= '9')
1000 n2 = c2 - '0';
1001 else if (c2 >= 'A' && c2 <= 'F')
1002 n2 = c2 - 'A' + 10;
1003 else if (c2 >= 'a' && c2 <= 'f')
1004 n2 = c2 - 'a' + 10;
1005 else
1006 n2 = -1;
1007
1008 if (n2 >= 0)
1009 {
1010 int n = (n1 << 4) + n2;
1011 *backslash_counter = 0;
1012 if (interpret_unicode)
1013 return UNICODE (n);
1014 else
1015 return (unsigned char) n;
1016 }
1017
1018 phase2_ungetc (c2);
1019 }
1020 phase2_ungetc (c1);
1021 phase2_ungetc (c);
1022 ++*backslash_counter;
1023 return UNICODE ('\\');
1024 }
1025 }
1026
1027 if (interpret_unicode)
1028 {
1029 if (c == 'u')
1030 {
1031 unsigned char buf[4];
1032 unsigned int n = 0;
1033 int i;
1034
1035 for (i = 0; i < 4; i++)
1036 {
1037 int c1 = phase2_getc ();
1038
1039 if (c1 >= '0' && c1 <= '9')
1040 n = (n << 4) + (c1 - '0');
1041 else if (c1 >= 'A' && c1 <= 'F')
1042 n = (n << 4) + (c1 - 'A' + 10);
1043 else if (c1 >= 'a' && c1 <= 'f')
1044 n = (n << 4) + (c1 - 'a' + 10);
1045 else
1046 {
1047 phase2_ungetc (c1);
1048 while (--i >= 0)
1049 phase2_ungetc (buf[i]);
1050 phase2_ungetc (c);
1051 ++*backslash_counter;
1052 return UNICODE ('\\');
1053 }
1054
1055 buf[i] = c1;
1056 }
1057 *backslash_counter = 0;
1058 return UNICODE (n);
1059 }
1060
1061 if (interpret_ansic)
1062 {
1063 if (c == 'U')
1064 {
1065 unsigned char buf[8];
1066 unsigned int n = 0;
1067 int i;
1068
1069 for (i = 0; i < 8; i++)
1070 {
1071 int c1 = phase2_getc ();
1072
1073 if (c1 >= '0' && c1 <= '9')
1074 n = (n << 4) + (c1 - '0');
1075 else if (c1 >= 'A' && c1 <= 'F')
1076 n = (n << 4) + (c1 - 'A' + 10);
1077 else if (c1 >= 'a' && c1 <= 'f')
1078 n = (n << 4) + (c1 - 'a' + 10);
1079 else
1080 {
1081 phase2_ungetc (c1);
1082 while (--i >= 0)
1083 phase2_ungetc (buf[i]);
1084 phase2_ungetc (c);
1085 ++*backslash_counter;
1086 return UNICODE ('\\');
1087 }
1088
1089 buf[i] = c1;
1090 }
1091 if (n < 0x110000)
1092 {
1093 *backslash_counter = 0;
1094 return UNICODE (n);
1095 }
1096
1097 error_with_progname = false;
1098 error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1099 logical_file_name, line_number);
1100 error_with_progname = true;
1101
1102 while (--i >= 0)
1103 phase2_ungetc (buf[i]);
1104 phase2_ungetc (c);
1105 ++*backslash_counter;
1106 return UNICODE ('\\');
1107 }
1108
1109 if (c == 'N')
1110 {
1111 int c1 = phase2_getc ();
1112 if (c1 == '{')
1113 {
1114 unsigned char buf[UNINAME_MAX + 1];
1115 int i;
1116 unsigned int n;
1117
1118 for (i = 0; i < UNINAME_MAX; i++)
1119 {
1120 int c2 = phase2_getc ();
1121 if (!(c2 >= ' ' && c2 <= '~'))
1122 {
1123 phase2_ungetc (c2);
1124 while (--i >= 0)
1125 phase2_ungetc (buf[i]);
1126 phase2_ungetc (c1);
1127 phase2_ungetc (c);
1128 ++*backslash_counter;
1129 return UNICODE ('\\');
1130 }
1131 if (c2 == '}')
1132 break;
1133 buf[i] = c2;
1134 }
1135 buf[i] = '\0';
1136
1137 n = unicode_name_character ((char *) buf);
1138 if (n != UNINAME_INVALID)
1139 {
1140 *backslash_counter = 0;
1141 return UNICODE (n);
1142 }
1143
1144 phase2_ungetc ('}');
1145 while (--i >= 0)
1146 phase2_ungetc (buf[i]);
1147 }
1148 phase2_ungetc (c1);
1149 phase2_ungetc (c);
1150 ++*backslash_counter;
1151 return UNICODE ('\\');
1152 }
1153 }
1154 }
1155
1156 phase2_ungetc (c);
1157 ++*backslash_counter;
1158 return UNICODE ('\\');
1159 }
1160 }
1161
1162
1163 /* Combine characters into tokens. Discard whitespace except newlines at
1164 the end of logical lines. */
1165
/* Number of pending open parentheses/braces/brackets.  While it is
   positive, phase5_get ignores newlines (implicit line joining).  */
static int open_pbb;

/* Tokens pushed back into phase 5.  Used as a stack: phase5_get returns the
   most recently pushed token first.  */
static token_ty phase5_pushback[2];
/* Number of tokens currently stored in phase5_pushback.  */
static int phase5_pushback_length;
1171
1172 static void
phase5_get(token_ty * tp)1173 phase5_get (token_ty *tp)
1174 {
1175 int c;
1176
1177 if (phase5_pushback_length)
1178 {
1179 *tp = phase5_pushback[--phase5_pushback_length];
1180 return;
1181 }
1182
1183 for (;;)
1184 {
1185 tp->line_number = line_number;
1186 c = phase3_getc ();
1187
1188 switch (c)
1189 {
1190 case UEOF:
1191 tp->type = token_type_eof;
1192 return;
1193
1194 case ' ':
1195 case '\t':
1196 case '\f':
1197 /* Ignore whitespace and comments. */
1198 continue;
1199
1200 case '\n':
1201 if (last_non_comment_line > last_comment_line)
1202 savable_comment_reset ();
1203 /* Ignore newline if and only if it is used for implicit line
1204 joining. */
1205 if (open_pbb > 0)
1206 continue;
1207 tp->type = token_type_other;
1208 return;
1209 }
1210
1211 last_non_comment_line = tp->line_number;
1212
1213 switch (c)
1214 {
1215 case '.':
1216 {
1217 int c1 = phase3_getc ();
1218 phase3_ungetc (c1);
1219 if (!(c1 >= '0' && c1 <= '9'))
1220 {
1221
1222 tp->type = token_type_other;
1223 return;
1224 }
1225 }
1226 /* FALLTHROUGH */
1227 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1228 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1229 case 'M': case 'N': case 'O': case 'P': case 'Q':
1230 case 'S': case 'T': case 'V': case 'W': case 'X':
1231 case 'Y': case 'Z':
1232 case '_':
1233 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1234 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1235 case 'm': case 'n': case 'o': case 'p': case 'q':
1236 case 's': case 't': case 'v': case 'w': case 'x':
1237 case 'y': case 'z':
1238 case '0': case '1': case '2': case '3': case '4':
1239 case '5': case '6': case '7': case '8': case '9':
1240 symbol:
1241 /* Symbol, or part of a number. */
1242 {
1243 static char *buffer;
1244 static int bufmax;
1245 int bufpos;
1246
1247 bufpos = 0;
1248 for (;;)
1249 {
1250 if (bufpos >= bufmax)
1251 {
1252 bufmax = 2 * bufmax + 10;
1253 buffer = xrealloc (buffer, bufmax);
1254 }
1255 buffer[bufpos++] = c;
1256 c = phase3_getc ();
1257 switch (c)
1258 {
1259 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1260 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1261 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1262 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1263 case 'Y': case 'Z':
1264 case '_':
1265 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1266 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1267 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1268 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1269 case 'y': case 'z':
1270 case '0': case '1': case '2': case '3': case '4':
1271 case '5': case '6': case '7': case '8': case '9':
1272 continue;
1273 default:
1274 phase3_ungetc (c);
1275 break;
1276 }
1277 break;
1278 }
1279 if (bufpos >= bufmax)
1280 {
1281 bufmax = 2 * bufmax + 10;
1282 buffer = xrealloc (buffer, bufmax);
1283 }
1284 buffer[bufpos] = '\0';
1285 tp->string = xstrdup (buffer);
1286 tp->type = token_type_symbol;
1287 return;
1288 }
1289
1290 /* Strings. */
1291 {
1292 int quote_char;
1293 bool interpret_ansic;
1294 bool interpret_unicode;
1295 bool triple;
1296 unsigned int backslash_counter;
1297
1298 case 'R': case 'r':
1299 {
1300 int c1 = phase2_getc ();
1301 if (c1 == '"' || c1 == '\'')
1302 {
1303 quote_char = c1;
1304 interpret_ansic = false;
1305 interpret_unicode = false;
1306 goto string;
1307 }
1308 phase2_ungetc (c1);
1309 goto symbol;
1310 }
1311
1312 case 'U': case 'u':
1313 {
1314 int c1 = phase2_getc ();
1315 if (c1 == '"' || c1 == '\'')
1316 {
1317 quote_char = c1;
1318 interpret_ansic = true;
1319 interpret_unicode = true;
1320 goto string;
1321 }
1322 if (c1 == 'R' || c1 == 'r')
1323 {
1324 int c2 = phase2_getc ();
1325 if (c2 == '"' || c2 == '\'')
1326 {
1327 quote_char = c2;
1328 interpret_ansic = false;
1329 interpret_unicode = true;
1330 goto string;
1331 }
1332 phase2_ungetc (c2);
1333 }
1334 phase2_ungetc (c1);
1335 goto symbol;
1336 }
1337
1338 case '"': case '\'':
1339 quote_char = c;
1340 interpret_ansic = true;
1341 interpret_unicode = false;
1342 string:
1343 triple = false;
1344 lexical_context = lc_string;
1345 {
1346 int c1 = phase2_getc ();
1347 if (c1 == quote_char)
1348 {
1349 int c2 = phase2_getc ();
1350 if (c2 == quote_char)
1351 triple = true;
1352 else
1353 {
1354 phase2_ungetc (c2);
1355 phase2_ungetc (c1);
1356 }
1357 }
1358 else
1359 phase2_ungetc (c1);
1360 }
1361 backslash_counter = 0;
1362 {
1363 struct mixed_string_buffer msb;
1364
1365 /* Start accumulating the string. */
1366 mixed_string_buffer_init (&msb, lexical_context,
1367 logical_file_name, line_number);
1368 for (;;)
1369 {
1370 int uc = phase7_getuc (quote_char, triple, interpret_ansic,
1371 interpret_unicode, &backslash_counter);
1372
1373 /* Keep line_number in sync. */
1374 msb.line_number = line_number;
1375
1376 if (uc == P7_EOF || uc == P7_STRING_END)
1377 break;
1378
1379 if (IS_UNICODE (uc))
1380 {
1381 assert (UNICODE_VALUE (uc) >= 0
1382 && UNICODE_VALUE (uc) < 0x110000);
1383 mixed_string_buffer_append_unicode (&msb,
1384 UNICODE_VALUE (uc));
1385 }
1386 else
1387 mixed_string_buffer_append_char (&msb, uc);
1388 }
1389 tp->mixed_string = mixed_string_buffer_result (&msb);
1390 tp->comment = add_reference (savable_comment);
1391 lexical_context = lc_outside;
1392 tp->type = token_type_string;
1393 }
1394 return;
1395 }
1396
1397 case '(':
1398 open_pbb++;
1399 tp->type = token_type_lparen;
1400 return;
1401
1402 case ')':
1403 if (open_pbb > 0)
1404 open_pbb--;
1405 tp->type = token_type_rparen;
1406 return;
1407
1408 case ',':
1409 tp->type = token_type_comma;
1410 return;
1411
1412 case '[': case '{':
1413 open_pbb++;
1414 tp->type = (c == '[' ? token_type_lbracket : token_type_other);
1415 return;
1416
1417 case ']': case '}':
1418 if (open_pbb > 0)
1419 open_pbb--;
1420 tp->type = (c == ']' ? token_type_rbracket : token_type_other);
1421 return;
1422
1423 case '+':
1424 tp->type = token_type_plus;
1425 return;
1426
1427 default:
1428 /* We could carefully recognize each of the 2 and 3 character
1429 operators, but it is not necessary, as we only need to recognize
1430 gettext invocations. Don't bother. */
1431 tp->type = token_type_other;
1432 return;
1433 }
1434 }
1435 }
1436
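/* Push token *TP back, so that the next phase5_get returns it.  Up to
   SIZEOF (phase5_pushback) tokens may be pending at once; x_python_lex
   needs two.  */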
1438 static void
phase5_unget(token_ty * tp)1439 phase5_unget (token_ty *tp)
1440 {
1441 if (tp->type != token_type_eof)
1442 {
1443 if (phase5_pushback_length == SIZEOF (phase5_pushback))
1444 abort ();
1445 phase5_pushback[phase5_pushback_length++] = *tp;
1446 }
1447 }
1448
1449
/* Combine adjacent strings, and strings joined by a '+' operator, to form
   a single string.  Note that the end of a logical line appears as a token
   of its own, therefore strings that belong to different logical lines will
   not be concatenated.  */
1453
1454 static void
x_python_lex(token_ty * tp)1455 x_python_lex (token_ty *tp)
1456 {
1457 phase5_get (tp);
1458 if (tp->type == token_type_string)
1459 {
1460 mixed_string_ty *sum = tp->mixed_string;
1461
1462 for (;;)
1463 {
1464 token_ty token2;
1465 token_ty token3;
1466 token_ty *tp2 = NULL;
1467
1468 phase5_get (&token2);
1469 switch (token2.type)
1470 {
1471 case token_type_plus:
1472 {
1473 phase5_get (&token3);
1474 if (token3.type == token_type_string)
1475 {
1476 free_token (&token2);
1477 tp2 = &token3;
1478 }
1479 else
1480 phase5_unget (&token3);
1481 }
1482 break;
1483 case token_type_string:
1484 tp2 = &token2;
1485 break;
1486 default:
1487 break;
1488 }
1489
1490 if (tp2)
1491 {
1492 sum = mixed_string_concat_free1 (sum, tp2->mixed_string);
1493
1494 free_token (tp2);
1495 continue;
1496 }
1497 phase5_unget (&token2);
1498 break;
1499 }
1500 tp->mixed_string = sum;
1501 }
1502 }
1503
1504
1505 /* ========================= Extracting strings. ========================== */
1506
1507
1508 /* Context lookup table. */
1509 static flag_context_list_table_ty *flag_context_list_table;
1510
1511
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Python program, and leave the complaints about
   the grammar to the Python interpreter.
1517
1518 Normal handling: Look for
1519 keyword ( ... msgid ... )
1520 Plural handling: Look for
1521 keyword ( ... msgid ... msgid_plural ... )
1522
1523 We use recursion because the arguments before msgid or between msgid
1524 and msgid_plural can contain subexpressions of the same form. */
1525
1526
1527 /* Extract messages until the next balanced closing parenthesis or bracket.
1528 Extracted messages are added to MLP.
1529 DELIM can be either token_type_rparen or token_type_rbracket, or
1530 token_type_eof to accept both.
1531 Return true upon eof, false upon closing parenthesis or bracket. */
1532 static bool
extract_balanced(message_list_ty * mlp,token_type_ty delim,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1533 extract_balanced (message_list_ty *mlp,
1534 token_type_ty delim,
1535 flag_context_ty outer_context,
1536 flag_context_list_iterator_ty context_iter,
1537 struct arglist_parser *argparser)
1538 {
1539 /* Current argument number. */
1540 int arg = 1;
1541 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1542 int state;
1543 /* Parameters of the keyword just seen. Defined only in state 1. */
1544 const struct callshapes *next_shapes = NULL;
1545 /* Context iterator that will be used if the next token is a '('. */
1546 flag_context_list_iterator_ty next_context_iter =
1547 passthrough_context_list_iterator;
1548 /* Current context. */
1549 flag_context_ty inner_context =
1550 inherited_context (outer_context,
1551 flag_context_list_iterator_advance (&context_iter));
1552
1553 /* Start state is 0. */
1554 state = 0;
1555
1556 for (;;)
1557 {
1558 token_ty token;
1559
1560 x_python_lex (&token);
1561 switch (token.type)
1562 {
1563 case token_type_symbol:
1564 {
1565 void *keyword_value;
1566
1567 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1568 &keyword_value)
1569 == 0)
1570 {
1571 next_shapes = (const struct callshapes *) keyword_value;
1572 state = 1;
1573 }
1574 else
1575 state = 0;
1576 }
1577 next_context_iter =
1578 flag_context_list_iterator (
1579 flag_context_list_table_lookup (
1580 flag_context_list_table,
1581 token.string, strlen (token.string)));
1582 free (token.string);
1583 continue;
1584
1585 case token_type_lparen:
1586 if (extract_balanced (mlp, token_type_rparen,
1587 inner_context, next_context_iter,
1588 arglist_parser_alloc (mlp,
1589 state ? next_shapes : NULL)))
1590 {
1591 arglist_parser_done (argparser, arg);
1592 return true;
1593 }
1594 next_context_iter = null_context_list_iterator;
1595 state = 0;
1596 continue;
1597
1598 case token_type_rparen:
1599 if (delim == token_type_rparen || delim == token_type_eof)
1600 {
1601 arglist_parser_done (argparser, arg);
1602 return false;
1603 }
1604 next_context_iter = null_context_list_iterator;
1605 state = 0;
1606 continue;
1607
1608 case token_type_comma:
1609 arg++;
1610 inner_context =
1611 inherited_context (outer_context,
1612 flag_context_list_iterator_advance (
1613 &context_iter));
1614 next_context_iter = passthrough_context_list_iterator;
1615 state = 0;
1616 continue;
1617
1618 case token_type_lbracket:
1619 if (extract_balanced (mlp, token_type_rbracket,
1620 null_context, null_context_list_iterator,
1621 arglist_parser_alloc (mlp, NULL)))
1622 {
1623 arglist_parser_done (argparser, arg);
1624 return true;
1625 }
1626 next_context_iter = null_context_list_iterator;
1627 state = 0;
1628 continue;
1629
1630 case token_type_rbracket:
1631 if (delim == token_type_rbracket || delim == token_type_eof)
1632 {
1633 arglist_parser_done (argparser, arg);
1634 return false;
1635 }
1636 next_context_iter = null_context_list_iterator;
1637 state = 0;
1638 continue;
1639
1640 case token_type_string:
1641 {
1642 lex_pos_ty pos;
1643
1644 pos.file_name = logical_file_name;
1645 pos.line_number = token.line_number;
1646
1647 if (extract_all)
1648 {
1649 char *string = mixed_string_contents (token.mixed_string);
1650 mixed_string_free (token.mixed_string);
1651 remember_a_message (mlp, NULL, string, true, false,
1652 inner_context, &pos,
1653 NULL, token.comment, true);
1654 }
1655 else
1656 arglist_parser_remember (argparser, arg, token.mixed_string,
1657 inner_context,
1658 pos.file_name, pos.line_number,
1659 token.comment, true);
1660 }
1661 drop_reference (token.comment);
1662 next_context_iter = null_context_list_iterator;
1663 state = 0;
1664 continue;
1665
1666 case token_type_eof:
1667 arglist_parser_done (argparser, arg);
1668 return true;
1669
1670 case token_type_plus:
1671 case token_type_other:
1672 next_context_iter = null_context_list_iterator;
1673 state = 0;
1674 continue;
1675
1676 default:
1677 abort ();
1678 }
1679 }
1680 }
1681
1682
1683 void
extract_python(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1684 extract_python (FILE *f,
1685 const char *real_filename, const char *logical_filename,
1686 flag_context_list_table_ty *flag_table,
1687 msgdomain_list_ty *mdlp)
1688 {
1689 message_list_ty *mlp = mdlp->item[0]->messages;
1690
1691 fp = f;
1692 real_file_name = real_filename;
1693 logical_file_name = xstrdup (logical_filename);
1694 line_number = 1;
1695
1696 phase1_pushback_length = 0;
1697
1698 lexical_context = lc_outside;
1699
1700 phase2_pushback_length = 0;
1701
1702 last_comment_line = -1;
1703 last_non_comment_line = -1;
1704
1705 /* For Python, the default source file encoding is UTF-8. This is specified
1706 in PEP 3120. */
1707 xgettext_current_file_source_encoding =
1708 (xgettext_global_source_encoding != NULL ? xgettext_global_source_encoding :
1709 po_charset_utf8);
1710 #if HAVE_ICONV
1711 xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1712 #endif
1713
1714 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1715 #if HAVE_ICONV
1716 xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1717 #endif
1718
1719 continuation_or_nonblank_line = false;
1720
1721 open_pbb = 0;
1722
1723 phase5_pushback_length = 0;
1724
1725 flag_context_list_table = flag_table;
1726
1727 init_keywords ();
1728
1729 /* Eat tokens until eof is seen. When extract_balanced returns
1730 due to an unbalanced closing parenthesis, just restart it. */
1731 while (!extract_balanced (mlp, token_type_eof,
1732 null_context, null_context_list_iterator,
1733 arglist_parser_alloc (mlp, NULL)))
1734 ;
1735
1736 fp = NULL;
1737 real_file_name = NULL;
1738 logical_file_name = NULL;
1739 line_number = 0;
1740 }
1741