1 /* xgettext sh backend.
2 Copyright (C) 2003, 2005-2009, 2014, 2018-2020 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 #ifdef HAVE_CONFIG_H
19 # include "config.h"
20 #endif
21
22 /* Specification. */
23 #include "x-sh.h"
24
25 #include <errno.h>
26 #include <limits.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31
32 #include "message.h"
33 #include "xgettext.h"
34 #include "xg-pos.h"
35 #include "xg-mixed-string.h"
36 #include "xg-arglist-context.h"
37 #include "xg-arglist-callshape.h"
38 #include "xg-arglist-parser.h"
39 #include "xg-message.h"
40 #include "error.h"
41 #include "error-progname.h"
42 #include "xalloc.h"
43 #include "mem-hash-map.h"
44 #include "../../gettext-runtime/src/escapes.h"
45 #include "gettext.h"
46
/* Shorthand for a call to gettext ().  */
#define _(s) gettext(s)

/* Number of elements of the array A.
   A must be a real array, not a pointer.  The argument is parenthesized
   inside the expansion so that any array-valued expression can be passed
   without operator-precedence surprises.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
50
51
52 /* The sh syntax is defined in POSIX:2001, see
53 http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html
54 Summary of sh syntax:
55 - Input is broken into words, which are then subject to
56 - tilde expansion ~...
57 - command substitution `...`
58 - variable substitution $var
59 - arithmetic substitution $((...))
60 - field splitting at whitespace (IFS)
61 - wildcard pattern expansion *?
62 - quote removal
63 - Strings are enclosed in "..."; command substitution, variable
64 substitution and arithmetic substitution are performed here as well.
65 - '...' is a string without substitutions.
66 - The list of resulting words is split into commands by semicolon and
67 newline.
68 - '#' at the beginning of a word introduces a comment until end of line.
69 The parser is implemented in bash-2.05b/parse.y. */
70
71
72 /* ====================== Keyword set customization. ====================== */
73
/* If true extract all strings.
   Set by x_sh_extract_all (); consulted by read_command.  */
static bool extract_all = false;

/* Table of keyword call shapes.  It is initialized lazily by x_sh_keyword ()
   on the first keyword that gets registered.  */
static hash_table keywords;
/* True as long as the built-in default keywords still have to be registered
   by init_keywords ().  Cleared by x_sh_keyword (NULL) and by init_keywords ()
   itself once the defaults are in the table.  */
static bool default_keywords = true;
79
80
81 void
x_sh_extract_all()82 x_sh_extract_all ()
83 {
84 extract_all = true;
85 }
86
87
88 void
x_sh_keyword(const char * name)89 x_sh_keyword (const char *name)
90 {
91 if (name == NULL)
92 default_keywords = false;
93 else
94 {
95 const char *end;
96 struct callshape shape;
97 const char *colon;
98
99 if (keywords.table == NULL)
100 hash_init (&keywords, 100);
101
102 split_keywordspec (name, &end, &shape);
103
104 /* The characters between name and end should form a valid C identifier.
105 A colon means an invalid parse in split_keywordspec(). */
106 colon = strchr (name, ':');
107 if (colon == NULL || colon >= end)
108 insert_keyword_callshape (&keywords, name, end - name, &shape);
109 }
110 }
111
112 /* Finish initializing the keywords hash table.
113 Called after argument processing, before each file is processed. */
114 static void
init_keywords()115 init_keywords ()
116 {
117 if (default_keywords)
118 {
119 /* When adding new keywords here, also update the documentation in
120 xgettext.texi! */
121 x_sh_keyword ("gettext");
122 x_sh_keyword ("ngettext:1,2");
123 /* Note: There is also special handling for 'gettext' and 'ngettext'
124 in read_command, below. */
125 x_sh_keyword ("eval_gettext");
126 x_sh_keyword ("eval_ngettext:1,2");
127 x_sh_keyword ("eval_pgettext:1c,2");
128 x_sh_keyword ("eval_npgettext:1c,2,3");
129 default_keywords = false;
130 }
131 }
132
/* Record the format string flags of the known sh keywords.
   Each entry is passed unchanged, and in this order, to
   xgettext_record_flag ().  */
void
init_flag_table_sh ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-sh-format",
      "ngettext:1:pass-sh-format",
      "ngettext:2:pass-sh-format",
      "eval_gettext:1:sh-format",
      "eval_ngettext:1:sh-format",
      "eval_ngettext:2:sh-format",
      "eval_pgettext:2:sh-format",
      "eval_npgettext:2:sh-format",
      "eval_npgettext:3:sh-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
146
147
148 /* ======================== Reading of characters. ======================== */
149
150 /* The input file stream. */
151 static FILE *fp;
152
153
154 /* Fetch the next character from the input file. */
155 static int
do_getc()156 do_getc ()
157 {
158 int c = getc (fp);
159
160 if (c == EOF)
161 {
162 if (ferror (fp))
163 error (EXIT_FAILURE, errno,
164 _("error while reading \"%s\""), real_file_name);
165 }
166 else if (c == '\n')
167 line_number++;
168
169 return c;
170 }
171
172 /* Put back the last fetched character, not EOF. */
173 static void
do_ungetc(int c)174 do_ungetc (int c)
175 {
176 if (c == '\n')
177 line_number--;
178 ungetc (c, fp);
179 }
180
181
182 /* Remove backslash followed by newline from the input stream. */
183
184 static int phase1_pushback[1];
185 static int phase1_pushback_length;
186
187 static int
phase1_getc()188 phase1_getc ()
189 {
190 int c;
191
192 if (phase1_pushback_length)
193 {
194 c = phase1_pushback[--phase1_pushback_length];
195 if (c == '\n')
196 ++line_number;
197 return c;
198 }
199 for (;;)
200 {
201 c = do_getc ();
202 if (c != '\\')
203 return c;
204 c = do_getc ();
205 if (c != '\n')
206 {
207 if (c != EOF)
208 do_ungetc (c);
209 return '\\';
210 }
211 }
212 }
213
214 /* Supports only one pushback character. */
215 static void
phase1_ungetc(int c)216 phase1_ungetc (int c)
217 {
218 switch (c)
219 {
220 case EOF:
221 break;
222
223 case '\n':
224 --line_number;
225 /* FALLTHROUGH */
226
227 default:
228 if (phase1_pushback_length == SIZEOF (phase1_pushback))
229 abort ();
230 phase1_pushback[phase1_pushback_length++] = c;
231 break;
232 }
233 }
234
235
236 /* ========================== Reading of tokens. ========================== */
237
238
/* A token consists of a sequence of characters, kept in a growable buffer
   that is not NUL terminated.  */
struct token
{
  /* Number of 'char's allocated at 'chars'.  */
  int allocated;
  /* Number of 'char's in use at 'chars'.  */
  int charcount;
  /* The token's constituents.  */
  char *chars;
};
246
247 /* Initialize a 'struct token'. */
248 static inline void
init_token(struct token * tp)249 init_token (struct token *tp)
250 {
251 tp->allocated = 10;
252 tp->chars = XNMALLOC (tp->allocated, char);
253 tp->charcount = 0;
254 }
255
256 /* Free the memory pointed to by a 'struct token'. */
257 static inline void
free_token(struct token * tp)258 free_token (struct token *tp)
259 {
260 free (tp->chars);
261 }
262
263 /* Ensure there is enough room in the token for one more character. */
264 static inline void
grow_token(struct token * tp)265 grow_token (struct token *tp)
266 {
267 if (tp->charcount == tp->allocated)
268 {
269 tp->allocated *= 2;
270 tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
271 }
272 }
273
274 /* Convert a struct token * to a char*. */
275 static char *
string_of_token(const struct token * tp)276 string_of_token (const struct token *tp)
277 {
278 char *str;
279 int n;
280
281 n = tp->charcount;
282 str = XNMALLOC (n + 1, char);
283 memcpy (str, tp->chars, n);
284 str[n] = '\0';
285 return str;
286 }
287
288
289 /* ========================= Accumulating messages ========================= */
290
291
/* The message list that is handed to remember_a_message () for every
   extracted string.  */
static message_list_ty *mlp;
293
294
295 /* ========================= Accumulating comments ========================= */
296
297
/* Growable buffer holding the text of the comment line being read.  */
static char *buffer;
/* Number of bytes allocated at 'buffer'.  */
static size_t bufmax;
/* Number of bytes currently used in 'buffer'.  */
static size_t buflen;

/* Begin accumulating a new comment line: forget the previous contents.
   The allocation itself is kept for reuse.  */
static inline void
comment_start ()
{
  buflen = 0;
}
307
308 static inline void
comment_add(int c)309 comment_add (int c)
310 {
311 if (buflen >= bufmax)
312 {
313 bufmax = 2 * bufmax + 10;
314 buffer = xrealloc (buffer, bufmax);
315 }
316 buffer[buflen++] = c;
317 }
318
319 static inline void
comment_line_end()320 comment_line_end ()
321 {
322 while (buflen >= 1
323 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
324 --buflen;
325 if (buflen >= bufmax)
326 {
327 bufmax = 2 * bufmax + 10;
328 buffer = xrealloc (buffer, bufmax);
329 }
330 buffer[buflen] = '\0';
331 savable_comment_add (buffer);
332 }
333
334
/* These are for tracking whether comments count as immediately before
   keyword.
   read_word () stores the current line number in last_comment_line when it
   meets a '#' comment and in last_non_comment_line when it has read a word;
   at a newline, the saved comments are reset if the last non-comment line
   lies after the last comment line.  */
static int last_comment_line;
static int last_non_comment_line;
339
340
341 /* ========================= Debackslashification ========================== */
342
/* This state tracks the effect of backquotes, double-quotes and single-quotes
   on the parsing of backslashes.  We make a single pass through the input
   file, keeping the state up to date.  This is much faster than accumulating
   strings and processing them with explicit debackslashification, like the
   shell does it.  */

/* The number of nested `...` or "`...`" constructs.  Assumed to be <= 32.
   NOTE(review): this value is used as a shift count on 'unsigned int' in
   saw_opening_backquote () and saw_closing_backquote (); nothing in this
   part of the file enforces the assumed limit.  */
static unsigned int nested_backquotes;

/* A bit mask indicating which of the currently open `...` or "`...`"
   constructs is with double-quotes: "`...`".
   A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`.
   Bit position 0 designates the outermost backquotes nesting,
   bit position 1 the second-outermost backquotes nesting,
   ...
   bit position (nested_backquotes-1) the innermost backquotes nesting.  */
static unsigned int open_doublequotes_mask;

/* A bit indicating whether a double-quote is currently open inside the
   innermost backquotes nesting.  */
static bool open_doublequote;

/* A bit indicating whether a single-quote is currently open inside the
   innermost backquotes nesting.  */
static bool open_singlequote;

/* The expected terminator of the currently open single-quote.
   Usually '\'', but can be '"' for i18n-quotes.
   Only meaningful while open_singlequote is true.  */
static char open_singlequote_terminator;
372
373
374 /* Functions to update the state. */
375
376 static inline void
saw_opening_backquote()377 saw_opening_backquote ()
378 {
379 if (open_singlequote)
380 abort ();
381 if (open_doublequote)
382 open_doublequotes_mask |= (unsigned int) 1 << nested_backquotes;
383 nested_backquotes++;
384 open_doublequote = false;
385 }
386
387 static inline void
saw_closing_backquote()388 saw_closing_backquote ()
389 {
390 nested_backquotes--;
391 open_doublequote = (open_doublequotes_mask >> nested_backquotes) & 1;
392 open_doublequotes_mask &= ((unsigned int) 1 << nested_backquotes) - 1;
393 open_singlequote = false; /* just for safety */
394 }
395
396 static inline void
saw_opening_doublequote()397 saw_opening_doublequote ()
398 {
399 if (open_singlequote || open_doublequote)
400 abort ();
401 open_doublequote = true;
402 }
403
404 static inline void
saw_closing_doublequote()405 saw_closing_doublequote ()
406 {
407 if (open_singlequote || !open_doublequote)
408 abort ();
409 open_doublequote = false;
410 }
411
412 static inline void
saw_opening_singlequote()413 saw_opening_singlequote ()
414 {
415 if (open_doublequote || open_singlequote)
416 abort ();
417 open_singlequote = true;
418 open_singlequote_terminator = '\'';
419 }
420
421 static inline void
saw_closing_singlequote()422 saw_closing_singlequote ()
423 {
424 if (open_doublequote || !open_singlequote)
425 abort ();
426 open_singlequote = false;
427 }
428
429
430 /* ========================== Reading of commands ========================== */
431
/* Classification of the words produced by read_word ().
   We are only interested in constant strings.  Other words need not be
   represented precisely.  */
enum word_type
{
  /* Constant string.  */
  t_string,
  /* Variable assignment.  */
  t_assignment,
  /* Other string.  */
  t_other,
  /* Command separator: semicolon or newline.  */
  t_separator,
  /* Redirection: one of < > >| << <<- >> <> <& >&  */
  t_redirect,
  /* Closing '`' pseudo word.  */
  t_backquote,
  /* Closing ')' pseudo word.  */
  t_paren,
  /* EOF marker.  */
  t_eof
};
445
446 struct word
447 {
448 enum word_type type;
449 struct token *token; /* for t_string */
450 int line_number_at_start; /* for t_string */
451 };
452
453 /* Free the memory pointed to by a 'struct word'. */
454 static inline void
free_word(struct word * wp)455 free_word (struct word *wp)
456 {
457 if (wp->type == t_string)
458 {
459 free_token (wp->token);
460 free (wp->token);
461 }
462 }
463
464 /* Convert a t_string token to a char*. */
465 static char *
string_of_word(const struct word * wp)466 string_of_word (const struct word *wp)
467 {
468 char *str;
469 int n;
470
471 if (!(wp->type == t_string))
472 abort ();
473 n = wp->token->charcount;
474 str = XNMALLOC (n + 1, char);
475 memcpy (str, wp->token->chars, n);
476 str[n] = '\0';
477 return str;
478 }
479
480 /* Convert a t_string token to a char*, ignoring the first OFFSET bytes. */
481 static char *
substring_of_word(const struct word * wp,size_t offset)482 substring_of_word (const struct word *wp, size_t offset)
483 {
484 char *str;
485 int n;
486
487 if (!(wp->type == t_string))
488 abort ();
489 n = wp->token->charcount;
490 if (!(offset <= n))
491 abort ();
492 str = XNMALLOC (n - offset + 1, char);
493 memcpy (str, wp->token->chars + offset, n - offset);
494 str[n - offset] = '\0';
495 return str;
496 }
497
498
/* Whitespace recognition.
   Only an unquoted space, tab or newline counts as whitespace.  */

static inline bool
is_whitespace (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\n':
      return true;
    default:
      return false;
    }
}
506
/* Operator character recognition.
   Returns true for the unquoted characters | & ; < > ( and ).  */

static inline bool
is_operator_start (int c)
{
  switch (c)
    {
    case '|':
    case '&':
    case ';':
    case '<':
    case '>':
    case '(':
    case ')':
      return true;
    default:
      return false;
    }
}
515
516
/* Denotation of a quoted character.
   The distinction between quoted and unquoted character is important only for
   the special, whitespace and operator characters; it is irrelevant for
   alphanumeric characters, '\\' and many others.
   For an 'unsigned char' value c, QUOTED (c) lies just above the
   'unsigned char' range, in UCHAR_MAX + 1 ... 2 * UCHAR_MAX + 1.  */
#define QUOTED(c) (UCHAR_MAX + 1 + (c))
/* Values in the 'unsigned char' range are implicitly unquoted.  Among these,
   the following are important:
     '"'         opening or closing double quote
     '\''        opening or closing single quote
     '$'         the unknown result of a dollar expansion
     '`'         does not occur - replaced with OPENING_BACKQUOTE or
                 CLOSING_BACKQUOTE
 */
/* Pseudo characters for an opening and a closing backquote.  Their values
   lie above both the unquoted and the QUOTED ranges.  */
#define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`')
#define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`')
532
533 /* 2 characters of pushback are supported.
534 2 characters of pushback occur only when the first is an 'x'; in all
535 other cases only one character of pushback is needed. */
536 static int phase2_pushback[2];
537 static int phase2_pushback_length;
538
539 /* Return the next character, with backslashes removed.
540 The result is QUOTED(c) for some unsigned char c, if the next character
541 is escaped sufficiently often to make it a regular constituent character,
542 or simply an 'unsigned char' if it has its special meaning (of special,
543 whitespace or operator charcter), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE,
544 EOF.
545 It's the caller's responsibility to update the state. */
546 static int
phase2_getc()547 phase2_getc ()
548 {
549 int c;
550
551 if (phase2_pushback_length)
552 {
553 c = phase2_pushback[--phase2_pushback_length];
554 if (c == '\n')
555 ++line_number;
556 return c;
557 }
558
559 c = phase1_getc ();
560 if (c == EOF)
561 return c;
562 if (c == '\'')
563 return ((open_doublequote
564 || (open_singlequote && open_singlequote_terminator != c))
565 ? QUOTED (c)
566 : c);
567 if (open_singlequote)
568 {
569 if (c == open_singlequote_terminator)
570 return c;
571 }
572 else
573 {
574 if (c == '"' || c == '$')
575 return c;
576 if (c == '`')
577 return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE);
578 }
579 if (c == '\\')
580 {
581 /* Number of debackslashification passes that are active at the
582 current point. */
583 unsigned int debackslashify =
584 nested_backquotes + (open_singlequote ? 0 : 1);
585 /* Normal number of backslashes that yield a single backslash in the
586 final output. */
587 unsigned int expected_count =
588 (unsigned int) 1 << debackslashify;
589 /* Number of backslashes found. */
590 unsigned int count;
591
592 for (count = 1; count < expected_count; count++)
593 {
594 c = phase1_getc ();
595 if (c != '\\')
596 break;
597 }
598 if (count == expected_count)
599 return '\\';
600
601 /* The count of backslashes is > 0 and < expected_count, therefore the
602 result depends on c, the first character after the backslashes.
603 Note: The formulas below don't necessarily have a logic; they were
604 empirically determined such that 1. the xgettext-sh-1 test succeeds,
605 2. the behaviour for count == 0 would correspond to the one without
606 any baskslash. */
607 if (c == '\'')
608 {
609 if (!open_singlequote && count > (expected_count >> 1))
610 {
611 phase1_ungetc (c);
612 return '\\';
613 }
614 else
615 return ((open_doublequote
616 || (open_singlequote
617 ? open_singlequote_terminator != c
618 : count == (expected_count >> 1)))
619 ? QUOTED (c)
620 : c);
621 }
622 else if (c == '"')
623 {
624 /* Each debackslashification pass converts \\ to \ and \" to ";
625 passes corresponding to `...` drop a lone " whereas passes
626 corresponding to "`...`" leave it alone. Therefore, the
627 minimum number of backslashes needed to get one double-quote
628 in the end is open_doublequotes_mask + 1. */
629 if (open_singlequote)
630 {
631 if (count > open_doublequotes_mask)
632 {
633 phase1_ungetc (c);
634 return '\\';
635 }
636 else
637 return (open_singlequote_terminator != c ? QUOTED (c) : c);
638 }
639 else
640 {
641 if (count > open_doublequotes_mask)
642 return QUOTED (c);
643 else
644 /* Some of the count values <= open_doublequotes_mask are
645 actually invalid here, but we assume a syntactically
646 correct input file anyway. */
647 return c;
648 }
649 }
650 else if (c == '`')
651 {
652 /* FIXME: This code looks fishy. */
653 if (count == expected_count - 1)
654 return c;
655 else
656 /* Some of the count values < expected_count - 1 are
657 actually invalid here, but we assume a syntactically
658 correct input file anyway. */
659 if (nested_backquotes > 0 && !open_singlequote
660 && count >= (expected_count >> 2))
661 return OPENING_BACKQUOTE;
662 else
663 return CLOSING_BACKQUOTE;
664 }
665 else if (c == '$')
666 {
667 if (open_singlequote)
668 return QUOTED (c);
669 if (count >= (expected_count >> 1))
670 return QUOTED (c);
671 else
672 return c;
673 }
674 else
675 {
676 /* When not followed by a quoting character or backslash or dollar,
677 a backslash survives a debackslashification pass unmodified.
678 Therefore each debackslashification pass performs a
679 count := (count + 1) >> 1
680 operation. Therefore the minimum number of backslashes needed
681 to get one backslash in the end is (expected_count >> 1) + 1. */
682 if (open_doublequote || open_singlequote)
683 {
684 if (count > 0)
685 {
686 phase1_ungetc (c);
687 return '\\';
688 }
689 else
690 return QUOTED (c);
691 }
692 else
693 {
694 if (count > (expected_count >> 1))
695 {
696 phase1_ungetc (c);
697 return '\\';
698 }
699 else if (count > 0)
700 return QUOTED (c);
701 else
702 return c;
703 }
704 }
705 }
706
707 return (open_singlequote || open_doublequote ? QUOTED (c) : c);
708 }
709
710 /* Supports 2 characters of pushback. */
711 static void
phase2_ungetc(int c)712 phase2_ungetc (int c)
713 {
714 switch (c)
715 {
716 case EOF:
717 break;
718
719 case '\n':
720 --line_number;
721 /* FALLTHROUGH */
722
723 default:
724 if (phase2_pushback_length == SIZEOF (phase2_pushback))
725 abort ();
726 phase2_pushback[phase2_pushback_length++] = c;
727 break;
728 }
729 }
730
731
/* Context lookup table.  */
static flag_context_list_table_ty *flag_context_list_table;


/* Forward declaration of local functions.
   read_command_list is needed by read_word, which calls it for the nested
   command lists of `...`, $(...), <(...) and >(...).  */
static enum word_type read_command_list (int looking_for,
                                         flag_context_ty outer_context);
739
740
741
/* Read the next word.
   'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
   or '\0'.
   The result is stored in *WP.  WP->type is always set.  Only when it is
   t_string do WP->token and WP->line_number_at_start hold valid data; for
   the other types that are produced after a token was allocated, the token
   has already been freed again when this function returns.
   CONTEXT is passed on to the nested command lists that are read for
   `...`, $(...), <(...) and >(...), and to remember_a_message () for
   $"..." strings.
   Side effects: comments are collected as savable comments, and
   last_comment_line and last_non_comment_line are updated.  */
static void
read_word (struct word *wp, int looking_for, flag_context_ty context)
{
  int c;
  bool all_unquoted_digits;
  bool all_unquoted_name_characters;

  /* Skip whitespace and comments.  A newline ends the skipping, because it
     is a command separator.  */
  do
    {
      c = phase2_getc ();
      if (c == '#')
        {
          /* Skip a comment up to end of line.  */
          last_comment_line = line_number;
          comment_start ();
          for (;;)
            {
              c = phase1_getc ();
              if (c == EOF || c == '\n')
                break;
              /* We skip all leading white space, but not EOLs.  */
              if (!(buflen == 0 && (c == ' ' || c == '\t')))
                comment_add (c);
            }
          comment_line_end ();
        }
      if (c == '\n')
        {
          /* Comments assumed to be grouped with a message must immediately
             precede it, with no non-whitespace token on a line between
             both.  */
          if (last_non_comment_line > last_comment_line)
            savable_comment_reset ();
          wp->type = t_separator;
          return;
        }
    }
  while (is_whitespace (c));

  if (c == EOF)
    {
      wp->type = t_eof;
      return;
    }

  if (c == '<' || c == '>')
    {
      /* Recognize the redirection operators < > >| << <<- >> <> <& >&
         But <( and >) are handled below, not here.  */
      int c2 = phase2_getc ();
      if (c2 != '(')
        {
          if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
            {
              /* A two-character operator.  After << also swallow the '-'
                 of <<-, if present.  */
              if (c == '<' && c2 == '<')
                {
                  int c3 = phase2_getc ();
                  if (c3 != '-')
                    phase2_ungetc (c3);
                }
            }
          else
            phase2_ungetc (c2);
          wp->type = t_redirect;
          return;
        }
      else
        phase2_ungetc (c2);
    }

  if (c == CLOSING_BACKQUOTE)
    {
      if (looking_for == CLOSING_BACKQUOTE)
        {
          saw_closing_backquote ();
          wp->type = t_backquote;
          last_non_comment_line = line_number;
          return;
        }
      else if (looking_for == ')')
        {
          /* The input is invalid syntax, such as `a<(`
             Push back the closing backquote and pretend that we have seen a
             closing parenthesis.  */
          phase2_ungetc (c);
          wp->type = t_paren;
          last_non_comment_line = line_number;
          return;
        }
      else
        /* We shouldn't be reading a CLOSING_BACKQUOTE when
           looking_for == '\0'.  */
        abort ();
    }

  if (looking_for == ')' && c == ')')
    {
      wp->type = t_paren;
      last_non_comment_line = line_number;
      return;
    }

  if (is_operator_start (c))
    {
      /* Of the operators, only the semicolon separates commands.  */
      wp->type = (c == ';' ? t_separator : t_other);
      return;
    }

  /* From here on, a real word is being read.  It starts out as a constant
     string and is downgraded to another type as soon as something
     non-constant is seen; from then on its characters are no longer
     collected.  */
  wp->type = t_string;
  wp->token = XMALLOC (struct token);
  init_token (wp->token);
  wp->line_number_at_start = line_number;
  /* True while all characters in the token seen so far are digits.  */
  all_unquoted_digits = true;
  /* True while all characters in the token seen so far form a "name":
     all characters are unquoted underscores, digits, or alphabetics from the
     portable character set, and the first character is not a digit.  Cf.
     <https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_235>
   */
  all_unquoted_name_characters = true;

  for (;; c = phase2_getc ())
    {
      if (c == EOF)
        break;

      if (all_unquoted_digits && (c == '<' || c == '>'))
        {
          /* Recognize the redirection operators < > >| << <<- >> <> <& >&
             prefixed with a nonempty sequence of unquoted digits.  */
          int c2 = phase2_getc ();
          if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
            {
              if (c == '<' && c2 == '<')
                {
                  int c3 = phase2_getc ();
                  if (c3 != '-')
                    phase2_ungetc (c3);
                }
            }
          else
            phase2_ungetc (c2);

          /* The digits were a file descriptor number, not a string.  */
          wp->type = t_redirect;
          free_token (wp->token);
          free (wp->token);

          last_non_comment_line = line_number;

          return;
        }

      all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9');

      if (all_unquoted_name_characters && wp->token->charcount > 0 && c == '=')
        {
          /* A name followed by '=': the word is a variable assignment.  */
          wp->type = t_assignment;
          continue;
        }

      all_unquoted_name_characters =
        all_unquoted_name_characters
        && ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_'
            || (wp->token->charcount > 0 && c >= '0' && c <= '9'));

      if (c == '$')
        {
          int c2;

          /* An unquoted dollar indicates we are not inside '...'.  */
          if (open_singlequote)
            abort ();
          /* After reading a dollar, we know that there is no pushed back
             character from an earlier lookahead.  */
          if (phase2_pushback_length > 0)
            abort ();
          /* Therefore we can use phase1 without interfering with phase2.
             We need to recognize $( outside and inside double-quotes.
             It would be incorrect to do
                c2 = phase2_getc ();
                if (c2 == '(' || c2 == QUOTED ('('))
             because that would also trigger for $\(.  */
          c2 = phase1_getc ();
          if (c2 == '(')
            {
              bool saved_open_doublequote;
              int c3;

              phase1_ungetc (c2);

              /* The entire inner command or arithmetic expression is read
                 ignoring possible surrounding double-quotes.  */
              saved_open_doublequote = open_doublequote;
              open_doublequote = false;

              /* Re-read the '(' through phase2; with the double-quote state
                 cleared it must come back unquoted.  */
              c2 = phase2_getc ();
              if (c2 != '(')
                abort ();

              c3 = phase2_getc ();
              if (c3 == '(')
                {
                  /* Arithmetic expression (Bash syntax).  Skip until the
                     matching closing parenthesis.  */
                  unsigned int depth = 2;

                  do
                    {
                      c = phase2_getc ();
                      if (c == '(')
                        depth++;
                      else if (c == ')')
                        if (--depth == 0)
                          break;
                    }
                  while (c != EOF);
                }
              else
                {
                  /* Command substitution (Bash syntax).  */
                  phase2_ungetc (c3);
                  read_command_list (')', context);
                }

              open_doublequote = saved_open_doublequote;
            }
          else
            {
              phase1_ungetc (c2);
              c2 = phase2_getc ();

              if (c2 == '\'' && !open_singlequote)
                {
                  /* Bash builtin for string with ANSI-C escape sequences.  */
                  for (;;)
                    {
                      /* We have to use phase1 throughout this loop,
                         because phase2 does debackslashification,
                         which is undesirable when parsing ANSI-C
                         escape sequences.  */
                      c = phase1_getc ();
                      if (c == EOF)
                        break;
                      if (c == '\'')
                        break;
                      if (c == '\\')
                        {
                          c = phase1_getc ();
                          switch (c)
                            {
                            default:
                              /* Not a known escape sequence: keep the
                                 backslash and re-read the character.  */
                              phase1_ungetc (c);
                              c = '\\';
                              break;

                            case '\\':
                              break;
                            case '\'':
                              break;
                            case '"':
                              break;

                            case 'a':
                              c = '\a';
                              break;
                            case 'b':
                              c = '\b';
                              break;
                            case 'e':
                            case 'E':
                              c = 0x1b; /* ESC */
                              break;
                            case 'f':
                              c = '\f';
                              break;
                            case 'n':
                              c = '\n';
                              break;
                            case 'r':
                              c = '\r';
                              break;
                            case 't':
                              c = '\t';
                              break;
                            case 'v':
                              c = '\v';
                              break;

                            case 'x':
                              /* Hexadecimal escape: one or two digits.  */
                              c = phase1_getc ();
                              if ((c >= '0' && c <= '9')
                                  || (c >= 'A' && c <= 'F')
                                  || (c >= 'a' && c <= 'f'))
                                {
                                  int n;

                                  if (c >= '0' && c <= '9')
                                    n = c - '0';
                                  else if (c >= 'A' && c <= 'F')
                                    n = 10 + c - 'A';
                                  else if (c >= 'a' && c <= 'f')
                                    n = 10 + c - 'a';
                                  else
                                    abort ();

                                  c = phase1_getc ();
                                  if ((c >= '0' && c <= '9')
                                      || (c >= 'A' && c <= 'F')
                                      || (c >= 'a' && c <= 'f'))
                                    {
                                      if (c >= '0' && c <= '9')
                                        n = n * 16 + c - '0';
                                      else if (c >= 'A' && c <= 'F')
                                        n = n * 16 + 10 + c - 'A';
                                      else if (c >= 'a' && c <= 'f')
                                        n = n * 16 + 10 + c - 'a';
                                      else
                                        abort ();
                                    }
                                  else
                                    phase1_ungetc (c);

                                  c = n;
                                }
                              else
                                {
                                  /* NOTE(review): unless c is EOF, this
                                     pushes back two characters in a row;
                                     phase1_ungetc () aborts if the phase1
                                     pushback buffer cannot hold both.  */
                                  phase1_ungetc (c);
                                  phase1_ungetc ('x');
                                  c = '\\';
                                }
                              break;

                            case '0': case '1': case '2': case '3':
                            case '4': case '5': case '6': case '7':
                              /* Octal escape: one to three digits.  */
                              {
                                int n = c - '0';

                                c = phase1_getc ();
                                if (c >= '0' && c <= '7')
                                  {
                                    n = n * 8 + c - '0';

                                    c = phase1_getc ();
                                    if (c >= '0' && c <= '7')
                                      n = n * 8 + c - '0';
                                    else
                                      phase1_ungetc (c);
                                  }
                                else
                                  phase1_ungetc (c);

                                c = n;
                              }
                              break;
                            }
                        }
                      if (wp->type == t_string)
                        {
                          grow_token (wp->token);
                          wp->token->chars[wp->token->charcount++] =
                            (unsigned char) c;
                        }
                    }
                  /* The result is a literal string.  Don't change wp->type.  */
                  continue;
                }
              else if (c2 == '"' && !open_doublequote)
                {
                  /* Bash builtin for internationalized string.  */
                  lex_pos_ty pos;
                  struct token string;

                  /* The string is read like a single-quoted string whose
                     terminator is a double-quote.  */
                  saw_opening_singlequote ();
                  open_singlequote_terminator = '"';
                  pos.file_name = logical_file_name;
                  pos.line_number = line_number;
                  init_token (&string);
                  for (;;)
                    {
                      c = phase2_getc ();
                      if (c == EOF)
                        break;
                      if (c == '"')
                        {
                          saw_closing_singlequote ();
                          break;
                        }
                      grow_token (&string);
                      string.chars[string.charcount++] = (unsigned char) c;
                    }
                  remember_a_message (mlp, NULL, string_of_token (&string),
                                      false, false, context, &pos,
                                      NULL, savable_comment, false);
                  free_token (&string);

                  /* Emit the warning without the program name prefix.  */
                  error_with_progname = false;
                  error (0, 0, _("%s:%lu: warning: the syntax $\"...\" is deprecated due to security reasons; use eval_gettext instead"),
                         pos.file_name, (unsigned long) pos.line_number);
                  error_with_progname = true;

                  /* The result at runtime is not constant.  Therefore we
                     change wp->type.  */
                }
              else
                phase2_ungetc (c2);
            }
          wp->type = t_other;
          continue;
        }

      if (c == '\'')
        {
          if (!open_singlequote)
            {
              /* Handle an opening single quote.  */
              saw_opening_singlequote ();
            }
          else
            {
              /* Handle a closing single quote.  */
              saw_closing_singlequote ();
            }
          continue;
        }

      if (c == '"')
        {
          if (open_singlequote && open_singlequote_terminator == '"')
            {
              /* Handle a closing i18n quote.  */
              saw_closing_singlequote ();
            }
          else if (!open_doublequote)
            {
              /* Handle an opening double quote.  */
              saw_opening_doublequote ();
            }
          else
            {
              /* Handle a closing double quote.  */
              saw_closing_doublequote ();
            }
          continue;
        }

      if (c == OPENING_BACKQUOTE)
        {
          /* Handle an opening backquote.  */
          saw_opening_backquote ();

          read_command_list (CLOSING_BACKQUOTE, context);

          wp->type = t_other;
          continue;
        }
      if (c == CLOSING_BACKQUOTE)
        /* End of the word; the backquote is pushed back after the loop.  */
        break;

      if (c == '<' || c == '>')
        {
          int c2;

          /* An unquoted c indicates we are not inside '...' nor "...".  */
          if (open_singlequote || open_doublequote)
            abort ();

          c2 = phase2_getc ();
          if (c2 == '(')
            {
              /* Process substitution (Bash syntax).  */
              read_command_list (')', context);

              wp->type = t_other;
              continue;
            }
          else
            phase2_ungetc (c2);
        }

      /* Outside of quotes, whitespace or an operator ends the word.  */
      if (!open_singlequote && !open_doublequote
          && (is_whitespace (c) || is_operator_start (c)))
        break;

      /* An ordinary constituent.  Only the low byte is stored, which drops
         the QUOTED marker.  */
      if (wp->type == t_string)
        {
          grow_token (wp->token);
          wp->token->chars[wp->token->charcount++] = (unsigned char) c;
        }
    }

  /* Give back the character that terminated the word.  */
  phase2_ungetc (c);

  /* Only a constant string keeps its token.  */
  if (wp->type != t_string)
    {
      free_token (wp->token);
      free (wp->token);
    }
  last_non_comment_line = line_number;
}
1244
1245
1246 /* Read the next command.
1247 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
1248 or '\0'.
1249 Returns the type of the word that terminated the command. */
1250 static enum word_type
read_command(int looking_for,flag_context_ty outer_context)1251 read_command (int looking_for, flag_context_ty outer_context)
1252 {
1253 /* Read the words that make up the command.
1254 Here we completely ignore field splitting at whitespace and wildcard
1255 expansions; i.e. we assume that the source is written in such a way that
1256 every word in the program determines exactly one word in the resulting
1257 command.
1258 But we do not require that the 'gettext'/'ngettext' command is the
1259 first in the command; this is because 1. we want to allow for prefixes
1260 like "$verbose" that may expand to nothing, and 2. it's a big effort
1261 to know where a command starts in a $(for ...) or $(case ...) compound
1262 command. */
1263 int arg = 0; /* Current argument number. */
1264 bool arg_of_redirect = false; /* True right after a redirection operator. */
1265 bool must_expand_arg_strings = false; /* True if need to expand escape
1266 sequences in arguments. */
1267 flag_context_list_iterator_ty context_iter;
1268 const struct callshapes *shapes = NULL;
1269 struct arglist_parser *argparser = NULL;
1270
1271 for (;;)
1272 {
1273 struct word inner;
1274 flag_context_ty inner_context;
1275
1276 if (arg == 0)
1277 inner_context = null_context;
1278 else
1279 inner_context =
1280 inherited_context (outer_context,
1281 flag_context_list_iterator_advance (
1282 &context_iter));
1283
1284 read_word (&inner, looking_for, inner_context);
1285
1286 /* Recognize end of command. */
1287 if (inner.type == t_separator
1288 || inner.type == t_backquote || inner.type == t_paren
1289 || inner.type == t_eof)
1290 {
1291 if (argparser != NULL)
1292 arglist_parser_done (argparser, arg);
1293 return inner.type;
1294 }
1295
1296 if (extract_all)
1297 {
1298 if (inner.type == t_string)
1299 {
1300 lex_pos_ty pos;
1301
1302 pos.file_name = logical_file_name;
1303 pos.line_number = inner.line_number_at_start;
1304 remember_a_message (mlp, NULL, string_of_word (&inner), false,
1305 false, inner_context, &pos,
1306 NULL, savable_comment, false);
1307 }
1308 }
1309
1310 if (arg_of_redirect)
1311 {
1312 /* Ignore arguments of redirection operators. */
1313 arg_of_redirect = false;
1314 }
1315 else if (inner.type == t_redirect)
1316 {
1317 /* Ignore this word and the following one. */
1318 arg_of_redirect = true;
1319 }
1320 else
1321 {
1322 bool matters_for_argparser = true;
1323
1324 if (argparser == NULL)
1325 {
1326 /* This is the function position. */
1327 arg = 0;
1328 if (inner.type == t_assignment)
1329 {
1330 /* An assignment just sets an environment variable.
1331 Ignore it. */
1332 /* Don't increment arg in this round. */
1333 matters_for_argparser = false;
1334 }
1335 else if (inner.type == t_string)
1336 {
1337 char *function_name = string_of_word (&inner);
1338
1339 if (strcmp (function_name, "env") == 0)
1340 {
1341 /* The 'env' command just introduces more assignments.
1342 Ignore it. */
1343 /* Don't increment arg in this round. */
1344 matters_for_argparser = false;
1345 }
1346 else
1347 {
1348 void *keyword_value;
1349
1350 if (hash_find_entry (&keywords,
1351 function_name,
1352 strlen (function_name),
1353 &keyword_value)
1354 == 0)
1355 shapes = (const struct callshapes *) keyword_value;
1356
1357 argparser = arglist_parser_alloc (mlp, shapes);
1358
1359 context_iter =
1360 flag_context_list_iterator (
1361 flag_context_list_table_lookup (
1362 flag_context_list_table,
1363 function_name, strlen (function_name)));
1364 }
1365
1366 free (function_name);
1367 }
1368 else
1369 context_iter = null_context_list_iterator;
1370 }
1371 else
1372 {
1373 /* These are the argument positions. */
1374 if (inner.type == t_string)
1375 {
1376 bool accepts_context =
1377 ((argparser->keyword_len == 7
1378 && memcmp (argparser->keyword, "gettext", 7) == 0)
1379 || (argparser->keyword_len == 8
1380 && memcmp (argparser->keyword, "ngettext", 8) == 0));
1381 bool accepts_expand =
1382 ((argparser->keyword_len == 7
1383 && memcmp (argparser->keyword, "gettext", 7) == 0)
1384 || (argparser->keyword_len == 8
1385 && memcmp (argparser->keyword, "ngettext", 8) == 0));
1386 if (accepts_context && argparser->next_is_msgctxt)
1387 {
1388 char *s = string_of_word (&inner);
1389 mixed_string_ty *ms =
1390 mixed_string_alloc_simple (s, lc_string,
1391 logical_file_name,
1392 inner.line_number_at_start);
1393 free (s);
1394 argparser->next_is_msgctxt = false;
1395 arglist_parser_remember_msgctxt (argparser, ms,
1396 inner_context,
1397 logical_file_name,
1398 inner.line_number_at_start);
1399 matters_for_argparser = false;
1400 }
1401 else if (accepts_context
1402 && ((inner.token->charcount == 2
1403 && memcmp (inner.token->chars, "-c", 2) == 0)
1404 || (inner.token->charcount == 9
1405 && memcmp (inner.token->chars, "--context", 9) == 0)))
1406 {
1407 argparser->next_is_msgctxt = true;
1408 matters_for_argparser = false;
1409 }
1410 else if (accepts_context
1411 && (inner.token->charcount >= 10
1412 && memcmp (inner.token->chars, "--context=", 10) == 0))
1413 {
1414 char *s = substring_of_word (&inner, 10);
1415 mixed_string_ty *ms =
1416 mixed_string_alloc_simple (s, lc_string,
1417 logical_file_name,
1418 inner.line_number_at_start);
1419 free (s);
1420 argparser->next_is_msgctxt = false;
1421 arglist_parser_remember_msgctxt (argparser, ms,
1422 inner_context,
1423 logical_file_name,
1424 inner.line_number_at_start);
1425 matters_for_argparser = false;
1426 }
1427 else if (accepts_expand
1428 && inner.token->charcount == 2
1429 && memcmp (inner.token->chars, "-e", 2) == 0)
1430 {
1431 must_expand_arg_strings = true;
1432 matters_for_argparser = false;
1433 }
1434 else
1435 {
1436 char *s = string_of_word (&inner);
1437 mixed_string_ty *ms;
1438
1439 /* When '-e' was specified, expand escape sequences in s. */
1440 if (accepts_expand && must_expand_arg_strings)
1441 {
1442 bool expands_backslash_c =
1443 (argparser->keyword_len == 7
1444 && memcmp (argparser->keyword, "gettext", 7) == 0);
1445 bool backslash_c = false;
1446 char *expanded =
1447 (char *)
1448 expand_escapes (s, expands_backslash_c ? &backslash_c : NULL);
1449 /* We can ignore the value of expands_backslash_c, because
1450 here we don't support the gettext '-s' option. */
1451 if (expanded != s)
1452 free (s);
1453 s = expanded;
1454 }
1455
1456 ms = mixed_string_alloc_simple (s, lc_string,
1457 logical_file_name,
1458 inner.line_number_at_start);
1459 free (s);
1460 arglist_parser_remember (argparser, arg, ms,
1461 inner_context,
1462 logical_file_name,
1463 inner.line_number_at_start,
1464 savable_comment, false);
1465 }
1466 }
1467
1468 if (matters_for_argparser)
1469 if (arglist_parser_decidedp (argparser, arg))
1470 {
1471 /* Stop looking for arguments of the last function_name. */
1472 /* FIXME: What about context_iter? */
1473 arglist_parser_done (argparser, arg);
1474 shapes = NULL;
1475 argparser = NULL;
1476 }
1477 }
1478
1479 if (matters_for_argparser)
1480 arg++;
1481 }
1482
1483 free_word (&inner);
1484 }
1485 }
1486
1487
1488 /* Read a list of commands.
1489 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
1490 or '\0'.
1491 Returns the type of the word that terminated the command list. */
1492 static enum word_type
read_command_list(int looking_for,flag_context_ty outer_context)1493 read_command_list (int looking_for, flag_context_ty outer_context)
1494 {
1495 for (;;)
1496 {
1497 enum word_type terminator;
1498
1499 terminator = read_command (looking_for, outer_context);
1500 if (terminator != t_separator)
1501 return terminator;
1502 }
1503 }
1504
1505
1506 void
extract_sh(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1507 extract_sh (FILE *f,
1508 const char *real_filename, const char *logical_filename,
1509 flag_context_list_table_ty *flag_table,
1510 msgdomain_list_ty *mdlp)
1511 {
1512 mlp = mdlp->item[0]->messages;
1513
1514 fp = f;
1515 real_file_name = real_filename;
1516 logical_file_name = xstrdup (logical_filename);
1517 line_number = 1;
1518
1519 phase1_pushback_length = 0;
1520
1521 last_comment_line = -1;
1522 last_non_comment_line = -1;
1523
1524 nested_backquotes = 0;
1525 open_doublequotes_mask = 0;
1526 open_doublequote = false;
1527 open_singlequote = false;
1528
1529 phase2_pushback_length = 0;
1530
1531 flag_context_list_table = flag_table;
1532
1533 init_keywords ();
1534
1535 /* Eat tokens until eof is seen. */
1536 read_command_list ('\0', null_context);
1537
1538 fp = NULL;
1539 real_file_name = NULL;
1540 logical_file_name = NULL;
1541 line_number = 0;
1542 }
1543