1 /* xgettext Vala backend.
2 Copyright (C) 2013-2014, 2018-2020 Free Software Foundation, Inc.
3
4 This file was written by Daiki Ueno <ueno@gnu.org>, 2013.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-vala.h"
25
26 #include <assert.h>
27 #include <errno.h>
28 #include <stdbool.h>
29 #include <stdio.h>
30 #include <stdlib.h>
31 #include <string.h>
32
33 #include "message.h"
34 #include "rc-str-list.h"
35 #include "xgettext.h"
36 #include "xg-pos.h"
37 #include "xg-encoding.h"
38 #include "xg-mixed-string.h"
39 #include "xg-arglist-context.h"
40 #include "xg-arglist-callshape.h"
41 #include "xg-arglist-parser.h"
42 #include "xg-message.h"
43 #include "error.h"
44 #include "error-progname.h"
45 #include "xalloc.h"
46 #include "xvasprintf.h"
47 #include "mem-hash-map.h"
48 #include "po-charset.h"
49 #include "gettext.h"
50
/* Shorthand for looking up the translation of the message S.  */
#define _(s) gettext (s)

/* Number of elements of the array A.  A must be a real array (not a
   pointer).  The argument is parenthesized before being indexed so that
   any array-valued expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
54
55 /* The Vala syntax is defined in the Vala Reference Manual
56 https://www.vala-project.org/doc/vala/.
57 See also vala/valascanner.vala. */
58
59 /* ====================== Keyword set customization. ====================== */
60
61 /* If true extract all strings. */
62 static bool extract_all = false;
63
64 static hash_table keywords;
65 static bool default_keywords = true;
66
67
68 void
x_vala_extract_all()69 x_vala_extract_all ()
70 {
71 extract_all = true;
72 }
73
74
75 static void
add_keyword(const char * name,hash_table * keywords)76 add_keyword (const char *name, hash_table *keywords)
77 {
78 if (name == NULL)
79 default_keywords = false;
80 else
81 {
82 const char *end;
83 struct callshape shape;
84 const char *colon;
85
86 if (keywords->table == NULL)
87 hash_init (keywords, 100);
88
89 split_keywordspec (name, &end, &shape);
90
91 /* The characters between name and end should form a valid C identifier.
92 A colon means an invalid parse in split_keywordspec(). */
93 colon = strchr (name, ':');
94 if (colon == NULL || colon >= end)
95 insert_keyword_callshape (keywords, name, end - name, &shape);
96 }
97 }
98
99 void
x_vala_keyword(const char * name)100 x_vala_keyword (const char *name)
101 {
102 add_keyword (name, &keywords);
103 }
104
105 static void
init_keywords()106 init_keywords ()
107 {
108 if (default_keywords)
109 {
110 /* When adding new keywords here, also update the documentation in
111 xgettext.texi! */
112 x_vala_keyword ("dgettext:2");
113 x_vala_keyword ("dcgettext:2");
114 x_vala_keyword ("ngettext:1,2");
115 x_vala_keyword ("dngettext:2,3");
116 x_vala_keyword ("dpgettext:2g");
117 x_vala_keyword ("dpgettext2:2c,3");
118 x_vala_keyword ("_");
119 x_vala_keyword ("Q_");
120 x_vala_keyword ("N_");
121 x_vala_keyword ("NC_:1c,2");
122
123 default_keywords = false;
124 }
125 }
126
/* Record the format-string flags of the functions known to this backend.  */
void
init_flag_table_vala ()
{
  static const char *const flag_specs[] =
    {
      "dgettext:2:pass-c-format",
      "dcgettext:2:pass-c-format",
      "ngettext:1:pass-c-format",
      "ngettext:2:pass-c-format",
      "dngettext:2:pass-c-format",
      "dngettext:3:pass-c-format",
      "dpgettext:2:pass-c-format",
      "dpgettext2:3:pass-c-format",
      "_:1:pass-c-format",
      "Q_:1:pass-c-format",
      "N_:1:pass-c-format",
      "NC_:2:pass-c-format",

      /* Vala leaves string formatting to Glib functions and thus the
         format string is exactly the same as in C.  See also
         vapi/glib-2.0.vapi.  */
      "printf:1:c-format",
      "vprintf:1:c-format"
    };
  size_t idx;

  /* The flags are recorded in the order in which they are listed.  */
  for (idx = 0; idx < SIZEOF (flag_specs); idx++)
    xgettext_record_flag (flag_specs[idx]);
}
149
150
151 /* ======================== Reading of characters. ======================== */
152
/* Stream the scanner reads from; set by extract_vala for the duration of
   one extraction.  */
static FILE *fp;


/* 1. line_number handling.  */

/* Characters handed back through phase1_ungetc are kept on a small stack:
   phase1_pushback[0 .. phase1_pushback_length-1], last pushed read first.  */
#define MAX_PHASE1_PUSHBACK 16
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
static int phase1_pushback_length;
162
163
164 static int
phase1_getc()165 phase1_getc ()
166 {
167 int c;
168
169 if (phase1_pushback_length)
170 c = phase1_pushback[--phase1_pushback_length];
171 else
172 {
173 c = getc (fp);
174 if (c == EOF)
175 {
176 if (ferror (fp))
177 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
178 real_file_name);
179 return EOF;
180 }
181 }
182
183 if (c == '\n')
184 ++line_number;
185 return c;
186 }
187
188
189 /* Supports 2 characters of pushback. */
190 static void
phase1_ungetc(int c)191 phase1_ungetc (int c)
192 {
193 if (c != EOF)
194 {
195 if (c == '\n')
196 --line_number;
197
198 if (phase1_pushback_length == SIZEOF (phase1_pushback))
199 abort ();
200 phase1_pushback[phase1_pushback_length++] = c;
201 }
202 }
203
204
/* Line numbers used to decide whether a comment counts as standing
   immediately before a keyword: the line where the most recent comment
   ended, and the line of the most recent non-comment token.  */
static int last_comment_line;
static int last_non_comment_line;

/* Accumulating comments.  */

/* Text of the comment line being collected: buffer[0 .. buflen-1], in an
   allocation of bufmax bytes.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;
215
216 static inline void
comment_start()217 comment_start ()
218 {
219 buflen = 0;
220 }
221
222 static inline void
comment_add(int c)223 comment_add (int c)
224 {
225 if (buflen >= bufmax)
226 {
227 bufmax = 2 * bufmax + 10;
228 buffer = xrealloc (buffer, bufmax);
229 }
230 buffer[buflen++] = c;
231 }
232
233 static inline void
comment_line_end(size_t chars_to_remove)234 comment_line_end (size_t chars_to_remove)
235 {
236 buflen -= chars_to_remove;
237 while (buflen >= 1
238 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
239 --buflen;
240 if (chars_to_remove == 0 && buflen >= bufmax)
241 {
242 bufmax = 2 * bufmax + 10;
243 buffer = xrealloc (buffer, bufmax);
244 }
245 buffer[buflen] = '\0';
246 savable_comment_add (buffer);
247 }
248
249
250 /* 2. Replace each comment that is not inside a character constant or
251 string literal with a space character. */
252
253 static int
phase2_getc()254 phase2_getc ()
255 {
256 int c;
257 bool last_was_star;
258
259 c = phase1_getc ();
260 if (c != '/')
261 return c;
262 c = phase1_getc ();
263 switch (c)
264 {
265 default:
266 phase1_ungetc (c);
267 return '/';
268
269 case '*':
270 /* C comment. */
271 comment_start ();
272 last_was_star = false;
273 for (;;)
274 {
275 c = phase1_getc ();
276 if (c == EOF)
277 break;
278 /* We skip all leading white space, but not EOLs. */
279 if (!(buflen == 0 && (c == ' ' || c == '\t')))
280 comment_add (c);
281 switch (c)
282 {
283 case '\n':
284 comment_line_end (1);
285 comment_start ();
286 last_was_star = false;
287 continue;
288
289 case '*':
290 last_was_star = true;
291 continue;
292
293 case '/':
294 if (last_was_star)
295 {
296 comment_line_end (2);
297 break;
298 }
299 /* FALLTHROUGH */
300
301 default:
302 last_was_star = false;
303 continue;
304 }
305 break;
306 }
307 last_comment_line = line_number;
308 return ' ';
309
310 case '/':
311 /* C++ or ISO C 99 comment. */
312 comment_start ();
313 for (;;)
314 {
315 c = phase1_getc ();
316 if (c == '\n' || c == EOF)
317 break;
318 /* We skip all leading white space, but not EOLs. */
319 if (!(buflen == 0 && (c == ' ' || c == '\t')))
320 comment_add (c);
321 }
322 comment_line_end (0);
323 last_comment_line = line_number;
324 return '\n';
325 }
326 }
327
328
/* Hand the character C back to phase 2.  Phase 2 keeps no state of its
   own, so this simply forwards to the phase 1 pushback.  */
static void
phase2_ungetc (int c)
{
  phase1_ungetc (c);
}
334
335
336 /* ========================== Reading of tokens. ========================== */
337
/* Kinds of tokens produced by phase3_get.  The spellings listed with the
   operator kinds are those the scanner maps to each kind.  */
typedef enum token_type_ty
{
  token_type_character_constant,        /* 'x' */
  token_type_eof,                       /* end of input */
  token_type_lparen,                    /* ( */
  token_type_rparen,                    /* ) */
  token_type_lbrace,                    /* { */
  token_type_rbrace,                    /* } */
  token_type_assign,                    /* = += -= /= %= ^= &= |= <<= >>= */
  token_type_return,                    /* the keyword 'return' */
  token_type_plus,                      /* + */
  token_type_arithmetic_operator,       /* - / & | */
  token_type_equality_test_operator,    /* == != < > <= >= */
  token_type_logic_operator,            /* ! && || ?? % ^ */
  token_type_comma,                     /* , */
  token_type_question,                  /* ? */
  token_type_colon,                     /* : */
  token_type_number,                    /* 2.7 */
  token_type_string_literal,            /* "abc" or a verbatim string */
  token_type_string_template,           /* @"abc" */
  token_type_regex_literal,             /* a regular expression in slashes */
  token_type_symbol,                    /* identifiers, keywords other than
                                           'return', and a lone '.' */
  token_type_other                      /* anything else, e.g. ++ -- => */
} token_type_ty;
363
364 typedef struct token_ty token_ty;
365 struct token_ty
366 {
367 token_type_ty type;
368 char *string; /* for token_type_symbol */
369 mixed_string_ty *mixed_string; /* for token_type_string_literal */
370 refcounted_string_list_ty *comment; /* for token_type_string_literal */
371 int line_number;
372 };
373
374 /* Free the memory pointed to by a 'struct token_ty'. */
375 static inline void
free_token(token_ty * tp)376 free_token (token_ty *tp)
377 {
378 if (tp->type == token_type_symbol)
379 free (tp->string);
380 if (tp->type == token_type_string_literal)
381 {
382 mixed_string_free (tp->mixed_string);
383 drop_reference (tp->comment);
384 }
385 }
386
387
/* Special return values of phase7_getc.  They are all negative, so that
   they cannot be confused with a character.  */

/* End of file.  */
#define P7_EOF (-1)
#define P7_STRING_END (-2)

/* Unescaped double quote, unescaped single quote and unescaped newline,
   respectively.  Escaped ones are returned as ordinary characters.  */
#define P7_QUOTES (-3)
#define P7_QUOTE (-4)
#define P7_NEWLINE (-5)

/* Encode the UTF-16 or UTF-32 code point CODE as a return value that is
   distinguishable from a single-byte return value.  */
#define UNICODE(code) ((code) + 0x100)

/* Test whether a return value of phase7_getc designates a UTF-16 or
   UTF-32 code point.  */
#define IS_UNICODE(p7_result) (0x100 <= (p7_result))

/* Recover the UTF-16 or UTF-32 code from a return value that satisfies
   IS_UNICODE.  */
#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
409
410
411 static int
phase7_getc()412 phase7_getc ()
413 {
414 int c, n, j;
415
416 /* Use phase 1, because phase 2 elides comments. */
417 c = phase1_getc ();
418
419 /* Return a magic newline indicator, so that we can distinguish
420 between the user requesting a newline in the string (e.g. using
421 "\n" or "\012") from the user failing to terminate the string or
422 character constant. The ANSI C standard says: 3.1.3.4 Character
423 Constants contain "any character except single quote, backslash or
424 newline; or an escape sequence" and 3.1.4 String Literals contain
425 "any character except double quote, backslash or newline; or an
426 escape sequence".
427
428 Most compilers give a fatal error in this case, however gcc is
429 stupidly silent, even though this is a very common typo. OK, so
430 "gcc --pedantic" will tell me, but that gripes about too much other
431 stuff. Could I have a "gcc -Wnewline-in-string" option, or
432 better yet a "gcc -fno-newline-in-string" option, please? Gcc is
433 also inconsistent between string literals and character constants:
434 you may not embed newlines in character constants; try it, you get
435 a useful diagnostic. --PMiller */
436 if (c == '\n')
437 return P7_NEWLINE;
438
439 if (c == '"')
440 return P7_QUOTES;
441 if (c == '\'')
442 return P7_QUOTE;
443 if (c != '\\')
444 return c;
445 c = phase1_getc ();
446 switch (c)
447 {
448 default:
449 /* Unknown escape sequences really should be an error, but just
450 ignore them, and let the real compiler complain. */
451 phase1_ungetc (c);
452 return '\\';
453
454 case '"':
455 case '\'':
456 case '\\':
457 case '$':
458 return c;
459
460 case 'b':
461 return '\b';
462
463 case 'f':
464 return '\f';
465 case 'n':
466 return '\n';
467 case 'r':
468 return '\r';
469 case 't':
470 return '\t';
471 case 'v':
472 return '\v';
473
474 case 'x':
475 c = phase1_getc ();
476 switch (c)
477 {
478 default:
479 phase1_ungetc (c);
480 phase1_ungetc ('x');
481 return '\\';
482
483 case '0': case '1': case '2': case '3': case '4':
484 case '5': case '6': case '7': case '8': case '9':
485 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
486 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
487 break;
488 }
489 n = 0;
490 for (;;)
491 {
492 switch (c)
493 {
494 default:
495 phase1_ungetc (c);
496 return n;
497
498 case '0': case '1': case '2': case '3': case '4':
499 case '5': case '6': case '7': case '8': case '9':
500 n = n * 16 + c - '0';
501 break;
502
503 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
504 n = n * 16 + 10 + c - 'A';
505 break;
506
507 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
508 n = n * 16 + 10 + c - 'a';
509 break;
510 }
511 c = phase1_getc ();
512 }
513 return n;
514
515 case '0':
516 n = 0;
517 for (j = 0; j < 3; ++j)
518 {
519 n = n * 8 + c - '0';
520 c = phase1_getc ();
521 switch (c)
522 {
523 default:
524 break;
525
526 case '0': case '1': case '2': case '3':
527 case '4': case '5': case '6': case '7':
528 continue;
529 }
530 break;
531 }
532 phase1_ungetc (c);
533 return n;
534
535 case 'u':
536 {
537 unsigned char buf[8];
538
539 n = 0;
540 for (j = 0; j < 4; j++)
541 {
542 int c1 = phase1_getc ();
543
544 if (c1 >= '0' && c1 <= '9')
545 n = (n << 4) + (c1 - '0');
546 else if (c1 >= 'A' && c1 <= 'F')
547 n = (n << 4) + (c1 - 'A' + 10);
548 else if (c1 >= 'a' && c1 <= 'f')
549 n = (n << 4) + (c1 - 'a' + 10);
550 else
551 {
552 phase1_ungetc (c1);
553 while (--j >= 0)
554 phase1_ungetc (buf[j]);
555 phase1_ungetc (c);
556 return '\\';
557 }
558
559 buf[j] = c1;
560 }
561
562 if (n < 0x110000)
563 return UNICODE (n);
564
565 error_with_progname = false;
566 error (0, 0, _("%s:%d: warning: invalid Unicode character"),
567 logical_file_name, line_number);
568 error_with_progname = true;
569
570 while (--j >= 0)
571 phase1_ungetc (buf[j]);
572 phase1_ungetc (c);
573 return '\\';
574 }
575 }
576 }
577
578
/* Hand the character C back to the input.  Phase 7 reads through phase 1,
   so the phase 1 pushback is used.  */
static void
phase7_ungetc (int c)
{
  phase1_ungetc (c);
}
584
585
586 /* 3. Parse each resulting logical line as preprocessing tokens and
587 white space. Preprocessing tokens and Vala tokens don't always
588 match. */
589
590 static token_ty phase3_pushback[2];
591 static int phase3_pushback_length;
592
593
594 static token_type_ty last_token_type;
595
596 static void
phase3_scan_regex()597 phase3_scan_regex ()
598 {
599 int c;
600
601 for (;;)
602 {
603 c = phase1_getc ();
604 if (c == '/')
605 break;
606 if (c == '\\')
607 {
608 c = phase1_getc ();
609 if (c != EOF)
610 continue;
611 }
612 if (c == EOF)
613 {
614 error_with_progname = false;
615 error (0, 0,
616 _("%s:%d: warning: regular expression literal terminated too early"),
617 logical_file_name, line_number);
618 error_with_progname = true;
619 return;
620 }
621 }
622
623 c = phase2_getc ();
624 if (!(c == 'i' || c == 's' || c == 'm' || c == 'x'))
625 phase2_ungetc (c);
626 }
627
628 static void
phase3_get(token_ty * tp)629 phase3_get (token_ty *tp)
630 {
631 static char *buffer;
632 static int bufmax;
633 int bufpos;
634
635 #undef APPEND
636 #define APPEND(c) \
637 do \
638 { \
639 if (bufpos >= bufmax) \
640 { \
641 bufmax = 2 * bufmax + 10; \
642 buffer = xrealloc (buffer, bufmax); \
643 } \
644 buffer[bufpos++] = c; \
645 } \
646 while (0)
647
648 if (phase3_pushback_length)
649 {
650 *tp = phase3_pushback[--phase3_pushback_length];
651 last_token_type = tp->type;
652 return;
653 }
654
655 for (;;)
656 {
657 bool template;
658 bool verbatim;
659 int c;
660
661 tp->line_number = line_number;
662 c = phase2_getc ();
663
664 switch (c)
665 {
666 case EOF:
667 tp->type = last_token_type = token_type_eof;
668 return;
669
670 case '\n':
671 if (last_non_comment_line > last_comment_line)
672 savable_comment_reset ();
673 /* FALLTHROUGH */
674 case ' ':
675 case '\f':
676 case '\t':
677 /* Ignore whitespace and comments. */
678 continue;
679 default:
680 break;
681 }
682
683 last_non_comment_line = tp->line_number;
684 template = false;
685 verbatim = false;
686
687 switch (c)
688 {
689 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
690 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
691 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
692 case 'V': case 'W': case 'X': case 'Y': case 'Z':
693 case '_':
694 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
695 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
696 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
697 case 'v': case 'w': case 'x': case 'y': case 'z':
698 bufpos = 0;
699 for (;;)
700 {
701 APPEND (c);
702 c = phase2_getc ();
703 switch (c)
704 {
705 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
706 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
707 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
708 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
709 case 'Y': case 'Z':
710 case '_':
711 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
712 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
713 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
714 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
715 case 'y': case 'z':
716 case '0': case '1': case '2': case '3': case '4':
717 case '5': case '6': case '7': case '8': case '9':
718 continue;
719
720 default:
721 phase2_ungetc (c);
722 break;
723 }
724 break;
725 }
726 APPEND (0);
727 if (strcmp (buffer, "return") == 0)
728 tp->type = last_token_type = token_type_return;
729 else
730 {
731 tp->string = xstrdup (buffer);
732 tp->type = last_token_type = token_type_symbol;
733 }
734 return;
735
736 case '.':
737 c = phase2_getc ();
738 phase2_ungetc (c);
739 switch (c)
740 {
741 default:
742 tp->string = xstrdup (".");
743 tp->type = last_token_type = token_type_symbol;
744 return;
745
746 case '0': case '1': case '2': case '3': case '4':
747 case '5': case '6': case '7': case '8': case '9':
748 c = '.';
749 break;
750 }
751 /* FALLTHROUGH */
752
753 case '0': case '1': case '2': case '3': case '4':
754 case '5': case '6': case '7': case '8': case '9':
755 /* The preprocessing number token is more "generous" than the C
756 number tokens. This is mostly due to token pasting (another
757 thing we can ignore here). */
758 bufpos = 0;
759 for (;;)
760 {
761 APPEND (c);
762 c = phase2_getc ();
763 switch (c)
764 {
765 case 'e':
766 case 'E':
767 APPEND (c);
768 c = phase2_getc ();
769 if (c != '+' && c != '-')
770 {
771 phase2_ungetc (c);
772 break;
773 }
774 continue;
775
776 case 'A': case 'B': case 'C': case 'D': case 'F':
777 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
778 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
779 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
780 case 'Y': case 'Z':
781 case 'a': case 'b': case 'c': case 'd': case 'f':
782 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
783 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
784 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
785 case 'y': case 'z':
786 case '0': case '1': case '2': case '3': case '4':
787 case '5': case '6': case '7': case '8': case '9':
788 case '.':
789 continue;
790
791 default:
792 phase2_ungetc (c);
793 break;
794 }
795 break;
796 }
797 APPEND (0);
798 tp->type = last_token_type = token_type_number;
799 return;
800
801 case '\'':
802 for (;;)
803 {
804 c = phase7_getc ();
805 if (c == P7_NEWLINE)
806 {
807 error_with_progname = false;
808 error (0, 0, _("%s:%d: warning: unterminated character constant"),
809 logical_file_name, line_number - 1);
810 error_with_progname = true;
811 phase7_ungetc ('\n');
812 break;
813 }
814 if (c == EOF || c == P7_QUOTE)
815 break;
816 }
817 tp->type = last_token_type = token_type_character_constant;
818 return;
819
820 /* Vala provides strings in three different formats.
821
822 Usual string literals:
823 "..."
824 Verbatim string literals:
825 """...""" (where ... can include newlines and double quotes)
826 String templates.
827 @"...", @"""..."""
828
829 Note that, with the current implementation string
830 templates are not subject to translation, because they are
831 inspected at compile time. For example, the following code
832
833 string bar = "bar";
834 string foo = _(@"foo $bar");
835
836 will be translated into the C code, like:
837
838 _(g_strconcat ("foo ", "bar", NULL)); */
839 case '@':
840 c = phase2_getc ();
841 if (c != '"')
842 {
843 phase2_ungetc (c);
844 tp->type = last_token_type = token_type_other;
845 return;
846 }
847 template = true;
848 /* FALLTHROUGH */
849 case '"':
850 {
851 struct mixed_string_buffer msb;
852 int c2 = phase1_getc ();
853
854 if (c2 == '"')
855 {
856 int c3 = phase1_getc ();
857 if (c3 == '"')
858 verbatim = true;
859 else
860 {
861 phase1_ungetc (c3);
862 phase1_ungetc (c2);
863 }
864 }
865 else
866 phase2_ungetc (c2);
867
868 /* Start accumulating the string. */
869 mixed_string_buffer_init (&msb, lc_string,
870 logical_file_name, line_number);
871 if (verbatim)
872 for (;;)
873 {
874 c = phase1_getc ();
875
876 /* Keep line_number in sync. */
877 msb.line_number = line_number;
878
879 if (c == '"')
880 {
881 int c2 = phase1_getc ();
882 if (c2 == '"')
883 {
884 int c3 = phase1_getc ();
885 if (c3 == '"')
886 break;
887 phase1_ungetc (c3);
888 }
889 phase1_ungetc (c2);
890 }
891 if (c == EOF)
892 break;
893 mixed_string_buffer_append_char (&msb, c);
894 }
895 else
896 for (;;)
897 {
898 c = phase7_getc ();
899
900 /* Keep line_number in sync. */
901 msb.line_number = line_number;
902
903 if (c == P7_NEWLINE)
904 {
905 error_with_progname = false;
906 error (0, 0,
907 _("%s:%d: warning: unterminated string literal"),
908 logical_file_name, line_number - 1);
909 error_with_progname = true;
910 phase7_ungetc ('\n');
911 break;
912 }
913 if (c == P7_QUOTES)
914 break;
915 if (c == EOF)
916 break;
917 if (c == P7_QUOTE)
918 c = '\'';
919 if (IS_UNICODE (c))
920 {
921 assert (UNICODE_VALUE (c) >= 0
922 && UNICODE_VALUE (c) < 0x110000);
923 mixed_string_buffer_append_unicode (&msb,
924 UNICODE_VALUE (c));
925 }
926 else
927 mixed_string_buffer_append_char (&msb, c);
928 }
929 /* Done accumulating the string. */
930 if (template)
931 {
932 tp->type = token_type_string_template;
933 mixed_string_buffer_destroy (&msb);
934 }
935 else
936 {
937 tp->type = token_type_string_literal;
938 tp->mixed_string = mixed_string_buffer_result (&msb);
939 tp->comment = add_reference (savable_comment);
940 }
941 last_token_type = tp->type;
942 return;
943 }
944
945 case '/':
946 switch (last_token_type)
947 {
948 case token_type_lparen:
949 case token_type_lbrace:
950 case token_type_assign:
951 case token_type_return:
952 case token_type_plus:
953 case token_type_arithmetic_operator:
954 case token_type_equality_test_operator:
955 case token_type_logic_operator:
956 case token_type_comma:
957 case token_type_question:
958 case token_type_colon:
959 phase3_scan_regex ();
960 tp->type = last_token_type = token_type_regex_literal;
961 break;
962 default:
963 {
964 int c2 = phase2_getc ();
965 if (c2 == '=')
966 tp->type = last_token_type = token_type_assign;
967 else
968 {
969 phase2_ungetc (c2);
970 tp->type = last_token_type = token_type_arithmetic_operator;
971 }
972 break;
973 }
974 }
975 return;
976
977 case '(':
978 tp->type = last_token_type = token_type_lparen;
979 return;
980
981 case ')':
982 tp->type = last_token_type = token_type_rparen;
983 return;
984
985 case '{':
986 tp->type = last_token_type = token_type_lbrace;
987 return;
988
989 case '}':
990 tp->type = last_token_type = token_type_rbrace;
991 return;
992
993 case '+':
994 {
995 int c2 = phase2_getc ();
996 switch (c2)
997 {
998 case '+':
999 tp->type = last_token_type = token_type_other;
1000 break;
1001 case '=':
1002 tp->type = last_token_type = token_type_assign;
1003 break;
1004 default:
1005 phase2_ungetc (c2);
1006 tp->type = last_token_type = token_type_plus;
1007 break;
1008 }
1009 return;
1010 }
1011
1012 case '-':
1013 {
1014 int c2 = phase2_getc ();
1015 switch (c2)
1016 {
1017 case '-':
1018 tp->type = last_token_type = token_type_other;
1019 break;
1020 case '=':
1021 tp->type = last_token_type = token_type_assign;
1022 break;
1023 default:
1024 phase2_ungetc (c2);
1025 tp->type = last_token_type = token_type_arithmetic_operator;
1026 break;
1027 }
1028 return;
1029 }
1030
1031 case '%':
1032 case '^':
1033 {
1034 int c2 = phase2_getc ();
1035 if (c2 == '=')
1036 tp->type = last_token_type = token_type_assign;
1037 else
1038 {
1039 phase2_ungetc (c2);
1040 tp->type = last_token_type = token_type_logic_operator;
1041 }
1042 return;
1043 }
1044
1045 case '=':
1046 {
1047 int c2 = phase2_getc ();
1048 switch (c2)
1049 {
1050 case '=':
1051 tp->type = last_token_type = token_type_equality_test_operator;
1052 break;
1053 case '>':
1054 tp->type = last_token_type = token_type_other;
1055 break;
1056 default:
1057 phase2_ungetc (c2);
1058 tp->type = last_token_type = token_type_assign;
1059 break;
1060 }
1061 return;
1062 }
1063
1064 case '!':
1065 {
1066 int c2 = phase2_getc ();
1067 if (c2 == '=')
1068 tp->type = last_token_type = token_type_equality_test_operator;
1069 else
1070 {
1071 phase2_ungetc (c2);
1072 tp->type = last_token_type = token_type_logic_operator;
1073 }
1074 return;
1075 }
1076
1077 case '>':
1078 case '<':
1079 {
1080 int c2 = phase2_getc ();
1081 if (c2 == '=')
1082 tp->type = last_token_type = token_type_equality_test_operator;
1083 else if (c2 == c)
1084 {
1085 int c3 = phase2_getc ();
1086 if (c3 == '=')
1087 tp->type = last_token_type = token_type_assign;
1088 else
1089 {
1090 phase2_ungetc (c2);
1091 phase2_ungetc (c3);
1092 tp->type = last_token_type = token_type_other;
1093 }
1094 }
1095 else
1096 {
1097 phase2_ungetc (c2);
1098 tp->type = last_token_type = token_type_equality_test_operator;
1099 }
1100 return;
1101 }
1102
1103 case ',':
1104 tp->type = last_token_type = token_type_comma;
1105 return;
1106
1107 case ':':
1108 tp->type = last_token_type = token_type_colon;
1109 return;
1110
1111 case '&':
1112 case '|':
1113 {
1114 int c2 = phase2_getc ();
1115 if (c2 == c)
1116 tp->type = last_token_type = token_type_logic_operator;
1117 else if (c2 == '=')
1118 tp->type = last_token_type = token_type_assign;
1119 else
1120 {
1121 phase2_ungetc (c2);
1122 tp->type = last_token_type = token_type_arithmetic_operator;
1123 }
1124 return;
1125 }
1126
1127 case '?':
1128 {
1129 int c2 = phase2_getc ();
1130 if (c2 == '?')
1131 tp->type = last_token_type = token_type_logic_operator;
1132 else
1133 {
1134 phase2_ungetc (c2);
1135 tp->type = last_token_type = token_type_question;
1136 }
1137 return;
1138 }
1139
1140 default:
1141 tp->type = last_token_type = token_type_other;
1142 return;
1143 }
1144 }
1145 #undef APPEND
1146 }
1147
1148 static void
phase3_unget(token_ty * tp)1149 phase3_unget (token_ty *tp)
1150 {
1151 if (tp->type != token_type_eof)
1152 {
1153 if (phase3_pushback_length == SIZEOF (phase3_pushback))
1154 abort ();
1155 phase3_pushback[phase3_pushback_length++] = *tp;
1156 }
1157 }
1158
1159
1160 /* String concatenation with '+'. */
1161
1162 static void
x_vala_lex(token_ty * tp)1163 x_vala_lex (token_ty *tp)
1164 {
1165 phase3_get (tp);
1166 if (tp->type == token_type_string_literal)
1167 {
1168 mixed_string_ty *sum = tp->mixed_string;
1169
1170 for (;;)
1171 {
1172 token_ty token2;
1173
1174 phase3_get (&token2);
1175 if (token2.type == token_type_plus)
1176 {
1177 token_ty token3;
1178
1179 phase3_get (&token3);
1180 if (token3.type == token_type_string_literal)
1181 {
1182 sum = mixed_string_concat_free1 (sum, token3.mixed_string);
1183
1184 free_token (&token3);
1185 free_token (&token2);
1186 continue;
1187 }
1188 phase3_unget (&token3);
1189 }
1190 phase3_unget (&token2);
1191 break;
1192 }
1193 tp->mixed_string = sum;
1194 }
1195 }
1196
1197
1198 /* ========================= Extracting strings. ========================== */
1199
1200
1201 /* Context lookup table. */
1202 static flag_context_list_table_ty *flag_context_list_table;
1203
1204
1205 /* The file is broken into tokens. Scan the token stream, looking for
1206 a keyword, followed by a left paren, followed by a string. When we
1207 see this sequence, we have something to remember. We assume we are
1208 looking at a valid Vala program, and leave the complaints about the
1209 grammar to the compiler.
1210
1211 Normal handling: Look for
1212 keyword ( ... msgid ... )
1213 keyword msgid
1214 Plural handling: Look for
1215 keyword ( ... msgid ... msgid_plural ... )
1216
1217 We use recursion because the arguments before msgid or between msgid
1218 and msgid_plural can contain subexpressions of the same form. */
1219
1220 /* Extract messages until the next balanced closing parenthesis or bracket.
1221 Extracted messages are added to MLP.
1222 DELIM can be either token_type_rparen or token_type_rbracket, or
1223 token_type_eof to accept both.
1224 Return true upon eof, false upon closing parenthesis or bracket. */
1225 static bool
extract_balanced(message_list_ty * mlp,token_type_ty delim,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1226 extract_balanced (message_list_ty *mlp, token_type_ty delim,
1227 flag_context_ty outer_context,
1228 flag_context_list_iterator_ty context_iter,
1229 struct arglist_parser *argparser)
1230 {
1231 /* Current argument number. */
1232 int arg = 1;
1233 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1234 int state;
1235 /* Parameters of the keyword just seen. Defined only in state 1. */
1236 const struct callshapes *next_shapes = NULL;
1237 /* Context iterator that will be used if the next token is a '('. */
1238 flag_context_list_iterator_ty next_context_iter =
1239 passthrough_context_list_iterator;
1240 /* Current context. */
1241 flag_context_ty inner_context =
1242 inherited_context (outer_context,
1243 flag_context_list_iterator_advance (&context_iter));
1244
1245 /* Start state is 0. */
1246 state = 0;
1247
1248 for (;;)
1249 {
1250 token_ty token;
1251
1252 x_vala_lex (&token);
1253
1254 switch (token.type)
1255 {
1256 case token_type_symbol:
1257 {
1258 void *keyword_value;
1259
1260 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1261 &keyword_value)
1262 == 0)
1263 {
1264 next_shapes = (const struct callshapes *) keyword_value;
1265 state = 1;
1266 }
1267 else
1268 state = 0;
1269 }
1270 next_context_iter =
1271 flag_context_list_iterator (
1272 flag_context_list_table_lookup (
1273 flag_context_list_table,
1274 token.string, strlen (token.string)));
1275 free (token.string);
1276 continue;
1277
1278 case token_type_lparen:
1279 if (extract_balanced (mlp, token_type_rparen,
1280 inner_context, next_context_iter,
1281 arglist_parser_alloc (mlp,
1282 state ? next_shapes : NULL)))
1283 {
1284 arglist_parser_done (argparser, arg);
1285 return true;
1286 }
1287 next_context_iter = null_context_list_iterator;
1288 state = 0;
1289 break;
1290
1291 case token_type_rparen:
1292 if (delim == token_type_rparen || delim == token_type_eof)
1293 {
1294 arglist_parser_done (argparser, arg);
1295 return false;
1296 }
1297
1298 next_context_iter = null_context_list_iterator;
1299 state = 0;
1300 continue;
1301
1302 case token_type_comma:
1303 arg++;
1304 inner_context =
1305 inherited_context (outer_context,
1306 flag_context_list_iterator_advance (
1307 &context_iter));
1308 next_context_iter = passthrough_context_list_iterator;
1309 state = 0;
1310 continue;
1311
1312 case token_type_eof:
1313 arglist_parser_done (argparser, arg);
1314 return true;
1315
1316 case token_type_string_literal:
1317 {
1318 lex_pos_ty pos;
1319
1320 pos.file_name = logical_file_name;
1321 pos.line_number = token.line_number;
1322
1323 if (extract_all)
1324 {
1325 char *string = mixed_string_contents (token.mixed_string);
1326 mixed_string_free (token.mixed_string);
1327 remember_a_message (mlp, NULL, string, true, false,
1328 inner_context, &pos,
1329 NULL, token.comment, false);
1330 }
1331 else
1332 {
1333 /* A string immediately after a symbol means a function call. */
1334 if (state)
1335 {
1336 struct arglist_parser *tmp_argparser;
1337 tmp_argparser = arglist_parser_alloc (mlp, next_shapes);
1338
1339 arglist_parser_remember (tmp_argparser, 1,
1340 token.mixed_string, inner_context,
1341 pos.file_name, pos.line_number,
1342 token.comment, false);
1343 arglist_parser_done (tmp_argparser, 1);
1344 }
1345 else
1346 arglist_parser_remember (argparser, arg,
1347 token.mixed_string, inner_context,
1348 pos.file_name, pos.line_number,
1349 token.comment, false);
1350 }
1351 }
1352 drop_reference (token.comment);
1353 next_context_iter = null_context_list_iterator;
1354 state = 0;
1355 continue;
1356
1357 case token_type_character_constant:
1358 case token_type_lbrace:
1359 case token_type_rbrace:
1360 case token_type_assign:
1361 case token_type_return:
1362 case token_type_plus:
1363 case token_type_arithmetic_operator:
1364 case token_type_equality_test_operator:
1365 case token_type_logic_operator:
1366 case token_type_question:
1367 case token_type_colon:
1368 case token_type_number:
1369 case token_type_string_template:
1370 case token_type_regex_literal:
1371 case token_type_other:
1372 next_context_iter = null_context_list_iterator;
1373 state = 0;
1374 continue;
1375
1376 default:
1377 abort ();
1378 }
1379 }
1380 }
1381
1382 void
extract_vala(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1383 extract_vala (FILE *f,
1384 const char *real_filename, const char *logical_filename,
1385 flag_context_list_table_ty *flag_table,
1386 msgdomain_list_ty *mdlp)
1387 {
1388 message_list_ty *mlp = mdlp->item[0]->messages;
1389
1390 fp = f;
1391 real_file_name = real_filename;
1392 logical_file_name = xstrdup (logical_filename);
1393 line_number = 1;
1394
1395 phase1_pushback_length = 0;
1396
1397 last_comment_line = -1;
1398 last_non_comment_line = -1;
1399
1400 phase3_pushback_length = 0;
1401 last_token_type = token_type_other;
1402
1403 flag_context_list_table = flag_table;
1404
1405 init_keywords ();
1406
1407 /* Eat tokens until eof is seen. When extract_parenthesized returns
1408 due to an unbalanced closing parenthesis, just restart it. */
1409 while (!extract_balanced (mlp, token_type_eof,
1410 null_context, null_context_list_iterator,
1411 arglist_parser_alloc (mlp, NULL)))
1412 ;
1413
1414 fp = NULL;
1415 real_file_name = NULL;
1416 logical_file_name = NULL;
1417 line_number = 0;
1418 }
1419