1 /* xgettext Lisp backend.
2 Copyright (C) 2001-2003, 2005-2009, 2018-2020 Free Software Foundation, Inc.
3
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-lisp.h"
25
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31
32 #include "message.h"
33 #include "xgettext.h"
34 #include "xg-pos.h"
35 #include "xg-mixed-string.h"
36 #include "xg-arglist-context.h"
37 #include "xg-arglist-callshape.h"
38 #include "xg-arglist-parser.h"
39 #include "xg-message.h"
40 #include "error.h"
41 #include "xalloc.h"
42 #include "mem-hash-map.h"
43 #include "gettext.h"
44
45 #define _(s) gettext(s)
46
47
48 /* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2.
49 Since we are interested only in strings and in forms similar to
50 (gettext msgid ...)
51 or (ngettext msgid msgid_plural ...)
52 we make the following simplifications:
53
54 - Assume the keywords and strings are in an ASCII compatible encoding.
55 This means we can read the input file one byte at a time, instead of
56 one character at a time. No need to worry about multibyte characters:
57 If they occur as part of identifiers, they most probably act as
58 constituent characters, and the byte based approach will do the same.
59
60 - Assume the read table is the standard Common Lisp read table.
61 Non-standard read tables are mostly used to read data, not programs.
62
63 - Assume the read table case is :UPCASE, and *READ-BASE* is 10.
64
65 - Don't interpret #n= and #n#, they usually don't appear in programs.
66
67 - Don't interpret #+, #-, they are unlikely to appear in a gettext form.
68
69 The remaining syntax rules are:
70
71 - The syntax code assigned to each character, and how tokens are built
72 up from characters (single escape, multiple escape etc.).
73
74 - Comment syntax: ';' and '#| ... |#'.
75
76 - String syntax: "..." with single escapes.
77
78 - Read macros and dispatch macro character '#'. Needed to be able to
79 tell which is the n-th argument of a function call.
80
81 */
82
83
84 /* ========================= Lexer customization. ========================= */
85
86 /* 'readtable_case' is the case conversion that is applied to non-escaped
87 parts of symbol tokens. In Common Lisp: (readtable-case *readtable*). */
88
89 enum rtcase
90 {
91 case_upcase,
92 case_downcase,
93 case_preserve,
94 case_invert
95 };
96
97 static enum rtcase readtable_case = case_upcase;
98
99 /* 'read_base' is the assumed radix of integers and rational numbers.
100 In Common Lisp: *read-base*. */
101 static int read_base = 10;
102
103 /* 'read_preserve_whitespace' specifies whether a whitespace character
104 that terminates a token must be pushed back on the input stream.
105 We set it to true, because the special newline side effect in read_object()
106 requires that read_object() sees every newline not inside a token. */
107 static bool read_preserve_whitespace = true;
108
109
110 /* ====================== Keyword set customization. ====================== */
111
112 /* If true extract all strings. */
113 static bool extract_all = false;
114
115 static hash_table keywords;
116 static bool default_keywords = true;
117
118
119 void
x_lisp_extract_all()120 x_lisp_extract_all ()
121 {
122 extract_all = true;
123 }
124
125
126 void
x_lisp_keyword(const char * name)127 x_lisp_keyword (const char *name)
128 {
129 if (name == NULL)
130 default_keywords = false;
131 else
132 {
133 const char *end;
134 struct callshape shape;
135 const char *colon;
136 size_t len;
137 char *symname;
138 size_t i;
139
140 if (keywords.table == NULL)
141 hash_init (&keywords, 100);
142
143 split_keywordspec (name, &end, &shape);
144
145 /* The characters between name and end should form a valid Lisp symbol.
146 Extract the symbol name part. */
147 colon = strchr (name, ':');
148 if (colon != NULL && colon < end)
149 {
150 name = colon + 1;
151 if (name < end && *name == ':')
152 name++;
153 colon = strchr (name, ':');
154 if (colon != NULL && colon < end)
155 return;
156 }
157
158 /* Uppercase it. */
159 len = end - name;
160 symname = XNMALLOC (len, char);
161 for (i = 0; i < len; i++)
162 symname[i] =
163 (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]);
164
165 insert_keyword_callshape (&keywords, symname, len, &shape);
166 }
167 }
168
169 /* Finish initializing the keywords hash table.
170 Called after argument processing, before each file is processed. */
171 static void
init_keywords()172 init_keywords ()
173 {
174 if (default_keywords)
175 {
176 /* When adding new keywords here, also update the documentation in
177 xgettext.texi! */
178 x_lisp_keyword ("gettext"); /* I18N:GETTEXT */
179 x_lisp_keyword ("ngettext:1,2"); /* I18N:NGETTEXT */
180 x_lisp_keyword ("gettext-noop");
181 default_keywords = false;
182 }
183 }
184
185 void
init_flag_table_lisp()186 init_flag_table_lisp ()
187 {
188 xgettext_record_flag ("gettext:1:pass-lisp-format");
189 xgettext_record_flag ("ngettext:1:pass-lisp-format");
190 xgettext_record_flag ("ngettext:2:pass-lisp-format");
191 xgettext_record_flag ("gettext-noop:1:pass-lisp-format");
192 xgettext_record_flag ("format:2:lisp-format");
193 }
194
195
196 /* ======================== Reading of characters. ======================== */
197
198 /* The input file stream. */
199 static FILE *fp;
200
201
202 /* Fetch the next character from the input file. */
203 static int
do_getc()204 do_getc ()
205 {
206 int c = getc (fp);
207
208 if (c == EOF)
209 {
210 if (ferror (fp))
211 error (EXIT_FAILURE, errno,
212 _("error while reading \"%s\""), real_file_name);
213 }
214 else if (c == '\n')
215 line_number++;
216
217 return c;
218 }
219
220 /* Put back the last fetched character, not EOF. */
221 static void
do_ungetc(int c)222 do_ungetc (int c)
223 {
224 if (c == '\n')
225 line_number--;
226 ungetc (c, fp);
227 }
228
229
230 /* ========= Reading of tokens. See CLHS 2.2 "Reader Algorithm". ========= */
231
232
233 /* Syntax code. See CLHS 2.1.4 "Character Syntax Types". */
234
235 enum syntax_code
236 {
237 syntax_illegal, /* non-printable, except whitespace */
238 syntax_single_esc, /* '\' (single escape) */
239 syntax_multi_esc, /* '|' (multiple escape) */
240 syntax_constituent, /* everything else (constituent) */
241 syntax_whitespace, /* TAB,LF,FF,CR,' ' (whitespace) */
242 syntax_eof, /* EOF */
243 syntax_t_macro, /* '()'"' (terminating macro) */
244 syntax_nt_macro /* '#' (non-terminating macro) */
245 };
246
247 /* Returns the syntax code of a character. */
248 static enum syntax_code
syntax_code_of(unsigned char c)249 syntax_code_of (unsigned char c)
250 {
251 switch (c)
252 {
253 case '\\':
254 return syntax_single_esc;
255 case '|':
256 return syntax_multi_esc;
257 case '\t': case '\n': case '\f': case '\r': case ' ':
258 return syntax_whitespace;
259 case '(': case ')': case '\'': case '"': case ',': case ';': case '`':
260 return syntax_t_macro;
261 case '#':
262 return syntax_nt_macro;
263 default:
264 if (c < ' ' && c != '\b')
265 return syntax_illegal;
266 else
267 return syntax_constituent;
268 }
269 }
270
271 struct char_syntax
272 {
273 int ch; /* character */
274 enum syntax_code scode; /* syntax code */
275 };
276
277 /* Returns the next character and its syntax code. */
278 static void
read_char_syntax(struct char_syntax * p)279 read_char_syntax (struct char_syntax *p)
280 {
281 int c = do_getc ();
282
283 p->ch = c;
284 p->scode = (c == EOF ? syntax_eof : syntax_code_of (c));
285 }
286
287 /* Every character in a token has an attribute assigned. The attributes
288 help during interpretation of the token. See
289 CLHS 2.3 "Interpretation of Tokens" for the possible interpretations,
290 and CLHS 2.1.4.2 "Constituent Traits". */
291
292 enum attribute
293 {
294 a_illg, /* invalid constituent */
295 a_pack_m, /* ':' package marker */
296 a_alpha, /* normal alphabetic */
297 a_escaped, /* alphabetic but not subject to case conversion */
298 a_ratio, /* '/' */
299 a_dot, /* '.' */
300 a_sign, /* '+-' */
301 a_extens, /* '_^' extension characters */
302 a_digit, /* '0123456789' */
303 a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */
304 a_expodigit, /* 'esfdlESFDL' below base */
305 a_letter, /* 'A'-'Z','a'-'z', except 'esfdlESFDL' */
306 a_expo /* 'esfdlESFDL' */
307 };
308
309 #define is_letter_attribute(a) ((a) >= a_letter)
310 #define is_number_attribute(a) ((a) >= a_ratio)
311
312 /* Returns the attribute of a character, assuming base 10. */
313 static enum attribute
attribute_of(unsigned char c)314 attribute_of (unsigned char c)
315 {
316 switch (c)
317 {
318 case ':':
319 return a_pack_m;
320 case '/':
321 return a_ratio;
322 case '.':
323 return a_dot;
324 case '+': case '-':
325 return a_sign;
326 case '_': case '^':
327 return a_extens;
328 case '0': case '1': case '2': case '3': case '4':
329 case '5': case '6': case '7': case '8': case '9':
330 return a_digit;
331 case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j':
332 case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
333 case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
334 case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J':
335 case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
336 case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
337 return a_letter;
338 case 'e': case 's': case 'd': case 'f': case 'l':
339 case 'E': case 'S': case 'D': case 'F': case 'L':
340 return a_expo;
341 default:
342 /* Treat everything as valid. Never return a_illg. */
343 return a_alpha;
344 }
345 }
346
347 struct token_char
348 {
349 unsigned char ch; /* character */
350 unsigned char attribute; /* attribute */
351 };
352
353 /* A token consists of a sequence of characters with associated attribute. */
354 struct token
355 {
356 int allocated; /* number of allocated 'token_char's */
357 int charcount; /* number of used 'token_char's */
358 struct token_char *chars; /* the token's constituents */
359 bool with_escape; /* whether single-escape or multiple escape occurs */
360 };
361
362 /* Initialize a 'struct token'. */
363 static inline void
init_token(struct token * tp)364 init_token (struct token *tp)
365 {
366 tp->allocated = 10;
367 tp->chars = XNMALLOC (tp->allocated, struct token_char);
368 tp->charcount = 0;
369 }
370
371 /* Free the memory pointed to by a 'struct token'. */
372 static inline void
free_token(struct token * tp)373 free_token (struct token *tp)
374 {
375 free (tp->chars);
376 }
377
378 /* Ensure there is enough room in the token for one more character. */
379 static inline void
grow_token(struct token * tp)380 grow_token (struct token *tp)
381 {
382 if (tp->charcount == tp->allocated)
383 {
384 tp->allocated *= 2;
385 tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char));
386 }
387 }
388
389 /* Read the next token. If 'first' is given, it points to the first
390 character, which has already been read.
391 The algorithm follows CLHS 2.2 "Reader Algorithm". */
392 static void
read_token(struct token * tp,const struct char_syntax * first)393 read_token (struct token *tp, const struct char_syntax *first)
394 {
395 bool multiple_escape_flag;
396 struct char_syntax curr;
397
398 init_token (tp);
399 tp->with_escape = false;
400
401 multiple_escape_flag = false;
402 if (first)
403 curr = *first;
404 else
405 read_char_syntax (&curr);
406
407 for (;; read_char_syntax (&curr))
408 {
409 switch (curr.scode)
410 {
411 case syntax_illegal:
412 /* Invalid input. Be tolerant, no error message. */
413 do_ungetc (curr.ch);
414 return;
415
416 case syntax_single_esc:
417 tp->with_escape = true;
418 read_char_syntax (&curr);
419 if (curr.scode == syntax_eof)
420 /* Invalid input. Be tolerant, no error message. */
421 return;
422 grow_token (tp);
423 tp->chars[tp->charcount].ch = curr.ch;
424 tp->chars[tp->charcount].attribute = a_escaped;
425 tp->charcount++;
426 break;
427
428 case syntax_multi_esc:
429 multiple_escape_flag = !multiple_escape_flag;
430 tp->with_escape = true;
431 break;
432
433 case syntax_constituent:
434 case syntax_nt_macro:
435 grow_token (tp);
436 if (multiple_escape_flag)
437 {
438 tp->chars[tp->charcount].ch = curr.ch;
439 tp->chars[tp->charcount].attribute = a_escaped;
440 tp->charcount++;
441 }
442 else
443 {
444 tp->chars[tp->charcount].ch = curr.ch;
445 tp->chars[tp->charcount].attribute = attribute_of (curr.ch);
446 tp->charcount++;
447 }
448 break;
449
450 case syntax_whitespace:
451 case syntax_t_macro:
452 if (multiple_escape_flag)
453 {
454 grow_token (tp);
455 tp->chars[tp->charcount].ch = curr.ch;
456 tp->chars[tp->charcount].attribute = a_escaped;
457 tp->charcount++;
458 }
459 else
460 {
461 if (curr.scode != syntax_whitespace || read_preserve_whitespace)
462 do_ungetc (curr.ch);
463 return;
464 }
465 break;
466
467 case syntax_eof:
468 if (multiple_escape_flag)
469 /* Invalid input. Be tolerant, no error message. */
470 ;
471 return;
472 }
473 }
474 }
475
476 /* A potential number is a token which
477 1. consists only of digits, '+','-','/','^','_','.' and number markers.
478 The base for digits is context dependent, but always 10 if a dot '.'
479 occurs. A number marker is a non-digit letter which is not adjacent
480 to a non-digit letter.
481 2. has at least one digit.
482 3. starts with a digit, '+','-','.','^' or '_'.
483 4. does not end with '+' or '-'.
484 See CLHS 2.3.1.1 "Potential Numbers as Tokens".
485 */
486
487 static inline bool
has_a_dot(const struct token * tp)488 has_a_dot (const struct token *tp)
489 {
490 int n = tp->charcount;
491 int i;
492
493 for (i = 0; i < n; i++)
494 if (tp->chars[i].attribute == a_dot)
495 return true;
496 return false;
497 }
498
499 static inline bool
all_a_number(const struct token * tp)500 all_a_number (const struct token *tp)
501 {
502 int n = tp->charcount;
503 int i;
504
505 for (i = 0; i < n; i++)
506 if (!is_number_attribute (tp->chars[i].attribute))
507 return false;
508 return true;
509 }
510
511 static inline void
a_letter_to_digit(const struct token * tp,int base)512 a_letter_to_digit (const struct token *tp, int base)
513 {
514 int n = tp->charcount;
515 int i;
516
517 for (i = 0; i < n; i++)
518 if (is_letter_attribute (tp->chars[i].attribute))
519 {
520 int c = tp->chars[i].ch;
521
522 if (c >= 'a')
523 c -= 'a' - 'A';
524 if (c - 'A' + 10 < base)
525 tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit,
526 a_expo -> a_expodigit */
527 }
528 }
529
530 static inline bool
has_a_digit(const struct token * tp)531 has_a_digit (const struct token *tp)
532 {
533 int n = tp->charcount;
534 int i;
535
536 for (i = 0; i < n; i++)
537 if (tp->chars[i].attribute == a_digit
538 || tp->chars[i].attribute == a_letterdigit
539 || tp->chars[i].attribute == a_expodigit)
540 return true;
541 return false;
542 }
543
544 static inline bool
has_adjacent_letters(const struct token * tp)545 has_adjacent_letters (const struct token *tp)
546 {
547 int n = tp->charcount;
548 int i;
549
550 for (i = 1; i < n; i++)
551 if (is_letter_attribute (tp->chars[i-1].attribute)
552 && is_letter_attribute (tp->chars[i].attribute))
553 return true;
554 return false;
555 }
556
557 static bool
is_potential_number(const struct token * tp,int * basep)558 is_potential_number (const struct token *tp, int *basep)
559 {
560 /* CLHS 2.3.1.1.1:
561 "A potential number cannot contain any escape characters." */
562 if (tp->with_escape)
563 return false;
564
565 if (has_a_dot (tp))
566 *basep = 10;
567
568 if (!all_a_number (tp))
569 return false;
570
571 a_letter_to_digit (tp, *basep);
572
573 if (!has_a_digit (tp))
574 return false;
575
576 if (has_adjacent_letters (tp))
577 return false;
578
579 if (!(tp->chars[0].attribute >= a_dot
580 && tp->chars[0].attribute <= a_expodigit))
581 return false;
582
583 if (tp->chars[tp->charcount - 1].attribute == a_sign)
584 return false;
585
586 return true;
587 }
588
589 /* A number is one of integer, ratio, float. Each has a particular syntax.
590 See CLHS 2.3.1 "Numbers as Tokens".
591 But note a mistake: The exponent rule should read:
592 exponent ::= exponent-marker [sign] {decimal-digit}+
593 (see 22.1.3.1.3 "Printing Floats"). */
594
595 enum number_type
596 {
597 n_none,
598 n_integer,
599 n_ratio,
600 n_float
601 };
602
603 static enum number_type
is_number(const struct token * tp,int * basep)604 is_number (const struct token *tp, int *basep)
605 {
606 struct token_char *ptr_limit;
607 struct token_char *ptr1;
608
609 if (!is_potential_number (tp, basep))
610 return n_none;
611
612 /* is_potential_number guarantees
613 - all attributes are >= a_ratio,
614 - there is at least one a_digit or a_letterdigit or a_expodigit, and
615 - if there is an a_dot, then *basep = 10. */
616
617 ptr1 = &tp->chars[0];
618 ptr_limit = &tp->chars[tp->charcount];
619
620 if (ptr1->attribute == a_sign)
621 ptr1++;
622
623 /* Test for syntax
624 * { a_sign | }
625 * { a_digit < base }+ { a_ratio { a_digit < base }+ | }
626 */
627 {
628 bool seen_a_ratio = false;
629 bool seen_a_digit = false; /* seen a digit in last digit block? */
630 struct token_char *ptr;
631
632 for (ptr = ptr1;; ptr++)
633 {
634 if (ptr >= ptr_limit)
635 {
636 if (!seen_a_digit)
637 break;
638 if (seen_a_ratio)
639 return n_ratio;
640 else
641 return n_integer;
642 }
643 if (ptr->attribute == a_digit
644 || ptr->attribute == a_letterdigit
645 || ptr->attribute == a_expodigit)
646 {
647 int c = ptr->ch;
648
649 c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10);
650 if (c >= *basep)
651 break;
652 seen_a_digit = true;
653 }
654 else if (ptr->attribute == a_ratio)
655 {
656 if (seen_a_ratio || !seen_a_digit)
657 break;
658 seen_a_ratio = true;
659 seen_a_digit = false;
660 }
661 else
662 break;
663 }
664 }
665
666 /* Test for syntax
667 * { a_sign | }
668 * { a_digit }* { a_dot { a_digit }* | }
669 * { a_expo { a_sign | } { a_digit }+ | }
670 *
671 * If there is an exponent part, there must be digits before the dot or
672 * after the dot. The result is a float.
673 * If there is no exponen:
674 * If there is no dot, it would an integer in base 10, but is has already
675 * been verified to not be an integer in the current base.
676 * If there is a dot:
677 * If there are digits after the dot, it's a float.
678 * Otherwise, if there are digits before the dot, it's an integer.
679 */
680 *basep = 10;
681 {
682 bool seen_a_dot = false;
683 bool seen_a_dot_with_leading_digits = false;
684 bool seen_a_digit = false; /* seen a digit in last digit block? */
685 struct token_char *ptr;
686
687 for (ptr = ptr1;; ptr++)
688 {
689 if (ptr >= ptr_limit)
690 {
691 /* no exponent */
692 if (!seen_a_dot)
693 return n_none;
694 if (seen_a_digit)
695 return n_float;
696 if (seen_a_dot_with_leading_digits)
697 return n_integer;
698 else
699 return n_none;
700 }
701 if (ptr->attribute == a_digit)
702 {
703 seen_a_digit = true;
704 }
705 else if (ptr->attribute == a_dot)
706 {
707 if (seen_a_dot)
708 return n_none;
709 seen_a_dot = true;
710 if (seen_a_digit)
711 seen_a_dot_with_leading_digits = true;
712 seen_a_digit = false;
713 }
714 else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit)
715 break;
716 else
717 return n_none;
718 }
719 ptr++;
720 if (!seen_a_dot_with_leading_digits || !seen_a_digit)
721 return n_none;
722 if (ptr >= ptr_limit)
723 return n_none;
724 if (ptr->attribute == a_sign)
725 ptr++;
726 seen_a_digit = false;
727 for (;; ptr++)
728 {
729 if (ptr >= ptr_limit)
730 break;
731 if (ptr->attribute != a_digit)
732 return n_none;
733 seen_a_digit = true;
734 }
735 if (!seen_a_digit)
736 return n_none;
737 return n_float;
738 }
739 }
740
741 /* A token representing a symbol must be case converted.
742 For portability, we convert only ASCII characters here. */
743
744 static void
upcase_token(struct token * tp)745 upcase_token (struct token *tp)
746 {
747 int n = tp->charcount;
748 int i;
749
750 for (i = 0; i < n; i++)
751 if (tp->chars[i].attribute != a_escaped)
752 {
753 unsigned char c = tp->chars[i].ch;
754 if (c >= 'a' && c <= 'z')
755 tp->chars[i].ch = c - 'a' + 'A';
756 }
757 }
758
759 static void
downcase_token(struct token * tp)760 downcase_token (struct token *tp)
761 {
762 int n = tp->charcount;
763 int i;
764
765 for (i = 0; i < n; i++)
766 if (tp->chars[i].attribute != a_escaped)
767 {
768 unsigned char c = tp->chars[i].ch;
769 if (c >= 'A' && c <= 'Z')
770 tp->chars[i].ch = c - 'A' + 'a';
771 }
772 }
773
774 static void
case_convert_token(struct token * tp)775 case_convert_token (struct token *tp)
776 {
777 int n = tp->charcount;
778 int i;
779
780 switch (readtable_case)
781 {
782 case case_upcase:
783 upcase_token (tp);
784 break;
785
786 case case_downcase:
787 downcase_token (tp);
788 break;
789
790 case case_preserve:
791 break;
792
793 case case_invert:
794 {
795 bool seen_uppercase = false;
796 bool seen_lowercase = false;
797 for (i = 0; i < n; i++)
798 if (tp->chars[i].attribute != a_escaped)
799 {
800 unsigned char c = tp->chars[i].ch;
801 if (c >= 'a' && c <= 'z')
802 seen_lowercase = true;
803 if (c >= 'A' && c <= 'Z')
804 seen_uppercase = true;
805 }
806 if (seen_uppercase)
807 {
808 if (!seen_lowercase)
809 downcase_token (tp);
810 }
811 else
812 {
813 if (seen_lowercase)
814 upcase_token (tp);
815 }
816 }
817 break;
818 }
819 }
820
821
822 /* ========================= Accumulating comments ========================= */
823
824
825 static char *buffer;
826 static size_t bufmax;
827 static size_t buflen;
828
829 static inline void
comment_start()830 comment_start ()
831 {
832 buflen = 0;
833 }
834
835 static inline void
comment_add(int c)836 comment_add (int c)
837 {
838 if (buflen >= bufmax)
839 {
840 bufmax = 2 * bufmax + 10;
841 buffer = xrealloc (buffer, bufmax);
842 }
843 buffer[buflen++] = c;
844 }
845
846 static inline void
comment_line_end(size_t chars_to_remove)847 comment_line_end (size_t chars_to_remove)
848 {
849 buflen -= chars_to_remove;
850 while (buflen >= 1
851 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
852 --buflen;
853 if (chars_to_remove == 0 && buflen >= bufmax)
854 {
855 bufmax = 2 * bufmax + 10;
856 buffer = xrealloc (buffer, bufmax);
857 }
858 buffer[buflen] = '\0';
859 savable_comment_add (buffer);
860 }
861
862
863 /* These are for tracking whether comments count as immediately before
864 keyword. */
865 static int last_comment_line;
866 static int last_non_comment_line;
867
868
869 /* ========================= Accumulating messages ========================= */
870
871
872 static message_list_ty *mlp;
873
874
875 /* ============== Reading of objects. See CLHS 2 "Syntax". ============== */
876
877
878 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
879 Other objects need not to be represented precisely. */
880 enum object_type
881 {
882 t_symbol, /* symbol */
883 t_string, /* string */
884 t_other, /* other kind of real object */
885 t_dot, /* '.' pseudo object */
886 t_close, /* ')' pseudo object */
887 t_eof /* EOF marker */
888 };
889
890 struct object
891 {
892 enum object_type type;
893 struct token *token; /* for t_symbol and t_string */
894 int line_number_at_start; /* for t_string */
895 };
896
897 /* Free the memory pointed to by a 'struct object'. */
898 static inline void
free_object(struct object * op)899 free_object (struct object *op)
900 {
901 if (op->type == t_symbol || op->type == t_string)
902 {
903 free_token (op->token);
904 free (op->token);
905 }
906 }
907
908 /* Convert a t_symbol/t_string token to a char*. */
909 static char *
string_of_object(const struct object * op)910 string_of_object (const struct object *op)
911 {
912 char *str;
913 const struct token_char *p;
914 char *q;
915 int n;
916
917 if (!(op->type == t_symbol || op->type == t_string))
918 abort ();
919 n = op->token->charcount;
920 str = XNMALLOC (n + 1, char);
921 q = str;
922 for (p = op->token->chars; n > 0; p++, n--)
923 *q++ = p->ch;
924 *q = '\0';
925 return str;
926 }
927
928 /* Context lookup table. */
929 static flag_context_list_table_ty *flag_context_list_table;
930
931 /* Read the next object. */
932 static void
read_object(struct object * op,flag_context_ty outer_context)933 read_object (struct object *op, flag_context_ty outer_context)
934 {
935 for (;;)
936 {
937 struct char_syntax curr;
938
939 read_char_syntax (&curr);
940
941 switch (curr.scode)
942 {
943 case syntax_eof:
944 op->type = t_eof;
945 return;
946
947 case syntax_whitespace:
948 if (curr.ch == '\n')
949 /* Comments assumed to be grouped with a message must immediately
950 precede it, with no non-whitespace token on a line between
951 both. */
952 if (last_non_comment_line > last_comment_line)
953 savable_comment_reset ();
954 continue;
955
956 case syntax_illegal:
957 op->type = t_other;
958 return;
959
960 case syntax_single_esc:
961 case syntax_multi_esc:
962 case syntax_constituent:
963 /* Start reading a token. */
964 op->token = XMALLOC (struct token);
965 read_token (op->token, &curr);
966 last_non_comment_line = line_number;
967
968 /* Interpret the token. */
969
970 /* Dots. */
971 if (!op->token->with_escape
972 && op->token->charcount == 1
973 && op->token->chars[0].attribute == a_dot)
974 {
975 free_token (op->token);
976 free (op->token);
977 op->type = t_dot;
978 return;
979 }
980 /* Tokens consisting entirely of dots are illegal, but be tolerant
981 here. */
982
983 /* Number. */
984 {
985 int base = read_base;
986
987 if (is_number (op->token, &base) != n_none)
988 {
989 free_token (op->token);
990 free (op->token);
991 op->type = t_other;
992 return;
993 }
994 }
995
996 /* We interpret all other tokens as symbols (including 'reserved
997 tokens', i.e. potential numbers which are not numbers). */
998 case_convert_token (op->token);
999 op->type = t_symbol;
1000 return;
1001
1002 case syntax_t_macro:
1003 case syntax_nt_macro:
1004 /* Read a macro. */
1005 switch (curr.ch)
1006 {
1007 case '(':
1008 {
1009 int arg = 0; /* Current argument number. */
1010 flag_context_list_iterator_ty context_iter;
1011 const struct callshapes *shapes = NULL;
1012 struct arglist_parser *argparser = NULL;
1013
1014 for (;; arg++)
1015 {
1016 struct object inner;
1017 flag_context_ty inner_context;
1018
1019 if (arg == 0)
1020 inner_context = null_context;
1021 else
1022 inner_context =
1023 inherited_context (outer_context,
1024 flag_context_list_iterator_advance (
1025 &context_iter));
1026
1027 read_object (&inner, inner_context);
1028
1029 /* Recognize end of list. */
1030 if (inner.type == t_close)
1031 {
1032 op->type = t_other;
1033 /* Don't bother converting "()" to "NIL". */
1034 last_non_comment_line = line_number;
1035 if (argparser != NULL)
1036 arglist_parser_done (argparser, arg);
1037 return;
1038 }
1039
1040 /* Dots are not allowed in every position.
1041 But be tolerant. */
1042
1043 /* EOF inside list is illegal.
1044 But be tolerant. */
1045 if (inner.type == t_eof)
1046 break;
1047
1048 if (arg == 0)
1049 {
1050 /* This is the function position. */
1051 if (inner.type == t_symbol)
1052 {
1053 char *symbol_name = string_of_object (&inner);
1054 int i;
1055 int prefix_len;
1056 void *keyword_value;
1057
1058 /* Omit any package name. */
1059 i = inner.token->charcount;
1060 while (i > 0
1061 && inner.token->chars[i-1].attribute != a_pack_m)
1062 i--;
1063 prefix_len = i;
1064
1065 if (hash_find_entry (&keywords,
1066 symbol_name + prefix_len,
1067 strlen (symbol_name + prefix_len),
1068 &keyword_value)
1069 == 0)
1070 shapes = (const struct callshapes *) keyword_value;
1071
1072 argparser = arglist_parser_alloc (mlp, shapes);
1073
1074 context_iter =
1075 flag_context_list_iterator (
1076 flag_context_list_table_lookup (
1077 flag_context_list_table,
1078 symbol_name, strlen (symbol_name)));
1079
1080 free (symbol_name);
1081 }
1082 else
1083 context_iter = null_context_list_iterator;
1084 }
1085 else
1086 {
1087 /* These are the argument positions. */
1088 if (argparser != NULL && inner.type == t_string)
1089 {
1090 char *s = string_of_object (&inner);
1091 mixed_string_ty *ms =
1092 mixed_string_alloc_simple (s, lc_string,
1093 logical_file_name,
1094 inner.line_number_at_start);
1095 free (s);
1096 arglist_parser_remember (argparser, arg, ms,
1097 inner_context,
1098 logical_file_name,
1099 inner.line_number_at_start,
1100 savable_comment, false);
1101 }
1102 }
1103
1104 free_object (&inner);
1105 }
1106
1107 if (argparser != NULL)
1108 arglist_parser_done (argparser, arg);
1109 }
1110 op->type = t_other;
1111 last_non_comment_line = line_number;
1112 return;
1113
1114 case ')':
1115 /* Tell the caller about the end of list.
1116 Unmatched closing parenthesis is illegal.
1117 But be tolerant. */
1118 op->type = t_close;
1119 last_non_comment_line = line_number;
1120 return;
1121
1122 case ',':
1123 {
1124 int c = do_getc ();
1125 /* The ,@ handling inside lists is wrong anyway, because
1126 ,@form expands to an unknown number of elements. */
1127 if (c != EOF && c != '@' && c != '.')
1128 do_ungetc (c);
1129 }
1130 /*FALLTHROUGH*/
1131 case '\'':
1132 case '`':
1133 {
1134 struct object inner;
1135
1136 read_object (&inner, null_context);
1137
1138 /* Dots and EOF are not allowed here. But be tolerant. */
1139
1140 free_object (&inner);
1141
1142 op->type = t_other;
1143 last_non_comment_line = line_number;
1144 return;
1145 }
1146
1147 case ';':
1148 {
1149 bool all_semicolons = true;
1150
1151 last_comment_line = line_number;
1152 comment_start ();
1153 for (;;)
1154 {
1155 int c = do_getc ();
1156 if (c == EOF || c == '\n')
1157 break;
1158 if (c != ';')
1159 all_semicolons = false;
1160 if (!all_semicolons)
1161 {
1162 /* We skip all leading white space, but not EOLs. */
1163 if (!(buflen == 0 && (c == ' ' || c == '\t')))
1164 comment_add (c);
1165 }
1166 }
1167 comment_line_end (0);
1168 continue;
1169 }
1170
1171 case '"':
1172 {
1173 op->token = XMALLOC (struct token);
1174 init_token (op->token);
1175 op->line_number_at_start = line_number;
1176 for (;;)
1177 {
1178 int c = do_getc ();
1179 if (c == EOF)
1180 /* Invalid input. Be tolerant, no error message. */
1181 break;
1182 if (c == '"')
1183 break;
1184 if (c == '\\') /* syntax_single_esc */
1185 {
1186 c = do_getc ();
1187 if (c == EOF)
1188 /* Invalid input. Be tolerant, no error message. */
1189 break;
1190 }
1191 grow_token (op->token);
1192 op->token->chars[op->token->charcount++].ch = c;
1193 }
1194 op->type = t_string;
1195
1196 if (extract_all)
1197 {
1198 lex_pos_ty pos;
1199
1200 pos.file_name = logical_file_name;
1201 pos.line_number = op->line_number_at_start;
1202 remember_a_message (mlp, NULL, string_of_object (op), false,
1203 false, null_context, &pos,
1204 NULL, savable_comment, false);
1205 }
1206 last_non_comment_line = line_number;
1207 return;
1208 }
1209
1210 case '#':
1211 /* Dispatch macro handling. */
1212 {
1213 int c;
1214
1215 for (;;)
1216 {
1217 c = do_getc ();
1218 if (c == EOF)
1219 /* Invalid input. Be tolerant, no error message. */
1220 {
1221 op->type = t_other;
1222 return;
1223 }
1224 if (!(c >= '0' && c <= '9'))
1225 break;
1226 }
1227
1228 switch (c)
1229 {
1230 case '(':
1231 case '"':
1232 do_ungetc (c);
1233 /*FALLTHROUGH*/
1234 case '\'':
1235 case ':':
1236 case '.':
1237 case ',':
1238 case 'A': case 'a':
1239 case 'C': case 'c':
1240 case 'P': case 'p':
1241 case 'S': case 's':
1242 {
1243 struct object inner;
1244 read_object (&inner, null_context);
1245 /* Dots and EOF are not allowed here.
1246 But be tolerant. */
1247 free_object (&inner);
1248 op->type = t_other;
1249 last_non_comment_line = line_number;
1250 return;
1251 }
1252
1253 case '|':
1254 {
1255 int depth = 0;
1256 int c;
1257
1258 comment_start ();
1259 c = do_getc ();
1260 for (;;)
1261 {
1262 if (c == EOF)
1263 break;
1264 if (c == '|')
1265 {
1266 c = do_getc ();
1267 if (c == EOF)
1268 break;
1269 if (c == '#')
1270 {
1271 if (depth == 0)
1272 {
1273 comment_line_end (0);
1274 break;
1275 }
1276 depth--;
1277 comment_add ('|');
1278 comment_add ('#');
1279 c = do_getc ();
1280 }
1281 else
1282 comment_add ('|');
1283 }
1284 else if (c == '#')
1285 {
1286 c = do_getc ();
1287 if (c == EOF)
1288 break;
1289 comment_add ('#');
1290 if (c == '|')
1291 {
1292 depth++;
1293 comment_add ('|');
1294 c = do_getc ();
1295 }
1296 }
1297 else
1298 {
1299 /* We skip all leading white space. */
1300 if (!(buflen == 0 && (c == ' ' || c == '\t')))
1301 comment_add (c);
1302 if (c == '\n')
1303 {
1304 comment_line_end (1);
1305 comment_start ();
1306 }
1307 c = do_getc ();
1308 }
1309 }
1310 if (c == EOF)
1311 {
1312 /* EOF not allowed here. But be tolerant. */
1313 op->type = t_eof;
1314 return;
1315 }
1316 last_comment_line = line_number;
1317 continue;
1318 }
1319
1320 case '\\':
1321 {
1322 struct token token;
1323 struct char_syntax first;
1324 first.ch = '\\';
1325 first.scode = syntax_single_esc;
1326 read_token (&token, &first);
1327 free_token (&token);
1328 op->type = t_other;
1329 last_non_comment_line = line_number;
1330 return;
1331 }
1332
1333 case 'B': case 'b':
1334 case 'O': case 'o':
1335 case 'X': case 'x':
1336 case 'R': case 'r':
1337 case '*':
1338 {
1339 struct token token;
1340 read_token (&token, NULL);
1341 free_token (&token);
1342 op->type = t_other;
1343 last_non_comment_line = line_number;
1344 return;
1345 }
1346
1347 case '=':
1348 /* Ignore read labels. */
1349 continue;
1350
1351 case '#':
1352 /* Don't bother looking up the corresponding object. */
1353 op->type = t_other;
1354 last_non_comment_line = line_number;
1355 return;
1356
1357 case '+':
1358 case '-':
1359 /* Simply assume every feature expression is true. */
1360 {
1361 struct object inner;
1362 read_object (&inner, null_context);
1363 /* Dots and EOF are not allowed here.
1364 But be tolerant. */
1365 free_object (&inner);
1366 continue;
1367 }
1368
1369 default:
1370 op->type = t_other;
1371 last_non_comment_line = line_number;
1372 return;
1373 }
1374 /*NOTREACHED*/
1375 abort ();
1376 }
1377
1378 default:
1379 /*NOTREACHED*/
1380 abort ();
1381 }
1382
1383 default:
1384 /*NOTREACHED*/
1385 abort ();
1386 }
1387 }
1388 }
1389
1390
1391 void
extract_lisp(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1392 extract_lisp (FILE *f,
1393 const char *real_filename, const char *logical_filename,
1394 flag_context_list_table_ty *flag_table,
1395 msgdomain_list_ty *mdlp)
1396 {
1397 mlp = mdlp->item[0]->messages;
1398
1399 fp = f;
1400 real_file_name = real_filename;
1401 logical_file_name = xstrdup (logical_filename);
1402 line_number = 1;
1403
1404 last_comment_line = -1;
1405 last_non_comment_line = -1;
1406
1407 flag_context_list_table = flag_table;
1408
1409 init_keywords ();
1410
1411 /* Eat tokens until eof is seen. When read_object returns
1412 due to an unbalanced closing parenthesis, just restart it. */
1413 do
1414 {
1415 struct object toplevel_object;
1416
1417 read_object (&toplevel_object, null_context);
1418
1419 if (toplevel_object.type == t_eof)
1420 break;
1421
1422 free_object (&toplevel_object);
1423 }
1424 while (!feof (fp));
1425
1426 /* Close scanner. */
1427 fp = NULL;
1428 real_file_name = NULL;
1429 logical_file_name = NULL;
1430 line_number = 0;
1431 }
1432