• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* xgettext Lisp backend.
2    Copyright (C) 2001-2003, 2005-2009, 2018-2020 Free Software Foundation, Inc.
3 
4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2001.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 
23 /* Specification.  */
24 #include "x-lisp.h"
25 
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 
32 #include "message.h"
33 #include "xgettext.h"
34 #include "xg-pos.h"
35 #include "xg-mixed-string.h"
36 #include "xg-arglist-context.h"
37 #include "xg-arglist-callshape.h"
38 #include "xg-arglist-parser.h"
39 #include "xg-message.h"
40 #include "error.h"
41 #include "xalloc.h"
42 #include "mem-hash-map.h"
43 #include "gettext.h"
44 
45 #define _(s) gettext(s)
46 
47 
48 /* The Common Lisp syntax is described in the Common Lisp HyperSpec, chapter 2.
49    Since we are interested only in strings and in forms similar to
50         (gettext msgid ...)
51    or   (ngettext msgid msgid_plural ...)
52    we make the following simplifications:
53 
54    - Assume the keywords and strings are in an ASCII compatible encoding.
55      This means we can read the input file one byte at a time, instead of
56      one character at a time.  No need to worry about multibyte characters:
57      If they occur as part of identifiers, they most probably act as
58      constituent characters, and the byte based approach will do the same.
59 
60    - Assume the read table is the standard Common Lisp read table.
61      Non-standard read tables are mostly used to read data, not programs.
62 
63    - Assume the read table case is :UPCASE, and *READ-BASE* is 10.
64 
65    - Don't interpret #n= and #n#, they usually don't appear in programs.
66 
67    - Don't interpret #+, #-, they are unlikely to appear in a gettext form.
68 
69    The remaining syntax rules are:
70 
71    - The syntax code assigned to each character, and how tokens are built
72      up from characters (single escape, multiple escape etc.).
73 
74    - Comment syntax: ';' and '#| ... |#'.
75 
76    - String syntax: "..." with single escapes.
77 
78    - Read macros and dispatch macro character '#'.  Needed to be able to
79      tell which is the n-th argument of a function call.
80 
81  */
82 
83 
84 /* ========================= Lexer customization.  ========================= */
85 
86 /* 'readtable_case' is the case conversion that is applied to non-escaped
87     parts of symbol tokens.  In Common Lisp: (readtable-case *readtable*).  */
88 
89 enum rtcase
90 {
91   case_upcase,
92   case_downcase,
93   case_preserve,
94   case_invert
95 };
96 
97 static enum rtcase readtable_case = case_upcase;
98 
99 /* 'read_base' is the assumed radix of integers and rational numbers.
100    In Common Lisp: *read-base*.  */
101 static int read_base = 10;
102 
103 /* 'read_preserve_whitespace' specifies whether a whitespace character
104    that terminates a token must be pushed back on the input stream.
105    We set it to true, because the special newline side effect in read_object()
106    requires that read_object() sees every newline not inside a token.  */
107 static bool read_preserve_whitespace = true;
108 
109 
110 /* ====================== Keyword set customization.  ====================== */
111 
112 /* If true extract all strings.  */
113 static bool extract_all = false;
114 
115 static hash_table keywords;
116 static bool default_keywords = true;
117 
118 
119 void
x_lisp_extract_all()120 x_lisp_extract_all ()
121 {
122   extract_all = true;
123 }
124 
125 
126 void
x_lisp_keyword(const char * name)127 x_lisp_keyword (const char *name)
128 {
129   if (name == NULL)
130     default_keywords = false;
131   else
132     {
133       const char *end;
134       struct callshape shape;
135       const char *colon;
136       size_t len;
137       char *symname;
138       size_t i;
139 
140       if (keywords.table == NULL)
141         hash_init (&keywords, 100);
142 
143       split_keywordspec (name, &end, &shape);
144 
145       /* The characters between name and end should form a valid Lisp symbol.
146          Extract the symbol name part.  */
147       colon = strchr (name, ':');
148       if (colon != NULL && colon < end)
149         {
150           name = colon + 1;
151           if (name < end && *name == ':')
152             name++;
153           colon = strchr (name, ':');
154           if (colon != NULL && colon < end)
155             return;
156         }
157 
158       /* Uppercase it.  */
159       len = end - name;
160       symname = XNMALLOC (len, char);
161       for (i = 0; i < len; i++)
162         symname[i] =
163           (name[i] >= 'a' && name[i] <= 'z' ? name[i] - 'a' + 'A' : name[i]);
164 
165       insert_keyword_callshape (&keywords, symname, len, &shape);
166     }
167 }
168 
169 /* Finish initializing the keywords hash table.
170    Called after argument processing, before each file is processed.  */
171 static void
init_keywords()172 init_keywords ()
173 {
174   if (default_keywords)
175     {
176       /* When adding new keywords here, also update the documentation in
177          xgettext.texi!  */
178       x_lisp_keyword ("gettext");       /* I18N:GETTEXT */
179       x_lisp_keyword ("ngettext:1,2");  /* I18N:NGETTEXT */
180       x_lisp_keyword ("gettext-noop");
181       default_keywords = false;
182     }
183 }
184 
185 void
init_flag_table_lisp()186 init_flag_table_lisp ()
187 {
188   xgettext_record_flag ("gettext:1:pass-lisp-format");
189   xgettext_record_flag ("ngettext:1:pass-lisp-format");
190   xgettext_record_flag ("ngettext:2:pass-lisp-format");
191   xgettext_record_flag ("gettext-noop:1:pass-lisp-format");
192   xgettext_record_flag ("format:2:lisp-format");
193 }
194 
195 
196 /* ======================== Reading of characters.  ======================== */
197 
198 /* The input file stream.  */
199 static FILE *fp;
200 
201 
202 /* Fetch the next character from the input file.  */
203 static int
do_getc()204 do_getc ()
205 {
206   int c = getc (fp);
207 
208   if (c == EOF)
209     {
210       if (ferror (fp))
211         error (EXIT_FAILURE, errno,
212                _("error while reading \"%s\""), real_file_name);
213     }
214   else if (c == '\n')
215    line_number++;
216 
217   return c;
218 }
219 
220 /* Put back the last fetched character, not EOF.  */
221 static void
do_ungetc(int c)222 do_ungetc (int c)
223 {
224   if (c == '\n')
225     line_number--;
226   ungetc (c, fp);
227 }
228 
229 
230 /* ========= Reading of tokens.  See CLHS 2.2 "Reader Algorithm".  ========= */
231 
232 
233 /* Syntax code.  See CLHS 2.1.4 "Character Syntax Types".  */
234 
235 enum syntax_code
236 {
237   syntax_illegal,       /* non-printable, except whitespace     */
238   syntax_single_esc,    /* '\' (single escape)                  */
239   syntax_multi_esc,     /* '|' (multiple escape)                */
240   syntax_constituent,   /* everything else (constituent)        */
241   syntax_whitespace,    /* TAB,LF,FF,CR,' ' (whitespace)        */
242   syntax_eof,           /* EOF                                  */
243   syntax_t_macro,       /* '()'"' (terminating macro)           */
244   syntax_nt_macro       /* '#' (non-terminating macro)          */
245 };
246 
247 /* Returns the syntax code of a character.  */
248 static enum syntax_code
syntax_code_of(unsigned char c)249 syntax_code_of (unsigned char c)
250 {
251   switch (c)
252     {
253     case '\\':
254       return syntax_single_esc;
255     case '|':
256       return syntax_multi_esc;
257     case '\t': case '\n': case '\f': case '\r': case ' ':
258       return syntax_whitespace;
259     case '(': case ')': case '\'': case '"': case ',': case ';': case '`':
260       return syntax_t_macro;
261     case '#':
262       return syntax_nt_macro;
263     default:
264       if (c < ' ' && c != '\b')
265         return syntax_illegal;
266       else
267         return syntax_constituent;
268     }
269 }
270 
271 struct char_syntax
272 {
273   int ch;                       /* character */
274   enum syntax_code scode;       /* syntax code */
275 };
276 
277 /* Returns the next character and its syntax code.  */
278 static void
read_char_syntax(struct char_syntax * p)279 read_char_syntax (struct char_syntax *p)
280 {
281   int c = do_getc ();
282 
283   p->ch = c;
284   p->scode = (c == EOF ? syntax_eof : syntax_code_of (c));
285 }
286 
287 /* Every character in a token has an attribute assigned.  The attributes
288    help during interpretation of the token.  See
289    CLHS 2.3 "Interpretation of Tokens" for the possible interpretations,
290    and CLHS 2.1.4.2 "Constituent Traits".  */
291 
292 enum attribute
293 {
294   a_illg,       /* invalid constituent */
295   a_pack_m,     /* ':' package marker */
296   a_alpha,      /* normal alphabetic */
297   a_escaped,    /* alphabetic but not subject to case conversion */
298   a_ratio,      /* '/' */
299   a_dot,        /* '.' */
300   a_sign,       /* '+-' */
301   a_extens,     /* '_^' extension characters */
302   a_digit,      /* '0123456789' */
303   a_letterdigit,/* 'A'-'Z','a'-'z' below base, except 'esfdlESFDL' */
304   a_expodigit,  /* 'esfdlESFDL' below base */
305   a_letter,     /* 'A'-'Z','a'-'z', except 'esfdlESFDL' */
306   a_expo        /* 'esfdlESFDL' */
307 };
308 
309 #define is_letter_attribute(a) ((a) >= a_letter)
310 #define is_number_attribute(a) ((a) >= a_ratio)
311 
312 /* Returns the attribute of a character, assuming base 10.  */
313 static enum attribute
attribute_of(unsigned char c)314 attribute_of (unsigned char c)
315 {
316   switch (c)
317     {
318     case ':':
319       return a_pack_m;
320     case '/':
321       return a_ratio;
322     case '.':
323       return a_dot;
324     case '+': case '-':
325       return a_sign;
326     case '_': case '^':
327       return a_extens;
328     case '0': case '1': case '2': case '3': case '4':
329     case '5': case '6': case '7': case '8': case '9':
330       return a_digit;
331     case 'a': case 'b': case 'c': case 'g': case 'h': case 'i': case 'j':
332     case 'k': case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
333     case 't': case 'u': case 'v': case 'w': case 'x': case 'y': case 'z':
334     case 'A': case 'B': case 'C': case 'G': case 'H': case 'I': case 'J':
335     case 'K': case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
336     case 'T': case 'U': case 'V': case 'W': case 'X': case 'Y': case 'Z':
337       return a_letter;
338     case 'e': case 's': case 'd': case 'f': case 'l':
339     case 'E': case 'S': case 'D': case 'F': case 'L':
340       return a_expo;
341     default:
342       /* Treat everything as valid.  Never return a_illg.  */
343       return a_alpha;
344     }
345 }
346 
347 struct token_char
348 {
349   unsigned char ch;             /* character */
350   unsigned char attribute;      /* attribute */
351 };
352 
353 /* A token consists of a sequence of characters with associated attribute.  */
354 struct token
355 {
356   int allocated;                /* number of allocated 'token_char's */
357   int charcount;                /* number of used 'token_char's */
358   struct token_char *chars;     /* the token's constituents */
359   bool with_escape;             /* whether single-escape or multiple escape occurs */
360 };
361 
362 /* Initialize a 'struct token'.  */
363 static inline void
init_token(struct token * tp)364 init_token (struct token *tp)
365 {
366   tp->allocated = 10;
367   tp->chars = XNMALLOC (tp->allocated, struct token_char);
368   tp->charcount = 0;
369 }
370 
371 /* Free the memory pointed to by a 'struct token'.  */
372 static inline void
free_token(struct token * tp)373 free_token (struct token *tp)
374 {
375   free (tp->chars);
376 }
377 
378 /* Ensure there is enough room in the token for one more character.  */
379 static inline void
grow_token(struct token * tp)380 grow_token (struct token *tp)
381 {
382   if (tp->charcount == tp->allocated)
383     {
384       tp->allocated *= 2;
385       tp->chars = (struct token_char *) xrealloc (tp->chars, tp->allocated * sizeof (struct token_char));
386     }
387 }
388 
389 /* Read the next token.  If 'first' is given, it points to the first
390    character, which has already been read.
391    The algorithm follows CLHS 2.2 "Reader Algorithm".  */
392 static void
read_token(struct token * tp,const struct char_syntax * first)393 read_token (struct token *tp, const struct char_syntax *first)
394 {
395   bool multiple_escape_flag;
396   struct char_syntax curr;
397 
398   init_token (tp);
399   tp->with_escape = false;
400 
401   multiple_escape_flag = false;
402   if (first)
403     curr = *first;
404   else
405     read_char_syntax (&curr);
406 
407   for (;; read_char_syntax (&curr))
408     {
409       switch (curr.scode)
410         {
411         case syntax_illegal:
412           /* Invalid input.  Be tolerant, no error message.  */
413           do_ungetc (curr.ch);
414           return;
415 
416         case syntax_single_esc:
417           tp->with_escape = true;
418           read_char_syntax (&curr);
419           if (curr.scode == syntax_eof)
420             /* Invalid input.  Be tolerant, no error message.  */
421             return;
422           grow_token (tp);
423           tp->chars[tp->charcount].ch = curr.ch;
424           tp->chars[tp->charcount].attribute = a_escaped;
425           tp->charcount++;
426           break;
427 
428         case syntax_multi_esc:
429           multiple_escape_flag = !multiple_escape_flag;
430           tp->with_escape = true;
431           break;
432 
433         case syntax_constituent:
434         case syntax_nt_macro:
435           grow_token (tp);
436           if (multiple_escape_flag)
437             {
438               tp->chars[tp->charcount].ch = curr.ch;
439               tp->chars[tp->charcount].attribute = a_escaped;
440               tp->charcount++;
441             }
442           else
443             {
444               tp->chars[tp->charcount].ch = curr.ch;
445               tp->chars[tp->charcount].attribute = attribute_of (curr.ch);
446               tp->charcount++;
447             }
448           break;
449 
450         case syntax_whitespace:
451         case syntax_t_macro:
452           if (multiple_escape_flag)
453             {
454               grow_token (tp);
455               tp->chars[tp->charcount].ch = curr.ch;
456               tp->chars[tp->charcount].attribute = a_escaped;
457               tp->charcount++;
458             }
459           else
460             {
461               if (curr.scode != syntax_whitespace || read_preserve_whitespace)
462                 do_ungetc (curr.ch);
463               return;
464             }
465           break;
466 
467         case syntax_eof:
468           if (multiple_escape_flag)
469             /* Invalid input.  Be tolerant, no error message.  */
470             ;
471           return;
472         }
473     }
474 }
475 
476 /* A potential number is a token which
477    1. consists only of digits, '+','-','/','^','_','.' and number markers.
478       The base for digits is context dependent, but always 10 if a dot '.'
479       occurs. A number marker is a non-digit letter which is not adjacent
480       to a non-digit letter.
481    2. has at least one digit.
482    3. starts with a digit, '+','-','.','^' or '_'.
483    4. does not end with '+' or '-'.
484    See CLHS 2.3.1.1 "Potential Numbers as Tokens".
485  */
486 
487 static inline bool
has_a_dot(const struct token * tp)488 has_a_dot (const struct token *tp)
489 {
490   int n = tp->charcount;
491   int i;
492 
493   for (i = 0; i < n; i++)
494     if (tp->chars[i].attribute == a_dot)
495       return true;
496   return false;
497 }
498 
499 static inline bool
all_a_number(const struct token * tp)500 all_a_number (const struct token *tp)
501 {
502   int n = tp->charcount;
503   int i;
504 
505   for (i = 0; i < n; i++)
506     if (!is_number_attribute (tp->chars[i].attribute))
507       return false;
508   return true;
509 }
510 
511 static inline void
a_letter_to_digit(const struct token * tp,int base)512 a_letter_to_digit (const struct token *tp, int base)
513 {
514   int n = tp->charcount;
515   int i;
516 
517   for (i = 0; i < n; i++)
518     if (is_letter_attribute (tp->chars[i].attribute))
519       {
520         int c = tp->chars[i].ch;
521 
522         if (c >= 'a')
523           c -= 'a' - 'A';
524         if (c - 'A' + 10 < base)
525           tp->chars[i].attribute -= 2; /* a_letter -> a_letterdigit,
526                                           a_expo -> a_expodigit */
527       }
528 }
529 
530 static inline bool
has_a_digit(const struct token * tp)531 has_a_digit (const struct token *tp)
532 {
533   int n = tp->charcount;
534   int i;
535 
536   for (i = 0; i < n; i++)
537     if (tp->chars[i].attribute == a_digit
538         || tp->chars[i].attribute == a_letterdigit
539         || tp->chars[i].attribute == a_expodigit)
540       return true;
541   return false;
542 }
543 
544 static inline bool
has_adjacent_letters(const struct token * tp)545 has_adjacent_letters (const struct token *tp)
546 {
547   int n = tp->charcount;
548   int i;
549 
550   for (i = 1; i < n; i++)
551     if (is_letter_attribute (tp->chars[i-1].attribute)
552         && is_letter_attribute (tp->chars[i].attribute))
553       return true;
554   return false;
555 }
556 
557 static bool
is_potential_number(const struct token * tp,int * basep)558 is_potential_number (const struct token *tp, int *basep)
559 {
560   /* CLHS 2.3.1.1.1:
561      "A potential number cannot contain any escape characters."  */
562   if (tp->with_escape)
563     return false;
564 
565   if (has_a_dot (tp))
566     *basep = 10;
567 
568   if (!all_a_number (tp))
569     return false;
570 
571   a_letter_to_digit (tp, *basep);
572 
573   if (!has_a_digit (tp))
574     return false;
575 
576   if (has_adjacent_letters (tp))
577     return false;
578 
579   if (!(tp->chars[0].attribute >= a_dot
580         && tp->chars[0].attribute <= a_expodigit))
581     return false;
582 
583   if (tp->chars[tp->charcount - 1].attribute == a_sign)
584     return false;
585 
586   return true;
587 }
588 
589 /* A number is one of integer, ratio, float.  Each has a particular syntax.
590    See CLHS 2.3.1 "Numbers as Tokens".
591    But note a mistake: The exponent rule should read:
592        exponent ::= exponent-marker [sign] {decimal-digit}+
593    (see 22.1.3.1.3 "Printing Floats").  */
594 
595 enum number_type
596 {
597   n_none,
598   n_integer,
599   n_ratio,
600   n_float
601 };
602 
603 static enum number_type
is_number(const struct token * tp,int * basep)604 is_number (const struct token *tp, int *basep)
605 {
606   struct token_char *ptr_limit;
607   struct token_char *ptr1;
608 
609   if (!is_potential_number (tp, basep))
610     return n_none;
611 
612   /* is_potential_number guarantees
613      - all attributes are >= a_ratio,
614      - there is at least one a_digit or a_letterdigit or a_expodigit, and
615      - if there is an a_dot, then *basep = 10.  */
616 
617   ptr1 = &tp->chars[0];
618   ptr_limit = &tp->chars[tp->charcount];
619 
620   if (ptr1->attribute == a_sign)
621     ptr1++;
622 
623   /* Test for syntax
624    * { a_sign | }
625    * { a_digit < base }+ { a_ratio { a_digit < base }+ | }
626    */
627   {
628     bool seen_a_ratio = false;
629     bool seen_a_digit = false;  /* seen a digit in last digit block? */
630     struct token_char *ptr;
631 
632     for (ptr = ptr1;; ptr++)
633       {
634         if (ptr >= ptr_limit)
635           {
636             if (!seen_a_digit)
637               break;
638             if (seen_a_ratio)
639               return n_ratio;
640             else
641               return n_integer;
642           }
643         if (ptr->attribute == a_digit
644             || ptr->attribute == a_letterdigit
645             || ptr->attribute == a_expodigit)
646           {
647             int c = ptr->ch;
648 
649             c = (c < 'A' ? c - '0' : c < 'a' ? c - 'A' + 10 : c - 'a' + 10);
650             if (c >= *basep)
651               break;
652             seen_a_digit = true;
653           }
654         else if (ptr->attribute == a_ratio)
655           {
656             if (seen_a_ratio || !seen_a_digit)
657               break;
658             seen_a_ratio = true;
659             seen_a_digit = false;
660           }
661         else
662           break;
663       }
664   }
665 
666   /* Test for syntax
667    * { a_sign | }
668    * { a_digit }* { a_dot { a_digit }* | }
669    * { a_expo { a_sign | } { a_digit }+ | }
670    *
671    * If there is an exponent part, there must be digits before the dot or
672    * after the dot. The result is a float.
673    * If there is no exponen:
674    *   If there is no dot, it would an integer in base 10, but is has already
675    *   been verified to not be an integer in the current base.
676    *   If there is a dot:
677    *     If there are digits after the dot, it's a float.
678    *     Otherwise, if there are digits before the dot, it's an integer.
679    */
680   *basep = 10;
681   {
682     bool seen_a_dot = false;
683     bool seen_a_dot_with_leading_digits = false;
684     bool seen_a_digit = false;  /* seen a digit in last digit block? */
685     struct token_char *ptr;
686 
687     for (ptr = ptr1;; ptr++)
688       {
689         if (ptr >= ptr_limit)
690           {
691             /* no exponent */
692             if (!seen_a_dot)
693               return n_none;
694             if (seen_a_digit)
695               return n_float;
696             if (seen_a_dot_with_leading_digits)
697               return n_integer;
698             else
699               return n_none;
700           }
701         if (ptr->attribute == a_digit)
702           {
703             seen_a_digit = true;
704           }
705         else if (ptr->attribute == a_dot)
706           {
707             if (seen_a_dot)
708               return n_none;
709             seen_a_dot = true;
710             if (seen_a_digit)
711               seen_a_dot_with_leading_digits = true;
712             seen_a_digit = false;
713           }
714         else if (ptr->attribute == a_expo || ptr->attribute == a_expodigit)
715           break;
716         else
717           return n_none;
718       }
719     ptr++;
720     if (!seen_a_dot_with_leading_digits || !seen_a_digit)
721       return n_none;
722     if (ptr >= ptr_limit)
723       return n_none;
724     if (ptr->attribute == a_sign)
725       ptr++;
726     seen_a_digit = false;
727     for (;; ptr++)
728       {
729         if (ptr >= ptr_limit)
730           break;
731         if (ptr->attribute != a_digit)
732           return n_none;
733         seen_a_digit = true;
734       }
735     if (!seen_a_digit)
736       return n_none;
737     return n_float;
738   }
739 }
740 
741 /* A token representing a symbol must be case converted.
742    For portability, we convert only ASCII characters here.  */
743 
744 static void
upcase_token(struct token * tp)745 upcase_token (struct token *tp)
746 {
747   int n = tp->charcount;
748   int i;
749 
750   for (i = 0; i < n; i++)
751     if (tp->chars[i].attribute != a_escaped)
752       {
753         unsigned char c = tp->chars[i].ch;
754         if (c >= 'a' && c <= 'z')
755           tp->chars[i].ch = c - 'a' + 'A';
756       }
757 }
758 
759 static void
downcase_token(struct token * tp)760 downcase_token (struct token *tp)
761 {
762   int n = tp->charcount;
763   int i;
764 
765   for (i = 0; i < n; i++)
766     if (tp->chars[i].attribute != a_escaped)
767       {
768         unsigned char c = tp->chars[i].ch;
769         if (c >= 'A' && c <= 'Z')
770           tp->chars[i].ch = c - 'A' + 'a';
771       }
772 }
773 
774 static void
case_convert_token(struct token * tp)775 case_convert_token (struct token *tp)
776 {
777   int n = tp->charcount;
778   int i;
779 
780   switch (readtable_case)
781     {
782     case case_upcase:
783       upcase_token (tp);
784       break;
785 
786     case case_downcase:
787       downcase_token (tp);
788       break;
789 
790     case case_preserve:
791       break;
792 
793     case case_invert:
794       {
795         bool seen_uppercase = false;
796         bool seen_lowercase = false;
797         for (i = 0; i < n; i++)
798           if (tp->chars[i].attribute != a_escaped)
799             {
800               unsigned char c = tp->chars[i].ch;
801               if (c >= 'a' && c <= 'z')
802                 seen_lowercase = true;
803               if (c >= 'A' && c <= 'Z')
804                 seen_uppercase = true;
805             }
806         if (seen_uppercase)
807           {
808             if (!seen_lowercase)
809               downcase_token (tp);
810           }
811         else
812           {
813             if (seen_lowercase)
814               upcase_token (tp);
815           }
816       }
817       break;
818     }
819 }
820 
821 
822 /* ========================= Accumulating comments ========================= */
823 
824 
825 static char *buffer;
826 static size_t bufmax;
827 static size_t buflen;
828 
829 static inline void
comment_start()830 comment_start ()
831 {
832   buflen = 0;
833 }
834 
835 static inline void
comment_add(int c)836 comment_add (int c)
837 {
838   if (buflen >= bufmax)
839     {
840       bufmax = 2 * bufmax + 10;
841       buffer = xrealloc (buffer, bufmax);
842     }
843   buffer[buflen++] = c;
844 }
845 
846 static inline void
comment_line_end(size_t chars_to_remove)847 comment_line_end (size_t chars_to_remove)
848 {
849   buflen -= chars_to_remove;
850   while (buflen >= 1
851          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
852     --buflen;
853   if (chars_to_remove == 0 && buflen >= bufmax)
854     {
855       bufmax = 2 * bufmax + 10;
856       buffer = xrealloc (buffer, bufmax);
857     }
858   buffer[buflen] = '\0';
859   savable_comment_add (buffer);
860 }
861 
862 
863 /* These are for tracking whether comments count as immediately before
864    keyword.  */
865 static int last_comment_line;
866 static int last_non_comment_line;
867 
868 
869 /* ========================= Accumulating messages ========================= */
870 
871 
872 static message_list_ty *mlp;
873 
874 
875 /* ============== Reading of objects.  See CLHS 2 "Syntax".  ============== */
876 
877 
878 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
879    Other objects need not to be represented precisely.  */
880 enum object_type
881 {
882   t_symbol,     /* symbol */
883   t_string,     /* string */
884   t_other,      /* other kind of real object */
885   t_dot,        /* '.' pseudo object */
886   t_close,      /* ')' pseudo object */
887   t_eof         /* EOF marker */
888 };
889 
890 struct object
891 {
892   enum object_type type;
893   struct token *token;          /* for t_symbol and t_string */
894   int line_number_at_start;     /* for t_string */
895 };
896 
897 /* Free the memory pointed to by a 'struct object'.  */
898 static inline void
free_object(struct object * op)899 free_object (struct object *op)
900 {
901   if (op->type == t_symbol || op->type == t_string)
902     {
903       free_token (op->token);
904       free (op->token);
905     }
906 }
907 
908 /* Convert a t_symbol/t_string token to a char*.  */
909 static char *
string_of_object(const struct object * op)910 string_of_object (const struct object *op)
911 {
912   char *str;
913   const struct token_char *p;
914   char *q;
915   int n;
916 
917   if (!(op->type == t_symbol || op->type == t_string))
918     abort ();
919   n = op->token->charcount;
920   str = XNMALLOC (n + 1, char);
921   q = str;
922   for (p = op->token->chars; n > 0; p++, n--)
923     *q++ = p->ch;
924   *q = '\0';
925   return str;
926 }
927 
928 /* Context lookup table.  */
929 static flag_context_list_table_ty *flag_context_list_table;
930 
931 /* Read the next object.  */
932 static void
read_object(struct object * op,flag_context_ty outer_context)933 read_object (struct object *op, flag_context_ty outer_context)
934 {
935   for (;;)
936     {
937       struct char_syntax curr;
938 
939       read_char_syntax (&curr);
940 
941       switch (curr.scode)
942         {
943         case syntax_eof:
944           op->type = t_eof;
945           return;
946 
947         case syntax_whitespace:
948           if (curr.ch == '\n')
949             /* Comments assumed to be grouped with a message must immediately
950                precede it, with no non-whitespace token on a line between
951                both.  */
952             if (last_non_comment_line > last_comment_line)
953               savable_comment_reset ();
954           continue;
955 
956         case syntax_illegal:
957           op->type = t_other;
958           return;
959 
960         case syntax_single_esc:
961         case syntax_multi_esc:
962         case syntax_constituent:
963           /* Start reading a token.  */
964           op->token = XMALLOC (struct token);
965           read_token (op->token, &curr);
966           last_non_comment_line = line_number;
967 
968           /* Interpret the token.  */
969 
970           /* Dots.  */
971           if (!op->token->with_escape
972               && op->token->charcount == 1
973               && op->token->chars[0].attribute == a_dot)
974             {
975               free_token (op->token);
976               free (op->token);
977               op->type = t_dot;
978               return;
979             }
980           /* Tokens consisting entirely of dots are illegal, but be tolerant
981              here.  */
982 
983           /* Number.  */
984           {
985             int base = read_base;
986 
987             if (is_number (op->token, &base) != n_none)
988               {
989                 free_token (op->token);
990                 free (op->token);
991                 op->type = t_other;
992                 return;
993               }
994           }
995 
996           /* We interpret all other tokens as symbols (including 'reserved
997              tokens', i.e. potential numbers which are not numbers).  */
998           case_convert_token (op->token);
999           op->type = t_symbol;
1000           return;
1001 
1002         case syntax_t_macro:
1003         case syntax_nt_macro:
1004           /* Read a macro.  */
1005           switch (curr.ch)
1006             {
1007             case '(':
1008               {
1009                 int arg = 0;            /* Current argument number.  */
1010                 flag_context_list_iterator_ty context_iter;
1011                 const struct callshapes *shapes = NULL;
1012                 struct arglist_parser *argparser = NULL;
1013 
1014                 for (;; arg++)
1015                   {
1016                     struct object inner;
1017                     flag_context_ty inner_context;
1018 
1019                     if (arg == 0)
1020                       inner_context = null_context;
1021                     else
1022                       inner_context =
1023                         inherited_context (outer_context,
1024                                            flag_context_list_iterator_advance (
1025                                              &context_iter));
1026 
1027                     read_object (&inner, inner_context);
1028 
1029                     /* Recognize end of list.  */
1030                     if (inner.type == t_close)
1031                       {
1032                         op->type = t_other;
1033                         /* Don't bother converting "()" to "NIL".  */
1034                         last_non_comment_line = line_number;
1035                         if (argparser != NULL)
1036                           arglist_parser_done (argparser, arg);
1037                         return;
1038                       }
1039 
1040                     /* Dots are not allowed in every position.
1041                        But be tolerant.  */
1042 
1043                     /* EOF inside list is illegal.
1044                        But be tolerant.  */
1045                     if (inner.type == t_eof)
1046                       break;
1047 
1048                     if (arg == 0)
1049                       {
1050                         /* This is the function position.  */
1051                         if (inner.type == t_symbol)
1052                           {
1053                             char *symbol_name = string_of_object (&inner);
1054                             int i;
1055                             int prefix_len;
1056                             void *keyword_value;
1057 
1058                             /* Omit any package name.  */
1059                             i = inner.token->charcount;
1060                             while (i > 0
1061                                    && inner.token->chars[i-1].attribute != a_pack_m)
1062                               i--;
1063                             prefix_len = i;
1064 
1065                             if (hash_find_entry (&keywords,
1066                                                  symbol_name + prefix_len,
1067                                                  strlen (symbol_name + prefix_len),
1068                                                  &keyword_value)
1069                                 == 0)
1070                               shapes = (const struct callshapes *) keyword_value;
1071 
1072                             argparser = arglist_parser_alloc (mlp, shapes);
1073 
1074                             context_iter =
1075                               flag_context_list_iterator (
1076                                 flag_context_list_table_lookup (
1077                                   flag_context_list_table,
1078                                   symbol_name, strlen (symbol_name)));
1079 
1080                             free (symbol_name);
1081                           }
1082                         else
1083                           context_iter = null_context_list_iterator;
1084                       }
1085                     else
1086                       {
1087                         /* These are the argument positions.  */
1088                         if (argparser != NULL && inner.type == t_string)
1089                           {
1090                             char *s = string_of_object (&inner);
1091                             mixed_string_ty *ms =
1092                               mixed_string_alloc_simple (s, lc_string,
1093                                                          logical_file_name,
1094                                                          inner.line_number_at_start);
1095                             free (s);
1096                             arglist_parser_remember (argparser, arg, ms,
1097                                                      inner_context,
1098                                                      logical_file_name,
1099                                                      inner.line_number_at_start,
1100                                                      savable_comment, false);
1101                           }
1102                       }
1103 
1104                     free_object (&inner);
1105                   }
1106 
1107                 if (argparser != NULL)
1108                   arglist_parser_done (argparser, arg);
1109               }
1110               op->type = t_other;
1111               last_non_comment_line = line_number;
1112               return;
1113 
1114             case ')':
1115               /* Tell the caller about the end of list.
1116                  Unmatched closing parenthesis is illegal.
1117                  But be tolerant.  */
1118               op->type = t_close;
1119               last_non_comment_line = line_number;
1120               return;
1121 
1122             case ',':
1123               {
1124                 int c = do_getc ();
1125                 /* The ,@ handling inside lists is wrong anyway, because
1126                    ,@form expands to an unknown number of elements.  */
1127                 if (c != EOF && c != '@' && c != '.')
1128                   do_ungetc (c);
1129               }
1130               /*FALLTHROUGH*/
1131             case '\'':
1132             case '`':
1133               {
1134                 struct object inner;
1135 
1136                 read_object (&inner, null_context);
1137 
1138                 /* Dots and EOF are not allowed here.  But be tolerant.  */
1139 
1140                 free_object (&inner);
1141 
1142                 op->type = t_other;
1143                 last_non_comment_line = line_number;
1144                 return;
1145               }
1146 
1147             case ';':
1148               {
1149                 bool all_semicolons = true;
1150 
1151                 last_comment_line = line_number;
1152                 comment_start ();
1153                 for (;;)
1154                   {
1155                     int c = do_getc ();
1156                     if (c == EOF || c == '\n')
1157                       break;
1158                     if (c != ';')
1159                       all_semicolons = false;
1160                     if (!all_semicolons)
1161                       {
1162                         /* We skip all leading white space, but not EOLs.  */
1163                         if (!(buflen == 0 && (c == ' ' || c == '\t')))
1164                           comment_add (c);
1165                       }
1166                   }
1167                 comment_line_end (0);
1168                 continue;
1169               }
1170 
1171             case '"':
1172               {
1173                 op->token = XMALLOC (struct token);
1174                 init_token (op->token);
1175                 op->line_number_at_start = line_number;
1176                 for (;;)
1177                   {
1178                     int c = do_getc ();
1179                     if (c == EOF)
1180                       /* Invalid input.  Be tolerant, no error message.  */
1181                       break;
1182                     if (c == '"')
1183                       break;
1184                     if (c == '\\') /* syntax_single_esc */
1185                       {
1186                         c = do_getc ();
1187                         if (c == EOF)
1188                           /* Invalid input.  Be tolerant, no error message.  */
1189                           break;
1190                       }
1191                     grow_token (op->token);
1192                     op->token->chars[op->token->charcount++].ch = c;
1193                   }
1194                 op->type = t_string;
1195 
1196                 if (extract_all)
1197                   {
1198                     lex_pos_ty pos;
1199 
1200                     pos.file_name = logical_file_name;
1201                     pos.line_number = op->line_number_at_start;
1202                     remember_a_message (mlp, NULL, string_of_object (op), false,
1203                                         false, null_context, &pos,
1204                                         NULL, savable_comment, false);
1205                   }
1206                 last_non_comment_line = line_number;
1207                 return;
1208               }
1209 
1210             case '#':
1211               /* Dispatch macro handling.  */
1212               {
1213                 int c;
1214 
1215                 for (;;)
1216                   {
1217                     c = do_getc ();
1218                     if (c == EOF)
1219                       /* Invalid input.  Be tolerant, no error message.  */
1220                       {
1221                         op->type = t_other;
1222                         return;
1223                       }
1224                     if (!(c >= '0' && c <= '9'))
1225                       break;
1226                   }
1227 
1228                 switch (c)
1229                   {
1230                   case '(':
1231                   case '"':
1232                     do_ungetc (c);
1233                     /*FALLTHROUGH*/
1234                   case '\'':
1235                   case ':':
1236                   case '.':
1237                   case ',':
1238                   case 'A': case 'a':
1239                   case 'C': case 'c':
1240                   case 'P': case 'p':
1241                   case 'S': case 's':
1242                     {
1243                       struct object inner;
1244                       read_object (&inner, null_context);
1245                       /* Dots and EOF are not allowed here.
1246                          But be tolerant.  */
1247                       free_object (&inner);
1248                       op->type = t_other;
1249                       last_non_comment_line = line_number;
1250                       return;
1251                     }
1252 
1253                   case '|':
1254                     {
1255                       int depth = 0;
1256                       int c;
1257 
1258                       comment_start ();
1259                       c = do_getc ();
1260                       for (;;)
1261                         {
1262                           if (c == EOF)
1263                             break;
1264                           if (c == '|')
1265                             {
1266                               c = do_getc ();
1267                               if (c == EOF)
1268                                 break;
1269                               if (c == '#')
1270                                 {
1271                                   if (depth == 0)
1272                                     {
1273                                       comment_line_end (0);
1274                                       break;
1275                                     }
1276                                   depth--;
1277                                   comment_add ('|');
1278                                   comment_add ('#');
1279                                   c = do_getc ();
1280                                 }
1281                               else
1282                                 comment_add ('|');
1283                             }
1284                           else if (c == '#')
1285                             {
1286                               c = do_getc ();
1287                               if (c == EOF)
1288                                 break;
1289                               comment_add ('#');
1290                               if (c == '|')
1291                                 {
1292                                   depth++;
1293                                   comment_add ('|');
1294                                   c = do_getc ();
1295                                 }
1296                             }
1297                           else
1298                             {
1299                               /* We skip all leading white space.  */
1300                               if (!(buflen == 0 && (c == ' ' || c == '\t')))
1301                                 comment_add (c);
1302                               if (c == '\n')
1303                                 {
1304                                   comment_line_end (1);
1305                                   comment_start ();
1306                                 }
1307                               c = do_getc ();
1308                             }
1309                         }
1310                       if (c == EOF)
1311                         {
1312                           /* EOF not allowed here.  But be tolerant.  */
1313                           op->type = t_eof;
1314                           return;
1315                         }
1316                       last_comment_line = line_number;
1317                       continue;
1318                     }
1319 
1320                   case '\\':
1321                     {
1322                       struct token token;
1323                       struct char_syntax first;
1324                       first.ch = '\\';
1325                       first.scode = syntax_single_esc;
1326                       read_token (&token, &first);
1327                       free_token (&token);
1328                       op->type = t_other;
1329                       last_non_comment_line = line_number;
1330                       return;
1331                     }
1332 
1333                   case 'B': case 'b':
1334                   case 'O': case 'o':
1335                   case 'X': case 'x':
1336                   case 'R': case 'r':
1337                   case '*':
1338                     {
1339                       struct token token;
1340                       read_token (&token, NULL);
1341                       free_token (&token);
1342                       op->type = t_other;
1343                       last_non_comment_line = line_number;
1344                       return;
1345                     }
1346 
1347                   case '=':
1348                     /* Ignore read labels.  */
1349                     continue;
1350 
1351                   case '#':
1352                     /* Don't bother looking up the corresponding object.  */
1353                     op->type = t_other;
1354                     last_non_comment_line = line_number;
1355                     return;
1356 
1357                   case '+':
1358                   case '-':
1359                     /* Simply assume every feature expression is true.  */
1360                     {
1361                       struct object inner;
1362                       read_object (&inner, null_context);
1363                       /* Dots and EOF are not allowed here.
1364                          But be tolerant.  */
1365                       free_object (&inner);
1366                       continue;
1367                     }
1368 
1369                   default:
1370                     op->type = t_other;
1371                     last_non_comment_line = line_number;
1372                     return;
1373                   }
1374                 /*NOTREACHED*/
1375                 abort ();
1376               }
1377 
1378             default:
1379               /*NOTREACHED*/
1380               abort ();
1381             }
1382 
1383         default:
1384           /*NOTREACHED*/
1385           abort ();
1386         }
1387     }
1388 }
1389 
1390 
1391 void
extract_lisp(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1392 extract_lisp (FILE *f,
1393               const char *real_filename, const char *logical_filename,
1394               flag_context_list_table_ty *flag_table,
1395               msgdomain_list_ty *mdlp)
1396 {
1397   mlp = mdlp->item[0]->messages;
1398 
1399   fp = f;
1400   real_file_name = real_filename;
1401   logical_file_name = xstrdup (logical_filename);
1402   line_number = 1;
1403 
1404   last_comment_line = -1;
1405   last_non_comment_line = -1;
1406 
1407   flag_context_list_table = flag_table;
1408 
1409   init_keywords ();
1410 
1411   /* Eat tokens until eof is seen.  When read_object returns
1412      due to an unbalanced closing parenthesis, just restart it.  */
1413   do
1414     {
1415       struct object toplevel_object;
1416 
1417       read_object (&toplevel_object, null_context);
1418 
1419       if (toplevel_object.type == t_eof)
1420         break;
1421 
1422       free_object (&toplevel_object);
1423     }
1424   while (!feof (fp));
1425 
1426   /* Close scanner.  */
1427   fp = NULL;
1428   real_file_name = NULL;
1429   logical_file_name = NULL;
1430   line_number = 0;
1431 }
1432