1 /* xgettext Emacs Lisp backend.
2 Copyright (C) 2001-2003, 2005-2009, 2018-2020 Free Software Foundation, Inc.
3
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2001-2002.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-elisp.h"
25
26 #include <errno.h>
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31
32 #include "message.h"
33 #include "xgettext.h"
34 #include "xg-pos.h"
35 #include "xg-mixed-string.h"
36 #include "xg-arglist-context.h"
37 #include "xg-arglist-callshape.h"
38 #include "xg-arglist-parser.h"
39 #include "xg-message.h"
40 #include "error.h"
41 #include "xalloc.h"
42 #include "mem-hash-map.h"
43 #include "c-ctype.h"
44 #include "gettext.h"
45
46 #define _(s) gettext(s)
47
48
49 /* Summary of Emacs Lisp syntax:
50 - ';' starts a comment until end of line.
51 - '#@nn' starts a comment of nn bytes.
52 - Integers are constituted of an optional prefix (#b, #B for binary,
53 #o, #O for octal, #x, #X for hexadecimal, #nnr, #nnR for any radix),
54 an optional sign (+ or -), the digits, and an optional trailing dot.
55 - Characters are written as '?' followed by the character, possibly
56 with an escape sequence, for examples '?a', '?\n', '?\177'.
57 - Strings are delimited by double quotes. Backslash introduces an escape
58 sequence. The following are understood: '\n', '\r', '\f', '\t', '\a',
59 '\\', '\^C', '\012' (octal), '\x12' (hexadecimal).
60 - Symbols: can contain meta-characters if preceded by backslash.
61 - Uninterned symbols: written as #:SYMBOL.
62 - () delimit lists.
63 - [] delimit vectors.
64 The reader is implemented in emacs-21.1/src/lread.c. */
65
66
67 /* ====================== Keyword set customization. ====================== */
68
69 /* If true extract all strings. */
70 static bool extract_all = false;
71
72 static hash_table keywords;
73 static bool default_keywords = true;
74
75
76 void
x_elisp_extract_all()77 x_elisp_extract_all ()
78 {
79 extract_all = true;
80 }
81
82
83 void
x_elisp_keyword(const char * name)84 x_elisp_keyword (const char *name)
85 {
86 if (name == NULL)
87 default_keywords = false;
88 else
89 {
90 const char *end;
91 struct callshape shape;
92 const char *colon;
93
94 if (keywords.table == NULL)
95 hash_init (&keywords, 100);
96
97 split_keywordspec (name, &end, &shape);
98
99 /* The characters between name and end should form a valid Lisp
100 symbol. */
101 colon = strchr (name, ':');
102 if (colon == NULL || colon >= end)
103 insert_keyword_callshape (&keywords, name, end - name, &shape);
104 }
105 }
106
107 /* Finish initializing the keywords hash table.
108 Called after argument processing, before each file is processed. */
109 static void
init_keywords()110 init_keywords ()
111 {
112 if (default_keywords)
113 {
114 /* When adding new keywords here, also update the documentation in
115 xgettext.texi! */
116 x_elisp_keyword ("_");
117 default_keywords = false;
118 }
119 }
120
121 void
init_flag_table_elisp()122 init_flag_table_elisp ()
123 {
124 xgettext_record_flag ("_:1:pass-elisp-format");
125 xgettext_record_flag ("format:1:elisp-format");
126 }
127
128
129 /* ======================== Reading of characters. ======================== */
130
131 /* The input file stream. */
132 static FILE *fp;
133
134
135 /* Fetch the next character from the input file. */
136 static int
do_getc()137 do_getc ()
138 {
139 int c = getc (fp);
140
141 if (c == EOF)
142 {
143 if (ferror (fp))
144 error (EXIT_FAILURE, errno,
145 _("error while reading \"%s\""), real_file_name);
146 }
147 else if (c == '\n')
148 line_number++;
149
150 return c;
151 }
152
153 /* Put back the last fetched character, not EOF. */
154 static void
do_ungetc(int c)155 do_ungetc (int c)
156 {
157 if (c == '\n')
158 line_number--;
159 ungetc (c, fp);
160 }
161
162
163 /* ========================== Reading of tokens. ========================== */
164
165
166 /* A token consists of a sequence of characters. */
167 struct token
168 {
169 int allocated; /* number of allocated 'token_char's */
170 int charcount; /* number of used 'token_char's */
171 char *chars; /* the token's constituents */
172 };
173
174 /* Initialize a 'struct token'. */
175 static inline void
init_token(struct token * tp)176 init_token (struct token *tp)
177 {
178 tp->allocated = 10;
179 tp->chars = XNMALLOC (tp->allocated, char);
180 tp->charcount = 0;
181 }
182
183 /* Free the memory pointed to by a 'struct token'. */
184 static inline void
free_token(struct token * tp)185 free_token (struct token *tp)
186 {
187 free (tp->chars);
188 }
189
190 /* Ensure there is enough room in the token for one more character. */
191 static inline void
grow_token(struct token * tp)192 grow_token (struct token *tp)
193 {
194 if (tp->charcount == tp->allocated)
195 {
196 tp->allocated *= 2;
197 tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
198 }
199 }
200
201 /* Test whether a token has integer syntax. */
202 static inline bool
is_integer(const char * p)203 is_integer (const char *p)
204 {
205 /* NB: Yes, '+.' and '-.' both designate the integer 0. */
206 const char *p_start = p;
207
208 if (*p == '+' || *p == '-')
209 p++;
210 if (*p == '\0')
211 return false;
212 while (*p >= '0' && *p <= '9')
213 p++;
214 if (p > p_start && *p == '.')
215 p++;
216 return (*p == '\0');
217 }
218
219 /* Test whether a token has float syntax. */
220 static inline bool
is_float(const char * p)221 is_float (const char *p)
222 {
223 enum { LEAD_INT = 1, DOT_CHAR = 2, TRAIL_INT = 4, E_CHAR = 8, EXP_INT = 16 };
224 int state;
225
226 state = 0;
227 if (*p == '+' || *p == '-')
228 p++;
229 if (*p >= '0' && *p <= '9')
230 {
231 state |= LEAD_INT;
232 do
233 p++;
234 while (*p >= '0' && *p <= '9');
235 }
236 if (*p == '.')
237 {
238 state |= DOT_CHAR;
239 p++;
240 }
241 if (*p >= '0' && *p <= '9')
242 {
243 state |= TRAIL_INT;
244 do
245 p++;
246 while (*p >= '0' && *p <= '9');
247 }
248 if (*p == 'e' || *p == 'E')
249 {
250 state |= E_CHAR;
251 p++;
252 if (*p == '+' || *p == '-')
253 p++;
254 if (*p >= '0' && *p <= '9')
255 {
256 state |= EXP_INT;
257 do
258 p++;
259 while (*p >= '0' && *p <= '9');
260 }
261 else if (p[-1] == '+'
262 && ((p[0] == 'I' && p[1] == 'N' && p[2] == 'F')
263 || (p[0] == 'N' && p[1] == 'a' && p[2] == 'N')))
264 {
265 state |= EXP_INT;
266 p += 3;
267 }
268 }
269 return (*p == '\0')
270 && (state == (LEAD_INT | DOT_CHAR | TRAIL_INT)
271 || state == (DOT_CHAR | TRAIL_INT)
272 || state == (LEAD_INT | E_CHAR | EXP_INT)
273 || state == (LEAD_INT | DOT_CHAR | TRAIL_INT | E_CHAR | EXP_INT)
274 || state == (DOT_CHAR | TRAIL_INT | E_CHAR | EXP_INT));
275 }
276
277 /* Read the next token. 'first' is the first character, which has already
278 been read. Returns true for a symbol, false for a number. */
279 static bool
read_token(struct token * tp,int first)280 read_token (struct token *tp, int first)
281 {
282 int c;
283 bool quoted = false;
284
285 init_token (tp);
286
287 c = first;
288
289 for (;; c = do_getc ())
290 {
291 if (c == EOF)
292 break;
293 if (c <= ' ') /* FIXME: Assumes ASCII compatible encoding */
294 break;
295 if (c == '\"' || c == '\'' || c == ';' || c == '(' || c == ')'
296 || c == '[' || c == ']' || c == '#')
297 break;
298 if (c == '\\')
299 {
300 quoted = true;
301 c = do_getc ();
302 if (c == EOF)
303 /* Invalid, but be tolerant. */
304 break;
305 }
306 grow_token (tp);
307 tp->chars[tp->charcount++] = c;
308 }
309 if (c != EOF)
310 do_ungetc (c);
311
312 if (quoted)
313 return true; /* symbol */
314
315 /* Add a NUL byte at the end, for is_integer and is_float. */
316 grow_token (tp);
317 tp->chars[tp->charcount] = '\0';
318
319 if (is_integer (tp->chars) || is_float (tp->chars))
320 return false; /* number */
321 else
322 return true; /* symbol */
323 }
324
325
326 /* ========================= Accumulating comments ========================= */
327
328
329 static char *buffer;
330 static size_t bufmax;
331 static size_t buflen;
332
333 static inline void
comment_start()334 comment_start ()
335 {
336 buflen = 0;
337 }
338
339 static inline void
comment_add(int c)340 comment_add (int c)
341 {
342 if (buflen >= bufmax)
343 {
344 bufmax = 2 * bufmax + 10;
345 buffer = xrealloc (buffer, bufmax);
346 }
347 buffer[buflen++] = c;
348 }
349
350 static inline void
comment_line_end(size_t chars_to_remove)351 comment_line_end (size_t chars_to_remove)
352 {
353 buflen -= chars_to_remove;
354 while (buflen >= 1
355 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
356 --buflen;
357 if (chars_to_remove == 0 && buflen >= bufmax)
358 {
359 bufmax = 2 * bufmax + 10;
360 buffer = xrealloc (buffer, bufmax);
361 }
362 buffer[buflen] = '\0';
363 savable_comment_add (buffer);
364 }
365
366
367 /* These are for tracking whether comments count as immediately before
368 keyword. */
369 static int last_comment_line;
370 static int last_non_comment_line;
371
372
373 /* ========================= Accumulating messages ========================= */
374
375
376 static message_list_ty *mlp;
377
378
379 /* ============== Reading of objects. See CLHS 2 "Syntax". ============== */
380
381
382 /* We are only interested in symbols (e.g. GETTEXT or NGETTEXT) and strings.
383 Other objects need not to be represented precisely. */
384 enum object_type
385 {
386 t_symbol, /* symbol */
387 t_string, /* string */
388 t_other, /* other kind of real object */
389 t_dot, /* '.' pseudo object */
390 t_listclose, /* ')' pseudo object */
391 t_vectorclose,/* ']' pseudo object */
392 t_eof /* EOF marker */
393 };
394
395 struct object
396 {
397 enum object_type type;
398 struct token *token; /* for t_symbol and t_string */
399 int line_number_at_start; /* for t_string */
400 };
401
402 /* Free the memory pointed to by a 'struct object'. */
403 static inline void
free_object(struct object * op)404 free_object (struct object *op)
405 {
406 if (op->type == t_symbol || op->type == t_string)
407 {
408 free_token (op->token);
409 free (op->token);
410 }
411 }
412
413 /* Convert a t_symbol/t_string token to a char*. */
414 static char *
string_of_object(const struct object * op)415 string_of_object (const struct object *op)
416 {
417 char *str;
418 int n;
419
420 if (!(op->type == t_symbol || op->type == t_string))
421 abort ();
422 n = op->token->charcount;
423 str = XNMALLOC (n + 1, char);
424 memcpy (str, op->token->chars, n);
425 str[n] = '\0';
426 return str;
427 }
428
429 /* Context lookup table. */
430 static flag_context_list_table_ty *flag_context_list_table;
431
432 /* Returns the character represented by an escape sequence. */
433 #define IGNORABLE_ESCAPE (EOF - 1)
434 static int
do_getc_escaped(int c,bool in_string)435 do_getc_escaped (int c, bool in_string)
436 {
437 switch (c)
438 {
439 case 'a':
440 return '\a';
441 case 'b':
442 return '\b';
443 case 'd':
444 return 0x7F;
445 case 'e':
446 return 0x1B;
447 case 'f':
448 return '\f';
449 case 'n':
450 return '\n';
451 case 'r':
452 return '\r';
453 case 't':
454 return '\t';
455 case 'v':
456 return '\v';
457
458 case '\n':
459 return IGNORABLE_ESCAPE;
460
461 case ' ':
462 return (in_string ? IGNORABLE_ESCAPE : ' ');
463
464 case 'M': /* meta */
465 c = do_getc ();
466 if (c == EOF)
467 return EOF;
468 if (c != '-')
469 /* Invalid input. But be tolerant. */
470 return c;
471 c = do_getc ();
472 if (c == EOF)
473 return EOF;
474 if (c == '\\')
475 {
476 c = do_getc ();
477 if (c == EOF)
478 return EOF;
479 c = do_getc_escaped (c, false);
480 }
481 return c | 0x80;
482
483 case 'S': /* shift */
484 c = do_getc ();
485 if (c == EOF)
486 return EOF;
487 if (c != '-')
488 /* Invalid input. But be tolerant. */
489 return c;
490 c = do_getc ();
491 if (c == EOF)
492 return EOF;
493 if (c == '\\')
494 {
495 c = do_getc ();
496 if (c == EOF)
497 return EOF;
498 c = do_getc_escaped (c, false);
499 }
500 return (c >= 'a' && c <= 'z' ? c - 'a' + 'A' : c);
501
502 case 'H': /* hyper */
503 case 'A': /* alt */
504 case 's': /* super */
505 c = do_getc ();
506 if (c == EOF)
507 return EOF;
508 if (c != '-')
509 /* Invalid input. But be tolerant. */
510 return c;
511 c = do_getc ();
512 if (c == EOF)
513 return EOF;
514 if (c == '\\')
515 {
516 c = do_getc ();
517 if (c == EOF)
518 return EOF;
519 c = do_getc_escaped (c, false);
520 }
521 return c;
522
523 case 'C': /* ctrl */
524 c = do_getc ();
525 if (c == EOF)
526 return EOF;
527 if (c != '-')
528 /* Invalid input. But be tolerant. */
529 return c;
530 /*FALLTHROUGH*/
531 case '^':
532 c = do_getc ();
533 if (c == EOF)
534 return EOF;
535 if (c == '\\')
536 {
537 c = do_getc ();
538 if (c == EOF)
539 return EOF;
540 c = do_getc_escaped (c, false);
541 }
542 if (c == '?')
543 return 0x7F;
544 if ((c & 0x5F) >= 0x41 && (c & 0x5F) <= 0x5A)
545 return c & 0x9F;
546 if ((c & 0x7F) >= 0x40 && (c & 0x7F) <= 0x5F)
547 return c & 0x9F;
548 #if 0 /* We cannot handle NUL bytes in strings. */
549 if (c == ' ')
550 return 0x00;
551 #endif
552 return c;
553
554 case '0': case '1': case '2': case '3': case '4':
555 case '5': case '6': case '7':
556 /* An octal escape, as in ANSI C. */
557 {
558 int n = c - '0';
559
560 c = do_getc ();
561 if (c != EOF)
562 {
563 if (c >= '0' && c <= '7')
564 {
565 n = (n << 3) + (c - '0');
566 c = do_getc ();
567 if (c != EOF)
568 {
569 if (c >= '0' && c <= '7')
570 n = (n << 3) + (c - '0');
571 else
572 do_ungetc (c);
573 }
574 }
575 else
576 do_ungetc (c);
577 }
578 return (unsigned char) n;
579 }
580
581 case 'x':
582 /* A hexadecimal escape, as in ANSI C. */
583 {
584 int n = 0;
585
586 for (;;)
587 {
588 c = do_getc ();
589 if (c == EOF)
590 break;
591 else if (c >= '0' && c <= '9')
592 n = (n << 4) + (c - '0');
593 else if (c >= 'A' && c <= 'F')
594 n = (n << 4) + (c - 'A' + 10);
595 else if (c >= 'a' && c <= 'f')
596 n = (n << 4) + (c - 'a' + 10);
597 else
598 {
599 do_ungetc (c);
600 break;
601 }
602 }
603 return (unsigned char) n;
604 }
605
606 default:
607 /* Ignore Emacs multibyte character stuff. All the strings we are
608 interested in are ASCII strings. */
609 return c;
610 }
611 }
612
613 /* Read the next object.
614 'first_in_list' and 'new_backquote_flag' are used for reading old
615 backquote syntax and new backquote syntax. */
616 static void
read_object(struct object * op,bool first_in_list,bool new_backquote_flag,flag_context_ty outer_context)617 read_object (struct object *op, bool first_in_list, bool new_backquote_flag,
618 flag_context_ty outer_context)
619 {
620 for (;;)
621 {
622 int c;
623
624 c = do_getc ();
625
626 switch (c)
627 {
628 case EOF:
629 op->type = t_eof;
630 return;
631
632 case '\n':
633 /* Comments assumed to be grouped with a message must immediately
634 precede it, with no non-whitespace token on a line between
635 both. */
636 if (last_non_comment_line > last_comment_line)
637 savable_comment_reset ();
638 continue;
639
640 case '(':
641 {
642 int arg = 0; /* Current argument number. */
643 flag_context_list_iterator_ty context_iter;
644 const struct callshapes *shapes = NULL;
645 struct arglist_parser *argparser = NULL;
646
647 for (;; arg++)
648 {
649 struct object inner;
650 flag_context_ty inner_context;
651
652 if (arg == 0)
653 inner_context = null_context;
654 else
655 inner_context =
656 inherited_context (outer_context,
657 flag_context_list_iterator_advance (
658 &context_iter));
659
660 read_object (&inner, arg == 0, new_backquote_flag,
661 inner_context);
662
663 /* Recognize end of list. */
664 if (inner.type == t_listclose)
665 {
666 op->type = t_other;
667 /* Don't bother converting "()" to "NIL". */
668 last_non_comment_line = line_number;
669 if (argparser != NULL)
670 arglist_parser_done (argparser, arg);
671 return;
672 }
673
674 /* Dots are not allowed in every position. ']' is not allowed.
675 But be tolerant. */
676
677 /* EOF inside list is illegal. But be tolerant. */
678 if (inner.type == t_eof)
679 break;
680
681 if (arg == 0)
682 {
683 /* This is the function position. */
684 if (inner.type == t_symbol)
685 {
686 char *symbol_name = string_of_object (&inner);
687 void *keyword_value;
688
689 if (hash_find_entry (&keywords,
690 symbol_name, strlen (symbol_name),
691 &keyword_value)
692 == 0)
693 shapes = (const struct callshapes *) keyword_value;
694
695 argparser = arglist_parser_alloc (mlp, shapes);
696
697 context_iter =
698 flag_context_list_iterator (
699 flag_context_list_table_lookup (
700 flag_context_list_table,
701 symbol_name, strlen (symbol_name)));
702
703 free (symbol_name);
704 }
705 else
706 context_iter = null_context_list_iterator;
707 }
708 else
709 {
710 /* These are the argument positions. */
711 if (argparser != NULL && inner.type == t_string)
712 {
713 char *s = string_of_object (&inner);
714 mixed_string_ty *ms =
715 mixed_string_alloc_simple (s, lc_string,
716 logical_file_name,
717 inner.line_number_at_start);
718 free (s);
719 arglist_parser_remember (argparser, arg, ms,
720 inner_context,
721 logical_file_name,
722 inner.line_number_at_start,
723 savable_comment, false);
724 }
725 }
726
727 free_object (&inner);
728 }
729
730 if (argparser != NULL)
731 arglist_parser_done (argparser, arg);
732 }
733 op->type = t_other;
734 last_non_comment_line = line_number;
735 return;
736
737 case ')':
738 /* Tell the caller about the end of list.
739 Unmatched closing parenthesis is illegal. But be tolerant. */
740 op->type = t_listclose;
741 last_non_comment_line = line_number;
742 return;
743
744 case '[':
745 {
746 for (;;)
747 {
748 struct object inner;
749
750 read_object (&inner, false, new_backquote_flag, null_context);
751
752 /* Recognize end of vector. */
753 if (inner.type == t_vectorclose)
754 {
755 op->type = t_other;
756 last_non_comment_line = line_number;
757 return;
758 }
759
760 /* Dots and ')' are not allowed. But be tolerant. */
761
762 /* EOF inside vector is illegal. But be tolerant. */
763 if (inner.type == t_eof)
764 break;
765
766 free_object (&inner);
767 }
768 }
769 op->type = t_other;
770 last_non_comment_line = line_number;
771 return;
772
773 case ']':
774 /* Tell the caller about the end of vector.
775 Unmatched closing bracket is illegal. But be tolerant. */
776 op->type = t_vectorclose;
777 last_non_comment_line = line_number;
778 return;
779
780 case '\'':
781 {
782 struct object inner;
783
784 read_object (&inner, false, new_backquote_flag, null_context);
785
786 /* Dots and EOF are not allowed here. But be tolerant. */
787
788 free_object (&inner);
789
790 op->type = t_other;
791 last_non_comment_line = line_number;
792 return;
793 }
794
795 case '`':
796 if (first_in_list)
797 goto default_label;
798 {
799 struct object inner;
800
801 read_object (&inner, false, true, null_context);
802
803 /* Dots and EOF are not allowed here. But be tolerant. */
804
805 free_object (&inner);
806
807 op->type = t_other;
808 last_non_comment_line = line_number;
809 return;
810 }
811
812 case ',':
813 if (!new_backquote_flag)
814 goto default_label;
815 {
816 int c = do_getc ();
817 /* The ,@ handling inside lists is wrong anyway, because
818 ,@form expands to an unknown number of elements. */
819 if (c != EOF && c != '@' && c != '.')
820 do_ungetc (c);
821 }
822 {
823 struct object inner;
824
825 read_object (&inner, false, false, null_context);
826
827 /* Dots and EOF are not allowed here. But be tolerant. */
828
829 free_object (&inner);
830
831 op->type = t_other;
832 last_non_comment_line = line_number;
833 return;
834 }
835
836 case ';':
837 {
838 bool all_semicolons = true;
839
840 last_comment_line = line_number;
841 comment_start ();
842 for (;;)
843 {
844 int c = do_getc ();
845 if (c == EOF || c == '\n')
846 break;
847 if (c != ';')
848 all_semicolons = false;
849 if (!all_semicolons)
850 {
851 /* We skip all leading white space, but not EOLs. */
852 if (!(buflen == 0 && (c == ' ' || c == '\t')))
853 comment_add (c);
854 }
855 }
856 comment_line_end (0);
857 continue;
858 }
859
860 case '"':
861 {
862 op->token = XMALLOC (struct token);
863 init_token (op->token);
864 op->line_number_at_start = line_number;
865 for (;;)
866 {
867 int c = do_getc ();
868 if (c == EOF)
869 /* Invalid input. Be tolerant, no error message. */
870 break;
871 if (c == '"')
872 break;
873 if (c == '\\')
874 {
875 c = do_getc ();
876 if (c == EOF)
877 /* Invalid input. Be tolerant, no error message. */
878 break;
879 c = do_getc_escaped (c, true);
880 if (c == EOF)
881 /* Invalid input. Be tolerant, no error message. */
882 break;
883 if (c == IGNORABLE_ESCAPE)
884 /* Ignore escaped newline and escaped space. */
885 ;
886 else
887 {
888 grow_token (op->token);
889 op->token->chars[op->token->charcount++] = c;
890 }
891 }
892 else
893 {
894 grow_token (op->token);
895 op->token->chars[op->token->charcount++] = c;
896 }
897 }
898 op->type = t_string;
899
900 if (extract_all)
901 {
902 lex_pos_ty pos;
903
904 pos.file_name = logical_file_name;
905 pos.line_number = op->line_number_at_start;
906 remember_a_message (mlp, NULL, string_of_object (op), false,
907 false, null_context, &pos,
908 NULL, savable_comment, false);
909 }
910 last_non_comment_line = line_number;
911 return;
912 }
913
914 case '?':
915 c = do_getc ();
916 if (c == EOF)
917 /* Invalid input. Be tolerant, no error message. */
918 ;
919 else if (c == '\\')
920 {
921 c = do_getc ();
922 if (c == EOF)
923 /* Invalid input. Be tolerant, no error message. */
924 ;
925 else
926 {
927 c = do_getc_escaped (c, false);
928 if (c == EOF)
929 /* Invalid input. Be tolerant, no error message. */
930 ;
931 }
932 }
933 /* Impossible to deal with Emacs multibyte character stuff here. */
934 op->type = t_other;
935 last_non_comment_line = line_number;
936 return;
937
938 case '#':
939 /* Dispatch macro handling. */
940 c = do_getc ();
941 if (c == EOF)
942 /* Invalid input. Be tolerant, no error message. */
943 {
944 op->type = t_other;
945 return;
946 }
947
948 switch (c)
949 {
950 case '^':
951 c = do_getc ();
952 if (c == '^')
953 c = do_getc ();
954 if (c == '[')
955 {
956 /* Read a char table, same syntax as a vector. */
957 for (;;)
958 {
959 struct object inner;
960
961 read_object (&inner, false, new_backquote_flag,
962 null_context);
963
964 /* Recognize end of vector. */
965 if (inner.type == t_vectorclose)
966 {
967 op->type = t_other;
968 last_non_comment_line = line_number;
969 return;
970 }
971
972 /* Dots and ')' are not allowed. But be tolerant. */
973
974 /* EOF inside vector is illegal. But be tolerant. */
975 if (inner.type == t_eof)
976 break;
977
978 free_object (&inner);
979 }
980 op->type = t_other;
981 last_non_comment_line = line_number;
982 return;
983 }
984 else
985 /* Invalid input. Be tolerant, no error message. */
986 {
987 op->type = t_other;
988 if (c != EOF)
989 last_non_comment_line = line_number;
990 return;
991 }
992
993 case '&':
994 /* Read a bit vector. */
995 {
996 struct object length;
997 read_object (&length, first_in_list, new_backquote_flag,
998 null_context);
999 /* Dots and EOF are not allowed here.
1000 But be tolerant. */
1001 free_object (&length);
1002 }
1003 c = do_getc ();
1004 if (c == '"')
1005 {
1006 struct object string;
1007 read_object (&string, first_in_list, new_backquote_flag,
1008 null_context);
1009 free_object (&string);
1010 }
1011 else
1012 /* Invalid input. Be tolerant, no error message. */
1013 do_ungetc (c);
1014 op->type = t_other;
1015 last_non_comment_line = line_number;
1016 return;
1017
1018 case '[':
1019 /* Read a compiled function, same syntax as a vector. */
1020 case '(':
1021 /* Read a string with properties, same syntax as a list. */
1022 {
1023 struct object inner;
1024 do_ungetc (c);
1025 read_object (&inner, false, new_backquote_flag, null_context);
1026 /* Dots and EOF are not allowed here.
1027 But be tolerant. */
1028 free_object (&inner);
1029 op->type = t_other;
1030 last_non_comment_line = line_number;
1031 return;
1032 }
1033
1034 case '@':
1035 /* Read a comment consisting of a given number of bytes. */
1036 {
1037 unsigned int nskip = 0;
1038
1039 for (;;)
1040 {
1041 c = do_getc ();
1042 if (!(c >= '0' && c <= '9'))
1043 break;
1044 nskip = 10 * nskip + (c - '0');
1045 }
1046 if (c != EOF)
1047 {
1048 do_ungetc (c);
1049 for (; nskip > 0; nskip--)
1050 if (do_getc () == EOF)
1051 break;
1052 }
1053 continue;
1054 }
1055
1056 case '$':
1057 op->type = t_other;
1058 last_non_comment_line = line_number;
1059 return;
1060
1061 case '\'':
1062 case ':':
1063 case 'S': case 's': /* XEmacs only */
1064 {
1065 struct object inner;
1066 read_object (&inner, false, new_backquote_flag, null_context);
1067 /* Dots and EOF are not allowed here.
1068 But be tolerant. */
1069 free_object (&inner);
1070 op->type = t_other;
1071 last_non_comment_line = line_number;
1072 return;
1073 }
1074
1075 case '0': case '1': case '2': case '3': case '4':
1076 case '5': case '6': case '7': case '8': case '9':
1077 /* Read Common Lisp style #n# or #n=. */
1078 for (;;)
1079 {
1080 c = do_getc ();
1081 if (!(c >= '0' && c <= '9'))
1082 break;
1083 }
1084 if (c == EOF)
1085 /* Invalid input. Be tolerant, no error message. */
1086 {
1087 op->type = t_other;
1088 return;
1089 }
1090 if (c == '=')
1091 {
1092 read_object (op, false, new_backquote_flag, outer_context);
1093 last_non_comment_line = line_number;
1094 return;
1095 }
1096 if (c == '#')
1097 {
1098 op->type = t_other;
1099 last_non_comment_line = line_number;
1100 return;
1101 }
1102 if (c == 'R' || c == 'r')
1103 {
1104 /* Read an integer. */
1105 c = do_getc ();
1106 if (c == '+' || c == '-')
1107 c = do_getc ();
1108 for (; c != EOF; c = do_getc ())
1109 if (!c_isalnum (c))
1110 {
1111 do_ungetc (c);
1112 break;
1113 }
1114 op->type = t_other;
1115 last_non_comment_line = line_number;
1116 return;
1117 }
1118 /* Invalid input. Be tolerant, no error message. */
1119 op->type = t_other;
1120 last_non_comment_line = line_number;
1121 return;
1122
1123 case 'X': case 'x':
1124 case 'O': case 'o':
1125 case 'B': case 'b':
1126 {
1127 /* Read an integer. */
1128 c = do_getc ();
1129 if (c == '+' || c == '-')
1130 c = do_getc ();
1131 for (; c != EOF; c = do_getc ())
1132 if (!c_isalnum (c))
1133 {
1134 do_ungetc (c);
1135 break;
1136 }
1137 op->type = t_other;
1138 last_non_comment_line = line_number;
1139 return;
1140 }
1141
1142 case '*': /* XEmacs only */
1143 {
1144 /* Read a bit-vector. */
1145 do
1146 c = do_getc ();
1147 while (c == '0' || c == '1');
1148 if (c != EOF)
1149 do_ungetc (c);
1150 op->type = t_other;
1151 last_non_comment_line = line_number;
1152 return;
1153 }
1154
1155 case '+': /* XEmacs only */
1156 case '-': /* XEmacs only */
1157 /* Simply assume every feature expression is true. */
1158 {
1159 struct object inner;
1160 read_object (&inner, false, new_backquote_flag, null_context);
1161 /* Dots and EOF are not allowed here.
1162 But be tolerant. */
1163 free_object (&inner);
1164 continue;
1165 }
1166
1167 default:
1168 /* Invalid input. Be tolerant, no error message. */
1169 op->type = t_other;
1170 last_non_comment_line = line_number;
1171 return;
1172 }
1173
1174 /*NOTREACHED*/
1175 abort ();
1176
1177 case '.':
1178 c = do_getc ();
1179 if (c != EOF)
1180 {
1181 do_ungetc (c);
1182 if (c <= ' ' /* FIXME: Assumes ASCII compatible encoding */
1183 || strchr ("\"'`,(", c) != NULL)
1184 {
1185 op->type = t_dot;
1186 last_non_comment_line = line_number;
1187 return;
1188 }
1189 }
1190 c = '.';
1191 /*FALLTHROUGH*/
1192 default:
1193 default_label:
1194 if (c <= ' ') /* FIXME: Assumes ASCII compatible encoding */
1195 continue;
1196 /* Read a token. */
1197 {
1198 bool symbol;
1199
1200 op->token = XMALLOC (struct token);
1201 symbol = read_token (op->token, c);
1202 if (symbol)
1203 {
1204 op->type = t_symbol;
1205 last_non_comment_line = line_number;
1206 return;
1207 }
1208 else
1209 {
1210 free_token (op->token);
1211 free (op->token);
1212 op->type = t_other;
1213 last_non_comment_line = line_number;
1214 return;
1215 }
1216 }
1217 }
1218 }
1219 }
1220
1221
1222 void
extract_elisp(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1223 extract_elisp (FILE *f,
1224 const char *real_filename, const char *logical_filename,
1225 flag_context_list_table_ty *flag_table,
1226 msgdomain_list_ty *mdlp)
1227 {
1228 mlp = mdlp->item[0]->messages;
1229
1230 fp = f;
1231 real_file_name = real_filename;
1232 logical_file_name = xstrdup (logical_filename);
1233 line_number = 1;
1234
1235 last_comment_line = -1;
1236 last_non_comment_line = -1;
1237
1238 flag_context_list_table = flag_table;
1239
1240 init_keywords ();
1241
1242 /* Eat tokens until eof is seen. When read_object returns
1243 due to an unbalanced closing parenthesis, just restart it. */
1244 do
1245 {
1246 struct object toplevel_object;
1247
1248 read_object (&toplevel_object, false, false, null_context);
1249
1250 if (toplevel_object.type == t_eof)
1251 break;
1252
1253 free_object (&toplevel_object);
1254 }
1255 while (!feof (fp));
1256
1257 /* Close scanner. */
1258 fp = NULL;
1259 real_file_name = NULL;
1260 logical_file_name = NULL;
1261 line_number = 0;
1262 }
1263