• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* xgettext Smalltalk backend.
2    Copyright (C) 2002-2003, 2005-2009, 2011, 2018-2020 Free Software Foundation, Inc.
3 
4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
5 
6    This program is free software: you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 
23 /* Specification.  */
24 #include "x-smalltalk.h"
25 
26 #include <errno.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 
30 #include "message.h"
31 #include "xgettext.h"
32 #include "xg-pos.h"
33 #include "xg-message.h"
34 #include "error.h"
35 #include "xalloc.h"
36 #include "gettext.h"
37 
/* Shorthand for looking up the translation of the message S.  */
#define _(s) gettext(s)

/* Number of elements of the array A.  Only meaningful for true arrays,
   not for pointers.  The argument is parenthesized so that an arbitrary
   array-valued expression can be passed.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
41 
42 
/* The relevant parts of the Smalltalk syntax are:

     stringliteral ::= string | stringconst | symconst
     stringconst ::= "#"string
     string      ::= "'"[char]*"'"
     symconst    ::= "#"symbol
     symbol      ::= id | binsel | keysel[keysel]*
     keysel      ::= id":"
     id          ::= letter[letter|digit]*
     letter      ::= "A".."Z" | "a".."z"
     digit       ::= "0".."9"
     binsel      ::= selchar[selchar]
     selchar     ::= "+" | "-" | "*" | "/" | "~" | "|" | "," | "<" | ">"
                     | "=" | "&" | "@" | "?" | "%" | "\"

   Strings can contain any characters; to include the string delimiter itself,
   it must be duplicated.

   Character constants are written  "$"char

   Comments are enclosed within double quotes.

   In well-formed expressions, {} and [] and () are balanced.
 */
67 
68 
69 /* ======================== Reading of characters.  ======================== */
70 
/* The input file stream.  Set by extract_smalltalk () for the duration of
   a scan and reset to NULL afterwards.  */
static FILE *fp;
73 
74 
75 /* 1. line_number handling.  */
76 
77 static int
phase1_getc()78 phase1_getc ()
79 {
80   int c = getc (fp);
81 
82   if (c == EOF)
83     {
84       if (ferror (fp))
85         error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
86                real_file_name);
87       return EOF;
88     }
89 
90   if (c == '\n')
91     line_number++;
92 
93   return c;
94 }
95 
96 /* Supports only one pushback character.  */
97 static void
phase1_ungetc(int c)98 phase1_ungetc (int c)
99 {
100   if (c != EOF)
101     {
102       if (c == '\n')
103         --line_number;
104 
105       ungetc (c, fp);
106     }
107 }
108 
109 
/* Accumulating comments.  */

/* Growable buffer holding the comment line currently being collected:
   BUFFER has room for BUFMAX bytes, of which the first BUFLEN are used.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;
115 
116 static inline void
comment_start()117 comment_start ()
118 {
119   buflen = 0;
120 }
121 
122 static inline void
comment_add(int c)123 comment_add (int c)
124 {
125   if (buflen >= bufmax)
126     {
127       bufmax = 2 * bufmax + 10;
128       buffer = xrealloc (buffer, bufmax);
129     }
130   buffer[buflen++] = c;
131 }
132 
133 static inline void
comment_line_end()134 comment_line_end ()
135 {
136   while (buflen >= 1
137          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
138     --buflen;
139   if (buflen >= bufmax)
140     {
141       bufmax = 2 * bufmax + 10;
142       buffer = xrealloc (buffer, bufmax);
143     }
144   buffer[buflen] = '\0';
145   savable_comment_add (buffer);
146 }
147 
148 
/* These are for tracking whether comments count as immediately before
   keyword: the line on which the most recent comment started, and the line
   of the most recent token that is neither comment nor whitespace.  */
static int last_comment_line;
static int last_non_comment_line;
153 
154 
155 /* ========================== Reading of tokens.  ========================== */
156 
157 
/* Kinds of tokens produced by the tokenizer phases.  */
enum token_type_ty
{
  token_type_eof,
  token_type_uniq,              /* # */
  token_type_symbol,            /* symbol */
  token_type_string_literal,    /* string, stringconst, symconst */
  token_type_other              /* misc. operator */
};
typedef enum token_type_ty token_type_ty;

/* A token together with the line on which it was read.  */
typedef struct token_ty token_ty;
struct token_ty
{
  token_type_ty type;
  char *string;         /* for token_type_string_literal, token_type_symbol */
  int line_number;
};
175 
176 
177 /* 2. Combine characters into tokens.  Discard comments and whitespace.  */
178 
179 static token_ty phase2_pushback[1];
180 static int phase2_pushback_length;
181 
182 static void
phase2_get(token_ty * tp)183 phase2_get (token_ty *tp)
184 {
185   static char *buffer;
186   static int bufmax;
187   int bufpos;
188   int c;
189 
190   if (phase2_pushback_length)
191     {
192       *tp = phase2_pushback[--phase2_pushback_length];
193       return;
194     }
195 
196   tp->string = NULL;
197 
198   for (;;)
199     {
200       tp->line_number = line_number;
201       c = phase1_getc ();
202       switch (c)
203         {
204         case EOF:
205           tp->type = token_type_eof;
206           return;
207 
208         case '"':
209           {
210             /* Comment.  */
211             int lineno;
212 
213             comment_start ();
214             lineno = line_number;
215             for (;;)
216               {
217                 c = phase1_getc ();
218                 if (c == '"' || c == EOF)
219                   break;
220                 if (c == '\n')
221                   {
222                     comment_line_end ();
223                     comment_start ();
224                   }
225                 else
226                   {
227                     /* We skip all leading white space, but not EOLs.  */
228                     if (!(buflen == 0 && (c == ' ' || c == '\t')))
229                       comment_add (c);
230                   }
231               }
232             comment_line_end ();
233             last_comment_line = lineno;
234             continue;
235           }
236 
237         case '\n':
238           if (last_non_comment_line > last_comment_line)
239             savable_comment_reset ();
240           /* FALLTHROUGH */
241         case ' ':
242         case '\t':
243         case '\r':
244           /* Ignore whitespace.  */
245           continue;
246         }
247 
248       last_non_comment_line = tp->line_number;
249 
250       switch (c)
251         {
252         case '\'':
253           /* String literal.  */
254           bufpos = 0;
255           for (;;)
256             {
257               c = phase1_getc ();
258               if (c == EOF)
259                 break;
260               if (c == '\'')
261                 {
262                   c = phase1_getc ();
263                   if (c != '\'')
264                     {
265                       phase1_ungetc (c);
266                       break;
267                     }
268                 }
269               if (bufpos >= bufmax)
270                 {
271                   bufmax = 2 * bufmax + 10;
272                   buffer = xrealloc (buffer, bufmax);
273                 }
274               buffer[bufpos++] = c;
275             }
276           if (bufpos >= bufmax)
277             {
278               bufmax = 2 * bufmax + 10;
279               buffer = xrealloc (buffer, bufmax);
280             }
281           buffer[bufpos] = 0;
282           tp->type = token_type_string_literal;
283           tp->string = xstrdup (buffer);
284           return;
285 
286         case '+':
287         case '-':
288         case '*':
289         case '/':
290         case '~':
291         case '|':
292         case ',':
293         case '<':
294         case '>':
295         case '=':
296         case '&':
297         case '@':
298         case '?':
299         case '%':
300         case '\\':
301           {
302             char *name;
303             int c2 = phase1_getc ();
304             switch (c2)
305               {
306               case '+':
307               case '-':
308               case '*':
309               case '/':
310               case '~':
311               case '|':
312               case ',':
313               case '<':
314               case '>':
315               case '=':
316               case '&':
317               case '@':
318               case '?':
319               case '%':
320                 name = XNMALLOC (3, char);
321                 name[0] = c;
322                 name[1] = c2;
323                 name[2] = '\0';
324                 tp->type = token_type_symbol;
325                 tp->string = name;
326                 return;
327               default:
328                 phase1_ungetc (c2);
329                 break;
330               }
331             name = XNMALLOC (2, char);
332             name[0] = c;
333             name[1] = '\0';
334             tp->type = token_type_symbol;
335             tp->string = name;
336             return;
337           }
338 
339         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
340         case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
341         case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
342         case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
343         case 'Y': case 'Z':
344         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
345         case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
346         case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
347         case 's': case 't': case 'u': case 'v': case 'w': case 'x':
348         case 'y': case 'z':
349           /* Recognize id or id":"[id":"]* or id":"[id":"]*id.  */
350           bufpos = 0;
351           for (;;)
352             {
353               if (bufpos >= bufmax)
354                 {
355                   bufmax = 2 * bufmax + 10;
356                   buffer = xrealloc (buffer, bufmax);
357                 }
358               buffer[bufpos++] = c;
359               c = phase1_getc ();
360               switch (c)
361                 {
362                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
363                 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
364                 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
365                 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
366                 case 'Y': case 'Z':
367                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
368                 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
369                 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
370                 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
371                 case 'y': case 'z':
372                 case '0': case '1': case '2': case '3': case '4':
373                 case '5': case '6': case '7': case '8': case '9':
374                   continue;
375                 case ':':
376                   if (bufpos >= bufmax)
377                     {
378                       bufmax = 2 * bufmax + 10;
379                       buffer = xrealloc (buffer, bufmax);
380                     }
381                   buffer[bufpos++] = c;
382                   c = phase1_getc ();
383                   switch (c)
384                     {
385                     case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
386                     case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
387                     case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
388                     case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
389                     case 'Y': case 'Z':
390                     case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
391                     case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
392                     case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
393                     case 's': case 't': case 'u': case 'v': case 'w': case 'x':
394                     case 'y': case 'z':
395                       continue;
396                     default:
397                       phase1_ungetc (c);
398                       break;
399                     }
400                   break;
401                 default:
402                   phase1_ungetc (c);
403                   break;
404                 }
405               break;
406             }
407           if (bufpos >= bufmax)
408             {
409               bufmax = 2 * bufmax + 10;
410               buffer = xrealloc (buffer, bufmax);
411             }
412           buffer[bufpos] = '\0';
413           tp->string = xstrdup (buffer);
414           tp->type = token_type_symbol;
415           return;
416 
417         case '#':
418           /* Uniquification operator.  */
419           tp->type = token_type_uniq;
420           return;
421 
422         case '$':
423           c = phase1_getc ();
424           tp->type = token_type_other;
425           return;
426 
427         default:
428           tp->type = token_type_other;
429           return;
430         }
431     }
432 }
433 
434 /* Supports only one pushback token.  */
435 static void
phase2_unget(token_ty * tp)436 phase2_unget (token_ty *tp)
437 {
438   if (tp->type != token_type_eof)
439     {
440       if (phase2_pushback_length == SIZEOF (phase2_pushback))
441         abort ();
442       phase2_pushback[phase2_pushback_length++] = *tp;
443     }
444 }
445 
446 
447 /* 3. Combine "# string_literal" and "# symbol" to a single token.  */
448 
449 static token_ty phase3_pushback[1];
450 static int phase3_pushback_length;
451 
452 static void
phase3_get(token_ty * tp)453 phase3_get (token_ty *tp)
454 {
455   if (phase3_pushback_length)
456     {
457       *tp = phase3_pushback[--phase3_pushback_length];
458       return;
459     }
460 
461   phase2_get (tp);
462   if (tp->type == token_type_uniq)
463     {
464       token_ty token2;
465 
466       phase2_get (&token2);
467       if (token2.type == token_type_symbol
468           || token2.type == token_type_string_literal)
469         {
470           tp->type = token_type_string_literal;
471           tp->string = token2.string;
472         }
473       else
474         phase2_unget (&token2);
475     }
476 }
477 
478 /* Supports only one pushback token.  */
479 static void
phase3_unget(token_ty * tp)480 phase3_unget (token_ty *tp)
481 {
482   if (tp->type != token_type_eof)
483     {
484       if (phase3_pushback_length == SIZEOF (phase3_pushback))
485         abort ();
486       phase3_pushback[phase3_pushback_length++] = *tp;
487     }
488 }
489 
490 
491 /* ========================= Extracting strings.  ========================== */
492 
493 /* The file is broken into tokens.  Scan the token stream, looking for the
494    following patterns
495       NLS ? <string>
496       NLS at: <string>
497       NLS at: <string> plural: <string>
498    where <string> is one of
499       string_literal
500       # string_literal
501       # symbol
502  */
503 
504 void
extract_smalltalk(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)505 extract_smalltalk (FILE *f,
506                    const char *real_filename, const char *logical_filename,
507                    flag_context_list_table_ty *flag_table,
508                    msgdomain_list_ty *mdlp)
509 {
510   message_list_ty *mlp = mdlp->item[0]->messages;
511 
512   fp = f;
513   real_file_name = real_filename;
514   logical_file_name = xstrdup (logical_filename);
515   line_number = 1;
516 
517   last_comment_line = -1;
518   last_non_comment_line = -1;
519 
520   phase2_pushback_length = 0;
521   phase3_pushback_length = 0;
522 
523   /* Eat tokens until eof is seen.  */
524   {
525     /* 0 when no "NLS" has been seen.
526        1 after "NLS".
527        2 after "NLS ?".
528        3 after "NLS at:".
529        4 after "NLS at: <string>".
530        5 after "NLS at: <string> plural:".  */
531     int state;
532     /* Remember the message containing the msgid, for msgid_plural.
533        Non-NULL in states 4, 5.  */
534     message_ty *plural_mp = NULL;
535 
536     /* Start state is 0.  */
537     state = 0;
538 
539     for (;;)
540       {
541         token_ty token;
542 
543         phase3_get (&token);
544 
545         switch (token.type)
546           {
547           case token_type_symbol:
548             state = (strcmp (token.string, "NLS") == 0 ? 1 :
549                      strcmp (token.string, "?") == 0 && state == 1 ? 2 :
550                      strcmp (token.string, "at:") == 0 && state == 1 ? 3 :
551                      strcmp (token.string, "plural:") == 0 && state == 4 ? 5 :
552                      0);
553             free (token.string);
554             break;
555 
556           case token_type_string_literal:
557             if (state == 2)
558               {
559                 lex_pos_ty pos;
560                 pos.file_name = logical_file_name;
561                 pos.line_number = token.line_number;
562                 remember_a_message (mlp, NULL, token.string, false, false,
563                                     null_context, &pos, NULL, savable_comment,
564                                     false);
565                 state = 0;
566                 break;
567               }
568             if (state == 3)
569               {
570                 lex_pos_ty pos;
571                 token_ty token2;
572 
573                 pos.file_name = logical_file_name;
574                 pos.line_number = token.line_number;
575 
576                 phase3_get (&token2);
577 
578                 plural_mp =
579                   remember_a_message (mlp, NULL, token.string, false,
580                                       token2.type == token_type_symbol
581                                       && strcmp (token.string, "plural:") == 0,
582                                       null_context, &pos,
583                                       NULL, savable_comment, false);
584 
585                 phase3_unget (&token2);
586 
587                 state = 4;
588                 break;
589               }
590             if (state == 5)
591               {
592                 lex_pos_ty pos;
593                 pos.file_name = logical_file_name;
594                 pos.line_number = token.line_number;
595                 if (plural_mp != NULL)
596                   remember_a_message_plural (plural_mp, token.string, false,
597                                              null_context, &pos,
598                                              savable_comment, false);
599                 state = 0;
600                 break;
601               }
602             state = 0;
603             free (token.string);
604             break;
605 
606           case token_type_uniq:
607           case token_type_other:
608             state = 0;
609             break;
610 
611           case token_type_eof:
612             break;
613 
614           default:
615             abort ();
616           }
617 
618         if (token.type == token_type_eof)
619           break;
620       }
621   }
622 
623   /* Close scanner.  */
624   fp = NULL;
625   real_file_name = NULL;
626   logical_file_name = NULL;
627   line_number = 0;
628 }
629