1 /* xgettext Smalltalk backend.
2 Copyright (C) 2002-2003, 2005-2009, 2011, 2018-2020 Free Software Foundation, Inc.
3
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
5
6 This program is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 3 of the License, or
9 (at your option) any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program. If not, see <https://www.gnu.org/licenses/>. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 /* Specification. */
24 #include "x-smalltalk.h"
25
26 #include <errno.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29
30 #include "message.h"
31 #include "xgettext.h"
32 #include "xg-pos.h"
33 #include "xg-message.h"
34 #include "error.h"
35 #include "xalloc.h"
36 #include "gettext.h"
37
/* Shorthand for looking up the translation of the message S.  */
#define _(s) gettext(s)

/* Number of elements of the array A.  A must be a real array, not a pointer.
   The argument is parenthesized so that any array-valued expression can be
   passed safely.  */
#define SIZEOF(a) (sizeof(a) / sizeof((a)[0]))
41
42
/* The relevant parts of the Smalltalk syntax are:

     stringliteral ::= string | stringconst | symconst
     stringconst   ::= "#"string
     string        ::= "'"[char]*"'"
     symconst      ::= "#"symbol
     symbol        ::= id | binsel | keysel[keysel]*
     keysel        ::= id":"
     id            ::= letter[letter|digit]*
     letter        ::= "A".."Z" | "a".."z"
     digit         ::= "0".."9"
     binsel        ::= selchar[selchar]
     selchar       ::= "+" | "-" | "*" | "/" | "~" | "|" | "," | "<" | ">"
                       | "=" | "&" | "@" | "?" | "%" | "\"

   Strings can contain any characters; to include the string delimiter itself,
   it must be duplicated.

   Character constants are written "$"char

   Comments are enclosed within double quotes.

   In well-formed expressions, {} and [] and () are balanced.
 */
67
68
69 /* ======================== Reading of characters. ======================== */
70
71 /* The input file stream. */
72 static FILE *fp;
73
74
75 /* 1. line_number handling. */
76
77 static int
phase1_getc()78 phase1_getc ()
79 {
80 int c = getc (fp);
81
82 if (c == EOF)
83 {
84 if (ferror (fp))
85 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
86 real_file_name);
87 return EOF;
88 }
89
90 if (c == '\n')
91 line_number++;
92
93 return c;
94 }
95
96 /* Supports only one pushback character. */
97 static void
phase1_ungetc(int c)98 phase1_ungetc (int c)
99 {
100 if (c != EOF)
101 {
102 if (c == '\n')
103 --line_number;
104
105 ungetc (c, fp);
106 }
107 }
108
109
110 /* Accumulating comments. */
111
/* Buffer in which the current comment line is accumulated: BUFFER has room
   for BUFMAX bytes, of which the first BUFLEN are in use.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;

/* Begin accumulating a new comment line, discarding the previous contents.
   The allocated storage is kept for reuse.  */
static inline void
comment_start ()
{
  buflen = 0;
}
121
122 static inline void
comment_add(int c)123 comment_add (int c)
124 {
125 if (buflen >= bufmax)
126 {
127 bufmax = 2 * bufmax + 10;
128 buffer = xrealloc (buffer, bufmax);
129 }
130 buffer[buflen++] = c;
131 }
132
133 static inline void
comment_line_end()134 comment_line_end ()
135 {
136 while (buflen >= 1
137 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
138 --buflen;
139 if (buflen >= bufmax)
140 {
141 bufmax = 2 * bufmax + 10;
142 buffer = xrealloc (buffer, bufmax);
143 }
144 buffer[buflen] = '\0';
145 savable_comment_add (buffer);
146 }
147
148
/* These are for tracking whether comments count as immediately before
   keyword.  Both are set to -1 when extraction of a file starts.  */

/* Line on which the most recently read comment started.  */
static int last_comment_line;

/* Line on which the most recently read non-comment, non-whitespace token
   started.  */
static int last_non_comment_line;
153
154
155 /* ========================== Reading of tokens. ========================== */
156
157
/* The kinds of token the tokenizer phases hand out.  */
typedef enum token_type_ty
{
  token_type_eof,               /* end of input */
  token_type_uniq,              /* # */
  token_type_symbol,            /* symbol */
  token_type_string_literal,    /* string, stringconst, symconst */
  token_type_other              /* misc. operator */
} token_type_ty;
167
168 typedef struct token_ty token_ty;
169 struct token_ty
170 {
171 token_type_ty type;
172 char *string; /* for token_type_string_literal, token_type_symbol */
173 int line_number;
174 };
175
176
177 /* 2. Combine characters into tokens. Discard comments and whitespace. */
178
179 static token_ty phase2_pushback[1];
180 static int phase2_pushback_length;
181
182 static void
phase2_get(token_ty * tp)183 phase2_get (token_ty *tp)
184 {
185 static char *buffer;
186 static int bufmax;
187 int bufpos;
188 int c;
189
190 if (phase2_pushback_length)
191 {
192 *tp = phase2_pushback[--phase2_pushback_length];
193 return;
194 }
195
196 tp->string = NULL;
197
198 for (;;)
199 {
200 tp->line_number = line_number;
201 c = phase1_getc ();
202 switch (c)
203 {
204 case EOF:
205 tp->type = token_type_eof;
206 return;
207
208 case '"':
209 {
210 /* Comment. */
211 int lineno;
212
213 comment_start ();
214 lineno = line_number;
215 for (;;)
216 {
217 c = phase1_getc ();
218 if (c == '"' || c == EOF)
219 break;
220 if (c == '\n')
221 {
222 comment_line_end ();
223 comment_start ();
224 }
225 else
226 {
227 /* We skip all leading white space, but not EOLs. */
228 if (!(buflen == 0 && (c == ' ' || c == '\t')))
229 comment_add (c);
230 }
231 }
232 comment_line_end ();
233 last_comment_line = lineno;
234 continue;
235 }
236
237 case '\n':
238 if (last_non_comment_line > last_comment_line)
239 savable_comment_reset ();
240 /* FALLTHROUGH */
241 case ' ':
242 case '\t':
243 case '\r':
244 /* Ignore whitespace. */
245 continue;
246 }
247
248 last_non_comment_line = tp->line_number;
249
250 switch (c)
251 {
252 case '\'':
253 /* String literal. */
254 bufpos = 0;
255 for (;;)
256 {
257 c = phase1_getc ();
258 if (c == EOF)
259 break;
260 if (c == '\'')
261 {
262 c = phase1_getc ();
263 if (c != '\'')
264 {
265 phase1_ungetc (c);
266 break;
267 }
268 }
269 if (bufpos >= bufmax)
270 {
271 bufmax = 2 * bufmax + 10;
272 buffer = xrealloc (buffer, bufmax);
273 }
274 buffer[bufpos++] = c;
275 }
276 if (bufpos >= bufmax)
277 {
278 bufmax = 2 * bufmax + 10;
279 buffer = xrealloc (buffer, bufmax);
280 }
281 buffer[bufpos] = 0;
282 tp->type = token_type_string_literal;
283 tp->string = xstrdup (buffer);
284 return;
285
286 case '+':
287 case '-':
288 case '*':
289 case '/':
290 case '~':
291 case '|':
292 case ',':
293 case '<':
294 case '>':
295 case '=':
296 case '&':
297 case '@':
298 case '?':
299 case '%':
300 case '\\':
301 {
302 char *name;
303 int c2 = phase1_getc ();
304 switch (c2)
305 {
306 case '+':
307 case '-':
308 case '*':
309 case '/':
310 case '~':
311 case '|':
312 case ',':
313 case '<':
314 case '>':
315 case '=':
316 case '&':
317 case '@':
318 case '?':
319 case '%':
320 name = XNMALLOC (3, char);
321 name[0] = c;
322 name[1] = c2;
323 name[2] = '\0';
324 tp->type = token_type_symbol;
325 tp->string = name;
326 return;
327 default:
328 phase1_ungetc (c2);
329 break;
330 }
331 name = XNMALLOC (2, char);
332 name[0] = c;
333 name[1] = '\0';
334 tp->type = token_type_symbol;
335 tp->string = name;
336 return;
337 }
338
339 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
340 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
341 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
342 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
343 case 'Y': case 'Z':
344 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
345 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
346 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
347 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
348 case 'y': case 'z':
349 /* Recognize id or id":"[id":"]* or id":"[id":"]*id. */
350 bufpos = 0;
351 for (;;)
352 {
353 if (bufpos >= bufmax)
354 {
355 bufmax = 2 * bufmax + 10;
356 buffer = xrealloc (buffer, bufmax);
357 }
358 buffer[bufpos++] = c;
359 c = phase1_getc ();
360 switch (c)
361 {
362 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
363 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
364 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
365 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
366 case 'Y': case 'Z':
367 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
368 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
369 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
370 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
371 case 'y': case 'z':
372 case '0': case '1': case '2': case '3': case '4':
373 case '5': case '6': case '7': case '8': case '9':
374 continue;
375 case ':':
376 if (bufpos >= bufmax)
377 {
378 bufmax = 2 * bufmax + 10;
379 buffer = xrealloc (buffer, bufmax);
380 }
381 buffer[bufpos++] = c;
382 c = phase1_getc ();
383 switch (c)
384 {
385 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
386 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
387 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
388 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
389 case 'Y': case 'Z':
390 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
391 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
392 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
393 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
394 case 'y': case 'z':
395 continue;
396 default:
397 phase1_ungetc (c);
398 break;
399 }
400 break;
401 default:
402 phase1_ungetc (c);
403 break;
404 }
405 break;
406 }
407 if (bufpos >= bufmax)
408 {
409 bufmax = 2 * bufmax + 10;
410 buffer = xrealloc (buffer, bufmax);
411 }
412 buffer[bufpos] = '\0';
413 tp->string = xstrdup (buffer);
414 tp->type = token_type_symbol;
415 return;
416
417 case '#':
418 /* Uniquification operator. */
419 tp->type = token_type_uniq;
420 return;
421
422 case '$':
423 c = phase1_getc ();
424 tp->type = token_type_other;
425 return;
426
427 default:
428 tp->type = token_type_other;
429 return;
430 }
431 }
432 }
433
434 /* Supports only one pushback token. */
435 static void
phase2_unget(token_ty * tp)436 phase2_unget (token_ty *tp)
437 {
438 if (tp->type != token_type_eof)
439 {
440 if (phase2_pushback_length == SIZEOF (phase2_pushback))
441 abort ();
442 phase2_pushback[phase2_pushback_length++] = *tp;
443 }
444 }
445
446
447 /* 3. Combine "# string_literal" and "# symbol" to a single token. */
448
449 static token_ty phase3_pushback[1];
450 static int phase3_pushback_length;
451
452 static void
phase3_get(token_ty * tp)453 phase3_get (token_ty *tp)
454 {
455 if (phase3_pushback_length)
456 {
457 *tp = phase3_pushback[--phase3_pushback_length];
458 return;
459 }
460
461 phase2_get (tp);
462 if (tp->type == token_type_uniq)
463 {
464 token_ty token2;
465
466 phase2_get (&token2);
467 if (token2.type == token_type_symbol
468 || token2.type == token_type_string_literal)
469 {
470 tp->type = token_type_string_literal;
471 tp->string = token2.string;
472 }
473 else
474 phase2_unget (&token2);
475 }
476 }
477
478 /* Supports only one pushback token. */
479 static void
phase3_unget(token_ty * tp)480 phase3_unget (token_ty *tp)
481 {
482 if (tp->type != token_type_eof)
483 {
484 if (phase3_pushback_length == SIZEOF (phase3_pushback))
485 abort ();
486 phase3_pushback[phase3_pushback_length++] = *tp;
487 }
488 }
489
490
491 /* ========================= Extracting strings. ========================== */
492
493 /* The file is broken into tokens. Scan the token stream, looking for the
494 following patterns
495 NLS ? <string>
496 NLS at: <string>
497 NLS at: <string> plural: <string>
498 where <string> is one of
499 string_literal
500 # string_literal
501 # symbol
502 */
503
504 void
extract_smalltalk(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)505 extract_smalltalk (FILE *f,
506 const char *real_filename, const char *logical_filename,
507 flag_context_list_table_ty *flag_table,
508 msgdomain_list_ty *mdlp)
509 {
510 message_list_ty *mlp = mdlp->item[0]->messages;
511
512 fp = f;
513 real_file_name = real_filename;
514 logical_file_name = xstrdup (logical_filename);
515 line_number = 1;
516
517 last_comment_line = -1;
518 last_non_comment_line = -1;
519
520 phase2_pushback_length = 0;
521 phase3_pushback_length = 0;
522
523 /* Eat tokens until eof is seen. */
524 {
525 /* 0 when no "NLS" has been seen.
526 1 after "NLS".
527 2 after "NLS ?".
528 3 after "NLS at:".
529 4 after "NLS at: <string>".
530 5 after "NLS at: <string> plural:". */
531 int state;
532 /* Remember the message containing the msgid, for msgid_plural.
533 Non-NULL in states 4, 5. */
534 message_ty *plural_mp = NULL;
535
536 /* Start state is 0. */
537 state = 0;
538
539 for (;;)
540 {
541 token_ty token;
542
543 phase3_get (&token);
544
545 switch (token.type)
546 {
547 case token_type_symbol:
548 state = (strcmp (token.string, "NLS") == 0 ? 1 :
549 strcmp (token.string, "?") == 0 && state == 1 ? 2 :
550 strcmp (token.string, "at:") == 0 && state == 1 ? 3 :
551 strcmp (token.string, "plural:") == 0 && state == 4 ? 5 :
552 0);
553 free (token.string);
554 break;
555
556 case token_type_string_literal:
557 if (state == 2)
558 {
559 lex_pos_ty pos;
560 pos.file_name = logical_file_name;
561 pos.line_number = token.line_number;
562 remember_a_message (mlp, NULL, token.string, false, false,
563 null_context, &pos, NULL, savable_comment,
564 false);
565 state = 0;
566 break;
567 }
568 if (state == 3)
569 {
570 lex_pos_ty pos;
571 token_ty token2;
572
573 pos.file_name = logical_file_name;
574 pos.line_number = token.line_number;
575
576 phase3_get (&token2);
577
578 plural_mp =
579 remember_a_message (mlp, NULL, token.string, false,
580 token2.type == token_type_symbol
581 && strcmp (token.string, "plural:") == 0,
582 null_context, &pos,
583 NULL, savable_comment, false);
584
585 phase3_unget (&token2);
586
587 state = 4;
588 break;
589 }
590 if (state == 5)
591 {
592 lex_pos_ty pos;
593 pos.file_name = logical_file_name;
594 pos.line_number = token.line_number;
595 if (plural_mp != NULL)
596 remember_a_message_plural (plural_mp, token.string, false,
597 null_context, &pos,
598 savable_comment, false);
599 state = 0;
600 break;
601 }
602 state = 0;
603 free (token.string);
604 break;
605
606 case token_type_uniq:
607 case token_type_other:
608 state = 0;
609 break;
610
611 case token_type_eof:
612 break;
613
614 default:
615 abort ();
616 }
617
618 if (token.type == token_type_eof)
619 break;
620 }
621 }
622
623 /* Close scanner. */
624 fp = NULL;
625 real_file_name = NULL;
626 logical_file_name = NULL;
627 line_number = 0;
628 }
629