1 /* GNU gettext - internationalization aids
2 Copyright (C) 1995-2009, 2011, 2019 Free Software Foundation, Inc.
3
4 This file was written by Peter Miller <millerp@canb.auug.org.au>.
5 Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <https://www.gnu.org/licenses/>. */
19
20
21 #ifdef HAVE_CONFIG_H
22 # include "config.h"
23 #endif
24
25 /* Specification. */
26 #include "po-lex.h"
27
28 #include <errno.h>
29 #include <limits.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <stdarg.h>
34
35 #if HAVE_ICONV
36 # include <iconv.h>
37 #endif
38
39 #include "c-ctype.h"
40 #include "uniwidth.h"
41 #include "gettext.h"
42 #include "po-charset.h"
43 #include "xalloc.h"
44 #include "error.h"
45 #include "error-progname.h"
46 #include "xvasprintf.h"
47 #include "po-error.h"
48 #include "po-xerror.h"
49 #include "pos.h"
50 #include "message.h"
51 #include "str-list.h"
52 #include "po-gram-gen2.h"
53
54 #define _(str) gettext(str)
55
56 #if HAVE_ICONV
57 # include "unistr.h"
58 #endif
59
60 #if HAVE_DECL_GETC_UNLOCKED
61 # undef getc
62 # define getc getc_unlocked
63 #endif
64
65
/* Current position within the PO file.
   gram_pos carries the file name and line number used in diagnostics;
   gram_pos_column is the 0-based column on the current line (diagnostics
   report gram_pos_column + 1).  Both are maintained by the lexer while it
   reads characters.  */
lex_pos_ty gram_pos;
int gram_pos_column;
69
70
71 /* Error handling during the parsing of a PO file.
72 These functions can access gram_pos and gram_pos_column. */
73
74 /* VARARGS1 */
75 void
po_gram_error(const char * fmt,...)76 po_gram_error (const char *fmt, ...)
77 {
78 va_list ap;
79 char *buffer;
80
81 va_start (ap, fmt);
82 if (vasprintf (&buffer, fmt, ap) < 0)
83 error (EXIT_FAILURE, 0, _("memory exhausted"));
84 va_end (ap);
85 po_xerror (PO_SEVERITY_ERROR, NULL, gram_pos.file_name, gram_pos.line_number,
86 gram_pos_column + 1, false, buffer);
87 free (buffer);
88
89 if (error_message_count >= gram_max_allowed_errors)
90 po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
91 }
92
93 /* VARARGS2 */
94 void
po_gram_error_at_line(const lex_pos_ty * pp,const char * fmt,...)95 po_gram_error_at_line (const lex_pos_ty *pp, const char *fmt, ...)
96 {
97 va_list ap;
98 char *buffer;
99
100 va_start (ap, fmt);
101 if (vasprintf (&buffer, fmt, ap) < 0)
102 error (EXIT_FAILURE, 0, _("memory exhausted"));
103 va_end (ap);
104 po_xerror (PO_SEVERITY_ERROR, NULL, pp->file_name, pp->line_number,
105 (size_t)(-1), false, buffer);
106 free (buffer);
107
108 if (error_message_count >= gram_max_allowed_errors)
109 po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
110 }
111
112
113 /* The lowest level of PO file parsing converts bytes to multibyte characters.
114 This is needed
115 1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first
116 translation phase maps bytes to characters.
117 2. to keep track of the current column, for the sake of precise error
118 location. Emacs compile.el interprets the column in error messages
119 by default as a screen column number, not as character number.
120 3. to avoid skipping backslash-newline in the midst of a multibyte
121 character. If XY is a multibyte character, X \ newline Y is invalid.
122 */
123
/* Multibyte character data type.  */
/* Note this depends on po_lex_charset and po_lex_iconv, which get set
   while the file is being parsed.  */

/* Capacity, in bytes, of the buffer holding the bytes of one character.  */
#define MBCHAR_BUF_SIZE 24

/* One character of the input: its raw bytes and, when HAVE_ICONV is set
   and the conversion succeeded, its Unicode code point.  */
struct mbchar
{
  size_t bytes;         /* number of bytes of current character, > 0
                           (0 is used only for the EOF marker) */
#if HAVE_ICONV
  bool uc_valid;        /* true if uc is a valid Unicode character */
  ucs4_t uc;            /* if uc_valid: the current character */
#endif
  char buf[MBCHAR_BUF_SIZE]; /* room for the bytes */
};

/* We want to pass multibyte characters by reference automatically,
   therefore we use an array type.  */
typedef struct mbchar mbchar_t[1];
143
/* Copy the first N bytes of SRC to DST, one byte at a time in ascending
   address order.  Meant for very short copies (typically n <= 1).
   Does nothing when N is 0.  */
static inline void
memcpy_small (void *dst, const void *src, size_t n)
{
  char *out = (char *) dst;
  const char *in = (const char *) src;
  size_t i;

  for (i = 0; i < n; i++)
    out[i] = in[i];
}
158
159 /* EOF (not a real character) is represented with bytes = 0 and
160 uc_valid = false. */
161 static inline bool
mb_iseof(const mbchar_t mbc)162 mb_iseof (const mbchar_t mbc)
163 {
164 return (mbc->bytes == 0);
165 }
166
167 /* Access the current character. */
168 static inline const char *
mb_ptr(const mbchar_t mbc)169 mb_ptr (const mbchar_t mbc)
170 {
171 return mbc->buf;
172 }
173 static inline size_t
mb_len(const mbchar_t mbc)174 mb_len (const mbchar_t mbc)
175 {
176 return mbc->bytes;
177 }
178
179 /* Comparison of characters. */
180
181 static inline bool
mb_iseq(const mbchar_t mbc,char sc)182 mb_iseq (const mbchar_t mbc, char sc)
183 {
184 /* Note: It is wrong to compare only mbc->uc, because when the encoding is
185 SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we
186 want to treat it as an escape character, although it looks like a Yen
187 sign. */
188 #if HAVE_ICONV && 0
189 if (mbc->uc_valid)
190 return (mbc->uc == sc); /* wrong! */
191 else
192 #endif
193 return (mbc->bytes == 1 && mbc->buf[0] == sc);
194 }
195
196 static inline bool
mb_isnul(const mbchar_t mbc)197 mb_isnul (const mbchar_t mbc)
198 {
199 #if HAVE_ICONV
200 if (mbc->uc_valid)
201 return (mbc->uc == 0);
202 else
203 #endif
204 return (mbc->bytes == 1 && mbc->buf[0] == 0);
205 }
206
207 static inline int
mb_cmp(const mbchar_t mbc1,const mbchar_t mbc2)208 mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2)
209 {
210 #if HAVE_ICONV
211 if (mbc1->uc_valid && mbc2->uc_valid)
212 return (int) mbc1->uc - (int) mbc2->uc;
213 else
214 #endif
215 return (mbc1->bytes == mbc2->bytes
216 ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes)
217 : mbc1->bytes < mbc2->bytes
218 ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1)
219 : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1));
220 }
221
222 static inline bool
mb_equal(const mbchar_t mbc1,const mbchar_t mbc2)223 mb_equal (const mbchar_t mbc1, const mbchar_t mbc2)
224 {
225 #if HAVE_ICONV
226 if (mbc1->uc_valid && mbc2->uc_valid)
227 return mbc1->uc == mbc2->uc;
228 else
229 #endif
230 return (mbc1->bytes == mbc2->bytes
231 && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0);
232 }
233
234 /* <ctype.h>, <wctype.h> classification. */
235
236 static inline bool
mb_isascii(const mbchar_t mbc)237 mb_isascii (const mbchar_t mbc)
238 {
239 #if HAVE_ICONV
240 if (mbc->uc_valid)
241 return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F);
242 else
243 #endif
244 return (mbc->bytes == 1
245 #if CHAR_MIN < 0x00 /* to avoid gcc warning */
246 && mbc->buf[0] >= 0x00
247 #endif
248 #if CHAR_MAX > 0x7F /* to avoid gcc warning */
249 && mbc->buf[0] <= 0x7F
250 #endif
251 );
252 }
253
254 /* Extra <wchar.h> function. */
255
256 /* Unprintable characters appear as a small box of width 1. */
257 #define MB_UNPRINTABLE_WIDTH 1
258
259 static int
mb_width(const mbchar_t mbc)260 mb_width (const mbchar_t mbc)
261 {
262 #if HAVE_ICONV
263 if (mbc->uc_valid)
264 {
265 ucs4_t uc = mbc->uc;
266 const char *encoding =
267 (po_lex_iconv != (iconv_t)(-1) ? po_lex_charset : "");
268 int w = uc_width (uc, encoding);
269 /* For unprintable characters, arbitrarily return 0 for control
270 characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise. */
271 if (w >= 0)
272 return w;
273 if (uc >= 0x0000 && uc <= 0x001F)
274 {
275 if (uc == 0x0009)
276 return 8 - (gram_pos_column & 7);
277 return 0;
278 }
279 if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029))
280 return 0;
281 return MB_UNPRINTABLE_WIDTH;
282 }
283 else
284 #endif
285 {
286 if (mbc->bytes == 1)
287 {
288 if (
289 #if CHAR_MIN < 0x00 /* to avoid gcc warning */
290 mbc->buf[0] >= 0x00 &&
291 #endif
292 mbc->buf[0] <= 0x1F)
293 {
294 if (mbc->buf[0] == 0x09)
295 return 8 - (gram_pos_column & 7);
296 return 0;
297 }
298 if (mbc->buf[0] == 0x7F)
299 return 0;
300 }
301 return MB_UNPRINTABLE_WIDTH;
302 }
303 }
304
305 /* Output. */
306 static inline void
mb_putc(const mbchar_t mbc,FILE * stream)307 mb_putc (const mbchar_t mbc, FILE *stream)
308 {
309 fwrite (mbc->buf, 1, mbc->bytes, stream);
310 }
311
312 /* Assignment. */
313 static inline void
mb_setascii(mbchar_t mbc,char sc)314 mb_setascii (mbchar_t mbc, char sc)
315 {
316 mbc->bytes = 1;
317 #if HAVE_ICONV
318 mbc->uc_valid = 1;
319 mbc->uc = sc;
320 #endif
321 mbc->buf[0] = sc;
322 }
323
324 /* Copying a character. */
325 static inline void
mb_copy(mbchar_t new_mbc,const mbchar_t old_mbc)326 mb_copy (mbchar_t new_mbc, const mbchar_t old_mbc)
327 {
328 memcpy_small (&new_mbc->buf[0], &old_mbc->buf[0], old_mbc->bytes);
329 new_mbc->bytes = old_mbc->bytes;
330 #if HAVE_ICONV
331 if ((new_mbc->uc_valid = old_mbc->uc_valid))
332 new_mbc->uc = old_mbc->uc;
333 #endif
334 }
335
336
/* Multibyte character input.  */

/* Number of characters that can be pushed back.
   We need 1 for lex_getc, plus 1 for lex_ungetc.  */
#define NPUSHBACK 2

/* Data type of a multibyte character input stream.  */
struct mbfile
{
  FILE *fp;                     /* underlying byte stream, read with getc */
  bool eof_seen;                /* set once getc has returned EOF */
  int have_pushback;            /* number of valid entries in pushback[] */
  unsigned int bufcount;        /* number of bytes in buf[] that were read
                                   from fp but not yet returned */
  char buf[MBCHAR_BUF_SIZE];    /* bytes read ahead from fp */
  struct mbchar pushback[NPUSHBACK]; /* pushed-back characters; the one
                                        pushed last is returned first */
};

/* We want to pass multibyte streams by reference automatically,
   therefore we use an array type.  */
typedef struct mbfile mbfile_t[1];

/* Whether invalid multibyte sequences in the input shall be signalled
   or silently tolerated.  */
static bool signal_eilseq;
361
362 static inline void
mbfile_init(mbfile_t mbf,FILE * stream)363 mbfile_init (mbfile_t mbf, FILE *stream)
364 {
365 mbf->fp = stream;
366 mbf->eof_seen = false;
367 mbf->have_pushback = 0;
368 mbf->bufcount = 0;
369 }
370
371 /* Read the next multibyte character from mbf and put it into mbc.
372 If a read error occurs, errno is set and ferror (mbf->fp) becomes true. */
373 static void
mbfile_getc(mbchar_t mbc,mbfile_t mbf)374 mbfile_getc (mbchar_t mbc, mbfile_t mbf)
375 {
376 size_t bytes;
377
378 /* If EOF has already been seen, don't use getc. This matters if
379 mbf->fp is connected to an interactive tty. */
380 if (mbf->eof_seen)
381 goto eof;
382
383 /* Return character pushed back, if there is one. */
384 if (mbf->have_pushback > 0)
385 {
386 mbf->have_pushback--;
387 mb_copy (mbc, &mbf->pushback[mbf->have_pushback]);
388 return;
389 }
390
391 /* Before using iconv, we need at least one byte. */
392 if (mbf->bufcount == 0)
393 {
394 int c = getc (mbf->fp);
395 if (c == EOF)
396 {
397 mbf->eof_seen = true;
398 goto eof;
399 }
400 mbf->buf[0] = (unsigned char) c;
401 mbf->bufcount++;
402 }
403
404 #if HAVE_ICONV
405 if (po_lex_iconv != (iconv_t)(-1))
406 {
407 /* Use iconv on an increasing number of bytes. Read only as many
408 bytes from mbf->fp as needed. This is needed to give reasonable
409 interactive behaviour when mbf->fp is connected to an interactive
410 tty. */
411 for (;;)
412 {
413 unsigned char scratchbuf[64];
414 const char *inptr = &mbf->buf[0];
415 size_t insize = mbf->bufcount;
416 char *outptr = (char *) &scratchbuf[0];
417 size_t outsize = sizeof (scratchbuf);
418
419 size_t res = iconv (po_lex_iconv,
420 (ICONV_CONST char **) &inptr, &insize,
421 &outptr, &outsize);
422 /* We expect that a character has been produced if and only if
423 some input bytes have been consumed. */
424 if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf)))
425 abort ();
426 if (outsize == sizeof (scratchbuf))
427 {
428 /* No character has been produced. Must be an error. */
429 if (res != (size_t)(-1))
430 abort ();
431
432 if (errno == EILSEQ)
433 {
434 /* An invalid multibyte sequence was encountered. */
435 /* Return a single byte. */
436 if (signal_eilseq)
437 po_gram_error (_("invalid multibyte sequence"));
438 bytes = 1;
439 mbc->uc_valid = false;
440 break;
441 }
442 else if (errno == EINVAL)
443 {
444 /* An incomplete multibyte character. */
445 int c;
446
447 if (mbf->bufcount == MBCHAR_BUF_SIZE)
448 {
449 /* An overlong incomplete multibyte sequence was
450 encountered. */
451 /* Return a single byte. */
452 bytes = 1;
453 mbc->uc_valid = false;
454 break;
455 }
456
457 /* Read one more byte and retry iconv. */
458 c = getc (mbf->fp);
459 if (c == EOF)
460 {
461 mbf->eof_seen = true;
462 if (ferror (mbf->fp))
463 goto eof;
464 if (signal_eilseq)
465 po_gram_error (_("incomplete multibyte sequence at end of file"));
466 bytes = mbf->bufcount;
467 mbc->uc_valid = false;
468 break;
469 }
470 mbf->buf[mbf->bufcount++] = (unsigned char) c;
471 if (c == '\n')
472 {
473 if (signal_eilseq)
474 po_gram_error (_("incomplete multibyte sequence at end of line"));
475 bytes = mbf->bufcount - 1;
476 mbc->uc_valid = false;
477 break;
478 }
479 }
480 else
481 {
482 const char *errno_description = strerror (errno);
483 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
484 xasprintf ("%s: %s",
485 _("iconv failure"),
486 errno_description));
487 }
488 }
489 else
490 {
491 size_t outbytes = sizeof (scratchbuf) - outsize;
492 bytes = mbf->bufcount - insize;
493
494 /* We expect that one character has been produced. */
495 if (bytes == 0)
496 abort ();
497 if (outbytes == 0)
498 abort ();
499 /* Convert it from UTF-8 to UCS-4. */
500 if (u8_mbtoucr (&mbc->uc, scratchbuf, outbytes) < (int) outbytes)
501 {
502 /* scratchbuf contains an out-of-range Unicode character
503 (> 0x10ffff). */
504 if (signal_eilseq)
505 po_gram_error (_("invalid multibyte sequence"));
506 mbc->uc_valid = false;
507 break;
508 }
509 mbc->uc_valid = true;
510 break;
511 }
512 }
513 }
514 else
515 #endif
516 {
517 if (po_lex_weird_cjk
518 /* Special handling of encodings with CJK structure. */
519 && (unsigned char) mbf->buf[0] >= 0x80)
520 {
521 if (mbf->bufcount == 1)
522 {
523 /* Read one more byte. */
524 int c = getc (mbf->fp);
525 if (c == EOF)
526 {
527 if (ferror (mbf->fp))
528 {
529 mbf->eof_seen = true;
530 goto eof;
531 }
532 }
533 else
534 {
535 mbf->buf[1] = (unsigned char) c;
536 mbf->bufcount++;
537 }
538 }
539 if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
540 /* Return a double byte. */
541 bytes = 2;
542 else
543 /* Return a single byte. */
544 bytes = 1;
545 }
546 else
547 {
548 /* Return a single byte. */
549 bytes = 1;
550 }
551 #if HAVE_ICONV
552 mbc->uc_valid = false;
553 #endif
554 }
555
556 /* Return the multibyte sequence mbf->buf[0..bytes-1]. */
557 memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes);
558 mbc->bytes = bytes;
559
560 mbf->bufcount -= bytes;
561 if (mbf->bufcount > 0)
562 {
563 /* It's not worth calling memmove() for so few bytes. */
564 unsigned int count = mbf->bufcount;
565 char *p = &mbf->buf[0];
566
567 do
568 {
569 *p = *(p + bytes);
570 p++;
571 }
572 while (--count > 0);
573 }
574 return;
575
576 eof:
577 /* An mbchar_t with bytes == 0 is used to indicate EOF. */
578 mbc->bytes = 0;
579 #if HAVE_ICONV
580 mbc->uc_valid = false;
581 #endif
582 return;
583 }
584
585 static void
mbfile_ungetc(const mbchar_t mbc,mbfile_t mbf)586 mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf)
587 {
588 if (mbf->have_pushback >= NPUSHBACK)
589 abort ();
590 mb_copy (&mbf->pushback[mbf->have_pushback], mbc);
591 mbf->have_pushback++;
592 }
593
594
/* Lexer variables.  */

/* The input stream currently being tokenized.  */
static mbfile_t mbf;
/* Number of reported errors at which po_gram_error gives up.  */
unsigned int gram_max_allowed_errors = 20;
/* True while inside a "#~" (obsolete entry) line.  */
static bool po_lex_obsolete;
/* True while inside a "#|" (previous msgid) line.  */
static bool po_lex_previous;
/* Whether po_gram_lex returns comments as COMMENT tokens.  */
static bool pass_comments = false;
/* Set through po_lex_pass_obsolete_entries; not read by the code shown
   here.  */
bool pass_obsolete_entries = false;
603
604
605 /* Prepare lexical analysis. */
606 void
lex_start(FILE * fp,const char * real_filename,const char * logical_filename)607 lex_start (FILE *fp, const char *real_filename, const char *logical_filename)
608 {
609 /* Ignore the logical_filename, because PO file entries already have
610 their file names attached. But use real_filename for error messages. */
611 gram_pos.file_name = xstrdup (real_filename);
612
613 mbfile_init (mbf, fp);
614
615 gram_pos.line_number = 1;
616 gram_pos_column = 0;
617 signal_eilseq = true;
618 po_lex_obsolete = false;
619 po_lex_previous = false;
620 po_lex_charset_init ();
621 }
622
623 /* Terminate lexical analysis. */
624 void
lex_end()625 lex_end ()
626 {
627 mbf->fp = NULL;
628 gram_pos.file_name = NULL;
629 gram_pos.line_number = 0;
630 gram_pos_column = 0;
631 signal_eilseq = false;
632 po_lex_obsolete = false;
633 po_lex_previous = false;
634 po_lex_charset_close ();
635 }
636
637
638 /* Read a single character, dealing with backslash-newline.
639 Also keep track of the current line number and column number. */
640 static void
lex_getc(mbchar_t mbc)641 lex_getc (mbchar_t mbc)
642 {
643 for (;;)
644 {
645 mbfile_getc (mbc, mbf);
646
647 if (mb_iseof (mbc))
648 {
649 if (ferror (mbf->fp))
650 bomb:
651 {
652 const char *errno_description = strerror (errno);
653 po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
654 xasprintf ("%s: %s",
655 xasprintf (_("error while reading \"%s\""),
656 gram_pos.file_name),
657 errno_description));
658 }
659 break;
660 }
661
662 if (mb_iseq (mbc, '\n'))
663 {
664 gram_pos.line_number++;
665 gram_pos_column = 0;
666 break;
667 }
668
669 gram_pos_column += mb_width (mbc);
670
671 if (mb_iseq (mbc, '\\'))
672 {
673 mbchar_t mbc2;
674
675 mbfile_getc (mbc2, mbf);
676
677 if (mb_iseof (mbc2))
678 {
679 if (ferror (mbf->fp))
680 goto bomb;
681 break;
682 }
683
684 if (!mb_iseq (mbc2, '\n'))
685 {
686 mbfile_ungetc (mbc2, mbf);
687 break;
688 }
689
690 gram_pos.line_number++;
691 gram_pos_column = 0;
692 }
693 else
694 break;
695 }
696 }
697
698
699 static void
lex_ungetc(const mbchar_t mbc)700 lex_ungetc (const mbchar_t mbc)
701 {
702 if (!mb_iseof (mbc))
703 {
704 if (mb_iseq (mbc, '\n'))
705 /* Decrement the line number, but don't care about the column. */
706 gram_pos.line_number--;
707 else
708 /* Decrement the column number. Also works well enough for tabs. */
709 gram_pos_column -= mb_width (mbc);
710
711 mbfile_ungetc (mbc, mbf);
712 }
713 }
714
715
716 static int
keyword_p(const char * s)717 keyword_p (const char *s)
718 {
719 if (!po_lex_previous)
720 {
721 if (!strcmp (s, "domain"))
722 return DOMAIN;
723 if (!strcmp (s, "msgid"))
724 return MSGID;
725 if (!strcmp (s, "msgid_plural"))
726 return MSGID_PLURAL;
727 if (!strcmp (s, "msgstr"))
728 return MSGSTR;
729 if (!strcmp (s, "msgctxt"))
730 return MSGCTXT;
731 }
732 else
733 {
734 /* Inside a "#|" context, the keywords have a different meaning. */
735 if (!strcmp (s, "msgid"))
736 return PREV_MSGID;
737 if (!strcmp (s, "msgid_plural"))
738 return PREV_MSGID_PLURAL;
739 if (!strcmp (s, "msgctxt"))
740 return PREV_MSGCTXT;
741 }
742 po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s);
743 return NAME;
744 }
745
746
747 static int
control_sequence()748 control_sequence ()
749 {
750 mbchar_t mbc;
751 int val;
752 int max;
753
754 lex_getc (mbc);
755 if (mb_len (mbc) == 1)
756 switch (mb_ptr (mbc) [0])
757 {
758 case 'n':
759 return '\n';
760
761 case 't':
762 return '\t';
763
764 case 'b':
765 return '\b';
766
767 case 'r':
768 return '\r';
769
770 case 'f':
771 return '\f';
772
773 case 'v':
774 return '\v';
775
776 case 'a':
777 return '\a';
778
779 case '\\':
780 case '"':
781 return mb_ptr (mbc) [0];
782
783 case '0': case '1': case '2': case '3':
784 case '4': case '5': case '6': case '7':
785 val = 0;
786 max = 0;
787 for (;;)
788 {
789 char c = mb_ptr (mbc) [0];
790 /* Warning: not portable, can't depend on '0'..'7' ordering. */
791 val = val * 8 + (c - '0');
792 if (++max == 3)
793 break;
794 lex_getc (mbc);
795 if (mb_len (mbc) == 1)
796 switch (mb_ptr (mbc) [0])
797 {
798 case '0': case '1': case '2': case '3':
799 case '4': case '5': case '6': case '7':
800 continue;
801
802 default:
803 break;
804 }
805 lex_ungetc (mbc);
806 break;
807 }
808 return val;
809
810 case 'x':
811 lex_getc (mbc);
812 if (mb_iseof (mbc) || mb_len (mbc) != 1
813 || !c_isxdigit (mb_ptr (mbc) [0]))
814 break;
815
816 val = 0;
817 for (;;)
818 {
819 char c = mb_ptr (mbc) [0];
820 val *= 16;
821 if (c_isdigit (c))
822 /* Warning: not portable, can't depend on '0'..'9' ordering */
823 val += c - '0';
824 else if (c_isupper (c))
825 /* Warning: not portable, can't depend on 'A'..'F' ordering */
826 val += c - 'A' + 10;
827 else
828 /* Warning: not portable, can't depend on 'a'..'f' ordering */
829 val += c - 'a' + 10;
830
831 lex_getc (mbc);
832 if (mb_len (mbc) == 1)
833 switch (mb_ptr (mbc) [0])
834 {
835 case '0': case '1': case '2': case '3': case '4':
836 case '5': case '6': case '7': case '8': case '9':
837 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
838 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
839 continue;
840
841 default:
842 break;
843 }
844 lex_ungetc (mbc);
845 break;
846 }
847 return val;
848
849 /* FIXME: \u and \U are not handled. */
850 }
851 lex_ungetc (mbc);
852 po_gram_error (_("invalid control sequence"));
853 return ' ';
854 }
855
856
857 /* Return the next token in the PO file. The return codes are defined
858 in "po-gram-gen2.h". Associated data is put in 'po_gram_lval'. */
859 int
po_gram_lex()860 po_gram_lex ()
861 {
862 static char *buf;
863 static size_t bufmax;
864 mbchar_t mbc;
865 size_t bufpos;
866
867 for (;;)
868 {
869 lex_getc (mbc);
870
871 if (mb_iseof (mbc))
872 /* Yacc want this for end of file. */
873 return 0;
874
875 if (mb_len (mbc) == 1)
876 switch (mb_ptr (mbc) [0])
877 {
878 case '\n':
879 po_lex_obsolete = false;
880 po_lex_previous = false;
881 /* Ignore whitespace, not relevant for the grammar. */
882 break;
883
884 case ' ':
885 case '\t':
886 case '\r':
887 case '\f':
888 case '\v':
889 /* Ignore whitespace, not relevant for the grammar. */
890 break;
891
892 case '#':
893 lex_getc (mbc);
894 if (mb_iseq (mbc, '~'))
895 /* A pseudo-comment beginning with #~ is found. This is
896 not a comment. It is the format for obsolete entries.
897 We simply discard the "#~" prefix. The following
898 characters are expected to be well formed. */
899 {
900 po_lex_obsolete = true;
901 /* A pseudo-comment beginning with #~| denotes a previous
902 untranslated string in an obsolete entry. This does not
903 make much sense semantically, and is implemented here
904 for completeness only. */
905 lex_getc (mbc);
906 if (mb_iseq (mbc, '|'))
907 po_lex_previous = true;
908 else
909 lex_ungetc (mbc);
910 break;
911 }
912 if (mb_iseq (mbc, '|'))
913 /* A pseudo-comment beginning with #| is found. This is
914 the previous untranslated string. We discard the "#|"
915 prefix, but change the keywords and string returns
916 accordingly. */
917 {
918 po_lex_previous = true;
919 break;
920 }
921
922 /* Accumulate comments into a buffer. If we have been asked
923 to pass comments, generate a COMMENT token, otherwise
924 discard it. */
925 signal_eilseq = false;
926 if (pass_comments)
927 {
928 bufpos = 0;
929 for (;;)
930 {
931 while (bufpos + mb_len (mbc) >= bufmax)
932 {
933 bufmax += 100;
934 buf = xrealloc (buf, bufmax);
935 }
936 if (mb_iseof (mbc) || mb_iseq (mbc, '\n'))
937 break;
938
939 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
940 bufpos += mb_len (mbc);
941
942 lex_getc (mbc);
943 }
944 buf[bufpos] = '\0';
945
946 po_gram_lval.string.string = buf;
947 po_gram_lval.string.pos = gram_pos;
948 po_gram_lval.string.obsolete = po_lex_obsolete;
949 po_lex_obsolete = false;
950 signal_eilseq = true;
951 return COMMENT;
952 }
953 else
954 {
955 /* We do this in separate loop because collecting large
956 comments while they get not passed to the upper layers
957 is not very efficient. */
958 while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n'))
959 lex_getc (mbc);
960 po_lex_obsolete = false;
961 signal_eilseq = true;
962 }
963 break;
964
965 case '"':
966 /* Accumulate a string. */
967 bufpos = 0;
968 for (;;)
969 {
970 lex_getc (mbc);
971 while (bufpos + mb_len (mbc) >= bufmax)
972 {
973 bufmax += 100;
974 buf = xrealloc (buf, bufmax);
975 }
976 if (mb_iseof (mbc))
977 {
978 po_gram_error_at_line (&gram_pos,
979 _("end-of-file within string"));
980 break;
981 }
982 if (mb_iseq (mbc, '\n'))
983 {
984 po_gram_error_at_line (&gram_pos,
985 _("end-of-line within string"));
986 break;
987 }
988 if (mb_iseq (mbc, '"'))
989 break;
990 if (mb_iseq (mbc, '\\'))
991 {
992 buf[bufpos++] = control_sequence ();
993 continue;
994 }
995
996 /* Add mbc to the accumulator. */
997 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
998 bufpos += mb_len (mbc);
999 }
1000 buf[bufpos] = '\0';
1001
1002 /* Strings cannot contain the msgctxt separator, because it cannot
1003 be faithfully represented in the msgid of a .mo file. */
1004 if (strchr (buf, MSGCTXT_SEPARATOR) != NULL)
1005 po_gram_error_at_line (&gram_pos,
1006 _("context separator <EOT> within string"));
1007
1008 /* FIXME: Treatment of embedded \000 chars is incorrect. */
1009 po_gram_lval.string.string = xstrdup (buf);
1010 po_gram_lval.string.pos = gram_pos;
1011 po_gram_lval.string.obsolete = po_lex_obsolete;
1012 return (po_lex_previous ? PREV_STRING : STRING);
1013
1014 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1015 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1016 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1017 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1018 case 'y': case 'z':
1019 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1020 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1021 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1022 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1023 case 'Y': case 'Z':
1024 case '_': case '$':
1025 bufpos = 0;
1026 for (;;)
1027 {
1028 char c = mb_ptr (mbc) [0];
1029 if (bufpos + 1 >= bufmax)
1030 {
1031 bufmax += 100;
1032 buf = xrealloc (buf, bufmax);
1033 }
1034 buf[bufpos++] = c;
1035 lex_getc (mbc);
1036 if (mb_len (mbc) == 1)
1037 switch (mb_ptr (mbc) [0])
1038 {
1039 default:
1040 break;
1041 case 'a': case 'b': case 'c': case 'd': case 'e':
1042 case 'f': case 'g': case 'h': case 'i': case 'j':
1043 case 'k': case 'l': case 'm': case 'n': case 'o':
1044 case 'p': case 'q': case 'r': case 's': case 't':
1045 case 'u': case 'v': case 'w': case 'x': case 'y':
1046 case 'z':
1047 case 'A': case 'B': case 'C': case 'D': case 'E':
1048 case 'F': case 'G': case 'H': case 'I': case 'J':
1049 case 'K': case 'L': case 'M': case 'N': case 'O':
1050 case 'P': case 'Q': case 'R': case 'S': case 'T':
1051 case 'U': case 'V': case 'W': case 'X': case 'Y':
1052 case 'Z':
1053 case '_': case '$':
1054 case '0': case '1': case '2': case '3': case '4':
1055 case '5': case '6': case '7': case '8': case '9':
1056 continue;
1057 }
1058 break;
1059 }
1060 lex_ungetc (mbc);
1061
1062 buf[bufpos] = '\0';
1063
1064 {
1065 int k = keyword_p (buf);
1066 if (k == NAME)
1067 {
1068 po_gram_lval.string.string = xstrdup (buf);
1069 po_gram_lval.string.pos = gram_pos;
1070 po_gram_lval.string.obsolete = po_lex_obsolete;
1071 }
1072 else
1073 {
1074 po_gram_lval.pos.pos = gram_pos;
1075 po_gram_lval.pos.obsolete = po_lex_obsolete;
1076 }
1077 return k;
1078 }
1079
1080 case '0': case '1': case '2': case '3': case '4':
1081 case '5': case '6': case '7': case '8': case '9':
1082 bufpos = 0;
1083 for (;;)
1084 {
1085 char c = mb_ptr (mbc) [0];
1086 if (bufpos + 1 >= bufmax)
1087 {
1088 bufmax += 100;
1089 buf = xrealloc (buf, bufmax + 1);
1090 }
1091 buf[bufpos++] = c;
1092 lex_getc (mbc);
1093 if (mb_len (mbc) == 1)
1094 switch (mb_ptr (mbc) [0])
1095 {
1096 default:
1097 break;
1098
1099 case '0': case '1': case '2': case '3': case '4':
1100 case '5': case '6': case '7': case '8': case '9':
1101 continue;
1102 }
1103 break;
1104 }
1105 lex_ungetc (mbc);
1106
1107 buf[bufpos] = '\0';
1108
1109 po_gram_lval.number.number = atol (buf);
1110 po_gram_lval.number.pos = gram_pos;
1111 po_gram_lval.number.obsolete = po_lex_obsolete;
1112 return NUMBER;
1113
1114 case '[':
1115 po_gram_lval.pos.pos = gram_pos;
1116 po_gram_lval.pos.obsolete = po_lex_obsolete;
1117 return '[';
1118
1119 case ']':
1120 po_gram_lval.pos.pos = gram_pos;
1121 po_gram_lval.pos.obsolete = po_lex_obsolete;
1122 return ']';
1123
1124 default:
1125 /* This will cause a syntax error. */
1126 return JUNK;
1127 }
1128 else
1129 /* This will cause a syntax error. */
1130 return JUNK;
1131 }
1132 }
1133
1134
1135 /* po_gram_lex() can return comments as COMMENT. Switch this on or off. */
1136 void
po_lex_pass_comments(bool flag)1137 po_lex_pass_comments (bool flag)
1138 {
1139 pass_comments = flag;
1140 }
1141
1142
1143 /* po_gram_lex() can return obsolete entries as if they were normal entries.
1144 Switch this on or off. */
1145 void
po_lex_pass_obsolete_entries(bool flag)1146 po_lex_pass_obsolete_entries (bool flag)
1147 {
1148 pass_obsolete_entries = flag;
1149 }
1150