• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* GNU gettext - internationalization aids
2    Copyright (C) 1995-2009, 2011, 2019 Free Software Foundation, Inc.
3 
4    This file was written by Peter Miller <millerp@canb.auug.org.au>.
5    Multibyte character handling by Bruno Haible <haible@clisp.cons.org>.
6 
7    This program is free software: you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11 
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16 
17    You should have received a copy of the GNU General Public License
18    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
19 
20 
21 #ifdef HAVE_CONFIG_H
22 # include "config.h"
23 #endif
24 
25 /* Specification.  */
26 #include "po-lex.h"
27 
28 #include <errno.h>
29 #include <limits.h>
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <stdarg.h>
34 
35 #if HAVE_ICONV
36 # include <iconv.h>
37 #endif
38 
39 #include "c-ctype.h"
40 #include "uniwidth.h"
41 #include "gettext.h"
42 #include "po-charset.h"
43 #include "xalloc.h"
44 #include "error.h"
45 #include "error-progname.h"
46 #include "xvasprintf.h"
47 #include "po-error.h"
48 #include "po-xerror.h"
49 #include "pos.h"
50 #include "message.h"
51 #include "str-list.h"
52 #include "po-gram-gen2.h"
53 
54 #define _(str) gettext(str)
55 
56 #if HAVE_ICONV
57 # include "unistr.h"
58 #endif
59 
60 #if HAVE_DECL_GETC_UNLOCKED
61 # undef getc
62 # define getc getc_unlocked
63 #endif
64 
65 
66 /* Current position within the PO file.  */
67 lex_pos_ty gram_pos;
68 int gram_pos_column;
69 
70 
71 /* Error handling during the parsing of a PO file.
72    These functions can access gram_pos and gram_pos_column.  */
73 
74 /* VARARGS1 */
75 void
po_gram_error(const char * fmt,...)76 po_gram_error (const char *fmt, ...)
77 {
78   va_list ap;
79   char *buffer;
80 
81   va_start (ap, fmt);
82   if (vasprintf (&buffer, fmt, ap) < 0)
83     error (EXIT_FAILURE, 0, _("memory exhausted"));
84   va_end (ap);
85   po_xerror (PO_SEVERITY_ERROR, NULL, gram_pos.file_name, gram_pos.line_number,
86              gram_pos_column + 1, false, buffer);
87   free (buffer);
88 
89   if (error_message_count >= gram_max_allowed_errors)
90     po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
91 }
92 
93 /* VARARGS2 */
94 void
po_gram_error_at_line(const lex_pos_ty * pp,const char * fmt,...)95 po_gram_error_at_line (const lex_pos_ty *pp, const char *fmt, ...)
96 {
97   va_list ap;
98   char *buffer;
99 
100   va_start (ap, fmt);
101   if (vasprintf (&buffer, fmt, ap) < 0)
102     error (EXIT_FAILURE, 0, _("memory exhausted"));
103   va_end (ap);
104   po_xerror (PO_SEVERITY_ERROR, NULL, pp->file_name, pp->line_number,
105              (size_t)(-1), false, buffer);
106   free (buffer);
107 
108   if (error_message_count >= gram_max_allowed_errors)
109     po_error (EXIT_FAILURE, 0, _("too many errors, aborting"));
110 }
111 
112 
113 /* The lowest level of PO file parsing converts bytes to multibyte characters.
114    This is needed
115    1. for C compatibility: ISO C 99 section 5.1.1.2 says that the first
116       translation phase maps bytes to characters.
117    2. to keep track of the current column, for the sake of precise error
118       location. Emacs compile.el interprets the column in error messages
119       by default as a screen column number, not as character number.
120    3. to avoid skipping backslash-newline in the midst of a multibyte
121       character. If XY is a multibyte character,  X \ newline Y  is invalid.
122  */
123 
124 /* Multibyte character data type.  */
125 /* Note this depends on po_lex_charset and po_lex_iconv, which get set
126    while the file is being parsed.  */
127 
/* Capacity, in bytes, of the byte buffer of one multibyte character.  */
#define MBCHAR_BUF_SIZE 24

/* One multibyte character as read from the PO file: its raw bytes and,
   when iconv support is compiled in, its Unicode value if known.  */
struct mbchar
{
  size_t bytes;         /* number of bytes of current character, > 0
                           (0 is used to represent EOF) */
#if HAVE_ICONV
  bool uc_valid;        /* true if uc is a valid Unicode character */
  ucs4_t uc;            /* if uc_valid: the current character */
#endif
  char buf[MBCHAR_BUF_SIZE]; /* room for the bytes */
};

/* We want to pass multibyte characters by reference automatically,
   therefore we use an array type.  */
typedef struct mbchar mbchar_t[1];
143 
/* A version of memcpy optimized for the case n <= 1.
   Copies the first N bytes of SRC to DST, lowest address first.
   Does nothing when N is 0.  */
static inline void
memcpy_small (void *dst, const void *src, size_t n)
{
  char *out = (char *) dst;
  const char *in = (const char *) src;

  for (; n > 0; n--)
    *out++ = *in++;
}
158 
159 /* EOF (not a real character) is represented with bytes = 0 and
160    uc_valid = false.  */
161 static inline bool
mb_iseof(const mbchar_t mbc)162 mb_iseof (const mbchar_t mbc)
163 {
164   return (mbc->bytes == 0);
165 }
166 
167 /* Access the current character.  */
168 static inline const char *
mb_ptr(const mbchar_t mbc)169 mb_ptr (const mbchar_t mbc)
170 {
171   return mbc->buf;
172 }
173 static inline size_t
mb_len(const mbchar_t mbc)174 mb_len (const mbchar_t mbc)
175 {
176   return mbc->bytes;
177 }
178 
179 /* Comparison of characters.  */
180 
181 static inline bool
mb_iseq(const mbchar_t mbc,char sc)182 mb_iseq (const mbchar_t mbc, char sc)
183 {
184   /* Note: It is wrong to compare only mbc->uc, because when the encoding is
185      SHIFT_JIS, mbc->buf[0] == '\\' corresponds to mbc->uc == 0x00A5, but we
186      want to treat it as an escape character, although it looks like a Yen
187      sign.  */
188 #if HAVE_ICONV && 0
189   if (mbc->uc_valid)
190     return (mbc->uc == sc); /* wrong! */
191   else
192 #endif
193     return (mbc->bytes == 1 && mbc->buf[0] == sc);
194 }
195 
196 static inline bool
mb_isnul(const mbchar_t mbc)197 mb_isnul (const mbchar_t mbc)
198 {
199 #if HAVE_ICONV
200   if (mbc->uc_valid)
201     return (mbc->uc == 0);
202   else
203 #endif
204     return (mbc->bytes == 1 && mbc->buf[0] == 0);
205 }
206 
207 static inline int
mb_cmp(const mbchar_t mbc1,const mbchar_t mbc2)208 mb_cmp (const mbchar_t mbc1, const mbchar_t mbc2)
209 {
210 #if HAVE_ICONV
211   if (mbc1->uc_valid && mbc2->uc_valid)
212     return (int) mbc1->uc - (int) mbc2->uc;
213   else
214 #endif
215     return (mbc1->bytes == mbc2->bytes
216             ? memcmp (mbc1->buf, mbc2->buf, mbc1->bytes)
217             : mbc1->bytes < mbc2->bytes
218               ? (memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) > 0 ? 1 : -1)
219               : (memcmp (mbc1->buf, mbc2->buf, mbc2->bytes) >= 0 ? 1 : -1));
220 }
221 
222 static inline bool
mb_equal(const mbchar_t mbc1,const mbchar_t mbc2)223 mb_equal (const mbchar_t mbc1, const mbchar_t mbc2)
224 {
225 #if HAVE_ICONV
226   if (mbc1->uc_valid && mbc2->uc_valid)
227     return mbc1->uc == mbc2->uc;
228   else
229 #endif
230     return (mbc1->bytes == mbc2->bytes
231             && memcmp (mbc1->buf, mbc2->buf, mbc1->bytes) == 0);
232 }
233 
234 /* <ctype.h>, <wctype.h> classification.  */
235 
236 static inline bool
mb_isascii(const mbchar_t mbc)237 mb_isascii (const mbchar_t mbc)
238 {
239 #if HAVE_ICONV
240   if (mbc->uc_valid)
241     return (mbc->uc >= 0x0000 && mbc->uc <= 0x007F);
242   else
243 #endif
244     return (mbc->bytes == 1
245 #if CHAR_MIN < 0x00 /* to avoid gcc warning */
246             && mbc->buf[0] >= 0x00
247 #endif
248 #if CHAR_MAX > 0x7F /* to avoid gcc warning */
249             && mbc->buf[0] <= 0x7F
250 #endif
251            );
252 }
253 
254 /* Extra <wchar.h> function.  */
255 
256 /* Unprintable characters appear as a small box of width 1.  */
257 #define MB_UNPRINTABLE_WIDTH 1
258 
259 static int
mb_width(const mbchar_t mbc)260 mb_width (const mbchar_t mbc)
261 {
262 #if HAVE_ICONV
263   if (mbc->uc_valid)
264     {
265       ucs4_t uc = mbc->uc;
266       const char *encoding =
267         (po_lex_iconv != (iconv_t)(-1) ? po_lex_charset : "");
268       int w = uc_width (uc, encoding);
269       /* For unprintable characters, arbitrarily return 0 for control
270          characters (except tab) and MB_UNPRINTABLE_WIDTH otherwise.  */
271       if (w >= 0)
272         return w;
273       if (uc >= 0x0000 && uc <= 0x001F)
274         {
275           if (uc == 0x0009)
276             return 8 - (gram_pos_column & 7);
277           return 0;
278         }
279       if ((uc >= 0x007F && uc <= 0x009F) || (uc >= 0x2028 && uc <= 0x2029))
280         return 0;
281       return MB_UNPRINTABLE_WIDTH;
282     }
283   else
284 #endif
285     {
286       if (mbc->bytes == 1)
287         {
288           if (
289 #if CHAR_MIN < 0x00 /* to avoid gcc warning */
290               mbc->buf[0] >= 0x00 &&
291 #endif
292               mbc->buf[0] <= 0x1F)
293             {
294               if (mbc->buf[0] == 0x09)
295                 return 8 - (gram_pos_column & 7);
296               return 0;
297             }
298           if (mbc->buf[0] == 0x7F)
299             return 0;
300         }
301       return MB_UNPRINTABLE_WIDTH;
302     }
303 }
304 
305 /* Output.  */
306 static inline void
mb_putc(const mbchar_t mbc,FILE * stream)307 mb_putc (const mbchar_t mbc, FILE *stream)
308 {
309   fwrite (mbc->buf, 1, mbc->bytes, stream);
310 }
311 
312 /* Assignment.  */
313 static inline void
mb_setascii(mbchar_t mbc,char sc)314 mb_setascii (mbchar_t mbc, char sc)
315 {
316   mbc->bytes = 1;
317 #if HAVE_ICONV
318   mbc->uc_valid = 1;
319   mbc->uc = sc;
320 #endif
321   mbc->buf[0] = sc;
322 }
323 
324 /* Copying a character.  */
325 static inline void
mb_copy(mbchar_t new_mbc,const mbchar_t old_mbc)326 mb_copy (mbchar_t new_mbc, const mbchar_t old_mbc)
327 {
328   memcpy_small (&new_mbc->buf[0], &old_mbc->buf[0], old_mbc->bytes);
329   new_mbc->bytes = old_mbc->bytes;
330 #if HAVE_ICONV
331   if ((new_mbc->uc_valid = old_mbc->uc_valid))
332     new_mbc->uc = old_mbc->uc;
333 #endif
334 }
335 
336 
337 /* Multibyte character input.  */
338 
339 /* Number of characters that can be pushed back.
340    We need 1 for lex_getc, plus 1 for lex_ungetc.  */
341 #define NPUSHBACK 2
342 
343 /* Data type of a multibyte character input stream.  */
344 struct mbfile
345 {
346   FILE *fp;
347   bool eof_seen;
348   int have_pushback;
349   unsigned int bufcount;
350   char buf[MBCHAR_BUF_SIZE];
351   struct mbchar pushback[NPUSHBACK];
352 };
353 
354 /* We want to pass multibyte streams by reference automatically,
355    therefore we use an array type.  */
356 typedef struct mbfile mbfile_t[1];
357 
358 /* Whether invalid multibyte sequences in the input shall be signalled
359    or silently tolerated.  */
360 static bool signal_eilseq;
361 
362 static inline void
mbfile_init(mbfile_t mbf,FILE * stream)363 mbfile_init (mbfile_t mbf, FILE *stream)
364 {
365   mbf->fp = stream;
366   mbf->eof_seen = false;
367   mbf->have_pushback = 0;
368   mbf->bufcount = 0;
369 }
370 
371 /* Read the next multibyte character from mbf and put it into mbc.
372    If a read error occurs, errno is set and ferror (mbf->fp) becomes true.  */
373 static void
mbfile_getc(mbchar_t mbc,mbfile_t mbf)374 mbfile_getc (mbchar_t mbc, mbfile_t mbf)
375 {
376   size_t bytes;
377 
378   /* If EOF has already been seen, don't use getc.  This matters if
379      mbf->fp is connected to an interactive tty.  */
380   if (mbf->eof_seen)
381     goto eof;
382 
383   /* Return character pushed back, if there is one.  */
384   if (mbf->have_pushback > 0)
385     {
386       mbf->have_pushback--;
387       mb_copy (mbc, &mbf->pushback[mbf->have_pushback]);
388       return;
389     }
390 
391   /* Before using iconv, we need at least one byte.  */
392   if (mbf->bufcount == 0)
393     {
394       int c = getc (mbf->fp);
395       if (c == EOF)
396         {
397           mbf->eof_seen = true;
398           goto eof;
399         }
400       mbf->buf[0] = (unsigned char) c;
401       mbf->bufcount++;
402     }
403 
404 #if HAVE_ICONV
405   if (po_lex_iconv != (iconv_t)(-1))
406     {
407       /* Use iconv on an increasing number of bytes.  Read only as many
408          bytes from mbf->fp as needed.  This is needed to give reasonable
409          interactive behaviour when mbf->fp is connected to an interactive
410          tty.  */
411       for (;;)
412         {
413           unsigned char scratchbuf[64];
414           const char *inptr = &mbf->buf[0];
415           size_t insize = mbf->bufcount;
416           char *outptr = (char *) &scratchbuf[0];
417           size_t outsize = sizeof (scratchbuf);
418 
419           size_t res = iconv (po_lex_iconv,
420                               (ICONV_CONST char **) &inptr, &insize,
421                               &outptr, &outsize);
422           /* We expect that a character has been produced if and only if
423              some input bytes have been consumed.  */
424           if ((insize < mbf->bufcount) != (outsize < sizeof (scratchbuf)))
425             abort ();
426           if (outsize == sizeof (scratchbuf))
427             {
428               /* No character has been produced.  Must be an error.  */
429               if (res != (size_t)(-1))
430                 abort ();
431 
432               if (errno == EILSEQ)
433                 {
434                   /* An invalid multibyte sequence was encountered.  */
435                   /* Return a single byte.  */
436                   if (signal_eilseq)
437                     po_gram_error (_("invalid multibyte sequence"));
438                   bytes = 1;
439                   mbc->uc_valid = false;
440                   break;
441                 }
442               else if (errno == EINVAL)
443                 {
444                   /* An incomplete multibyte character.  */
445                   int c;
446 
447                   if (mbf->bufcount == MBCHAR_BUF_SIZE)
448                     {
449                       /* An overlong incomplete multibyte sequence was
450                          encountered.  */
451                       /* Return a single byte.  */
452                       bytes = 1;
453                       mbc->uc_valid = false;
454                       break;
455                     }
456 
457                   /* Read one more byte and retry iconv.  */
458                   c = getc (mbf->fp);
459                   if (c == EOF)
460                     {
461                       mbf->eof_seen = true;
462                       if (ferror (mbf->fp))
463                         goto eof;
464                       if (signal_eilseq)
465                         po_gram_error (_("incomplete multibyte sequence at end of file"));
466                       bytes = mbf->bufcount;
467                       mbc->uc_valid = false;
468                       break;
469                     }
470                   mbf->buf[mbf->bufcount++] = (unsigned char) c;
471                   if (c == '\n')
472                     {
473                       if (signal_eilseq)
474                         po_gram_error (_("incomplete multibyte sequence at end of line"));
475                       bytes = mbf->bufcount - 1;
476                       mbc->uc_valid = false;
477                       break;
478                     }
479                 }
480               else
481                 {
482                   const char *errno_description = strerror (errno);
483                   po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
484                              xasprintf ("%s: %s",
485                                         _("iconv failure"),
486                                         errno_description));
487                 }
488             }
489           else
490             {
491               size_t outbytes = sizeof (scratchbuf) - outsize;
492               bytes = mbf->bufcount - insize;
493 
494               /* We expect that one character has been produced.  */
495               if (bytes == 0)
496                 abort ();
497               if (outbytes == 0)
498                 abort ();
499               /* Convert it from UTF-8 to UCS-4.  */
500               if (u8_mbtoucr (&mbc->uc, scratchbuf, outbytes) < (int) outbytes)
501                 {
502                   /* scratchbuf contains an out-of-range Unicode character
503                      (> 0x10ffff).  */
504                   if (signal_eilseq)
505                     po_gram_error (_("invalid multibyte sequence"));
506                   mbc->uc_valid = false;
507                   break;
508                 }
509               mbc->uc_valid = true;
510               break;
511             }
512         }
513     }
514   else
515 #endif
516     {
517       if (po_lex_weird_cjk
518           /* Special handling of encodings with CJK structure.  */
519           && (unsigned char) mbf->buf[0] >= 0x80)
520         {
521           if (mbf->bufcount == 1)
522             {
523               /* Read one more byte.  */
524               int c = getc (mbf->fp);
525               if (c == EOF)
526                 {
527                   if (ferror (mbf->fp))
528                     {
529                       mbf->eof_seen = true;
530                       goto eof;
531                     }
532                 }
533               else
534                 {
535                   mbf->buf[1] = (unsigned char) c;
536                   mbf->bufcount++;
537                 }
538             }
539           if (mbf->bufcount >= 2 && (unsigned char) mbf->buf[1] >= 0x30)
540             /* Return a double byte.  */
541             bytes = 2;
542           else
543             /* Return a single byte.  */
544             bytes = 1;
545         }
546       else
547         {
548           /* Return a single byte.  */
549           bytes = 1;
550         }
551 #if HAVE_ICONV
552       mbc->uc_valid = false;
553 #endif
554     }
555 
556   /* Return the multibyte sequence mbf->buf[0..bytes-1].  */
557   memcpy_small (&mbc->buf[0], &mbf->buf[0], bytes);
558   mbc->bytes = bytes;
559 
560   mbf->bufcount -= bytes;
561   if (mbf->bufcount > 0)
562     {
563       /* It's not worth calling memmove() for so few bytes.  */
564       unsigned int count = mbf->bufcount;
565       char *p = &mbf->buf[0];
566 
567       do
568         {
569           *p = *(p + bytes);
570           p++;
571         }
572       while (--count > 0);
573     }
574   return;
575 
576 eof:
577   /* An mbchar_t with bytes == 0 is used to indicate EOF.  */
578   mbc->bytes = 0;
579 #if HAVE_ICONV
580   mbc->uc_valid = false;
581 #endif
582   return;
583 }
584 
585 static void
mbfile_ungetc(const mbchar_t mbc,mbfile_t mbf)586 mbfile_ungetc (const mbchar_t mbc, mbfile_t mbf)
587 {
588   if (mbf->have_pushback >= NPUSHBACK)
589     abort ();
590   mb_copy (&mbf->pushback[mbf->have_pushback], mbc);
591   mbf->have_pushback++;
592 }
593 
594 
595 /* Lexer variables.  */
596 
597 static mbfile_t mbf;
598 unsigned int gram_max_allowed_errors = 20;
599 static bool po_lex_obsolete;
600 static bool po_lex_previous;
601 static bool pass_comments = false;
602 bool pass_obsolete_entries = false;
603 
604 
605 /* Prepare lexical analysis.  */
606 void
lex_start(FILE * fp,const char * real_filename,const char * logical_filename)607 lex_start (FILE *fp, const char *real_filename, const char *logical_filename)
608 {
609   /* Ignore the logical_filename, because PO file entries already have
610      their file names attached.  But use real_filename for error messages.  */
611   gram_pos.file_name = xstrdup (real_filename);
612 
613   mbfile_init (mbf, fp);
614 
615   gram_pos.line_number = 1;
616   gram_pos_column = 0;
617   signal_eilseq = true;
618   po_lex_obsolete = false;
619   po_lex_previous = false;
620   po_lex_charset_init ();
621 }
622 
623 /* Terminate lexical analysis.  */
624 void
lex_end()625 lex_end ()
626 {
627   mbf->fp = NULL;
628   gram_pos.file_name = NULL;
629   gram_pos.line_number = 0;
630   gram_pos_column = 0;
631   signal_eilseq = false;
632   po_lex_obsolete = false;
633   po_lex_previous = false;
634   po_lex_charset_close ();
635 }
636 
637 
638 /* Read a single character, dealing with backslash-newline.
639    Also keep track of the current line number and column number.  */
640 static void
lex_getc(mbchar_t mbc)641 lex_getc (mbchar_t mbc)
642 {
643   for (;;)
644     {
645       mbfile_getc (mbc, mbf);
646 
647       if (mb_iseof (mbc))
648         {
649           if (ferror (mbf->fp))
650            bomb:
651             {
652               const char *errno_description = strerror (errno);
653               po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
654                          xasprintf ("%s: %s",
655                                     xasprintf (_("error while reading \"%s\""),
656                                                gram_pos.file_name),
657                                     errno_description));
658             }
659           break;
660         }
661 
662       if (mb_iseq (mbc, '\n'))
663         {
664           gram_pos.line_number++;
665           gram_pos_column = 0;
666           break;
667         }
668 
669       gram_pos_column += mb_width (mbc);
670 
671       if (mb_iseq (mbc, '\\'))
672         {
673           mbchar_t mbc2;
674 
675           mbfile_getc (mbc2, mbf);
676 
677           if (mb_iseof (mbc2))
678             {
679               if (ferror (mbf->fp))
680                 goto bomb;
681               break;
682             }
683 
684           if (!mb_iseq (mbc2, '\n'))
685             {
686               mbfile_ungetc (mbc2, mbf);
687               break;
688             }
689 
690           gram_pos.line_number++;
691           gram_pos_column = 0;
692         }
693       else
694         break;
695     }
696 }
697 
698 
699 static void
lex_ungetc(const mbchar_t mbc)700 lex_ungetc (const mbchar_t mbc)
701 {
702   if (!mb_iseof (mbc))
703     {
704       if (mb_iseq (mbc, '\n'))
705         /* Decrement the line number, but don't care about the column.  */
706         gram_pos.line_number--;
707       else
708         /* Decrement the column number.  Also works well enough for tabs.  */
709         gram_pos_column -= mb_width (mbc);
710 
711       mbfile_ungetc (mbc, mbf);
712     }
713 }
714 
715 
716 static int
keyword_p(const char * s)717 keyword_p (const char *s)
718 {
719   if (!po_lex_previous)
720     {
721       if (!strcmp (s, "domain"))
722         return DOMAIN;
723       if (!strcmp (s, "msgid"))
724         return MSGID;
725       if (!strcmp (s, "msgid_plural"))
726         return MSGID_PLURAL;
727       if (!strcmp (s, "msgstr"))
728         return MSGSTR;
729       if (!strcmp (s, "msgctxt"))
730         return MSGCTXT;
731     }
732   else
733     {
734       /* Inside a "#|" context, the keywords have a different meaning.  */
735       if (!strcmp (s, "msgid"))
736         return PREV_MSGID;
737       if (!strcmp (s, "msgid_plural"))
738         return PREV_MSGID_PLURAL;
739       if (!strcmp (s, "msgctxt"))
740         return PREV_MSGCTXT;
741     }
742   po_gram_error_at_line (&gram_pos, _("keyword \"%s\" unknown"), s);
743   return NAME;
744 }
745 
746 
747 static int
control_sequence()748 control_sequence ()
749 {
750   mbchar_t mbc;
751   int val;
752   int max;
753 
754   lex_getc (mbc);
755   if (mb_len (mbc) == 1)
756     switch (mb_ptr (mbc) [0])
757       {
758       case 'n':
759         return '\n';
760 
761       case 't':
762         return '\t';
763 
764       case 'b':
765         return '\b';
766 
767       case 'r':
768         return '\r';
769 
770       case 'f':
771         return '\f';
772 
773       case 'v':
774         return '\v';
775 
776       case 'a':
777         return '\a';
778 
779       case '\\':
780       case '"':
781         return mb_ptr (mbc) [0];
782 
783       case '0': case '1': case '2': case '3':
784       case '4': case '5': case '6': case '7':
785         val = 0;
786         max = 0;
787         for (;;)
788           {
789             char c = mb_ptr (mbc) [0];
790             /* Warning: not portable, can't depend on '0'..'7' ordering.  */
791             val = val * 8 + (c - '0');
792             if (++max == 3)
793               break;
794             lex_getc (mbc);
795             if (mb_len (mbc) == 1)
796               switch (mb_ptr (mbc) [0])
797                 {
798                 case '0': case '1': case '2': case '3':
799                 case '4': case '5': case '6': case '7':
800                   continue;
801 
802                 default:
803                   break;
804                 }
805             lex_ungetc (mbc);
806             break;
807           }
808         return val;
809 
810       case 'x':
811         lex_getc (mbc);
812         if (mb_iseof (mbc) || mb_len (mbc) != 1
813             || !c_isxdigit (mb_ptr (mbc) [0]))
814           break;
815 
816         val = 0;
817         for (;;)
818           {
819             char c = mb_ptr (mbc) [0];
820             val *= 16;
821             if (c_isdigit (c))
822               /* Warning: not portable, can't depend on '0'..'9' ordering */
823               val += c - '0';
824             else if (c_isupper (c))
825               /* Warning: not portable, can't depend on 'A'..'F' ordering */
826               val += c - 'A' + 10;
827             else
828               /* Warning: not portable, can't depend on 'a'..'f' ordering */
829               val += c - 'a' + 10;
830 
831             lex_getc (mbc);
832             if (mb_len (mbc) == 1)
833               switch (mb_ptr (mbc) [0])
834                 {
835                 case '0': case '1': case '2': case '3': case '4':
836                 case '5': case '6': case '7': case '8': case '9':
837                 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
838                 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
839                   continue;
840 
841                 default:
842                   break;
843                 }
844             lex_ungetc (mbc);
845             break;
846           }
847         return val;
848 
849       /* FIXME: \u and \U are not handled.  */
850       }
851   lex_ungetc (mbc);
852   po_gram_error (_("invalid control sequence"));
853   return ' ';
854 }
855 
856 
857 /* Return the next token in the PO file.  The return codes are defined
858    in "po-gram-gen2.h".  Associated data is put in 'po_gram_lval'.  */
859 int
po_gram_lex()860 po_gram_lex ()
861 {
862   static char *buf;
863   static size_t bufmax;
864   mbchar_t mbc;
865   size_t bufpos;
866 
867   for (;;)
868     {
869       lex_getc (mbc);
870 
871       if (mb_iseof (mbc))
872         /* Yacc want this for end of file.  */
873         return 0;
874 
875       if (mb_len (mbc) == 1)
876         switch (mb_ptr (mbc) [0])
877           {
878           case '\n':
879             po_lex_obsolete = false;
880             po_lex_previous = false;
881             /* Ignore whitespace, not relevant for the grammar.  */
882             break;
883 
884           case ' ':
885           case '\t':
886           case '\r':
887           case '\f':
888           case '\v':
889             /* Ignore whitespace, not relevant for the grammar.  */
890             break;
891 
892           case '#':
893             lex_getc (mbc);
894             if (mb_iseq (mbc, '~'))
895               /* A pseudo-comment beginning with #~ is found.  This is
896                  not a comment.  It is the format for obsolete entries.
897                  We simply discard the "#~" prefix.  The following
898                  characters are expected to be well formed.  */
899               {
900                 po_lex_obsolete = true;
901                 /* A pseudo-comment beginning with #~| denotes a previous
902                    untranslated string in an obsolete entry.  This does not
903                    make much sense semantically, and is implemented here
904                    for completeness only.  */
905                 lex_getc (mbc);
906                 if (mb_iseq (mbc, '|'))
907                   po_lex_previous = true;
908                 else
909                   lex_ungetc (mbc);
910                 break;
911               }
912             if (mb_iseq (mbc, '|'))
913               /* A pseudo-comment beginning with #| is found.  This is
914                  the previous untranslated string.  We discard the "#|"
915                  prefix, but change the keywords and string returns
916                  accordingly.  */
917               {
918                 po_lex_previous = true;
919                 break;
920               }
921 
922             /* Accumulate comments into a buffer.  If we have been asked
923                to pass comments, generate a COMMENT token, otherwise
924                discard it.  */
925             signal_eilseq = false;
926             if (pass_comments)
927               {
928                 bufpos = 0;
929                 for (;;)
930                   {
931                     while (bufpos + mb_len (mbc) >= bufmax)
932                       {
933                         bufmax += 100;
934                         buf = xrealloc (buf, bufmax);
935                       }
936                     if (mb_iseof (mbc) || mb_iseq (mbc, '\n'))
937                       break;
938 
939                     memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
940                     bufpos += mb_len (mbc);
941 
942                     lex_getc (mbc);
943                   }
944                 buf[bufpos] = '\0';
945 
946                 po_gram_lval.string.string = buf;
947                 po_gram_lval.string.pos = gram_pos;
948                 po_gram_lval.string.obsolete = po_lex_obsolete;
949                 po_lex_obsolete = false;
950                 signal_eilseq = true;
951                 return COMMENT;
952               }
953             else
954               {
955                 /* We do this in separate loop because collecting large
956                    comments while they get not passed to the upper layers
957                    is not very efficient.  */
958                 while (!mb_iseof (mbc) && !mb_iseq (mbc, '\n'))
959                   lex_getc (mbc);
960                 po_lex_obsolete = false;
961                 signal_eilseq = true;
962               }
963             break;
964 
965           case '"':
966             /* Accumulate a string.  */
967             bufpos = 0;
968             for (;;)
969               {
970                 lex_getc (mbc);
971                 while (bufpos + mb_len (mbc) >= bufmax)
972                   {
973                     bufmax += 100;
974                     buf = xrealloc (buf, bufmax);
975                   }
976                 if (mb_iseof (mbc))
977                   {
978                     po_gram_error_at_line (&gram_pos,
979                                            _("end-of-file within string"));
980                     break;
981                   }
982                 if (mb_iseq (mbc, '\n'))
983                   {
984                     po_gram_error_at_line (&gram_pos,
985                                            _("end-of-line within string"));
986                     break;
987                   }
988                 if (mb_iseq (mbc, '"'))
989                   break;
990                 if (mb_iseq (mbc, '\\'))
991                   {
992                     buf[bufpos++] = control_sequence ();
993                     continue;
994                   }
995 
996                 /* Add mbc to the accumulator.  */
997                 memcpy_small (&buf[bufpos], mb_ptr (mbc), mb_len (mbc));
998                 bufpos += mb_len (mbc);
999               }
1000             buf[bufpos] = '\0';
1001 
1002             /* Strings cannot contain the msgctxt separator, because it cannot
1003                be faithfully represented in the msgid of a .mo file.  */
1004             if (strchr (buf, MSGCTXT_SEPARATOR) != NULL)
1005               po_gram_error_at_line (&gram_pos,
1006                                      _("context separator <EOT> within string"));
1007 
1008             /* FIXME: Treatment of embedded \000 chars is incorrect.  */
1009             po_gram_lval.string.string = xstrdup (buf);
1010             po_gram_lval.string.pos = gram_pos;
1011             po_gram_lval.string.obsolete = po_lex_obsolete;
1012             return (po_lex_previous ? PREV_STRING : STRING);
1013 
1014           case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1015           case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1016           case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1017           case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1018           case 'y': case 'z':
1019           case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1020           case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1021           case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1022           case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1023           case 'Y': case 'Z':
1024           case '_': case '$':
1025             bufpos = 0;
1026             for (;;)
1027               {
1028                 char c = mb_ptr (mbc) [0];
1029                 if (bufpos + 1 >= bufmax)
1030                   {
1031                     bufmax += 100;
1032                     buf = xrealloc (buf, bufmax);
1033                   }
1034                 buf[bufpos++] = c;
1035                 lex_getc (mbc);
1036                 if (mb_len (mbc) == 1)
1037                   switch (mb_ptr (mbc) [0])
1038                     {
1039                     default:
1040                       break;
1041                     case 'a': case 'b': case 'c': case 'd': case 'e':
1042                     case 'f': case 'g': case 'h': case 'i': case 'j':
1043                     case 'k': case 'l': case 'm': case 'n': case 'o':
1044                     case 'p': case 'q': case 'r': case 's': case 't':
1045                     case 'u': case 'v': case 'w': case 'x': case 'y':
1046                     case 'z':
1047                     case 'A': case 'B': case 'C': case 'D': case 'E':
1048                     case 'F': case 'G': case 'H': case 'I': case 'J':
1049                     case 'K': case 'L': case 'M': case 'N': case 'O':
1050                     case 'P': case 'Q': case 'R': case 'S': case 'T':
1051                     case 'U': case 'V': case 'W': case 'X': case 'Y':
1052                     case 'Z':
1053                     case '_': case '$':
1054                     case '0': case '1': case '2': case '3': case '4':
1055                     case '5': case '6': case '7': case '8': case '9':
1056                       continue;
1057                     }
1058                 break;
1059               }
1060             lex_ungetc (mbc);
1061 
1062             buf[bufpos] = '\0';
1063 
1064             {
1065               int k = keyword_p (buf);
1066               if (k == NAME)
1067                 {
1068                   po_gram_lval.string.string = xstrdup (buf);
1069                   po_gram_lval.string.pos = gram_pos;
1070                   po_gram_lval.string.obsolete = po_lex_obsolete;
1071                 }
1072               else
1073                 {
1074                   po_gram_lval.pos.pos = gram_pos;
1075                   po_gram_lval.pos.obsolete = po_lex_obsolete;
1076                 }
1077               return k;
1078             }
1079 
1080           case '0': case '1': case '2': case '3': case '4':
1081           case '5': case '6': case '7': case '8': case '9':
1082             bufpos = 0;
1083             for (;;)
1084               {
1085                 char c = mb_ptr (mbc) [0];
1086                 if (bufpos + 1 >= bufmax)
1087                   {
1088                     bufmax += 100;
1089                     buf = xrealloc (buf, bufmax + 1);
1090                   }
1091                 buf[bufpos++] = c;
1092                 lex_getc (mbc);
1093                 if (mb_len (mbc) == 1)
1094                   switch (mb_ptr (mbc) [0])
1095                     {
1096                     default:
1097                       break;
1098 
1099                     case '0': case '1': case '2': case '3': case '4':
1100                     case '5': case '6': case '7': case '8': case '9':
1101                       continue;
1102                     }
1103                 break;
1104               }
1105             lex_ungetc (mbc);
1106 
1107             buf[bufpos] = '\0';
1108 
1109             po_gram_lval.number.number = atol (buf);
1110             po_gram_lval.number.pos = gram_pos;
1111             po_gram_lval.number.obsolete = po_lex_obsolete;
1112             return NUMBER;
1113 
1114           case '[':
1115             po_gram_lval.pos.pos = gram_pos;
1116             po_gram_lval.pos.obsolete = po_lex_obsolete;
1117             return '[';
1118 
1119           case ']':
1120             po_gram_lval.pos.pos = gram_pos;
1121             po_gram_lval.pos.obsolete = po_lex_obsolete;
1122             return ']';
1123 
1124           default:
1125             /* This will cause a syntax error.  */
1126             return JUNK;
1127           }
1128       else
1129         /* This will cause a syntax error.  */
1130         return JUNK;
1131     }
1132 }
1133 
1134 
1135 /* po_gram_lex() can return comments as COMMENT.  Switch this on or off.  */
1136 void
po_lex_pass_comments(bool flag)1137 po_lex_pass_comments (bool flag)
1138 {
1139   pass_comments = flag;
1140 }
1141 
1142 
1143 /* po_gram_lex() can return obsolete entries as if they were normal entries.
1144    Switch this on or off.  */
1145 void
po_lex_pass_obsolete_entries(bool flag)1146 po_lex_pass_obsolete_entries (bool flag)
1147 {
1148   pass_obsolete_entries = flag;
1149 }
1150