• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* xgettext C# backend.
2    Copyright (C) 2003-2009, 2011, 2014, 2018-2020 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2003.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17 
18 #ifdef HAVE_CONFIG_H
19 # include "config.h"
20 #endif
21 
22 /* Specification.  */
23 #include "x-csharp.h"
24 
25 #include <errno.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 
31 #include "message.h"
32 #include "rc-str-list.h"
33 #include "xgettext.h"
34 #include "xg-pos.h"
35 #include "xg-encoding.h"
36 #include "xg-mixed-string.h"
37 #include "xg-arglist-context.h"
38 #include "xg-arglist-callshape.h"
39 #include "xg-arglist-parser.h"
40 #include "xg-message.h"
41 #include "c-ctype.h"
42 #include "error.h"
43 #include "error-progname.h"
44 #include "xalloc.h"
45 #include "xerror.h"
46 #include "xvasprintf.h"
47 #include "mem-hash-map.h"
48 #include "po-charset.h"
49 #include "unistr.h"
50 #include "gettext.h"
51 
52 #define _(s) gettext(s)
53 
54 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
55 
56 
57 /* The C# syntax is defined in ECMA-334, second edition.  */
58 
59 
60 /* ====================== Keyword set customization.  ====================== */
61 
/* If true extract all strings.  Set by x_csharp_extract_all ().  */
static bool extract_all = false;

/* Hash table of keyword call shapes, filled in by x_csharp_keyword ().
   It is initialized lazily, right before the first insertion.  */
static hash_table keywords;
/* True as long as the built-in keywords still have to be registered by
   init_keywords ().  Cleared by x_csharp_keyword (NULL) and by
   init_keywords () once it has registered them.  */
static bool default_keywords = true;
67 
68 
/* Turn on the file-level extract_all flag ("extract all strings").  */
void
x_csharp_extract_all ()
{
  extract_all = true;
}
74 
75 
76 /* Processes a --keyword option.
77    Non-ASCII function names can be used if given in UTF-8 encoding.  */
78 void
x_csharp_keyword(const char * name)79 x_csharp_keyword (const char *name)
80 {
81   if (name == NULL)
82     default_keywords = false;
83   else
84     {
85       const char *end;
86       struct callshape shape;
87       const char *colon;
88 
89       if (keywords.table == NULL)
90         hash_init (&keywords, 100);
91 
92       split_keywordspec (name, &end, &shape);
93 
94       /* The characters between name and end should form a valid C#
95          identifier sequence with dots.
96          A colon means an invalid parse in split_keywordspec().  */
97       colon = strchr (name, ':');
98       if (colon == NULL || colon >= end)
99         insert_keyword_callshape (&keywords, name, end - name, &shape);
100     }
101 }
102 
103 /* Finish initializing the keywords hash table.
104    Called after argument processing, before each file is processed.  */
105 static void
init_keywords()106 init_keywords ()
107 {
108   if (default_keywords)
109     {
110       /* When adding new keywords here, also update the documentation in
111          xgettext.texi!  */
112       x_csharp_keyword ("GetString");   /* Resource{Manager,Set}.GetString */
113       x_csharp_keyword ("GetPluralString:1,2"); /* GettextResource{Manager,Set}.GetPluralString */
114       x_csharp_keyword ("GetParticularString:1c,2"); /* Resource{Manager,Set}.GetParticularString */
115       x_csharp_keyword ("GetParticularPluralString:1c,2,3"); /* Resource{Manager,Set}.GetParticularPluralString */
116       default_keywords = false;
117     }
118 }
119 
/* Register the C# flag specifications through xgettext_record_flag ().
   Each specification string has the shape NAME:NUMBER:FLAG; presumably
   these are the function name, the argument position and the format flag
   to attach -- see xgettext_record_flag () for the authoritative
   meaning.  */
void
init_flag_table_csharp ()
{
  xgettext_record_flag ("GetString:1:pass-csharp-format");
  xgettext_record_flag ("GetPluralString:1:pass-csharp-format");
  xgettext_record_flag ("GetPluralString:2:pass-csharp-format");
  xgettext_record_flag ("GetParticularString:2:pass-csharp-format");
  xgettext_record_flag ("GetParticularPluralString:2:pass-csharp-format");
  xgettext_record_flag ("GetParticularPluralString:3:pass-csharp-format");
  xgettext_record_flag ("String.Format:1:csharp-format");
}
131 
132 
133 /* ======================== Reading of characters.  ======================== */
134 
/* The input file stream.  Read by phase1_getc ().  */
static FILE *fp;


/* Phase 1: line_number handling.  */

/* Maximum used, roughly a safer MB_LEN_MAX.
   This is the capacity, in bytes, of the phase 1 pushback stack; it is
   also the size of the byte buffer that phase2_getc () hands to iconv.  */
#define MAX_PHASE1_PUSHBACK 16
/* Stack of pushed-back bytes: phase1_ungetc () pushes on top,
   phase1_getc () pops from the top.  */
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
/* Number of bytes currently stored in phase1_pushback.  */
static int phase1_pushback_length;
145 
146 /* Read the next single byte from the input file.  */
147 static int
phase1_getc()148 phase1_getc ()
149 {
150   int c;
151 
152   if (phase1_pushback_length)
153     {
154       c = phase1_pushback[--phase1_pushback_length];
155       if (c == '\n')
156         ++line_number;
157       return c;
158     }
159 
160   c = getc (fp);
161   if (c == EOF)
162     {
163       if (ferror (fp))
164         error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
165                real_file_name);
166       return EOF;
167     }
168 
169   if (c == '\n')
170     ++line_number;
171   return c;
172 }
173 
174 /* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
175 static void
phase1_ungetc(int c)176 phase1_ungetc (int c)
177 {
178   if (c != EOF)
179     {
180       if (c == '\n')
181         --line_number;
182       if (phase1_pushback_length == SIZEOF (phase1_pushback))
183         abort ();
184       phase1_pushback[phase1_pushback_length++] = c;
185     }
186 }
187 
188 
189 /* Phase 2: Conversion to Unicode.
190    This is done early because ECMA-334 section 9.1. says that the source is
191    "an ordered sequence of Unicode characters", and because the recognition
192    of the line terminators (ECMA-334 section 9.3.1) is hardly possible without
193    prior conversion to Unicode.  */
194 
/* End-of-file indicator for functions returning an UCS-4 character.  */
#define UEOF -1

/* Newline Unicode character.  */
#define UNL 0x000a

/* Lexical context handed to non_ascii_error_message () when phase2_getc ()
   meets a non-ASCII byte in ASCII-only input.  comment_line_end () resets
   it to lc_outside.  */
static lexical_context_ty lexical_context;

/* Pushback storage of phase 2: a single UCS-4 character.  */
static int phase2_pushback[1];
/* Number of characters currently stored in phase2_pushback (0 or 1).  */
static int phase2_pushback_length;
205 
206 /* Read the next Unicode UCS-4 character from the input file.  */
207 static int
phase2_getc()208 phase2_getc ()
209 {
210   if (phase2_pushback_length)
211     return phase2_pushback[--phase2_pushback_length];
212 
213   if (xgettext_current_source_encoding == po_charset_ascii)
214     {
215       int c = phase1_getc ();
216       if (c == EOF)
217         return UEOF;
218       if (!c_isascii (c))
219         {
220           multiline_error (xstrdup (""),
221                            xasprintf ("%s\n%s\n",
222                                       non_ascii_error_message (lexical_context,
223                                                                real_file_name,
224                                                                line_number),
225                                       _("Please specify the source encoding through --from-code.")));
226           exit (EXIT_FAILURE);
227         }
228       return c;
229     }
230   else if (xgettext_current_source_encoding != po_charset_utf8)
231     {
232 #if HAVE_ICONV
233       /* Use iconv on an increasing number of bytes.  Read only as many bytes
234          through phase1_getc as needed.  This is needed to give reasonable
235          interactive behaviour when fp is connected to an interactive tty.  */
236       unsigned char buf[MAX_PHASE1_PUSHBACK];
237       size_t bufcount;
238       int c = phase1_getc ();
239       if (c == EOF)
240         return UEOF;
241       buf[0] = (unsigned char) c;
242       bufcount = 1;
243 
244       for (;;)
245         {
246           unsigned char scratchbuf[6];
247           const char *inptr = (const char *) &buf[0];
248           size_t insize = bufcount;
249           char *outptr = (char *) &scratchbuf[0];
250           size_t outsize = sizeof (scratchbuf);
251 
252           size_t res = iconv (xgettext_current_source_iconv,
253                               (ICONV_CONST char **) &inptr, &insize,
254                               &outptr, &outsize);
255           /* We expect that a character has been produced if and only if
256              some input bytes have been consumed.  */
257           if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
258             abort ();
259           if (outsize == sizeof (scratchbuf))
260             {
261               /* No character has been produced.  Must be an error.  */
262               if (res != (size_t)(-1))
263                 abort ();
264 
265               if (errno == EILSEQ)
266                 {
267                   /* An invalid multibyte sequence was encountered.  */
268                   multiline_error (xstrdup (""),
269                                    xasprintf (_("\
270 %s:%d: Invalid multibyte sequence.\n\
271 Please specify the correct source encoding through --from-code.\n"),
272                                    real_file_name, line_number));
273                   exit (EXIT_FAILURE);
274                 }
275               else if (errno == EINVAL)
276                 {
277                   /* An incomplete multibyte character.  */
278                   int c;
279 
280                   if (bufcount == MAX_PHASE1_PUSHBACK)
281                     {
282                       /* An overlong incomplete multibyte sequence was
283                          encountered.  */
284                       multiline_error (xstrdup (""),
285                                        xasprintf (_("\
286 %s:%d: Long incomplete multibyte sequence.\n\
287 Please specify the correct source encoding through --from-code.\n"),
288                                        real_file_name, line_number));
289                       exit (EXIT_FAILURE);
290                     }
291 
292                   /* Read one more byte and retry iconv.  */
293                   c = phase1_getc ();
294                   if (c == EOF)
295                     {
296                       multiline_error (xstrdup (""),
297                                        xasprintf (_("\
298 %s:%d: Incomplete multibyte sequence at end of file.\n\
299 Please specify the correct source encoding through --from-code.\n"),
300                                        real_file_name, line_number));
301                       exit (EXIT_FAILURE);
302                     }
303                   if (c == '\n')
304                     {
305                       multiline_error (xstrdup (""),
306                                        xasprintf (_("\
307 %s:%d: Incomplete multibyte sequence at end of line.\n\
308 Please specify the correct source encoding through --from-code.\n"),
309                                        real_file_name, line_number - 1));
310                       exit (EXIT_FAILURE);
311                     }
312                   buf[bufcount++] = (unsigned char) c;
313                 }
314               else
315                 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
316                        real_file_name, line_number);
317             }
318           else
319             {
320               size_t outbytes = sizeof (scratchbuf) - outsize;
321               size_t bytes = bufcount - insize;
322               ucs4_t uc;
323 
324               /* We expect that one character has been produced.  */
325               if (bytes == 0)
326                 abort ();
327               if (outbytes == 0)
328                 abort ();
329               /* Push back the unused bytes.  */
330               while (insize > 0)
331                 phase1_ungetc (buf[--insize]);
332               /* Convert the character from UTF-8 to UCS-4.  */
333               if (u8_mbtoucr (&uc, scratchbuf, outbytes) < (int) outbytes)
334                 {
335                   /* scratchbuf contains an out-of-range Unicode character
336                      (> 0x10ffff).  */
337                   multiline_error (xstrdup (""),
338                                    xasprintf (_("\
339 %s:%d: Invalid multibyte sequence.\n\
340 Please specify the source encoding through --from-code.\n"),
341                                    real_file_name, line_number));
342                   exit (EXIT_FAILURE);
343                 }
344               return uc;
345             }
346         }
347 #else
348       /* If we don't have iconv(), the only supported values for
349          xgettext_global_source_encoding and thus also for
350          xgettext_current_source_encoding are ASCII and UTF-8.  */
351       abort ();
352 #endif
353     }
354   else
355     {
356       /* Read an UTF-8 encoded character.  */
357       unsigned char buf[6];
358       unsigned int count;
359       int c;
360       ucs4_t uc;
361 
362       c = phase1_getc ();
363       if (c == EOF)
364         return UEOF;
365       buf[0] = c;
366       count = 1;
367 
368       if (buf[0] >= 0xc0)
369         {
370           c = phase1_getc ();
371           if (c == EOF)
372             return UEOF;
373           buf[1] = c;
374           count = 2;
375         }
376 
377       if (buf[0] >= 0xe0
378           && ((buf[1] ^ 0x80) < 0x40))
379         {
380           c = phase1_getc ();
381           if (c == EOF)
382             return UEOF;
383           buf[2] = c;
384           count = 3;
385         }
386 
387       if (buf[0] >= 0xf0
388           && ((buf[1] ^ 0x80) < 0x40)
389           && ((buf[2] ^ 0x80) < 0x40))
390         {
391           c = phase1_getc ();
392           if (c == EOF)
393             return UEOF;
394           buf[3] = c;
395           count = 4;
396         }
397 
398       if (buf[0] >= 0xf8
399           && ((buf[1] ^ 0x80) < 0x40)
400           && ((buf[2] ^ 0x80) < 0x40)
401           && ((buf[3] ^ 0x80) < 0x40))
402         {
403           c = phase1_getc ();
404           if (c == EOF)
405             return UEOF;
406           buf[4] = c;
407           count = 5;
408         }
409 
410       if (buf[0] >= 0xfc
411           && ((buf[1] ^ 0x80) < 0x40)
412           && ((buf[2] ^ 0x80) < 0x40)
413           && ((buf[3] ^ 0x80) < 0x40)
414           && ((buf[4] ^ 0x80) < 0x40))
415         {
416           c = phase1_getc ();
417           if (c == EOF)
418             return UEOF;
419           buf[5] = c;
420           count = 6;
421         }
422 
423       u8_mbtouc (&uc, buf, count);
424       return uc;
425     }
426 }
427 
428 /* Supports only one pushback character.  */
429 static void
phase2_ungetc(int c)430 phase2_ungetc (int c)
431 {
432   if (c != UEOF)
433     {
434       if (phase2_pushback_length == SIZEOF (phase2_pushback))
435         abort ();
436       phase2_pushback[phase2_pushback_length++] = c;
437     }
438 }
439 
440 
441 /* Phase 3: Convert all line terminators to LF.
442    See ECMA-334 section 9.3.1.  */
443 
/* Line number defined in terms of phase3: incremented whenever
   phase3_getc () returns UNL, decremented whenever phase3_ungetc () takes
   one back.  */
static int logical_line_number;

/* Stack of characters pushed back by phase3_ungetc ().  */
static int phase3_pushback[9];
/* Number of characters currently stored in phase3_pushback.  */
static int phase3_pushback_length;
449 
450 /* Read the next Unicode UCS-4 character from the input file, mapping
451    all line terminators to U+000A, and dropping U+001A at the end of file.  */
452 static int
phase3_getc()453 phase3_getc ()
454 {
455   int c;
456 
457   if (phase3_pushback_length)
458     {
459       c = phase3_pushback[--phase3_pushback_length];
460       if (c == UNL)
461         ++logical_line_number;
462       return c;
463     }
464 
465   c = phase2_getc ();
466 
467   if (c == 0x000d)
468     {
469       int c1 = phase2_getc ();
470 
471       if (c1 != UEOF && c1 != 0x000a)
472         phase2_ungetc (c1);
473 
474       /* Seen line terminator CR or CR/LF.  */
475       ++logical_line_number;
476       return UNL;
477     }
478 
479   if (c == 0x0085 || c == 0x2028 || c == 0x2029)
480     {
481       /* Seen Unicode word processor newline.  */
482       ++logical_line_number;
483       return UNL;
484     }
485 
486   if (c == 0x001a)
487     {
488       int c1 = phase2_getc ();
489 
490       if (c1 == UEOF)
491         /* Seen U+001A right before the end of file.  */
492         return UEOF;
493 
494       phase2_ungetc (c1);
495     }
496 
497   if (c == UNL)
498     ++logical_line_number;
499   return c;
500 }
501 
502 /* Supports 9 characters of pushback.  */
503 static void
phase3_ungetc(int c)504 phase3_ungetc (int c)
505 {
506   if (c != UEOF)
507     {
508       if (c == UNL)
509         --logical_line_number;
510       if (phase3_pushback_length == SIZEOF (phase3_pushback))
511         abort ();
512       phase3_pushback[phase3_pushback_length++] = c;
513     }
514 }
515 
516 
517 /* ========================= Accumulating strings.  ======================== */
518 
519 /* See xg-mixed-string.h for the API.
520    In this extractor, we add only Unicode characters.  */
521 
522 
523 /* ======================== Accumulating comments.  ======================== */
524 
525 
/* Accumulating a single comment line.  */

/* Buffer that collects the characters of the comment line currently being
   read.  */
static struct mixed_string_buffer comment_buffer;

/* Begin a new comment line: (re)initialize comment_buffer through
   mixed_string_buffer_init (), passing lc_comment, logical_file_name and
   the current line_number.  */
static inline void
comment_start ()
{
  mixed_string_buffer_init (&comment_buffer, lc_comment,
                            logical_file_name, line_number);
}
536 
/* Return the result of mixed_string_buffer_is_empty () on comment_buffer.
   phase4_getc () uses it to skip blanks at the beginning of a comment
   line.  */
static inline bool
comment_at_start ()
{
  return mixed_string_buffer_is_empty (&comment_buffer);
}
542 
/* Append the Unicode character C to comment_buffer.  */
static inline void
comment_add (int c)
{
  mixed_string_buffer_append_unicode (&comment_buffer, c);
}
548 
549 static inline void
comment_line_end(size_t chars_to_remove)550 comment_line_end (size_t chars_to_remove)
551 {
552   char *buffer =
553     mixed_string_contents_free1 (mixed_string_buffer_result (&comment_buffer));
554   size_t buflen = strlen (buffer);
555 
556   buflen -= chars_to_remove;
557   while (buflen >= 1
558          && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
559     --buflen;
560   buffer[buflen] = '\0';
561   savable_comment_add (buffer);
562   lexical_context = lc_outside;
563 }
564 
565 
566 /* These are for tracking whether comments count as immediately before
567    keyword.  */
568 static int last_comment_line;
569 static int last_non_comment_line;
570 
571 
572 /* Phase 4: Replace each comment that is not inside a character constant or
573    string literal with a space or newline character.
574    See ECMA-334 section 9.3.2.  */
575 
576 static int
phase4_getc()577 phase4_getc ()
578 {
579   int c0;
580   int c;
581   bool last_was_star;
582 
583   c0 = phase3_getc ();
584   if (c0 != '/')
585     return c0;
586   c = phase3_getc ();
587   switch (c)
588     {
589     default:
590       phase3_ungetc (c);
591       return c0;
592 
593     case '*':
594       /* C style comment.  */
595       comment_start ();
596       last_was_star = false;
597       for (;;)
598         {
599           c = phase3_getc ();
600           if (c == UEOF)
601             break;
602           /* We skip all leading white space, but not EOLs.  */
603           if (!(comment_at_start () && (c == ' ' || c == '\t')))
604             comment_add (c);
605           switch (c)
606             {
607             case UNL:
608               comment_line_end (1);
609               comment_start ();
610               last_was_star = false;
611               continue;
612 
613             case '*':
614               last_was_star = true;
615               continue;
616 
617             case '/':
618               if (last_was_star)
619                 {
620                   comment_line_end (2);
621                   break;
622                 }
623               /* FALLTHROUGH */
624 
625             default:
626               last_was_star = false;
627               continue;
628             }
629           break;
630         }
631       last_comment_line = logical_line_number;
632       return ' ';
633 
634     case '/':
635       /* C++ style comment.  */
636       last_comment_line = logical_line_number;
637       comment_start ();
638       for (;;)
639         {
640           c = phase3_getc ();
641           if (c == UNL || c == UEOF)
642             break;
643           /* We skip all leading white space, but not EOLs.  */
644           if (!(comment_at_start () && (c == ' ' || c == '\t')))
645             comment_add (c);
646         }
647       phase3_ungetc (c); /* push back the newline, to decrement logical_line_number */
648       comment_line_end (0);
649       phase3_getc (); /* read the newline again */
650       return UNL;
651     }
652 }
653 
/* Supports only one pushback character.
   Implemented by forwarding C to phase3_ungetc ().  */
static void
phase4_ungetc (int c)
{
  phase3_ungetc (c);
}
660 
661 
662 /* ======================= Character classification.  ====================== */
663 
664 
/* Tell whether the character C belongs to the Unicode character class Zs
   (space separators), as of Unicode 4.0.
   See ECMA-334 section 9.3.3.
   The list was obtained with
     grep '^[^;]*;[^;]*;Zs;' UnicodeData-4.0.0.txt  */
static bool
is_whitespace (int c)
{
  return (c == 0x0020
          || c == 0x00a0
          || c == 0x1680
          || c == 0x180e
          || (c >= 0x2000 && c <= 0x200b)
          || c == 0x202f
          || c == 0x205f
          || c == 0x3000);
}
688 
689 
690 /* C# allows identifiers containing many Unicode characters.  We recognize
691    them; to use an identifier with Unicode characters in a --keyword option,
692    it must be specified in UTF-8.  */
693 
/* Look up the bit for the character UC in a three-level bitmap table.
   TABLE is read as an array of int:
     - element 0 is the number of level 1 entries;
     - the level 1 entry for uc >> 16 is at index 1 + (uc >> 16) and holds
       the index of a level 2 block, or a negative value;
     - the level 2 block is indexed by bits 15..9 of UC and holds the
       index of a level 3 block, or a negative value;
     - the level 3 block is indexed by bits 8..5 of UC and holds a bitmap
       in which bit (uc & 0x1f) is the result.
   Returns 1 if the bit is set; 0 if it is clear, if UC lies beyond the
   table or if a negative entry is met on the way.  */
static inline int
bitmap_lookup (const void *table, unsigned int uc)
{
  const int *cells = (const int *) table;
  unsigned int plane = uc >> 16;
  int level2_base;
  int level3_base;
  unsigned int bits;

  if (plane >= (unsigned int) cells[0])
    return 0;

  level2_base = cells[1 + plane];
  if (level2_base < 0)
    return 0;

  level3_base = cells[level2_base + ((uc >> 9) & 0x7f)];
  if (level3_base < 0)
    return 0;

  bits = cells[level3_base + ((uc >> 5) & 0xf)];
  return (bits >> (uc & 0x1f)) & 1;
}
716 
/* Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, as of Unicode 4.0,
   plus the underscore.

   The table has the layout that bitmap_lookup () expects when it views
   the whole structure as one array of int: all indices stored in level1
   and level2 count ints from the beginning of the structure, and a
   negative index stands for a range without any member.  */
static const
struct
  {
    /* Number of entries in level1.  */
    int header[1];
    /* Indexed by uc >> 16; each entry is the index of a block of 128
       entries in level2 (level2 begins at index 4).  */
    int level1[3];
    /* Indexed by (uc >> 9) & 0x7f within a block; each entry is the index
       of a block of 16 words in level3 (level3 begins at index 388), or
       -1.  */
    int level2[3 << 7];
    /* Indexed by (uc >> 5) & 0xf within a block; bit (uc & 0x1f) of the
       word tells whether uc is a member.  */
    /*unsigned*/ int level3[34 << 4];
  }
table_identifier_start =
{
  { 3 },
  {     4,   132,   260 },
  {
      388,   404,   420,   436,   452,   468,   484,   500,
      516,   532,   548,   564,   580,    -1,   596,   612,
      628,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
      644,    -1,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   676,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   692,
      660,   660,   708,    -1,    -1,    -1,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   724,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,   740,   756,   772,   788,
      804,   820,   836,    -1,   852,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,   868,   884,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   660,   660,   660,   660,   660,
      660,   660,   660,   900,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
       -1,    -1,    -1,    -1,   660,   916,    -1,    -1
  },
  {
    0x00000000, 0x00000000, 0x87FFFFFE, 0x07FFFFFE,
    0x00000000, 0x04200400, 0xFF7FFFFF, 0xFF7FFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
    0x00000000, 0x00000000, 0x00000000, 0x04000000,
    0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFC03, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
    0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
    0x000000FF, 0x00000000, 0xFFFF0000, 0x000707FF,
    0x00000000, 0x07FFFFFE, 0x000007FF, 0xFFFEC000,
    0xFFFFFFFF, 0xFFFFFFFF, 0x002FFFFF, 0x9C00C060,
    0xFFFD0000, 0x0000FFFF, 0x0000E000, 0x00000000,
    0xFFFFFFFF, 0x0002003F, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFF0, 0x23FFFFFF, 0xFF010000, 0x00000003,
    0xFFF99FE0, 0x23C5FDFF, 0xB0000000, 0x00030003,
    0xFFF987E0, 0x036DFDFF, 0x5E000000, 0x001C0000,
    0xFFFBBFE0, 0x23EDFDFF, 0x00010000, 0x00000003,
    0xFFF99FE0, 0x23EDFDFF, 0xB0000000, 0x00020003,
    0xD63DC7E8, 0x03BFC718, 0x00000000, 0x00000000,
    0xFFFDDFE0, 0x03EFFDFF, 0x00000000, 0x00000003,
    0xFFFDDFE0, 0x23EFFDFF, 0x40000000, 0x00000003,
    0xFFFDDFE0, 0x03FFFDFF, 0x00000000, 0x00000003,
    0xFC7FFFE0, 0x2FFBFFFF, 0x0000007F, 0x00000000,
    0xFFFFFFFE, 0x000DFFFF, 0x0000007F, 0x00000000,
    0xFEF02596, 0x200DECAE, 0x3000005F, 0x00000000,
    0x00000001, 0x00000000, 0xFFFFFEFF, 0x000007FF,
    0x00000F00, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0x000006FB, 0x003F0000, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
    0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
    0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
    0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
    0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
    0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
    0x0003DFFF, 0x0003FFFF, 0x0003FFFF, 0x0001DFFF,
    0xFFFFFFFF, 0x000FFFFF, 0x10800000, 0x00000000,
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
    0xFFFFFFFF, 0x000001FF, 0x00000000, 0x00000000,
    0x1FFFFFFF, 0x00000000, 0xFFFF0000, 0x001F3FFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
    0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
    0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
    0x00000000, 0x00000000, 0x00000000, 0x80020000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
    0x0000000F, 0x00000000, 0x00000000, 0x00000000,
    0x000000E0, 0x1F3E03FE, 0xFFFFFFFE, 0xFFFFFFFF,
    0xE07FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xF7FFFFFF,
    0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xA0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
    0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
    0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
    0x00000000, 0x00000000, 0x00000000, 0xFFDF0000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x1FFFFFFF,
    0x00000000, 0x07FFFFFE, 0x07FFFFFE, 0xFFFFFFC0,
    0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x00000000,
    0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
    0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
    0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
    0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
    0xFFFFFDFF, 0xFFFFFDFF, 0x000003F7, 0x00000000,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
    0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x00000000, 0x00000000, 0x00000000, 0x00000000
  }
};
920 
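/* Lookup table of the characters that may occur inside an identifier:
   Unicode character classes Lu, Ll, Lt, Lm, Lo, Nl, Nd, Pc, Mn, Mc, Cf,
   as of Unicode 4.0.  It is queried through bitmap_lookup by
   is_identifier_part.  */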
923 static const
924 struct
925   {
926     int header[1];
927     int level1[15];
928     int level2[4 << 7];
929     /*unsigned*/ int level3[36 << 4];
930   }
931 table_identifier_part =
932 {
933   { 15 },
934   {
935        16,   144,   272,    -1,    -1,    -1,    -1,    -1,
936        -1,    -1,    -1,    -1,    -1,    -1,   400
937   },
938   {
939       528,   544,   560,   576,   592,   608,   624,   640,
940       656,   672,   688,   704,   720,    -1,   736,   752,
941       768,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
942       784,    -1,   800,   800,   800,   800,   800,   800,
943       800,   800,   800,   800,   800,   800,   816,   800,
944       800,   800,   800,   800,   800,   800,   800,   800,
945       800,   800,   800,   800,   800,   800,   800,   800,
946       800,   800,   800,   800,   800,   800,   800,   800,
947       800,   800,   800,   800,   800,   800,   800,   800,
948       800,   800,   800,   800,   800,   800,   800,   832,
949       800,   800,   848,    -1,    -1,    -1,   800,   800,
950       800,   800,   800,   800,   800,   800,   800,   800,
951       800,   800,   800,   800,   800,   800,   800,   800,
952       800,   800,   800,   864,    -1,    -1,    -1,    -1,
953        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
954        -1,    -1,    -1,    -1,   880,   896,   912,   928,
955       944,   960,   976,    -1,   992,    -1,    -1,    -1,
956        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
957        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
958        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
959        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
960        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
961        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
962        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
963        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
964        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
965        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
966        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
967        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
968      1008,    -1,  1024,  1040,    -1,    -1,    -1,    -1,
969        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
970        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
971       800,   800,   800,   800,   800,   800,   800,   800,
972       800,   800,   800,   800,   800,   800,   800,   800,
973       800,   800,   800,   800,   800,   800,   800,   800,
974       800,   800,   800,   800,   800,   800,   800,   800,
975       800,   800,   800,   800,   800,   800,   800,   800,
976       800,   800,   800,   800,   800,   800,   800,   800,
977       800,   800,   800,   800,   800,   800,   800,   800,
978       800,   800,   800,   800,   800,   800,   800,   800,
979       800,   800,   800,   800,   800,   800,   800,   800,
980       800,   800,   800,   800,   800,   800,   800,   800,
981       800,   800,   800,  1056,    -1,    -1,    -1,    -1,
982        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
983        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
984        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
985        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
986        -1,    -1,    -1,    -1,   800,  1072,    -1,    -1,
987      1088,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
988        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
989        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
990        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
991        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
992        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
993        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
994        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
995        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
996        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
997        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
998        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
999        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1000        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1001        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1,
1002        -1,    -1,    -1,    -1,    -1,    -1,    -1,    -1
1003   },
1004   {
1005     0x00000000, 0x03FF0000, 0x87FFFFFE, 0x07FFFFFE,
1006     0x00000000, 0x04202400, 0xFF7FFFFF, 0xFF7FFFFF,
1007     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1008     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1009     0xFFFFFFFF, 0x007FFFFF, 0xFFFF0000, 0xFFFFFFFF,
1010     0xFFFFFFFF, 0xFFFFFFFF, 0x0003FFC3, 0x0000401F,
1011     0xFFFFFFFF, 0xFFFFFFFF, 0xE0FFFFFF, 0x0400FFFF,
1012     0xFFFFD740, 0xFFFFFFFB, 0xFFFF7FFF, 0x0FBFFFFF,
1013     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1014     0xFFFFFC7B, 0xFFFFFFFF, 0xFFFF7FFF, 0x033FFFFF,
1015     0x0000FFFF, 0xFFFE0000, 0x027FFFFF, 0xFFFFFFFE,
1016     0xFFFE00FF, 0xBBFFFFFB, 0xFFFF0016, 0x000707FF,
1017     0x003F000F, 0x07FFFFFE, 0x01FFFFFF, 0xFFFFC3FF,
1018     0xFFFFFFFF, 0xFFFFFFFF, 0xBFEFFFFF, 0x9FFFFDFF,
1019     0xFFFF8000, 0xFFFFFFFF, 0x0000E7FF, 0x00000000,
1020     0xFFFFFFFF, 0x0003FFFF, 0x00000000, 0x00000000,
1021     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1022     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1023     0xFFFFFFFE, 0xF3FFFFFF, 0xFF1F3FFF, 0x0000FFCF,
1024     0xFFF99FEE, 0xF3C5FDFF, 0xB080399F, 0x0003FFCF,
1025     0xFFF987EE, 0xD36DFDFF, 0x5E003987, 0x001FFFC0,
1026     0xFFFBBFEE, 0xF3EDFDFF, 0x00013BBF, 0x0000FFCF,
1027     0xFFF99FEE, 0xF3EDFDFF, 0xB0C0398F, 0x0002FFC3,
1028     0xD63DC7EC, 0xC3BFC718, 0x00803DC7, 0x0000FF80,
1029     0xFFFDDFEE, 0xC3EFFDFF, 0x00603DDF, 0x0000FFC3,
1030     0xFFFDDFEC, 0xF3EFFDFF, 0x40603DDF, 0x0000FFC3,
1031     0xFFFDDFEC, 0xC3FFFDFF, 0x00803DCF, 0x0000FFC3,
1032     0xFC7FFFEC, 0x2FFBFFFF, 0xFF5F847F, 0x000C0000,
1033     0xFFFFFFFE, 0x07FFFFFF, 0x03FF7FFF, 0x00000000,
1034     0xFEF02596, 0x3BFFECAE, 0x33FF3F5F, 0x00000000,
1035     0x03000001, 0xC2A003FF, 0xFFFFFEFF, 0xFFFE07FF,
1036     0xFEFF0FDF, 0x1FFFFFFF, 0x00000040, 0x00000000,
1037     0xFFFFFFFF, 0x03C7F6FB, 0x03FF03FF, 0x00000000,
1038     0x00000000, 0xFFFFFFFF, 0xFFFF003F, 0x01FFFFFF,
1039     0xFFFFFFFF, 0xFFFFFFFF, 0x83FFFFFF, 0xFFFFFFFF,
1040     0xFFFFFFFF, 0xFFFFFF07, 0xFFFFFFFF, 0x03FFFFFF,
1041     0xFFFFFF7F, 0xFFFFFFFF, 0x3D7F3D7F, 0xFFFFFFFF,
1042     0xFFFF3D7F, 0x7F3D7FFF, 0xFF7F7F3D, 0xFFFF7FFF,
1043     0x7F3D7FFF, 0xFFFFFFFF, 0x07FFFF7F, 0x0003FE00,
1044     0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x001FFFFF,
1045     0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1046     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1047     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1048     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1049     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x007F9FFF,
1050     0x07FFFFFE, 0xFFFFFFFF, 0xFFFFFFFF, 0x0001C7FF,
1051     0x001FDFFF, 0x001FFFFF, 0x000FFFFF, 0x000DDFFF,
1052     0xFFFFFFFF, 0xFFFFFFFF, 0x308FFFFF, 0x000003FF,
1053     0x03FF3800, 0xFFFFFFFF, 0xFFFFFFFF, 0x00FFFFFF,
1054     0xFFFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
1055     0x1FFFFFFF, 0x0FFF0FFF, 0xFFFFFFC0, 0x001F3FFF,
1056     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1057     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1058     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1059     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000FFF,
1060     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1061     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1062     0x0FFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x03FFFFFF,
1063     0x3F3FFFFF, 0xFFFFFFFF, 0xAAFF3F3F, 0x3FFFFFFF,
1064     0xFFFFFFFF, 0x5FDFFFFF, 0x0FCF1FDC, 0x1FDC1FFF,
1065     0x0000F000, 0x80007C00, 0x00100001, 0x8002FC0F,
1066     0x00000000, 0x00000000, 0x1FFF0000, 0x000007E2,
1067     0x3E2FFC84, 0xE3FBBD50, 0x000003E0, 0xFFFFFFFF,
1068     0x0000000F, 0x00000000, 0x00000000, 0x00000000,
1069     0x000000E0, 0x1F3EFFFE, 0xFFFFFFFE, 0xFFFFFFFF,
1070     0xE67FFFFF, 0xFFFFFFFE, 0xFFFFFFFF, 0xFFFFFFFF,
1071     0xFFFFFFE0, 0xFFFE1FFF, 0xFFFFFFFF, 0xFFFFFFFF,
1072     0x00007FFF, 0x00FFFFFF, 0x00000000, 0xFFFF0000,
1073     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1074     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1075     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1076     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1077     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1078     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1079     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1080     0xFFFFFFFF, 0x003FFFFF, 0x00000000, 0x00000000,
1081     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1082     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1083     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1084     0xFFFFFFFF, 0x0000003F, 0x00000000, 0x00000000,
1085     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1086     0x00001FFF, 0x00000000, 0x00000000, 0x00000000,
1087     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1088     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1089     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1090     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1091     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1092     0xFFFFFFFF, 0x0000000F, 0x00000000, 0x00000000,
1093     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1094     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1095     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1096     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1097     0xFFFFFFFF, 0xFFFF3FFF, 0xFFFFFFFF, 0x000007FF,
1098     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1099     0xE0F8007F, 0x5F7FFDFF, 0xFFFFFFDB, 0xFFFFFFFF,
1100     0xFFFFFFFF, 0x0003FFFF, 0xFFF80000, 0xFFFFFFFF,
1101     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1102     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1103     0xFFFFFFFF, 0x3FFFFFFF, 0xFFFF0000, 0xFFFFFFFF,
1104     0xFFFCFFFF, 0xFFFFFFFF, 0x000000FF, 0x0FFF0000,
1105     0x0000FFFF, 0x0018000F, 0x0000E000, 0xFFDF0000,
1106     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x9FFFFFFF,
1107     0x03FF0000, 0x87FFFFFE, 0x07FFFFFE, 0xFFFFFFE0,
1108     0xFFFFFFFF, 0x7FFFFFFF, 0x1CFCFCFC, 0x0E000000,
1109     0xFFFFEFFF, 0xB7FFFF7F, 0x3FFF3FFF, 0x00000000,
1110     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x07FFFFFF,
1111     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1112     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1113     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1114     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1115     0x7FFFFFFF, 0xFFFF0000, 0x000007FF, 0x00000000,
1116     0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
1117     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1118     0x3FFFFFFF, 0x000003FF, 0x00000000, 0x00000000,
1119     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1120     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1121     0xFFFFFD3F, 0x91BFFFFF, 0x00000000, 0x00000000,
1122     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1123     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1124     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1125     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1126     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1127     0x00000000, 0x00000000, 0x00000000, 0xFFFFE3E0,
1128     0x00000FE7, 0x00003C00, 0x00000000, 0x00000000,
1129     0xFFFFFFFF, 0xFFFFFFFF, 0xFFDFFFFF, 0xFFFFFFFF,
1130     0xDFFFFFFF, 0xEBFFDE64, 0xFFFFFFEF, 0xFFFFFFFF,
1131     0xDFDFE7BF, 0x7BFFFFFF, 0xFFFDFC5F, 0xFFFFFFFF,
1132     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1133     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1134     0xFFFFFFFF, 0xFFFFFF0F, 0xF7FFFFFD, 0xF7FFFFFF,
1135     0xFFDFFFFF, 0xFFDFFFFF, 0xFFFF7FFF, 0xFFFF7FFF,
1136     0xFFFFFDFF, 0xFFFFFDFF, 0xFFFFC3F7, 0xFFFFFFFF,
1137     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1138     0xFFFFFFFF, 0xFFFFFFFF, 0x007FFFFF, 0x00000000,
1139     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1140     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1141     0x3FFFFFFF, 0x00000000, 0x00000000, 0x00000000,
1142     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1143     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1144     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1145     0x00000002, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1146     0x00000000, 0x00000000, 0x00000000, 0x00000000,
1147     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
1148     0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0000FFFF
1149   }
1150 };
1151 
1152 /* Return true if a given character can occur as first character of an
1153    identifier.  See ECMA-334 section 9.4.2.  */
1154 static bool
is_identifier_start(int c)1155 is_identifier_start (int c)
1156 {
1157   return bitmap_lookup (&table_identifier_start, c);
1158   /* In ASCII only this would be:
1159      return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_');
1160    */
1161 }
1162 
1163 /* Return true if a given character can occur as character of an identifier.
1164    See ECMA-334 section 9.4.2.  */
1165 static bool
is_identifier_part(int c)1166 is_identifier_part (int c)
1167 {
1168   return bitmap_lookup (&table_identifier_part, c);
1169   /* In ASCII only this would be:
1170      return ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1171              || (c >= '0' && c <= '9') || c == '_');
1172    */
1173 }
1174 
/* Predicate that accepts every character.  do_getc_escaped hands it to
   do_getc_unicode_escaped, so that inside string and character literals a
   Unicode escape is not filtered by character class.  */
static bool
is_any_character (int c)
{
  /* The argument is deliberately ignored.  */
  (void) c;
  return true;
}
1180 
1181 
1182 /* ======================= Preprocessor directives.  ======================= */
1183 
1184 
1185 /* Phase 5: Remove preprocessor lines.  See ECMA-334 section 9.5.
1186    As a side effect, this also removes initial whitespace on every line;
1187    this whitespace doesn't matter.  */
1188 
1189 static int phase5_pushback[10];
1190 static int phase5_pushback_length;
1191 
1192 static int
phase5_getc()1193 phase5_getc ()
1194 {
1195   int c;
1196 
1197   if (phase5_pushback_length)
1198     return phase5_pushback[--phase5_pushback_length];
1199 
1200   c = phase4_getc ();
1201   if (c != UNL)
1202     return c;
1203 
1204   do
1205     c = phase3_getc ();
1206   while (c != UEOF && is_whitespace (c));
1207 
1208   if (c == '#')
1209     {
1210       /* Ignore the entire line containing the preprocessor directive
1211          (including the // comment if it contains one).  */
1212       do
1213         c = phase3_getc ();
1214       while (c != UEOF && c != UNL);
1215       return c;
1216     }
1217   else
1218     {
1219       phase3_ungetc (c);
1220       return UNL;
1221     }
1222 }
1223 
#ifdef unused
/* Hand the character C back to phase 5, so that the next phase5_getc call
   returns it.  UEOF is not stored.  The program is aborted when the
   pushback buffer is already full.  */
static void
phase5_ungetc (int c)
{
  if (c == UEOF)
    return;

  if (phase5_pushback_length == SIZEOF (phase5_pushback))
    abort ();
  phase5_pushback[phase5_pushback_length] = c;
  phase5_pushback_length++;
}
#endif
1236 
1237 
1238 /* ========================== Reading of tokens.  ========================== */
1239 
/* The kinds of tokens that the tokenizer distinguishes.  */
typedef enum token_type_ty
{
  token_type_eof,               /* end of input */
  token_type_lparen,            /* '(' */
  token_type_rparen,            /* ')' */
  token_type_lbrace,            /* '{' */
  token_type_rbrace,            /* '}' */
  token_type_comma,             /* ',' */
  token_type_dot,               /* '.' */
  token_type_string_literal,    /* regular "abc" or verbatim @"abc" string */
  token_type_number,            /* numeric literal such as 1.23 */
  token_type_symbol,            /* identifier, keyword, null */
  token_type_plus,              /* the operator '+' */
  token_type_other              /* character literal, misc. operator */
} token_type_ty;
1256 
1257 typedef struct token_ty token_ty;
1258 struct token_ty
1259 {
1260   token_type_ty type;
1261   char *string;                         /* for token_type_symbol */
1262   mixed_string_ty *mixed_string;        /* for token_type_string_literal */
1263   refcounted_string_list_ty *comment;   /* for token_type_string_literal */
1264   int line_number;
1265   int logical_line_number;
1266 };
1267 
1268 
1269 /* Free the memory pointed to by a 'struct token_ty'.  */
1270 static inline void
free_token(token_ty * tp)1271 free_token (token_ty *tp)
1272 {
1273   if (tp->type == token_type_symbol)
1274     free (tp->string);
1275   if (tp->type == token_type_string_literal)
1276     {
1277       mixed_string_free (tp->mixed_string);
1278       drop_reference (tp->comment);
1279     }
1280 }
1281 
1282 
1283 /* Read a Unicode escape sequence outside string/character literals.
1284    Reject Unicode escapes that don't fulfill the given predicate.
1285    See ECMA-334 section 9.4.2.  */
1286 static int
do_getc_unicode_escaped(bool (* predicate)(int))1287 do_getc_unicode_escaped (bool (*predicate) (int))
1288 {
1289   int c;
1290 
1291   /* Use phase 3, because phase 4 elides comments.  */
1292   c = phase3_getc ();
1293   if (c == UEOF)
1294     return '\\';
1295   if (c == 'u' || c == 'U')
1296     {
1297       unsigned char buf[8];
1298       int expect;
1299       unsigned int n;
1300       int i;
1301 
1302       expect = (c == 'U' ? 8 : 4);
1303       n = 0;
1304       for (i = 0; i < expect; i++)
1305         {
1306           int c1 = phase3_getc ();
1307 
1308           if (c1 >= '0' && c1 <= '9')
1309             n = (n << 4) + (c1 - '0');
1310           else if (c1 >= 'A' && c1 <= 'F')
1311             n = (n << 4) + (c1 - 'A' + 10);
1312           else if (c1 >= 'a' && c1 <= 'f')
1313             n = (n << 4) + (c1 - 'a' + 10);
1314           else
1315             {
1316               phase3_ungetc (c1);
1317               while (--i >= 0)
1318                 phase3_ungetc (buf[i]);
1319               phase3_ungetc (c);
1320               return '\\';
1321             }
1322 
1323           buf[i] = c1;
1324         }
1325 
1326       if (n >= 0x110000)
1327         {
1328           error_with_progname = false;
1329           error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1330                  logical_file_name, line_number);
1331           error_with_progname = true;
1332         }
1333       else if (predicate (n))
1334         return n;
1335 
1336       while (--i >= 0)
1337         phase3_ungetc (buf[i]);
1338     }
1339   phase3_ungetc (c);
1340   return '\\';
1341 }
1342 
1343 
1344 /* Read an escape sequence inside a string literal or character literal.
1345    See ECMA-334 sections 9.4.4.4., 9.4.4.5.  */
1346 static int
do_getc_escaped()1347 do_getc_escaped ()
1348 {
1349   int c;
1350   int n;
1351   int i;
1352 
1353   /* Use phase 3, because phase 4 elides comments.  */
1354   c = phase3_getc ();
1355   if (c == UEOF)
1356     return '\\';
1357   switch (c)
1358     {
1359     case 'a':
1360       return 0x0007;
1361     case 'b':
1362       return 0x0008;
1363     case 't':
1364       return 0x0009;
1365     case 'n':
1366       return 0x000a;
1367     case 'v':
1368       return 0x000b;
1369     case 'f':
1370       return 0x000c;
1371     case 'r':
1372       return 0x000d;
1373     case '"':
1374       return '"';
1375     case '\'':
1376       return '\'';
1377     case '\\':
1378       return '\\';
1379     case '0':
1380       return 0x0000;
1381     case 'x':
1382       c = phase3_getc ();
1383       switch (c)
1384         {
1385         default:
1386           phase3_ungetc (c);
1387           phase3_ungetc ('x');
1388           return '\\';
1389 
1390         case '0': case '1': case '2': case '3': case '4':
1391         case '5': case '6': case '7': case '8': case '9':
1392         case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1393         case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1394           break;
1395         }
1396       n = 0;
1397       for (i = 0;; i++)
1398         {
1399           switch (c)
1400             {
1401             default:
1402               phase3_ungetc (c);
1403               return n;
1404             case '0': case '1': case '2': case '3': case '4':
1405             case '5': case '6': case '7': case '8': case '9':
1406               n = n * 16 + c - '0';
1407               break;
1408             case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1409               n = n * 16 + 10 + c - 'A';
1410               break;
1411             case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1412               n = n * 16 + 10 + c - 'a';
1413               break;
1414             }
1415           if (i == 3)
1416             break;
1417           c = phase3_getc ();
1418         }
1419       return n;
1420     case 'u': case 'U':
1421       phase3_ungetc (c);
1422       return do_getc_unicode_escaped (is_any_character);
1423     default:
1424       /* Invalid escape sequence.  */
1425       phase3_ungetc (c);
1426       return '\\';
1427     }
1428 }
1429 
1430 /* Read a regular string literal or character literal.
1431    See ECMA-334 sections 9.4.4.4., 9.4.4.5.  */
1432 static void
accumulate_escaped(struct mixed_string_buffer * literal,int delimiter)1433 accumulate_escaped (struct mixed_string_buffer *literal, int delimiter)
1434 {
1435   int c;
1436 
1437   for (;;)
1438     {
1439       /* Use phase 3, because phase 4 elides comments.  */
1440       c = phase3_getc ();
1441       if (c == UEOF || c == delimiter)
1442         break;
1443       if (c == UNL)
1444         {
1445           phase3_ungetc (c);
1446           error_with_progname = false;
1447           if (delimiter == '\'')
1448             error (0, 0, _("%s:%d: warning: unterminated character constant"),
1449                    logical_file_name, line_number);
1450           else
1451             error (0, 0, _("%s:%d: warning: unterminated string constant"),
1452                    logical_file_name, line_number);
1453           error_with_progname = true;
1454           break;
1455         }
1456       if (c == '\\')
1457         c = do_getc_escaped ();
1458       if (literal)
1459         mixed_string_buffer_append_unicode (literal, c);
1460     }
1461 }
1462 
1463 
1464 /* Combine characters into tokens.  Discard whitespace.  */
1465 
1466 /* Maximum used guaranteed to be < 4.  */
1467 static token_ty phase6_pushback[4];
1468 static int phase6_pushback_length;
1469 
1470 static void
phase6_get(token_ty * tp)1471 phase6_get (token_ty *tp)
1472 {
1473   int c;
1474 
1475   if (phase6_pushback_length)
1476     {
1477       *tp = phase6_pushback[--phase6_pushback_length];
1478       return;
1479     }
1480   tp->string = NULL;
1481 
1482   for (;;)
1483     {
1484       tp->line_number = line_number;
1485       tp->logical_line_number = logical_line_number;
1486       c = phase5_getc ();
1487 
1488       if (c == UEOF)
1489         {
1490           tp->type = token_type_eof;
1491           return;
1492         }
1493 
1494       switch (c)
1495         {
1496         case UNL:
1497           if (last_non_comment_line > last_comment_line)
1498             savable_comment_reset ();
1499           /* FALLTHROUGH */
1500         case ' ':
1501         case '\t':
1502         case '\f':
1503           /* Ignore whitespace and comments.  */
1504           continue;
1505         }
1506 
1507       last_non_comment_line = tp->logical_line_number;
1508 
1509       switch (c)
1510         {
1511         case '(':
1512           tp->type = token_type_lparen;
1513           return;
1514 
1515         case ')':
1516           tp->type = token_type_rparen;
1517           return;
1518 
1519         case '{':
1520           tp->type = token_type_lbrace;
1521           return;
1522 
1523         case '}':
1524           tp->type = token_type_rbrace;
1525           return;
1526 
1527         case ',':
1528           tp->type = token_type_comma;
1529           return;
1530 
1531         case '.':
1532           c = phase4_getc ();
1533           if (!(c >= '0' && c <= '9'))
1534             {
1535               phase4_ungetc (c);
1536               tp->type = token_type_dot;
1537               return;
1538             }
1539           /* FALLTHROUGH */
1540 
1541         case '0': case '1': case '2': case '3': case '4':
1542         case '5': case '6': case '7': case '8': case '9':
1543           {
1544             /* Don't need to verify the complicated syntax of integers and
1545                floating-point numbers.  We assume a valid C# input.
1546                The simplified syntax that we recognize as number is: any
1547                sequence of alphanumeric characters, additionally '+' and '-'
1548                immediately after 'e' or 'E' except in hexadecimal numbers.  */
1549             bool hexadecimal = false;
1550 
1551             for (;;)
1552               {
1553                 c = phase4_getc ();
1554                 if (c >= '0' && c <= '9')
1555                   continue;
1556                 if ((c >= 'A' && c <= 'Z') || (c >= 'a' &&c <= 'z'))
1557                   {
1558                     if (c == 'X' || c == 'x')
1559                       hexadecimal = true;
1560                     if ((c == 'E' || c == 'e') && !hexadecimal)
1561                       {
1562                         c = phase4_getc ();
1563                         if (!(c == '+' || c == '-'))
1564                           phase4_ungetc (c);
1565                       }
1566                     continue;
1567                   }
1568                 if (c == '.')
1569                   continue;
1570                 break;
1571               }
1572             phase4_ungetc (c);
1573             tp->type = token_type_number;
1574             return;
1575           }
1576 
1577         case '"':
1578           /* Regular string literal.  */
1579           {
1580             struct mixed_string_buffer literal;
1581 
1582             lexical_context = lc_string;
1583             mixed_string_buffer_init (&literal,
1584                                       lexical_context,
1585                                       logical_file_name,
1586                                       logical_line_number);
1587             accumulate_escaped (&literal, '"');
1588             tp->mixed_string = mixed_string_buffer_result (&literal);
1589             tp->comment = add_reference (savable_comment);
1590             lexical_context = lc_outside;
1591             tp->type = token_type_string_literal;
1592             return;
1593           }
1594 
1595         case '\'':
1596           /* Character literal.  */
1597           {
1598             accumulate_escaped (NULL, '\'');
1599             tp->type = token_type_other;
1600             return;
1601           }
1602 
1603         case '+':
1604           c = phase4_getc ();
1605           if (c == '+')
1606             /* Operator ++ */
1607             tp->type = token_type_other;
1608           else if (c == '=')
1609             /* Operator += */
1610             tp->type = token_type_other;
1611           else
1612             {
1613               /* Operator + */
1614               phase4_ungetc (c);
1615               tp->type = token_type_plus;
1616             }
1617           return;
1618 
1619         case '@':
1620           c = phase4_getc ();
1621           if (c == '"')
1622             {
1623               /* Verbatim string literal.  */
1624               struct mixed_string_buffer literal;
1625 
1626               lexical_context = lc_string;
1627               mixed_string_buffer_init (&literal, lexical_context,
1628                                         logical_file_name, logical_line_number);
1629               for (;;)
1630                 {
1631                   /* Use phase 2, because phase 4 elides comments and phase 3
1632                      mixes up the newline characters.  */
1633                   c = phase2_getc ();
1634                   if (c == UEOF)
1635                     break;
1636                   if (c == '"')
1637                     {
1638                       c = phase2_getc ();
1639                       if (c != '"')
1640                         {
1641                           phase2_ungetc (c);
1642                           break;
1643                         }
1644                     }
1645                   /* No special treatment of newline and backslash here.  */
1646                   mixed_string_buffer_append_unicode (&literal, c);
1647                 }
1648               tp->mixed_string = mixed_string_buffer_result (&literal);
1649               tp->comment = add_reference (savable_comment);
1650               lexical_context = lc_outside;
1651               tp->type = token_type_string_literal;
1652               return;
1653             }
1654           /* FALLTHROUGH, so that @identifier is recognized.  */
1655 
1656         default:
1657           if (c == '\\')
1658             c = do_getc_unicode_escaped (is_identifier_start);
1659           if (is_identifier_start (c))
1660             {
1661               struct mixed_string_buffer buffer;
1662               mixed_string_ty *mixed_string;
1663 
1664               mixed_string_buffer_init (&buffer, lexical_context,
1665                                         logical_file_name, logical_line_number);
1666               for (;;)
1667                 {
1668                   mixed_string_buffer_append_unicode (&buffer, c);
1669                   c = phase4_getc ();
1670                   if (c == '\\')
1671                     c = do_getc_unicode_escaped (is_identifier_part);
1672                   if (!is_identifier_part (c))
1673                     break;
1674                 }
1675               phase4_ungetc (c);
1676               mixed_string = mixed_string_buffer_result (&buffer);
1677               tp->string = mixed_string_contents (mixed_string);
1678               mixed_string_free (mixed_string);
1679               tp->type = token_type_symbol;
1680               return;
1681             }
1682           else
1683             {
1684               /* Misc. operator.  */
1685               tp->type = token_type_other;
1686               return;
1687             }
1688         }
1689     }
1690 }
1691 
1692 /* Supports 3 tokens of pushback.  */
1693 static void
phase6_unget(token_ty * tp)1694 phase6_unget (token_ty *tp)
1695 {
1696   if (tp->type != token_type_eof)
1697     {
1698       if (phase6_pushback_length == SIZEOF (phase6_pushback))
1699         abort ();
1700       phase6_pushback[phase6_pushback_length++] = *tp;
1701     }
1702 }
1703 
1704 
1705 /* Compile-time optimization of string literal concatenation.
1706    Combine "string1" + ... + "stringN" to the concatenated string if
1707      - the token after this expression is not '.' (because then the last
1708        string could be part of a method call expression).  */
1709 
1710 static token_ty phase7_pushback[2];
1711 static int phase7_pushback_length;
1712 
1713 static void
phase7_get(token_ty * tp)1714 phase7_get (token_ty *tp)
1715 {
1716   if (phase7_pushback_length)
1717     {
1718       *tp = phase7_pushback[--phase7_pushback_length];
1719       return;
1720     }
1721 
1722   phase6_get (tp);
1723   if (tp->type == token_type_string_literal)
1724     {
1725       mixed_string_ty *sum = tp->mixed_string;
1726 
1727       for (;;)
1728         {
1729           token_ty token2;
1730 
1731           phase6_get (&token2);
1732           if (token2.type == token_type_plus)
1733             {
1734               token_ty token3;
1735 
1736               phase6_get (&token3);
1737               if (token3.type == token_type_string_literal)
1738                 {
1739                   token_ty token_after;
1740 
1741                   phase6_get (&token_after);
1742                   if (token_after.type != token_type_dot)
1743                     {
1744                       sum = mixed_string_concat_free1 (sum, token3.mixed_string);
1745 
1746                       phase6_unget (&token_after);
1747                       free_token (&token3);
1748                       free_token (&token2);
1749                       continue;
1750                     }
1751                   phase6_unget (&token_after);
1752                 }
1753               phase6_unget (&token3);
1754             }
1755           phase6_unget (&token2);
1756           break;
1757         }
1758       tp->mixed_string = sum;
1759     }
1760 }
1761 
1762 /* Supports 2 tokens of pushback.  */
1763 static void
phase7_unget(token_ty * tp)1764 phase7_unget (token_ty *tp)
1765 {
1766   if (tp->type != token_type_eof)
1767     {
1768       if (phase7_pushback_length == SIZEOF (phase7_pushback))
1769         abort ();
1770       phase7_pushback[phase7_pushback_length++] = *tp;
1771     }
1772 }
1773 
1774 
1775 static void
x_csharp_lex(token_ty * tp)1776 x_csharp_lex (token_ty *tp)
1777 {
1778   phase7_get (tp);
1779 }
1780 
1781 /* Supports 2 tokens of pushback.  */
1782 static void
x_csharp_unlex(token_ty * tp)1783 x_csharp_unlex (token_ty *tp)
1784 {
1785   phase7_unget (tp);
1786 }
1787 
1788 
1789 /* ========================= Extracting strings.  ========================== */
1790 
1791 
1792 /* Context lookup table.  */
1793 static flag_context_list_table_ty *flag_context_list_table;
1794 
1795 
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid C# program, and leave the complaints about
   the grammar to the compiler.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */
1809 
1810 
1811 /* Extract messages until the next balanced closing parenthesis or brace,
1812    depending on TERMINATOR.
1813    Extracted messages are added to MLP.
1814    Return true upon eof, false upon closing parenthesis or brace.  */
1815 static bool
extract_parenthesized(message_list_ty * mlp,token_type_ty terminator,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1816 extract_parenthesized (message_list_ty *mlp, token_type_ty terminator,
1817                        flag_context_ty outer_context,
1818                        flag_context_list_iterator_ty context_iter,
1819                        struct arglist_parser *argparser)
1820 {
1821   /* Current argument number.  */
1822   int arg = 1;
1823   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1824   int state;
1825   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1826   const struct callshapes *next_shapes = NULL;
1827   /* Context iterator that will be used if the next token is a '('.  */
1828   flag_context_list_iterator_ty next_context_iter =
1829     passthrough_context_list_iterator;
1830   /* Current context.  */
1831   flag_context_ty inner_context =
1832     inherited_context (outer_context,
1833                        flag_context_list_iterator_advance (&context_iter));
1834 
1835   /* Start state is 0.  */
1836   state = 0;
1837 
1838   for (;;)
1839     {
1840       token_ty token;
1841 
1842       x_csharp_lex (&token);
1843       switch (token.type)
1844         {
1845         case token_type_symbol:
1846           {
1847             /* Combine symbol1 . ... . symbolN to a single strings, so that
1848                we can recognize static function calls like
1849                GettextResource.gettext.  The information present for
1850                symbolI.....symbolN has precedence over the information for
1851                symbolJ.....symbolN with J > I.  */
1852             char *sum = token.string;
1853             size_t sum_len = strlen (sum);
1854             const char *dottedname;
1855             flag_context_list_ty *context_list;
1856 
1857             for (;;)
1858               {
1859                 token_ty token2;
1860 
1861                 x_csharp_lex (&token2);
1862                 if (token2.type == token_type_dot)
1863                   {
1864                     token_ty token3;
1865 
1866                     x_csharp_lex (&token3);
1867                     if (token3.type == token_type_symbol)
1868                       {
1869                         char *addend = token3.string;
1870                         size_t addend_len = strlen (addend);
1871 
1872                         sum =
1873                           (char *) xrealloc (sum, sum_len + 1 + addend_len + 1);
1874                         sum[sum_len] = '.';
1875                         memcpy (sum + sum_len + 1, addend, addend_len + 1);
1876                         sum_len += 1 + addend_len;
1877 
1878                         free_token (&token3);
1879                         free_token (&token2);
1880                         continue;
1881                       }
1882                     x_csharp_unlex (&token3);
1883                   }
1884                 x_csharp_unlex (&token2);
1885                 break;
1886               }
1887 
1888             for (dottedname = sum;;)
1889               {
1890                 void *keyword_value;
1891 
1892                 if (hash_find_entry (&keywords, dottedname, strlen (dottedname),
1893                                      &keyword_value)
1894                     == 0)
1895                   {
1896                     next_shapes = (const struct callshapes *) keyword_value;
1897                     state = 1;
1898                     break;
1899                   }
1900 
1901                 dottedname = strchr (dottedname, '.');
1902                 if (dottedname == NULL)
1903                   {
1904                     state = 0;
1905                     break;
1906                   }
1907                 dottedname++;
1908               }
1909 
1910             for (dottedname = sum;;)
1911               {
1912                 context_list =
1913                   flag_context_list_table_lookup (
1914                     flag_context_list_table,
1915                     dottedname, strlen (dottedname));
1916                 if (context_list != NULL)
1917                   break;
1918 
1919                 dottedname = strchr (dottedname, '.');
1920                 if (dottedname == NULL)
1921                   break;
1922                 dottedname++;
1923               }
1924             next_context_iter = flag_context_list_iterator (context_list);
1925 
1926             free (sum);
1927             continue;
1928           }
1929 
1930         case token_type_lparen:
1931           if (extract_parenthesized (mlp, token_type_rparen,
1932                                      inner_context, next_context_iter,
1933                                      arglist_parser_alloc (mlp,
1934                                                            state ? next_shapes : NULL)))
1935             {
1936               arglist_parser_done (argparser, arg);
1937               return true;
1938             }
1939           next_context_iter = null_context_list_iterator;
1940           state = 0;
1941           continue;
1942 
1943         case token_type_rparen:
1944           if (terminator == token_type_rparen)
1945             {
1946               arglist_parser_done (argparser, arg);
1947               return false;
1948             }
1949           if (terminator == token_type_rbrace)
1950             {
1951               error_with_progname = false;
1952               error (0, 0,
1953                      _("%s:%d: warning: ')' found where '}' was expected"),
1954                      logical_file_name, token.line_number);
1955               error_with_progname = true;
1956             }
1957           next_context_iter = null_context_list_iterator;
1958           state = 0;
1959           continue;
1960 
1961         case token_type_lbrace:
1962           if (extract_parenthesized (mlp, token_type_rbrace,
1963                                      null_context, null_context_list_iterator,
1964                                      arglist_parser_alloc (mlp, NULL)))
1965             {
1966               arglist_parser_done (argparser, arg);
1967               return true;
1968             }
1969           next_context_iter = null_context_list_iterator;
1970           state = 0;
1971           continue;
1972 
1973         case token_type_rbrace:
1974           if (terminator == token_type_rbrace)
1975             {
1976               arglist_parser_done (argparser, arg);
1977               return false;
1978             }
1979           if (terminator == token_type_rparen)
1980             {
1981               error_with_progname = false;
1982               error (0, 0,
1983                      _("%s:%d: warning: '}' found where ')' was expected"),
1984                      logical_file_name, token.line_number);
1985               error_with_progname = true;
1986             }
1987           next_context_iter = null_context_list_iterator;
1988           state = 0;
1989           continue;
1990 
1991         case token_type_comma:
1992           arg++;
1993           inner_context =
1994             inherited_context (outer_context,
1995                                flag_context_list_iterator_advance (
1996                                  &context_iter));
1997           next_context_iter = passthrough_context_list_iterator;
1998           state = 0;
1999           continue;
2000 
2001         case token_type_string_literal:
2002           {
2003             lex_pos_ty pos;
2004 
2005             pos.file_name = logical_file_name;
2006             pos.line_number = token.line_number;
2007 
2008             if (extract_all)
2009               {
2010                 char *string = mixed_string_contents (token.mixed_string);
2011                 mixed_string_free (token.mixed_string);
2012                 remember_a_message (mlp, NULL, string, true, false,
2013                                     inner_context, &pos,
2014                                     NULL, token.comment, true);
2015               }
2016             else
2017               arglist_parser_remember (argparser, arg, token.mixed_string,
2018                                        inner_context,
2019                                        pos.file_name, pos.line_number,
2020                                        token.comment, true);
2021           }
2022           drop_reference (token.comment);
2023           next_context_iter = null_context_list_iterator;
2024           state = 0;
2025           continue;
2026 
2027         case token_type_eof:
2028           arglist_parser_done (argparser, arg);
2029           return true;
2030 
2031         case token_type_dot:
2032         case token_type_number:
2033         case token_type_plus:
2034         case token_type_other:
2035           next_context_iter = null_context_list_iterator;
2036           state = 0;
2037           continue;
2038 
2039         default:
2040           abort ();
2041         }
2042     }
2043 }
2044 
2045 
2046 void
extract_csharp(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)2047 extract_csharp (FILE *f,
2048                 const char *real_filename, const char *logical_filename,
2049                 flag_context_list_table_ty *flag_table,
2050                 msgdomain_list_ty *mdlp)
2051 {
2052   message_list_ty *mlp = mdlp->item[0]->messages;
2053 
2054   fp = f;
2055   real_file_name = real_filename;
2056   logical_file_name = xstrdup (logical_filename);
2057   line_number = 1;
2058 
2059   phase1_pushback_length = 0;
2060 
2061   lexical_context = lc_outside;
2062 
2063   phase2_pushback_length = 0;
2064 
2065   logical_line_number = 1;
2066 
2067   phase3_pushback_length = 0;
2068 
2069   last_comment_line = -1;
2070   last_non_comment_line = -1;
2071 
2072   phase5_pushback_length = 0;
2073   phase6_pushback_length = 0;
2074   phase7_pushback_length = 0;
2075 
2076   flag_context_list_table = flag_table;
2077 
2078   init_keywords ();
2079 
2080   /* Eat tokens until eof is seen.  When extract_parenthesized returns
2081      due to an unbalanced closing parenthesis, just restart it.  */
2082   while (!extract_parenthesized (mlp, token_type_eof,
2083                                  null_context, null_context_list_iterator,
2084                                  arglist_parser_alloc (mlp, NULL)))
2085     ;
2086 
2087   fp = NULL;
2088   real_file_name = NULL;
2089   logical_file_name = NULL;
2090   line_number = 0;
2091 }
2092