• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /***************************************************
2 * A program for testing the Unicode property table *
3 ***************************************************/
4 
5 /* Copyright (c) University of Cambridge 2008-2022 */
6 
7 /* Compile thus:
8 
9    gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
10      ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
11 
12    Add -lreadline or -ledit if PCRE2 was configured with readline or libedit
13    support in pcre2test.
14 */
15 
16 /* This is a hacked-up program for testing the Unicode properties tables of
17 PCRE2. It can also be used for finding characters with certain properties. I
18 wrote it to help with debugging, and have added things that I found useful, in
19 a rather haphazard way. The code has never been seriously tidied or checked for
20 robustness, but it shouldn't now give compiler warnings.
21 
22 There is only one option: "-s". If given, it applies only to the "findprop"
23 command. It causes the UTF-8 sequence of bytes that encode the character to be
24 output between angle brackets at the end of the line. On a UTF-8 terminal, this
25 will show the appropriate graphic for the code point.
26 
27 If the command has arguments, they are concatenated into a buffer, separated by
28 spaces. If the first argument starts "U+" or consists entirely of hexadecimal
29 digits, "findprop" is inserted at the start. The buffer is then processed as a
30 single line file, after which the program exits. If there are no arguments, the
31 program reads commands line by line on stdin and writes output to stdout. The
32 return code is always zero.
33 
34 There are three commands:
35 
36 The command "findprop" must be followed by a space-separated list of Unicode
37 code points as hex numbers, either without any prefix or starting with "U+", or
38 as individual UTF-8 characters preceded by '+'. For example:
39 
40   findprop U+1234 5Abc +?
41 
42 The output is one long line per character, listing Unicode properties that have
43 values, followed by its other case or cases if one or more exist, followed by
44 its Script Extension list if there is one. This list is in square brackets. A
45 second list in square brackets gives all the Boolean properties of the
46 character. The properties that come first are:
47 
48   Bidi class          e.g. NSM (most common is L)
49   General type        e.g. Letter
50   Specific type       e.g. Upper case letter
51   Script              e.g. Medefaidrin
52   Grapheme break type e.g. Extend (most common is Other)
53 
54 Script names and Boolean property names are all in lower case, with underscores
55 and hyphens removed, because that's how they are stored for "loose" matching.
56 
57 The command "find" must be followed by a list of property types and their
58 values. The values are case-sensitive, except for bidi class. This finds
59 characters that have those properties. If multiple properties are listed, they
60 must all be matched. Currently supported:
61 
62   script <name>    The character must have this script property. Only one
63                      such script may be given.
64   scriptx <name>   This script must be in the character's Script Extension
65                      property list. If this is used many times, all the given
66                      scripts must be present.
67   type <abbrev>    The character's specific type (e.g. Lu or Nd) must match.
68   gbreak <name>    The grapheme break property must match.
69   bidi <class>     The character's bidi class must match.
70   bool <name>      The character's Boolean property list must contain this
71                      property.
72 
73 If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
74 Script Extensions and Boolean properties, there may be a mixture of positive
75 and negative requirements. All must be satisfied.
76 
77 Sequences of two or more characters are shown as ranges, for example
78 U+0041..U+004A. No more than 100 lines are output. If there are more
79 characters, the list ends with ...
80 
81 The command "list" must be followed by one of property names script, bool,
82 type, gbreak or bidi. The defined values for that property are listed. */
83 
84 
85 #ifdef HAVE_CONFIG_H
86 #include "../src/config.h"
87 #endif
88 
89 #ifndef SUPPORT_UNICODE
90 #define SUPPORT_UNICODE
91 #endif
92 
93 #include <ctype.h>
94 #include <stdio.h>
95 #include <stdlib.h>
96 #include <string.h>
97 #include "../src/pcre2_internal.h"
98 #include "../src/pcre2_ucp.h"
99 
100 #ifdef HAVE_UNISTD_H
101 #include <unistd.h>
102 #endif
103 
104 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
105 #if defined(SUPPORT_LIBREADLINE)
106 #include <readline/readline.h>
107 #include <readline/history.h>
108 #else
109 #if defined(HAVE_EDITLINE_READLINE_H)
110 #include <editline/readline.h>
111 #else
112 #include <readline/readline.h>
113 #ifdef RL_VERSION_MAJOR
114 #include <readline/history.h>
115 #endif
116 #endif
117 #endif
118 #endif
119 
120 
121 /* -------------------------------------------------------------------*/
122 
/* Cast shorthands for switching between plain and unsigned character
pointers. Each one expands to a bare cast and is written directly in front of
its operand, for example CS name or US"text". */

/* Casts to (const) char pointers */
#define CS   (char *)
#define CCS  (const char *)
#define CSS  (char **)

/* Casts to (const) unsigned char pointers */
#define US   (unsigned char *)
#define CUS  (const unsigned char *)
#define USS  (unsigned char **)
129 
130 /* -------------------------------------------------------------------*/
131 
132 static BOOL show_character = FALSE;
133 
134 static const unsigned char *type_names[] = {
135   US"Cc", US"Control",
136   US"Cf", US"Format",
137   US"Cn", US"Unassigned",
138   US"Co", US"Private use",
139   US"Cs", US"Surrogate",
140   US"Ll", US"Lower case letter",
141   US"Lm", US"Modifier letter",
142   US"Lo", US"Other letter",
143   US"Lt", US"Title case letter",
144   US"Lu", US"Upper case letter",
145   US"Mc", US"Spacing mark",
146   US"Me", US"Enclosing mark",
147   US"Mn", US"Non-spacing mark",
148   US"Nd", US"Decimal number",
149   US"Nl", US"Letter number",
150   US"No", US"Other number",
151   US"Pc", US"Connector punctuation",
152   US"Pd", US"Dash punctuation",
153   US"Pe", US"Close punctuation",
154   US"Pf", US"Final punctuation",
155   US"Pi", US"Initial punctuation",
156   US"Po", US"Other punctuation",
157   US"Ps", US"Open punctuation",
158   US"Sc", US"Currency symbol",
159   US"Sk", US"Modifier symbol",
160   US"Sm", US"Mathematical symbol",
161   US"So", US"Other symbol",
162   US"Zl", US"Line separator",
163   US"Zp", US"Paragraph separator",
164   US"Zs", US"Space separator"
165 };
166 
167 static const unsigned char *gb_names[] = {
168   US"CR",                    US"carriage return",
169   US"LF",                    US"linefeed",
170   US"Control",               US"",
171   US"Extend",                US"",
172   US"Prepend",               US"",
173   US"SpacingMark",           US"",
174   US"L",                     US"Hangul syllable type L",
175   US"V",                     US"Hangul syllable type V",
176   US"T",                     US"Hangul syllable type T",
177   US"LV",                    US"Hangul syllable type LV",
178   US"LVT",                   US"Hangul syllable type LVT",
179   US"Regional_Indicator",    US"",
180   US"Other",                 US"",
181   US"ZWJ",                   US"zero width joiner",
182   US"Extended_Pictographic", US""
183 };
184 
185 static const unsigned char *bd_names[] = {
186   US"AL",   US"Arabic letter",
187   US"AN",   US"Arabid number",
188   US"B",    US"Paragraph separator",
189   US"BN",   US"Boundary neutral",
190   US"CS",   US"Common separator",
191   US"EN",   US"European number",
192   US"ES",   US"European separator",
193   US"ET",   US"European terminator",
194   US"FSI",  US"First string isolate",
195   US"L",    US"Left-to-right",
196   US"LRE",  US"Left-to-right embedding",
197   US"LRI",  US"Left-to-right isolate",
198   US"LRO",  US"Left-to-right override",
199   US"NSM",  US"Non-spacing mark",
200   US"ON",   US"Other neutral",
201   US"PDF",  US"Pop directional format",
202   US"PDI",  US"Pop directional isolate",
203   US"R",    US"Right-to-left",
204   US"RLE",  US"Right-to-left embedding",
205   US"RLI",  US"Right-to-left isolate",
206   US"RLO",  US"Right-to-left override",
207   US"S",    US"Segment separator",
208   US"WS",   US"White space"
209 };
210 
/* Largest value that can be encoded with 0, 1, ... 5 additional bytes after
the first byte of a UTF-8 sequence. */

static const unsigned int utf8_table1[] = {
  0x0000007f,
  0x000007ff,
  0x0000ffff,
  0x001fffff,
  0x03ffffff,
  0x7fffffff
};

/* Marker bits that are ORed into the first byte of a sequence, indexed by the
number of additional bytes. */

static const int utf8_table2[] = {
  0,
  0xc0,
  0xe0,
  0xf0,
  0xf8,
  0xfc
};
216 
/* Finish decoding a multi-byte UTF-8 character. On entry c must hold the
first byte of the sequence and eptr must point at the byte that follows it. On
exit c holds the decoded value and eptr has been moved past the last byte that
was used. The bits of the first byte select how many following bytes (1 to 5)
are read; those bytes are not checked in any way. Both arguments are evaluated
several times, so they must be simple variables.

NOTE(review): the headers included above may define a macro with this name; if
so, the sequence of tokens here has to stay identical to that definition.
Confirm before changing anything other than the layout. */

#define GETUTF8INC(c, eptr) \
  { \
  if ((c & 0x20u) == 0) \
    c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \
  else if ((c & 0x10u) == 0) \
    { \
    c = ((c & 0x0fu) << 12) | \
        ((*eptr & 0x3fu) << 6) | \
        (eptr[1] & 0x3fu); \
    eptr += 2; \
    } \
  else if ((c & 0x08u) == 0) \
    { \
    c = ((c & 0x07u) << 18) | \
        ((*eptr & 0x3fu) << 12) | \
        ((eptr[1] & 0x3fu) << 6) | \
        (eptr[2] & 0x3fu); \
    eptr += 3; \
    } \
  else if ((c & 0x04u) == 0) \
    { \
    c = ((c & 0x03u) << 24) | \
        ((*eptr & 0x3fu) << 18) | \
        ((eptr[1] & 0x3fu) << 12) | \
        ((eptr[2] & 0x3fu) << 6) | \
        (eptr[3] & 0x3fu); \
    eptr += 4; \
    } \
  else \
    { \
    c = ((c & 0x01u) << 30) | \
        ((*eptr & 0x3fu) << 24) | \
        ((eptr[1] & 0x3fu) << 18) | \
        ((eptr[2] & 0x3fu) << 12) | \
        ((eptr[3] & 0x3fu) << 6) | \
        (eptr[4] & 0x3fu); \
    eptr += 5; \
    } \
  }
250 
251 
252 
253 /*************************************************
254 *       Convert character value to UTF-8         *
255 *************************************************/
256 
257 /* This function takes an unsigned long integer value in the range 0 -
258 0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes.
259 
260 Arguments:
261   cvalue     the character value
262   buffer     pointer to buffer for result - at least 6 bytes long
263 
264 Returns:     number of bytes placed in the buffer
265              0 if input code point is too big
266 */
267 
268 static size_t
ord2utf8(unsigned int cvalue,unsigned char * buffer)269 ord2utf8(unsigned int cvalue, unsigned char *buffer)
270 {
271 size_t i, j;
272 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
273   if (cvalue <= utf8_table1[i]) break;
274 if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
275 buffer += i;
276 for (j = i; j > 0; j--)
277  {
278  *buffer-- = 0x80 | (cvalue & 0x3f);
279  cvalue >>= 6;
280  }
281 *buffer = utf8_table2[i] | cvalue;
282 return i + 1;
283 }
284 
285 
286 
287 /*************************************************
288 *             Test for interaction               *
289 *************************************************/
290 
291 static BOOL
is_stdin_tty(void)292 is_stdin_tty(void)
293 {
294 #if defined WIN32
295 return _isatty(_fileno(stdin));
296 #else
297 return isatty(fileno(stdin));
298 #endif
299 }
300 
301 
302 /*************************************************
303 *            Get  name from ucp ident            *
304 *************************************************/
305 
306 /* The utt table contains both full names and abbreviations. So search for both
307 and use the longer if two are found, unless the first one is only 3 characters
308 and we are looking for a script (some scripts have 3-character names). If this
309 were not just a test program it might be worth making some kind of reverse
310 index. */
311 
312 static const char *
get_propname(int prop,int type)313 get_propname(int prop, int type)
314 {
315 size_t i, j, len;
316 size_t foundlist[2];
317 const char *yield;
318 int typex = (type == PT_SC)? PT_SCX : type;
319 
320 j = 0;
321 for (i = 0; i < PRIV(utt_size); i++)
322   {
323   const ucp_type_table *u = PRIV(utt) + i;
324   if ((u->type == type || u->type == typex) && u->value == prop)
325     {
326     foundlist[j++] = i;
327     if (j >= 2) break;
328     }
329   }
330 
331 if (j == 0) return "??";
332 
333 yield = NULL;
334 len = 0;
335 
336 for (i = 0; i < j; i++)
337   {
338   const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset;
339   size_t sl = strlen(s);
340 
341   if (sl > len)
342     {
343     yield = s;
344     if (sl == 3 && type == PT_SC) break;
345     len = sl;
346     }
347   }
348 
349 return yield;
350 }
351 
352 
353 /*************************************************
354 *      Print Unicode property info for a char    *
355 *************************************************/
356 
357 static void
print_prop(unsigned int c,BOOL is_just_one)358 print_prop(unsigned int c, BOOL is_just_one)
359 {
360 int type = UCD_CATEGORY(c);
361 int fulltype = UCD_CHARTYPE(c);
362 int script = UCD_SCRIPT(c);
363 int scriptx = UCD_SCRIPTX(c);
364 int gbprop = UCD_GRAPHBREAK(c);
365 int bidi = UCD_BIDICLASS(c);
366 unsigned int othercase = UCD_OTHERCASE(c);
367 int caseset = UCD_CASESET(c);
368 int bprops = UCD_BPROPS(c);
369 
370 const unsigned char *fulltypename = US"??";
371 const unsigned char *typename = US"??";
372 const unsigned char *graphbreak = US"??";
373 const unsigned char *bidiclass = US"??";
374 const unsigned char *scriptname = CUS get_propname(script, PT_SC);
375 
376 switch (type)
377   {
378   case ucp_C: typename = US"Control"; break;
379   case ucp_L: typename = US"Letter"; break;
380   case ucp_M: typename = US"Mark"; break;
381   case ucp_N: typename = US"Number"; break;
382   case ucp_P: typename = US"Punctuation"; break;
383   case ucp_S: typename = US"Symbol"; break;
384   case ucp_Z: typename = US"Separator"; break;
385   }
386 
387 switch (fulltype)
388   {
389   case ucp_Cc: fulltypename = US"Control"; break;
390   case ucp_Cf: fulltypename = US"Format"; break;
391   case ucp_Cn: fulltypename = US"Unassigned"; break;
392   case ucp_Co: fulltypename = US"Private use"; break;
393   case ucp_Cs: fulltypename = US"Surrogate"; break;
394   case ucp_Ll: fulltypename = US"Lower case letter"; break;
395   case ucp_Lm: fulltypename = US"Modifier letter"; break;
396   case ucp_Lo: fulltypename = US"Other letter"; break;
397   case ucp_Lt: fulltypename = US"Title case letter"; break;
398   case ucp_Lu: fulltypename = US"Upper case letter"; break;
399   case ucp_Mc: fulltypename = US"Spacing mark"; break;
400   case ucp_Me: fulltypename = US"Enclosing mark"; break;
401   case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
402   case ucp_Nd: fulltypename = US"Decimal number"; break;
403   case ucp_Nl: fulltypename = US"Letter number"; break;
404   case ucp_No: fulltypename = US"Other number"; break;
405   case ucp_Pc: fulltypename = US"Connector punctuation"; break;
406   case ucp_Pd: fulltypename = US"Dash punctuation"; break;
407   case ucp_Pe: fulltypename = US"Close punctuation"; break;
408   case ucp_Pf: fulltypename = US"Final punctuation"; break;
409   case ucp_Pi: fulltypename = US"Initial punctuation"; break;
410   case ucp_Po: fulltypename = US"Other punctuation"; break;
411   case ucp_Ps: fulltypename = US"Open punctuation"; break;
412   case ucp_Sc: fulltypename = US"Currency symbol"; break;
413   case ucp_Sk: fulltypename = US"Modifier symbol"; break;
414   case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
415   case ucp_So: fulltypename = US"Other symbol"; break;
416   case ucp_Zl: fulltypename = US"Line separator"; break;
417   case ucp_Zp: fulltypename = US"Paragraph separator"; break;
418   case ucp_Zs: fulltypename = US"Space separator"; break;
419   }
420 
421 switch(gbprop)
422   {
423   case ucp_gbCR:           graphbreak = US"CR"; break;
424   case ucp_gbLF:           graphbreak = US"LF"; break;
425   case ucp_gbControl:      graphbreak = US"Control"; break;
426   case ucp_gbExtend:       graphbreak = US"Extend"; break;
427   case ucp_gbPrepend:      graphbreak = US"Prepend"; break;
428   case ucp_gbSpacingMark:  graphbreak = US"SpacingMark"; break;
429   case ucp_gbL:            graphbreak = US"Hangul syllable type L"; break;
430   case ucp_gbV:            graphbreak = US"Hangul syllable type V"; break;
431   case ucp_gbT:            graphbreak = US"Hangul syllable type T"; break;
432   case ucp_gbLV:           graphbreak = US"Hangul syllable type LV"; break;
433   case ucp_gbLVT:          graphbreak = US"Hangul syllable type LVT"; break;
434   case ucp_gbRegional_Indicator:
435                            graphbreak = US"Regional Indicator"; break;
436   case ucp_gbOther:        graphbreak = US"Other"; break;
437   case ucp_gbZWJ:          graphbreak = US"Zero Width Joiner"; break;
438   case ucp_gbExtended_Pictographic:
439                            graphbreak = US"Extended Pictographic"; break;
440   default:                 graphbreak = US"Unknown"; break;
441   }
442 
443 switch(bidi)
444   {
445   case ucp_bidiAL:   bidiclass = US"AL "; break;
446   case ucp_bidiFSI:  bidiclass = US"FSI"; break;
447   case ucp_bidiL:    bidiclass = US"L  "; break;
448   case ucp_bidiLRE:  bidiclass = US"LRE"; break;
449   case ucp_bidiLRI:  bidiclass = US"LRI"; break;
450   case ucp_bidiLRO:  bidiclass = US"LRO"; break;
451   case ucp_bidiPDF:  bidiclass = US"PDF"; break;
452   case ucp_bidiPDI:  bidiclass = US"PDI"; break;
453   case ucp_bidiR:    bidiclass = US"R  "; break;
454   case ucp_bidiRLE:  bidiclass = US"RLE"; break;
455   case ucp_bidiRLI:  bidiclass = US"RLI"; break;
456   case ucp_bidiRLO:  bidiclass = US"RLO"; break;
457   case ucp_bidiAN:   bidiclass = US"AN "; break;
458   case ucp_bidiB:    bidiclass = US"B  "; break;
459   case ucp_bidiBN:   bidiclass = US"BN "; break;
460   case ucp_bidiCS:   bidiclass = US"CS "; break;
461   case ucp_bidiEN:   bidiclass = US"EN "; break;
462   case ucp_bidiES:   bidiclass = US"ES "; break;
463   case ucp_bidiET:   bidiclass = US"ET "; break;
464   case ucp_bidiNSM:  bidiclass = US"NSM"; break;
465   case ucp_bidiON:   bidiclass = US"ON "; break;
466   case ucp_bidiS:    bidiclass = US"S  "; break;
467   case ucp_bidiWS:   bidiclass = US"WS "; break;
468   default:           bidiclass = US"???"; break;
469   }
470 
471 printf("U+%04X %s %s: %s, %s, %s", c, bidiclass, typename, fulltypename,
472   scriptname, graphbreak);
473 
474 if (is_just_one && othercase != c)
475   {
476   printf(", U+%04X", othercase);
477   if (caseset != 0)
478     {
479     const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
480     while (*(++p) < NOTACHAR)
481       {
482       unsigned int d = *p;
483       if (d != othercase && d != c) printf(", U+%04X", d);
484       }
485     }
486   }
487 
488 if (scriptx != 0)
489   {
490   const char *sep = "";
491   const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
492   printf(", [");
493   for (int i = 0; i < ucp_Unknown; i++)
494   if (MAPBIT(p, i) != 0)
495     {
496     printf("%s%s", sep, get_propname(i, PT_SC));
497     sep = ", ";
498     }
499   printf("]");
500   }
501 
502 if (bprops != 0)
503   {
504   const char *sep = "";
505   const uint32_t *p = PRIV(ucd_boolprop_sets) +
506     bprops * ucd_boolprop_sets_item_size;
507   printf(", [");
508   for (int i = 0; i < ucp_Bprop_Count; i++)
509   if (MAPBIT(p, i) != 0)
510     {
511     printf("%s%s", sep, get_propname(i, PT_BOOL));
512     sep = ", ";
513     }
514   printf("]");
515   }
516 
517 if (show_character && is_just_one)
518   {
519   unsigned char buffer[8];
520   size_t len = ord2utf8(c, buffer);
521   printf(", >%.*s<", (int)len, buffer);
522   }
523 
524 printf("\n");
525 }
526 
527 
528 
529 /*************************************************
530 *   Find character(s) with given property/ies    *
531 *************************************************/
532 
533 static void
find_chars(unsigned char * s)534 find_chars(unsigned char *s)
535 {
536 unsigned char name[128];
537 unsigned char value[128];
538 unsigned char *t;
539 unsigned int count= 0;
540 int scriptx_list[128];
541 unsigned int scriptx_count = 0;
542 int bprop_list[128];
543 unsigned int bprop_count = 0;
544 uint32_t i, c;
545 int script = -1;
546 int type = -1;
547 int gbreak = -1;
548 int bidiclass = -1;
549 BOOL bidicontrol = FALSE;
550 BOOL script_not = FALSE;
551 BOOL type_not = FALSE;
552 BOOL gbreak_not = FALSE;
553 BOOL bidiclass_not = FALSE;
554 BOOL hadrange = FALSE;
555 const ucd_record *ucd, *next_ucd;
556 const char *pad = "        ";
557 
558 while (*s != 0)
559   {
560   unsigned int offset = 0;
561   BOOL scriptx_not = FALSE;
562   char *value_start;
563 
564   for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
565   *t = 0;
566   while (isspace(*s)) s++;
567   value_start = s;
568 
569   for (t = value; *s != 0 && !isspace(*s); s++)
570     {
571     if (*s != '_' && *s != '-') *t++ = *s;
572     }
573   *t = 0;
574   while (isspace(*s)) s++;
575 
576   if (strcmp(CS name, "script") == 0 ||
577       strcmp(CS name, "scriptx") == 0)
578     {
579     for (t = value; *t != 0; t++) *t = tolower(*t);
580 
581     if (value[0] == '!')
582       {
583       if (name[6] == 'x') scriptx_not = TRUE;
584         else script_not = TRUE;
585       offset = 1;
586       }
587 
588     for (i = 0; i < PRIV(utt_size); i++)
589       {
590       const ucp_type_table *u = PRIV(utt) + i;
591       if ((u->type == PT_SCX || u->type == PT_SC) && strcmp(CS(value + offset),
592             PRIV(utt_names) + u->name_offset) == 0)
593         {
594         c = u->value;
595         if (name[6] == 'x')
596           {
597           scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
598           }
599         else
600           {
601           if (script < 0) script = c; else
602             {
603             printf("** Only 1 script value allowed\n");
604             return;
605             }
606           }
607         break;
608         }
609       }
610 
611     if (i >= PRIV(utt_size))
612       {
613       printf("** Unrecognized script name \"%s\"\n", value);
614       return;
615       }
616     }
617 
618   else if (strcmp(CS name, "bool") == 0)
619     {
620     int not = 1;
621     if (value[0] == '!')
622       {
623       not = -1;
624       offset = 1;
625       }
626 
627     for (i = 0; i < PRIV(utt_size); i++)
628       {
629       const ucp_type_table *u = PRIV(utt) + i;
630       if (u->type == PT_BOOL && strcmp(CS(value + offset),
631             PRIV(utt_names) + u->name_offset) == 0)
632         {
633         bprop_list[bprop_count++] = u->value * not;
634         break;
635         }
636       }
637 
638     if (i >= PRIV(utt_size))
639       {
640       printf("** Unrecognized property name \"%s\"\n", value);
641       return;
642       }
643     }
644 
645   else if (strcmp(CS name, "type") == 0)
646     {
647     if (type >= 0)
648       {
649       printf("** Only 1 type value allowed\n");
650       return;
651       }
652     else
653       {
654       if (value[0] == '!')
655         {
656         type_not = TRUE;
657         offset = 1;
658         }
659 
660       for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
661         {
662         if (strcmp(CS (value + offset), CS type_names[i]) == 0)
663           {
664           type = i/2;
665           break;
666           }
667         }
668       if (i >= sizeof(type_names)/sizeof(char *))
669         {
670         printf("** Unrecognized type name \"%s\"\n", value);
671         return;
672         }
673       }
674     }
675 
676   else if (strcmp(CS name, "gbreak") == 0)
677     {
678     if (gbreak >= 0)
679       {
680       printf("** Only 1 grapheme break value allowed\n");
681       return;
682       }
683     else
684       {
685       if (value[0] == '!')
686         {
687         gbreak_not = TRUE;
688         offset = 1;
689         }
690 
691       for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
692         {
693         if (strcmp(CS (value + offset), CS gb_names[i]) == 0)
694           {
695           gbreak = i/2;
696           break;
697           }
698         }
699       if (i >= sizeof(gb_names)/sizeof(char *))
700         {
701         printf("** Unrecognized gbreak name \"%s\"\n", value);
702         return;
703         }
704       }
705     }
706 
707   else if (strcmp(CS name, "bidi") == 0 ||
708            strcmp(CS name, "bidiclass") == 0 ||
709            strcmp(CS name, "bidi_class") == 0 )
710     {
711     if (bidiclass >= 0)
712       {
713       printf("** Only 1 bidi class value allowed\n");
714       return;
715       }
716     else
717       {
718       if (value[0] == '!')
719         {
720         bidiclass_not = TRUE;
721         offset = 1;
722         }
723       for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
724         {
725         if (strcasecmp(CS (value + offset), CS bd_names[i]) == 0)
726           {
727           bidiclass = i/2;
728           break;
729           }
730         }
731       if (i >= sizeof(bd_names)/sizeof(char *))
732         {
733         printf("** Unrecognized bidi class name \"%s\"\n", value);
734         return;
735         }
736       }
737     }
738 
739   else
740     {
741     printf("** Unrecognized property name \"%s\"\n", name);
742     return;
743     }
744   }
745 
746 if (script < 0 && scriptx_count == 0 && bprop_count == 0 && type < 0 &&
747     gbreak < 0 && bidiclass < 0)
748   {
749   printf("** No properties specified\n");
750   return;
751   }
752 
753 for (c = 0; c <= 0x10ffff; c++)
754   {
755   if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;
756 
757   if (scriptx_count > 0)
758     {
759     const uint32_t *bits_scriptx = PRIV(ucd_script_sets) + UCD_SCRIPTX(c);
760     unsigned int found = 0;
761 
762     for (i = 0; i < scriptx_count; i++)
763       {
764       int x = scriptx_list[i]/32;
765       int y = scriptx_list[i]%32;
766 
767       /* Positive requirment */
768       if (scriptx_list[i] >= 0)
769         {
770         if ((bits_scriptx[x] & (1u<<y)) != 0) found++;
771         }
772       /* Negative requirement */
773       else
774         {
775         if ((bits_scriptx[x] & (1u<<y)) == 0) found++;
776         }
777       }
778 
779     if (found != scriptx_count) continue;
780     }
781 
782   if (bprop_count > 0)
783     {
784     const uint32_t *bits_bprop = PRIV(ucd_boolprop_sets) +
785       UCD_BPROPS(c) * ucd_boolprop_sets_item_size;
786     unsigned int found = 0;
787 
788     for (i = 0; i < bprop_count; i++)
789       {
790       int x = bprop_list[i]/32;
791       int y = bprop_list[i]%32;
792 
793       /* Positive requirement */
794       if (bprop_list[i] >= 0)
795         {
796         if ((bits_bprop[x] & (1u<<y)) != 0) found++;
797         }
798       /* Negative requirement */
799       else
800         {
801         if ((bits_bprop[-x] & (1u<<(-y))) == 0) found++;
802         }
803       }
804 
805     if (found != bprop_count) continue;
806     }
807 
808   if (type >= 0)
809     {
810     if (type_not)
811       {
812       if (type == UCD_CHARTYPE(c)) continue;
813       }
814     else
815       {
816       if (type != UCD_CHARTYPE(c)) continue;
817       }
818     }
819 
820   if (gbreak >= 0)
821     {
822     if (gbreak_not)
823       {
824       if (gbreak == UCD_GRAPHBREAK(c)) continue;
825       }
826     else
827       {
828       if (gbreak != UCD_GRAPHBREAK(c)) continue;
829       }
830     }
831 
832   if (bidiclass >= 0)
833     {
834     if (bidiclass_not)
835       {
836       if (bidiclass == UCD_BIDICLASS(c)) continue;
837       }
838     else
839       {
840       if (bidiclass != UCD_BIDICLASS(c)) continue;
841       }
842     }
843 
844   /* All conditions are met. Look for runs. */
845 
846   ucd = GET_UCD(c);
847 
848   for (i = c + 1; i < 0x10ffff; i++)
849     {
850     next_ucd = GET_UCD(i);
851     if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
852     }
853 
854   if (--i > c)
855     {
856     printf("U+%04X..", c);
857     c = i;
858     hadrange = TRUE;
859     }
860   else if (hadrange) printf("%s", pad);
861 
862   print_prop(c, FALSE);
863   if (c >= 0x100000) pad = "        ";
864     else if (c >= 0x10000) pad = "       ";
865   count++;
866   if (count >= 100)
867     {
868     printf("...\n");
869     break;
870     }
871   }
872 
873 if (count == 0) printf("No characters found\n");
874 }
875 
876 
877 /*************************************************
878 *        Process command line                    *
879 *************************************************/
880 
881 static void
process_command_line(unsigned char * buffer)882 process_command_line(unsigned char *buffer)
883 {
884 unsigned char *s, *t;
885 unsigned char name[24];
886 
887 s = buffer;
888 while (isspace(*s)) s++;
889 if (*s == 0) return;
890 
891 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
892 *t = 0;
893 while (isspace(*s)) s++;
894 
895 if (strcmp(CS name, "findprop") == 0)
896   {
897   while (*s != 0)
898     {
899     unsigned int c;
900     unsigned char *endptr;
901     t = s;
902 
903     if (*t == '+')
904       {
905       c = *(++t);
906       if (c > 0x7fu)
907         {
908         GETCHARINC(c, t);
909         }
910       endptr = t+1;
911       }
912     else
913       {
914       if (strncmp(CS t, "U+", 2) == 0) t += 2;
915       c = strtoul(CS t, CSS(&endptr), 16);
916       }
917 
918     if (*endptr != 0 && !isspace(*endptr))
919       {
920       while (*endptr != 0 && !isspace(*endptr)) endptr++;
921       printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s);
922       }
923     else
924       {
925       if (c > 0x10ffff)
926         printf("** U+%x is too big for a Unicode code point\n", c);
927       else
928         print_prop(c, TRUE);
929       }
930     s = endptr;
931     while (isspace(*s)) s++;
932     }
933   }
934 
935 else if (strcmp(CS name, "find") == 0)
936   {
937   find_chars(s);
938   }
939 
940 else if (strcmp(CS name, "list") == 0)
941   {
942   while (*s != 0)
943     {
944     size_t i;
945     for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
946     *t = 0;
947     while (isspace(*s)) s++;
948 
949     if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
950       {
951       for (i = 0; i < PRIV(utt_size); i++)
952         if (PRIV(utt)[i].type == PT_SCX || PRIV(utt)[i].type == PT_SC)
953           printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
954       }
955 
956     else if (strcmp(CS name, "bool") == 0)
957       {
958       for (i = 0; i < PRIV(utt_size); i++)
959         if (PRIV(utt)[i].type == PT_BOOL)
960           printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
961       }
962 
963     else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
964       {
965       for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
966         printf("%s %s\n", type_names[i], type_names[i+1]);
967       }
968 
969     else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
970       {
971       for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
972         {
973         if (gb_names[i+1][0] != 0)
974           printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
975         else
976           printf("%s\n", gb_names[i]);
977         }
978       }
979 
980     else if (strcmp(CS name, "bidi") == 0 ||
981              strcmp(CS name, "bidiclasses") == 0)
982       {
983       for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
984         printf("%3s %s\n", bd_names[i], bd_names[i+1]);
985       }
986 
987     else
988       {
989       printf("** Unknown property \"%s\"\n", name);
990       break;
991       }
992     }
993   }
994 
995 else printf("** Unknown test command \"%s\"\n", name);
996 }
997 
998 
999 
1000 /*************************************************
1001 *               Main program                     *
1002 *************************************************/
1003 
1004 int
main(int argc,char ** argv)1005 main(int argc, char **argv)
1006 {
1007 BOOL interactive;
1008 int first_arg = 1;
1009 unsigned char buffer[1024];
1010 
1011 if (argc > 1 && strcmp(argv[1], "-s") == 0)
1012   {
1013   show_character = TRUE;
1014   first_arg++;
1015   }
1016 
1017 if (argc > first_arg)
1018   {
1019   int i;
1020   BOOL datafirst = TRUE;
1021   char *arg = argv[first_arg];
1022   unsigned char *s = buffer;
1023 
1024   if (*arg != '+' && strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
1025     {
1026     while (*arg != 0)
1027       {
1028       if (!isxdigit(*arg++)) { datafirst = FALSE; break; }
1029       }
1030     }
1031 
1032   if (datafirst)
1033     {
1034     strcpy(CS s, "findprop ");
1035     s += 9;
1036     }
1037 
1038   for (i = first_arg; i < argc; i++)
1039     {
1040     s += sprintf(CS s, "%s ", argv[i]);
1041     }
1042 
1043   process_command_line(buffer);
1044   return 0;
1045   }
1046 
1047 interactive = is_stdin_tty();
1048 
1049 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1050 if (interactive) using_history();
1051 #endif
1052 
1053 for(;;)
1054   {
1055 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1056   if (interactive)
1057     {
1058     size_t len;
1059     unsigned char *s = US readline("> ");
1060     if (s == NULL) break;
1061     len = strlen(CS s);
1062     if (len > 0) add_history(CS s);
1063     memcpy(buffer, s, len);
1064     buffer[len] = '\n';
1065     buffer[len+1] = 0;
1066     free(s);
1067     }
1068   else
1069 #endif
1070 
1071     {
1072     if (interactive) printf("> ");
1073     if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
1074     if (!interactive) printf("%s", buffer);
1075     }
1076 
1077   process_command_line(buffer);
1078   }
1079 
1080 if (interactive) printf("\n");
1081 
1082 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1083 if (interactive) clear_history();
1084 #endif
1085 
1086 return 0;
1087 }
1088 
1089 /* End */
1090