• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /***************************************************
2 * A program for testing the Unicode property table *
3 ***************************************************/
4 
5 /* Copyright (c) University of Cambridge 2008-2022 */
6 
7 /* Compile thus:
8 
9    gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
10      ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
11 
12    Add -lreadline or -ledit if PCRE2 was configured with readline or libedit
13    support in pcre2test.
14 */
15 
16 /* This is a hacked-up program for testing the Unicode properties tables of
17 PCRE2. It can also be used for finding characters with certain properties. I
18 wrote it to help with debugging, and have added things that I found useful, in
19 a rather haphazard way. The code has never been seriously tidied or checked for
20 robustness, but it shouldn't now give compiler warnings.
21 
22 There is only one option: "-s". If given, it applies only to the "findprop"
23 command. It causes the UTF-8 sequence of bytes that encode the character to be
24 output between angle brackets at the end of the line. On a UTF-8 terminal, this
25 will show the appropriate graphic for the code point.
26 
27 If the command has arguments, they are concatenated into a buffer, separated by
28 spaces. If the first argument starts "U+" or consists entirely of hexadecimal
29 digits, "findprop" is inserted at the start. The buffer is then processed as a
30 single line file, after which the program exits. If there are no arguments, the
31 program reads commands line by line on stdin and writes output to stdout. The
32 return code is always zero.
33 
34 There are three commands:
35 
36 The command "findprop" must be followed by a space-separated list of Unicode
37 code points as hex numbers, either without any prefix or starting with "U+", or
38 as individual UTF-8 characters preceded by '+'. For example:
39 
40   findprop U+1234 5Abc +?
41 
42 The output is one long line per character, listing Unicode properties that have
43 values, followed by its other case or cases if one or more exist, followed by
44 its Script Extension list if there is one. This list is in square brackets. A
45 second list in square brackets gives all the Boolean properties of the
46 character. The properties that come first are:
47 
48   Bidi class          e.g. NSM (most common is L)
49   General type        e.g. Letter
50   Specific type       e.g. Upper case letter
51   Script              e.g. Medefaidrin
52   Grapheme break type e.g. Extend (most common is Other)
53 
54 Script names and Boolean property names are all in lower case, with underscores
55 and hyphens removed, because that's how they are stored for "loose" matching.
56 
57 The command "find" must be followed by a list of property types and their
58 values. The values are case-sensitive, except for bidi class. This finds
59 characters that have those properties. If multiple properties are listed, they
60 must all be matched. Currently supported:
61 
62   script <name>    The character must have this script property. Only one
63                      such script may be given.
64   scriptx <name>   This script must be in the character's Script Extension
65                      property list. If this is used many times, all the given
66                      scripts must be present.
67   type <abbrev>    The character's specific type (e.g. Lu or Nd) must match.
68   gbreak <name>    The grapheme break property must match.
69   bidi <class>     The character's bidi class must match.
70   bool <name>      The character's Boolean property list must contain this
71                      property.
72 
73 If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
74 Script Extensions and Boolean properties, there may be a mixture of positive
75 and negative requirements. All must be satisfied.
76 
77 Sequences of two or more characters are shown as ranges, for example
78 U+0041..U+004A. No more than 100 lines are output. If there are more
79 characters, the list ends with ...
80 
81 The command "list" must be followed by one of property names script, bool,
82 type, gbreak or bidi. The defined values for that property are listed. */
83 
84 
85 #ifdef HAVE_CONFIG_H
86 #include "../src/config.h"
87 #endif
88 
89 #ifndef SUPPORT_UNICODE
90 #define SUPPORT_UNICODE
91 #endif
92 
93 #include <ctype.h>
94 #include <stdio.h>
95 #include <stdlib.h>
96 #include <string.h>
97 #include "../src/pcre2_internal.h"
98 #include "../src/pcre2_ucp.h"
99 
100 #ifdef HAVE_UNISTD_H
101 #include <unistd.h>
102 #endif
103 
104 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
105 #if defined(SUPPORT_LIBREADLINE)
106 #include <readline/readline.h>
107 #include <readline/history.h>
108 #else
109 #if defined(HAVE_EDITLINE_READLINE_H)
110 #include <editline/readline.h>
111 #else
112 #include <readline/readline.h>
113 #ifdef RL_VERSION_MAJOR
114 #include <readline/history.h>
115 #endif
116 #endif
117 #endif
118 #endif
119 
120 
121 /* -------------------------------------------------------------------*/
122 
/* Shorthand pointer casts. Each macro expands to a parenthesized cast that is
written directly in front of the expression or string literal it applies to,
for example US"text" or CS name. */

#define CS   (char *)                 /* to plain char pointer */
#define CCS  (const char *)           /* to const plain char pointer */
#define CSS  (char **)                /* to pointer to char pointer */
#define US   (unsigned char *)        /* to unsigned char pointer */
#define CUS  (const unsigned char *)  /* to const unsigned char pointer */
#define USS  (unsigned char **)       /* to pointer to unsigned char pointer */
129 
130 /* -------------------------------------------------------------------*/
131 
/* Set to TRUE by main() when the "-s" option is given. print_prop() then
appends the UTF-8 encoding of the character when it is printing a single code
point. */

static BOOL show_character = FALSE;
133 
/* Specific character types, stored as pairs of strings: the two-letter
abbreviation followed by a description. The "find type" command matches the
abbreviation and compares (index of the pair) with UCD_CHARTYPE(), so the order
of the pairs is significant. NOTE(review): the order presumably follows the
ucp_Cc, ucp_Cf, ... enumeration in pcre2_ucp.h - confirm before reordering. */

static const unsigned char *type_names[] = {
  US"Cc", US"Control",
  US"Cf", US"Format",
  US"Cn", US"Unassigned",
  US"Co", US"Private use",
  US"Cs", US"Surrogate",
  US"Ll", US"Lower case letter",
  US"Lm", US"Modifier letter",
  US"Lo", US"Other letter",
  US"Lt", US"Title case letter",
  US"Lu", US"Upper case letter",
  US"Mc", US"Spacing mark",
  US"Me", US"Enclosing mark",
  US"Mn", US"Non-spacing mark",
  US"Nd", US"Decimal number",
  US"Nl", US"Letter number",
  US"No", US"Other number",
  US"Pc", US"Connector punctuation",
  US"Pd", US"Dash punctuation",
  US"Pe", US"Close punctuation",
  US"Pf", US"Final punctuation",
  US"Pi", US"Initial punctuation",
  US"Po", US"Other punctuation",
  US"Ps", US"Open punctuation",
  US"Sc", US"Currency symbol",
  US"Sk", US"Modifier symbol",
  US"Sm", US"Mathematical symbol",
  US"So", US"Other symbol",
  US"Zl", US"Line separator",
  US"Zp", US"Paragraph separator",
  US"Zs", US"Space separator"
};
166 
/* Grapheme break property values, stored as pairs of strings: the name that
"find gbreak" matches, followed by an optional description (empty when there is
none; the "list gbreak" command then prints the name alone). The index of the
pair is compared with UCD_GRAPHBREAK(), so the order of the pairs is
significant. NOTE(review): the order presumably follows the ucp_gbCR,
ucp_gbLF, ... enumeration in pcre2_ucp.h - confirm before reordering. */

static const unsigned char *gb_names[] = {
  US"CR",                    US"carriage return",
  US"LF",                    US"linefeed",
  US"Control",               US"",
  US"Extend",                US"",
  US"Prepend",               US"",
  US"SpacingMark",           US"",
  US"L",                     US"Hangul syllable type L",
  US"V",                     US"Hangul syllable type V",
  US"T",                     US"Hangul syllable type T",
  US"LV",                    US"Hangul syllable type LV",
  US"LVT",                   US"Hangul syllable type LVT",
  US"Regional_Indicator",    US"",
  US"Other",                 US"",
  US"ZWJ",                   US"zero width joiner",
  US"Extended_Pictographic", US""
};
184 
185 static const unsigned char *bd_names[] = {
186   US"AL",   US"Arabic letter",
187   US"AN",   US"Arabid number",
188   US"B",    US"Paragraph separator",
189   US"BN",   US"Boundary neutral",
190   US"CS",   US"Common separator",
191   US"EN",   US"European number",
192   US"ES",   US"European separator",
193   US"ET",   US"European terminator",
194   US"FSI",  US"First string isolate",
195   US"L",    US"Left-to-right",
196   US"LRE",  US"Left-to-right embedding",
197   US"LRI",  US"Left-to-right isolate",
198   US"LRO",  US"Left-to-right override",
199   US"NSM",  US"Non-spacing mark",
200   US"ON",   US"Other neutral",
201   US"PDF",  US"Pop directional format",
202   US"PDI",  US"Pop directional isolate",
203   US"R",    US"Right-to-left",
204   US"RLE",  US"Right-to-left embedding",
205   US"RLI",  US"Right-to-left isolate",
206   US"RLO",  US"Right-to-left override",
207   US"S",    US"Segment separator",
208   US"WS",   US"White space"
209 };
210 
/* Tables used by ord2utf8(). Entry i of utf8_table1 is the largest value that
can be encoded in i+1 bytes; ord2utf8() picks the first entry that is not
smaller than the value being encoded. */

static const unsigned int utf8_table1[] = {
  0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};

/* Entry i of utf8_table2 holds the marker bits that ord2utf8() ORs into the
first byte of an encoding that has i following bytes. */

static const int utf8_table2[] = {
  0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
216 
/* Macro to pick up the remaining bytes of a UTF-8 character, advancing
the pointer. On entry c holds the first byte of a multi-byte sequence and eptr
points at the byte that follows it; on exit c holds the decoded value and eptr
points past the last byte used. The number of following bytes (1 to 5) is
selected by testing bits 0x20, 0x10, 0x08 and 0x04 of the first byte. No
validity or bounds checking is done, and both arguments are evaluated several
times, so they must be plain variables.

NOTE(review): a macro with this name may also be defined by the PCRE2 internal
headers included above; if so, this definition has to stay token-for-token
identical to that one - confirm before editing it. */

#define GETUTF8INC(c, eptr) \
    { \
    if ((c & 0x20u) == 0) \
      c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \
    else if ((c & 0x10u) == 0) \
      { \
      c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \
      eptr += 2; \
      } \
    else if ((c & 0x08u) == 0) \
      { \
      c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \
          ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \
      eptr += 3; \
      } \
    else if ((c & 0x04u) == 0) \
      { \
      c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \
          ((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \
          (eptr[3] & 0x3fu); \
      eptr += 4; \
      } \
    else \
      { \
      c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \
          ((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \
          ((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \
      eptr += 5; \
      } \
    }
250 
251 
252 
253 /*************************************************
254 *       Convert character value to UTF-8         *
255 *************************************************/
256 
257 /* This function takes an unsigned long integer value in the range 0 -
258 0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes.
259 
260 Arguments:
261   cvalue     the character value
262   buffer     pointer to buffer for result - at least 6 bytes long
263 
264 Returns:     number of bytes placed in the buffer
265              0 if input code point is too big
266 */
267 
268 static size_t
ord2utf8(unsigned int cvalue,unsigned char * buffer)269 ord2utf8(unsigned int cvalue, unsigned char *buffer)
270 {
271 size_t i, j;
272 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
273   if (cvalue <= utf8_table1[i]) break;
274 if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
275 buffer += i;
276 for (j = i; j > 0; j--)
277  {
278  *buffer-- = 0x80 | (cvalue & 0x3f);
279  cvalue >>= 6;
280  }
281 *buffer = utf8_table2[i] | cvalue;
282 return i + 1;
283 }
284 
285 
286 
287 /*************************************************
288 *             Test for interaction               *
289 *************************************************/
290 
291 static BOOL
is_stdin_tty(void)292 is_stdin_tty(void)
293 {
294 #if defined WIN32
295 return _isatty(_fileno(stdin));
296 #else
297 return isatty(fileno(stdin));
298 #endif
299 }
300 
301 
302 /*************************************************
303 *            Get  name from ucp ident            *
304 *************************************************/
305 
306 /* The utt table contains both full names and abbreviations. So search for both
307 and use the longer if two are found, unless the first one is only 3 characters
308 and we are looking for a script (some scripts have 3-character names). If this
309 were not just a test program it might be worth making some kind of reverse
310 index. */
311 
312 static const char *
get_propname(int prop,int type)313 get_propname(int prop, int type)
314 {
315 size_t i, j, len;
316 size_t foundlist[2];
317 const char *yield;
318 int typex = (type == PT_SC)? PT_SCX : type;
319 
320 j = 0;
321 for (i = 0; i < PRIV(utt_size); i++)
322   {
323   const ucp_type_table *u = PRIV(utt) + i;
324   if ((u->type == type || u->type == typex) && u->value == prop)
325     {
326     foundlist[j++] = i;
327     if (j >= 2) break;
328     }
329   }
330 
331 if (j == 0) return "??";
332 
333 yield = NULL;
334 len = 0;
335 
336 for (i = 0; i < j; i++)
337   {
338   const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset;
339   size_t sl = strlen(s);
340 
341   if (sl > len)
342     {
343     yield = s;
344     if (sl == 3 && type == PT_SC) break;
345     len = sl;
346     }
347   }
348 
349 return yield;
350 }
351 
352 
353 /*************************************************
354 *      Print Unicode property info for a char    *
355 *************************************************/
356 
357 static void
print_prop(unsigned int c,BOOL is_just_one)358 print_prop(unsigned int c, BOOL is_just_one)
359 {
360 int type = UCD_CATEGORY(c);
361 int fulltype = UCD_CHARTYPE(c);
362 int script = UCD_SCRIPT(c);
363 int scriptx = UCD_SCRIPTX(c);
364 int gbprop = UCD_GRAPHBREAK(c);
365 int bidi = UCD_BIDICLASS(c);
366 unsigned int othercase = UCD_OTHERCASE(c);
367 int caseset = UCD_CASESET(c);
368 int bprops = UCD_BPROPS(c);
369 
370 const unsigned char *fulltypename = US"??";
371 const unsigned char *typename = US"??";
372 const unsigned char *graphbreak = US"??";
373 const unsigned char *bidiclass = US"??";
374 const unsigned char *scriptname = CUS get_propname(script, PT_SC);
375 
376 switch (type)
377   {
378   case ucp_C: typename = US"Control"; break;
379   case ucp_L: typename = US"Letter"; break;
380   case ucp_M: typename = US"Mark"; break;
381   case ucp_N: typename = US"Number"; break;
382   case ucp_P: typename = US"Punctuation"; break;
383   case ucp_S: typename = US"Symbol"; break;
384   case ucp_Z: typename = US"Separator"; break;
385   }
386 
387 switch (fulltype)
388   {
389   case ucp_Cc: fulltypename = US"Control"; break;
390   case ucp_Cf: fulltypename = US"Format"; break;
391   case ucp_Cn: fulltypename = US"Unassigned"; break;
392   case ucp_Co: fulltypename = US"Private use"; break;
393   case ucp_Cs: fulltypename = US"Surrogate"; break;
394   case ucp_Ll: fulltypename = US"Lower case letter"; break;
395   case ucp_Lm: fulltypename = US"Modifier letter"; break;
396   case ucp_Lo: fulltypename = US"Other letter"; break;
397   case ucp_Lt: fulltypename = US"Title case letter"; break;
398   case ucp_Lu: fulltypename = US"Upper case letter"; break;
399   case ucp_Mc: fulltypename = US"Spacing mark"; break;
400   case ucp_Me: fulltypename = US"Enclosing mark"; break;
401   case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
402   case ucp_Nd: fulltypename = US"Decimal number"; break;
403   case ucp_Nl: fulltypename = US"Letter number"; break;
404   case ucp_No: fulltypename = US"Other number"; break;
405   case ucp_Pc: fulltypename = US"Connector punctuation"; break;
406   case ucp_Pd: fulltypename = US"Dash punctuation"; break;
407   case ucp_Pe: fulltypename = US"Close punctuation"; break;
408   case ucp_Pf: fulltypename = US"Final punctuation"; break;
409   case ucp_Pi: fulltypename = US"Initial punctuation"; break;
410   case ucp_Po: fulltypename = US"Other punctuation"; break;
411   case ucp_Ps: fulltypename = US"Open punctuation"; break;
412   case ucp_Sc: fulltypename = US"Currency symbol"; break;
413   case ucp_Sk: fulltypename = US"Modifier symbol"; break;
414   case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
415   case ucp_So: fulltypename = US"Other symbol"; break;
416   case ucp_Zl: fulltypename = US"Line separator"; break;
417   case ucp_Zp: fulltypename = US"Paragraph separator"; break;
418   case ucp_Zs: fulltypename = US"Space separator"; break;
419   }
420 
421 switch(gbprop)
422   {
423   case ucp_gbCR:           graphbreak = US"CR"; break;
424   case ucp_gbLF:           graphbreak = US"LF"; break;
425   case ucp_gbControl:      graphbreak = US"Control"; break;
426   case ucp_gbExtend:       graphbreak = US"Extend"; break;
427   case ucp_gbPrepend:      graphbreak = US"Prepend"; break;
428   case ucp_gbSpacingMark:  graphbreak = US"SpacingMark"; break;
429   case ucp_gbL:            graphbreak = US"Hangul syllable type L"; break;
430   case ucp_gbV:            graphbreak = US"Hangul syllable type V"; break;
431   case ucp_gbT:            graphbreak = US"Hangul syllable type T"; break;
432   case ucp_gbLV:           graphbreak = US"Hangul syllable type LV"; break;
433   case ucp_gbLVT:          graphbreak = US"Hangul syllable type LVT"; break;
434   case ucp_gbRegional_Indicator:
435                            graphbreak = US"Regional Indicator"; break;
436   case ucp_gbOther:        graphbreak = US"Other"; break;
437   case ucp_gbZWJ:          graphbreak = US"Zero Width Joiner"; break;
438   case ucp_gbExtended_Pictographic:
439                            graphbreak = US"Extended Pictographic"; break;
440   default:                 graphbreak = US"Unknown"; break;
441   }
442 
443 switch(bidi)
444   {
445   case ucp_bidiAL:   bidiclass = US"AL "; break;
446   case ucp_bidiFSI:  bidiclass = US"FSI"; break;
447   case ucp_bidiL:    bidiclass = US"L  "; break;
448   case ucp_bidiLRE:  bidiclass = US"LRE"; break;
449   case ucp_bidiLRI:  bidiclass = US"LRI"; break;
450   case ucp_bidiLRO:  bidiclass = US"LRO"; break;
451   case ucp_bidiPDF:  bidiclass = US"PDF"; break;
452   case ucp_bidiPDI:  bidiclass = US"PDI"; break;
453   case ucp_bidiR:    bidiclass = US"R  "; break;
454   case ucp_bidiRLE:  bidiclass = US"RLE"; break;
455   case ucp_bidiRLI:  bidiclass = US"RLI"; break;
456   case ucp_bidiRLO:  bidiclass = US"RLO"; break;
457   case ucp_bidiAN:   bidiclass = US"AN "; break;
458   case ucp_bidiB:    bidiclass = US"B  "; break;
459   case ucp_bidiBN:   bidiclass = US"BN "; break;
460   case ucp_bidiCS:   bidiclass = US"CS "; break;
461   case ucp_bidiEN:   bidiclass = US"EN "; break;
462   case ucp_bidiES:   bidiclass = US"ES "; break;
463   case ucp_bidiET:   bidiclass = US"ET "; break;
464   case ucp_bidiNSM:  bidiclass = US"NSM"; break;
465   case ucp_bidiON:   bidiclass = US"ON "; break;
466   case ucp_bidiS:    bidiclass = US"S  "; break;
467   case ucp_bidiWS:   bidiclass = US"WS "; break;
468   default:           bidiclass = US"???"; break;
469   }
470 
471 printf("U+%04X %s %s: %s, %s, %s", c, bidiclass, typename, fulltypename,
472   scriptname, graphbreak);
473 
474 if (is_just_one && othercase != c)
475   {
476   printf(", U+%04X", othercase);
477   if (caseset != 0)
478     {
479     const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
480     while (*(++p) < NOTACHAR)
481       {
482       unsigned int d = *p;
483       if (d != othercase && d != c) printf(", U+%04X", d);
484       }
485     }
486   }
487 
488 if (scriptx != 0)
489   {
490   const char *sep = "";
491   const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
492   printf(", [");
493   for (int i = 0; i < ucp_Unknown; i++)
494   if (MAPBIT(p, i) != 0)
495     {
496     printf("%s%s", sep, get_propname(i, PT_SC));
497     sep = ", ";
498     }
499   printf("]");
500   }
501 
502 if (bprops != 0)
503   {
504   const char *sep = "";
505   const uint32_t *p = PRIV(ucd_boolprop_sets) +
506     bprops * ucd_boolprop_sets_item_size;
507   printf(", [");
508   for (int i = 0; i < ucp_Bprop_Count; i++)
509   if (MAPBIT(p, i) != 0)
510     {
511     printf("%s%s", sep, get_propname(i, PT_BOOL));
512     sep = ", ";
513     }
514   printf("]");
515   }
516 
517 if (show_character && is_just_one)
518   {
519   unsigned char buffer[8];
520   size_t len = ord2utf8(c, buffer);
521   printf(", >%.*s<", (int)len, buffer);
522   }
523 
524 printf("\n");
525 }
526 
527 
528 
529 /*************************************************
530 *   Find character(s) with given property/ies    *
531 *************************************************/
532 
533 static void
find_chars(unsigned char * s)534 find_chars(unsigned char *s)
535 {
536 unsigned char name[128];
537 unsigned char value[128];
538 unsigned char *t;
539 unsigned int count= 0;
540 int scriptx_list[128];
541 unsigned int scriptx_count = 0;
542 int bprop_list[128];
543 unsigned int bprop_count = 0;
544 uint32_t i, c;
545 int script = -1;
546 int type = -1;
547 int gbreak = -1;
548 int bidiclass = -1;
549 BOOL script_not = FALSE;
550 BOOL type_not = FALSE;
551 BOOL gbreak_not = FALSE;
552 BOOL bidiclass_not = FALSE;
553 BOOL hadrange = FALSE;
554 const ucd_record *ucd, *next_ucd;
555 const char *pad = "        ";
556 
557 while (*s != 0)
558   {
559   unsigned int offset = 0;
560   BOOL scriptx_not = FALSE;
561 
562   for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
563   *t = 0;
564   while (isspace(*s)) s++;
565 
566   for (t = value; *s != 0 && !isspace(*s); s++)
567     {
568     if (*s != '_' && *s != '-') *t++ = *s;
569     }
570   *t = 0;
571   while (isspace(*s)) s++;
572 
573   if (strcmp(CS name, "script") == 0 ||
574       strcmp(CS name, "scriptx") == 0)
575     {
576     for (t = value; *t != 0; t++) *t = tolower(*t);
577 
578     if (value[0] == '!')
579       {
580       if (name[6] == 'x') scriptx_not = TRUE;
581         else script_not = TRUE;
582       offset = 1;
583       }
584 
585     for (i = 0; i < PRIV(utt_size); i++)
586       {
587       const ucp_type_table *u = PRIV(utt) + i;
588       if ((u->type == PT_SCX || u->type == PT_SC) && strcmp(CS(value + offset),
589             PRIV(utt_names) + u->name_offset) == 0)
590         {
591         c = u->value;
592         if (name[6] == 'x')
593           {
594           scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
595           }
596         else
597           {
598           if (script < 0) script = c; else
599             {
600             printf("** Only 1 script value allowed\n");
601             return;
602             }
603           }
604         break;
605         }
606       }
607 
608     if (i >= PRIV(utt_size))
609       {
610       printf("** Unrecognized script name \"%s\"\n", value);
611       return;
612       }
613     }
614 
615   else if (strcmp(CS name, "bool") == 0)
616     {
617     int not = 1;
618     if (value[0] == '!')
619       {
620       not = -1;
621       offset = 1;
622       }
623 
624     for (i = 0; i < PRIV(utt_size); i++)
625       {
626       const ucp_type_table *u = PRIV(utt) + i;
627       if (u->type == PT_BOOL && strcmp(CS(value + offset),
628             PRIV(utt_names) + u->name_offset) == 0)
629         {
630         bprop_list[bprop_count++] = u->value * not;
631         break;
632         }
633       }
634 
635     if (i >= PRIV(utt_size))
636       {
637       printf("** Unrecognized property name \"%s\"\n", value);
638       return;
639       }
640     }
641 
642   else if (strcmp(CS name, "type") == 0)
643     {
644     if (type >= 0)
645       {
646       printf("** Only 1 type value allowed\n");
647       return;
648       }
649     else
650       {
651       if (value[0] == '!')
652         {
653         type_not = TRUE;
654         offset = 1;
655         }
656 
657       for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
658         {
659         if (strcmp(CS (value + offset), CS type_names[i]) == 0)
660           {
661           type = i/2;
662           break;
663           }
664         }
665       if (i >= sizeof(type_names)/sizeof(char *))
666         {
667         printf("** Unrecognized type name \"%s\"\n", value);
668         return;
669         }
670       }
671     }
672 
673   else if (strcmp(CS name, "gbreak") == 0)
674     {
675     if (gbreak >= 0)
676       {
677       printf("** Only 1 grapheme break value allowed\n");
678       return;
679       }
680     else
681       {
682       if (value[0] == '!')
683         {
684         gbreak_not = TRUE;
685         offset = 1;
686         }
687 
688       for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
689         {
690         if (strcmp(CS (value + offset), CS gb_names[i]) == 0)
691           {
692           gbreak = i/2;
693           break;
694           }
695         }
696       if (i >= sizeof(gb_names)/sizeof(char *))
697         {
698         printf("** Unrecognized gbreak name \"%s\"\n", value);
699         return;
700         }
701       }
702     }
703 
704   else if (strcmp(CS name, "bidi") == 0 ||
705            strcmp(CS name, "bidiclass") == 0 ||
706            strcmp(CS name, "bidi_class") == 0 )
707     {
708     if (bidiclass >= 0)
709       {
710       printf("** Only 1 bidi class value allowed\n");
711       return;
712       }
713     else
714       {
715       if (value[0] == '!')
716         {
717         bidiclass_not = TRUE;
718         offset = 1;
719         }
720       for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
721         {
722         if (strcasecmp(CS (value + offset), CS bd_names[i]) == 0)
723           {
724           bidiclass = i/2;
725           break;
726           }
727         }
728       if (i >= sizeof(bd_names)/sizeof(char *))
729         {
730         printf("** Unrecognized bidi class name \"%s\"\n", value);
731         return;
732         }
733       }
734     }
735 
736   else
737     {
738     printf("** Unrecognized property name \"%s\"\n", name);
739     return;
740     }
741   }
742 
743 if (script < 0 && scriptx_count == 0 && bprop_count == 0 && type < 0 &&
744     gbreak < 0 && bidiclass < 0)
745   {
746   printf("** No properties specified\n");
747   return;
748   }
749 
750 for (c = 0; c <= 0x10ffff; c++)
751   {
752   if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;
753 
754   if (scriptx_count > 0)
755     {
756     const uint32_t *bits_scriptx = PRIV(ucd_script_sets) + UCD_SCRIPTX(c);
757     unsigned int found = 0;
758 
759     for (i = 0; i < scriptx_count; i++)
760       {
761       int x = scriptx_list[i]/32;
762       int y = scriptx_list[i]%32;
763 
764       /* Positive requirment */
765       if (scriptx_list[i] >= 0)
766         {
767         if ((bits_scriptx[x] & (1u<<y)) != 0) found++;
768         }
769       /* Negative requirement */
770       else
771         {
772         if ((bits_scriptx[x] & (1u<<y)) == 0) found++;
773         }
774       }
775 
776     if (found != scriptx_count) continue;
777     }
778 
779   if (bprop_count > 0)
780     {
781     const uint32_t *bits_bprop = PRIV(ucd_boolprop_sets) +
782       UCD_BPROPS(c) * ucd_boolprop_sets_item_size;
783     unsigned int found = 0;
784 
785     for (i = 0; i < bprop_count; i++)
786       {
787       int x = bprop_list[i]/32;
788       int y = bprop_list[i]%32;
789 
790       /* Positive requirement */
791       if (bprop_list[i] >= 0)
792         {
793         if ((bits_bprop[x] & (1u<<y)) != 0) found++;
794         }
795       /* Negative requirement */
796       else
797         {
798         if ((bits_bprop[-x] & (1u<<(-y))) == 0) found++;
799         }
800       }
801 
802     if (found != bprop_count) continue;
803     }
804 
805   if (type >= 0)
806     {
807     if (type_not)
808       {
809       if (type == UCD_CHARTYPE(c)) continue;
810       }
811     else
812       {
813       if (type != UCD_CHARTYPE(c)) continue;
814       }
815     }
816 
817   if (gbreak >= 0)
818     {
819     if (gbreak_not)
820       {
821       if (gbreak == UCD_GRAPHBREAK(c)) continue;
822       }
823     else
824       {
825       if (gbreak != UCD_GRAPHBREAK(c)) continue;
826       }
827     }
828 
829   if (bidiclass >= 0)
830     {
831     if (bidiclass_not)
832       {
833       if (bidiclass == UCD_BIDICLASS(c)) continue;
834       }
835     else
836       {
837       if (bidiclass != UCD_BIDICLASS(c)) continue;
838       }
839     }
840 
841   /* All conditions are met. Look for runs. */
842 
843   ucd = GET_UCD(c);
844 
845   for (i = c + 1; i < 0x10ffff; i++)
846     {
847     next_ucd = GET_UCD(i);
848     if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
849     }
850 
851   if (--i > c)
852     {
853     printf("U+%04X..", c);
854     c = i;
855     hadrange = TRUE;
856     }
857   else if (hadrange) printf("%s", pad);
858 
859   print_prop(c, FALSE);
860   if (c >= 0x100000) pad = "        ";
861     else if (c >= 0x10000) pad = "       ";
862   count++;
863   if (count >= 100)
864     {
865     printf("...\n");
866     break;
867     }
868   }
869 
870 if (count == 0) printf("No characters found\n");
871 }
872 
873 
874 /*************************************************
875 *        Process command line                    *
876 *************************************************/
877 
878 static void
process_command_line(unsigned char * buffer)879 process_command_line(unsigned char *buffer)
880 {
881 unsigned char *s, *t;
882 unsigned char name[24];
883 
884 s = buffer;
885 while (isspace(*s)) s++;
886 if (*s == 0) return;
887 
888 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
889 *t = 0;
890 while (isspace(*s)) s++;
891 
892 if (strcmp(CS name, "findprop") == 0)
893   {
894   while (*s != 0)
895     {
896     unsigned int c;
897     unsigned char *endptr;
898     t = s;
899 
900     if (*t == '+')
901       {
902       c = *(++t);
903       if (c > 0x7fu)
904         {
905         GETCHARINC(c, t);
906         }
907       endptr = t+1;
908       }
909     else
910       {
911       if (strncmp(CS t, "U+", 2) == 0) t += 2;
912       c = strtoul(CS t, CSS(&endptr), 16);
913       }
914 
915     if (*endptr != 0 && !isspace(*endptr))
916       {
917       while (*endptr != 0 && !isspace(*endptr)) endptr++;
918       printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s);
919       }
920     else
921       {
922       if (c > 0x10ffff)
923         printf("** U+%x is too big for a Unicode code point\n", c);
924       else
925         print_prop(c, TRUE);
926       }
927     s = endptr;
928     while (isspace(*s)) s++;
929     }
930   }
931 
932 else if (strcmp(CS name, "find") == 0)
933   {
934   find_chars(s);
935   }
936 
937 else if (strcmp(CS name, "list") == 0)
938   {
939   while (*s != 0)
940     {
941     size_t i;
942     for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
943     *t = 0;
944     while (isspace(*s)) s++;
945 
946     if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
947       {
948       for (i = 0; i < PRIV(utt_size); i++)
949         if (PRIV(utt)[i].type == PT_SCX || PRIV(utt)[i].type == PT_SC)
950           printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
951       }
952 
953     else if (strcmp(CS name, "bool") == 0)
954       {
955       for (i = 0; i < PRIV(utt_size); i++)
956         if (PRIV(utt)[i].type == PT_BOOL)
957           printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
958       }
959 
960     else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
961       {
962       for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
963         printf("%s %s\n", type_names[i], type_names[i+1]);
964       }
965 
966     else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
967       {
968       for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
969         {
970         if (gb_names[i+1][0] != 0)
971           printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
972         else
973           printf("%s\n", gb_names[i]);
974         }
975       }
976 
977     else if (strcmp(CS name, "bidi") == 0 ||
978              strcmp(CS name, "bidiclasses") == 0)
979       {
980       for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
981         printf("%3s %s\n", bd_names[i], bd_names[i+1]);
982       }
983 
984     else
985       {
986       printf("** Unknown property \"%s\"\n", name);
987       break;
988       }
989     }
990   }
991 
992 else printf("** Unknown test command \"%s\"\n", name);
993 }
994 
995 
996 
997 /*************************************************
998 *               Main program                     *
999 *************************************************/
1000 
1001 int
main(int argc,char ** argv)1002 main(int argc, char **argv)
1003 {
1004 BOOL interactive;
1005 int first_arg = 1;
1006 unsigned char buffer[1024];
1007 
1008 if (argc > 1 && strcmp(argv[1], "-s") == 0)
1009   {
1010   show_character = TRUE;
1011   first_arg++;
1012   }
1013 
1014 if (argc > first_arg)
1015   {
1016   int i;
1017   BOOL datafirst = TRUE;
1018   char *arg = argv[first_arg];
1019   unsigned char *s = buffer;
1020 
1021   if (*arg != '+' && strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
1022     {
1023     while (*arg != 0)
1024       {
1025       if (!isxdigit(*arg++)) { datafirst = FALSE; break; }
1026       }
1027     }
1028 
1029   if (datafirst)
1030     {
1031     strcpy(CS s, "findprop ");
1032     s += 9;
1033     }
1034 
1035   for (i = first_arg; i < argc; i++)
1036     {
1037     s += sprintf(CS s, "%s ", argv[i]);
1038     }
1039 
1040   process_command_line(buffer);
1041   return 0;
1042   }
1043 
1044 interactive = is_stdin_tty();
1045 
1046 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1047 if (interactive) using_history();
1048 #endif
1049 
1050 for(;;)
1051   {
1052 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1053   if (interactive)
1054     {
1055     size_t len;
1056     unsigned char *s = US readline("> ");
1057     if (s == NULL) break;
1058     len = strlen(CS s);
1059     if (len > 0) add_history(CS s);
1060     memcpy(buffer, s, len);
1061     buffer[len] = '\n';
1062     buffer[len+1] = 0;
1063     free(s);
1064     }
1065   else
1066 #endif
1067 
1068     {
1069     if (interactive) printf("> ");
1070     if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
1071     if (!interactive) printf("%s", buffer);
1072     }
1073 
1074   process_command_line(buffer);
1075   }
1076 
1077 if (interactive) printf("\n");
1078 
1079 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1080 if (interactive) clear_history();
1081 #endif
1082 
1083 return 0;
1084 }
1085 
1086 /* End */
1087