1 /***************************************************
2 * A program for testing the Unicode property table *
3 ***************************************************/
4
5 /* Copyright (c) University of Cambridge 2008-2022 */
6
7 /* Compile thus:
8
9 gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
10 ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
11
12 Add -lreadline or -ledit if PCRE2 was configured with readline or libedit
13 support in pcre2test.
14 */
15
16 /* This is a hacked-up program for testing the Unicode properties tables of
17 PCRE2. It can also be used for finding characters with certain properties. I
18 wrote it to help with debugging, and have added things that I found useful, in
19 a rather haphazard way. The code has never been seriously tidied or checked for
20 robustness, but it shouldn't now give compiler warnings.
21
22 There is only one option: "-s". If given, it applies only to the "findprop"
23 command. It causes the UTF-8 sequence of bytes that encode the character to be
24 output between angle brackets at the end of the line. On a UTF-8 terminal, this
25 will show the appropriate graphic for the code point.
26
27 If the command has arguments, they are concatenated into a buffer, separated by
28 spaces. If the first argument starts "U+" or consists entirely of hexadecimal
29 digits, "findprop" is inserted at the start. The buffer is then processed as a
30 single line file, after which the program exits. If there are no arguments, the
31 program reads commands line by line on stdin and writes output to stdout. The
32 return code is always zero.
33
34 There are three commands:
35
36 The command "findprop" must be followed by a space-separated list of Unicode
37 code points as hex numbers, either without any prefix or starting with "U+", or
38 as individual UTF-8 characters preceded by '+'. For example:
39
40 findprop U+1234 5Abc +?
41
42 The output is one long line per character, listing Unicode properties that have
43 values, followed by its other case or cases if one or more exist, followed by
44 its Script Extension list if there is one. This list is in square brackets. A
45 second list in square brackets gives all the Boolean properties of the
46 character. The properties that come first are:
47
48 Bidi class e.g. NSM (most common is L)
49 General type e.g. Letter
50 Specific type e.g. Upper case letter
51 Script e.g. Medefaidrin
52 Grapheme break type e.g. Extend (most common is Other)
53
54 Script names and Boolean property names are all in lower case, with underscores
55 and hyphens removed, because that's how they are stored for "loose" matching.
56
57 The command "find" must be followed by a list of property types and their
58 values. The values are case-sensitive, except for script names and bidi
59 classes. This finds characters that have those properties. If multiple
60 properties are listed, they must all be matched. Currently supported:
61
62 script <name> The character must have this script property. Only one
63 such script may be given.
64 scriptx <name> This script must be in the character's Script Extension
65 property list. If this is used many times, all the given
66 scripts must be present.
67 type <abbrev> The character's specific type (e.g. Lu or Nd) must match.
68 gbreak <name> The grapheme break property must match.
69 bidi <class> The character's bidi class must match.
70 bool <name> The character's Boolean property list must contain this
71 property.
72
73 If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
74 Script Extensions and Boolean properties, there may be a mixture of positive
75 and negative requirements. All must be satisfied.
76
77 Sequences of two or more characters are shown as ranges, for example
78 U+0041..U+004A. No more than 100 lines are output. If there are more
79 characters, the list ends with ...
80
81 The command "list" must be followed by one of property names script, bool,
82 type, gbreak or bidi. The defined values for that property are listed. */
83
84
85 #ifdef HAVE_CONFIG_H
86 #include "../src/config.h"
87 #endif
88
89 #ifndef SUPPORT_UNICODE
90 #define SUPPORT_UNICODE
91 #endif
92
93 #include <ctype.h>
94 #include <stdio.h>
95 #include <stdlib.h>
96 #include <string.h>
97 #include "../src/pcre2_internal.h"
98 #include "../src/pcre2_ucp.h"
99
100 #ifdef HAVE_UNISTD_H
101 #include <unistd.h>
102 #endif
103
104 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
105 #if defined(SUPPORT_LIBREADLINE)
106 #include <readline/readline.h>
107 #include <readline/history.h>
108 #else
109 #if defined(HAVE_EDITLINE_READLINE_H)
110 #include <editline/readline.h>
111 #else
112 #include <readline/readline.h>
113 #ifdef RL_VERSION_MAJOR
114 #include <readline/history.h>
115 #endif
116 #endif
117 #endif
118 #endif
119
120
121 /* -------------------------------------------------------------------*/
122
/* Cast shorthands, written as a prefix to the expression being converted
(for example CS name). They exist because this program keeps its text in
unsigned char buffers while the C library string functions take plain char. */

#define CS   (char *)                 /* to char pointer */
#define CCS  (const char *)           /* to const char pointer */
#define CSS  (char **)                /* to pointer to char pointer */
#define US   (unsigned char *)        /* to unsigned char pointer */
#define CUS  (const unsigned char *)  /* to const unsigned char pointer */
#define USS  (unsigned char **)       /* to pointer to unsigned char pointer */
129
130 /* -------------------------------------------------------------------*/
131
132 static BOOL show_character = FALSE;
133
134 static const unsigned char *type_names[] = {
135 US"Cc", US"Control",
136 US"Cf", US"Format",
137 US"Cn", US"Unassigned",
138 US"Co", US"Private use",
139 US"Cs", US"Surrogate",
140 US"Ll", US"Lower case letter",
141 US"Lm", US"Modifier letter",
142 US"Lo", US"Other letter",
143 US"Lt", US"Title case letter",
144 US"Lu", US"Upper case letter",
145 US"Mc", US"Spacing mark",
146 US"Me", US"Enclosing mark",
147 US"Mn", US"Non-spacing mark",
148 US"Nd", US"Decimal number",
149 US"Nl", US"Letter number",
150 US"No", US"Other number",
151 US"Pc", US"Connector punctuation",
152 US"Pd", US"Dash punctuation",
153 US"Pe", US"Close punctuation",
154 US"Pf", US"Final punctuation",
155 US"Pi", US"Initial punctuation",
156 US"Po", US"Other punctuation",
157 US"Ps", US"Open punctuation",
158 US"Sc", US"Currency symbol",
159 US"Sk", US"Modifier symbol",
160 US"Sm", US"Mathematical symbol",
161 US"So", US"Other symbol",
162 US"Zl", US"Line separator",
163 US"Zp", US"Paragraph separator",
164 US"Zs", US"Space separator"
165 };
166
167 static const unsigned char *gb_names[] = {
168 US"CR", US"carriage return",
169 US"LF", US"linefeed",
170 US"Control", US"",
171 US"Extend", US"",
172 US"Prepend", US"",
173 US"SpacingMark", US"",
174 US"L", US"Hangul syllable type L",
175 US"V", US"Hangul syllable type V",
176 US"T", US"Hangul syllable type T",
177 US"LV", US"Hangul syllable type LV",
178 US"LVT", US"Hangul syllable type LVT",
179 US"Regional_Indicator", US"",
180 US"Other", US"",
181 US"ZWJ", US"zero width joiner",
182 US"Extended_Pictographic", US""
183 };
184
185 static const unsigned char *bd_names[] = {
186 US"AL", US"Arabic letter",
187 US"AN", US"Arabid number",
188 US"B", US"Paragraph separator",
189 US"BN", US"Boundary neutral",
190 US"CS", US"Common separator",
191 US"EN", US"European number",
192 US"ES", US"European separator",
193 US"ET", US"European terminator",
194 US"FSI", US"First string isolate",
195 US"L", US"Left-to-right",
196 US"LRE", US"Left-to-right embedding",
197 US"LRI", US"Left-to-right isolate",
198 US"LRO", US"Left-to-right override",
199 US"NSM", US"Non-spacing mark",
200 US"ON", US"Other neutral",
201 US"PDF", US"Pop directional format",
202 US"PDI", US"Pop directional isolate",
203 US"R", US"Right-to-left",
204 US"RLE", US"Right-to-left embedding",
205 US"RLI", US"Right-to-left isolate",
206 US"RLO", US"Right-to-left override",
207 US"S", US"Segment separator",
208 US"WS", US"White space"
209 };
210
/* Entry [n] is the largest value that ord2utf8() encodes with n continuation
bytes, that is, in n+1 bytes altogether. */

static const unsigned int utf8_table1[] = {
  0x0000007f,   /* 1 byte  */
  0x000007ff,   /* 2 bytes */
  0x0000ffff,   /* 3 bytes */
  0x001fffff,   /* 4 bytes */
  0x03ffffff,   /* 5 bytes */
  0x7fffffff    /* 6 bytes */
};

/* Entry [n] holds the marker bits that ord2utf8() ORs into the first byte of
an encoding that has n continuation bytes. */

static const int utf8_table2[] = {
  0,            /* 1 byte  */
  0xc0,         /* 2 bytes */
  0xe0,         /* 3 bytes */
  0xf0,         /* 4 bytes */
  0xf8,         /* 5 bytes */
  0xfc          /* 6 bytes */
};
216
/* Finish decoding a multi-byte UTF-8 character.

On entry c must already hold the first (lead) byte of the sequence and eptr
must point at the byte that follows it. The lead byte's marker bits select
one of five cases (1 to 5 continuation bytes); the continuation bytes are
merged into c, six bits each, and eptr is advanced past them. On exit c is
the decoded value.

No validation is done: the continuation bytes are read without checking that
they exist or are well formed. Both arguments are evaluated several times, so
they must be plain variables without side effects.

NOTE(review): an included PCRE2 header may define a macro of this name. If it
does, the replacement list here has to stay token-for-token identical to that
definition, so change the layout and comments only - confirm before editing
the tokens. */

#define GETUTF8INC(c, eptr) \
    { \
    if ((c & 0x20u) == 0) \
      c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \
    else if ((c & 0x10u) == 0) \
      { \
      c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \
      eptr += 2; \
      } \
    else if ((c & 0x08u) == 0) \
      { \
      c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \
          ((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \
      eptr += 3; \
      } \
    else if ((c & 0x04u) == 0) \
      { \
      c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \
          ((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \
          (eptr[3] & 0x3fu); \
      eptr += 4; \
      } \
    else \
      { \
      c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \
          ((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \
          ((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \
      eptr += 5; \
      } \
    }
250
251
252
253 /*************************************************
254 * Convert character value to UTF-8 *
255 *************************************************/
256
257 /* This function takes an unsigned long integer value in the range 0 -
258 0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes.
259
260 Arguments:
261 cvalue the character value
262 buffer pointer to buffer for result - at least 6 bytes long
263
264 Returns: number of bytes placed in the buffer
265 0 if input code point is too big
266 */
267
268 static size_t
ord2utf8(unsigned int cvalue,unsigned char * buffer)269 ord2utf8(unsigned int cvalue, unsigned char *buffer)
270 {
271 size_t i, j;
272 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
273 if (cvalue <= utf8_table1[i]) break;
274 if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
275 buffer += i;
276 for (j = i; j > 0; j--)
277 {
278 *buffer-- = 0x80 | (cvalue & 0x3f);
279 cvalue >>= 6;
280 }
281 *buffer = utf8_table2[i] | cvalue;
282 return i + 1;
283 }
284
285
286
287 /*************************************************
288 * Test for interaction *
289 *************************************************/
290
291 static BOOL
is_stdin_tty(void)292 is_stdin_tty(void)
293 {
294 #if defined WIN32
295 return _isatty(_fileno(stdin));
296 #else
297 return isatty(fileno(stdin));
298 #endif
299 }
300
301
302 /*************************************************
303 * Get name from ucp ident *
304 *************************************************/
305
306 /* The utt table contains both full names and abbreviations. So search for both
307 and use the longer if two are found, unless the first one is only 3 characters
308 and we are looking for a script (some scripts have 3-character names). If this
309 were not just a test program it might be worth making some kind of reverse
310 index. */
311
312 static const char *
get_propname(int prop,int type)313 get_propname(int prop, int type)
314 {
315 size_t i, j, len;
316 size_t foundlist[2];
317 const char *yield;
318 int typex = (type == PT_SC)? PT_SCX : type;
319
320 j = 0;
321 for (i = 0; i < PRIV(utt_size); i++)
322 {
323 const ucp_type_table *u = PRIV(utt) + i;
324 if ((u->type == type || u->type == typex) && u->value == prop)
325 {
326 foundlist[j++] = i;
327 if (j >= 2) break;
328 }
329 }
330
331 if (j == 0) return "??";
332
333 yield = NULL;
334 len = 0;
335
336 for (i = 0; i < j; i++)
337 {
338 const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset;
339 size_t sl = strlen(s);
340
341 if (sl > len)
342 {
343 yield = s;
344 if (sl == 3 && type == PT_SC) break;
345 len = sl;
346 }
347 }
348
349 return yield;
350 }
351
352
353 /*************************************************
354 * Print Unicode property info for a char *
355 *************************************************/
356
357 static void
print_prop(unsigned int c,BOOL is_just_one)358 print_prop(unsigned int c, BOOL is_just_one)
359 {
360 int type = UCD_CATEGORY(c);
361 int fulltype = UCD_CHARTYPE(c);
362 int script = UCD_SCRIPT(c);
363 int scriptx = UCD_SCRIPTX(c);
364 int gbprop = UCD_GRAPHBREAK(c);
365 int bidi = UCD_BIDICLASS(c);
366 unsigned int othercase = UCD_OTHERCASE(c);
367 int caseset = UCD_CASESET(c);
368 int bprops = UCD_BPROPS(c);
369
370 const unsigned char *fulltypename = US"??";
371 const unsigned char *typename = US"??";
372 const unsigned char *graphbreak = US"??";
373 const unsigned char *bidiclass = US"??";
374 const unsigned char *scriptname = CUS get_propname(script, PT_SC);
375
376 switch (type)
377 {
378 case ucp_C: typename = US"Control"; break;
379 case ucp_L: typename = US"Letter"; break;
380 case ucp_M: typename = US"Mark"; break;
381 case ucp_N: typename = US"Number"; break;
382 case ucp_P: typename = US"Punctuation"; break;
383 case ucp_S: typename = US"Symbol"; break;
384 case ucp_Z: typename = US"Separator"; break;
385 }
386
387 switch (fulltype)
388 {
389 case ucp_Cc: fulltypename = US"Control"; break;
390 case ucp_Cf: fulltypename = US"Format"; break;
391 case ucp_Cn: fulltypename = US"Unassigned"; break;
392 case ucp_Co: fulltypename = US"Private use"; break;
393 case ucp_Cs: fulltypename = US"Surrogate"; break;
394 case ucp_Ll: fulltypename = US"Lower case letter"; break;
395 case ucp_Lm: fulltypename = US"Modifier letter"; break;
396 case ucp_Lo: fulltypename = US"Other letter"; break;
397 case ucp_Lt: fulltypename = US"Title case letter"; break;
398 case ucp_Lu: fulltypename = US"Upper case letter"; break;
399 case ucp_Mc: fulltypename = US"Spacing mark"; break;
400 case ucp_Me: fulltypename = US"Enclosing mark"; break;
401 case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
402 case ucp_Nd: fulltypename = US"Decimal number"; break;
403 case ucp_Nl: fulltypename = US"Letter number"; break;
404 case ucp_No: fulltypename = US"Other number"; break;
405 case ucp_Pc: fulltypename = US"Connector punctuation"; break;
406 case ucp_Pd: fulltypename = US"Dash punctuation"; break;
407 case ucp_Pe: fulltypename = US"Close punctuation"; break;
408 case ucp_Pf: fulltypename = US"Final punctuation"; break;
409 case ucp_Pi: fulltypename = US"Initial punctuation"; break;
410 case ucp_Po: fulltypename = US"Other punctuation"; break;
411 case ucp_Ps: fulltypename = US"Open punctuation"; break;
412 case ucp_Sc: fulltypename = US"Currency symbol"; break;
413 case ucp_Sk: fulltypename = US"Modifier symbol"; break;
414 case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
415 case ucp_So: fulltypename = US"Other symbol"; break;
416 case ucp_Zl: fulltypename = US"Line separator"; break;
417 case ucp_Zp: fulltypename = US"Paragraph separator"; break;
418 case ucp_Zs: fulltypename = US"Space separator"; break;
419 }
420
421 switch(gbprop)
422 {
423 case ucp_gbCR: graphbreak = US"CR"; break;
424 case ucp_gbLF: graphbreak = US"LF"; break;
425 case ucp_gbControl: graphbreak = US"Control"; break;
426 case ucp_gbExtend: graphbreak = US"Extend"; break;
427 case ucp_gbPrepend: graphbreak = US"Prepend"; break;
428 case ucp_gbSpacingMark: graphbreak = US"SpacingMark"; break;
429 case ucp_gbL: graphbreak = US"Hangul syllable type L"; break;
430 case ucp_gbV: graphbreak = US"Hangul syllable type V"; break;
431 case ucp_gbT: graphbreak = US"Hangul syllable type T"; break;
432 case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break;
433 case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break;
434 case ucp_gbRegional_Indicator:
435 graphbreak = US"Regional Indicator"; break;
436 case ucp_gbOther: graphbreak = US"Other"; break;
437 case ucp_gbZWJ: graphbreak = US"Zero Width Joiner"; break;
438 case ucp_gbExtended_Pictographic:
439 graphbreak = US"Extended Pictographic"; break;
440 default: graphbreak = US"Unknown"; break;
441 }
442
443 switch(bidi)
444 {
445 case ucp_bidiAL: bidiclass = US"AL "; break;
446 case ucp_bidiFSI: bidiclass = US"FSI"; break;
447 case ucp_bidiL: bidiclass = US"L "; break;
448 case ucp_bidiLRE: bidiclass = US"LRE"; break;
449 case ucp_bidiLRI: bidiclass = US"LRI"; break;
450 case ucp_bidiLRO: bidiclass = US"LRO"; break;
451 case ucp_bidiPDF: bidiclass = US"PDF"; break;
452 case ucp_bidiPDI: bidiclass = US"PDI"; break;
453 case ucp_bidiR: bidiclass = US"R "; break;
454 case ucp_bidiRLE: bidiclass = US"RLE"; break;
455 case ucp_bidiRLI: bidiclass = US"RLI"; break;
456 case ucp_bidiRLO: bidiclass = US"RLO"; break;
457 case ucp_bidiAN: bidiclass = US"AN "; break;
458 case ucp_bidiB: bidiclass = US"B "; break;
459 case ucp_bidiBN: bidiclass = US"BN "; break;
460 case ucp_bidiCS: bidiclass = US"CS "; break;
461 case ucp_bidiEN: bidiclass = US"EN "; break;
462 case ucp_bidiES: bidiclass = US"ES "; break;
463 case ucp_bidiET: bidiclass = US"ET "; break;
464 case ucp_bidiNSM: bidiclass = US"NSM"; break;
465 case ucp_bidiON: bidiclass = US"ON "; break;
466 case ucp_bidiS: bidiclass = US"S "; break;
467 case ucp_bidiWS: bidiclass = US"WS "; break;
468 default: bidiclass = US"???"; break;
469 }
470
471 printf("U+%04X %s %s: %s, %s, %s", c, bidiclass, typename, fulltypename,
472 scriptname, graphbreak);
473
474 if (is_just_one && othercase != c)
475 {
476 printf(", U+%04X", othercase);
477 if (caseset != 0)
478 {
479 const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
480 while (*(++p) < NOTACHAR)
481 {
482 unsigned int d = *p;
483 if (d != othercase && d != c) printf(", U+%04X", d);
484 }
485 }
486 }
487
488 if (scriptx != 0)
489 {
490 const char *sep = "";
491 const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
492 printf(", [");
493 for (int i = 0; i < ucp_Unknown; i++)
494 if (MAPBIT(p, i) != 0)
495 {
496 printf("%s%s", sep, get_propname(i, PT_SC));
497 sep = ", ";
498 }
499 printf("]");
500 }
501
502 if (bprops != 0)
503 {
504 const char *sep = "";
505 const uint32_t *p = PRIV(ucd_boolprop_sets) +
506 bprops * ucd_boolprop_sets_item_size;
507 printf(", [");
508 for (int i = 0; i < ucp_Bprop_Count; i++)
509 if (MAPBIT(p, i) != 0)
510 {
511 printf("%s%s", sep, get_propname(i, PT_BOOL));
512 sep = ", ";
513 }
514 printf("]");
515 }
516
517 if (show_character && is_just_one)
518 {
519 unsigned char buffer[8];
520 size_t len = ord2utf8(c, buffer);
521 printf(", >%.*s<", (int)len, buffer);
522 }
523
524 printf("\n");
525 }
526
527
528
529 /*************************************************
530 * Find character(s) with given property/ies *
531 *************************************************/
532
533 static void
find_chars(unsigned char * s)534 find_chars(unsigned char *s)
535 {
536 unsigned char name[128];
537 unsigned char value[128];
538 unsigned char *t;
539 unsigned int count= 0;
540 int scriptx_list[128];
541 unsigned int scriptx_count = 0;
542 int bprop_list[128];
543 unsigned int bprop_count = 0;
544 uint32_t i, c;
545 int script = -1;
546 int type = -1;
547 int gbreak = -1;
548 int bidiclass = -1;
549 BOOL bidicontrol = FALSE;
550 BOOL script_not = FALSE;
551 BOOL type_not = FALSE;
552 BOOL gbreak_not = FALSE;
553 BOOL bidiclass_not = FALSE;
554 BOOL hadrange = FALSE;
555 const ucd_record *ucd, *next_ucd;
556 const char *pad = " ";
557
558 while (*s != 0)
559 {
560 unsigned int offset = 0;
561 BOOL scriptx_not = FALSE;
562 char *value_start;
563
564 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
565 *t = 0;
566 while (isspace(*s)) s++;
567 value_start = s;
568
569 for (t = value; *s != 0 && !isspace(*s); s++)
570 {
571 if (*s != '_' && *s != '-') *t++ = *s;
572 }
573 *t = 0;
574 while (isspace(*s)) s++;
575
576 if (strcmp(CS name, "script") == 0 ||
577 strcmp(CS name, "scriptx") == 0)
578 {
579 for (t = value; *t != 0; t++) *t = tolower(*t);
580
581 if (value[0] == '!')
582 {
583 if (name[6] == 'x') scriptx_not = TRUE;
584 else script_not = TRUE;
585 offset = 1;
586 }
587
588 for (i = 0; i < PRIV(utt_size); i++)
589 {
590 const ucp_type_table *u = PRIV(utt) + i;
591 if ((u->type == PT_SCX || u->type == PT_SC) && strcmp(CS(value + offset),
592 PRIV(utt_names) + u->name_offset) == 0)
593 {
594 c = u->value;
595 if (name[6] == 'x')
596 {
597 scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
598 }
599 else
600 {
601 if (script < 0) script = c; else
602 {
603 printf("** Only 1 script value allowed\n");
604 return;
605 }
606 }
607 break;
608 }
609 }
610
611 if (i >= PRIV(utt_size))
612 {
613 printf("** Unrecognized script name \"%s\"\n", value);
614 return;
615 }
616 }
617
618 else if (strcmp(CS name, "bool") == 0)
619 {
620 int not = 1;
621 if (value[0] == '!')
622 {
623 not = -1;
624 offset = 1;
625 }
626
627 for (i = 0; i < PRIV(utt_size); i++)
628 {
629 const ucp_type_table *u = PRIV(utt) + i;
630 if (u->type == PT_BOOL && strcmp(CS(value + offset),
631 PRIV(utt_names) + u->name_offset) == 0)
632 {
633 bprop_list[bprop_count++] = u->value * not;
634 break;
635 }
636 }
637
638 if (i >= PRIV(utt_size))
639 {
640 printf("** Unrecognized property name \"%s\"\n", value);
641 return;
642 }
643 }
644
645 else if (strcmp(CS name, "type") == 0)
646 {
647 if (type >= 0)
648 {
649 printf("** Only 1 type value allowed\n");
650 return;
651 }
652 else
653 {
654 if (value[0] == '!')
655 {
656 type_not = TRUE;
657 offset = 1;
658 }
659
660 for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
661 {
662 if (strcmp(CS (value + offset), CS type_names[i]) == 0)
663 {
664 type = i/2;
665 break;
666 }
667 }
668 if (i >= sizeof(type_names)/sizeof(char *))
669 {
670 printf("** Unrecognized type name \"%s\"\n", value);
671 return;
672 }
673 }
674 }
675
676 else if (strcmp(CS name, "gbreak") == 0)
677 {
678 if (gbreak >= 0)
679 {
680 printf("** Only 1 grapheme break value allowed\n");
681 return;
682 }
683 else
684 {
685 if (value[0] == '!')
686 {
687 gbreak_not = TRUE;
688 offset = 1;
689 }
690
691 for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
692 {
693 if (strcmp(CS (value + offset), CS gb_names[i]) == 0)
694 {
695 gbreak = i/2;
696 break;
697 }
698 }
699 if (i >= sizeof(gb_names)/sizeof(char *))
700 {
701 printf("** Unrecognized gbreak name \"%s\"\n", value);
702 return;
703 }
704 }
705 }
706
707 else if (strcmp(CS name, "bidi") == 0 ||
708 strcmp(CS name, "bidiclass") == 0 ||
709 strcmp(CS name, "bidi_class") == 0 )
710 {
711 if (bidiclass >= 0)
712 {
713 printf("** Only 1 bidi class value allowed\n");
714 return;
715 }
716 else
717 {
718 if (value[0] == '!')
719 {
720 bidiclass_not = TRUE;
721 offset = 1;
722 }
723 for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
724 {
725 if (strcasecmp(CS (value + offset), CS bd_names[i]) == 0)
726 {
727 bidiclass = i/2;
728 break;
729 }
730 }
731 if (i >= sizeof(bd_names)/sizeof(char *))
732 {
733 printf("** Unrecognized bidi class name \"%s\"\n", value);
734 return;
735 }
736 }
737 }
738
739 else
740 {
741 printf("** Unrecognized property name \"%s\"\n", name);
742 return;
743 }
744 }
745
746 if (script < 0 && scriptx_count == 0 && bprop_count == 0 && type < 0 &&
747 gbreak < 0 && bidiclass < 0)
748 {
749 printf("** No properties specified\n");
750 return;
751 }
752
753 for (c = 0; c <= 0x10ffff; c++)
754 {
755 if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;
756
757 if (scriptx_count > 0)
758 {
759 const uint32_t *bits_scriptx = PRIV(ucd_script_sets) + UCD_SCRIPTX(c);
760 unsigned int found = 0;
761
762 for (i = 0; i < scriptx_count; i++)
763 {
764 int x = scriptx_list[i]/32;
765 int y = scriptx_list[i]%32;
766
767 /* Positive requirment */
768 if (scriptx_list[i] >= 0)
769 {
770 if ((bits_scriptx[x] & (1u<<y)) != 0) found++;
771 }
772 /* Negative requirement */
773 else
774 {
775 if ((bits_scriptx[x] & (1u<<y)) == 0) found++;
776 }
777 }
778
779 if (found != scriptx_count) continue;
780 }
781
782 if (bprop_count > 0)
783 {
784 const uint32_t *bits_bprop = PRIV(ucd_boolprop_sets) +
785 UCD_BPROPS(c) * ucd_boolprop_sets_item_size;
786 unsigned int found = 0;
787
788 for (i = 0; i < bprop_count; i++)
789 {
790 int x = bprop_list[i]/32;
791 int y = bprop_list[i]%32;
792
793 /* Positive requirement */
794 if (bprop_list[i] >= 0)
795 {
796 if ((bits_bprop[x] & (1u<<y)) != 0) found++;
797 }
798 /* Negative requirement */
799 else
800 {
801 if ((bits_bprop[-x] & (1u<<(-y))) == 0) found++;
802 }
803 }
804
805 if (found != bprop_count) continue;
806 }
807
808 if (type >= 0)
809 {
810 if (type_not)
811 {
812 if (type == UCD_CHARTYPE(c)) continue;
813 }
814 else
815 {
816 if (type != UCD_CHARTYPE(c)) continue;
817 }
818 }
819
820 if (gbreak >= 0)
821 {
822 if (gbreak_not)
823 {
824 if (gbreak == UCD_GRAPHBREAK(c)) continue;
825 }
826 else
827 {
828 if (gbreak != UCD_GRAPHBREAK(c)) continue;
829 }
830 }
831
832 if (bidiclass >= 0)
833 {
834 if (bidiclass_not)
835 {
836 if (bidiclass == UCD_BIDICLASS(c)) continue;
837 }
838 else
839 {
840 if (bidiclass != UCD_BIDICLASS(c)) continue;
841 }
842 }
843
844 /* All conditions are met. Look for runs. */
845
846 ucd = GET_UCD(c);
847
848 for (i = c + 1; i < 0x10ffff; i++)
849 {
850 next_ucd = GET_UCD(i);
851 if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
852 }
853
854 if (--i > c)
855 {
856 printf("U+%04X..", c);
857 c = i;
858 hadrange = TRUE;
859 }
860 else if (hadrange) printf("%s", pad);
861
862 print_prop(c, FALSE);
863 if (c >= 0x100000) pad = " ";
864 else if (c >= 0x10000) pad = " ";
865 count++;
866 if (count >= 100)
867 {
868 printf("...\n");
869 break;
870 }
871 }
872
873 if (count == 0) printf("No characters found\n");
874 }
875
876
877 /*************************************************
878 * Process command line *
879 *************************************************/
880
881 static void
process_command_line(unsigned char * buffer)882 process_command_line(unsigned char *buffer)
883 {
884 unsigned char *s, *t;
885 unsigned char name[24];
886
887 s = buffer;
888 while (isspace(*s)) s++;
889 if (*s == 0) return;
890
891 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
892 *t = 0;
893 while (isspace(*s)) s++;
894
895 if (strcmp(CS name, "findprop") == 0)
896 {
897 while (*s != 0)
898 {
899 unsigned int c;
900 unsigned char *endptr;
901 t = s;
902
903 if (*t == '+')
904 {
905 c = *(++t);
906 if (c > 0x7fu)
907 {
908 GETCHARINC(c, t);
909 }
910 endptr = t+1;
911 }
912 else
913 {
914 if (strncmp(CS t, "U+", 2) == 0) t += 2;
915 c = strtoul(CS t, CSS(&endptr), 16);
916 }
917
918 if (*endptr != 0 && !isspace(*endptr))
919 {
920 while (*endptr != 0 && !isspace(*endptr)) endptr++;
921 printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s);
922 }
923 else
924 {
925 if (c > 0x10ffff)
926 printf("** U+%x is too big for a Unicode code point\n", c);
927 else
928 print_prop(c, TRUE);
929 }
930 s = endptr;
931 while (isspace(*s)) s++;
932 }
933 }
934
935 else if (strcmp(CS name, "find") == 0)
936 {
937 find_chars(s);
938 }
939
940 else if (strcmp(CS name, "list") == 0)
941 {
942 while (*s != 0)
943 {
944 size_t i;
945 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
946 *t = 0;
947 while (isspace(*s)) s++;
948
949 if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
950 {
951 for (i = 0; i < PRIV(utt_size); i++)
952 if (PRIV(utt)[i].type == PT_SCX || PRIV(utt)[i].type == PT_SC)
953 printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
954 }
955
956 else if (strcmp(CS name, "bool") == 0)
957 {
958 for (i = 0; i < PRIV(utt_size); i++)
959 if (PRIV(utt)[i].type == PT_BOOL)
960 printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
961 }
962
963 else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
964 {
965 for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
966 printf("%s %s\n", type_names[i], type_names[i+1]);
967 }
968
969 else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
970 {
971 for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
972 {
973 if (gb_names[i+1][0] != 0)
974 printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
975 else
976 printf("%s\n", gb_names[i]);
977 }
978 }
979
980 else if (strcmp(CS name, "bidi") == 0 ||
981 strcmp(CS name, "bidiclasses") == 0)
982 {
983 for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
984 printf("%3s %s\n", bd_names[i], bd_names[i+1]);
985 }
986
987 else
988 {
989 printf("** Unknown property \"%s\"\n", name);
990 break;
991 }
992 }
993 }
994
995 else printf("** Unknown test command \"%s\"\n", name);
996 }
997
998
999
1000 /*************************************************
1001 * Main program *
1002 *************************************************/
1003
1004 int
main(int argc,char ** argv)1005 main(int argc, char **argv)
1006 {
1007 BOOL interactive;
1008 int first_arg = 1;
1009 unsigned char buffer[1024];
1010
1011 if (argc > 1 && strcmp(argv[1], "-s") == 0)
1012 {
1013 show_character = TRUE;
1014 first_arg++;
1015 }
1016
1017 if (argc > first_arg)
1018 {
1019 int i;
1020 BOOL datafirst = TRUE;
1021 char *arg = argv[first_arg];
1022 unsigned char *s = buffer;
1023
1024 if (*arg != '+' && strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
1025 {
1026 while (*arg != 0)
1027 {
1028 if (!isxdigit(*arg++)) { datafirst = FALSE; break; }
1029 }
1030 }
1031
1032 if (datafirst)
1033 {
1034 strcpy(CS s, "findprop ");
1035 s += 9;
1036 }
1037
1038 for (i = first_arg; i < argc; i++)
1039 {
1040 s += sprintf(CS s, "%s ", argv[i]);
1041 }
1042
1043 process_command_line(buffer);
1044 return 0;
1045 }
1046
1047 interactive = is_stdin_tty();
1048
1049 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1050 if (interactive) using_history();
1051 #endif
1052
1053 for(;;)
1054 {
1055 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1056 if (interactive)
1057 {
1058 size_t len;
1059 unsigned char *s = US readline("> ");
1060 if (s == NULL) break;
1061 len = strlen(CS s);
1062 if (len > 0) add_history(CS s);
1063 memcpy(buffer, s, len);
1064 buffer[len] = '\n';
1065 buffer[len+1] = 0;
1066 free(s);
1067 }
1068 else
1069 #endif
1070
1071 {
1072 if (interactive) printf("> ");
1073 if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
1074 if (!interactive) printf("%s", buffer);
1075 }
1076
1077 process_command_line(buffer);
1078 }
1079
1080 if (interactive) printf("\n");
1081
1082 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1083 if (interactive) clear_history();
1084 #endif
1085
1086 return 0;
1087 }
1088
1089 /* End */
1090