1 /***************************************************
2 * A program for testing the Unicode property table *
3 ***************************************************/
4
5 /* Copyright (c) University of Cambridge 2008-2022 */
6
7 /* Compile thus:
8
9 gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
10 ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
11
12 Add -lreadline or -ledit if PCRE2 was configured with readline or libedit
13 support in pcre2test.
14 */
15
16 /* This is a hacked-up program for testing the Unicode properties tables of
17 PCRE2. It can also be used for finding characters with certain properties. I
18 wrote it to help with debugging, and have added things that I found useful, in
19 a rather haphazard way. The code has never been seriously tidied or checked for
20 robustness, but it shouldn't now give compiler warnings.
21
22 There is only one option: "-s". If given, it applies only to the "findprop"
23 command. It causes the UTF-8 sequence of bytes that encode the character to be
24 output between angle brackets at the end of the line. On a UTF-8 terminal, this
25 will show the appropriate graphic for the code point.
26
27 If the command has arguments, they are concatenated into a buffer, separated by
28 spaces. If the first argument starts "U+" or consists entirely of hexadecimal
29 digits, "findprop" is inserted at the start. The buffer is then processed as a
30 single line file, after which the program exits. If there are no arguments, the
31 program reads commands line by line on stdin and writes output to stdout. The
32 return code is always zero.
33
34 There are three commands:
35
36 The command "findprop" must be followed by a space-separated list of Unicode
37 code points as hex numbers, either without any prefix or starting with "U+", or
38 as individual UTF-8 characters preceded by '+'. For example:
39
40 findprop U+1234 5Abc +?
41
42 The output is one long line per character, listing Unicode properties that have
43 values, followed by its other case or cases if one or more exist, followed by
44 its Script Extension list if there is one. This list is in square brackets. A
45 second list in square brackets gives all the Boolean properties of the
46 character. The properties that come first are:
47
48 Bidi class e.g. NSM (most common is L)
49 General type e.g. Letter
50 Specific type e.g. Upper case letter
51 Script e.g. Medefaidrin
52 Grapheme break type e.g. Extend (most common is Other)
53
54 Script names and Boolean property names are all in lower case, with underscores
55 and hyphens removed, because that's how they are stored for "loose" matching.
56
The command "find" must be followed by a list of property types and their
values. The values are case-sensitive, except for script names and bidi
classes. This finds characters that have those properties. If multiple
properties are listed, they must all be matched. Currently supported:
61
62 script <name> The character must have this script property. Only one
63 such script may be given.
64 scriptx <name> This script must be in the character's Script Extension
65 property list. If this is used many times, all the given
66 scripts must be present.
67 type <abbrev> The character's specific type (e.g. Lu or Nd) must match.
68 gbreak <name> The grapheme break property must match.
69 bidi <class> The character's bidi class must match.
70 bool <name> The character's Boolean property list must contain this
71 property.
72
73 If a <name> or <abbrev> is preceded by !, the value must NOT be present. For
74 Script Extensions and Boolean properties, there may be a mixture of positive
75 and negative requirements. All must be satisfied.
76
77 Sequences of two or more characters are shown as ranges, for example
U+0041..U+004A. No more than 100 lines are output. If there are more
79 characters, the list ends with ...
80
81 The command "list" must be followed by one of property names script, bool,
82 type, gbreak or bidi. The defined values for that property are listed. */
83
84
85 #ifdef HAVE_CONFIG_H
86 #include "../src/config.h"
87 #endif
88
89 #ifndef SUPPORT_UNICODE
90 #define SUPPORT_UNICODE
91 #endif
92
93 #include <ctype.h>
94 #include <stdio.h>
95 #include <stdlib.h>
96 #include <string.h>
97 #include "../src/pcre2_internal.h"
98 #include "../src/pcre2_ucp.h"
99
100 #ifdef HAVE_UNISTD_H
101 #include <unistd.h>
102 #endif
103
104 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
105 #if defined(SUPPORT_LIBREADLINE)
106 #include <readline/readline.h>
107 #include <readline/history.h>
108 #else
109 #if defined(HAVE_EDITLINE_READLINE_H)
110 #include <editline/readline.h>
111 #else
112 #include <readline/readline.h>
113 #ifdef RL_VERSION_MAJOR
114 #include <readline/history.h>
115 #endif
116 #endif
117 #endif
118 #endif
119
120
121 /* -------------------------------------------------------------------*/
122
/* Shorthand casts. The strings handled by this program are held as unsigned
char, so these are used to convert to and from the plain char pointers that
string literals and the standard library functions use (for example US"text"
and strcmp(CS name, ...)). */

#define CS (char *)
#define CCS (const char *)
#define CSS (char **)
#define US (unsigned char *)
#define CUS (const unsigned char *)
#define USS (unsigned char **)
129
130 /* -------------------------------------------------------------------*/
131
/* Set to TRUE by main() when the first command line argument is "-s". When it
is set, print_prop() adds the UTF-8 encoding of the character, between ">" and
"<", to the end of the line for a single-character request. */

static BOOL show_character = FALSE;
133
/* Specific character types, as pairs of strings: the two-letter abbreviation
followed by a description. The "list type" command prints both strings of each
pair. The "find type" command compares its argument with the abbreviations and
uses the number of the matching pair (index/2) as the value that is tested
against UCD_CHARTYPE(), so the order of the pairs is significant.
NOTE(review): presumably the order has to match the ucp_Cc ... ucp_Zs
enumeration in pcre2_ucp.h - confirm before reordering or adding entries. */

static const unsigned char *type_names[] = {
US"Cc", US"Control",
US"Cf", US"Format",
US"Cn", US"Unassigned",
US"Co", US"Private use",
US"Cs", US"Surrogate",
US"Ll", US"Lower case letter",
US"Lm", US"Modifier letter",
US"Lo", US"Other letter",
US"Lt", US"Title case letter",
US"Lu", US"Upper case letter",
US"Mc", US"Spacing mark",
US"Me", US"Enclosing mark",
US"Mn", US"Non-spacing mark",
US"Nd", US"Decimal number",
US"Nl", US"Letter number",
US"No", US"Other number",
US"Pc", US"Connector punctuation",
US"Pd", US"Dash punctuation",
US"Pe", US"Close punctuation",
US"Pf", US"Final punctuation",
US"Pi", US"Initial punctuation",
US"Po", US"Other punctuation",
US"Ps", US"Open punctuation",
US"Sc", US"Currency symbol",
US"Sk", US"Modifier symbol",
US"Sm", US"Mathematical symbol",
US"So", US"Other symbol",
US"Zl", US"Line separator",
US"Zp", US"Paragraph separator",
US"Zs", US"Space separator"
};
166
/* Grapheme break property values, as pairs of strings: the name followed by
an optional description (an empty string when there is none). The "list
gbreak" command prints each name, adding the description in parentheses when
it is not empty. The "find gbreak" command compares its argument with the
names and uses the number of the matching pair (index/2) as the value that is
tested against UCD_GRAPHBREAK(), so the order of the pairs is significant.
NOTE(review): presumably the order has to match the ucp_gbCR ... enumeration
in pcre2_ucp.h - confirm before reordering or adding entries. */

static const unsigned char *gb_names[] = {
US"CR", US"carriage return",
US"LF", US"linefeed",
US"Control", US"",
US"Extend", US"",
US"Prepend", US"",
US"SpacingMark", US"",
US"L", US"Hangul syllable type L",
US"V", US"Hangul syllable type V",
US"T", US"Hangul syllable type T",
US"LV", US"Hangul syllable type LV",
US"LVT", US"Hangul syllable type LVT",
US"Regional_Indicator", US"",
US"Other", US"",
US"ZWJ", US"zero width joiner",
US"Extended_Pictographic", US""
};
184
185 static const unsigned char *bd_names[] = {
186 US"AL", US"Arabic letter",
187 US"AN", US"Arabid number",
188 US"B", US"Paragraph separator",
189 US"BN", US"Boundary neutral",
190 US"CS", US"Common separator",
191 US"EN", US"European number",
192 US"ES", US"European separator",
193 US"ET", US"European terminator",
194 US"FSI", US"First string isolate",
195 US"L", US"Left-to-right",
196 US"LRE", US"Left-to-right embedding",
197 US"LRI", US"Left-to-right isolate",
198 US"LRO", US"Left-to-right override",
199 US"NSM", US"Non-spacing mark",
200 US"ON", US"Other neutral",
201 US"PDF", US"Pop directional format",
202 US"PDI", US"Pop directional isolate",
203 US"R", US"Right-to-left",
204 US"RLE", US"Right-to-left embedding",
205 US"RLI", US"Right-to-left isolate",
206 US"RLO", US"Right-to-left override",
207 US"S", US"Segment separator",
208 US"WS", US"White space"
209 };
210
/* Tables used by ord2utf8(). utf8_table1[i] is the largest value that can be
encoded in i+1 bytes. utf8_table2[i] contains the bits that are ORed into the
first byte of an encoding that is i+1 bytes long. */

static const unsigned int utf8_table1[] = {
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};

static const int utf8_table2[] = {
0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
216
/* Macro to pick up the remaining bytes of a UTF-8 character, advancing
the pointer. On entry, c must contain the first byte of a multi-byte sequence
and eptr must point to the byte that follows it. The length of the sequence
(2 to 6 bytes) is deduced from the top bits of c. On exit, c contains the
decoded value and eptr has been advanced past the bytes that were used. The
following bytes are masked with 0x3f but are not otherwise checked, so a
malformed or truncated sequence is not detected.

Both arguments are evaluated more than once and are not parenthesized, so they
must be simple variables.

NOTE(review): nothing in this file refers to this macro by name (the code uses
GETCHARINC). Presumably the included PCRE2 headers provide GETCHARINC in terms
of a macro with this name - confirm before changing any token here, because a
non-identical redefinition would not compile cleanly. */

#define GETUTF8INC(c, eptr) \
{ \
if ((c & 0x20u) == 0) \
c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \
else if ((c & 0x10u) == 0) \
{ \
c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \
eptr += 2; \
} \
else if ((c & 0x08u) == 0) \
{ \
c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \
((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \
eptr += 3; \
} \
else if ((c & 0x04u) == 0) \
{ \
c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \
((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \
(eptr[3] & 0x3fu); \
eptr += 4; \
} \
else \
{ \
c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \
((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \
((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \
eptr += 5; \
} \
}
250
251
252
253 /*************************************************
254 * Convert character value to UTF-8 *
255 *************************************************/
256
257 /* This function takes an unsigned long integer value in the range 0 -
258 0x7fffffff and encodes it as a UTF-8 character in 1 to 6 bytes.
259
260 Arguments:
261 cvalue the character value
262 buffer pointer to buffer for result - at least 6 bytes long
263
264 Returns: number of bytes placed in the buffer
265 0 if input code point is too big
266 */
267
268 static size_t
ord2utf8(unsigned int cvalue,unsigned char * buffer)269 ord2utf8(unsigned int cvalue, unsigned char *buffer)
270 {
271 size_t i, j;
272 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
273 if (cvalue <= utf8_table1[i]) break;
274 if (i >= sizeof(utf8_table1)/sizeof(int)) return 0;
275 buffer += i;
276 for (j = i; j > 0; j--)
277 {
278 *buffer-- = 0x80 | (cvalue & 0x3f);
279 cvalue >>= 6;
280 }
281 *buffer = utf8_table2[i] | cvalue;
282 return i + 1;
283 }
284
285
286
287 /*************************************************
288 * Test for interaction *
289 *************************************************/
290
291 static BOOL
is_stdin_tty(void)292 is_stdin_tty(void)
293 {
294 #if defined WIN32
295 return _isatty(_fileno(stdin));
296 #else
297 return isatty(fileno(stdin));
298 #endif
299 }
300
301
302 /*************************************************
303 * Get name from ucp ident *
304 *************************************************/
305
306 /* The utt table contains both full names and abbreviations. So search for both
307 and use the longer if two are found, unless the first one is only 3 characters
308 and we are looking for a script (some scripts have 3-character names). If this
309 were not just a test program it might be worth making some kind of reverse
310 index. */
311
312 static const char *
get_propname(int prop,int type)313 get_propname(int prop, int type)
314 {
315 size_t i, j, len;
316 size_t foundlist[2];
317 const char *yield;
318 int typex = (type == PT_SC)? PT_SCX : type;
319
320 j = 0;
321 for (i = 0; i < PRIV(utt_size); i++)
322 {
323 const ucp_type_table *u = PRIV(utt) + i;
324 if ((u->type == type || u->type == typex) && u->value == prop)
325 {
326 foundlist[j++] = i;
327 if (j >= 2) break;
328 }
329 }
330
331 if (j == 0) return "??";
332
333 yield = NULL;
334 len = 0;
335
336 for (i = 0; i < j; i++)
337 {
338 const char *s = PRIV(utt_names) + (PRIV(utt) + foundlist[i])->name_offset;
339 size_t sl = strlen(s);
340
341 if (sl > len)
342 {
343 yield = s;
344 if (sl == 3 && type == PT_SC) break;
345 len = sl;
346 }
347 }
348
349 return yield;
350 }
351
352
353 /*************************************************
354 * Print Unicode property info for a char *
355 *************************************************/
356
357 static void
print_prop(unsigned int c,BOOL is_just_one)358 print_prop(unsigned int c, BOOL is_just_one)
359 {
360 int type = UCD_CATEGORY(c);
361 int fulltype = UCD_CHARTYPE(c);
362 int script = UCD_SCRIPT(c);
363 int scriptx = UCD_SCRIPTX(c);
364 int gbprop = UCD_GRAPHBREAK(c);
365 int bidi = UCD_BIDICLASS(c);
366 unsigned int othercase = UCD_OTHERCASE(c);
367 int caseset = UCD_CASESET(c);
368 int bprops = UCD_BPROPS(c);
369
370 const unsigned char *fulltypename = US"??";
371 const unsigned char *typename = US"??";
372 const unsigned char *graphbreak = US"??";
373 const unsigned char *bidiclass = US"??";
374 const unsigned char *scriptname = CUS get_propname(script, PT_SC);
375
376 switch (type)
377 {
378 case ucp_C: typename = US"Control"; break;
379 case ucp_L: typename = US"Letter"; break;
380 case ucp_M: typename = US"Mark"; break;
381 case ucp_N: typename = US"Number"; break;
382 case ucp_P: typename = US"Punctuation"; break;
383 case ucp_S: typename = US"Symbol"; break;
384 case ucp_Z: typename = US"Separator"; break;
385 }
386
387 switch (fulltype)
388 {
389 case ucp_Cc: fulltypename = US"Control"; break;
390 case ucp_Cf: fulltypename = US"Format"; break;
391 case ucp_Cn: fulltypename = US"Unassigned"; break;
392 case ucp_Co: fulltypename = US"Private use"; break;
393 case ucp_Cs: fulltypename = US"Surrogate"; break;
394 case ucp_Ll: fulltypename = US"Lower case letter"; break;
395 case ucp_Lm: fulltypename = US"Modifier letter"; break;
396 case ucp_Lo: fulltypename = US"Other letter"; break;
397 case ucp_Lt: fulltypename = US"Title case letter"; break;
398 case ucp_Lu: fulltypename = US"Upper case letter"; break;
399 case ucp_Mc: fulltypename = US"Spacing mark"; break;
400 case ucp_Me: fulltypename = US"Enclosing mark"; break;
401 case ucp_Mn: fulltypename = US"Non-spacing mark"; break;
402 case ucp_Nd: fulltypename = US"Decimal number"; break;
403 case ucp_Nl: fulltypename = US"Letter number"; break;
404 case ucp_No: fulltypename = US"Other number"; break;
405 case ucp_Pc: fulltypename = US"Connector punctuation"; break;
406 case ucp_Pd: fulltypename = US"Dash punctuation"; break;
407 case ucp_Pe: fulltypename = US"Close punctuation"; break;
408 case ucp_Pf: fulltypename = US"Final punctuation"; break;
409 case ucp_Pi: fulltypename = US"Initial punctuation"; break;
410 case ucp_Po: fulltypename = US"Other punctuation"; break;
411 case ucp_Ps: fulltypename = US"Open punctuation"; break;
412 case ucp_Sc: fulltypename = US"Currency symbol"; break;
413 case ucp_Sk: fulltypename = US"Modifier symbol"; break;
414 case ucp_Sm: fulltypename = US"Mathematical symbol"; break;
415 case ucp_So: fulltypename = US"Other symbol"; break;
416 case ucp_Zl: fulltypename = US"Line separator"; break;
417 case ucp_Zp: fulltypename = US"Paragraph separator"; break;
418 case ucp_Zs: fulltypename = US"Space separator"; break;
419 }
420
421 switch(gbprop)
422 {
423 case ucp_gbCR: graphbreak = US"CR"; break;
424 case ucp_gbLF: graphbreak = US"LF"; break;
425 case ucp_gbControl: graphbreak = US"Control"; break;
426 case ucp_gbExtend: graphbreak = US"Extend"; break;
427 case ucp_gbPrepend: graphbreak = US"Prepend"; break;
428 case ucp_gbSpacingMark: graphbreak = US"SpacingMark"; break;
429 case ucp_gbL: graphbreak = US"Hangul syllable type L"; break;
430 case ucp_gbV: graphbreak = US"Hangul syllable type V"; break;
431 case ucp_gbT: graphbreak = US"Hangul syllable type T"; break;
432 case ucp_gbLV: graphbreak = US"Hangul syllable type LV"; break;
433 case ucp_gbLVT: graphbreak = US"Hangul syllable type LVT"; break;
434 case ucp_gbRegional_Indicator:
435 graphbreak = US"Regional Indicator"; break;
436 case ucp_gbOther: graphbreak = US"Other"; break;
437 case ucp_gbZWJ: graphbreak = US"Zero Width Joiner"; break;
438 case ucp_gbExtended_Pictographic:
439 graphbreak = US"Extended Pictographic"; break;
440 default: graphbreak = US"Unknown"; break;
441 }
442
443 switch(bidi)
444 {
445 case ucp_bidiAL: bidiclass = US"AL "; break;
446 case ucp_bidiFSI: bidiclass = US"FSI"; break;
447 case ucp_bidiL: bidiclass = US"L "; break;
448 case ucp_bidiLRE: bidiclass = US"LRE"; break;
449 case ucp_bidiLRI: bidiclass = US"LRI"; break;
450 case ucp_bidiLRO: bidiclass = US"LRO"; break;
451 case ucp_bidiPDF: bidiclass = US"PDF"; break;
452 case ucp_bidiPDI: bidiclass = US"PDI"; break;
453 case ucp_bidiR: bidiclass = US"R "; break;
454 case ucp_bidiRLE: bidiclass = US"RLE"; break;
455 case ucp_bidiRLI: bidiclass = US"RLI"; break;
456 case ucp_bidiRLO: bidiclass = US"RLO"; break;
457 case ucp_bidiAN: bidiclass = US"AN "; break;
458 case ucp_bidiB: bidiclass = US"B "; break;
459 case ucp_bidiBN: bidiclass = US"BN "; break;
460 case ucp_bidiCS: bidiclass = US"CS "; break;
461 case ucp_bidiEN: bidiclass = US"EN "; break;
462 case ucp_bidiES: bidiclass = US"ES "; break;
463 case ucp_bidiET: bidiclass = US"ET "; break;
464 case ucp_bidiNSM: bidiclass = US"NSM"; break;
465 case ucp_bidiON: bidiclass = US"ON "; break;
466 case ucp_bidiS: bidiclass = US"S "; break;
467 case ucp_bidiWS: bidiclass = US"WS "; break;
468 default: bidiclass = US"???"; break;
469 }
470
471 printf("U+%04X %s %s: %s, %s, %s", c, bidiclass, typename, fulltypename,
472 scriptname, graphbreak);
473
474 if (is_just_one && othercase != c)
475 {
476 printf(", U+%04X", othercase);
477 if (caseset != 0)
478 {
479 const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
480 while (*(++p) < NOTACHAR)
481 {
482 unsigned int d = *p;
483 if (d != othercase && d != c) printf(", U+%04X", d);
484 }
485 }
486 }
487
488 if (scriptx != 0)
489 {
490 const char *sep = "";
491 const uint32_t *p = PRIV(ucd_script_sets) + scriptx;
492 printf(", [");
493 for (int i = 0; i < ucp_Unknown; i++)
494 if (MAPBIT(p, i) != 0)
495 {
496 printf("%s%s", sep, get_propname(i, PT_SC));
497 sep = ", ";
498 }
499 printf("]");
500 }
501
502 if (bprops != 0)
503 {
504 const char *sep = "";
505 const uint32_t *p = PRIV(ucd_boolprop_sets) +
506 bprops * ucd_boolprop_sets_item_size;
507 printf(", [");
508 for (int i = 0; i < ucp_Bprop_Count; i++)
509 if (MAPBIT(p, i) != 0)
510 {
511 printf("%s%s", sep, get_propname(i, PT_BOOL));
512 sep = ", ";
513 }
514 printf("]");
515 }
516
517 if (show_character && is_just_one)
518 {
519 unsigned char buffer[8];
520 size_t len = ord2utf8(c, buffer);
521 printf(", >%.*s<", (int)len, buffer);
522 }
523
524 printf("\n");
525 }
526
527
528
529 /*************************************************
530 * Find character(s) with given property/ies *
531 *************************************************/
532
533 static void
find_chars(unsigned char * s)534 find_chars(unsigned char *s)
535 {
536 unsigned char name[128];
537 unsigned char value[128];
538 unsigned char *t;
539 unsigned int count= 0;
540 int scriptx_list[128];
541 unsigned int scriptx_count = 0;
542 int bprop_list[128];
543 unsigned int bprop_count = 0;
544 uint32_t i, c;
545 int script = -1;
546 int type = -1;
547 int gbreak = -1;
548 int bidiclass = -1;
549 BOOL script_not = FALSE;
550 BOOL type_not = FALSE;
551 BOOL gbreak_not = FALSE;
552 BOOL bidiclass_not = FALSE;
553 BOOL hadrange = FALSE;
554 const ucd_record *ucd, *next_ucd;
555 const char *pad = " ";
556
557 while (*s != 0)
558 {
559 unsigned int offset = 0;
560 BOOL scriptx_not = FALSE;
561
562 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
563 *t = 0;
564 while (isspace(*s)) s++;
565
566 for (t = value; *s != 0 && !isspace(*s); s++)
567 {
568 if (*s != '_' && *s != '-') *t++ = *s;
569 }
570 *t = 0;
571 while (isspace(*s)) s++;
572
573 if (strcmp(CS name, "script") == 0 ||
574 strcmp(CS name, "scriptx") == 0)
575 {
576 for (t = value; *t != 0; t++) *t = tolower(*t);
577
578 if (value[0] == '!')
579 {
580 if (name[6] == 'x') scriptx_not = TRUE;
581 else script_not = TRUE;
582 offset = 1;
583 }
584
585 for (i = 0; i < PRIV(utt_size); i++)
586 {
587 const ucp_type_table *u = PRIV(utt) + i;
588 if ((u->type == PT_SCX || u->type == PT_SC) && strcmp(CS(value + offset),
589 PRIV(utt_names) + u->name_offset) == 0)
590 {
591 c = u->value;
592 if (name[6] == 'x')
593 {
594 scriptx_list[scriptx_count++] = scriptx_not? (-c):c;
595 }
596 else
597 {
598 if (script < 0) script = c; else
599 {
600 printf("** Only 1 script value allowed\n");
601 return;
602 }
603 }
604 break;
605 }
606 }
607
608 if (i >= PRIV(utt_size))
609 {
610 printf("** Unrecognized script name \"%s\"\n", value);
611 return;
612 }
613 }
614
615 else if (strcmp(CS name, "bool") == 0)
616 {
617 int not = 1;
618 if (value[0] == '!')
619 {
620 not = -1;
621 offset = 1;
622 }
623
624 for (i = 0; i < PRIV(utt_size); i++)
625 {
626 const ucp_type_table *u = PRIV(utt) + i;
627 if (u->type == PT_BOOL && strcmp(CS(value + offset),
628 PRIV(utt_names) + u->name_offset) == 0)
629 {
630 bprop_list[bprop_count++] = u->value * not;
631 break;
632 }
633 }
634
635 if (i >= PRIV(utt_size))
636 {
637 printf("** Unrecognized property name \"%s\"\n", value);
638 return;
639 }
640 }
641
642 else if (strcmp(CS name, "type") == 0)
643 {
644 if (type >= 0)
645 {
646 printf("** Only 1 type value allowed\n");
647 return;
648 }
649 else
650 {
651 if (value[0] == '!')
652 {
653 type_not = TRUE;
654 offset = 1;
655 }
656
657 for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
658 {
659 if (strcmp(CS (value + offset), CS type_names[i]) == 0)
660 {
661 type = i/2;
662 break;
663 }
664 }
665 if (i >= sizeof(type_names)/sizeof(char *))
666 {
667 printf("** Unrecognized type name \"%s\"\n", value);
668 return;
669 }
670 }
671 }
672
673 else if (strcmp(CS name, "gbreak") == 0)
674 {
675 if (gbreak >= 0)
676 {
677 printf("** Only 1 grapheme break value allowed\n");
678 return;
679 }
680 else
681 {
682 if (value[0] == '!')
683 {
684 gbreak_not = TRUE;
685 offset = 1;
686 }
687
688 for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
689 {
690 if (strcmp(CS (value + offset), CS gb_names[i]) == 0)
691 {
692 gbreak = i/2;
693 break;
694 }
695 }
696 if (i >= sizeof(gb_names)/sizeof(char *))
697 {
698 printf("** Unrecognized gbreak name \"%s\"\n", value);
699 return;
700 }
701 }
702 }
703
704 else if (strcmp(CS name, "bidi") == 0 ||
705 strcmp(CS name, "bidiclass") == 0 ||
706 strcmp(CS name, "bidi_class") == 0 )
707 {
708 if (bidiclass >= 0)
709 {
710 printf("** Only 1 bidi class value allowed\n");
711 return;
712 }
713 else
714 {
715 if (value[0] == '!')
716 {
717 bidiclass_not = TRUE;
718 offset = 1;
719 }
720 for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
721 {
722 if (strcasecmp(CS (value + offset), CS bd_names[i]) == 0)
723 {
724 bidiclass = i/2;
725 break;
726 }
727 }
728 if (i >= sizeof(bd_names)/sizeof(char *))
729 {
730 printf("** Unrecognized bidi class name \"%s\"\n", value);
731 return;
732 }
733 }
734 }
735
736 else
737 {
738 printf("** Unrecognized property name \"%s\"\n", name);
739 return;
740 }
741 }
742
743 if (script < 0 && scriptx_count == 0 && bprop_count == 0 && type < 0 &&
744 gbreak < 0 && bidiclass < 0)
745 {
746 printf("** No properties specified\n");
747 return;
748 }
749
750 for (c = 0; c <= 0x10ffff; c++)
751 {
752 if (script >= 0 && (script == UCD_SCRIPT(c)) == script_not) continue;
753
754 if (scriptx_count > 0)
755 {
756 const uint32_t *bits_scriptx = PRIV(ucd_script_sets) + UCD_SCRIPTX(c);
757 unsigned int found = 0;
758
759 for (i = 0; i < scriptx_count; i++)
760 {
761 int x = scriptx_list[i]/32;
762 int y = scriptx_list[i]%32;
763
764 /* Positive requirment */
765 if (scriptx_list[i] >= 0)
766 {
767 if ((bits_scriptx[x] & (1u<<y)) != 0) found++;
768 }
769 /* Negative requirement */
770 else
771 {
772 if ((bits_scriptx[x] & (1u<<y)) == 0) found++;
773 }
774 }
775
776 if (found != scriptx_count) continue;
777 }
778
779 if (bprop_count > 0)
780 {
781 const uint32_t *bits_bprop = PRIV(ucd_boolprop_sets) +
782 UCD_BPROPS(c) * ucd_boolprop_sets_item_size;
783 unsigned int found = 0;
784
785 for (i = 0; i < bprop_count; i++)
786 {
787 int x = bprop_list[i]/32;
788 int y = bprop_list[i]%32;
789
790 /* Positive requirement */
791 if (bprop_list[i] >= 0)
792 {
793 if ((bits_bprop[x] & (1u<<y)) != 0) found++;
794 }
795 /* Negative requirement */
796 else
797 {
798 if ((bits_bprop[-x] & (1u<<(-y))) == 0) found++;
799 }
800 }
801
802 if (found != bprop_count) continue;
803 }
804
805 if (type >= 0)
806 {
807 if (type_not)
808 {
809 if (type == UCD_CHARTYPE(c)) continue;
810 }
811 else
812 {
813 if (type != UCD_CHARTYPE(c)) continue;
814 }
815 }
816
817 if (gbreak >= 0)
818 {
819 if (gbreak_not)
820 {
821 if (gbreak == UCD_GRAPHBREAK(c)) continue;
822 }
823 else
824 {
825 if (gbreak != UCD_GRAPHBREAK(c)) continue;
826 }
827 }
828
829 if (bidiclass >= 0)
830 {
831 if (bidiclass_not)
832 {
833 if (bidiclass == UCD_BIDICLASS(c)) continue;
834 }
835 else
836 {
837 if (bidiclass != UCD_BIDICLASS(c)) continue;
838 }
839 }
840
841 /* All conditions are met. Look for runs. */
842
843 ucd = GET_UCD(c);
844
845 for (i = c + 1; i < 0x10ffff; i++)
846 {
847 next_ucd = GET_UCD(i);
848 if (memcmp(ucd, next_ucd, sizeof(ucd_record)) != 0) break;
849 }
850
851 if (--i > c)
852 {
853 printf("U+%04X..", c);
854 c = i;
855 hadrange = TRUE;
856 }
857 else if (hadrange) printf("%s", pad);
858
859 print_prop(c, FALSE);
860 if (c >= 0x100000) pad = " ";
861 else if (c >= 0x10000) pad = " ";
862 count++;
863 if (count >= 100)
864 {
865 printf("...\n");
866 break;
867 }
868 }
869
870 if (count == 0) printf("No characters found\n");
871 }
872
873
874 /*************************************************
875 * Process command line *
876 *************************************************/
877
878 static void
process_command_line(unsigned char * buffer)879 process_command_line(unsigned char *buffer)
880 {
881 unsigned char *s, *t;
882 unsigned char name[24];
883
884 s = buffer;
885 while (isspace(*s)) s++;
886 if (*s == 0) return;
887
888 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
889 *t = 0;
890 while (isspace(*s)) s++;
891
892 if (strcmp(CS name, "findprop") == 0)
893 {
894 while (*s != 0)
895 {
896 unsigned int c;
897 unsigned char *endptr;
898 t = s;
899
900 if (*t == '+')
901 {
902 c = *(++t);
903 if (c > 0x7fu)
904 {
905 GETCHARINC(c, t);
906 }
907 endptr = t+1;
908 }
909 else
910 {
911 if (strncmp(CS t, "U+", 2) == 0) t += 2;
912 c = strtoul(CS t, CSS(&endptr), 16);
913 }
914
915 if (*endptr != 0 && !isspace(*endptr))
916 {
917 while (*endptr != 0 && !isspace(*endptr)) endptr++;
918 printf("** Invalid character specifier: ignored \"%.*s\"\n", (int)(endptr-s), s);
919 }
920 else
921 {
922 if (c > 0x10ffff)
923 printf("** U+%x is too big for a Unicode code point\n", c);
924 else
925 print_prop(c, TRUE);
926 }
927 s = endptr;
928 while (isspace(*s)) s++;
929 }
930 }
931
932 else if (strcmp(CS name, "find") == 0)
933 {
934 find_chars(s);
935 }
936
937 else if (strcmp(CS name, "list") == 0)
938 {
939 while (*s != 0)
940 {
941 size_t i;
942 for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
943 *t = 0;
944 while (isspace(*s)) s++;
945
946 if (strcmp(CS name, "script") == 0 || strcmp(CS name, "scripts") == 0)
947 {
948 for (i = 0; i < PRIV(utt_size); i++)
949 if (PRIV(utt)[i].type == PT_SCX || PRIV(utt)[i].type == PT_SC)
950 printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
951 }
952
953 else if (strcmp(CS name, "bool") == 0)
954 {
955 for (i = 0; i < PRIV(utt_size); i++)
956 if (PRIV(utt)[i].type == PT_BOOL)
957 printf("%s\n", PRIV(utt_names) + PRIV(utt)[i].name_offset);
958 }
959
960 else if (strcmp(CS name, "type") == 0 || strcmp(CS name, "types") == 0)
961 {
962 for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
963 printf("%s %s\n", type_names[i], type_names[i+1]);
964 }
965
966 else if (strcmp(CS name, "gbreak") == 0 || strcmp(CS name, "gbreaks") == 0)
967 {
968 for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
969 {
970 if (gb_names[i+1][0] != 0)
971 printf("%-3s (%s)\n", gb_names[i], gb_names[i+1]);
972 else
973 printf("%s\n", gb_names[i]);
974 }
975 }
976
977 else if (strcmp(CS name, "bidi") == 0 ||
978 strcmp(CS name, "bidiclasses") == 0)
979 {
980 for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
981 printf("%3s %s\n", bd_names[i], bd_names[i+1]);
982 }
983
984 else
985 {
986 printf("** Unknown property \"%s\"\n", name);
987 break;
988 }
989 }
990 }
991
992 else printf("** Unknown test command \"%s\"\n", name);
993 }
994
995
996
997 /*************************************************
998 * Main program *
999 *************************************************/
1000
1001 int
main(int argc,char ** argv)1002 main(int argc, char **argv)
1003 {
1004 BOOL interactive;
1005 int first_arg = 1;
1006 unsigned char buffer[1024];
1007
1008 if (argc > 1 && strcmp(argv[1], "-s") == 0)
1009 {
1010 show_character = TRUE;
1011 first_arg++;
1012 }
1013
1014 if (argc > first_arg)
1015 {
1016 int i;
1017 BOOL datafirst = TRUE;
1018 char *arg = argv[first_arg];
1019 unsigned char *s = buffer;
1020
1021 if (*arg != '+' && strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
1022 {
1023 while (*arg != 0)
1024 {
1025 if (!isxdigit(*arg++)) { datafirst = FALSE; break; }
1026 }
1027 }
1028
1029 if (datafirst)
1030 {
1031 strcpy(CS s, "findprop ");
1032 s += 9;
1033 }
1034
1035 for (i = first_arg; i < argc; i++)
1036 {
1037 s += sprintf(CS s, "%s ", argv[i]);
1038 }
1039
1040 process_command_line(buffer);
1041 return 0;
1042 }
1043
1044 interactive = is_stdin_tty();
1045
1046 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1047 if (interactive) using_history();
1048 #endif
1049
1050 for(;;)
1051 {
1052 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1053 if (interactive)
1054 {
1055 size_t len;
1056 unsigned char *s = US readline("> ");
1057 if (s == NULL) break;
1058 len = strlen(CS s);
1059 if (len > 0) add_history(CS s);
1060 memcpy(buffer, s, len);
1061 buffer[len] = '\n';
1062 buffer[len+1] = 0;
1063 free(s);
1064 }
1065 else
1066 #endif
1067
1068 {
1069 if (interactive) printf("> ");
1070 if (fgets(CS buffer, sizeof(buffer), stdin) == NULL) break;
1071 if (!interactive) printf("%s", buffer);
1072 }
1073
1074 process_command_line(buffer);
1075 }
1076
1077 if (interactive) printf("\n");
1078
1079 #if defined(SUPPORT_LIBREADLINE) || defined(SUPPORT_LIBEDIT)
1080 if (interactive) clear_history();
1081 #endif
1082
1083 return 0;
1084 }
1085
1086 /* End */
1087