1 // © 2017 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 *
6 * Copyright (C) 2003, International Business Machines
7 * Corporation and others. All Rights Reserved.
8 *
9 *******************************************************************************
10 * file name: ucdmerge.c
11 * encoding: US-ASCII
12 * tab size: 8 (not used)
13 * indentation:4
14 *
15 * created on: 2003feb20
16 * created by: Markus W. Scherer
17 *
18 * Simple tool for Unicode Character Database files with semicolon-delimited fields.
19 * Merges adjacent, identical per-code point data lines into one line with range syntax.
20 *
21 * To compile, just call a C compiler/linker with this source file.
22 * On Windows: cl ucdmerge.c
23 */
24
25 #include <stdio.h>
26 #include <string.h>
27 #include <stdlib.h>
28
/*
 * Skip leading blanks.
 * Returns a pointer to the first character of s that is neither a space
 * nor a tab; this may be the terminating NUL.
 */
static const char *
skipWhitespace(const char *s) {
    /* strspn() counts the initial run of characters taken from " \t" */
    return s+strspn(s, " \t");
}
36
/*
 * Find where the data portion of a line ends.
 * If the line contains a '#' comment, the result points just past the last
 * character before the comment that is not a space or tab (never before l).
 * Otherwise the result points to the terminating NUL.
 */
static char *
endOfData(const char *l) {
    char *stop;

    stop=strchr(l, '#');
    if(stop==NULL) {
        /* no comment: the data runs to the end of the string */
        return strchr(l, 0);
    }

    /* back up over the blanks that separate the data from the comment */
    while(stop!=l && (stop[-1]==' ' || stop[-1]=='\t')) {
        --stop;
    }
    return stop;
}
54
/*
 * Compare the data portions of two lines.
 * The data portion starts after the first semicolon and ends where
 * endOfData() says it does (before a '#' comment and the blanks preceding it,
 * or at the end of the string).
 *
 * Returns non-zero if both data portions are byte-for-byte identical,
 * 0 otherwise. A line without a semicolon has no data portion and never
 * compares equal.
 */
static int
sameData(const char *l1, const char *l2) {
    const char *end1, *end2;
    size_t length1, length2;

    /* find the first semicolon in each line */
    l1=strchr(l1, ';');
    l2=strchr(l2, ';');
    if(l1==NULL || l2==NULL) {
        /* do not do pointer arithmetic on NULL */
        return 0;
    }
    ++l1;
    ++l2;

    /* find the end of data: end of string or start of comment */
    end1=endOfData(l1);
    end2=endOfData(l2);

    /* compare the line data portions; endOfData() never returns less than its argument */
    length1=(size_t)(end1-l1);
    length2=(size_t)(end2-l2);
    return length1==length2 && 0==memcmp(l1, l2, length1);
}
72
73 extern int
main(int argc,const char * argv[])74 main(int argc, const char *argv[]) {
75 static char line[2000], firstLine[2000], lastLine[2000];
76 char *end;
77 long first, last, c;
78 int finished;
79
80 first=last=-1;
81 finished=0;
82
83 for(;;) {
84 if(gets(line)!=NULL) {
85 /* parse the initial code point, if any */
86 c=strtol(line, &end, 16);
87 if(end!=line && *skipWhitespace(end)==';') {
88 /* single code point followed by semicolon and data, keep c */
89 } else {
90 c=-1;
91 }
92 } else {
93 line[0]=0;
94 c=-1;
95 finished=1;
96 }
97
98 if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) {
99 /* output the current range */
100 if(first==last) {
101 /* there was no range, just output the one line we found */
102 puts(firstLine);
103 } else {
104 /* there was a real range, merge their lines */
105 end=strchr(lastLine, '#');
106 if(end==NULL) {
107 /* no comment in second line */
108 printf("%04lX..%04lX%s\n",
109 first, last, /* code point range */
110 strchr(firstLine, ';'));/* first line starting from the first ; */
111 } else if(strchr(firstLine, '#')==NULL) {
112 /* no comment in first line */
113 printf("%04lX..%04lX%s%s\n",
114 first, last, /* code point range */
115 strchr(firstLine, ';'), /* first line starting from the first ; */
116 end); /* comment from second line */
117 } else {
118 /* merge comments from both lines */
119 printf("%04lX..%04lX%s..%s\n",
120 first, last, /* code point range */
121 strchr(firstLine, ';'), /* first line starting from the first ; */
122 skipWhitespace(end+1)); /* comment from second line, after # and spaces */
123 }
124 }
125 first=last=-1;
126 }
127
128 if(c<0) {
129 if(finished) {
130 break;
131 }
132
133 /* no data on this line, output as is */
134 puts(line);
135 } else {
136 /* data on this line, store for possible range compaction */
137 if(last<0) {
138 /* set as the first line in a possible range */
139 first=last=c;
140 strcpy(firstLine, line);
141 lastLine[0]=0;
142 } else /* must be c==(last+1) && sameData() because of previous conditions */ {
143 /* continue with the current range */
144 last=c;
145 strcpy(lastLine, line);
146 }
147 }
148 }
149
150 return 0;
151 }
152