// © 2017 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
*
*   Copyright (C) 2003, International Business Machines
*   Corporation and others.  All Rights Reserved.
*
*******************************************************************************
*   file name:  ucdmerge.c
*   encoding:   US-ASCII
*   tab size:   8 (not used)
*   indentation:4
*
*   created on: 2003feb20
*   created by: Markus W. Scherer
*
*   Simple tool for Unicode Character Database files with semicolon-delimited fields.
*   Merges adjacent, identical per-code point data lines into one line with range syntax.
*
*   To compile, just call a C compiler/linker with this source file.
*   On Windows: cl ucdmerge.c
*/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * Skip leading horizontal whitespace.
 * s must point to a NUL-terminated string.
 * Returns a pointer to the first character at or after s that is neither
 * a space nor a tab; this may be the terminating NUL.
 */
static const char *
skipWhitespace(const char *s) {
    while(*s==' ' || *s=='\t') {
        ++s;
    }
    return s;
}

/*
 * Return the first character position after the end of the data in l.
 *
 * The data ends at the first '#' (the start of a comment), not counting any
 * spaces or tabs immediately before it. If there is no '#', the data ends at
 * the terminating NUL. The result never points before l.
 *
 * l must be a NUL-terminated string. The returned pointer points into l;
 * it is non-const only for the convenience of callers and the function
 * itself never writes through it.
 */
static char *
endOfData(const char *l) {
    char *end;
    char c;

    /* explicit cast: the search result deliberately drops the const of l */
    end=(char *)strchr(l, '#');
    if(end!=NULL) {
        /* ignore whitespace before the comment, but do not back up past l */
        while(l!=end && ((c=*(end-1))==' ' || c=='\t')) {
            --end;
        }
    } else {
        /* no comment: the data runs to the end of the string */
        end=(char *)strchr(l, 0);
    }
    return end;
}

/*
 * Compare the data portions of two lines.
 *
 * The data portion of a line starts right after its first semicolon and
 * ends where endOfData() says it ends (before a '#' comment and the
 * whitespace preceding it, or at the end of the string).
 *
 * Returns 1 if both lines have a semicolon and their data portions are
 * byte-for-byte identical, otherwise 0. A line without a semicolon has no
 * data portion and never compares equal.
 */
static int
sameData(const char *l1, const char *l2) {
    const char *data1, *data2;
    const char *end1, *end2;
    size_t length1, length2;

    /* find the first semicolon in each line */
    data1=strchr(l1, ';');
    data2=strchr(l2, ';');
    if(data1==NULL || data2==NULL) {
        /* avoid pointer arithmetic on NULL when a line has no field separator */
        return 0;
    }
    ++data1;
    ++data2;

    /* find the end of data: end of string or start of comment */
    end1=endOfData(data1);
    end2=endOfData(data2);

    /* compare the line data portions; endOfData() never returns less than its argument */
    length1=(size_t)(end1-data1);
    length2=(size_t)(end2-data2);
    return length1==length2 && 0==memcmp(data1, data2, length1);
}

73 extern int
main(int argc,const char * argv[])74 main(int argc, const char *argv[]) {
75     static char line[2000], firstLine[2000], lastLine[2000];
76     char *end;
77     long first, last, c;
78     int finished;
79 
80     first=last=-1;
81     finished=0;
82 
83     for(;;) {
84         if(gets(line)!=NULL) {
85             /* parse the initial code point, if any */
86             c=strtol(line, &end, 16);
87             if(end!=line && *skipWhitespace(end)==';') {
88                 /* single code point followed by semicolon and data, keep c */
89             } else {
90                 c=-1;
91             }
92         } else {
93             line[0]=0;
94             c=-1;
95             finished=1;
96         }
97 
98         if(last>=0 && (c!=(last+1) || !sameData(firstLine, line))) {
99             /* output the current range */
100             if(first==last) {
101                 /* there was no range, just output the one line we found */
102                 puts(firstLine);
103             } else {
104                 /* there was a real range, merge their lines */
105                 end=strchr(lastLine, '#');
106                 if(end==NULL) {
107                     /* no comment in second line */
108                     printf("%04lX..%04lX%s\n",
109                             first, last,            /* code point range */
110                             strchr(firstLine, ';'));/* first line starting from the first ; */
111                 } else if(strchr(firstLine, '#')==NULL) {
112                     /* no comment in first line */
113                     printf("%04lX..%04lX%s%s\n",
114                             first, last,            /* code point range */
115                             strchr(firstLine, ';'), /* first line starting from the first ; */
116                             end);                   /* comment from second line */
117                 } else {
118                     /* merge comments from both lines */
119                     printf("%04lX..%04lX%s..%s\n",
120                             first, last,            /* code point range */
121                             strchr(firstLine, ';'), /* first line starting from the first ; */
122                             skipWhitespace(end+1)); /* comment from second line, after # and spaces */
123                 }
124             }
125             first=last=-1;
126         }
127 
128         if(c<0) {
129             if(finished) {
130                 break;
131             }
132 
133             /* no data on this line, output as is */
134             puts(line);
135         } else {
136             /* data on this line, store for possible range compaction */
137             if(last<0) {
138                 /* set as the first line in a possible range */
139                 first=last=c;
140                 strcpy(firstLine, line);
141                 lastLine[0]=0;
142             } else /* must be c==(last+1) && sameData() because of previous conditions */ {
143                 /* continue with the current range */
144                 last=c;
145                 strcpy(lastLine, line);
146             }
147         }
148     }
149 
150     return 0;
151 }
152