1 //
2 // A utility for finding substring embeddings in patterns
3
4 #include <stdio.h>
5 #include <string.h>
6 #include <stdlib.h>
7
8 #define MAXPATHS (256*1024)
9
10 //
11 //
die(const char * msg)12 static void die(
13 const char*msg
14 ) {
15 fprintf(stderr,"%s\n",msg);
16 exit(1);
17 }
18
19
20 // Finds the index of an entry, only used on xxx_key arrays
21 // Caveat: the table has to be sorted
find_in(char * tab[],int max,const char * pat)22 static int find_in(
23 char *tab[],
24 int max,
25 const char *pat
26 ) {
27 int left=0, right=max-1;
28 while (left <= right) {
29 int mid = ((right-left)/2)+left;
30 int v = strcmp(pat,tab[mid]);
31 if (v>0) {
32 left = mid + 1;
33 } else if (v<0) {
34 right = mid -1;
35 } else {
36 return mid;
37 }
38 }
39 return -1;
40 }
41
42
43 // used by partition (which is used by qsort_arr)
44 //
swap2(char * a[],char * b[],int i,int j)45 static void swap2(
46 char *a[],
47 char *b[],
48 int i,
49 int j
50 ) {
51 if (i==j) return;
52 char*v;
53 v=a[i]; a[i]=a[j]; a[j]=v;
54 v=b[i]; b[i]=b[j]; b[j]=v;
55 }
56
57
58 // used by qsort_arr
59 //
partition(char * a[],char * b[],int left,int right,int p)60 static int partition(
61 char *a[],
62 char *b[],
63 int left,
64 int right,
65 int p
66 ) {
67 const char *pivotValue = a[p];
68 int i;
69 swap2(a,b,p,right); // Move pivot to end
70 p = left;
71 for (i=left; i<right; i++) {
72 if (strcmp(a[i],pivotValue)<=0) {
73 swap2(a,b,p,i);
74 p++;
75 }
76 }
77 swap2(a,b,right,p); // Move pivot to its final place
78 return p;
79 }
80
81
82 //
83 //
qsort_arr(char * a[],char * b[],int left,int right)84 static void qsort_arr(
85 char *a[],
86 char *b[],
87 int left,
88 int right
89 ) {
90 while (right > left) {
91 int p = left + (right-left)/2; //select a pivot
92 p = partition(a,b, left, right, p);
93 if ((p-1) - left < right - (p+1)) {
94 qsort_arr(a,b, left, p-1);
95 left = p+1;
96 } else {
97 qsort_arr(a,b, p+1, right);
98 right = p-1;
99 }
100 }
101 }
102
103
104 // Removes extra '0' entries from the string
105 //
compact(char * expr)106 static char* compact(
107 char *expr
108 ) {
109 int l=strlen(expr);
110 int i,j;
111 for (i=0,j=0; i<l; i++) {
112 if (expr[i]!='0') expr[j++] = expr[i];
113 }
114 expr[j]=0;
115 return expr;
116 }
117
118
119 // convert 'n1im' to 0n1i0m0 expressed as a string
120 //
expand(char * expr,const char * pat,int l)121 static void expand(
122 char *expr,
123 const char *pat,
124 int l
125 ) {
126 int el = 0;
127 char last = '.';
128 int i;
129 for (i=0; i<l; i++) {
130 char c = pat[i];
131 if ( (last<'0' || last>'9')
132 && (c <'0' || c >'9')
133 ) {
134 expr[el++] = '0';
135 }
136 expr[el++] = c;
137 last = c;
138 }
139 if (last<'0' || last>'9') expr[el++] = '0';
140 expr[el]=0;
141 }
142
143
144 // Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
145 // The second pattern needs to be a right side match of the first
146 // (modulo digits)
combine(char * expr,const char * subexpr)147 static char *combine(
148 char *expr,
149 const char *subexpr
150 ) {
151 int l1 = strlen(expr);
152 int l2 = strlen(subexpr);
153 int off = l1-l2;
154 int j;
155 // this works also for utf8 sequences because the substring is identical
156 // to the last substring-length bytes of expr except for the (single byte)
157 // hyphenation encoders
158 for (j=0; j<l2; j++) {
159 if (subexpr[j]>expr[off+j]) {
160 expr[off+j] = subexpr[j];
161 }
162 }
163 return expr;
164 }
165
166
167 //
168 //
main(int argc,const char * argv[])169 int main(int argc, const char* argv[]) {
170 FILE *in, *out;
171 char *pattab_key[MAXPATHS];
172 char *pattab_val[MAXPATHS];
173 int patterns = 0;
174 char *newpattab_key[MAXPATHS];
175 char *newpattab_val[MAXPATHS];
176 int newpatterns = 0;
177 char format[132]; // 64+65+newline+zero+spare
178 int p;
179 if (argc!=3) die("Usage: <orig-file> <new-file>\n");
180 if ((in = fopen(argv[1],"r"))==NULL) die("Could not read input");
181 if ((out = fopen(argv[2],"w"))==NULL) die("Could not create output");
182 // read all patterns and split in pure text (_key) & expanded patterns (_val)
183 while(fgets(format,132,in)) {
184 int l = strlen(format);
185 if (format[l-1]=='\n') { l--; format[l]=0; } // Chomp
186 if (format[0]=='%' || format[0]==0) {
187 // skip
188 } else {
189 if (format[l-1]=='%') {
190 l--;
191 format[l] = 0; // remove '%'
192 }
193 int i,j;
194 char *pat = (char*) malloc(l+1);
195 char *org = (char*) malloc(l*2+1);
196 expand(org,format,l);
197 // remove hyphenation encoders (digits) from pat
198 for (i=0,j=0; i<l; i++) {
199 // odd, but utf-8 proof
200 char c = format[i];
201 if (c<'0' || c>'9') pat[j++]=c;
202 }
203 pat[j]=0;
204 p = patterns;
205 pattab_key[patterns] = pat;
206 pattab_val[patterns++] = org;
207 if (patterns>MAXPATHS) die("to many base patterns");
208 }
209 }
210 fclose(in);
211 // As we use binairy search, make sure it is sorted
212 qsort_arr(pattab_key,pattab_val,0,patterns-1);
213
214 for (p=0; p<patterns; p++) {
215 char *pat = pattab_key[p];
216 int patsize = strlen(pat);
217 int j,l;
218 for (l=1; l<=patsize; l++) {
219 for (j=1; j<=l; j++) {
220 int i = l-j;
221 int subpat_ndx;
222 char subpat[132];
223 strncpy(subpat,pat+i,j); subpat[j]=0;
224 if ((subpat_ndx = find_in(pattab_key,patterns,subpat))>=0) {
225 int newpat_ndx;
226 char *newpat=malloc(l+1);
227 //printf("%s is embedded in %s\n",pattab_val[subpat_ndx],pattab_val[p]);
228 strncpy(newpat, pat+0,l); newpat[l]=0;
229 if ((newpat_ndx = find_in(newpattab_key,newpatterns,newpat))<0) {
230 char *neworg = malloc(132); // TODO: compute exact length
231 expand(neworg,newpat,l);
232 newpattab_key[newpatterns] = newpat;
233 newpattab_val[newpatterns++] = combine(neworg,pattab_val[subpat_ndx]);
234 if (newpatterns>MAXPATHS) die("to many new patterns");
235 //printf("%*.*s|%*.*s[%s] (%s|%s) = %s\n",i,i,pat,j,j,pat+i,pat+i+j,pattab_val[p],pattab_val[subpat_ndx],neworg);
236 } else {
237 free(newpat);
238 newpattab_val[newpat_ndx] = combine(
239 newpattab_val[newpat_ndx], pattab_val[subpat_ndx] );
240 }
241 }
242 }
243 }
244 }
245
246 /* for some tiny extra speed, one could forget the free()s
247 * as the memory is freed anyway on exit().
248 * However, the gain is minimal and now the code can be cleanly
249 * incorporated into other code */
250 for (p=0; p<newpatterns; p++) {
251 fprintf(out,"%s\n",compact(newpattab_val[p]));
252 free(newpattab_key[p]);
253 free(newpattab_val[p]);
254 }
255 fclose(out);
256
257 for (p=0; p<patterns; p++) {
258 free(pattab_key[p]);
259 free(pattab_val[p]);
260 }
261 return 0;
262 }
263