• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //
2 // A utility for finding substring embeddings in patterns
3 
4 #include <stdio.h>
5 #include <string.h>
6 #include <stdlib.h>
7 
8 #define MAXPATHS (256*1024)
9 
10 //
11 //
die(const char * msg)12 static void die(
13   const char*msg
14 ) {
15   fprintf(stderr,"%s\n",msg);
16   exit(1);
17 }
18 
19 
20 // Finds the index of an entry, only used on xxx_key arrays
21 // Caveat: the table has to be sorted
find_in(char * tab[],int max,const char * pat)22 static int find_in(
23   char *tab[],
24   int max,
25   const char *pat
26 ) {
27   int left=0, right=max-1;
28   while (left <= right) {
29     int mid = ((right-left)/2)+left;
30     int v   = strcmp(pat,tab[mid]);
31     if (v>0) {
32       left = mid + 1;
33     } else if (v<0) {
34       right = mid -1;
35     } else {
36       return mid;
37     }
38   }
39   return -1;
40 }
41 
42 
43 // used by partition (which is used by qsort_arr)
44 //
swap2(char * a[],char * b[],int i,int j)45 static void swap2(
46   char *a[],
47   char *b[],
48   int i,
49   int j
50 ) {
51   if (i==j) return;
52   char*v;
53   v=a[i]; a[i]=a[j]; a[j]=v;
54   v=b[i]; b[i]=b[j]; b[j]=v;
55 }
56 
57 
58 // used by qsort_arr
59 //
partition(char * a[],char * b[],int left,int right,int p)60 static int partition(
61   char *a[],
62   char *b[],
63   int left,
64   int right,
65   int p
66 ) {
67   const char *pivotValue = a[p];
68   int i;
69   swap2(a,b,p,right); // Move pivot to end
70   p = left;
71   for (i=left; i<right; i++) {
72     if (strcmp(a[i],pivotValue)<=0) {
73       swap2(a,b,p,i);
74       p++;
75     }
76   }
77   swap2(a,b,right,p); // Move pivot to its final place
78   return p;
79 }
80 
81 
82 //
83 //
qsort_arr(char * a[],char * b[],int left,int right)84 static void qsort_arr(
85   char *a[],
86   char *b[],
87   int left,
88   int right
89 ) {
90   while (right > left) {
91     int p = left + (right-left)/2; //select a pivot
92     p = partition(a,b, left, right, p);
93     if ((p-1) - left < right - (p+1)) {
94       qsort_arr(a,b, left, p-1);
95       left  = p+1;
96     } else {
97       qsort_arr(a,b, p+1, right);
98       right = p-1;
99     }
100   }
101 }
102 
103 
104 // Removes extra '0' entries from the string
105 //
compact(char * expr)106 static char* compact(
107   char *expr
108 ) {
109   int l=strlen(expr);
110   int i,j;
111   for (i=0,j=0; i<l; i++) {
112     if (expr[i]!='0') expr[j++] = expr[i];
113   }
114   expr[j]=0;
115   return expr;
116 }
117 
118 
119 // convert 'n1im' to 0n1i0m0 expressed as a string
120 //
expand(char * expr,const char * pat,int l)121 static void expand(
122   char *expr,
123   const char *pat,
124   int l
125 ) {
126   int  el = 0;
127   char last = '.';
128   int  i;
129   for (i=0; i<l; i++) {
130     char c = pat[i];
131     if ( (last<'0' || last>'9')
132       && (c   <'0' || c   >'9')
133       ) {
134       expr[el++] = '0';
135     }
136     expr[el++] = c;
137     last = c;
138   }
139   if (last<'0' || last>'9') expr[el++] = '0';
140   expr[el]=0;
141 }
142 
143 
144 // Combine two patterns, i.e. .ad4der + a2d becomes .a2d4der
145 // The second pattern needs to be a right side match of the first
146 // (modulo digits)
combine(char * expr,const char * subexpr)147 static char *combine(
148   char *expr,
149   const char *subexpr
150 ) {
151   int l1 = strlen(expr);
152   int l2 = strlen(subexpr);
153   int off = l1-l2;
154   int j;
155   // this works also for utf8 sequences because the substring is identical
156   // to the last substring-length bytes of expr except for the (single byte)
157   // hyphenation encoders
158   for (j=0; j<l2; j++) {
159     if (subexpr[j]>expr[off+j]) {
160       expr[off+j] = subexpr[j];
161     }
162   }
163   return expr;
164 }
165 
166 
167 //
168 //
main(int argc,const char * argv[])169 int main(int argc, const char* argv[]) {
170   FILE *in, *out;
171   char *pattab_key[MAXPATHS];
172   char *pattab_val[MAXPATHS];
173   int   patterns = 0;
174   char *newpattab_key[MAXPATHS];
175   char *newpattab_val[MAXPATHS];
176   int   newpatterns = 0;
177   char format[132]; // 64+65+newline+zero+spare
178   int p;
179   if (argc!=3) die("Usage: <orig-file> <new-file>\n");
180   if ((in = fopen(argv[1],"r"))==NULL) die("Could not read input");
181   if ((out = fopen(argv[2],"w"))==NULL) die("Could not create output");
182   // read all patterns and split in pure text (_key) & expanded patterns (_val)
183   while(fgets(format,132,in)) {
184     int l = strlen(format);
185     if (format[l-1]=='\n') { l--; format[l]=0; } // Chomp
186     if (format[0]=='%' || format[0]==0) {
187       // skip
188     } else {
189       if (format[l-1]=='%') {
190         l--;
191         format[l] = 0; // remove '%'
192       }
193       int i,j;
194       char *pat = (char*) malloc(l+1);
195       char *org = (char*) malloc(l*2+1);
196       expand(org,format,l);
197       // remove hyphenation encoders (digits) from pat
198       for (i=0,j=0; i<l; i++) {
199         // odd, but utf-8 proof
200         char c = format[i];
201         if (c<'0' || c>'9') pat[j++]=c;
202       }
203       pat[j]=0;
204       p = patterns;
205       pattab_key[patterns]   = pat;
206       pattab_val[patterns++] = org;
207       if (patterns>MAXPATHS) die("to many base patterns");
208     }
209   }
210   fclose(in);
211   // As we use binairy search, make sure it is sorted
212   qsort_arr(pattab_key,pattab_val,0,patterns-1);
213 
214   for (p=0; p<patterns; p++) {
215     char *pat = pattab_key[p];
216     int   patsize = strlen(pat);
217     int   j,l;
218     for (l=1; l<=patsize; l++) {
219       for (j=1; j<=l; j++) {
220         int i = l-j;
221         int  subpat_ndx;
222         char subpat[132];
223         strncpy(subpat,pat+i,j); subpat[j]=0;
224         if ((subpat_ndx = find_in(pattab_key,patterns,subpat))>=0) {
225           int   newpat_ndx;
226           char *newpat=malloc(l+1);
227       //printf("%s is embedded in %s\n",pattab_val[subpat_ndx],pattab_val[p]);
228           strncpy(newpat, pat+0,l); newpat[l]=0;
229           if ((newpat_ndx = find_in(newpattab_key,newpatterns,newpat))<0) {
230             char *neworg = malloc(132); // TODO: compute exact length
231             expand(neworg,newpat,l);
232             newpattab_key[newpatterns]   = newpat;
233             newpattab_val[newpatterns++] = combine(neworg,pattab_val[subpat_ndx]);
234             if (newpatterns>MAXPATHS) die("to many new patterns");
235     //printf("%*.*s|%*.*s[%s] (%s|%s) = %s\n",i,i,pat,j,j,pat+i,pat+i+j,pattab_val[p],pattab_val[subpat_ndx],neworg);
236           } else {
237             free(newpat);
238             newpattab_val[newpat_ndx] = combine(
239               newpattab_val[newpat_ndx], pattab_val[subpat_ndx] );
240           }
241         }
242       }
243     }
244   }
245 
246   /* for some tiny extra speed, one could forget the free()s
247    * as the memory is freed anyway on exit().
248    * However, the gain is minimal and now the code can be cleanly
249    * incorporated into other code */
250   for (p=0; p<newpatterns; p++) {
251     fprintf(out,"%s\n",compact(newpattab_val[p]));
252     free(newpattab_key[p]);
253     free(newpattab_val[p]);
254   }
255   fclose(out);
256 
257   for (p=0; p<patterns; p++) {
258     free(pattab_key[p]);
259     free(pattab_val[p]);
260   }
261   return 0;
262 }
263