• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <cstddef>
2 #include <cstdio>
3 #include <cstring>
4 #include <fstream>
5 #include <vector>
6 
7 #include "./deorummolae.h"
8 #include "./durchschlag.h"
9 #include "./sieve.h"
10 
11 #define METHOD_DM 0
12 #define METHOD_SIEVE 1
13 #define METHOD_DURCHSCHLAG 2
14 #define METHOD_DISTILL 3
15 #define METHOD_PURIFY 4
16 
/* Parses a positive decimal number with an optional k/K (x1024) or m/M
   (x1048576) suffix, e.g. "16k" -> 16384.

   Returns 0 (which callers treat as "invalid") when |str|:
    - is empty or starts with '0';
    - contains any character other than digits and one trailing suffix;
    - has more than 12 digits;
    - denotes a value that does not fit in size_t. */
static size_t readInt(const char* str) {
  const size_t kMax = static_cast<size_t>(-1);
  size_t result = 0;
  if (str[0] == 0 || str[0] == '0') {
    return 0;
  }
  /* At most 13 characters are inspected: 12 digits plus a suffix. */
  for (size_t i = 0; i < 13; ++i) {
    const char c = str[i];
    if (c == 0) {
      return result;
    }
    const bool isKilo = (c == 'k' || c == 'K');
    const bool isMega = (c == 'm' || c == 'M');
    if (isKilo || isMega) {
      const int shift = isKilo ? 10 : 20;
      /* Suffix must be last, must follow at least one digit, and the scaled
         value must be representable (exact check, no reliance on wrapping). */
      if (str[i + 1] != 0 || result == 0 || result > (kMax >> shift)) {
        return 0;
      }
      return result << shift;
    }
    if (c < '0' || c > '9') {
      return 0;
    }
    const size_t digit = static_cast<size_t>(c - '0');
    /* Reject before multiplying so that overflow never happens. */
    if (result > (kMax - digit) / 10) {
      return 0;
    }
    result = (10 * result) + digit;
  }
  return 0;
}
49 
/* Reads the whole file at |path| and returns its bytes.

   The stream is opened in binary mode so that the bytes match what
   writeFile / writeSamples later emit (no newline translation, no early
   end-of-file on control characters).
   If the file cannot be opened the result is an empty string. */
static std::string readFile(const std::string& path) {
  std::ifstream file(path, std::ifstream::binary);
  std::string content(
      (std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>());
  return content;
}
56 
/* Stores the bytes of |content| into |file| (binary mode), replacing any
   previous contents. I/O failures are not reported. */
static void writeFile(const char* file, const std::string& content) {
  const std::streamsize byteCount =
      static_cast<std::streamsize>(content.size());
  std::ofstream sink(file, std::ios::out | std::ios::binary);
  sink.write(content.data(), byteCount);
  sink.close();
}
62 
/* Splits |data| back into consecutive samples and writes each to its file.

   Sample k occupies sizes[k] bytes of |data|, directly after sample k - 1,
   and is written (binary mode) to the path argv[pathArgs[k]].
   I/O failures are not reported. */
static void writeSamples(char const* argv[], const std::vector<int>& pathArgs,
    const std::vector<size_t>& sizes, const uint8_t* data) {
  const uint8_t* cursor = data;
  size_t sampleIndex = 0;
  for (int argIndex : pathArgs) {
    const size_t length = sizes[sampleIndex];
    std::ofstream sink(argv[argIndex], std::ios::out | std::ios::binary);
    sink.write(reinterpret_cast<const char*>(cursor),
        static_cast<std::streamsize>(length));
    sink.close();
    cursor += length;
    ++sampleIndex;
  }
}
77 
/* Strips any directory prefix from |path|: the result points just past the
   last '/' or '\' in |path|, or at |path| itself when there is no separator.
   The returned pointer aliases |path|; nothing is copied. */
static const char* fileName(const char* path) {
  const char* tail = path;
  for (const char* cursor = path; *cursor != 0; ++cursor) {
    if (*cursor == '/' || *cursor == '\\') {
      tail = cursor + 1;
    }
  }
  return tail;
}
86 
/* Writes the usage line (with |name| as the program name) and the list of
   supported options to stderr. */
static void printHelp(const char* name) {
  static const char kOptionSummary[] =
      "Options:\n"
      "  --dm       use 'deorummolae' engine\n"
      "  --distill  rewrite samples; unique text parts are removed\n"
      "  --dsh      use 'durchschlag' engine (default)\n"
      "  --purify   rewrite samples; unique text parts are zeroed out\n"
      "  --sieve    use 'sieve' engine\n"
      "  -b#        set block length for 'durchschlag'; default: 1024\n"
      "  -s#        set slice length for 'distill', 'durchschlag', 'purify'\n"
      "             and 'sieve'; default: 16\n"
      "  -t#        set target dictionary size (limit); default: 16K\n"
      "  -u#        set minimum slice population (for rewrites); default: 2\n"
      "# is a decimal number with optional k/K/m/M suffix.\n"
      "WARNING: 'distill' and 'purify' will overwrite original samples!\n"
      "         Completely unique samples might become empty files.\n\n";
  fprintf(stderr, "Usage: %s [OPTION]... DICTIONARY [SAMPLE]...\n", name);
  /* The summary holds no conversion specifiers, so it is emitted verbatim. */
  fputs(kOptionSummary, stderr);
}
105 
main(int argc,char const * argv[])106 int main(int argc, char const* argv[]) {
107   int dictionaryArg = -1;
108   int method = METHOD_DURCHSCHLAG;
109   size_t sliceLen = 16;
110   size_t targetSize = 16 << 10;
111   size_t blockSize = 1024;
112   size_t minimumPopulation = 2;
113 
114   std::vector<uint8_t> data;
115   std::vector<size_t> sizes;
116   std::vector<int> pathArgs;
117   size_t total = 0;
118   for (int i = 1; i < argc; ++i) {
119     if (argv[i] == nullptr) {
120       continue;
121     }
122     if (argv[i][0] == '-') {
123       if (argv[i][1] == '-') {
124         if (dictionaryArg != -1) {
125           fprintf(stderr,
126               "Method should be specified before dictionary / sample '%s'\n",
127               argv[i]);
128           exit(1);
129         }
130         if (std::strcmp("--sieve", argv[i]) == 0) {
131           method = METHOD_SIEVE;
132           continue;
133         }
134         if (std::strcmp("--dm", argv[i]) == 0) {
135           method = METHOD_DM;
136           continue;
137         }
138         if (std::strcmp("--dsh", argv[i]) == 0) {
139           method = METHOD_DURCHSCHLAG;
140           continue;
141         }
142         if (std::strcmp("--distill", argv[i]) == 0) {
143           method = METHOD_DISTILL;
144           continue;
145         }
146         if (std::strcmp("--purify", argv[i]) == 0) {
147           method = METHOD_PURIFY;
148           continue;
149         }
150         printHelp(fileName(argv[0]));
151         fprintf(stderr, "Invalid option '%s'\n", argv[i]);
152         exit(1);
153       }
154       if (argv[i][1] == 'b') {
155         blockSize = readInt(&argv[i][2]);
156         if (blockSize < 16 || blockSize > 65536) {
157           printHelp(fileName(argv[0]));
158           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
159           exit(1);
160         }
161       } else if (argv[i][1] == 's') {
162         sliceLen = readInt(&argv[i][2]);
163         if (sliceLen < 4 || sliceLen > 256) {
164           printHelp(fileName(argv[0]));
165           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
166           exit(1);
167         }
168       } else if (argv[i][1] == 't') {
169         targetSize = readInt(&argv[i][2]);
170         if (targetSize < 256 || targetSize > (1 << 25)) {
171           printHelp(fileName(argv[0]));
172           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
173           exit(1);
174         }
175       } else if (argv[i][1] == 'u') {
176         minimumPopulation = readInt(&argv[i][2]);
177         if (minimumPopulation < 256 || minimumPopulation > 65536) {
178           printHelp(fileName(argv[0]));
179           fprintf(stderr, "Invalid option '%s'\n", argv[i]);
180           exit(1);
181         }
182       } else {
183         printHelp(fileName(argv[0]));
184         fprintf(stderr, "Unrecognized option '%s'\n", argv[i]);
185         exit(1);
186       }
187       continue;
188     }
189     if (dictionaryArg == -1) {
190       if (method != METHOD_DISTILL && method != METHOD_PURIFY) {
191         dictionaryArg = i;
192         continue;
193       }
194     }
195     std::string content = readFile(argv[i]);
196     data.insert(data.end(), content.begin(), content.end());
197     total += content.size();
198     pathArgs.push_back(i);
199     sizes.push_back(content.size());
200   }
201   bool wantDictionary = (dictionaryArg == -1);
202   if (method == METHOD_DISTILL || method == METHOD_PURIFY) {
203     wantDictionary = false;
204   }
205   if (wantDictionary || total == 0) {
206     printHelp(fileName(argv[0]));
207     fprintf(stderr, "Not enough arguments\n");
208     exit(1);
209   }
210 
211   if (method == METHOD_SIEVE) {
212     writeFile(argv[dictionaryArg], sieve_generate(
213         targetSize, sliceLen, sizes, data.data()));
214   } else if (method == METHOD_DM) {
215     writeFile(argv[dictionaryArg], DM_generate(
216         targetSize, sizes, data.data()));
217   } else if (method == METHOD_DURCHSCHLAG) {
218     writeFile(argv[dictionaryArg], durchschlag_generate(
219         targetSize, sliceLen, blockSize, sizes, data.data()));
220   } else if (method == METHOD_DISTILL) {
221     durchschlag_distill(sliceLen, minimumPopulation, &sizes, data.data());
222     writeSamples(argv, pathArgs, sizes, data.data());
223   } else if (method == METHOD_PURIFY) {
224     durchschlag_purify(sliceLen, minimumPopulation, sizes, data.data());
225     writeSamples(argv, pathArgs, sizes, data.data());
226   } else {
227     printHelp(fileName(argv[0]));
228     fprintf(stderr, "Unknown generator\n");
229     exit(1);
230   }
231   return 0;
232 }
233