1 /*************************************************************************
2 *
3 * Copyright (C) 2016 and later: Unicode, Inc. and others.
4 * License & terms of use: http://www.unicode.org/copyright.html#License
5 *
6 **************************************************************************
7 **************************************************************************
8 *
9 * Copyright (C) 2002-2010, International Business Machines
10 * Corporation and others. All Rights Reserved.
11 *
12 ***************************************************************************
13 */
14
//
// ugrep - an ICU sample program illustrating the use of ICU Regular Expressions.
//
// The use of the ICU Regex API all occurs within the main()
// function. The rest of the code deals with opening files,
// encoding conversions, printing results, etc.
//
// This is not a full-featured grep program. The command line options
// have been kept to a minimum to avoid complicating the sample code.
//
25
26
27
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31
32 #include "unicode/utypes.h"
33 #include "unicode/ustring.h"
34 #include "unicode/regex.h"
35 #include "unicode/ucnv.h"
36 #include "unicode/uclean.h"
37
38
39 //
40 // The following variables contain paramters that may be set from the command line.
41 //
42 const char *pattern = NULL; // The regular expression
43 int firstFileNum; // argv index of the first file name
44 UBool displayFileName = FALSE;
45 UBool displayLineNum = FALSE;
46
47
48 //
49 // Info regarding the file currently being processed
50 //
51 const char *fileName;
52 int fileLen; // Length, in UTF-16 Code Units.
53
54 UChar *ucharBuf = 0; // Buffer, holds converted file. (Simple minded program, always reads
55 // the whole file at once.
56
57 char *charBuf = 0; // Buffer, for original, unconverted file data.
58
59
//
// State describing the line currently being processed.
// These are updated by nextLine() and read by main() and printMatch().
//
int lineStart = 0;   // Index in the file buffer of the first code unit of the current line.
int lineEnd = 0;     // Index of the code unit following the current line's new-line sequence.
int lineNum = 0;     // Number of the current line within the current file.
66
67 //
68 // Converter, used on output to convert Unicode data back to char *
69 // so that it will display in non-Unicode terminal windows.
70 //
71 UConverter *outConverter = 0;
72
//
// Forward declarations of the supporting functions defined below main().
//
void processOptions(int argc, const char **argv);   // parse the command line into the globals
void printUsage();                                   // print the command line help text
void readFile(const char *name);                     // load a file and convert it to UTF-16
void nextLine(int startPos);                         // locate the line that begins at startPos
void printMatch();                                   // print the current (matching) line
81
82
83
84 //------------------------------------------------------------------------------------------
85 //
86 // main for ugrep
87 //
88 // Structurally, all use of the ICU Regular Expression API is in main(),
89 // and all of the supporting stuff necessary to make a running program, but
90 // not directly related to regular expressions, is factored out into these other
91 // functions.
92 //
93 //------------------------------------------------------------------------------------------
main(int argc,const char ** argv)94 int main(int argc, const char** argv) {
95 UBool matchFound = FALSE;
96
97 //
98 // Process the commmand line options.
99 //
100 processOptions(argc, argv);
101
102 //
103 // Create a RegexPattern object from the user supplied pattern string.
104 //
105 UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure
106 // in a status variable.
107
108 UParseError parseErr; // In the event of a syntax error in the regex pattern,
109 // this struct will contain the position of the
110 // error.
111
112 RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status);
113 // Note that C++ is doing an automatic conversion
114 // of the (char *) pattern to a temporary
115 // UnicodeString object.
116 if (U_FAILURE(status)) {
117 fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n",
118 u_errorName(status), parseErr.offset);
119 exit(-1);
120 }
121
122 //
123 // Create a RegexMatcher from the newly created pattern.
124 //
125 UnicodeString empty;
126 RegexMatcher *matcher = rePat->matcher(empty, status);
127 if (U_FAILURE(status)) {
128 fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n",
129 u_errorName(status));
130 exit(-1);
131 }
132
133 //
134 // Loop, processing each of the input files.
135 //
136 for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
137 readFile(argv[fileNum]);
138
139 //
140 // Loop through the lines of a file, trying to match the regex pattern on each.
141 //
142 for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) {
143 UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart);
144 matcher->reset(s);
145 if (matcher->find()) {
146 matchFound = TRUE;
147 printMatch();
148 }
149 }
150 }
151
152 //
153 // Clean up
154 //
155 delete matcher;
156 delete rePat;
157 free(ucharBuf);
158 free(charBuf);
159 ucnv_close(outConverter);
160
161 u_cleanup(); // shut down ICU, release any cached data it owns.
162
163 return matchFound? 0: 1;
164 }
165
166
167
168 //------------------------------------------------------------------------------------------
169 //
170 // doOptions Run through the command line options, and set
171 // the global variables accordingly.
172 //
173 // exit without returning if an error occured and
174 // ugrep should not proceed further.
175 //
176 //------------------------------------------------------------------------------------------
processOptions(int argc,const char ** argv)177 void processOptions(int argc, const char **argv) {
178 int optInd;
179 UBool doUsage = FALSE;
180 UBool doVersion = FALSE;
181 const char *arg;
182
183
184 for(optInd = 1; optInd < argc; ++optInd) {
185 arg = argv[optInd];
186
187 /* version info */
188 if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) {
189 doVersion = TRUE;
190 }
191 /* usage info */
192 else if(strcmp(arg, "--help") == 0) {
193 doUsage = TRUE;
194 }
195 else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) {
196 displayLineNum = TRUE;
197 }
198 /* POSIX.1 says all arguments after -- are not options */
199 else if(strcmp(arg, "--") == 0) {
200 /* skip the -- */
201 ++optInd;
202 break;
203 }
204 /* unrecognized option */
205 else if(strncmp(arg, "-", strlen("-")) == 0) {
206 printf("ugrep: invalid option -- %s\n", arg+1);
207 doUsage = TRUE;
208 }
209 /* done with options */
210 else {
211 break;
212 }
213 }
214
215 if (doUsage) {
216 printUsage();
217 exit(0);
218 }
219
220 if (doVersion) {
221 printf("ugrep version 0.01\n");
222 if (optInd == argc) {
223 exit(0);
224 }
225 }
226
227 int remainingArgs = argc-optInd; // pattern file ...
228 if (remainingArgs < 2) {
229 fprintf(stderr, "ugrep: files or pattern are missing.\n");
230 printUsage();
231 exit(1);
232 }
233
234 if (remainingArgs > 2) {
235 // More than one file to be processed. Display file names with match output.
236 displayFileName = TRUE;
237 }
238
239 pattern = argv[optInd];
240 firstFileNum = optInd+1;
241 }
242
//------------------------------------------------------------------------------------------
//
// printUsage Print the command line help text on stdout.
//
// This function only prints; it returns to the caller, which
// decides the exit status. (Exiting from here with status 0
// would make a usage error look like success to the shell.)
//
//------------------------------------------------------------------------------------------
void printUsage() {
    printf("ugrep [options] pattern file...\n"
           " -V or --version display version information\n"
           " --help display this help and exit\n"
           " -- stop further option processing\n"
           "-n, --line-number Prefix each line of output with the line number within its input file.\n"
           );
}
257
258 //------------------------------------------------------------------------------------------
259 //
260 // readFile Read a file into memory, and convert it to Unicode.
261 //
262 // Since this is just a demo program, take the simple minded approach
263 // of always reading the whole file at once. No intelligent buffering
264 // is done.
265 //
266 //------------------------------------------------------------------------------------------
readFile(const char * name)267 void readFile(const char *name) {
268
269 //
270 // Initialize global file variables
271 //
272 fileName = name;
273 fileLen = 0; // zero length prevents processing in case of errors.
274
275
276 //
277 // Open the file and determine its size.
278 //
279 FILE *file = fopen(name, "rb");
280 if (file == 0 ) {
281 fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName);
282 return;
283 }
284 fseek(file, 0, SEEK_END);
285 int rawFileLen = ftell(file);
286 fseek(file, 0, SEEK_SET);
287
288
289 //
290 // Read in the file
291 //
292 charBuf = (char *)realloc(charBuf, rawFileLen+1); // Need error checking...
293 int t = fread(charBuf, 1, rawFileLen, file);
294 if (t != rawFileLen) {
295 fprintf(stderr, "Error reading file \"%s\"\n", fileName);
296 fclose(file);
297 return;
298 }
299 charBuf[rawFileLen]=0;
300 fclose(file);
301
302 //
303 // Look for a Unicode Signature (BOM) in the data
304 //
305 int32_t signatureLength;
306 const char * charDataStart = charBuf;
307 UErrorCode status = U_ZERO_ERROR;
308 const char* encoding = ucnv_detectUnicodeSignature(
309 charDataStart, rawFileLen, &signatureLength, &status);
310 if (U_FAILURE(status)) {
311 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
312 u_errorName(status));
313 return;
314 }
315 if(encoding!=NULL ){
316 charDataStart += signatureLength;
317 rawFileLen -= signatureLength;
318 }
319
320 //
321 // Open a converter to take the file to UTF-16
322 //
323 UConverter* conv;
324 conv = ucnv_open(encoding, &status);
325 if (U_FAILURE(status)) {
326 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status));
327 return;
328 }
329
330 //
331 // Convert the file data to UChar.
332 // Preflight first to determine required buffer size.
333 //
334 uint32_t destCap = ucnv_toUChars(conv,
335 NULL, // dest,
336 0, // destCapacity,
337 charDataStart,
338 rawFileLen,
339 &status);
340 if (status != U_BUFFER_OVERFLOW_ERROR) {
341 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
342 return;
343 };
344
345 status = U_ZERO_ERROR;
346 ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar));
347 ucnv_toUChars(conv,
348 ucharBuf, // dest,
349 destCap+1,
350 charDataStart,
351 rawFileLen,
352 &status);
353 if (U_FAILURE(status)) {
354 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
355 return;
356 };
357 ucnv_close(conv);
358
359 //
360 // Successful conversion. Set the global size variables so that
361 // the rest of the processing will proceed for this file.
362 //
363 fileLen = destCap;
364 }
365
366
367
368
369
370 //------------------------------------------------------------------------------------------
371 //
372 // nextLine Advance the line index variables, starting at the
373 // specified position in the input file buffer, by
374 // scanning forwrd until the next end-of-line.
375 //
376 // Need to take into account all of the possible Unicode
377 // line ending sequences.
378 //
379 //------------------------------------------------------------------------------------------
nextLine(int startPos)380 void nextLine(int startPos) {
381 if (startPos == 0) {
382 lineNum = 0;
383 } else {
384 lineNum++;
385 }
386 lineStart = lineEnd = startPos;
387
388 for (;;) {
389 if (lineEnd >= fileLen) {
390 return;
391 }
392 UChar c = ucharBuf[lineEnd];
393 lineEnd++;
394 if (c == 0x0a || // Line Feed
395 c == 0x0c || // Form Feed
396 c == 0x0d || // Carriage Return
397 c == 0x85 || // Next Line
398 c == 0x2028 || // Line Separator
399 c == 0x2029) // Paragraph separator
400 {
401 break;
402 }
403 }
404
405 // Check for CR/LF sequence, and advance over the LF if we're in the middle of one.
406 if (lineEnd < fileLen &&
407 ucharBuf[lineEnd-1] == 0x0d &&
408 ucharBuf[lineEnd] == 0x0a)
409 {
410 lineEnd++;
411 }
412 }
413
414
415 //------------------------------------------------------------------------------------------
416 //
417 // printMatch Called when a matching line has been located.
418 // Print out the line from the file with the match, after
419 // converting it back to the default code page.
420 //
421 //------------------------------------------------------------------------------------------
printMatch()422 void printMatch() {
423 char buf[2000];
424 UErrorCode status = U_ZERO_ERROR;
425
426 // If we haven't already created a converter for output, do it now.
427 if (outConverter == 0) {
428 outConverter = ucnv_open(NULL, &status);
429 if (U_FAILURE(status)) {
430 fprintf(stderr, "ugrep: Error opening default converter: \"%s\"\n",
431 u_errorName(status));
432 exit(-1);
433 }
434 };
435
436 // Convert the line to be printed back to the default 8 bit code page.
437 // If the line is too long for our buffer, just truncate it.
438 ucnv_fromUChars(outConverter,
439 buf, // destination buffer for conversion
440 sizeof(buf), // capacity of destination buffer
441 &ucharBuf[lineStart], // Input to conversion
442 lineEnd-lineStart, // number of UChars to convert
443 &status);
444 buf[sizeof(buf)-1] = 0; // Add null for use in case of too long lines.
445 // The converter null-terminates its output unless
446 // the buffer completely fills.
447
448 if (displayFileName) {
449 printf("%s:", fileName);
450 }
451 if (displayLineNum) {
452 printf("%d:", lineNum);
453 }
454 printf("%s", buf);
455 }
456
457