1 #include <stdio.h>
2 #include <stdlib.h>
3 #include <string.h>
4 #include <getopt.h>
5
6 #define PACKAGE "wgram"
7 #define VERSION "0.0.4"
8 #define MAXLINE 1024
9 #define MAXGRAM 32
10
11 /* status epilepticus .. print help */
12 void print_help(int exval);
13
main(int argc,char * argv[])14 int main (int argc, char *argv[]) {
15 /* word delimeter for strtok() */
16 char delim[] = ".,:;`/\"+-_(){}[]<>*&^%$#@!?~/|\\=1234567890 \t\n";
17 char line[MAXLINE]; /* input buff, fgets() */
18 char *stray = NULL; /* returned value by strtok() */
19 char **strarray = NULL; /* array to hold all entrys */
20 int i = 0; /* general counter */
21 int strcount = 0; /* number of entrys in pointer array */
22 int N = 3, pos = 0; /* ngram size, 3 in this case */
23 int opt = 0; /* holds command line opt nr.. */
24 int word_flag = 0; /* print only the `raw' words */
25 FILE *fp = stdin; /* read input from `FILE', default is stdin */
26
27 while((opt = getopt(argc, argv, "hvn:wf:")) != -1) {
28 switch(opt) {
29 case 'h':
30 print_help(0);
31 break;
32 case 'v':
33 exit(0);
34 break;
35 case 'n':
36 N = atoi(optarg);
37 if(N > MAXGRAM || N < 2) {
38 fprintf(stderr, "%s: Error - Ngram length `%d' out of range `0-%d'\n",
39 PACKAGE, N, MAXGRAM);
40 return 1;
41 }
42 break;
43 case 'w':
44 word_flag = 1;
45 break;
46 case 'f':
47 if(freopen(optarg, "r", fp) == NULL) {
48 fprintf(stderr, "%s: Error - opening `%s'\n", PACKAGE, optarg);
49 return 1;
50 }
51 break;
52 case '?':
53 fprintf(stderr, "%s: Error - No such option: `%c'\n\n", PACKAGE, optopt);
54 print_help(1);
55 } /* switch */
56 } /* while */
57
58 /* start reading lines from file pointer, add all entrys to **strarray */
59 while((fgets(line, MAXLINE, fp)) != NULL) {
60 if(strlen(line) < 2)
61 continue;
62
63 stray = strtok(line, delim);
64 while(stray != NULL) {
65 strarray = (char **)realloc(strarray, (strcount + 1) * sizeof(char *));
66 strarray[strcount++] = strdup(stray);
67 stray = strtok(NULL, delim);
68 }
69 }
70
71 if(word_flag == 0) {
72 /*
73 // print the array of strings, jumping back each time
74 // (N - 1) positions if a whole ngram of words has been printed
75 */
76 for(i = 0, pos = N; i < strcount; i++, pos--) {
77 if(pos == 0) pos = N, i -= (N - 1), printf("\n");
78 printf("%s ", strarray[i]);
79 }
80 printf("\n");
81 } else {
82 /* print raw words */
83 for(i = 0; i < strcount; i++)
84 printf("%s\n", strarray[i]);
85 }
86
87 /* free the string array */
88 for(i = 0; i < strcount; i++)
89 free(strarray[i]);
90
91 free(strarray);
92 return 0;
93 }
94
95 /* status epilepticus .. print help */
print_help(int exval)96 void print_help(int exval) {
97 printf("%s,%s extract N-grams from text data\n", PACKAGE, VERSION);
98 printf("Usage: %s [-h] [-v] [-n INT] [-w] [-f FILE]\n\n", PACKAGE);
99
100 printf(" -h print this help and exit\n");
101 printf(" -v print version and exit\n\n");
102
103 printf(" -n INT set ngram length (default=3)\n");
104 printf(" -w print only the extracted words\n");
105 printf(" -f FILE read input from `FILE' (default=stdin)\n\n");
106 exit(exval);
107 }
108