//  Copyright (c) 2001-2010 Hartmut Kaiser
//
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

//  This example is equivalent to the following flex program:
/*
//[wcf_flex_version
    %{
        #define ID_WORD 1000
        #define ID_EOL  1001
        #define ID_CHAR 1002
        int c = 0, w = 0, l = 0;
    %}
    %%
    [^ \t\n]+  { return ID_WORD; }
    \n         { return ID_EOL; }
    .          { return ID_CHAR; }
    %%
    bool count(int tok)
    {
        switch (tok) {
        case ID_WORD: ++w; c += yyleng; break;
        case ID_EOL:  ++l; ++c; break;
        case ID_CHAR: ++c; break;
        default:
            return false;
        }
        return true;
    }
    int main()
    {
        int tok = EOF;
        do {
            tok = yylex();
            if (!count(tok))
                break;
        } while (EOF != tok);
        printf("%d %d %d\n", l, w, c);
        return 0;
    }
//]
*/
//  Its purpose is to replicate the word-count functionality of the UNIX wc
//  command: it prints the number of lines, words, and characters in a file.
//
//  This example shows how to use the tokenize() function together with a
//  simple functor, which gets executed whenever a token is matched in the
//  input sequence.
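
//  lex::tokenize() (used in main() below) drives the generated lexer over the
//  input range and invokes the supplied functor once for every matched token;
//  returning false from the functor stops the tokenization early.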

// #define BOOST_SPIRIT_LEXERTL_DEBUG

#include <boost/config/warning_disable.hpp>
//[wcf_includes
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/bind/bind.hpp>
#include <boost/ref.hpp>
//]

#include <iostream>
#include <string>

#include "example.hpp"

//[wcf_namespaces
namespace lex = boost::spirit::lex;
//]

///////////////////////////////////////////////////////////////////////////////
//  Token id definitions
///////////////////////////////////////////////////////////////////////////////
//[wcf_token_ids
enum token_ids
{
    ID_WORD = 1000,
    ID_EOL,
    ID_CHAR
};
//]
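
// The explicit ids (1000 and up) let the counter functor below tell the token
// types apart; if no id is given, Spirit.Lex numbers the tokens automatically,
// starting at lex::min_token_id.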

//[wcf_token_definition
/*` The template `word_count_tokens` defines three different tokens:
    `ID_WORD`, `ID_EOL`, and `ID_CHAR`, representing a word (anything except
    whitespace or a newline), a newline character, and any other character
    (`ID_WORD`, `ID_EOL`, and `ID_CHAR` are enum values representing the token
    ids, but could be anything else convertible to an integer as well).
    The direct base class of any token definition class needs to be the
    template `lex::lexer<>`, where the corresponding template parameter (here:
    `lex::lexertl::lexer<BaseIterator>`) defines which underlying lexer engine
    is to be used to provide the required state machine functionality. In this
    example we use the Lexertl-based lexer engine as the underlying lexer type.
*/
template <typename Lexer>
struct word_count_tokens : lex::lexer<Lexer>
{
    word_count_tokens()
    {
        // define tokens (the regular expression to match and the corresponding
        // token id) and add them to the lexer
        this->self.add
            ("[^ \t\n]+", ID_WORD) // words (anything except ' ', '\t' or '\n')
            ("\n", ID_EOL)         // newline characters
            (".", ID_CHAR)         // anything else is a plain character
        ;
    }
};
//]
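
// The same patterns could also be bound to named `lex::token_def<>` members
// instead of raw strings, which is useful once token definitions need to be
// referenced elsewhere. A minimal sketch (an assumption based on the
// lex::token_def<> API, not part of the original example):
//
//     template <typename Lexer>
//     struct word_count_tokens : lex::lexer<Lexer>
//     {
//         word_count_tokens()
//           : word("[^ \t\n]+", ID_WORD)   // pattern and id, as above
//         {
//             this->self.add(word)("\n", ID_EOL)(".", ID_CHAR);
//         }
//         lex::token_def<std::string> word;
//     };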

//[wcf_functor
/*` In this example the struct `counter` is used as a functor that counts the
    characters, words, and lines in the analyzed input sequence by identifying
    the matched tokens as passed from the /Spirit.Lex/ library.
*/
struct counter
{
//<- this is an implementation detail specific to boost::bind and doesn't show
//   up in the documentation
    typedef bool result_type;
//->
    // the function operator gets called for each of the matched tokens
    // c, w, and l are references to the counters used to keep track of the totals
    template <typename Token>
    bool operator()(Token const& t, std::size_t& c, std::size_t& w, std::size_t& l) const
    {
        switch (t.id()) {
        case ID_WORD:       // matched a word
        // since we're using a default token type in this example, every
        // token instance contains an `iterator_range<BaseIterator>` as its
        // token attribute, pointing to the matched character sequence in the input
            ++w; c += t.value().size();
            break;
        case ID_EOL:        // matched a newline character
            ++l; ++c;
            break;
        case ID_CHAR:       // matched something else
            ++c;
            break;
        }
        return true;        // always continue to tokenize
    }
};
//]
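
// Note: the `result_type` typedef above exists only so that boost::bind can
// deduce the functor's return type. Assuming a C++14 compiler, the bind
// expression in main() could be replaced by a generic lambda instead (a
// sketch, not part of the original example):
//
//     lex::tokenize(first, last, word_count_functor,
//         [&](auto const& t) { return counter()(t, c, w, l); });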

///////////////////////////////////////////////////////////////////////////////
//[wcf_main
/*` The main function simply loads the given file into memory (as a
    `std::string`), instantiates an instance of the token definition template
    using the correct iterator type (the default `lex::lexertl::lexer<>` is
    based on plain `char const*` iterators), and finally calls `lex::tokenize`,
    passing an instance of the counter function object. The return value of
    `lex::tokenize()` will be `true` if the whole input sequence was
    successfully tokenized, and `false` otherwise.
*/
int main(int argc, char* argv[])
{
    // these variables are used to count characters, words and lines
    std::size_t c = 0, w = 0, l = 0;

    // read input from the given file
    std::string str (read_from_file(1 == argc ? "word_count.input" : argv[1]));

    // create the token definition instance needed to invoke the lexical analyzer
    word_count_tokens<lex::lexertl::lexer<> > word_count_functor;

    // tokenize the given string, the bound functor gets invoked for each of
    // the matched tokens
    using boost::placeholders::_1;
    char const* first = str.c_str();
    char const* last = &first[str.size()];
    bool r = lex::tokenize(first, last, word_count_functor,
        boost::bind(counter(), _1, boost::ref(c), boost::ref(w), boost::ref(l)));

    // print results
    if (r) {
        std::cout << "lines: " << l << ", words: " << w
                  << ", characters: " << c << "\n";
    }
    else {
        std::string rest(first, last);
        std::cout << "Lexical analysis failed\n" << "stopped at: \""
                  << rest << "\"\n";
    }
    return 0;
}
//]
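
// A possible way to build and run this example (the Boost include path is an
// assumption; `word_count.input` is the default input file used above):
//
//     g++ -I /path/to/boost word_count_functor.cpp -o word_count_functor
//     ./word_count_functor word_count.input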