1 // Copyright (c) 2001-2010 Hartmut Kaiser
2 //
3 // Distributed under the Boost Software License, Version 1.0. (See accompanying
4 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
5
6 // This example shows how to create a simple lexer recognizing a couple of
7 // different tokens aimed at a simple language and how to use this lexer with
8 // a grammar. It shows how to associate attributes to tokens and how to access the
9 // token attributes from inside the grammar.
10 //
11 // Additionally, this example demonstrates, how to define a token set usable
12 // as the skip parser during parsing, allowing to define several tokens to be
13 // ignored.
14 //
15 // The example demonstrates how to use the add(...)(...) syntax to associate
16 // token definitions with the lexer and how token ids can be used in the
17 // parser to refer to a token, without having to directly reference its
18 // definition.
19 //
20 // This example recognizes a very simple programming language having
21 // assignment statements and if and while control structures. Look at the file
22 // example6.input for an example.
23 //
24 // This example is essentially identical to example4.cpp. The only difference
25 // is that we use the self.add() syntax to define tokens and to associate them
26 // with the lexer.
27
28 #include <boost/config/warning_disable.hpp>
29 #include <boost/spirit/include/qi.hpp>
30 #include <boost/spirit/include/lex_lexertl.hpp>
31 #include <boost/spirit/include/phoenix_operator.hpp>
32
33 #include <iostream>
34 #include <fstream>
35 #include <string>
36
37 #include "example.hpp"
38
39 using namespace boost::spirit;
40 using boost::phoenix::val;
41
42 ///////////////////////////////////////////////////////////////////////////////
43 // Token id definitions
44 ///////////////////////////////////////////////////////////////////////////////
45 enum token_ids
46 {
47 ID_CONSTANT = 1000,
48 ID_IF,
49 ID_ELSE,
50 ID_WHILE,
51 ID_IDENTIFIER
52 };
53
54 ///////////////////////////////////////////////////////////////////////////////
55 // Token definitions
56 ///////////////////////////////////////////////////////////////////////////////
template <typename Lexer>
struct example6_tokens : lex::lexer<Lexer>
{
    // Registers all tokens of the toy language with the lexer. Tokens added
    // via self.add(...) carry the explicit ids from the token_ids enum above,
    // which lets the grammar refer to them by id (token(ID_IF) etc.) without
    // needing access to the token_def instances themselves.
    example6_tokens()
    {
        // define the tokens to match
        identifier = "[a-zA-Z_][a-zA-Z0-9_]*";
        constant = "[0-9]+";

        // associate the tokens and the token set with the lexer; these
        // single-character literal tokens get automatically assigned ids
        this->self = lex::token_def<>('(') | ')' | '{' | '}' | '=' | ';';

        // Token definitions can be added by using some special syntactic
        // construct as shown below.
        // Note, that the token definitions added this way expose the iterator
        // pair pointing to the matched input stream as their attribute.
        // NOTE(review): the keyword strings "if"/"else"/"while" are added
        // ahead of 'identifier', which per lexertl's first-added-wins matching
        // lets the keywords take precedence over the identifier pattern.
        this->self.add
            (constant, ID_CONSTANT)
            ("if", ID_IF)
            ("else", ID_ELSE)
            ("while", ID_WHILE)
            (identifier, ID_IDENTIFIER)
        ;

        // define the whitespace to ignore (spaces, tabs, newlines and C-style
        // comments) and add those to another lexer state (here: "WS");
        // the parser switches into this state while skipping (see main()).
        this->self("WS")
            =   lex::token_def<>("[ \\t\\n]+")
            |   "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"
            ;
    }

    // The following two tokens have an associated attribute type, identifier
    // carries a string (the identifier name) and constant carries the matched
    // integer value.
    //
    // Note: any token attribute type explicitly specified in a token_def<>
    //       declaration needs to be listed during token type definition as
    //       well (see the typedef for the token_type in main() below).
    //
    // The conversion of the matched input to an instance of this type occurs
    // once (on first access), which makes token attributes as efficient as
    // possible. Moreover, token instances are constructed once by the lexer
    // library. From this point on tokens are passed by reference only,
    // avoiding them being copied around.
    lex::token_def<std::string> identifier;
    lex::token_def<unsigned int> constant;
};
105
106 ///////////////////////////////////////////////////////////////////////////////
107 // Grammar definition
108 ///////////////////////////////////////////////////////////////////////////////
template <typename Iterator, typename Lexer>
struct example6_grammar
  : qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
{
    // Builds the grammar; 'tok' exposes the token definitions declared in
    // example6_tokens (identifier, constant) so rules can use them directly.
    template <typename TokenDef>
    example6_grammar(TokenDef const& tok)
      : example6_grammar::base_type(program)
    {
        using boost::spirit::_val;

        // a program is a non-empty sequence of blocks
        program
            =  +block
            ;

        // a block is a brace-delimited (possibly empty) statement list
        block
            =  '{' >> *statement >> '}'
            ;

        statement
            =  assignment
            |  if_stmt
            |  while_stmt
            ;

        // the phoenix action prints the identifier on the left-hand side
        // (_1 is the attribute of tok.identifier, a std::string)
        assignment
            =   (tok.identifier >> '=' >> expression >> ';')
                [
                    std::cout << val("assignment statement to: ")
                              << _1 << "\n"
                ]
            ;

        // token(ID_IF)/token(ID_ELSE) match by token id, so the keyword
        // token_defs need not be visible here; _2 is the condition expression
        if_stmt
            =   (   token(ID_IF) >> '(' >> expression >> ')' >> block
                    >> -(token(ID_ELSE) >> block)
                )
                [
                    std::cout << val("if expression: ")
                              << _2 << "\n"
                ]
            ;

        while_stmt
            =   (token(ID_WHILE) >> '(' >> expression >> ')' >> block)
                [
                    std::cout << val("while expression: ")
                              << _2 << "\n"
                ]
            ;

        // since expression has a variant return type accommodating for
        // std::string and unsigned integer, both possible values may be
        // returned to the calling rule
        expression
            =   tok.identifier [ _val = _1 ]
            |   tok.constant [ _val = _1 ]
            ;
    }

    // synthesized attribute of the 'expression' rule: either the matched
    // constant (unsigned int) or the matched identifier name (std::string)
    typedef boost::variant<unsigned int, std::string> expression_type;

    // all rules share the in_state_skipper so whitespace tokens from the
    // "WS" lexer state are skipped between tokens
    qi::rule<Iterator, qi::in_state_skipper<Lexer> > program, block, statement;
    qi::rule<Iterator, qi::in_state_skipper<Lexer> > assignment, if_stmt;
    qi::rule<Iterator, qi::in_state_skipper<Lexer> > while_stmt;

    // the expression is the only rule having a return value
    qi::rule<Iterator, expression_type(), qi::in_state_skipper<Lexer> > expression;
};
177
178 ///////////////////////////////////////////////////////////////////////////////
main()179 int main()
180 {
181 // iterator type used to expose the underlying input stream
182 typedef std::string::iterator base_iterator_type;
183
184 // This is the lexer token type to use. The second template parameter lists
185 // all attribute types used for token_def's during token definition (see
186 // calculator_tokens<> above). Here we use the predefined lexertl token
187 // type, but any compatible token type may be used instead.
188 //
189 // If you don't list any token attribute types in the following declaration
190 // (or just use the default token type: lexertl_token<base_iterator_type>)
191 // it will compile and work just fine, just a bit less efficient. This is
192 // because the token attribute will be generated from the matched input
193 // sequence every time it is requested. But as soon as you specify at
194 // least one token attribute type you'll have to list all attribute types
195 // used for token_def<> declarations in the token definition class above,
196 // otherwise compilation errors will occur.
197 typedef lex::lexertl::token<
198 base_iterator_type, boost::mpl::vector<unsigned int, std::string>
199 > token_type;
200
201 // Here we use the lexertl based lexer engine.
202 typedef lex::lexertl::lexer<token_type> lexer_type;
203
204 // This is the token definition type (derived from the given lexer type).
205 typedef example6_tokens<lexer_type> example6_tokens;
206
207 // this is the iterator type exposed by the lexer
208 typedef example6_tokens::iterator_type iterator_type;
209
210 // this is the type of the grammar to parse
211 typedef example6_grammar<iterator_type, example6_tokens::lexer_def> example6_grammar;
212
213 // now we use the types defined above to create the lexer and grammar
214 // object instances needed to invoke the parsing process
215 example6_tokens tokens; // Our lexer
216 example6_grammar calc(tokens); // Our parser
217
218 std::string str (read_from_file("example6.input"));
219
220 // At this point we generate the iterator pair used to expose the
221 // tokenized input stream.
222 std::string::iterator it = str.begin();
223 iterator_type iter = tokens.begin(it, str.end());
224 iterator_type end = tokens.end();
225
226 // Parsing is done based on the token stream, not the character
227 // stream read from the input.
228 // Note how we use the lexer defined above as the skip parser. It must
229 // be explicitly wrapped inside a state directive, switching the lexer
230 // state for the duration of skipping whitespace.
231 std::string ws("WS");
232 bool r = qi::phrase_parse(iter, end, calc, qi::in_state(ws)[tokens.self]);
233
234 if (r && iter == end)
235 {
236 std::cout << "-------------------------\n";
237 std::cout << "Parsing succeeded\n";
238 std::cout << "-------------------------\n";
239 }
240 else
241 {
242 std::cout << "-------------------------\n";
243 std::cout << "Parsing failed\n";
244 std::cout << "-------------------------\n";
245 }
246
247 std::cout << "Bye... :-) \n\n";
248 return 0;
249 }
250