//  Copyright (c) 2001-2010 Hartmut Kaiser
//
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

//  This example shows how to create a simple lexer recognizing a couple of
//  different tokens aimed at a simple language and how to use this lexer with
//  a grammar. It shows how to associate attributes with tokens and how to
//  access the token attributes from inside the grammar.
//
//  Additionally, this example demonstrates how to define a token set usable
//  as the skip parser during parsing, allowing several tokens to be ignored.
//
//  The example demonstrates how to use the add(...)(...) syntax to associate
//  token definitions with the lexer and how token ids can be used in the
//  parser to refer to a token without having to directly reference its
//  definition.
//
//  This example recognizes a very simple programming language consisting of
//  assignment statements and if and while control structures. Look at the
//  file example6.input for an example.
//
//  This example is essentially identical to example4.cpp. The only difference
//  is that we use the self.add() syntax to define tokens and to associate
//  them with the lexer.

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>

#include <iostream>
#include <fstream>
#include <string>

#include "example.hpp"

using namespace boost::spirit;
using boost::phoenix::val;

///////////////////////////////////////////////////////////////////////////////
//  Token id definitions
///////////////////////////////////////////////////////////////////////////////
enum token_ids
{
    ID_CONSTANT = 1000,
    ID_IF,
    ID_ELSE,
    ID_WHILE,
    ID_IDENTIFIER
};
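
// Explicit ids like these just need to be unique within the lexer; token
// definitions added without an id are assigned one automatically by the
// library. Named ids let the grammar below refer to a token via
// token(ID_IF) etc. without naming its definition.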

///////////////////////////////////////////////////////////////////////////////
//  Token definitions
///////////////////////////////////////////////////////////////////////////////
template <typename Lexer>
struct example6_tokens : lex::lexer<Lexer>
{
    example6_tokens()
    {
        // define the tokens to match
        identifier = "[a-zA-Z_][a-zA-Z0-9_]*";
        constant = "[0-9]+";

        // associate the tokens and the token set with the lexer
        this->self = lex::token_def<>('(') | ')' | '{' | '}' | '=' | ';';

        // Token definitions can be added by using a special syntactic
        // construct, as shown below.
        // Note that the token definitions added this way expose the
        // iterator pair pointing to the matched input stream as their
        // attribute.
        this->self.add
            (constant, ID_CONSTANT)
            ("if", ID_IF)
            ("else", ID_ELSE)
            ("while", ID_WHILE)
            (identifier, ID_IDENTIFIER)
        ;
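
        // The chained add(...)(...) calls above are a convenience; a
        // sketch of the equivalent separate statements would be:
        //
        //     this->self.add(constant, ID_CONSTANT);
        //     this->self.add("if", ID_IF);
        //     // ...and so on for the remaining tokens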

        // define the whitespace to ignore (spaces, tabs, newlines and C-style
        // comments) and add those to another lexer state (here: "WS")
        this->self("WS")
            =   lex::token_def<>("[ \\t\\n]+")
            |   "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"
            ;
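
        // For example, in input such as "a = 1; /* note */" the comment
        // and the surrounding whitespace are matched in the "WS" state
        // and never reach the grammar; see the in_state("WS") skipper
        // passed to phrase_parse() in main().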
    }

    // The following two tokens have an associated attribute type:
    // identifier carries a string (the identifier name) and constant
    // carries the matched integer value.
    //
    // Note: any token attribute type explicitly specified in a token_def<>
    //       declaration needs to be listed during token type definition as
    //       well (see the typedef for the token_type below).
    //
    // The conversion of the matched input to an instance of this type occurs
    // once (on first access), which makes token attributes as efficient as
    // possible. Moreover, token instances are constructed once by the lexer
    // library. From this point on tokens are passed by reference only,
    // avoiding them being copied around.
    lex::token_def<std::string> identifier;
    lex::token_def<unsigned int> constant;
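
    // For example, when 'constant' matches the characters "42", accessing
    // its attribute (via _1 or _val in the grammar below) yields the
    // unsigned int 42 rather than the raw character range.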
};

///////////////////////////////////////////////////////////////////////////////
//  Grammar definition
///////////////////////////////////////////////////////////////////////////////
template <typename Iterator, typename Lexer>
struct example6_grammar
  : qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
{
    template <typename TokenDef>
    example6_grammar(TokenDef const& tok)
      : example6_grammar::base_type(program)
    {
        using boost::spirit::_val;

        program
            =  +block
            ;

        block
            =   '{' >> *statement >> '}'
            ;

        statement
            =   assignment
            |   if_stmt
            |   while_stmt
            ;

        assignment
            =   (tok.identifier >> '=' >> expression >> ';')
                [
                    std::cout << val("assignment statement to: ")
                              << _1 << "\n"
                ]
            ;
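
        // In the semantic actions here and below, _1, _2, ... refer to
        // the attributes of the sequence elements; character literals
        // such as '=' and ';' expose no attribute, which is why _2 in
        // if_stmt and while_stmt denotes the parsed expression.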

        if_stmt
            =   (   token(ID_IF) >> '(' >> expression >> ')' >> block
                >> -(token(ID_ELSE) >> block)
                )
                [
                    std::cout << val("if expression: ")
                              << _2 << "\n"
                ]
            ;

        while_stmt
            =   (token(ID_WHILE) >> '(' >> expression >> ')' >> block)
                [
                    std::cout << val("while expression: ")
                              << _2 << "\n"
                ]
            ;

        //  since expression has a variant return type accommodating both
        //  std::string and unsigned int, either value may be returned to
        //  the calling rule
        expression
            =   tok.identifier [ _val = _1 ]
            |   tok.constant   [ _val = _1 ]
            ;
    }

    typedef boost::variant<unsigned int, std::string> expression_type;
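
    // A sketch of how a caller might inspect such a variant value
    // (assuming 'expr' holds a parsed expression_type):
    //
    //     if (std::string const* name = boost::get<std::string>(&expr))
    //         std::cout << "identifier: " << *name << "\n";
    //     else
    //         std::cout << "constant: "
    //                   << boost::get<unsigned int>(expr) << "\n";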

    qi::rule<Iterator, qi::in_state_skipper<Lexer> > program, block, statement;
    qi::rule<Iterator, qi::in_state_skipper<Lexer> > assignment, if_stmt;
    qi::rule<Iterator, qi::in_state_skipper<Lexer> > while_stmt;

    //  the expression is the only rule having a return value
    qi::rule<Iterator, expression_type(), qi::in_state_skipper<Lexer> >  expression;
};

///////////////////////////////////////////////////////////////////////////////
int main()
{
    // iterator type used to expose the underlying input stream
    typedef std::string::iterator base_iterator_type;

    // This is the lexer token type to use. The second template parameter lists
    // all attribute types used for token_def's during token definition (see
    // example6_tokens<> above). Here we use the predefined lexertl token
    // type, but any compatible token type may be used instead.
    //
    // If you don't list any token attribute types in the following declaration
    // (or just use the default token type lex::lexertl::token<base_iterator_type>)
    // it will compile and work just fine, just a bit less efficiently. This is
    // because the token attribute will be generated from the matched input
    // sequence every time it is requested. But as soon as you specify at
    // least one token attribute type you'll have to list all attribute types
    // used for token_def<> declarations in the token definition class above,
    // otherwise compilation errors will occur.
    typedef lex::lexertl::token<
        base_iterator_type, boost::mpl::vector<unsigned int, std::string>
    > token_type;
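
    // The less efficient default mentioned above would simply be
    // (a sketch):
    //
    //     typedef lex::lexertl::token<base_iterator_type> token_type;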

    // Here we use the lexertl based lexer engine.
    typedef lex::lexertl::lexer<token_type> lexer_type;

    // This is the token definition type (derived from the given lexer type).
    typedef example6_tokens<lexer_type> example6_tokens;

    // this is the iterator type exposed by the lexer
    typedef example6_tokens::iterator_type iterator_type;

    // this is the type of the grammar to parse
    typedef example6_grammar<iterator_type, example6_tokens::lexer_def> example6_grammar;

    // now we use the types defined above to create the lexer and grammar
    // object instances needed to invoke the parsing process
    example6_tokens tokens;                         // Our lexer
    example6_grammar calc(tokens);                  // Our parser

    std::string str (read_from_file("example6.input"));

    // At this point we generate the iterator pair used to expose the
    // tokenized input stream.
    std::string::iterator it = str.begin();
    iterator_type iter = tokens.begin(it, str.end());
    iterator_type end = tokens.end();

    // Parsing is done based on the token stream, not the character
    // stream read from the input.
    // Note how we use the lexer defined above as the skip parser. It must
    // be explicitly wrapped inside a state directive, switching the lexer
    // state for the duration of skipping whitespace.
    std::string ws("WS");
    bool r = qi::phrase_parse(iter, end, calc, qi::in_state(ws)[tokens.self]);
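
    // The in_state(ws)[...] directive switches the lexer into the "WS"
    // state before each skip attempt, so the whitespace and comment
    // tokens defined in that state can be matched and discarded; in the
    // default state they would never be produced.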

    if (r && iter == end)
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing succeeded\n";
        std::cout << "-------------------------\n";
    }
    else
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing failed\n";
        std::cout << "-------------------------\n";
    }

    std::cout << "Bye... :-) \n\n";
    return 0;
}