//  Copyright (c) 2001-2010 Hartmut Kaiser
//
//  Distributed under the Boost Software License, Version 1.0. (See accompanying
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

//  This example shows how to create a simple lexer recognizing a couple of
//  different tokens aimed at a simple language and how to use this lexer with
//  a grammar. It shows how to associate attributes with tokens and how to
//  access the token attributes from inside the grammar.
//
//  Additionally, this example demonstrates how to define a token set usable
//  as the skip parser during parsing, which allows several tokens to be
//  defined as ignorable.
//
//  The main purpose of this example is to show how inheritance can be used to
//  override and extend parts of a base grammar and to add token definitions
//  to a base lexer.
//
//  Further, it shows how you can use the 'omit' attribute type specifier
//  for token definitions to force the token to have no attribute (expose an
//  unused attribute).
//
//  This example recognizes a very simple programming language having
//  assignment statements and if and while control structures. Look at the file
//  example5.input for an example.

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>

#include <iostream>
#include <fstream>
#include <string>

#include "example.hpp"

using namespace boost::spirit;
using boost::phoenix::val;

40 ///////////////////////////////////////////////////////////////////////////////
41 //  Token definition base, defines all tokens for the base grammar below
42 ///////////////////////////////////////////////////////////////////////////////
43 template <typename Lexer>
44 struct example5_base_tokens : lex::lexer<Lexer>
45 {
46 protected:
47     // this lexer is supposed to be used as a base type only
example5_base_tokensexample5_base_tokens48     example5_base_tokens() {}
49 
50 public:
init_token_definitionsexample5_base_tokens51     void init_token_definitions()
52     {
53         // define the tokens to match
54         identifier = "[a-zA-Z_][a-zA-Z0-9_]*";
55         constant = "[0-9]+";
56         if_ = "if";
57         while_ = "while";
58 
59         // associate the tokens and the token set with the lexer
60         this->self += lex::token_def<>('(') | ')' | '{' | '}' | '=' | ';' | constant;
61         this->self += if_ | while_ | identifier;
62 
63         // define the whitespace to ignore (spaces, tabs, newlines and C-style
64         // comments)
65         this->self("WS")
66             =   lex::token_def<>("[ \\t\\n]+")
67             |   "\\/\\*[^*]*\\*+([^/*][^*]*\\*+)*\\/"
68             ;
69     }
70 
71     // these tokens have no attribute
72     lex::token_def<lex::omit> if_, while_;
73 
74     // The following two tokens have an associated attribute type, 'identifier'
75     // carries a string (the identifier name) and 'constant' carries the
76     // matched integer value.
77     //
78     // Note: any token attribute type explicitly specified in a token_def<>
79     //       declaration needs to be listed during token type definition as
80     //       well (see the typedef for the token_type below).
81     //
82     // The conversion of the matched input to an instance of this type occurs
83     // once (on first access), which makes token attributes as efficient as
84     // possible. Moreover, token instances are constructed once by the lexer
85     // library. From this point on tokens are passed by reference only,
86     // avoiding them being copied around.
87     lex::token_def<std::string> identifier;
88     lex::token_def<unsigned int> constant;
89 };
90 
91 ///////////////////////////////////////////////////////////////////////////////
92 //  Grammar definition base, defines a basic language
93 ///////////////////////////////////////////////////////////////////////////////
94 template <typename Iterator, typename Lexer>
95 struct example5_base_grammar
96   : qi::grammar<Iterator, qi::in_state_skipper<Lexer> >
97 {
98     template <typename TokenDef>
example5_base_grammarexample5_base_grammar99     example5_base_grammar(TokenDef const& tok)
100       : example5_base_grammar::base_type(program)
101     {
102         using boost::spirit::_val;
103 
104         program
105             =  +block
106             ;
107 
108         block
109             =   '{' >> *statement >> '}'
110             ;
111 
112         statement
113             =   assignment
114             |   if_stmt
115             |   while_stmt
116             ;
117 
118         assignment
119             =   (tok.identifier >> '=' >> expression >> ';')
120                 [
121                     std::cout << val("assignment statement to: ") << _1 << "\n"
122                 ]
123             ;
124 
125         if_stmt
126             =   (tok.if_ >> '(' >> expression >> ')' >> block)
127                 [
128                     std::cout << val("if expression: ") << _1 << "\n"
129                 ]
130             ;
131 
132         while_stmt
133             =   (tok.while_ >> '(' >> expression >> ')' >> block)
134                 [
135                     std::cout << val("while expression: ") << _1 << "\n"
136                 ]
137             ;
138 
139         //  since expression has a variant return type accommodating for
140         //  std::string and unsigned integer, both possible values may be
141         //  returned to the calling rule
142         expression
143             =   tok.identifier [ _val = _1 ]
144             |   tok.constant   [ _val = _1 ]
145             ;
146     }
147 
148     typedef qi::in_state_skipper<Lexer> skipper_type;
149 
150     qi::rule<Iterator, skipper_type> program, block, statement;
151     qi::rule<Iterator, skipper_type> assignment, if_stmt;
152     qi::rule<Iterator, skipper_type> while_stmt;
153 
154     //  the expression is the only rule having a return value
155     typedef boost::variant<unsigned int, std::string> expression_type;
156     qi::rule<Iterator, expression_type(), skipper_type>  expression;
157 };
158 
159 ///////////////////////////////////////////////////////////////////////////////
160 //  Token definition for derived lexer, defines additional tokens
161 ///////////////////////////////////////////////////////////////////////////////
162 template <typename Lexer>
163 struct example5_tokens : example5_base_tokens<Lexer>
164 {
165     typedef example5_base_tokens<Lexer> base_type;
166 
example5_tokensexample5_tokens167     example5_tokens()
168     {
169         // define the additional token to match
170         else_ = "else";
171 
172         // associate the new token with the lexer, note we add 'else' before
173         // anything else to add it to the token set before the identifier
174         // token, otherwise "else" would be matched as an identifier
175         this->self = else_;
176 
177         // now add the token definitions from the base class
178         this->base_type::init_token_definitions();
179     }
180 
181     // this token has no attribute
182     lex::token_def<lex::omit> else_;
183 };
184 
185 ///////////////////////////////////////////////////////////////////////////////
186 //  Derived grammar definition, defines a language extension
187 ///////////////////////////////////////////////////////////////////////////////
188 template <typename Iterator, typename Lexer>
189 struct example5_grammar : example5_base_grammar<Iterator, Lexer>
190 {
191     template <typename TokenDef>
example5_grammarexample5_grammar192     example5_grammar(TokenDef const& tok)
193       : example5_base_grammar<Iterator, Lexer>(tok)
194     {
195         // we alter the if_stmt only
196         this->if_stmt
197             =   this->if_stmt.copy() >> -(tok.else_ >> this->block)
198             ;
199     }
200 };
201 
202 ///////////////////////////////////////////////////////////////////////////////
main()203 int main()
204 {
205     // iterator type used to expose the underlying input stream
206     typedef std::string::iterator base_iterator_type;
207 
208     // This is the lexer token type to use. The second template parameter lists
209     // all attribute types used for token_def's during token definition (see
210     // example5_base_tokens<> above). Here we use the predefined lexertl token
211     // type, but any compatible token type may be used instead.
212     //
213     // If you don't list any token attribute types in the following declaration
214     // (or just use the default token type: lexertl_token<base_iterator_type>)
215     // it will compile and work just fine, just a bit less efficient. This is
216     // because the token attribute will be generated from the matched input
217     // sequence every time it is requested. But as soon as you specify at
218     // least one token attribute type you'll have to list all attribute types
219     // used for token_def<> declarations in the token definition class above,
220     // otherwise compilation errors will occur.
221     typedef lex::lexertl::token<
222         base_iterator_type, boost::mpl::vector<unsigned int, std::string>
223     > token_type;
224 
225     // Here we use the lexertl based lexer engine.
226     typedef lex::lexertl::lexer<token_type> lexer_type;
227 
228     // This is the token definition type (derived from the given lexer type).
229     typedef example5_tokens<lexer_type> example5_tokens;
230 
231     // this is the iterator type exposed by the lexer
232     typedef example5_tokens::iterator_type iterator_type;
233 
234     // this is the type of the grammar to parse
235     typedef example5_grammar<iterator_type, example5_tokens::lexer_def> example5_grammar;
236 
237     // now we use the types defined above to create the lexer and grammar
238     // object instances needed to invoke the parsing process
239     example5_tokens tokens;                         // Our lexer
240     example5_grammar calc(tokens);                  // Our parser
241 
242     std::string str (read_from_file("example5.input"));
243 
244     // At this point we generate the iterator pair used to expose the
245     // tokenized input stream.
246     std::string::iterator it = str.begin();
247     iterator_type iter = tokens.begin(it, str.end());
248     iterator_type end = tokens.end();
249 
250     // Parsing is done based on the token stream, not the character
251     // stream read from the input.
252     // Note how we use the lexer defined above as the skip parser. It must
253     // be explicitly wrapped inside a state directive, switching the lexer
254     // state for the duration of skipping whitespace.
255     std::string ws("WS");
256     bool r = qi::phrase_parse(iter, end, calc, qi::in_state(ws)[tokens.self]);
257 
258     if (r && iter == end)
259     {
260         std::cout << "-------------------------\n";
261         std::cout << "Parsing succeeded\n";
262         std::cout << "-------------------------\n";
263     }
264     else
265     {
266         std::cout << "-------------------------\n";
267         std::cout << "Parsing failed\n";
268         std::cout << "-------------------------\n";
269     }
270 
271     std::cout << "Bye... :-) \n\n";
272     return 0;
273 }
274