// Copyright (c) 2001-2010 Hartmut Kaiser
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

// This example shows how to create a simple lexer recognizing a couple of
// different tokens and how to use it with a grammar. This example has a
// heavily backtracking grammar, which makes it a good candidate for lexer
// based parsing: all tokens are scanned and generated only once, even if
// backtracking is required, which speeds up the overall parsing process
// considerably, outweighing the overhead of setting up the lexer.
// Additionally, it demonstrates how to use one of the defined tokens as a
// parser component in the grammar.
//
// The grammar recognizes a simple input structure: any number of simple
// English sentences (statements, questions, and commands) are recognized
// and counted separately.

// #define BOOST_SPIRIT_DEBUG
// #define BOOST_SPIRIT_LEXERTL_DEBUG

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>

#include <iostream>
#include <fstream>
#include <string>

#include "example.hpp"

using namespace boost::spirit;
using namespace boost::spirit::ascii;
using boost::phoenix::ref;

///////////////////////////////////////////////////////////////////////////////
// Token definition
///////////////////////////////////////////////////////////////////////////////
template <typename Lexer>
struct example2_tokens : lex::lexer<Lexer>
{
    example2_tokens()
    {
        // A 'word' consists of one or more letters and an optional
        // apostrophe. If it contains an apostrophe, there may be only one,
        // and the apostrophe must be both preceded and followed by at least
        // one letter. For example, "I'm" and "doesn't" meet the definition
        // of 'word' given below.
        word = "[a-zA-Z]+('[a-zA-Z]+)?";
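        // By contrast, strings such as "'tis" (leading apostrophe) or
        // "rock'n'roll" (two apostrophes) do not satisfy this definition
        // and would not be recognized as a single 'word' token.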

        // Associate the tokens and the token set with the lexer. Note that
        // single character token definitions, as used below, are always
        // interpreted literally and never as special regex characters. This
        // makes it possible to assign each single character token the id of
        // its character code value, allowing those tokens to be referenced
        // as literals in Qi grammars.
        this->self = lex::token_def<>(',') | '!' | '.' | '?' | ' ' | '\n' | word;
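
        // For example, the ',' token defined above receives the id 44 (the
        // character code of ','), which is why the grammar below can match
        // it simply by writing ',' (and likewise '!', '.', '?', and so on).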
    }

    lex::token_def<> word;
};

///////////////////////////////////////////////////////////////////////////////
// Grammar definition
///////////////////////////////////////////////////////////////////////////////
template <typename Iterator>
struct example2_grammar : qi::grammar<Iterator>
{
    template <typename TokenDef>
    example2_grammar(TokenDef const& tok)
      : example2_grammar::base_type(story)
      , paragraphs(0), commands(0), questions(0), statements(0)
    {
        story
            = +paragraph
            ;

        paragraph
            = ( +( command [ ++ref(commands) ]
                 | question [ ++ref(questions) ]
                 | statement [ ++ref(statements) ]
                 )
              >> *char_(' ') >> +char_('\n')
              )
              [ ++ref(paragraphs) ]
            ;

        command
            = +(tok.word | ' ' | ',') >> '!'
            ;

        question
            = +(tok.word | ' ' | ',') >> '?'
            ;

        statement
            = +(tok.word | ' ' | ',') >> '.'
            ;
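
        // As an illustration (a made-up sample, not necessarily taken from
        // example2.input): given the line
        //
        //     What's up? Don't panic. Keep calm, and carry on!
        //
        // followed by a newline, the grammar counts one paragraph containing
        // one question, one statement, and one command.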

        BOOST_SPIRIT_DEBUG_NODE(story);
        BOOST_SPIRIT_DEBUG_NODE(paragraph);
        BOOST_SPIRIT_DEBUG_NODE(command);
        BOOST_SPIRIT_DEBUG_NODE(question);
        BOOST_SPIRIT_DEBUG_NODE(statement);
    }

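    // Note that the rules are declared without a skipper: whitespace reaches
    // the grammar as ordinary ' ' and '\n' tokens and is consumed explicitly
    // by the rules above.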
    qi::rule<Iterator> story, paragraph, command, question, statement;
    int paragraphs, commands, questions, statements;
};

///////////////////////////////////////////////////////////////////////////////
int main()
{
    // Iterator type used to expose the underlying input stream.
    typedef std::string::iterator base_iterator_type;

    // This is the token type to return from the lexer iterator.
    typedef lex::lexertl::token<base_iterator_type> token_type;

    // This is the lexer type to use to tokenize the input.
    // Here we use the lexertl based lexer engine.
    typedef lex::lexertl::lexer<token_type> lexer_type;

    // This is the token definition type (derived from the given lexer type).
    typedef example2_tokens<lexer_type> example2_tokens;

    // This is the iterator type exposed by the lexer.
    typedef example2_tokens::iterator_type iterator_type;

    // This is the type of the grammar to parse.
    typedef example2_grammar<iterator_type> example2_grammar;

    // Now we use the types defined above to create the lexer and grammar
    // object instances needed to invoke the parsing process.
    example2_tokens tokens;         // Our lexer
    example2_grammar calc(tokens);  // Our parser

    std::string str (read_from_file("example2.input"));

    // At this point we generate the iterator pair used to expose the
    // tokenized input stream.
    std::string::iterator it = str.begin();
    iterator_type iter = tokens.begin(it, str.end());
    iterator_type end = tokens.end();
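
    // Note: the lexer iterator produces tokens on demand as the parser
    // requests them; as explained at the top of this file, each token is
    // scanned and generated only once, even when the grammar backtracks.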

    // Parsing is done based on the token stream, not the character
    // stream read from the input.
    bool r = qi::parse(iter, end, calc);

    if (r && iter == end)
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing succeeded\n";
        std::cout << "There were "
                  << calc.commands << " commands, "
                  << calc.questions << " questions, and "
                  << calc.statements << " statements.\n";
        std::cout << "-------------------------\n";
    }
    else
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing failed\n";
        std::cout << "-------------------------\n";
    }

    std::cout << "Bye... :-) \n\n";
    return 0;
}