// Copyright (c) 2001-2010 Hartmut Kaiser
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

// This example shows how to create a simple lexer recognizing a couple of
// different tokens and how to use it with a grammar. This example has a
// heavily backtracking grammar, which makes it a good candidate for lexer
// based parsing: all tokens are scanned and generated only once, even if
// backtracking is required, which speeds up the overall parsing process
// considerably, outweighing the overhead of setting up the lexer.
// Additionally, it demonstrates how to use one of the defined tokens as a
// parser component in the grammar.
//
// The grammar recognizes a simple input structure: any number of simple
// English sentences (statements, questions, and commands) are recognized
// and counted separately.

// #define BOOST_SPIRIT_DEBUG
// #define BOOST_SPIRIT_LEXERTL_DEBUG

#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>

#include <iostream>
#include <fstream>
#include <string>

#include "example.hpp"

using namespace boost::spirit;
using namespace boost::spirit::ascii;
using boost::phoenix::ref;

///////////////////////////////////////////////////////////////////////////////
// Token definition
///////////////////////////////////////////////////////////////////////////////
template <typename Lexer>
struct example2_tokens : lex::lexer<Lexer>
{
    example2_tokens()
    {
        // A 'word' consists of one or more letters and an optional
        // apostrophe. If it contains an apostrophe, there may be only one,
        // and the apostrophe must be both preceded and followed by at least
        // one letter. For example, "I'm" and "doesn't" meet the definition
        // of 'word' given below.
        word = "[a-zA-Z]+('[a-zA-Z]+)?";
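        // By contrast, strings such as "'tis" (leading apostrophe) or
        // "rock'n'roll" (two apostrophes) do not satisfy this definition
        // and would not be recognized as a single 'word' token.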

        // Associate the tokens and the token set with the lexer. Note that
        // single character token definitions, as used below, are always
        // interpreted literally and never as special regex characters. This
        // makes it possible to assign each single character token the id of
        // its character code value, allowing those tokens to be referenced
        // as literals in Qi grammars.
        this->self = lex::token_def<>(',') | '!' | '.' | '?' | ' ' | '\n' | word;
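
        // For example, the ',' token defined above receives the id 44 (the
        // character code of ','), which is why the grammar below can match
        // it simply by writing ',' (and likewise '!', '.', '?', and so on).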
    }

    lex::token_def<> word;
};

///////////////////////////////////////////////////////////////////////////////
// Grammar definition
///////////////////////////////////////////////////////////////////////////////
template <typename Iterator>
struct example2_grammar : qi::grammar<Iterator>
{
    template <typename TokenDef>
    example2_grammar(TokenDef const& tok)
      : example2_grammar::base_type(story)
      , paragraphs(0), commands(0), questions(0), statements(0)
    {
        story
            = +paragraph
            ;

        paragraph
            = ( +( command [ ++ref(commands) ]
                 | question [ ++ref(questions) ]
                 | statement [ ++ref(statements) ]
                 )
              >> *char_(' ') >> +char_('\n')
              )
              [ ++ref(paragraphs) ]
            ;

        command
            = +(tok.word | ' ' | ',') >> '!'
            ;

        question
            = +(tok.word | ' ' | ',') >> '?'
            ;

        statement
            = +(tok.word | ' ' | ',') >> '.'
            ;
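
        // As an illustration (a made-up sample, not necessarily taken from
        // example2.input): given the line
        //
        //     What's up? Don't panic. Keep calm, and carry on!
        //
        // followed by a newline, the grammar counts one paragraph containing
        // one question, one statement, and one command.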

        BOOST_SPIRIT_DEBUG_NODE(story);
        BOOST_SPIRIT_DEBUG_NODE(paragraph);
        BOOST_SPIRIT_DEBUG_NODE(command);
        BOOST_SPIRIT_DEBUG_NODE(question);
        BOOST_SPIRIT_DEBUG_NODE(statement);
    }

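    // Note that the rules are declared without a skipper: whitespace reaches
    // the grammar as ordinary ' ' and '\n' tokens and is consumed explicitly
    // by the rules above.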
    qi::rule<Iterator> story, paragraph, command, question, statement;
    int paragraphs, commands, questions, statements;
};

///////////////////////////////////////////////////////////////////////////////
int main()
{
    // Iterator type used to expose the underlying input stream.
    typedef std::string::iterator base_iterator_type;

    // This is the token type to return from the lexer iterator.
    typedef lex::lexertl::token<base_iterator_type> token_type;

    // This is the lexer type to use to tokenize the input.
    // Here we use the lexertl based lexer engine.
    typedef lex::lexertl::lexer<token_type> lexer_type;

    // This is the token definition type (derived from the given lexer type).
    typedef example2_tokens<lexer_type> example2_tokens;

    // This is the iterator type exposed by the lexer.
    typedef example2_tokens::iterator_type iterator_type;

    // This is the type of the grammar to parse.
    typedef example2_grammar<iterator_type> example2_grammar;

    // Now we use the types defined above to create the lexer and grammar
    // object instances needed to invoke the parsing process.
    example2_tokens tokens;         // Our lexer
    example2_grammar calc(tokens);  // Our parser

    std::string str (read_from_file("example2.input"));

    // At this point we generate the iterator pair used to expose the
    // tokenized input stream.
    std::string::iterator it = str.begin();
    iterator_type iter = tokens.begin(it, str.end());
    iterator_type end = tokens.end();
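
    // Note: the lexer iterator produces tokens on demand as the parser
    // requests them; as explained at the top of this file, each token is
    // scanned and generated only once, even when the grammar backtracks.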

    // Parsing is done based on the token stream, not the character
    // stream read from the input.
    bool r = qi::parse(iter, end, calc);

    if (r && iter == end)
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing succeeded\n";
        std::cout << "There were "
                  << calc.commands << " commands, "
                  << calc.questions << " questions, and "
                  << calc.statements << " statements.\n";
        std::cout << "-------------------------\n";
    }
    else
    {
        std::cout << "-------------------------\n";
        std::cout << "Parsing failed\n";
        std::cout << "-------------------------\n";
    }

    std::cout << "Bye... :-) \n\n";
    return 0;
}