/** \file * Base interface for any ANTLR3 lexer. * * An ANLTR3 lexer builds from two sets of components: * * - The runtime components that provide common functionality such as * traversing character streams, building tokens for output and so on. * - The generated rules and struutre of the actual lexer, which call upon the * runtime components. * * A lexer class contains a character input stream, a base recognizer interface * (which it will normally implement) and a token source interface (which it also * implements. The Tokensource interface is called by a token consumer (such as * a parser, but in theory it can be anything that wants a set of abstract * tokens in place of a raw character stream. * * So then, we set up a lexer in a sequence akin to: * * - Create a character stream (something which implements ANTLR3_INPUT_STREAM) * and initialize it. * - Create a lexer interface and tell it where it its input stream is. * This will cause the creation of a base recognizer class, which it will * override with its own implementations of some methods. The lexer creator * can also then in turn override anything it likes. * - The lexer token source interface is then passed to some interface that * knows how to use it, byte calling for a next token. * - When a next token is called, let ze lexing begin. * */ #ifndef _ANTLR3_LEXER_HPP #define _ANTLR3_LEXER_HPP // [The "BSD licence"] // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB // // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // 3. The name of the author may not be used to endorse or promote products // derived from this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* Definitions */ #include "antlr3defs.hpp" ANTLR_BEGIN_NAMESPACE() static const ANTLR_UINT32 ANTLR_STRING_TERMINATOR = 0xFFFFFFFF; template class Lexer : public ImplTraits::template RecognizerType< typename ImplTraits::InputStreamType >, public ImplTraits::TokenSourceType { public: typedef typename ImplTraits::AllocPolicyType AllocPolicyType; typedef typename ImplTraits::InputStreamType InputStreamType; typedef InputStreamType StreamType; typedef typename InputStreamType::IntStreamType IntStreamType; typedef typename ImplTraits::CommonTokenType CommonTokenType; typedef typename ImplTraits::StreamDataType TokenType; typedef typename ImplTraits::StringType StringType; typedef typename ImplTraits::StringStreamType StringStreamType; typedef typename ImplTraits::template RecognizerType< InputStreamType > RecognizerType; typedef typename RecognizerType::RecognizerSharedStateType RecognizerSharedStateType; typedef typename ImplTraits::template ExceptionBaseType ExceptionBaseType; typedef typename ImplTraits::BitsetListType BitsetListType; typedef typename ImplTraits::TokenSourceType TokenSourceType; typedef typename RecognizerSharedStateType::RuleMemoType RuleMemoType; typedef typename RecognizerType::DebugEventListenerType DebuggerType; private: /** A pointer to the character stream whence this lexer is receiving * characters. * TODO: I may come back to this and implement charstream outside * the input stream as per the java implementation. */ InputStreamType* m_input; public: Lexer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state); Lexer(ANTLR_UINT32 sizeHint, InputStreamType* input, RecognizerSharedStateType* state); InputStreamType* get_input() const; IntStreamType* get_istream() const; RecognizerType* get_rec(); const RecognizerType* get_rec() const; TokenSourceType* get_tokSource(); //functions used in .stg file const RecognizerType* get_recognizer() const; RecognizerSharedStateType* get_lexstate() const; void set_lexstate( RecognizerSharedStateType* lexstate ); const TokenSourceType* get_tokSource() const; CommonTokenType* get_ltoken() const; void set_ltoken( const CommonTokenType* ltoken ); bool hasFailed() const; ANTLR_INT32 get_backtracking() const; void inc_backtracking(); void dec_backtracking(); bool get_failedflag() const; void set_failedflag( bool failed ); InputStreamType* get_strstream() const; ANTLR_MARKER index() const; void seek(ANTLR_MARKER index); const CommonTokenType* EOF_Token() const; bool hasException() const; ExceptionBaseType* get_exception() const; void constructEx(); void lrecover(); ANTLR_MARKER mark(); void rewind(ANTLR_MARKER marker); void rewindLast(); void setText( const StringType& text ); void skip(); RuleMemoType* getRuleMemo() const; DebuggerType* get_debugger() const; void setRuleMemo(RuleMemoType* rulememo); ANTLR_UINT32 LA(ANTLR_INT32 i); void consume(); void memoize(ANTLR_MARKER ruleIndex, ANTLR_MARKER ruleParseStart); bool haveParsedRule(ANTLR_MARKER ruleIndex); /** Pointer to a function that sets the charstream source for the lexer and * causes it to be reset. */ void setCharStream(InputStreamType* input); /*! * \brief * Change to a new input stream, remembering the old one. * * \param lexer * Pointer to the lexer instance to switch input streams for. * * \param input * New input stream to install as the current one. * * Switches the current character input stream to * a new one, saving the old one, which we will revert to at the end of this * new one. */ void pushCharStream(InputStreamType* input); /*! * \brief * Stops using the current input stream and reverts to any prior * input stream on the stack. * * \param lexer * Description of parameter lexer. * * Pointer to a function that abandons the current input stream, whether it * is empty or not and reverts to the previous stacked input stream. * * \remark * The function fails silently if there are no prior input streams. */ void popCharStream(); /** Function that emits (a copy of ) the supplied token as the next token in * the stream. */ void emit(const CommonTokenType* token); /** Pointer to a function that constructs a new token from the lexer stored information */ CommonTokenType* emit(); /** Pointer to a function that attempts to match and consume the specified string from the input * stream. Note that strings muse be passed as terminated arrays of ANTLR3_UCHAR. Strings are terminated * with 0xFFFFFFFF, which is an invalid UTF32 character */ bool matchs(ANTLR_UCHAR* string); /** Pointer to a function that matches and consumes the specified character from the input stream. * The input stream is required to provide characters via LA() as UTF32 characters. The default lexer * implementation is source encoding agnostic and so input streams do not generally need to * override the default implmentation. */ bool matchc(ANTLR_UCHAR c); /** Pointer to a function that matches any character in the supplied range (I suppose it could be a token range too * but this would only be useful if the tokens were in tsome guaranteed order which is * only going to happen with a hand crafted token set). */ bool matchRange(ANTLR_UCHAR low, ANTLR_UCHAR high); /** Pointer to a function that matches the next token/char in the input stream * regardless of what it actaully is. */ void matchAny(); /** Pointer to a function that recovers from an error found in the input stream. * Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also * be from a mismatched token that the (*match)() could not recover from. */ void recover(); /** Function to return the current line number in the input stream */ ANTLR_UINT32 getLine(); ANTLR_MARKER getCharIndex(); ANTLR_UINT32 getCharPositionInLine(); /** Function to return the text so far for the current token being generated */ StringType getText(); //Other utility functions void fillExceptionData( ExceptionBaseType* ex ); /** Default lexer error handler (works for 8 bit streams only!!!) */ void displayRecognitionError( ANTLR_UINT8** tokenNames, ExceptionBaseType* ex); void exConstruct(); TokenType* getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e, ANTLR_UINT32 expectedTokenType, BitsetListType* follow); /** Pointer to a function that knows how to free the resources of a lexer */ ~Lexer(); }; ANTLR_END_NAMESPACE() #include "antlr3lexer.inl" #endif