Sources/Antlr3.Runtime/Lexer.cs

/*
 * [The "BSD licence"]
 * Copyright (c) 2005-2008 Terence Parr
 * All rights reserved.
 *
 * Conversion to C#:
 * Copyright (c) 2008-2009 Sam Harwell, Pixel Mine, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

namespace Antlr.Runtime
{
    using ConditionalAttribute = System.Diagnostics.ConditionalAttribute;

    /** <summary>
     *  A lexer is recognizer that draws input symbols from a character stream.
     *  lexer grammars result in a subclass of this object. A Lexer object
     *  uses simplified match() and error recovery mechanisms in the interest
     *  of speed.
     *  </summary>
     */
    public abstract class Lexer : BaseRecognizer, ITokenSource
    {
        /** <summary>Where is the lexer drawing characters from?</summary> */
        protected ICharStream input;

        public Lexer()
        {
        }

        public Lexer( ICharStream input )
        {
            this.input = input;
        }

        public Lexer( ICharStream input, RecognizerSharedState state )
            : base(state)
        {
            this.input = input;
        }

        #region Properties
        public string Text
        {
            /** <summary>Return the text matched so far for the current token or any text override.</summary> */
            get
            {
                if ( state.text != null )
                {
                    return state.text;
                }
                return input.Substring( state.tokenStartCharIndex, CharIndex - state.tokenStartCharIndex );
            }
            /** <summary>Set the complete text of this token; it wipes any previous changes to the text.</summary> */
            set
            {
                state.text = value;
            }
        }
        public int Line
        {
            get
            {
                return input.Line;
            }
            set
            {
                input.Line = value;
            }
        }
        public int CharPositionInLine
        {
            get
            {
                return input.CharPositionInLine;
            }
            set
            {
                input.CharPositionInLine = value;
            }
        }
        #endregion

        public override void Reset()
        {
            base.Reset(); // reset all recognizer state variables
            // wack Lexer state variables
            if ( input != null )
            {
                input.Seek( 0 ); // rewind the input
            }
            if ( state == null )
            {
                return; // no shared state work to do
            }
            state.token = null;
            state.type = TokenTypes.Invalid;
            state.channel = TokenChannels.Default;
            state.tokenStartCharIndex = -1;
            state.tokenStartCharPositionInLine = -1;
            state.tokenStartLine = -1;
            state.text = null;
        }

        /** <summary>Return a token from this source; i.e., match a token on the char stream.</summary> */
        public virtual IToken NextToken()
        {
            for ( ; ; )
            {
                state.token = null;
                state.channel = TokenChannels.Default;
                state.tokenStartCharIndex = input.Index;
                state.tokenStartCharPositionInLine = input.CharPositionInLine;
                state.tokenStartLine = input.Line;
                state.text = null;
                if ( input.LA( 1 ) == CharStreamConstants.EndOfFile )
                {
                    IToken eof = new CommonToken((ICharStream)input, CharStreamConstants.EndOfFile, TokenChannels.Default, input.Index, input.Index);
                    eof.Line = Line;
                    eof.CharPositionInLine = CharPositionInLine;
                    return eof;
                }
                try
                {
                    ParseNextToken();
                    if ( state.token == null )
                    {
                        Emit();
                    }
                    else if ( state.token == Tokens.Skip )
                    {
                        continue;
                    }
                    return state.token;
                }
                catch (MismatchedRangeException mre)
                {
                    ReportError(mre);
                    // MatchRange() routine has already called recover()
                }
                catch (MismatchedTokenException mte)
                {
                    ReportError(mte);
                    // Match() routine has already called recover()
                }
                catch ( RecognitionException re )
                {
                    ReportError( re );
                    Recover( re ); // throw out current char and try again
                }
            }
        }

        /** <summary>
         *  Instruct the lexer to skip creating a token for current lexer rule
         *  and look for another token.  nextToken() knows to keep looking when
         *  a lexer rule finishes with token set to SKIP_TOKEN.  Recall that
         *  if token==null at end of any token rule, it creates one for you
         *  and emits it.
         *  </summary>
         */
        public virtual void Skip()
        {
            state.token = Tokens.Skip;
        }

        /** <summary>This is the lexer entry point that sets instance var 'token'</summary> */
        public abstract void mTokens();

        public virtual ICharStream CharStream
        {
            get
            {
                return input;
            }
            /** <summary>Set the char stream and reset the lexer</summary> */
            set
            {
                input = null;
                Reset();
                input = value;
            }
        }

        public override string SourceName
        {
            get
            {
                return input.SourceName;
            }
        }

        /** <summary>
         *  Currently does not support multiple emits per nextToken invocation
         *  for efficiency reasons.  Subclass and override this method and
         *  nextToken (to push tokens into a list and pull from that list rather
         *  than a single variable as this implementation does).
         *  </summary>
         */
        public virtual void Emit( IToken token )
        {
            state.token = token;
        }

        /** <summary>
         *  The standard method called to automatically emit a token at the
         *  outermost lexical rule.  The token object should point into the
         *  char buffer start..stop.  If there is a text override in 'text',
         *  use that to set the token's text.  Override this method to emit
         *  custom Token objects.
         *  </summary>
         *
         *  <remarks>
         *  If you are building trees, then you should also override
         *  Parser or TreeParser.getMissingSymbol().
         *  </remarks>
         */
        public virtual IToken Emit()
        {
            IToken t = new CommonToken( input, state.type, state.channel, state.tokenStartCharIndex, CharIndex - 1 );
            t.Line = state.tokenStartLine;
            t.Text = state.text;
            t.CharPositionInLine = state.tokenStartCharPositionInLine;
            Emit( t );
            return t;
        }

        public virtual void Match( string s )
        {
            int i = 0;
            while ( i < s.Length )
            {
                if ( input.LA( 1 ) != s[i] )
                {
                    if ( state.backtracking > 0 )
                    {
                        state.failed = true;
                        return;
                    }
                    MismatchedTokenException mte = new MismatchedTokenException(s[i], input, TokenNames);
                    Recover( mte );
                    throw mte;
                }
                i++;
                input.Consume();
                state.failed = false;
            }
        }

        public virtual void MatchAny()
        {
            input.Consume();
        }

        public virtual void Match( int c )
        {
            if ( input.LA( 1 ) != c )
            {
                if ( state.backtracking > 0 )
                {
                    state.failed = true;
                    return;
                }
                MismatchedTokenException mte = new MismatchedTokenException(c, input, TokenNames);
                Recover( mte );  // don't really recover; just consume in lexer
                throw mte;
            }
            input.Consume();
            state.failed = false;
        }

        public virtual void MatchRange( int a, int b )
        {
            if ( input.LA( 1 ) < a || input.LA( 1 ) > b )
            {
                if ( state.backtracking > 0 )
                {
                    state.failed = true;
                    return;
                }
                MismatchedRangeException mre = new MismatchedRangeException(a, b, input);
                Recover( mre );
                throw mre;
            }
            input.Consume();
            state.failed = false;
        }

        /** <summary>What is the index of the current character of lookahead?</summary> */
        public virtual int CharIndex
        {
            get
            {
                return input.Index;
            }
        }

        public override void ReportError( RecognitionException e )
        {
            /** TODO: not thought about recovery in lexer yet.
             *
            // if we've already reported an error and have not matched a token
            // yet successfully, don't report any errors.
            if ( errorRecovery ) {
                //System.err.print("[SPURIOUS] ");
                return;
            }
            errorRecovery = true;
             */

            DisplayRecognitionError( this.TokenNames, e );
        }

        public override string GetErrorMessage( RecognitionException e, string[] tokenNames )
        {
            string msg = null;
            if ( e is MismatchedTokenException )
            {
                MismatchedTokenException mte = (MismatchedTokenException)e;
                msg = "mismatched character " + GetCharErrorDisplay( e.Character ) + " expecting " + GetCharErrorDisplay( mte.Expecting );
            }
            else if ( e is NoViableAltException )
            {
                NoViableAltException nvae = (NoViableAltException)e;
                // for development, can add "decision=<<"+nvae.grammarDecisionDescription+">>"
                // and "(decision="+nvae.decisionNumber+") and
                // "state "+nvae.stateNumber
                msg = "no viable alternative at character " + GetCharErrorDisplay( e.Character );
            }
            else if ( e is EarlyExitException )
            {
                EarlyExitException eee = (EarlyExitException)e;
                // for development, can add "(decision="+eee.decisionNumber+")"
                msg = "required (...)+ loop did not match anything at character " + GetCharErrorDisplay( e.Character );
            }
            else if ( e is MismatchedNotSetException )
            {
                MismatchedNotSetException mse = (MismatchedNotSetException)e;
                msg = "mismatched character " + GetCharErrorDisplay( e.Character ) + " expecting set " + mse.Expecting;
            }
            else if ( e is MismatchedSetException )
            {
                MismatchedSetException mse = (MismatchedSetException)e;
                msg = "mismatched character " + GetCharErrorDisplay( e.Character ) + " expecting set " + mse.Expecting;
            }
            else if ( e is MismatchedRangeException )
            {
                MismatchedRangeException mre = (MismatchedRangeException)e;
                msg = "mismatched character " + GetCharErrorDisplay( e.Character ) + " expecting set " +
                      GetCharErrorDisplay( mre.A ) + ".." + GetCharErrorDisplay( mre.B );
            }
            else
            {
                msg = base.GetErrorMessage( e, tokenNames );
            }
            return msg;
        }

        public virtual string GetCharErrorDisplay( int c )
        {
            string s = ( (char)c ).ToString();
            switch ( c )
            {
            case TokenTypes.EndOfFile:
                s = "<EOF>";
                break;
            case '\n':
                s = "\\n";
                break;
            case '\t':
                s = "\\t";
                break;
            case '\r':
                s = "\\r";
                break;
            }
            return "'" + s + "'";
        }

        /** <summary>
         *  Lexers can normally match any char in it's vocabulary after matching
         *  a token, so do the easy thing and just kill a character and hope
         *  it all works out.  You can instead use the rule invocation stack
         *  to do sophisticated error recovery if you are in a fragment rule.
         *  </summary>
         */
        public virtual void Recover( RecognitionException re )
        {
            //System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
            //re.printStackTrace();
            input.Consume();
        }

        [Conditional("ANTLR_TRACE")]
        public virtual void TraceIn( string ruleName, int ruleIndex )
        {
            string inputSymbol = ( (char)input.LT( 1 ) ) + " line=" + Line + ":" + CharPositionInLine;
            base.TraceIn( ruleName, ruleIndex, inputSymbol );
        }

        [Conditional("ANTLR_TRACE")]
        public virtual void TraceOut( string ruleName, int ruleIndex )
        {
            string inputSymbol = ( (char)input.LT( 1 ) ) + " line=" + Line + ":" + CharPositionInLine;
            base.TraceOut( ruleName, ruleIndex, inputSymbol );
        }

        protected virtual void ParseNextToken()
        {
            mTokens();
        }
    }
}