package ANTLR::Runtime::Lexer; use English qw( -no_match_vars ); use Readonly; use Carp; use Switch; use ANTLR::Runtime::Token; use ANTLR::Runtime::CommonToken; use ANTLR::Runtime::CharStream; use ANTLR::Runtime::MismatchedTokenException; use Moose; extends 'ANTLR::Runtime::BaseRecognizer'; with 'ANTLR::Runtime::TokenSource'; has 'input' => ( is => 'rw', does => 'ANTLR::Runtime::CharStream', ); sub reset { my ($self) = @_; # reset all recognizer state variables $self->SUPER::reset(); # wack Lexer state variables if (defined $self->input) { # rewind the input $self->input->seek(0); } if (defined $self->state) { $self->state->token(undef); $self->state->type(ANTLR::Runtime::Token->INVALID_TOKEN_TYPE); $self->state->channel(ANTLR::Runtime::Token->DEFAULT_CHANNEL); $self->state->token_start_char_index(-1); $self->state->token_start_char_position_in_line(-1); $self->state->start_line(-1); $self->state->text(undef); } } # Return a token from this source; i.e., match a token on the char # stream. sub next_token { my ($self) = @_; while (1) { $self->state->token(undef); $self->state->channel(ANTLR::Runtime::Token->DEFAULT_CHANNEL); $self->state->token_start_char_index($self->input->index()); $self->state->token_start_char_position_in_line($self->input->get_char_position_in_line()); $self->state->token_start_line($self->input->get_line()); $self->state->text(undef); if ($self->input->LA(1) eq ANTLR::Runtime::CharStream->EOF) { return ANTLR::Runtime::Token->EOF_TOKEN; } my $rv; my $op = ''; eval { $self->m_tokens(); if (!defined $self->state->token) { $self->emit(); } elsif ($self->state->token == ANTLR::Runtime::Token->SKIP_TOKEN) { $op = 'next'; return; } $op = 'return'; $rv = $self->state->token; }; return $rv if $op eq 'return'; next if $op eq 'next'; if ($EVAL_ERROR) { my $exception = $EVAL_ERROR; if ($exception->isa('ANTLR::Runtime::RecognitionException')) { $self->report_error($exception); $self->recover($exception); } else { croak $exception; } } } } # Instruct the lexer to skip creating a token for current lexer rule # and look for another token. nextToken() knows to keep looking when # a lexer rule finishes with token set to SKIP_TOKEN. Recall that # if token==null at end of any token rule, it creates one for you # and emits it. sub skip { my ($self) = @_; $self->state->token(ANTLR::Runtime::Token->SKIP_TOKEN); return; } # This is the lexer entry point that sets instance var 'token' sub m_tokens { croak "Unimplemented"; } # Set the char stream and reset the lexer sub set_char_stream { my ($self, $input) = @_; $self->input(undef); $self->reset(); $self->input($input); } sub get_char_stream { my ($self) = @_; return $self->input; } sub get_source_name { my ($self) = @_; return $self->input->get_source_name(); } sub emit { if (@_ == 1) { my ($self) = @_; # The standard method called to automatically emit a token at the # outermost lexical rule. The token object should point into the # char buffer start..stop. If there is a text override in 'text', # use that to set the token's text. Override this method to emit # custom Token objects. my $t = ANTLR::Runtime::CommonToken->new({ input => $self->input, type => $self->state->type, channel => $self->state->channel, start => $self->state->token_start_char_index, stop => $self->get_char_index() - 1 }); $t->set_line($self->state->token_start_line); $t->set_text($self->state->text); $t->set_char_position_in_line($self->state->token_start_char_position_in_line); $self->emit($t); return $t; } elsif (@_ == 2) { my ($self, $token) = @_; # Currently does not support multiple emits per nextToken invocation # for efficiency reasons. Subclass and override this method and # nextToken (to push tokens into a list and pull from that list rather # than a single variable as this implementation does). $self->state->token($token); } } sub match { my ($self, $s) = @_; foreach my $c (split //, $s) { if ($self->input->LA(1) ne $c) { if ($self->state->backtracking > 0) { $self->state->failed(1); return; } my $mte = ANTLR::Runtime::MismatchedTokenException->new({ expecting => $c, input => $self->input }); $self->recover($mte); croak $mte; } $self->input->consume(); $self->state->failed(0); } } sub match_any { my ($self) = @_; $self->input->consume(); } sub match_range { my ($self, $a, $b) = @_; if ($self->input->LA(1) lt $a || $self->input->LA(1) gt $b) { if ($self->state->backtracking > 0) { $self->state->failed(1); return; } my $mre = ANTLR::Runtime::MismatchedRangeException($a, $b, $self->input); $self->recover($mre); croak $mre; } $self->input->consume(); $self->state->failed(0); } sub get_line { my ($self) = @_; return $self->input->get_line(); } sub get_char_position_in_line { my ($self) = @_; return $self->input->get_char_position_in_line(); } # What is the index of the current character of lookahead? sub get_char_index { my ($self) = @_; return $self->input->index(); } # Return the text matched so far for the current token or any # text override. sub get_text { my ($self) = @_; if (defined $self->state->text) { return $self->state->text; } return $self->input->substring($self->state->token_start_char_index, $self->get_char_index() - 1); } # Set the complete text of this token; it wipes any previous # changes to the text. sub set_text { my ($self, $text) = @_; $self->state->text($text); } sub report_error { Readonly my $usage => 'void report_error(RecognitionException e)'; croak $usage if @_ != 2; my ($self, $e) = @_; $self->display_recognition_error($self->get_token_names(), $e); } sub get_error_message { my ($self, $e, $token_names) = @_; my $msg; if ($e->isa('ANTLR::Runtime::MismatchedTokenException')) { $msg = 'mismatched character ' . $self->get_char_error_display($e->get_c()) . ' expecting ' . $self->get_char_error_display($e->expecting); } elsif ($e->isa('ANTLR::Runtime::NoViableAltException')) { $msg = 'no viable alternative at character ' . $self->get_char_error_display($e->get_c()); } elsif ($e->isa('ANTLR::Runtime::EarlyExitException')) { $msg = 'required (...)+ loop did not match anything at character ' . $self->get_char_error_display($e->get_c()); } elsif ($e->isa('ANTLR::Runtime::MismatchedSetException')) { $msg = 'mismatched character ' . $self->get_char_error_display($e->get_c()) . ' expecting set ' . $e->expecting; } elsif ($e->isa('ANTLR::Runtime::MismatchedNotSetException')) { $msg = 'mismatched character ' . $self->get_char_error_display($e->get_c()) . ' expecting set ' . $e->expecting; } elsif ($e->isa('ANTLR::Runtime::MismatchedRangeException')) { $msg = 'mismatched character ' . $self->get_char_error_display($e->get_c()) . ' expecting set ' . $self->get_char_error_display($e->a) . '..' . $self->get_char_error_display($e->b); } else { $msg = $self->SUPER::get_error_message($e, $token_names); } return $msg; } sub get_char_error_display { my ($self, $c) = @_; my $s; if ($c eq ANTLR::Runtime::Token->EOF) { $s = ''; } elsif ($c eq "\n") { $s = '\n'; } elsif ($c eq "\t") { $s = '\t'; } elsif ($c eq "\r") { $s = '\r'; } else { $s = $c; } return "'$s'"; } # Lexers can normally match any char in it's vocabulary after matching # a token, so do the easy thing and just kill a character and hope # it all works out. You can instead use the rule invocation stack # to do sophisticated error recovery if you are in a fragment rule. sub recover { my ($self, $re) = @_; $self->input->consume(); } sub trace_in { my ($self, $rule_name, $rule_index) = @_; my $input_symbol = $self->input->LT(1) . ' line=' . $self->get_line() . ':' . $self->get_char_position_in_line(); $self->SUPER::trace_in($rule_name, $rule_index, $input_symbol); } sub trace_out { my ($self, $rule_name, $rule_index) = @_; my $input_symbol = $self->input->LT(1) . ' line=' . $self->get_line() . ':' . $self->get_char_position_in_line(); $self->SUPER::trace_out($rule_name, $rule_index, $input_symbol); } no Moose; __PACKAGE__->meta->make_immutable(); 1;