#!/usr/bin/ruby # encoding: utf-8 require 'antlr3/test/functional' class XMLLexerTest < ANTLR3::Test::Functional inline_grammar( <<-'END' ) lexer grammar XML; options { language = Ruby; } @members { include ANTLR3::Test::CaptureOutput include ANTLR3::Test::RaiseErrors def quote(text) text = text.gsub(/\"/, '\\"') \%("#{ text }") end } DOCUMENT : XMLDECL? WS? DOCTYPE? WS? ELEMENT WS? ; fragment DOCTYPE : '' ; fragment INTERNAL_DTD : '[' (options {greedy=false;} : .)* ']' ; fragment PI : '' ; fragment XMLDECL : '' ; fragment ELEMENT : ( START_TAG (ELEMENT | t=PCDATA {say("PCDATA: " << quote($t.text))} | t=CDATA {say("CDATA: " << quote($t.text))} | t=COMMENT {say("Comment: " << quote($t.text))} | pi=PI )* END_TAG | EMPTY_ELEMENT ) ; fragment START_TAG : '<' WS? name=GENERIC_ID WS? {say("Start Tag: " + $name.text)} ( ATTRIBUTE WS? )* '>' ; fragment EMPTY_ELEMENT : '<' WS? name=GENERIC_ID WS? {say("Empty Element: " + $name.text)} ( ATTRIBUTE WS? )* '/>' ; fragment ATTRIBUTE : name=GENERIC_ID WS? '=' WS? value=VALUE {say("Attr: " + $name.text + " = "+ $value.text)} ; fragment END_TAG : '' {say("End Tag: " + $name.text)} ; fragment COMMENT : '' ; fragment CDATA : '' ; fragment PCDATA : (~'<')+ ; fragment VALUE : ( '\"' (~'\"')* '\"' | '\'' (~'\'')* '\'' ) ; fragment GENERIC_ID : ( LETTER | '_' | ':') ( options {greedy=true;} : LETTER | '0'..'9' | '.' 
| '-' | '_' | ':' )* ; fragment LETTER : 'a'..'z' | 'A'..'Z' ; fragment WS : ( ' ' | '\t' | ( '\n' | '\r\n' | '\r' ) )+ ; END it "should be valid" do lexer = XML::Lexer.new( <<-'END'.fixed_indent( 0 ) ) ]> Text öäüß & < END lexer.map { |tk| tk } lexer.output.should == <<-'END'.fixed_indent( 0 ) XML declaration Attr: version = '1.0' ROOTELEMENT: component INTERNAL DTD: [ ] Start Tag: component Attr: attr = "val'ue" Attr: attr2 = 'val"ue' PCDATA: " " Comment: "" PCDATA: " Text " CDATA: "" PCDATA: " öäüß & < " PI: xtal Attr: cursor = '11' PCDATA: " " Empty Element: sub PCDATA: " " Start Tag: sub End Tag: sub PCDATA: " " End Tag: component END end end
# Overview: functional test for an ANTLR3 lexer grammar named XML (Ruby target).
# - XMLLexerTest passes the grammar to inline_grammar as a single-quoted heredoc.
# - The grammar's @members section includes ANTLR3::Test::CaptureOutput and
#   ANTLR3::Test::RaiseErrors and defines quote(text), which backslash-escapes
#   double quotes in text and wraps the result in double quotes.
# - Rule actions call say(...) with messages such as "Start Tag: ...",
#   "Attr: ...", "PCDATA: ...", "CDATA: ...", "Comment: ..." and "End Tag: ...".
# - The "should be valid" example builds XML::Lexer over a heredoc input,
#   drains it with lexer.map, and compares lexer.output with an expected
#   transcript of those messages.
#
# NOTE(review): this copy is collapsed onto two physical lines. Ruby treats
# everything after the leading `#` on the first line as a comment, and the
# heredocs (<<-'END') have lost the line breaks they depend on. The original
# line structure must be restored before this can run — TODO confirm against
# the upstream file.
#
# NOTE(review): several pieces look truncated, presumably because text between
# angle brackets was stripped at some point — verify against upstream:
#   * DOCTYPE, PI, XMLDECL, END_TAG, COMMENT and CDATA are all the empty
#     literal ''.
#   * END_TAG's action reads $name.text, but no `name=` label is visible in
#     that rule.
#   * The expected output contains "XML declaration", "ROOTELEMENT: ...",
#     "INTERNAL DTD: ..." and "PI: ..." lines that no visible action emits.
#   * The lexer input heredoc contains no tags, although the expected output
#     lists start tags, attributes and end tags.