# Copyright (C) 2010 The Android Open Source Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Tiny XML parser implementation in awk.
#
# This file is not meant to be used directly, instead copy the
# functions it defines here into your own script then specialize
# it appropriately.
#

# See further below for usage instructions and implementation details.
#

# ---------------------------- cut here ---------------------------

# xml_event: read the next XML 'event' (opening or closing tag) from the
# main input.
#
# Returns 1 when an event was found, in which case XML_TYPE, XML_TAG,
# XML_ATTR and XML_RPATH are set, or 0 when no further tag can be read
# (end of input, or a record without any '<').
# Malformed input terminates the script through _xml_panic().
#
# Private state: _xml_closing remembers a pending "END" event for a
# directly closed tag (<foo/>); _xml_p, _xml_attrib, _xml_value,
# _xml_quote and _xml_saved are scratch globals.
#
function xml_event () {
    RS=">";
    XML_TAG="";
    XML_TYPE="";
    split("", XML_ATTR);                  # portable way to clear an array
    while ( 1 ) {
        if (_xml_closing) { # delayed direct tag closure
            XML_TAG = _xml_closing;
            XML_TYPE = "END";
            _xml_closing = "";
            _xml_exit(XML_TAG);
            return 1;
        }
        if ((getline) <= 0) return 0; # read new input record
        _xml_p = index($0, "<");      # get start marker
        if (_xml_p == 0) return 0;    # end of file (or malformed input)
        $0 = substr($0, _xml_p)       # remove anything before '<'
        # ignore CData / Comments / Processing instructions / Declarations
        if (_xml_in_section("<!\\[[Cc][Dd][Aa][Tt][Aa]\\[", "]]") ||
            _xml_in_section("<!--", "--") ||
            _xml_in_section("<\\?", "\\?") ||
            _xml_in_section("<!", "")) {
            continue;
        }
        if (substr($0, 1, 2) == "</") { # is it a closing tag ?
            XML_TYPE = "END";
            $0 = substr($0, 3);
        } else { # nope, it's an opening one
            XML_TYPE = "BEGIN";
            $0 = substr($0, 2);
        }
        XML_TAG = $0
        sub("[ \n\t\r/].*$", "", XML_TAG);  # extract tag name
        XML_TAG = toupper(XML_TAG);         # uppercase it
        if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ )  # validate it
            _xml_panic("Invalid tag name: " XML_TAG);
        if (XML_TYPE == "BEGIN") {  # update reverse path
            _xml_enter(XML_TAG);
        } else {
            _xml_exit(XML_TAG);
        }
        # Get rid of tag and spaces. The tag name stops at the first space
        # OR '/', so that the '/' of "<foo/>" is left in the record and is
        # recognized as a direct closure below.
        sub("^[^ \n\t\r/]*[ \n\t\r]*", "", $0);
        # Test the length rather than the truth value of $0: a remainder
        # that looks like the number 0 must not silently end the loop.
        while (length($0) > 0) { # process attributes
            if ($0 == "/") { # deal with direct closing tag, e.g. <foo/>
                if (XML_TYPE != "BEGIN")
                    _xml_panic("Unexpected '/' in closing tag: " XML_TAG);
                _xml_closing = XML_TAG; # record delayed tag closure.
                break
            }
            _xml_attrib = $0;
            sub(/=.*$/,"",_xml_attrib);         # extract attribute name
            sub(/^[^=]*/,"",$0);                # remove it from record
            sub(/[ \t\r\n]+$/,"",_xml_attrib);  # XML allows spaces before '='
            sub(/^=[ \t\r\n]+/,"=",$0);         # ... and after it
            _xml_attrib = tolower(_xml_attrib);
            if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ ) # validate it
                _xml_panic("Invalid attribute name: " _xml_attrib);
            # value must be ="something" or ='something'
            _xml_quote = substr($0,2,1);
            if (substr($0,1,1) != "=" ||
                (_xml_quote != "\"" && _xml_quote != "'")) {
                _xml_panic("Invalid attribute value syntax for " _xml_attrib ": " $0);
            }
            # A '>' is legal inside a quoted value, but it is also our record
            # separator: when the closing quote is missing, the value goes on
            # in the next record(s), so glue them back with the '>' that RS
            # removed.
            while (index(substr($0,3), _xml_quote) == 0) {
                _xml_saved = $0;
                if ((getline) <= 0)
                    _xml_panic("Unterminated value for attribute " _xml_attrib);
                $0 = _xml_saved ">" $0;
            }
            _xml_value = substr($0,3);                   # text after the opening quote
            _xml_p = index(_xml_value, _xml_quote);      # closing quote position
            $0 = substr(_xml_value, _xml_p+1);           # remove value from record
            _xml_value = substr(_xml_value, 1, _xml_p-1);
            XML_ATTR[_xml_attrib] = _xml_value; # store attribute name/value
            sub(/^[ \t\r\n]*/,"",$0); # get rid of remaining leading spaces
        }
        return 1; # now return, XML_TYPE/TAG/ATTR/RPATH are set
    }
}

# _xml_panic: print 'msg' on /dev/stderr and exit the script with status 1.
function _xml_panic (msg) {
    print msg > "/dev/stderr"
    exit(1)
}

# _xml_in_section: skip an ignorable section (CDATA, comment, ...).
#
#   sec_begin: regular expression that must match the start of $0.
#   sec_end:   regular expression that must match the end of a record
#              (the trailing '>' is not part of the record).
#
# Returns 0 without reading anything when $0 does not start with sec_begin.
# Otherwise reads further records until one ends with sec_end, then
# returns 1. Panics if the input ends before the section is terminated.
function _xml_in_section (sec_begin, sec_end) {
    if (!match( $0, "^" sec_begin )) return 0;
    while (!match($0, sec_end "$")) {
        # ERRNO is only set by gawk; it is empty with other awk versions.
        if ((getline) <= 0) _xml_panic("Unexpected EOF: " ERRNO);
    }
    return 1;
}

# _xml_enter: push 'tag' in front of the reversed path XML_RPATH.
function _xml_enter (tag) {
    XML_RPATH = tag "/" XML_RPATH;
}

# _xml_exit: pop 'tag' from the front of XML_RPATH.
#
# Panics when 'tag' is not the innermost open element, i.e. when the
# tags of the input are not properly nested. 'p' and 'expected' are local
# variables (awk idiom: extra parameters), callers must not pass them.
function _xml_exit (tag,    p, expected) {
    p = index(XML_RPATH, "/");
    expected = substr(XML_RPATH, 1, p-1);
    if (expected != tag)
        _xml_panic("Unexpected close tag: " tag ", expecting " expected);
    XML_RPATH = substr(XML_RPATH, p+1);
}

# ---------------------------- cut here ---------------------------

# USAGE:
#
# The functions provided here are used to extract the tags and attributes of a
# given XML file. They do not support extraction of data, CDATA, comments,
# processing instructions and declarations at all.
#
# You should use this from the BEGIN {} action of your awk script (it will
# not work from an END {} action).
#
# Call xml_event() in a while loop. This function returns 1 for each XML
# 'event' encountered, or 0 when the end of input is reached. Note that in
# case of malformed input, an error will be printed and the script will
# force an exit(1)
#
# After each successful xml_event() call, the following variables will be set:
#
#    XML_TYPE:  type of event: "BEGIN" -> means an opening tag, "END" a
#               closing one.
#
#    XML_TAG:   name of the tag, always in UPPERCASE!
#
#    XML_ATTR:  a map of attributes for the type. Only set for "BEGIN" types.
#               all attribute names are in lowercase.
#
#               beware: values are *not* unescaped !
#
#    XML_RPATH: the _reversed_ element path, using "/" as a separator.
#               if you are within the <manifest><application> tag, then
#               it will be set to "APPLICATION/MANIFEST/"
#               (note the trailing slash).
#

# This is a simple example that dumps the output of the parsing.
#
BEGIN {
    while ( xml_event() ) {
        printf "XML_TYPE=%s XML_TAG=%s XML_RPATH=%s", XML_TYPE, XML_TAG, XML_RPATH;
        if (XML_TYPE == "BEGIN") {
            for (attr in XML_ATTR) {
                printf " %s='%s'", attr, XML_ATTR[attr];
            }
        }
        printf "\n";
    }
}

# IMPLEMENTATION DETAILS:
#
# 1. '>' as the record separator:
#
# RS is set to '>' to use this character as the record separator, instead of
# the default '\n'. This means that something like the following:
#
#   <foo><bar attrib="value">stuff</bar></foo>
#
# will be translated into the following successive 'records':
#
#  <foo
#  <bar attrib="value"
#  stuff</bar
#  </foo
#
# Note that the '>' is never part of the records and thus will not be matched.
# If the record does not contain a single '<', the input is either
# malformed XML, or we reached the end of file with data after the last
# '>'.
#
# Newlines in the original input are kept in the records as-is.
#
# 2. Getting rid of unwanted stuff:
#
# We don't need any of the data within elements, so we get rid of them by
# simply ignoring anything before the '<' in the current record. This is
# done with code like this:
#
#     p = index($0, "<");       # get index of '<'
#     if (p == 0) -> return 0;  # malformed input or end of file
#     $0 = substr($0, p);       # remove anything before the '<' in record
#
# We also want to ignore certain sections like CDATA, comments, declarations,
# etc. These begin with a certain pattern and end with another one, e.g.
# "<!--" and "-->" for comments. This is handled by the _xml_in_section()
# function that accepts two patterns as input:
#
#    sec_begin: is the pattern for the start of the record.
#    sec_end:   is the pattern for the end of the record (minus trailing '>').
#
# The function deals with the fact that these sections can embed a valid '>'
# and will then span multiple records, i.e. something like:
#
#  <!-- A comment with an embedded > right here ! -->
#
# will be decomposed into two records:
#
#   "<!-- A comment with an embedded "
#   " right here ! --"
#
# The function deals with this case, and exits when such a section is not
# properly terminated in the input.
#
# _xml_in_section() returns 1 if an ignorable section was found, or 0 otherwise.
#
# 3. Extracting the tag name:
#
# </foo> is a closing tag, and <foo> an opening tag, this is handled
# by the following code:
#
#       if (substr($0, 1, 2) == "</") {
#           XML_TYPE = "END";
#           $0 = substr($0, 3);
#       } else {
#           XML_TYPE = "BEGIN";
#           $0 = substr($0, 2);
#       }
#
# which defines XML_TYPE, and removes the leading "</" or "<" from the record.
# The tag is later extracted and converted to uppercase with:
#
#       XML_TAG = $0                          # copy record
#       sub("[ \n\t\r/].*$", "", XML_TAG);    # remove anything after tag name
#       XML_TAG = toupper(XML_TAG);           # convert to uppercase
#       # validate tag
#       if ( XML_TAG !~ /^[A-Z][-+_.:0-9A-Z]*$/ ) -> panic
#
# Then the record is purged from the tag name and the spaces after it. The
# name stops at the first space or '/', so that the '/' of a directly closed
# tag like <foo/> stays in the record (see section 6):
#
#       # get rid of tag and spaces after it in $0
#       sub("^[^ \n\t\r/]*[ \n\t\r]*", "", $0);
#
# 4. Maintaining XML_RPATH:
#
# The _xml_enter() and _xml_exit() functions are called to maintain the
# XML_RPATH variable when entering and exiting specific tags. _xml_exit()
# will also validate the input, checking proper tag enclosure (or exit(1)
# in case of error).
#
#       if (XML_TYPE == "BEGIN") {
#           _xml_enter(XML_TAG);
#       } else {
#           _xml_exit(XML_TAG);
#       }
#
# 5. Extracting attributes:
#
# A loop is implemented to parse attributes, the idea is to get the attribute
# name, which is always followed by a '=' character (optionally surrounded
# by spaces):
#
#           _xml_attrib = $0;                   # copy record.
#           sub(/=.*$/,"",_xml_attrib);         # get rid of '=' and anything after.
#           sub(/^[^=]*/,"",$0);                # remove attribute name from $0
#           sub(/[ \t\r\n]+$/,"",_xml_attrib);  # remove spaces before the '='
#           sub(/^=[ \t\r\n]+/,"=",$0);         # remove spaces after the '='
#           _xml_attrib = tolower(_xml_attrib);
#           if ( _xml_attrib !~ /^[a-z][-+_0-9a-z:]*$/ )
#               _xml_panic("Invalid attribute name: " _xml_attrib);
#
# Now get the value, which is enclosed by either (") or (')
#
#           _xml_quote = substr($0,2,1);        # the quote character
#           if (substr($0,1,1) != "=" ||
#               (_xml_quote != "\"" && _xml_quote != "'")) {
#               -> panic (malformed input)
#           }
#
# Because '>' is the record separator, a value that contains a '>' is cut
# in the middle. When the closing quote is not in the current record, the
# following records are appended (with the missing '>') until it is found:
#
#           while (index(substr($0,3), _xml_quote) == 0) {
#               _xml_saved = $0;
#               if ((getline) <= 0) -> panic (unterminated value)
#               $0 = _xml_saved ">" $0;
#           }
#
# The value is then everything between the two quotes:
#
#           _xml_value = substr($0,3);                    # skip ="
#           _xml_p = index(_xml_value, _xml_quote);       # closing quote
#           $0 = substr(_xml_value, _xml_p+1);            # remove it from $0
#           _xml_value = substr(_xml_value, 1, _xml_p-1); # extract value
#
# After that, we simply store the value into the XML_ATTR associative
# array, and cleanup $0 from leading spaces:
#
#           XML_ATTR[_xml_attrib] = _xml_value;
#           sub(/^[ \t\r\n]*/,"",$0);
#
#
# 6. Handling direct tag closure:
#
# When a tag is closed directly (as in <foo/>), a single '/' will be
# parsed in the attribute parsing loop. We need to record this for the
# next call to xml_event(), since the current one should return a "BEGIN"
# for the "FOO" tag instead.
#
# We do this by setting the special _xml_closing variable, as in:
#
#           if ($0 == "/") {
#               # a '/' is only valid at the end of an opening tag
#               if (XML_TYPE != "BEGIN") -> panic
#               # record a delayed tag closure for the next call
#               _xml_closing = XML_TAG;
#               break
#           }
#
# This variable is checked at the start of xml_event() like this:
#
#       # delayed tag closure
#       if (_xml_closing) {
#           XML_TAG = _xml_closing;
#           XML_TYPE = "END";
#           _xml_closing = "";
#           _xml_exit(XML_TAG);
#           return 1;
#       }
#
# Note the call to _xml_exit() to update XML_RPATH here.
#