/*
 * [The "BSD license"]
 *  Copyright (c) 2010 Terence Parr
 *  All rights reserved.
 *
 *  Redistribution and use in source and binary forms, with or without
 *  modification, are permitted provided that the following conditions
 *  are met:
 *  1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *  2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *      documentation and/or other materials provided with the distribution.
 *  3. The name of the author may not be used to endorse or promote products
 *      derived from this software without specific prior written permission.
 *
 *  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 *  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 *  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 *  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 *  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 *  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 *  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
package org.antlr.codegen;

import org.antlr.Tool;
import org.antlr.analysis.Label;
import org.antlr.runtime.Token;
import org.stringtemplate.v4.ST;
import org.antlr.tool.Grammar;

import java.io.IOException;
import java.util.List;

/** The code generator for ANTLR can usually be retargeted just by providing
 *  a new X.stg file for language X, however, sometimes the files that must
 *  be generated vary enough that some X-specific functionality is required.
 *  For example, in C, you must generate header files whereas in Java you do not.
 *  Other languages may want to keep DFA separate from the main
 *  generated recognizer file.
 *
 *  The notion of a Code Generator target abstracts out the creation
 *  of the various files.  As new language targets get added to the ANTLR
 *  system, this target class may have to be altered to handle more
 *  functionality.  Eventually, just about all language generation issues
 *  will be expressible in terms of these methods.
 *
 *  If org.antlr.codegen.XTarget class exists, it is used else
 *  Target base class is used.  I am using a superclass rather than an
 *  interface for this target concept because I can add functionality
 *  later without breaking previously written targets (extra interface
 *  methods would force adding dummy functions to all code generator
 *  target classes).
 */
public class Target {

    /** For pure strings of Java 16-bit unicode char, how can we display
     *  it in the target language as a literal.  Useful for dumping
     *  predicates and such that may refer to chars that need to be escaped
     *  when represented as strings.  Also, templates need to be escaped so
     *  that the target language can hold them as a string.
     *
     *  I have defined (via the constructor) the set of typical escapes,
     *  but your Target subclass is free to alter the translated chars or
     *  add more definitions.  This is nonstatic so each target can have
     *  a different set in memory at same time.
     *
     *  The table is indexed by char value; a null entry means "no escape
     *  defined for this char".
     */
    protected String[] targetCharValueEscape = new String[255];

    /** Install the default escape sequences (newline, carriage return, tab,
     *  backspace, form feed, backslash, single quote and double quote)
     *  into {@link #targetCharValueEscape}.
     */
    public Target() {
        targetCharValueEscape['\n'] = "\\n";
        targetCharValueEscape['\r'] = "\\r";
        targetCharValueEscape['\t'] = "\\t";
        targetCharValueEscape['\b'] = "\\b";
        targetCharValueEscape['\f'] = "\\f";
        targetCharValueEscape['\\'] = "\\\\";
        targetCharValueEscape['\''] = "\\'";
        targetCharValueEscape['"'] = "\\\"";
    }

    /** Write the recognizer template out to the file whose name the
     *  generator derives from the grammar's name and type.
     *
     *  @param tool unused by this default implementation
     *  @param generator supplies the file name and performs the write
     *  @param grammar the grammar whose name and type select the file name
     *  @param outputFileST the template to write
     *  @throws IOException declared for the write performed by the generator
     */
    protected void genRecognizerFile(Tool tool,
                                     CodeGenerator generator,
                                     Grammar grammar,
                                     ST outputFileST)
        throws IOException
    {
        String fileName =
            generator.getRecognizerFileName(grammar.name, grammar.type);
        generator.write(outputFileST, fileName);
    }

    /** Hook for targets that need a header file.  This default
     *  implementation generates nothing.
     *
     *  @param tool unused by this default implementation
     *  @param generator unused by this default implementation
     *  @param grammar unused by this default implementation
     *  @param headerFileST unused by this default implementation
     *  @param extName header file extension, e.g., ".h"
     *  @throws IOException never thrown here; declared so overriding
     *          targets may perform I/O
     */
    protected void genRecognizerHeaderFile(Tool tool,
                                           CodeGenerator generator,
                                           Grammar grammar,
                                           ST headerFileST,
                                           String extName) // e.g., ".h"
        throws IOException
    {
        // no header file by default
    }

    /** Run the grammar analysis steps: build the NFAs and then create the
     *  lookahead DFAs.
     *
     *  @param generator unused by this default implementation
     *  @param grammar the grammar to analyze
     */
    protected void performGrammarAnalysis(CodeGenerator generator,
                                          Grammar grammar)
    {
        // Build NFAs from the grammar AST
        grammar.buildNFA();

        // Create the DFA predictors for each decision
        grammar.createLookaheadDFAs();
    }

    /** Is scope in @scope::name {action} valid for this kind of grammar?
     *  Targets like C++ may want to allow new scopes like headerfile or
     *  some such.  The action names themselves are not policed at the
     *  moment so targets can add template actions w/o having to recompile
     *  ANTLR.
     *
     *  @param grammarType one of the Grammar type constants
     *  @param scope the scope name to check; a null scope is never valid
     *  @return true if the scope name is accepted for that grammar type
     */
    public boolean isValidActionScope(int grammarType, String scope) {
        // Constant-first equals so a null scope yields false, not an NPE.
        switch (grammarType) {
            case Grammar.LEXER :
                return "lexer".equals(scope);
            case Grammar.PARSER :
                return "parser".equals(scope);
            case Grammar.COMBINED :
                return "parser".equals(scope) || "lexer".equals(scope);
            case Grammar.TREE_PARSER :
                return "treeparser".equals(scope);
            default :
                return false;
        }
    }

    /** Target must be able to override the labels used for token types.
     *
     *  @param generator provides the grammar used to look up the display name
     *  @param ttype the token type
     *  @return the token's display name, or the token type as a decimal
     *          string when the display name starts with a single quote
     *          (i.e., is a literal)
     */
    public String getTokenTypeAsTargetLabel(CodeGenerator generator, int ttype) {
        String name = generator.grammar.getTokenDisplayName(ttype);
        // If name is a literal, return the token type instead
        if ( name.charAt(0)=='\'' ) {
            return String.valueOf(ttype);
        }
        return name;
    }

    /** Convert from an ANTLR char literal found in a grammar file to
     *  an equivalent char literal in the target language.  For most
     *  languages, this means leaving 'x' as 'x'.  Actually, we need
     *  to escape the unicode form of a newline so that it doesn't get
     *  converted to a real newline by the compiler.  Convert the literal
     *  to the char value and then to an appropriate target char literal.
     *
     *  Expect single quotes around the incoming literal.
     *
     *  @param generator unused by this default implementation
     *  @param literal the ANTLR char literal, including its quotes
     *  @return the single-quoted target char literal
     */
    public String getTargetCharLiteralFromANTLRCharLiteral(
        CodeGenerator generator,
        String literal)
    {
        int c = Grammar.getCharValueFromGrammarCharLiteral(literal);
        if ( c<Label.MIN_CHAR_VALUE ) {
            // NOTE: the compiler translates the unicode escape in this
            // literal, so the returned text is a quote, the NUL character
            // itself, and a quote (not a six-character escape sequence).
            return "'\u0000'";
        }

        StringBuilder buf = new StringBuilder();
        buf.append('\'');
        if ( c<targetCharValueEscape.length &&
             targetCharValueEscape[c]!=null )
        {
            // a predefined escape exists for this char
            buf.append(targetCharValueEscape[c]);
        }
        else if ( Character.UnicodeBlock.of((char)c)==
                  Character.UnicodeBlock.BASIC_LATIN &&
                  !Character.isISOControl((char)c) )
        {
            // normal char
            buf.append((char)c);
        }
        else {
            // must be something unprintable...use \\uXXXX
            // turn on the bit above max "\\uFFFF" value so that we pad with zeros
            // then only take last 4 digits
            String hex = Integer.toHexString(c|0x10000).toUpperCase().substring(1,5);
            buf.append("\\u");
            buf.append(hex);
        }

        buf.append('\'');
        return buf.toString();
    }

    /** Convert from an ANTLR string literal found in a grammar file to
     *  an equivalent string literal in the target language.  For Java, this
     *  is the translation 'a\n"' -> "a\n\"".  Expect single quotes
     *  around the incoming literal.  Just flip the quotes and replace
     *  double quotes with \"
     *
     *  Note that we have decided to allow people to use '\"' without
     *  penalty, so we must build the target string in a loop as Utils.replace
     *  cannot handle both \" and " without a lot of messing around.
     *
     *  The first and last characters of the input are assumed to be the
     *  quotes and are dropped without being inspected.
     *
     *  @param generator unused by this default implementation
     *  @param literal the ANTLR string literal, including its quotes
     *  @return the double-quoted target string literal
     */
    public String getTargetStringLiteralFromANTLRStringLiteral(
        CodeGenerator generator,
        String literal)
    {
        StringBuilder sb = new StringBuilder();

        // Index of the closing quote; the loop stops just before it
        //
        int last = literal.length() - 1;

        // Opening quote
        //
        sb.append('"');

        for (int i = 1; i < last; i++) {
            char ch = literal.charAt(i);
            if (ch == '\\') {
                // Anything escaped is what it is! We assume that
                // people know how to escape characters correctly. However
                // we catch anything that does not need an escape in Java (which
                // is what the default implementation is dealing with) and remove
                // the escape. The C target does this for instance.
                //
                // i < last guarantees that i+1 is a valid index. A backslash
                // directly before the closing quote therefore treats that
                // quote as the escaped character.
                //
                switch (literal.charAt(i + 1)) {
                    // Pass through any escapes that Java also needs
                    //
                    case '"':
                    case 'n':
                    case 'r':
                    case 't':
                    case 'b':
                    case 'f':
                    case '\\':
                    case 'u':    // Assume a four-hex-digit unicode escape follows
                        sb.append('\\');    // Pass the escape through
                        break;
                    default:
                        // Remove the escape by virtue of not adding it here
                        // Thus \' becomes ' and so on
                        //
                        break;
                }

                // Go past the \ character
                //
                i++;
            }
            else if (ch == '"') {
                // Characters that don't need \ in ANTLR 'strings' but do in Java:
                // we need to escape " in Java
                //
                sb.append('\\');
            }
            // Add in the next character, which may have been escaped
            //
            sb.append(literal.charAt(i));
        }

        // Append closing " and return
        //
        sb.append('"');

        return sb.toString();
    }

    /** Given a random string of Java unicode chars, return a new string with
     *  optionally appropriate quote characters for target language and possibly
     *  with some escaped characters.  For example, if the incoming string has
     *  actual newline characters, the output of this method would convert them
     *  to the two char sequence \n for Java, C, C++, ...  The new string has
     *  double-quotes around it as well.  Example String in memory:
     *
     *     a"[newlinechar]b'c[carriagereturnchar]d[tab]e\f
     *
     *  would be converted to the valid Java s:
     *
     *     "a\"\nb'c\rd\te\\f"
     *
     *  or
     *
     *     a\"\nb'c\rd\te\\f
     *
     *  depending on the quoted arg.
     *
     *  @param s the string to convert; may be null
     *  @param quoted whether to wrap the result in double quotes
     *  @return the converted string, or null if s is null
     */
    public String getTargetStringLiteralFromString(String s, boolean quoted) {
        if ( s==null ) {
            return null;
        }

        StringBuilder buf = new StringBuilder();
        if ( quoted ) {
            buf.append('"');
        }
        for (int i=0; i<s.length(); i++) {
            int c = s.charAt(i);
            if ( c!='\'' && // don't escape single quotes in strings for java
                 c<targetCharValueEscape.length &&
                 targetCharValueEscape[c]!=null )
            {
                buf.append(targetCharValueEscape[c]);
            }
            else {
                buf.append((char)c);
            }
        }
        if ( quoted ) {
            buf.append('"');
        }
        return buf.toString();
    }

    /** Same as {@link #getTargetStringLiteralFromString(String, boolean)}
     *  with quoted=false, i.e., no surrounding double quotes are added.
     *
     *  @param s the string to convert; may be null
     *  @return the converted, unquoted string, or null if s is null
     */
    public String getTargetStringLiteralFromString(String s) {
        return getTargetStringLiteralFromString(s, false);
    }

    /** Convert long to 0xNNNNNNNNNNNNNNNN by default for spitting out
     *  with bitsets.  I.e., convert bytes to hex string.
     *
     *  @param word the 64-bit value
     *  @return "0x" followed by 16 uppercase hex digits, zero-padded on
     *          the left
     */
    public String getTarget64BitStringFromValue(long word) {
        int numHexDigits = 8*2;
        StringBuilder buf = new StringBuilder(numHexDigits+2);
        buf.append("0x");
        String digits = Long.toHexString(word);
        digits = digits.toUpperCase();
        int padding = numHexDigits - digits.length();
        // pad left with zeros
        for (int i=1; i<=padding; i++) {
            buf.append('0');
        }
        buf.append(digits);
        return buf.toString();
    }

    /** Encode an int as a char escape sequence: a backslash followed by
     *  the octal digits for values up to 127, otherwise backslash-u
     *  followed by four hex digits as produced by Integer.toHexString.
     *
     *  NOTE(review): only four hex digits are kept, so this presumably
     *  expects v to fit in 16 bits -- confirm against callers.
     *
     *  @param v the value to encode
     *  @return the escape sequence text
     */
    public String encodeIntAsCharEscape(int v) {
        if ( v<=127 ) {
            return "\\"+Integer.toOctalString(v);
        }
        // set the bit above 0xFFFF so the hex string is zero-padded,
        // then drop that leading digit
        String hex = Integer.toHexString(v|0x10000).substring(1,5);
        return "\\u"+hex;
    }

    /** Some targets only support ASCII or 8-bit chars/strings.  For example,
     *  C++ will probably want to return 0xFF here.
     *
     *  @param generator unused by this default implementation
     *  @return Label.MAX_CHAR_VALUE by default
     */
    public int getMaxCharValue(CodeGenerator generator) {
        return Label.MAX_CHAR_VALUE;
    }

    /** Give target a chance to do some postprocessing on actions.
     *  Python for example will have to fix the indentation.
     *
     *  @param chunks the action chunks
     *  @param actionToken unused by this default implementation
     *  @return the same chunks list, unchanged
     */
    public List postProcessAction(List chunks, Token actionToken) {
        return chunks;
    }

}