1 /** \file 2 * Defines the basic structures used to manipulate character 3 * streams from any input source. Any character size and encoding 4 * can in theory be used, so long as a set of functinos is provided that 5 * can return a 32 bit Integer representation of their characters amd efficiently mark and revert 6 * to specific offsets into their input streams. 7 */ 8 #ifndef _ANTLR_INPUT_HPP 9 #define _ANTLR_INPUT_HPP 10 11 // [The "BSD licence"] 12 // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB 13 14 // 15 // All rights reserved. 16 // 17 // Redistribution and use in source and binary forms, with or without 18 // modification, are permitted provided that the following conditions 19 // are met: 20 // 1. Redistributions of source code must retain the above copyright 21 // notice, this list of conditions and the following disclaimer. 22 // 2. Redistributions in binary form must reproduce the above copyright 23 // notice, this list of conditions and the following disclaimer in the 24 // documentation and/or other materials provided with the distribution. 25 // 3. The name of the author may not be used to endorse or promote products 26 // derived from this software without specific prior written permission. 27 // 28 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 29 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 31 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 32 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 33 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 37 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38 39 #include "antlr3defs.hpp" 40 41 ANTLR_BEGIN_NAMESPACE() 42 43 /// Master context structure for an ANTLR3 C runtime based input stream. 44 /// \ingroup apistructures. Calling _LT on this doesn't seem right. You would 45 /// call it only with parser / TreeParser, and their respective input streams 46 /// has that function. calling it from lexer will throw a compile time error 47 /// 48 49 template<class ImplTraits> 50 class InputStream : public ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType > 51 { 52 public: 53 typedef typename ImplTraits::AllocPolicyType AllocPolicyType; 54 typedef typename ImplTraits::LexStateType LexStateType; 55 typedef typename ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType > IntStreamType; 56 typedef IntStreamType BaseType; 57 typedef typename ImplTraits::StreamDataType UnitType; 58 typedef UnitType DataType; 59 typedef UnitType TokenType; 60 typedef typename AllocPolicyType::template VectorType<LexStateType> MarkersType; 61 typedef typename ImplTraits::StringType StringType; 62 63 private: 64 /** Pointer the start of the input string, characters may be 65 * taken as offsets from here and in original input format encoding. 66 */ 67 const DataType* m_data; 68 69 /** Pointer to the next character to be consumed from the input data 70 * This is cast to point at the encoding of the original file that 71 * was read by the functions installed as pointer in this input stream 72 * context instance at file/string/whatever load time. 73 */ 74 const DataType* m_nextChar; 75 76 /** Number of characters that can be consumed at this point in time. 77 * Mostly this is just what is left in the pre-read buffer, but if the 78 * input source is a stream such as a socket or something then we may 79 * call special read code to wait for more input. 80 */ 81 ANTLR_UINT32 m_sizeBuf; 82 83 /** The line number we are traversing in the input file. This gets incremented 84 * by a newline() call in the lexer grammar actions. 85 */ 86 ANTLR_UINT32 m_line; 87 88 /** Pointer into the input buffer where the current line 89 * started. 90 */ 91 const DataType* m_currentLine; 92 93 /** The offset within the current line of the current character 94 */ 95 ANTLR_INT32 m_charPositionInLine; 96 97 /** Tracks how deep mark() calls are nested 98 */ 99 ANTLR_UINT32 m_markDepth; 100 101 /** List of mark() points in the input stream 102 */ 103 MarkersType m_markers; 104 105 /** File name string, set to pointer to memory if 106 * you set it manually as it will be free()d 107 */ 108 StringType m_fileName; 109 110 /** File number, needs to be set manually to some file index of your devising. 111 */ 112 ANTLR_UINT32 m_fileNo; 113 114 /// Character that automatically causes an internal line count 115 /// increment. 116 /// 117 ANTLR_UCHAR m_newlineChar; 118 119 /// Indicates the size, in 8 bit units, of a single character. Note that 120 /// the C runtime does not deal with surrogates as this would be 121 /// slow and complicated. If this is a UTF-8 stream then this field 122 /// will be set to 0. Generally you are best working internally with 32 bit characters 123 /// as this is the most efficient. 124 /// 125 ANTLR_UINT8 m_charByteSize; 126 127 /** Indicates if the data pointer was allocated by us, and so should be freed 128 * when the stream dies. 129 */ 130 bool m_isAllocated; 131 132 /// Indicates the encoding scheme used in this input stream 133 /// 134 ANTLR_UINT32 m_encoding; 135 136 /* API */ 137 public: 138 InputStream(const ANTLR_UINT8* fileName, ANTLR_UINT32 encoding); 139 InputStream(const ANTLR_UINT8* data, ANTLR_UINT32 encoding, ANTLR_UINT32 size, ANTLR_UINT8* name); 140 ~InputStream(); 141 const DataType* get_data() const; 142 bool get_isAllocated() const; 143 const DataType* get_nextChar() const; 144 ANTLR_UINT32 get_sizeBuf() const; 145 ANTLR_UINT32 get_line() const; 146 const DataType* get_currentLine() const; 147 ANTLR_INT32 get_charPositionInLine() const; 148 ANTLR_UINT32 get_markDepth() const; 149 MarkersType& get_markers(); 150 const StringType& get_fileName() const; 151 ANTLR_UINT32 get_fileNo() const; 152 ANTLR_UCHAR get_newlineChar() const; 153 ANTLR_UINT8 get_charByteSize() const; 154 ANTLR_UINT32 get_encoding() const; 155 156 void set_data( DataType* data ); 157 void set_isAllocated( bool isAllocated ); 158 void set_nextChar( const DataType* nextChar ); 159 void set_sizeBuf( ANTLR_UINT32 sizeBuf ); 160 void set_line( ANTLR_UINT32 line ); 161 void set_currentLine( const DataType* currentLine ); 162 void set_charPositionInLine( ANTLR_INT32 charPositionInLine ); 163 void set_markDepth( ANTLR_UINT32 markDepth ); 164 void set_markers( const MarkersType& markers ); 165 void set_fileName( const StringType& fileName ); 166 void set_fileNo( ANTLR_UINT32 fileNo ); 167 void set_newlineChar( ANTLR_UCHAR newlineChar ); 168 void set_charByteSize( ANTLR_UINT8 charByteSize ); 169 void set_encoding( ANTLR_UINT32 encoding ); 170 171 void inc_charPositionInLine(); 172 void inc_line(); 173 void inc_markDepth(); 174 175 IntStreamType* get_istream(); 176 177 /** Function that resets the input stream 178 */ 179 void reset(); 180 181 /** Pointer to a function that reuses and resets an input stream by 182 * supplying a new 'source' 183 */ 184 void reuse(ANTLR_UINT8* inString, ANTLR_UINT32 size, ANTLR_UINT8* name); 185 186 187 /** Function to return the total size of the input buffer. For streams 188 * this may be just the total we have available so far. This means of course that 189 * the input stream must be careful to accumulate enough input so that any backtracking 190 * can be satisfied. 191 */ 192 ANTLR_UINT32 size(); 193 194 /** Function to return a substring of the input stream. String is returned in allocated 195 * memory and is in same encoding as the input stream itself, NOT internal ANTLR_UCHAR form. 196 */ 197 StringType substr(ANTLR_MARKER start, ANTLR_MARKER stop); 198 199 /** Function to return the current line number in the input stream 200 */ 201 ANTLR_UINT32 get_line(); 202 203 /** Function to return the current line buffer in the input stream 204 * The pointer returned is directly into the input stream so you must copy 205 * it if you wish to manipulate it without damaging the input stream. Encoding 206 * is obviously in the same form as the input stream. 207 * \remark 208 * - Note taht this function wil lbe inaccurate if setLine is called as there 209 * is no way at the moment to position the input stream at a particular line 210 * number offset. 211 */ 212 const DataType* getLineBuf(); 213 214 /** Function to return the current offset in the current input stream line 215 */ 216 ANTLR_UINT32 get_charPositionInLine(); 217 218 /** Function to set the current position in the current line. 219 */ 220 void set_charPositionInLine(ANTLR_UINT32 position); 221 222 /** Function to override the default newline character that the input stream 223 * looks for to trigger the line/offset and line buffer recording information. 224 * \remark 225 * - By default the chracter '\n' will be installed as the newline trigger character. When this 226 * character is seen by the consume() function then the current line number is incremented and the 227 * current line offset is reset to 0. The Pointer for the line of input we are consuming 228 * is updated to point to the next character after this one in the input stream (which means it 229 * may become invalid if the last newline character in the file is seen (so watch out). 230 * - If for some reason you do not want the counters and pointers to be restee, you can set the 231 * chracter to some impossible character such as '\0' or whatever. 232 * - This is a single character only, so choose the last character in a sequence of two or more. 233 * - This is only a simple aid to error reporting - if you have a complicated binary input structure 234 * it may not be adequate, but you can always override every function in the input stream with your 235 * own of course, and can even write your own complete input stream set if you like. 236 * - It is your responsiblity to set a valid character for the input stream type. There is no point 237 * setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never 238 * trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF 239 */ 240 void set_newLineChar(ANTLR_UINT32 newlineChar); 241 242 ANTLR_MARKER index_impl(); 243 244 private: 245 /** \brief Use the contents of an operating system file as the input 246 * for an input stream. 247 * 248 * \param fileName Name of operating system file to read. 249 * \return 250 * - Pointer to new input stream context upon success 251 * - One of the ANTLR3_ERR_ defines on error. 252 */ 253 void createFileStream(const ANTLR_UINT8* fileName); 254 255 /** \brief Use the supplied 'string' as input to the stream 256 * 257 * \param data Pointer to the input data 258 * \return 259 * - Pointer to new input stream context upon success 260 * - NULL defines on error. 261 */ 262 void createStringStream(const ANTLR_UINT8* data); 263 void genericSetupStream(); 264 265 /// Determine endianess of the input stream and install the 266 /// API required for the encoding in that format. 267 /// 268 void setupInputStream(); 269 270 }; 271 272 /** \brief Structure for track lex input states as part of mark() 273 * and rewind() of lexer. 274 */ 275 template<class ImplTraits> 276 class LexState : public ImplTraits::AllocPolicyType 277 { 278 public: 279 typedef typename ImplTraits::StreamDataType DataType; 280 281 private: 282 /** Pointer to the next character to be consumed from the input data 283 * This is cast to point at the encoding of the original file that 284 * was read by the functions installed as pointer in this input stream 285 * context instance at file/string/whatever load time. 286 */ 287 const DataType* m_nextChar; 288 289 /** The line number we are traversing in the input file. This gets incremented 290 * by a newline() call in the lexer grammer actions. 291 */ 292 ANTLR_UINT32 m_line; 293 294 /** Pointer into the input buffer where the current line 295 * started. 296 */ 297 const DataType* m_currentLine; 298 299 /** The offset within the current line of the current character 300 */ 301 ANTLR_INT32 m_charPositionInLine; 302 303 public: 304 LexState(); 305 const DataType* get_nextChar() const; 306 ANTLR_UINT32 get_line() const; 307 const DataType* get_currentLine() const; 308 ANTLR_INT32 get_charPositionInLine() const; 309 void set_nextChar( const DataType* nextChar ); 310 void set_line( ANTLR_UINT32 line ); 311 void set_currentLine( const DataType* currentLine ); 312 void set_charPositionInLine( ANTLR_INT32 charPositionInLine ); 313 }; 314 315 class ParseNullStringException : public std::exception 316 { what() const317 virtual const char* what() const throw() 318 { 319 return "Null String"; 320 } 321 }; 322 323 ANTLR_END_NAMESPACE() 324 325 #include "antlr3input.inl" 326 327 #endif /* _ANTLR_INPUT_H */ 328