• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /// \file
2 /// Base functions to initialize and manipulate any input stream
3 ///
4 
5 // [The "BSD licence"]
6 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
7 // http://www.temporal-wave.com
8 // http://www.linkedin.com/in/jimidle
9 //
10 // All rights reserved.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions
14 // are met:
15 // 1. Redistributions of source code must retain the above copyright
16 //    notice, this list of conditions and the following disclaimer.
17 // 2. Redistributions in binary form must reproduce the above copyright
18 //    notice, this list of conditions and the following disclaimer in the
19 //    documentation and/or other materials provided with the distribution.
20 // 3. The name of the author may not be used to endorse or promote products
21 //    derived from this software without specific prior written permission.
22 //
23 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
24 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
25 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
26 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
27 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
28 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
32 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 
34 #include    <antlr3input.h>
35 
36 // -----------------------------------
37 // Generic 8 bit input such as latin-1
38 //
39 
40 // 8Bit INT Stream API
41 //
42 static	    void	    antlr38BitConsume		(pANTLR3_INT_STREAM is);
43 static	    ANTLR3_UCHAR    antlr38BitLA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
44 static	    ANTLR3_UCHAR    antlr38BitLA_ucase		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
45 static	    ANTLR3_MARKER   antlr38BitIndex		(pANTLR3_INT_STREAM is);
46 static	    ANTLR3_MARKER   antlr38BitMark		(pANTLR3_INT_STREAM is);
47 static	    void	    antlr38BitRewind		(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
48 static	    void	    antlr38BitRewindLast	(pANTLR3_INT_STREAM is);
49 static	    void	    antlr38BitRelease		(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
50 static	    void	    antlr38BitSeek		(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
51 static	    pANTLR3_STRING  antlr38BitGetSourceName	(pANTLR3_INT_STREAM is);
52 
53 // 8Bit Charstream API functions
54 //
55 static	    void	    antlr3InputClose		(pANTLR3_INPUT_STREAM input);
56 static	    void	    antlr3InputReset		(pANTLR3_INPUT_STREAM input);
57 static      void            antlr38BitReuse            (pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name);
58 static	    void *	    antlr38BitLT		(pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt);
59 static	    ANTLR3_UINT32   antlr38BitSize		(pANTLR3_INPUT_STREAM input);
60 static	    pANTLR3_STRING  antlr38BitSubstr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
61 static	    ANTLR3_UINT32   antlr38BitGetLine		(pANTLR3_INPUT_STREAM input);
62 static	    void	  * antlr38BitGetLineBuf	(pANTLR3_INPUT_STREAM input);
63 static	    ANTLR3_UINT32   antlr38BitGetCharPosition	(pANTLR3_INPUT_STREAM input);
64 static	    void	    antlr38BitSetLine		(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line);
65 static	    void	    antlr38BitSetCharPosition	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position);
66 static	    void	    antlr38BitSetNewLineChar	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar);
67 static	    void	    antlr38BitSetUcaseLA	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag);
68 
69 // -----------------------------------
70 // UTF16 (also covers UCS2)
71 //
72 // INT Stream API
73 //
74 static	    void	    antlr3UTF16Consume	        (pANTLR3_INT_STREAM is);
75 static	    ANTLR3_UCHAR    antlr3UTF16LA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
76 static	    void	    antlr3UTF16ConsumeLE        (pANTLR3_INT_STREAM is);
77 static	    ANTLR3_UCHAR    antlr3UTF16LALE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
78 static	    void	    antlr3UTF16ConsumeBE        (pANTLR3_INT_STREAM is);
79 static	    ANTLR3_UCHAR    antlr3UTF16LABE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
80 static	    ANTLR3_MARKER   antlr3UTF16Index		(pANTLR3_INT_STREAM is);
81 static	    void	    antlr3UTF16Seek		(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
82 
83 // UTF16 Charstream API functions
84 //
85 static	    pANTLR3_STRING	antlr3UTF16Substr	(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
86 
87 // -----------------------------------
// UTF32 (also covers UCS4)
89 //
90 // INT Stream API
91 //
92 static	    void	    antlr3UTF32Consume	        (pANTLR3_INT_STREAM is);
93 static	    ANTLR3_UCHAR    antlr3UTF32LA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
94 static	    ANTLR3_UCHAR    antlr3UTF32LALE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
95 static	    ANTLR3_UCHAR    antlr3UTF32LABE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
96 static	    ANTLR3_MARKER   antlr3UTF32Index		(pANTLR3_INT_STREAM is);
97 static	    void	    antlr3UTF32Seek		(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
98 
// UTF32 Charstream API functions
100 //
101 static	    pANTLR3_STRING  antlr3UTF32Substr	        (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
102 
103 // ------------------------------------
104 // UTF-8
105 //
106 static	    void	    antlr3UTF8Consume	        (pANTLR3_INT_STREAM is);
107 static	    ANTLR3_UCHAR    antlr3UTF8LA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
108 
109 // ------------------------------------
110 // EBCDIC
111 //
112 static	    ANTLR3_UCHAR    antlr3EBCDICLA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
113 
114 /// \brief Common function to setup function interface for an 8 bit input stream.
115 ///
116 /// \param input Input stream context pointer
117 ///
118 /// \remark
119 ///   - Many of the 8 bit oriented file stream handling functions will be usable
120 ///     by any or at least some, other input streams. Therefore it is perfectly acceptable
121 ///     to call this function to install the 8Bit handler then override just those functions
122 ///     that would not work for the particular input encoding, such as consume for instance.
123 ///
124 void
antlr38BitSetupStream(pANTLR3_INPUT_STREAM input)125 antlr38BitSetupStream	(pANTLR3_INPUT_STREAM input)
126 {
127     // Build a string factory for this stream
128     //
129     input->strFactory	= antlr3StringFactoryNew(input->encoding);
130 
131     // Default stream API set up is for 8Bit, so we are done
132     //
133 }
134 
135 void
antlr3GenericSetupStream(pANTLR3_INPUT_STREAM input)136 antlr3GenericSetupStream  (pANTLR3_INPUT_STREAM input)
137 {
138     /* Install function pointers for an 8 bit input
139      */
140 
141     /* Allocate stream interface
142      */
143     input->istream		= antlr3IntStreamNew();
144     input->istream->type        = ANTLR3_CHARSTREAM;
145     input->istream->super       = input;
146 
147     /* Intstream API
148      */
149     input->istream->consume	    = antlr38BitConsume;	    // Consume the next 8 bit character in the buffer
150     input->istream->_LA		    = antlr38BitLA;	            // Return the UTF32 character at offset n (1 based)
151     input->istream->index	    = antlr38BitIndex;	            // Current index (offset from first character
152     input->istream->mark	    = antlr38BitMark;		    // Record the current lex state for later restore
153     input->istream->rewind	    = antlr38BitRewind;	            // How to rewind the input
154     input->istream->rewindLast	    = antlr38BitRewindLast;	    // How to rewind the input
155     input->istream->seek	    = antlr38BitSeek;		    // How to seek to a specific point in the stream
156     input->istream->release	    = antlr38BitRelease;	    // Reset marks after mark n
157     input->istream->getSourceName   = antlr38BitGetSourceName;      // Return a string that names the input source
158 
159     /* Charstream API
160      */
161     input->close		    =  antlr3InputClose;	    // Close down the stream completely
162     input->free			    =  antlr3InputClose;	    // Synonym for free
163     input->reset		    =  antlr3InputReset;	    // Reset input to start
164     input->reuse                    =  antlr38BitReuse;             // Install a new input string and reset
165     input->_LT			    =  antlr38BitLT;		    // Same as _LA for 8 bit file
166     input->size			    =  antlr38BitSize;		    // Return the size of the input buffer
167     input->substr		    =  antlr38BitSubstr;	    // Return a string from the input stream
168     input->getLine		    =  antlr38BitGetLine;	    // Return the current line number in the input stream
169     input->getLineBuf		    =  antlr38BitGetLineBuf;	    // Return a pointer to the start of the current line being consumed
170     input->getCharPositionInLine    =  antlr38BitGetCharPosition;   // Return the offset into the current line of input
171     input->setLine		    =  antlr38BitSetLine;	    // Set the input stream line number (does not set buffer pointers)
172     input->setCharPositionInLine    =  antlr38BitSetCharPosition;   // Set the offset in to the current line (does not set any pointers)
173     input->SetNewLineChar	    =  antlr38BitSetNewLineChar;    // Set the value of the newline trigger character
174     input->setUcaseLA		    =  antlr38BitSetUcaseLA;        // Changes the LA function to return upper case always
175 
176     input->charByteSize		    = 1;		// Size in bytes of characters in this stream.
177 
178     /* Initialize entries for tables etc
179      */
180     input->markers  = NULL;
181 
182     /* Set up the input stream brand new
183      */
184     input->reset(input);
185 
186     /* Install default line separator character (it can be replaced
187      * by the grammar programmer later)
188      */
189     input->SetNewLineChar(input, (ANTLR3_UCHAR)'\n');
190 }
191 
192 static pANTLR3_STRING
antlr38BitGetSourceName(pANTLR3_INT_STREAM is)193 antlr38BitGetSourceName(pANTLR3_INT_STREAM is)
194 {
195 	return	is->streamName;
196 }
197 
198 /** \brief Close down an input stream and free any memory allocated by it.
199  *
200  * \param input Input stream context pointer
201  */
202 static void
antlr3InputClose(pANTLR3_INPUT_STREAM input)203 antlr3InputClose(pANTLR3_INPUT_STREAM input)
204 {
205     // Close any markers in the input stream
206     //
207     if	(input->markers != NULL)
208     {
209 		input->markers->free(input->markers);
210 		input->markers = NULL;
211     }
212 
213     // Close the string factory
214     //
215     if	(input->strFactory != NULL)
216     {
217 		input->strFactory->close(input->strFactory);
218     }
219 
220     // Free the input stream buffer if we allocated it
221     //
222     if	(input->isAllocated && input->data != NULL)
223     {
224 		ANTLR3_FREE(input->data);
225 		input->data = NULL;
226     }
227 
228     input->istream->free(input->istream);
229 
230     // Finally, free the space for the structure itself
231     //
232     ANTLR3_FREE(input);
233 
234     // Done
235     //
236 }
237 
238 static void
antlr38BitSetUcaseLA(pANTLR3_INPUT_STREAM input,ANTLR3_BOOLEAN flag)239 antlr38BitSetUcaseLA		(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag)
240 {
241 	if	(flag)
242 	{
243 		// Return the upper case version of the characters
244 		//
245 		input->istream->_LA		    =  antlr38BitLA_ucase;
246 	}
247 	else
248 	{
249 		// Return the raw characters as they are in the buffer
250 		//
251 		input->istream->_LA		    =  antlr38BitLA;
252 	}
253 }
254 
255 
256 /** \brief Reset a re-startable input stream to the start
257  *
258  * \param input Input stream context pointer
259  */
260 static void
antlr3InputReset(pANTLR3_INPUT_STREAM input)261 antlr3InputReset(pANTLR3_INPUT_STREAM input)
262 {
263 
264     input->nextChar		= input->data;	/* Input at first character */
265     input->line			= 1;		/* starts at line 1	    */
266     input->charPositionInLine	= 0;
267     input->currentLine		= input->data;
268     input->markDepth		= 0;		/* Reset markers	    */
269 
270     /* Clear out up the markers table if it is there
271      */
272     if	(input->markers != NULL)
273     {
274         input->markers->clear(input->markers);
275     }
276     else
277     {
278         /* Install a new markers table
279          */
280         input->markers  = antlr3VectorNew(0);
281     }
282 }
283 
284 /** Install a new source code in to a working input stream so that the
285  *  input stream can be reused.
286  */
287 static void
antlr38BitReuse(pANTLR3_INPUT_STREAM input,pANTLR3_UINT8 inString,ANTLR3_UINT32 size,pANTLR3_UINT8 name)288 antlr38BitReuse(pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
289 {
290     input->isAllocated	= ANTLR3_FALSE;
291     input->data		= inString;
292     input->sizeBuf	= size;
293 
294     // Now we can set up the file name. As we are reusing the stream, there may already
295     // be a string that we can reuse for holding the filename.
296     //
297 	if	(input->istream->streamName == NULL)
298 	{
299 		input->istream->streamName	= input->strFactory->newStr(input->strFactory, name == NULL ? (pANTLR3_UINT8)"-memory-" : name);
300 		input->fileName		= input->istream->streamName;
301 	}
302 	else
303 	{
304 		input->istream->streamName->set(input->istream->streamName,  (name == NULL ? (const char *)"-memory-" : (const char *)name));
305 	}
306 
307     input->reset(input);
308 }
309 
310 /** \brief Consume the next character in an 8 bit input stream
311  *
312  * \param input Input stream context pointer
313  */
314 static void
antlr38BitConsume(pANTLR3_INT_STREAM is)315 antlr38BitConsume(pANTLR3_INT_STREAM is)
316 {
317     pANTLR3_INPUT_STREAM input;
318 
319     input   = ((pANTLR3_INPUT_STREAM) (is->super));
320 
321     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
322     {
323 	/* Indicate one more character in this line
324 	 */
325 	input->charPositionInLine++;
326 
327 	if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar)) == input->newlineChar)
328 	{
329 	    /* Reset for start of a new line of input
330 	     */
331 	    input->line++;
332 	    input->charPositionInLine	= 0;
333 	    input->currentLine		= (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
334 	}
335 
336 	/* Increment to next character position
337 	 */
338 	input->nextChar = (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
339     }
340 }
341 
342 /** \brief Return the input element assuming an 8 bit ascii input
343  *
344  * \param[in] input Input stream context pointer
345  * \param[in] la 1 based offset of next input stream element
346  *
347  * \return Next input character in internal ANTLR3 encoding (UTF32)
348  */
349 static ANTLR3_UCHAR
antlr38BitLA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)350 antlr38BitLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
351 {
352     pANTLR3_INPUT_STREAM input;
353 
354     input   = ((pANTLR3_INPUT_STREAM) (is->super));
355 
356     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
357     {
358 		return	ANTLR3_CHARSTREAM_EOF;
359     }
360     else
361     {
362 		return	(ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar + la - 1));
363     }
364 }
365 
366 /** \brief Return the input element assuming an 8 bit input and
367  *         always return the UPPER CASE character.
368  *		   Note that this is 8 bit and so we assume that the toupper
369  *		   function will use the correct locale for 8 bits.
370  *
371  * \param[in] input Input stream context pointer
372  * \param[in] la 1 based offset of next input stream element
373  *
374  * \return Next input character in internal ANTLR3 encoding (UTF32)
375  */
376 static ANTLR3_UCHAR
antlr38BitLA_ucase(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)377 antlr38BitLA_ucase	(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
378 {
379     pANTLR3_INPUT_STREAM input;
380 
381     input   = ((pANTLR3_INPUT_STREAM) (is->super));
382 
383     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
384     {
385 		return	ANTLR3_CHARSTREAM_EOF;
386     }
387     else
388     {
389 		return	(ANTLR3_UCHAR)toupper((*((pANTLR3_UINT8)input->nextChar + la - 1)));
390     }
391 }
392 
393 
394 /** \brief Return the input element assuming an 8 bit ascii input
395  *
396  * \param[in] input Input stream context pointer
397  * \param[in] lt 1 based offset of next input stream element
398  *
399  * \return Next input character in internal ANTLR3 encoding (UTF32)
400  */
401 static void *
antlr38BitLT(pANTLR3_INPUT_STREAM input,ANTLR3_INT32 lt)402 antlr38BitLT(pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt)
403 {
404     /* Casting is horrible but it means no warnings and LT should never be called
405      * on a character stream anyway I think. If it is then, the void * will need to be
406      * cast back in a similar manner. Yuck! But this means that LT for Token streams and
407      * tree streams is correct.
408      */
409     return (ANTLR3_FUNC_PTR(input->istream->_LA(input->istream, lt)));
410 }
411 
412 /** \brief Calculate the current index in the output stream.
413  * \param[in] input Input stream context pointer
414  */
415 static ANTLR3_MARKER
antlr38BitIndex(pANTLR3_INT_STREAM is)416 antlr38BitIndex(pANTLR3_INT_STREAM is)
417 {
418     pANTLR3_INPUT_STREAM input;
419 
420     input   = ((pANTLR3_INPUT_STREAM) (is->super));
421 
422     return  (ANTLR3_MARKER)(((pANTLR3_UINT8)input->nextChar));
423 }
424 
425 /** \brief Return the size of the current input stream, as an 8Bit file
426  *   which in this case is the total input. Other implementations may provide
427  *   more sophisticated implementations to deal with non-recoverable streams
428  *   and so on.
429  *
430  * \param[in] input Input stream context pointer
431  */
432 static	ANTLR3_UINT32
antlr38BitSize(pANTLR3_INPUT_STREAM input)433 antlr38BitSize(pANTLR3_INPUT_STREAM input)
434 {
435     return  input->sizeBuf;
436 }
437 
438 /** \brief Mark the current input point in an 8Bit 8 bit stream
439  *  such as a file stream, where all the input is available in the
440  *  buffer.
441  *
442  * \param[in] is Input stream context pointer
443  */
444 static ANTLR3_MARKER
antlr38BitMark(pANTLR3_INT_STREAM is)445 antlr38BitMark	(pANTLR3_INT_STREAM is)
446 {
447     pANTLR3_LEX_STATE	    state;
448     pANTLR3_INPUT_STREAM    input;
449 
450     input   = ((pANTLR3_INPUT_STREAM) (is->super));
451 
452     /* New mark point
453      */
454     ++input->markDepth;
455 
456     /* See if we are revisiting a mark as we can just reuse the vector
457      * entry if we are, otherwise, we need a new one
458      */
459     if	(input->markDepth > input->markers->count)
460     {
461 		state = (pANTLR3_LEX_STATE)ANTLR3_MALLOC(sizeof(ANTLR3_LEX_STATE));
462 		if (state == NULL)
463 		{
464 			// malloc failed
465 			--input->markDepth;
466 			return 0;
467 		}
468 
469 		/* Add it to the table
470 		 */
471 		input->markers->add(input->markers, state, ANTLR3_FREE_FUNC);	/* No special structure, just free() on delete */
472     }
473     else
474     {
475 		state	= (pANTLR3_LEX_STATE)input->markers->get(input->markers, input->markDepth - 1);
476 
477 		/* Assume no errors for speed, it will just blow up if the table failed
478 		 * for some reasons, hence lots of unit tests on the tables ;-)
479 		 */
480     }
481 
482     /* We have created or retrieved the state, so update it with the current
483      * elements of the lexer state.
484      */
485     state->charPositionInLine	= input->charPositionInLine;
486     state->currentLine		= input->currentLine;
487     state->line			= input->line;
488     state->nextChar		= input->nextChar;
489 
490     is->lastMarker  = input->markDepth;
491 
492     /* And that's it
493      */
494     return  input->markDepth;
495 }
496 /** \brief Rewind the lexer input to the state specified by the last produced mark.
497  *
498  * \param[in] input Input stream context pointer
499  *
500  * \remark
501  * Assumes 8 Bit input stream.
502  */
503 static void
antlr38BitRewindLast(pANTLR3_INT_STREAM is)504 antlr38BitRewindLast	(pANTLR3_INT_STREAM is)
505 {
506     is->rewind(is, is->lastMarker);
507 }
508 
509 /** \brief Rewind the lexer input to the state specified by the supplied mark.
510  *
511  * \param[in] input Input stream context pointer
512  *
513  * \remark
514  * Assumes 8 Bit input stream.
515  */
516 static void
antlr38BitRewind(pANTLR3_INT_STREAM is,ANTLR3_MARKER mark)517 antlr38BitRewind	(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
518 {
519     pANTLR3_LEX_STATE	state;
520     pANTLR3_INPUT_STREAM input;
521 
522     input   = ((pANTLR3_INPUT_STREAM) is->super);
523 
524     /* Perform any clean up of the marks
525      */
526     input->istream->release(input->istream, mark);
527 
528     /* Find the supplied mark state
529      */
530     state   = (pANTLR3_LEX_STATE)input->markers->get(input->markers, (ANTLR3_UINT32)(mark - 1));
531 	if (state == NULL) { return; }
532 
533     /* Seek input pointer to the requested point (note we supply the void *pointer
534      * to whatever is implementing the int stream to seek).
535      */
536     antlr38BitSeek(is, (ANTLR3_MARKER)(state->nextChar));
537 
538     /* Reset to the reset of the information in the mark
539      */
540     input->charPositionInLine	= state->charPositionInLine;
541     input->currentLine		= state->currentLine;
542     input->line			= state->line;
543     input->nextChar		= state->nextChar;
544 
545     /* And we are done
546      */
547 }
548 
549 /** \brief Rewind the lexer input to the state specified by the supplied mark.
550  *
551  * \param[in] input Input stream context pointer
552  *
553  * \remark
554  * Assumes 8 Bit input stream.
555  */
556 static void
antlr38BitRelease(pANTLR3_INT_STREAM is,ANTLR3_MARKER mark)557 antlr38BitRelease	(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
558 {
559     pANTLR3_INPUT_STREAM input;
560 
561     input   = ((pANTLR3_INPUT_STREAM) (is->super));
562 
563     /* We don't do much here in fact as we never free any higher marks in
564      * the hashtable as we just resuse any memory allocated for them.
565      */
566     input->markDepth	= (ANTLR3_UINT32)(mark - 1);
567 }
568 
569 /** \brief Rewind the lexer input to the state specified by the supplied mark.
570  *
571  * \param[in] input Input stream context pointer
572  *
573  * \remark
574  * Assumes 8 Bit input stream.
575  */
576 static void
antlr38BitSeek(pANTLR3_INT_STREAM is,ANTLR3_MARKER seekPoint)577 antlr38BitSeek	(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
578 {
579 	ANTLR3_INT32   count;
580 	pANTLR3_INPUT_STREAM input;
581 
582 	input   = (pANTLR3_INPUT_STREAM)ANTLR3_FUNC_PTR(((pANTLR3_INPUT_STREAM) is->super));
583 
584 	/* If the requested seek point is less than the current
585 	* input point, then we assume that we are resetting from a mark
586 	* and do not need to scan, but can just set to there.
587 	*/
588 	if	(seekPoint <= (ANTLR3_MARKER)(input->nextChar))
589 	{
590 		input->nextChar	= ((pANTLR3_UINT8) seekPoint);
591 	}
592 	else
593 	{
594 		count	= (ANTLR3_UINT32)(seekPoint - (ANTLR3_MARKER)(input->nextChar));
595 
596 		while (count--)
597 		{
598 			is->consume(is);
599 		}
600 	}
601 }
602 /** Return a substring of the 8 bit input stream in
603  *  newly allocated memory.
604  *
605  * \param input Input stream context pointer
606  * \param start Offset in input stream where the string starts
607  * \param stop  Offset in the input stream where the string ends.
608  */
609 static pANTLR3_STRING
antlr38BitSubstr(pANTLR3_INPUT_STREAM input,ANTLR3_MARKER start,ANTLR3_MARKER stop)610 antlr38BitSubstr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
611 {
612 	return  input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, (ANTLR3_UINT32)(stop - start + 1));
613 }
614 
615 /** \brief Return the line number as understood by the 8 bit input stream.
616  *
617  * \param input Input stream context pointer
618  * \return	Line number in input stream that we believe we are working on.
619  */
620 static ANTLR3_UINT32
antlr38BitGetLine(pANTLR3_INPUT_STREAM input)621 antlr38BitGetLine		(pANTLR3_INPUT_STREAM input)
622 {
623     return  input->line;
624 }
625 
626 /** Return a pointer into the input stream that points at the start
627  *  of the current input line as triggered by the end of line character installed
628  *  for the stream ('\n' unless told differently).
629  *
630  * \param[in] input
631  */
632 static void	  *
antlr38BitGetLineBuf(pANTLR3_INPUT_STREAM input)633 antlr38BitGetLineBuf	(pANTLR3_INPUT_STREAM input)
634 {
635     return  input->currentLine;
636 }
637 
638 /** Return the current offset in to the current line in the input stream.
639  *
640  * \param input Input stream context pointer
641  * \return      Current line offset
642  */
643 static ANTLR3_UINT32
antlr38BitGetCharPosition(pANTLR3_INPUT_STREAM input)644 antlr38BitGetCharPosition	(pANTLR3_INPUT_STREAM input)
645 {
646     return  input->charPositionInLine;
647 }
648 
649 /** Set the current line number as understood by the input stream.
650  *
651  * \param input Input stream context pointer
652  * \param line  Line number to tell the input stream we are on
653  *
654  * \remark
655  *  This function does not change any pointers, it just allows the programmer to set the
656  *  line number according to some external criterion, such as finding a lexed directive
657  *  like: #nnn "file.c" for instance, such that error reporting and so on in is in sync
658  *  with some original source format.
659  */
660 static void
antlr38BitSetLine(pANTLR3_INPUT_STREAM input,ANTLR3_UINT32 line)661 antlr38BitSetLine		(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line)
662 {
663     input->line	= line;
664 }
665 
666 /** Set the current offset in the current line to be a particular setting.
667  *
668  * \param[in] input    Input stream context pointer
669  * \param[in] position New setting for current offset.
670  *
671  * \remark
672  * This does not set the actual pointers in the input stream, it is purely for reporting
673  * purposes and so on as per antlr38BitSetLine();
674  */
675 static void
antlr38BitSetCharPosition(pANTLR3_INPUT_STREAM input,ANTLR3_UINT32 position)676 antlr38BitSetCharPosition	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position)
677 {
678     input->charPositionInLine = position;
679 }
680 
681 /** Set the newline trigger character in the input stream to the supplied parameter.
682  *
683  * \param[in] input	    Input stream context pointer
684  * \param[in] newlineChar   Character to set to be the newline trigger.
685  *
686  * \remark
687  *  - The supplied newLineChar is in UTF32 encoding (which means ASCII and latin1 etc
688  *    are the same encodings), but the input stream catered to by this function is 8 bit
689  *    only, so it is up to the programmer to ensure that the character supplied is valid.
690  */
691 static void
antlr38BitSetNewLineChar(pANTLR3_INPUT_STREAM input,ANTLR3_UINT32 newlineChar)692 antlr38BitSetNewLineChar	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar)
693 {
694     input->newlineChar	= newlineChar;
695 }
696 
697 
698 /// \brief Common function to setup function interface for a UTF16 or UCS2 input stream.
699 ///
700 /// \param input Input stream context pointer
701 ///
702 /// \remark
703 ///  - Strictly speaking, there is no such thing as a UCS2 input stream as the term
704 ///    tends to confuse the notions of character encoding, unicode and so on. UCS2 is
705 ///    essentially UTF16 without any surrogates and so the standard UTF16
706 ///    input stream is able to handle it without any special code.
707 ///
708 void
antlr3UTF16SetupStream(pANTLR3_INPUT_STREAM input,ANTLR3_BOOLEAN machineBigEndian,ANTLR3_BOOLEAN inputBigEndian)709 antlr3UTF16SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian)
710 {
711     // Build a string factory for this stream. This is a UTF16 string factory which is a standard
712     // part of the ANTLR3 string. The string factory is then passed through the whole chain
713     // of lexer->parser->tree->treeparser and so on.
714     //
715     input->strFactory	= antlr3StringFactoryNew(input->encoding);
716 
717     // Generic API that does not care about endianess.
718     //
719     input->istream->index	    =  antlr3UTF16Index;            // Calculate current index in input stream, UTF16 based
720     input->substr		    =  antlr3UTF16Substr;	    // Return a string from the input stream
721     input->istream->seek	    =  antlr3UTF16Seek;		    // How to seek to a specific point in the stream
722 
723     // We must install different UTF16 routines according to whether the input
724     // is the same endianess as the machine we are executing upon or not. If it is not
725     // then we must install methods that can convert the endianess on the fly as they go
726     //
727 
728     switch (machineBigEndian)
729     {
730         case    ANTLR3_TRUE:
731 
732             // Machine is Big Endian, if the input is also then install the
733             // methods that do not access input by bytes and reverse them.
734             // Otherwise install endian aware methods.
735             //
736             if  (inputBigEndian == ANTLR3_TRUE)
737             {
738                 // Input is machine compatible
739                 //
740                 input->istream->consume	    =  antlr3UTF16Consume;	    // Consume the next UTF16 character in the buffer
741                 input->istream->_LA         =  antlr3UTF16LA;		    // Return the UTF32 character at offset n (1 based)
742             }
743             else
744             {
745                 // Need to use methods that know that the input is little endian
746                 //
747                 input->istream->consume	    =  antlr3UTF16ConsumeLE;	    // Consume the next UTF16 character in the buffer
748                 input->istream->_LA         =  antlr3UTF16LALE;		    // Return the UTF32 character at offset n (1 based)
749             }
750             break;
751 
752         case    ANTLR3_FALSE:
753 
754             // Machine is Little Endian, if the input is also then install the
755             // methods that do not access input by bytes and reverse them.
756             // Otherwise install endian aware methods.
757             //
758             if  (inputBigEndian == ANTLR3_FALSE)
759             {
760                 // Input is machine compatible
761                 //
762                 input->istream->consume	    =  antlr3UTF16Consume;	    // Consume the next UTF16 character in the buffer
763                 input->istream->_LA         =  antlr3UTF16LA;		    // Return the UTF32 character at offset n (1 based)
764             }
765             else
766             {
767                 // Need to use methods that know that the input is Big Endian
768                 //
769                 input->istream->consume	    =  antlr3UTF16ConsumeBE;	    // Consume the next UTF16 character in the buffer
770                 input->istream->_LA         =  antlr3UTF16LABE;		    // Return the UTF32 character at offset n (1 based)
771             }
772             break;
773     }
774 
775 
776     input->charByteSize		    = 2;			    // Size in bytes of characters in this stream.
777 
778 }
779 
780 /// \brief Consume the next character in a UTF16 input stream
781 ///
782 /// \param input Input stream context pointer
783 ///
784 static void
antlr3UTF16Consume(pANTLR3_INT_STREAM is)785 antlr3UTF16Consume(pANTLR3_INT_STREAM is)
786 {
787 	pANTLR3_INPUT_STREAM input;
788         UTF32   ch;
789         UTF32   ch2;
790 
791 	input   = ((pANTLR3_INPUT_STREAM) (is->super));
792 
793         // Buffer size is always in bytes
794         //
795 	if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
796 	{
797 		// Indicate one more character in this line
798 		//
799 		input->charPositionInLine++;
800 
801 		if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
802 		{
803 			// Reset for start of a new line of input
804 			//
805 			input->line++;
806 			input->charPositionInLine	= 0;
807 			input->currentLine		= (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
808 		}
809 
810 		// Increment to next character position, accounting for any surrogates
811 		//
812                 // Next char in natural machine byte order
813                 //
814                 ch  = *((UTF16*)input->nextChar);
815 
816                 // We consumed one 16 bit character
817                 //
818 		input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
819 
820                 // If we have a surrogate pair then we need to consume
821                 // a following valid LO surrogate.
822                 //
823                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
824 
825                     // If the 16 bits following the high surrogate are in the source buffer...
826                     //
827                     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
828                     {
829                         // Next character is in natural machine byte order
830                         //
831                         ch2 = *((UTF16*)input->nextChar);
832 
833                         // If it's a valid low surrogate, consume it
834                         //
835                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
836                         {
837                             // We consumed one 16 bit character
838                             //
839 		            input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
840                         }
841                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
842                         // it.
843                         //
844                     }
845                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
846                     // it because the buffer ended
847                     //
848                 }
849                 // Note that we did not check for an invalid low surrogate here, or that fact that the
850                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
851                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
852                 //
853 	}
854 }
855 
856 /// \brief Return the input element assuming an 8 bit ascii input
857 ///
858 /// \param[in] input Input stream context pointer
859 /// \param[in] la 1 based offset of next input stream element
860 ///
861 /// \return Next input character in internal ANTLR3 encoding (UTF32)
862 ///
863 static ANTLR3_UCHAR
antlr3UTF16LA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)864 antlr3UTF16LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
865 {
866 	pANTLR3_INPUT_STREAM input;
867         UTF32   ch;
868         UTF32   ch2;
869         UTF16   * nextChar;
870 
871         // Find the input interface and where we are currently pointing to
872         // in the input stream
873         //
874 	input       = ((pANTLR3_INPUT_STREAM) (is->super));
875         nextChar    = (UTF16*)input->nextChar;
876 
877         // If a positive offset then advance forward, else retreat
878         //
879         if  (la >= 0)
880         {
881             while   (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
882             {
883                 // Advance our copy of the input pointer
884                 //
885                 // Next char in natural machine byte order
886                 //
887                 ch  = *nextChar++;
888 
889                 // If we have a surrogate pair then we need to consume
890                 // a following valid LO surrogate.
891                 //
892                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
893                 {
894                     // If the 16 bits following the high surrogate are in the source buffer...
895                     //
896                     if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
897                     {
898                         // Next character is in natural machine byte order
899                         //
900                         ch2 = *nextChar;
901 
902                         // If it's a valid low surrogate, consume it
903                         //
904                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
905                         {
906                             // We consumed one 16 bit character
907                             //
908 		            nextChar++;
909                         }
910                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
911                         // it.
912                         //
913                     }
914                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
915                     // it because the buffer ended
916                     //
917                 }
918                 // Note that we did not check for an invalid low surrogate here, or that fact that the
919                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
920                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
921                 //
922             }
923         }
924         else
925         {
926             // We need to go backwards from our input point
927             //
928             while   (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
929             {
930                 // Get the previous 16 bit character
931                 //
932                 ch = *--nextChar;
933 
934                 // If we found a low surrogate then go back one more character if
935                 // the hi surrogate is there
936                 //
937                 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
938                 {
939                     ch2 = *(nextChar-1);
940                     if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
941                     {
942                         // Yes, there is a high surrogate to match it so decrement one more and point to that
943                         //
944                         nextChar--;
945                     }
946                 }
947             }
948         }
949 
950         // Our local copy of nextChar is now pointing to either the correct character or end of file
951         //
952         // Input buffer size is always in bytes
953         //
954 	if	( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
955 	{
956 		return	ANTLR3_CHARSTREAM_EOF;
957 	}
958 	else
959 	{
960             // Pick up the next 16 character (native machine byte order)
961             //
962             ch = *nextChar++;
963 
964             // If we have a surrogate pair then we need to consume
965             // a following valid LO surrogate.
966             //
967             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
968             {
969                 // If the 16 bits following the high surrogate are in the source buffer...
970                 //
971                 if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
972                 {
973                     // Next character is in natural machine byte order
974                     //
975                     ch2 = *nextChar;
976 
977                     // If it's a valid low surrogate, consume it
978                     //
979                     if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
980                     {
981                         // Construct the UTF32 code point
982                         //
983                         ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
984 			    + (ch2 - UNI_SUR_LOW_START) + halfBase;
985                     }
986                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
987                     // it.
988                     //
989                 }
990                 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
991                 // it because the buffer ended
992                 //
993             }
994         }
995         return ch;
996 }
997 
998 
999 /// \brief Calculate the current index in the output stream.
1000 /// \param[in] input Input stream context pointer
1001 ///
1002 static ANTLR3_MARKER
antlr3UTF16Index(pANTLR3_INT_STREAM is)1003 antlr3UTF16Index(pANTLR3_INT_STREAM is)
1004 {
1005     pANTLR3_INPUT_STREAM input;
1006 
1007     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1008 
1009     return  (ANTLR3_MARKER)(input->nextChar);
1010 }
1011 
1012 /// \brief Rewind the lexer input to the state specified by the supplied mark.
1013 ///
1014 /// \param[in] input Input stream context pointer
1015 ///
1016 /// \remark
1017 /// Assumes UTF16 input stream.
1018 ///
1019 static void
antlr3UTF16Seek(pANTLR3_INT_STREAM is,ANTLR3_MARKER seekPoint)1020 antlr3UTF16Seek	(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
1021 {
1022 	pANTLR3_INPUT_STREAM input;
1023 
1024 	input   = ((pANTLR3_INPUT_STREAM) is->super);
1025 
1026 	// If the requested seek point is less than the current
1027 	// input point, then we assume that we are resetting from a mark
1028 	// and do not need to scan, but can just set to there as rewind will
1029         // reset line numbers and so on.
1030 	//
1031 	if	(seekPoint <= (ANTLR3_MARKER)(input->nextChar))
1032 	{
1033 		input->nextChar	= (void *)seekPoint;
1034 	}
1035 	else
1036 	{
1037             // Call consume until we reach the asked for seek point or EOF
1038             //
1039             while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar)
1040 	    {
1041 		is->consume(is);
1042 	    }
1043 	}
1044 }
1045 /// \brief Return a substring of the UTF16 input stream in
1046 ///  newly allocated memory.
1047 ///
1048 /// \param input Input stream context pointer
1049 /// \param start Offset in input stream where the string starts
1050 /// \param stop  Offset in the input stream where the string ends.
1051 ///
1052 static pANTLR3_STRING
antlr3UTF16Substr(pANTLR3_INPUT_STREAM input,ANTLR3_MARKER start,ANTLR3_MARKER stop)1053 antlr3UTF16Substr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
1054 {
1055     return  input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/2) + 1);
1056 }
1057 
1058 /// \brief Consume the next character in a UTF16 input stream when the input is Little Endian and the machine is not
1059 /// Note that the UTF16 routines do not do any substantial verification of the input stream as for performance
1060 /// sake, we assume it is validly encoded. So if a low surrogate is found at the curent input position then we
1061 /// just consume it. Surrogate pairs should be seen as Hi, Lo. So if we have a Lo first, then the input stream
1062 /// is fubar but we just ignore that.
1063 ///
1064 /// \param input Input stream context pointer
1065 ///
1066 static void
antlr3UTF16ConsumeLE(pANTLR3_INT_STREAM is)1067 antlr3UTF16ConsumeLE(pANTLR3_INT_STREAM is)
1068 {
1069 	pANTLR3_INPUT_STREAM input;
1070         UTF32   ch;
1071         UTF32   ch2;
1072 
1073 	input   = ((pANTLR3_INPUT_STREAM) (is->super));
1074 
1075         // Buffer size is always in bytes
1076         //
1077 	if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1078 	{
1079 		// Indicate one more character in this line
1080 		//
1081 		input->charPositionInLine++;
1082 
1083 		if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
1084 		{
1085 			// Reset for start of a new line of input
1086 			//
1087 			input->line++;
1088 			input->charPositionInLine	= 0;
1089 			input->currentLine		= (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1090 		}
1091 
1092 		// Increment to next character position, accounting for any surrogates
1093 		//
1094                 // Next char in litle endian form
1095                 //
1096                 ch  = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8);
1097 
1098                 // We consumed one 16 bit character
1099                 //
1100 		input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1101 
1102                 // If we have a surrogate pair then we need to consume
1103                 // a following valid LO surrogate.
1104                 //
1105                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
1106 
1107                     // If the 16 bits following the high surrogate are in the source buffer...
1108                     //
1109                     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1110                     {
1111                         ch2 = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8);
1112 
1113                         // If it's a valid low surrogate, consume it
1114                         //
1115                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1116                         {
1117                             // We consumed one 16 bit character
1118                             //
1119 		            input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1120                         }
1121                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1122                         // it.
1123                         //
1124                     }
1125                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1126                     // it because the buffer ended
1127                     //
1128                 }
1129                 // Note that we did not check for an invalid low surrogate here, or that fact that the
1130                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1131                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1132                 //
1133 	}
1134 }
1135 
1136 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
1137 ///
1138 /// \param[in] input Input stream context pointer
1139 /// \param[in] la 1 based offset of next input stream element
1140 ///
1141 /// \return Next input character in internal ANTLR3 encoding (UTF32)
1142 ///
1143 static ANTLR3_UCHAR
antlr3UTF16LALE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1144 antlr3UTF16LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1145 {
1146 	pANTLR3_INPUT_STREAM input;
1147         UTF32           ch;
1148         UTF32           ch2;
1149         pANTLR3_UCHAR   nextChar;
1150 
1151         // Find the input interface and where we are currently pointing to
1152         // in the input stream
1153         //
1154 	input       = ((pANTLR3_INPUT_STREAM) (is->super));
1155         nextChar    = (pANTLR3_UCHAR)input->nextChar;
1156 
1157         // If a positive offset then advance forward, else retreat
1158         //
1159         if  (la >= 0)
1160         {
1161             while   (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
1162             {
1163                 // Advance our copy of the input pointer
1164                 //
1165                 // Next char in Little Endian byte order
1166                 //
1167                 ch  = (*nextChar) + (*(nextChar+1) << 8);
1168                 nextChar += 2;
1169 
1170                 // If we have a surrogate pair then we need to consume
1171                 // a following valid LO surrogate.
1172                 //
1173                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1174                 {
1175                     // If the 16 bits following the high surrogate are in the source buffer...
1176                     //
1177                     if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1178                     {
1179                         // Next character is in little endian byte order
1180                         //
1181                         ch2 = (*nextChar) + (*(nextChar+1) << 8);
1182 
1183                         // If it's a valid low surrogate, consume it
1184                         //
1185                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1186                         {
1187                             // We consumed one 16 bit character
1188                             //
1189 		            nextChar += 2;
1190                         }
1191                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1192                         // it.
1193                         //
1194                     }
1195                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1196                     // it because the buffer ended
1197                     //
1198                 }
1199                 // Note that we did not check for an invalid low surrogate here, or that fact that the
1200                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1201                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1202                 //
1203             }
1204         }
1205         else
1206         {
1207             // We need to go backwards from our input point
1208             //
1209             while   (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
1210             {
1211                 // Get the previous 16 bit character
1212                 //
1213                 ch = (*nextChar - 2) + ((*nextChar -1) << 8);
1214                 nextChar -= 2;
1215 
1216                 // If we found a low surrogate then go back one more character if
1217                 // the hi surrogate is there
1218                 //
1219                 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
1220                 {
1221                     ch2 = (*nextChar - 2) + ((*nextChar -1) << 8);
1222                     if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
1223                     {
1224                         // Yes, there is a high surrogate to match it so decrement one more and point to that
1225                         //
1226                         nextChar -=2;
1227                     }
1228                 }
1229             }
1230         }
1231 
1232         // Our local copy of nextChar is now pointing to either the correct character or end of file
1233         //
1234         // Input buffer size is always in bytes
1235         //
1236 	if	( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1237 	{
1238 		return	ANTLR3_CHARSTREAM_EOF;
1239 	}
1240 	else
1241 	{
1242             // Pick up the next 16 character (little endian byte order)
1243             //
1244             ch = (*nextChar) + (*(nextChar+1) << 8);
1245             nextChar += 2;
1246 
1247             // If we have a surrogate pair then we need to consume
1248             // a following valid LO surrogate.
1249             //
1250             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1251             {
1252                 // If the 16 bits following the high surrogate are in the source buffer...
1253                 //
1254                 if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1255                 {
1256                     // Next character is in little endian byte order
1257                     //
1258                     ch2 = (*nextChar) + (*(nextChar+1) << 8);
1259 
1260                     // If it's a valid low surrogate, consume it
1261                     //
1262                     if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1263                     {
1264                         // Construct the UTF32 code point
1265                         //
1266                         ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
1267 			    + (ch2 - UNI_SUR_LOW_START) + halfBase;
1268                     }
1269                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1270                     // it.
1271                     //
1272                 }
1273                 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1274                 // it because the buffer ended
1275                 //
1276             }
1277         }
1278         return ch;
1279 }
1280 
1281 /// \brief Consume the next character in a UTF16 input stream when the input is Big Endian and the machine is not
1282 ///
1283 /// \param input Input stream context pointer
1284 ///
1285 static void
antlr3UTF16ConsumeBE(pANTLR3_INT_STREAM is)1286 antlr3UTF16ConsumeBE(pANTLR3_INT_STREAM is)
1287 {
1288 	pANTLR3_INPUT_STREAM input;
1289         UTF32   ch;
1290         UTF32   ch2;
1291 
1292 	input   = ((pANTLR3_INPUT_STREAM) (is->super));
1293 
1294         // Buffer size is always in bytes
1295         //
1296 	if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1297 	{
1298 		// Indicate one more character in this line
1299 		//
1300 		input->charPositionInLine++;
1301 
1302 		if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
1303 		{
1304 			// Reset for start of a new line of input
1305 			//
1306 			input->line++;
1307 			input->charPositionInLine	= 0;
1308 			input->currentLine		= (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1309 		}
1310 
1311 		// Increment to next character position, accounting for any surrogates
1312 		//
1313                 // Next char in big endian form
1314                 //
1315                 ch  = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8);
1316 
1317                 // We consumed one 16 bit character
1318                 //
1319 		input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1320 
1321                 // If we have a surrogate pair then we need to consume
1322                 // a following valid LO surrogate.
1323                 //
1324                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
1325 
1326                     // If the 16 bits following the high surrogate are in the source buffer...
1327                     //
1328                     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1329                     {
1330                         // Big endian
1331                         //
1332                         ch2 = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8);
1333 
1334                         // If it's a valid low surrogate, consume it
1335                         //
1336                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1337                         {
1338                             // We consumed one 16 bit character
1339                             //
1340 		            input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1341                         }
1342                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1343                         // it.
1344                         //
1345                     }
1346                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1347                     // it because the buffer ended
1348                     //
1349                 }
1350                 // Note that we did not check for an invalid low surrogate here, or that fact that the
1351                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1352                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1353                 //
1354 	}
1355 }
1356 
1357 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
1358 ///
1359 /// \param[in] input Input stream context pointer
1360 /// \param[in] la 1 based offset of next input stream element
1361 ///
1362 /// \return Next input character in internal ANTLR3 encoding (UTF32)
1363 ///
1364 static ANTLR3_UCHAR
antlr3UTF16LABE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1365 antlr3UTF16LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1366 {
1367 	pANTLR3_INPUT_STREAM input;
1368         UTF32           ch;
1369         UTF32           ch2;
1370         pANTLR3_UCHAR   nextChar;
1371 
1372         // Find the input interface and where we are currently pointing to
1373         // in the input stream
1374         //
1375 	input       = ((pANTLR3_INPUT_STREAM) (is->super));
1376         nextChar    = (pANTLR3_UCHAR)input->nextChar;
1377 
1378         // If a positive offset then advance forward, else retreat
1379         //
1380         if  (la >= 0)
1381         {
1382             while   (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
1383             {
1384                 // Advance our copy of the input pointer
1385                 //
1386                 // Next char in Big Endian byte order
1387                 //
1388                 ch  = ((*nextChar) << 8) + *(nextChar+1);
1389                 nextChar += 2;
1390 
1391                 // If we have a surrogate pair then we need to consume
1392                 // a following valid LO surrogate.
1393                 //
1394                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1395                 {
1396                     // If the 16 bits following the high surrogate are in the source buffer...
1397                     //
1398                     if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1399                     {
1400                         // Next character is in big endian byte order
1401                         //
1402                         ch2 = ((*nextChar) << 8) + *(nextChar+1);
1403 
1404                         // If it's a valid low surrogate, consume it
1405                         //
1406                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1407                         {
1408                             // We consumed one 16 bit character
1409                             //
1410 		            nextChar += 2;
1411                         }
1412                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1413                         // it.
1414                         //
1415                     }
1416                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1417                     // it because the buffer ended
1418                     //
1419                 }
1420                 // Note that we did not check for an invalid low surrogate here, or that fact that the
1421                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1422                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1423                 //
1424             }
1425         }
1426         else
1427         {
1428             // We need to go backwards from our input point
1429             //
1430             while   (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
1431             {
1432                 // Get the previous 16 bit character
1433                 //
1434                 ch = ((*nextChar - 2) << 8) + (*nextChar -1);
1435                 nextChar -= 2;
1436 
1437                 // If we found a low surrogate then go back one more character if
1438                 // the hi surrogate is there
1439                 //
1440                 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
1441                 {
1442                     ch2 = ((*nextChar - 2) << 8) + (*nextChar -1);
1443                     if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
1444                     {
1445                         // Yes, there is a high surrogate to match it so decrement one more and point to that
1446                         //
1447                         nextChar -=2;
1448                     }
1449                 }
1450             }
1451         }
1452 
1453         // Our local copy of nextChar is now pointing to either the correct character or end of file
1454         //
1455         // Input buffer size is always in bytes
1456         //
1457 	if	( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1458 	{
1459 		return	ANTLR3_CHARSTREAM_EOF;
1460 	}
1461 	else
1462 	{
1463             // Pick up the next 16 character (big endian byte order)
1464             //
1465             ch = ((*nextChar) << 8) + *(nextChar+1);
1466             nextChar += 2;
1467 
1468             // If we have a surrogate pair then we need to consume
1469             // a following valid LO surrogate.
1470             //
1471             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1472             {
1473                 // If the 16 bits following the high surrogate are in the source buffer...
1474                 //
1475                 if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1476                 {
1477                     // Next character is in big endian byte order
1478                     //
1479                     ch2 = ((*nextChar) << 8) + *(nextChar+1);
1480 
1481                     // If it's a valid low surrogate, consume it
1482                     //
1483                     if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1484                     {
1485                         // Construct the UTF32 code point
1486                         //
1487                         ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
1488 			    + (ch2 - UNI_SUR_LOW_START) + halfBase;
1489                     }
1490                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1491                     // it.
1492                     //
1493                 }
1494                 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1495                 // it because the buffer ended
1496                 //
1497             }
1498         }
1499         return ch;
1500 }
1501 
1502 /// \brief Common function to setup function interface for a UTF32 input stream.
1503 ///
1504 /// \param input Input stream context pointer
1505 ///
1506 void
antlr3UTF32SetupStream(pANTLR3_INPUT_STREAM input,ANTLR3_BOOLEAN machineBigEndian,ANTLR3_BOOLEAN inputBigEndian)1507 antlr3UTF32SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian)
1508 {
1509     // Build a string factory for this stream. This is a UTF32 string factory which is a standard
1510     // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
1511     // and so on.
1512     //
1513     input->strFactory	= antlr3StringFactoryNew(input->encoding);
1514 
1515     // Generic API that does not care about endianess.
1516     //
1517     input->istream->index	    =  antlr3UTF32Index;            // Calculate current index in input stream, UTF16 based
1518     input->substr		    =  antlr3UTF32Substr;	    // Return a string from the input stream
1519     input->istream->seek	    =  antlr3UTF32Seek;		    // How to seek to a specific point in the stream
1520     input->istream->consume	    =  antlr3UTF32Consume;	    // Consume the next UTF32 character in the buffer
1521 
1522     // We must install different UTF32 LA routines according to whether the input
1523     // is the same endianess as the machine we are executing upon or not. If it is not
1524     // then we must install methods that can convert the endianess on the fly as they go
1525     //
1526     switch (machineBigEndian)
1527     {
1528         case    ANTLR3_TRUE:
1529 
1530             // Machine is Big Endian, if the input is also then install the
1531             // methods that do not access input by bytes and reverse them.
1532             // Otherwise install endian aware methods.
1533             //
1534             if  (inputBigEndian == ANTLR3_TRUE)
1535             {
1536                 // Input is machine compatible
1537                 //
1538                 input->istream->_LA         =  antlr3UTF32LA;		    // Return the UTF32 character at offset n (1 based)
1539             }
1540             else
1541             {
1542                 // Need to use methods that know that the input is little endian
1543                 //
1544                 input->istream->_LA         =  antlr3UTF32LALE;		    // Return the UTF32 character at offset n (1 based)
1545             }
1546             break;
1547 
1548         case    ANTLR3_FALSE:
1549 
1550             // Machine is Little Endian, if the input is also then install the
1551             // methods that do not access input by bytes and reverse them.
1552             // Otherwise install endian aware methods.
1553             //
1554             if  (inputBigEndian == ANTLR3_FALSE)
1555             {
1556                 // Input is machine compatible
1557                 //
1558                 input->istream->_LA         =  antlr3UTF32LA;		    // Return the UTF32 character at offset n (1 based)
1559             }
1560             else
1561             {
1562                 // Need to use methods that know that the input is Big Endian
1563                 //
1564                 input->istream->_LA         =  antlr3UTF32LABE;		    // Return the UTF32 character at offset n (1 based)
1565             }
1566             break;
1567     }
1568 
1569     input->charByteSize		    = 4;			    // Size in bytes of characters in this stream.
1570 }
1571 
1572 /** \brief Consume the next character in a UTF32 input stream
1573  *
1574  * \param input Input stream context pointer
1575  */
1576 static void
antlr3UTF32Consume(pANTLR3_INT_STREAM is)1577 antlr3UTF32Consume(pANTLR3_INT_STREAM is)
1578 {
1579     pANTLR3_INPUT_STREAM input;
1580 
1581     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1582 
1583     // SizeBuf is always in bytes
1584     //
1585     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1586     {
1587 	/* Indicate one more character in this line
1588 	 */
1589 	input->charPositionInLine++;
1590 
1591 	if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar)) == input->newlineChar)
1592 	{
1593 	    /* Reset for start of a new line of input
1594 	     */
1595 	    input->line++;
1596 	    input->charPositionInLine	= 0;
1597 	    input->currentLine		= (void *)(((pANTLR3_UINT32)input->nextChar) + 1);
1598 	}
1599 
1600 	/* Increment to next character position
1601 	 */
1602 	input->nextChar = (void *)(((pANTLR3_UINT32)input->nextChar) + 1);
1603     }
1604 }
1605 
1606 /// \brief Calculate the current index in the output stream.
1607 /// \param[in] input Input stream context pointer
1608 ///
1609 static ANTLR3_MARKER
antlr3UTF32Index(pANTLR3_INT_STREAM is)1610 antlr3UTF32Index(pANTLR3_INT_STREAM is)
1611 {
1612     pANTLR3_INPUT_STREAM input;
1613 
1614     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1615 
1616     return  (ANTLR3_MARKER)(input->nextChar);
1617 }
1618 
1619 /// \brief Return a substring of the UTF16 input stream in
1620 ///  newly allocated memory.
1621 ///
1622 /// \param input Input stream context pointer
1623 /// \param start Offset in input stream where the string starts
1624 /// \param stop  Offset in the input stream where the string ends.
1625 ///
1626 static pANTLR3_STRING
antlr3UTF32Substr(pANTLR3_INPUT_STREAM input,ANTLR3_MARKER start,ANTLR3_MARKER stop)1627 antlr3UTF32Substr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
1628 {
1629     return  input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/4) + 1);
1630 }
1631 
1632 /// \brief Rewind the lexer input to the state specified by the supplied mark.
1633 ///
1634 /// \param[in] input Input stream context pointer
1635 ///
1636 /// \remark
1637 /// Assumes UTF32 input stream.
1638 ///
1639 static void
antlr3UTF32Seek(pANTLR3_INT_STREAM is,ANTLR3_MARKER seekPoint)1640 antlr3UTF32Seek	(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
1641 {
1642 	pANTLR3_INPUT_STREAM input;
1643 
1644 	input   = ((pANTLR3_INPUT_STREAM) is->super);
1645 
1646 	// If the requested seek point is less than the current
1647 	// input point, then we assume that we are resetting from a mark
1648 	// and do not need to scan, but can just set to there as rewind will
1649         // reset line numbers and so on.
1650 	//
1651 	if	(seekPoint <= (ANTLR3_MARKER)(input->nextChar))
1652 	{
1653 		input->nextChar	= (void *)seekPoint;
1654 	}
1655 	else
1656 	{
1657             // Call consume until we reach the asked for seek point or EOF
1658             //
1659             while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar)
1660 	    {
1661 		is->consume(is);
1662 	    }
1663 	}
1664 }
1665 
1666 /** \brief Return the input element assuming a UTF32 input in natural machine byte order
1667  *
1668  * \param[in] input Input stream context pointer
1669  * \param[in] la 1 based offset of next input stream element
1670  *
1671  * \return Next input character in internal ANTLR3 encoding (UTF32)
1672  */
1673 static ANTLR3_UCHAR
antlr3UTF32LA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1674 antlr3UTF32LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1675 {
1676     pANTLR3_INPUT_STREAM input;
1677 
1678     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1679 
1680     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1681     {
1682 		return	ANTLR3_CHARSTREAM_EOF;
1683     }
1684     else
1685     {
1686 		return	(ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1687     }
1688 }
1689 
1690 /** \brief Return the input element assuming a UTF32 input in little endian byte order
1691  *
1692  * \param[in] input Input stream context pointer
1693  * \param[in] la 1 based offset of next input stream element
1694  *
1695  * \return Next input character in internal ANTLR3 encoding (UTF32)
1696  */
1697 static ANTLR3_UCHAR
antlr3UTF32LALE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1698 antlr3UTF32LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1699 {
1700     pANTLR3_INPUT_STREAM input;
1701 
1702     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1703 
1704     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1705     {
1706 		return	ANTLR3_CHARSTREAM_EOF;
1707     }
1708     else
1709     {
1710         ANTLR3_UCHAR   c;
1711 
1712         c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1713 
1714         // Swap Endianess to Big Endian
1715         //
1716         return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24);
1717     }
1718 }
1719 
1720 /** \brief Return the input element assuming a UTF32 input in big endian byte order
1721  *
1722  * \param[in] input Input stream context pointer
1723  * \param[in] la 1 based offset of next input stream element
1724  *
1725  * \return Next input character in internal ANTLR3 encoding (UTF32)
1726  * \remark This is the same code as LE version but seprated in case there are better optimisations fo rendinan swap
1727  */
1728 static ANTLR3_UCHAR
antlr3UTF32LABE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1729 antlr3UTF32LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1730 {
1731     pANTLR3_INPUT_STREAM input;
1732 
1733     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1734 
1735     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1736     {
1737 		return	ANTLR3_CHARSTREAM_EOF;
1738     }
1739     else
1740     {
1741         ANTLR3_UCHAR   c;
1742 
1743         c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1744 
1745         // Swap Endianess to Little Endian
1746         //
1747         return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24);
1748     }
1749 }
1750 
1751 
1752 /// \brief Common function to setup function interface for a UTF8 input stream.
1753 ///
1754 /// \param input Input stream context pointer
1755 ///
1756 void
antlr3UTF8SetupStream(pANTLR3_INPUT_STREAM input)1757 antlr3UTF8SetupStream	(pANTLR3_INPUT_STREAM input)
1758 {
1759     // Build a string factory for this stream. This is a UTF16 string factory which is a standard
1760     // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
1761     // and so on.
1762     //
1763     input->strFactory	= antlr3StringFactoryNew(input->encoding);
1764 
1765     // Generic API that does not care about endianess.
1766     //
1767     input->istream->consume	= antlr3UTF8Consume;	// Consume the next UTF32 character in the buffer
1768     input->istream->_LA         = antlr3UTF8LA;         // Return the UTF32 character at offset n (1 based)
1769     input->charByteSize		= 0;	                // Size in bytes of characters in this stream.
1770 }
1771 
1772 // ------------------------------------------------------
1773 // Following is from Unicode.org (see antlr3convertutf.c)
1774 //
1775 
1776 /// Index into the table below with the first byte of a UTF-8 sequence to
1777 /// get the number of trailing bytes that are supposed to follow it.
1778 /// Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
1779 /// left as-is for anyone who may want to do such conversion, which was
1780 /// allowed in earlier algorithms.
1781 ///
1782 static const ANTLR3_UINT32 trailingBytesForUTF8[256] = {
1783     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1784     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1785     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1786     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1787     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1788     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1789     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1790     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
1791 };
1792 
1793 /// Magic values subtracted from a buffer value during UTF8 conversion.
1794 /// This table contains as many values as there might be trailing bytes
1795 /// in a UTF-8 sequence.
1796 ///
1797 static const UTF32 offsetsFromUTF8[6] =
1798     {   0x00000000UL, 0x00003080UL, 0x000E2080UL,
1799 	0x03C82080UL, 0xFA082080UL, 0x82082080UL
1800     };
1801 
1802 // End of Unicode.org tables
1803 // -------------------------
1804 
1805 
1806 /** \brief Consume the next character in a UTF8 input stream
1807  *
1808  * \param input Input stream context pointer
1809  */
1810 static void
antlr3UTF8Consume(pANTLR3_INT_STREAM is)1811 antlr3UTF8Consume(pANTLR3_INT_STREAM is)
1812 {
1813     pANTLR3_INPUT_STREAM    input;
1814     ANTLR3_UINT32           extraBytesToRead;
1815     ANTLR3_UCHAR            ch;
1816     pANTLR3_UINT8           nextChar;
1817 
1818     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1819 
1820     nextChar = (pANTLR3_UINT8)input->nextChar;
1821 
1822     if	(nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1823     {
1824 	// Indicate one more character in this line
1825 	//
1826 	input->charPositionInLine++;
1827 
1828         // Are there more bytes needed to make up the whole thing?
1829         //
1830         extraBytesToRead = trailingBytesForUTF8[*nextChar];
1831 
1832         if	(nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1833         {
1834             input->nextChar = (((pANTLR3_UINT8)input->data) + input->sizeBuf);
1835             return;
1836         }
1837 
1838         // Cases deliberately fall through (see note A in antlrconvertutf.c)
1839         // Legal UTF8 is only 4 bytes but 6 bytes could be used in old UTF8 so
1840         // we allow it.
1841         //
1842         ch  = 0;
1843        	switch (extraBytesToRead) {
1844 	    case 5: ch += *nextChar++; ch <<= 6;
1845 	    case 4: ch += *nextChar++; ch <<= 6;
1846 	    case 3: ch += *nextChar++; ch <<= 6;
1847 	    case 2: ch += *nextChar++; ch <<= 6;
1848 	    case 1: ch += *nextChar++; ch <<= 6;
1849 	    case 0: ch += *nextChar++;
1850 	}
1851 
1852         // Magically correct the input value
1853         //
1854 	ch -= offsetsFromUTF8[extraBytesToRead];
1855 	if  (ch == input->newlineChar)
1856 	{
1857 	    /* Reset for start of a new line of input
1858 	     */
1859 	    input->line++;
1860 	    input->charPositionInLine	= 0;
1861 	    input->currentLine		= (void *)nextChar;
1862 	}
1863 
1864         // Update input pointer
1865         //
1866         input->nextChar = nextChar;
1867     }
1868 }
1869 /** \brief Return the input element assuming a UTF8 input
1870  *
1871  * \param[in] input Input stream context pointer
1872  * \param[in] la 1 based offset of next input stream element
1873  *
1874  * \return Next input character in internal ANTLR3 encoding (UTF32)
1875  */
1876 static ANTLR3_UCHAR
antlr3UTF8LA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1877 antlr3UTF8LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1878 {
1879     pANTLR3_INPUT_STREAM    input;
1880     ANTLR3_UINT32           extraBytesToRead;
1881     ANTLR3_UCHAR            ch;
1882     pANTLR3_UINT8           nextChar;
1883 
1884     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1885 
1886     nextChar = (pANTLR3_UINT8)input->nextChar;
1887 
1888     // Do we need to traverse forwards or backwards?
1889     // - LA(0) is treated as LA(1) and we assume that the nextChar is
1890     //   already positioned.
1891     // - LA(n+) ; n>1 means we must traverse forward n-1 characters catering for UTF8 encoding
1892     // - LA(-n) means we must traverse backwards n chracters
1893     //
1894     if (la > 1) {
1895 
1896         // Make sure that we have at least one character left before trying to
1897         // loop through the buffer.
1898         //
1899         if	(nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1900         {
1901             // Now traverse n-1 characters forward
1902             //
1903             while (--la > 0)
1904             {
1905                 // Does the next character require trailing bytes?
1906                 // If so advance the pointer by that many bytes as well as advancing
1907                 // one position for what will be at least a single byte character.
1908                 //
1909                 nextChar += trailingBytesForUTF8[*nextChar] + 1;
1910 
1911                 // Does that calculation take us past the byte length of the buffer?
1912                 //
1913                 if	(nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1914                 {
1915                     return ANTLR3_CHARSTREAM_EOF;
1916                 }
1917             }
1918         }
1919         else
1920         {
1921             return ANTLR3_CHARSTREAM_EOF;
1922         }
1923     }
1924     else
1925     {
1926         // LA is negative so we decrease the pointer by n character positions
1927         //
1928         while   (nextChar > (pANTLR3_UINT8)input->data && la++ < 0)
1929         {
1930             // Traversing backwards in UTF8 means decermenting by one
1931             // then continuing to decrement while ever a character pattern
1932             // is flagged as being a trailing byte of an encoded code point.
1933             // Trailing UTF8 bytes always start with 10 in binary. We assumne that
1934             // the UTF8 is well formed and do not check boundary conditions
1935             //
1936             nextChar--;
1937             while ((*nextChar & 0xC0) == 0x80)
1938             {
1939                 nextChar--;
1940             }
1941         }
1942     }
1943 
1944     // nextChar is now pointing at the UTF8 encoded character that we need to
1945     // decode and return.
1946     //
1947     // Are there more bytes needed to make up the whole thing?
1948     //
1949     extraBytesToRead = trailingBytesForUTF8[*nextChar];
1950     if	(nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1951     {
1952         return ANTLR3_CHARSTREAM_EOF;
1953     }
1954 
1955     // Cases deliberately fall through (see note A in antlrconvertutf.c)
1956     //
1957     ch  = 0;
1958     switch (extraBytesToRead) {
1959             case 5: ch += *nextChar++; ch <<= 6;
1960             case 4: ch += *nextChar++; ch <<= 6;
1961             case 3: ch += *nextChar++; ch <<= 6;
1962             case 2: ch += *nextChar++; ch <<= 6;
1963             case 1: ch += *nextChar++; ch <<= 6;
1964             case 0: ch += *nextChar++;
1965     }
1966 
1967     // Magically correct the input value
1968     //
1969     ch -= offsetsFromUTF8[extraBytesToRead];
1970 
1971     return ch;
1972 }
1973 
1974 // EBCDIC to ASCII conversion table
1975 //
1976 // This for EBCDIC EDF04 translated to ISO-8859.1 which is the usually accepted POSIX
1977 // translation and the character tables are published all over the interweb.
1978 //
1979 const ANTLR3_UCHAR e2a[256] =
1980 {
1981     0x00, 0x01, 0x02, 0x03, 0x85, 0x09, 0x86, 0x7f,
1982     0x87, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1983     0x10, 0x11, 0x12, 0x13, 0x8f, 0x0a, 0x08, 0x97,
1984     0x18, 0x19, 0x9c, 0x9d, 0x1c, 0x1d, 0x1e, 0x1f,
1985     0x80, 0x81, 0x82, 0x83, 0x84, 0x92, 0x17, 0x1b,
1986     0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07,
1987     0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04,
1988     0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a,
1989     0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5,
1990     0xe7, 0xf1, 0x60, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
1991     0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef,
1992     0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x9f,
1993     0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5,
1994     0xc7, 0xd1, 0x5e, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
1995     0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf,
1996     0xcc, 0xa8, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
1997     0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
1998     0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1,
1999     0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70,
2000     0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4,
2001     0xb5, 0xaf, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
2002     0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0xdd, 0xde, 0xae,
2003     0xa2, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc,
2004     0xbd, 0xbe, 0xac, 0x5b, 0x5c, 0x5d, 0xb4, 0xd7,
2005     0xf9, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
2006     0x48, 0x49, 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5,
2007     0xa6, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
2008     0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xdb, 0xfa, 0xff,
2009     0xd9, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
2010     0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5,
2011     0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
2012     0x38, 0x39, 0xb3, 0x7b, 0xdc, 0x7d, 0xda, 0x7e
2013 };
2014 
2015 /// \brief Common function to setup function interface for a EBCDIC input stream.
2016 ///
2017 /// \param input Input stream context pointer
2018 ///
2019 void
antlr3EBCDICSetupStream(pANTLR3_INPUT_STREAM input)2020 antlr3EBCDICSetupStream	(pANTLR3_INPUT_STREAM input)
2021 {
2022     // EBCDIC streams can use the standard 8 bit string factory
2023     //
2024     input->strFactory	= antlr3StringFactoryNew(input->encoding);
2025 
2026     // Generic API that does not care about endianess.
2027     //
2028     input->istream->_LA         = antlr3EBCDICLA;       // Return the UTF32 character at offset n (1 based)
2029     input->charByteSize		= 1;	                // Size in bytes of characters in this stream.
2030 }
2031 
2032 /// \brief Return the input element assuming an 8 bit EBCDIC input
2033 ///
2034 /// \param[in] input Input stream context pointer
2035 /// \param[in] la 1 based offset of next input stream element
2036 ///
2037 /// \return Next input character in internal ANTLR3 encoding (UTF32) after translation
2038 ///         from EBCDIC to ASCII
2039 ///
2040 static ANTLR3_UCHAR
antlr3EBCDICLA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)2041 antlr3EBCDICLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
2042 {
2043     pANTLR3_INPUT_STREAM input;
2044 
2045     input   = ((pANTLR3_INPUT_STREAM) (is->super));
2046 
2047     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
2048     {
2049         return	ANTLR3_CHARSTREAM_EOF;
2050     }
2051     else
2052     {
2053         // Translate the required character via the constant conversion table
2054         //
2055         return	e2a[(*((pANTLR3_UINT8)input->nextChar + la - 1))];
2056     }
2057 }