• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /// \file
2 /// Base functions to initialize and manipulate any input stream
3 ///
4 
5 // [The "BSD licence"]
6 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
7 // http://www.temporal-wave.com
8 // http://www.linkedin.com/in/jimidle
9 //
10 // All rights reserved.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions
14 // are met:
15 // 1. Redistributions of source code must retain the above copyright
16 //    notice, this list of conditions and the following disclaimer.
17 // 2. Redistributions in binary form must reproduce the above copyright
18 //    notice, this list of conditions and the following disclaimer in the
19 //    documentation and/or other materials provided with the distribution.
20 // 3. The name of the author may not be used to endorse or promote products
21 //    derived from this software without specific prior written permission.
22 //
23 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
24 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
25 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
26 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
27 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
28 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
32 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 
34 #include    <antlr3input.h>
35 
36 // -----------------------------------
37 // Generic 8 bit input such as latin-1
38 //
39 
40 // 8Bit INT Stream API
41 //
42 static	    void	    antlr38BitConsume		(pANTLR3_INT_STREAM is);
43 static	    ANTLR3_UCHAR    antlr38BitLA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
44 static	    ANTLR3_UCHAR    antlr38BitLA_ucase		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
45 static	    ANTLR3_MARKER   antlr38BitIndex		(pANTLR3_INT_STREAM is);
46 static	    ANTLR3_MARKER   antlr38BitMark		(pANTLR3_INT_STREAM is);
47 static	    void	    antlr38BitRewind		(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
48 static	    void	    antlr38BitRewindLast	(pANTLR3_INT_STREAM is);
49 static	    void	    antlr38BitRelease		(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
50 static	    void	    antlr38BitSeek		(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
51 static	    pANTLR3_STRING  antlr38BitGetSourceName	(pANTLR3_INT_STREAM is);
52 
53 // 8Bit Charstream API functions
54 //
55 static	    void	    antlr3InputClose		(pANTLR3_INPUT_STREAM input);
56 static	    void	    antlr3InputReset		(pANTLR3_INPUT_STREAM input);
57 static      void            antlr38BitReuse            (pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name);
58 static	    void *	    antlr38BitLT		(pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt);
59 static	    ANTLR3_UINT32   antlr38BitSize		(pANTLR3_INPUT_STREAM input);
60 static	    pANTLR3_STRING  antlr38BitSubstr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
61 static	    ANTLR3_UINT32   antlr38BitGetLine		(pANTLR3_INPUT_STREAM input);
62 static	    void	  * antlr38BitGetLineBuf	(pANTLR3_INPUT_STREAM input);
63 static	    ANTLR3_UINT32   antlr38BitGetCharPosition	(pANTLR3_INPUT_STREAM input);
64 static	    void	    antlr38BitSetLine		(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line);
65 static	    void	    antlr38BitSetCharPosition	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position);
66 static	    void	    antlr38BitSetNewLineChar	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar);
67 static	    void	    antlr38BitSetUcaseLA	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag);
68 
69 // -----------------------------------
70 // UTF16 (also covers UCS2)
71 //
72 // INT Stream API
73 //
74 static	    void	    antlr3UTF16Consume	        (pANTLR3_INT_STREAM is);
75 static	    ANTLR3_UCHAR    antlr3UTF16LA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
76 static	    void	    antlr3UTF16ConsumeLE        (pANTLR3_INT_STREAM is);
77 static	    ANTLR3_UCHAR    antlr3UTF16LALE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
78 static	    void	    antlr3UTF16ConsumeBE        (pANTLR3_INT_STREAM is);
79 static	    ANTLR3_UCHAR    antlr3UTF16LABE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
80 static	    ANTLR3_MARKER   antlr3UTF16Index		(pANTLR3_INT_STREAM is);
81 static	    void	    antlr3UTF16Seek		(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
82 
83 // UTF16 Charstream API functions
84 //
85 static	    pANTLR3_STRING	antlr3UTF16Substr	(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
86 
87 // -----------------------------------
88 // UTF32 (also covers UCS4)
89 //
90 // INT Stream API
91 //
92 static	    void	    antlr3UTF32Consume	        (pANTLR3_INT_STREAM is);
93 static	    ANTLR3_UCHAR    antlr3UTF32LA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
94 static	    ANTLR3_UCHAR    antlr3UTF32LALE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
95 static	    ANTLR3_UCHAR    antlr3UTF32LABE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
96 static	    ANTLR3_MARKER   antlr3UTF32Index		(pANTLR3_INT_STREAM is);
97 static	    void	    antlr3UTF32Seek		(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
98 
99 // UTF32 Charstream API functions
100 //
101 static	    pANTLR3_STRING  antlr3UTF32Substr	        (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
102 
103 // ------------------------------------
104 // UTF-8
105 //
106 static	    void	    antlr3UTF8Consume	        (pANTLR3_INT_STREAM is);
107 static	    ANTLR3_UCHAR    antlr3UTF8LA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
108 
109 // ------------------------------------
110 // EBCDIC
111 //
112 static	    ANTLR3_UCHAR    antlr3EBCDICLA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
113 
114 /// \brief Common function to setup function interface for an 8 bit input stream.
115 ///
116 /// \param input Input stream context pointer
117 ///
118 /// \remark
119 ///   - Many of the 8 bit oriented file stream handling functions will be usable
120 ///     by any or at least some, other input streams. Therefore it is perfectly acceptable
121 ///     to call this function to install the 8Bit handler then override just those functions
122 ///     that would not work for the particular input encoding, such as consume for instance.
123 ///
124 void
antlr38BitSetupStream(pANTLR3_INPUT_STREAM input)125 antlr38BitSetupStream	(pANTLR3_INPUT_STREAM input)
126 {
127     // Build a string factory for this stream
128     //
129     input->strFactory	= antlr3StringFactoryNew(input->encoding);
130 
131     // Default stream API set up is for 8Bit, so we are done
132     //
133 }
134 
135 void
antlr3GenericSetupStream(pANTLR3_INPUT_STREAM input)136 antlr3GenericSetupStream  (pANTLR3_INPUT_STREAM input)
137 {
138     /* Install function pointers for an 8 bit input
139      */
140 
141     /* Allocate stream interface
142      */
143     input->istream		= antlr3IntStreamNew();
144     input->istream->type        = ANTLR3_CHARSTREAM;
145     input->istream->super       = input;
146 
147     /* Intstream API
148      */
149     input->istream->consume	    = antlr38BitConsume;	    // Consume the next 8 bit character in the buffer
150     input->istream->_LA		    = antlr38BitLA;	            // Return the UTF32 character at offset n (1 based)
151     input->istream->index	    = antlr38BitIndex;	            // Current index (offset from first character
152     input->istream->mark	    = antlr38BitMark;		    // Record the current lex state for later restore
153     input->istream->rewind	    = antlr38BitRewind;	            // How to rewind the input
154     input->istream->rewindLast	    = antlr38BitRewindLast;	    // How to rewind the input
155     input->istream->seek	    = antlr38BitSeek;		    // How to seek to a specific point in the stream
156     input->istream->release	    = antlr38BitRelease;	    // Reset marks after mark n
157     input->istream->getSourceName   = antlr38BitGetSourceName;      // Return a string that names the input source
158 
159     /* Charstream API
160      */
161     input->close		    =  antlr3InputClose;	    // Close down the stream completely
162     input->free			    =  antlr3InputClose;	    // Synonym for free
163     input->reset		    =  antlr3InputReset;	    // Reset input to start
164     input->reuse                    =  antlr38BitReuse;             // Install a new input string and reset
165     input->_LT			    =  antlr38BitLT;		    // Same as _LA for 8 bit file
166     input->size			    =  antlr38BitSize;		    // Return the size of the input buffer
167     input->substr		    =  antlr38BitSubstr;	    // Return a string from the input stream
168     input->getLine		    =  antlr38BitGetLine;	    // Return the current line number in the input stream
169     input->getLineBuf		    =  antlr38BitGetLineBuf;	    // Return a pointer to the start of the current line being consumed
170     input->getCharPositionInLine    =  antlr38BitGetCharPosition;   // Return the offset into the current line of input
171     input->setLine		    =  antlr38BitSetLine;	    // Set the input stream line number (does not set buffer pointers)
172     input->setCharPositionInLine    =  antlr38BitSetCharPosition;   // Set the offset in to the current line (does not set any pointers)
173     input->SetNewLineChar	    =  antlr38BitSetNewLineChar;    // Set the value of the newline trigger character
174     input->setUcaseLA		    =  antlr38BitSetUcaseLA;        // Changes the LA function to return upper case always
175 
176     input->charByteSize		    = 1;		// Size in bytes of characters in this stream.
177 
178     /* Initialize entries for tables etc
179      */
180     input->markers  = NULL;
181 
182     /* Set up the input stream brand new
183      */
184     input->reset(input);
185 
186     /* Install default line separator character (it can be replaced
187      * by the grammar programmer later)
188      */
189     input->SetNewLineChar(input, (ANTLR3_UCHAR)'\n');
190 }
191 
192 static pANTLR3_STRING
antlr38BitGetSourceName(pANTLR3_INT_STREAM is)193 antlr38BitGetSourceName(pANTLR3_INT_STREAM is)
194 {
195 	return	is->streamName;
196 }
197 
198 /** \brief Close down an input stream and free any memory allocated by it.
199  *
200  * \param input Input stream context pointer
201  */
202 static void
antlr3InputClose(pANTLR3_INPUT_STREAM input)203 antlr3InputClose(pANTLR3_INPUT_STREAM input)
204 {
205     // Close any markers in the input stream
206     //
207     if	(input->markers != NULL)
208     {
209 		input->markers->free(input->markers);
210 		input->markers = NULL;
211     }
212 
213     // Close the string factory
214     //
215     if	(input->strFactory != NULL)
216     {
217 		input->strFactory->close(input->strFactory);
218     }
219 
220     // Free the input stream buffer if we allocated it
221     //
222     if	(input->isAllocated && input->data != NULL)
223     {
224 		ANTLR3_FREE(input->data);
225 		input->data = NULL;
226     }
227 
228     input->istream->free(input->istream);
229 
230     // Finally, free the space for the structure itself
231     //
232     ANTLR3_FREE(input);
233 
234     // Done
235     //
236 }
237 
238 static void
antlr38BitSetUcaseLA(pANTLR3_INPUT_STREAM input,ANTLR3_BOOLEAN flag)239 antlr38BitSetUcaseLA		(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag)
240 {
241 	if	(flag)
242 	{
243 		// Return the upper case version of the characters
244 		//
245 		input->istream->_LA		    =  antlr38BitLA_ucase;
246 	}
247 	else
248 	{
249 		// Return the raw characters as they are in the buffer
250 		//
251 		input->istream->_LA		    =  antlr38BitLA;
252 	}
253 }
254 
255 
256 /** \brief Reset a re-startable input stream to the start
257  *
258  * \param input Input stream context pointer
259  */
260 static void
antlr3InputReset(pANTLR3_INPUT_STREAM input)261 antlr3InputReset(pANTLR3_INPUT_STREAM input)
262 {
263 
264     input->nextChar		= input->data;	/* Input at first character */
265     input->line			= 1;		/* starts at line 1	    */
266     input->charPositionInLine	= -1;
267     input->currentLine		= input->data;
268     input->markDepth		= 0;		/* Reset markers	    */
269 
270     /* Clear out up the markers table if it is there
271      */
272     if	(input->markers != NULL)
273     {
274         input->markers->clear(input->markers);
275     }
276     else
277     {
278         /* Install a new markers table
279          */
280         input->markers  = antlr3VectorNew(0);
281     }
282 }
283 
284 /** Install a new source code in to a working input stream so that the
285  *  input stream can be reused.
286  */
287 static void
antlr38BitReuse(pANTLR3_INPUT_STREAM input,pANTLR3_UINT8 inString,ANTLR3_UINT32 size,pANTLR3_UINT8 name)288 antlr38BitReuse(pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
289 {
290     input->isAllocated	= ANTLR3_FALSE;
291     input->data		= inString;
292     input->sizeBuf	= size;
293 
294     // Now we can set up the file name. As we are reusing the stream, there may already
295     // be a string that we can reuse for holding the filename.
296     //
297 	if	(input->istream->streamName == NULL)
298 	{
299 		input->istream->streamName	= input->strFactory->newStr(input->strFactory, name == NULL ? (pANTLR3_UINT8)"-memory-" : name);
300 		input->fileName		= input->istream->streamName;
301 	}
302 	else
303 	{
304 		input->istream->streamName->set(input->istream->streamName,  (name == NULL ? (const char *)"-memory-" : (const char *)name));
305 	}
306 
307     input->reset(input);
308 }
309 
310 /** \brief Consume the next character in an 8 bit input stream
311  *
312  * \param input Input stream context pointer
313  */
314 static void
antlr38BitConsume(pANTLR3_INT_STREAM is)315 antlr38BitConsume(pANTLR3_INT_STREAM is)
316 {
317     pANTLR3_INPUT_STREAM input;
318 
319     input   = ((pANTLR3_INPUT_STREAM) (is->super));
320 
321     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
322     {
323 	/* Indicate one more character in this line
324 	 */
325 	input->charPositionInLine++;
326 
327 	if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar)) == input->newlineChar)
328 	{
329 	    /* Reset for start of a new line of input
330 	     */
331 	    input->line++;
332 	    input->charPositionInLine	= 0;
333 	    input->currentLine		= (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
334 	}
335 
336 	/* Increment to next character position
337 	 */
338 	input->nextChar = (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
339     }
340 }
341 
342 /** \brief Return the input element assuming an 8 bit ascii input
343  *
344  * \param[in] input Input stream context pointer
345  * \param[in] la 1 based offset of next input stream element
346  *
347  * \return Next input character in internal ANTLR3 encoding (UTF32)
348  */
349 static ANTLR3_UCHAR
antlr38BitLA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)350 antlr38BitLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
351 {
352     pANTLR3_INPUT_STREAM input;
353 
354     input   = ((pANTLR3_INPUT_STREAM) (is->super));
355 
356     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
357     {
358 		return	ANTLR3_CHARSTREAM_EOF;
359     }
360     else
361     {
362 		return	(ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar + la - 1));
363     }
364 }
365 
366 /** \brief Return the input element assuming an 8 bit input and
367  *         always return the UPPER CASE character.
368  *		   Note that this is 8 bit and so we assume that the toupper
369  *		   function will use the correct locale for 8 bits.
370  *
371  * \param[in] input Input stream context pointer
372  * \param[in] la 1 based offset of next input stream element
373  *
374  * \return Next input character in internal ANTLR3 encoding (UTF32)
375  */
376 static ANTLR3_UCHAR
antlr38BitLA_ucase(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)377 antlr38BitLA_ucase	(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
378 {
379     pANTLR3_INPUT_STREAM input;
380 
381     input   = ((pANTLR3_INPUT_STREAM) (is->super));
382 
383     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
384     {
385 		return	ANTLR3_CHARSTREAM_EOF;
386     }
387     else
388     {
389 		return	(ANTLR3_UCHAR)toupper((*((pANTLR3_UINT8)input->nextChar + la - 1)));
390     }
391 }
392 
393 
394 /** \brief Return the input element assuming an 8 bit ascii input
395  *
396  * \param[in] input Input stream context pointer
397  * \param[in] lt 1 based offset of next input stream element
398  *
399  * \return Next input character in internal ANTLR3 encoding (UTF32)
400  */
401 static void *
antlr38BitLT(pANTLR3_INPUT_STREAM input,ANTLR3_INT32 lt)402 antlr38BitLT(pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt)
403 {
404     /* Casting is horrible but it means no warnings and LT should never be called
405      * on a character stream anyway I think. If it is then, the void * will need to be
406      * cast back in a similar manner. Yuck! But this means that LT for Token streams and
407      * tree streams is correct.
408      */
409     return (ANTLR3_FUNC_PTR(input->istream->_LA(input->istream, lt)));
410 }
411 
412 /** \brief Calculate the current index in the output stream.
413  * \param[in] input Input stream context pointer
414  */
415 static ANTLR3_MARKER
antlr38BitIndex(pANTLR3_INT_STREAM is)416 antlr38BitIndex(pANTLR3_INT_STREAM is)
417 {
418     pANTLR3_INPUT_STREAM input;
419 
420     input   = ((pANTLR3_INPUT_STREAM) (is->super));
421 
422     return  (ANTLR3_MARKER)(((pANTLR3_UINT8)input->nextChar));
423 }
424 
425 /** \brief Return the size of the current input stream, as an 8Bit file
426  *   which in this case is the total input. Other implementations may provide
427  *   more sophisticated implementations to deal with non-recoverable streams
428  *   and so on.
429  *
430  * \param[in] input Input stream context pointer
431  */
432 static	ANTLR3_UINT32
antlr38BitSize(pANTLR3_INPUT_STREAM input)433 antlr38BitSize(pANTLR3_INPUT_STREAM input)
434 {
435     return  input->sizeBuf;
436 }
437 
438 /** \brief Mark the current input point in an 8Bit 8 bit stream
439  *  such as a file stream, where all the input is available in the
440  *  buffer.
441  *
442  * \param[in] is Input stream context pointer
443  */
444 static ANTLR3_MARKER
antlr38BitMark(pANTLR3_INT_STREAM is)445 antlr38BitMark	(pANTLR3_INT_STREAM is)
446 {
447     pANTLR3_LEX_STATE	    state;
448     pANTLR3_INPUT_STREAM    input;
449 
450     input   = ((pANTLR3_INPUT_STREAM) (is->super));
451 
452     /* New mark point
453      */
454     input->markDepth++;
455 
456     /* See if we are revisiting a mark as we can just reuse the vector
457      * entry if we are, otherwise, we need a new one
458      */
459     if	(input->markDepth > input->markers->count)
460     {
461 	state	= ANTLR3_MALLOC(sizeof(ANTLR3_LEX_STATE));
462 
463 	/* Add it to the table
464 	 */
465 	input->markers->add(input->markers, state, ANTLR3_FREE_FUNC);	/* No special structure, just free() on delete */
466     }
467     else
468     {
469 	state	= (pANTLR3_LEX_STATE)input->markers->get(input->markers, input->markDepth - 1);
470 
471 	/* Assume no errors for speed, it will just blow up if the table failed
472 	 * for some reasons, hence lots of unit tests on the tables ;-)
473 	 */
474     }
475 
476     /* We have created or retrieved the state, so update it with the current
477      * elements of the lexer state.
478      */
479     state->charPositionInLine	= input->charPositionInLine;
480     state->currentLine		= input->currentLine;
481     state->line			= input->line;
482     state->nextChar		= input->nextChar;
483 
484     is->lastMarker  = input->markDepth;
485 
486     /* And that's it
487      */
488     return  input->markDepth;
489 }
490 /** \brief Rewind the lexer input to the state specified by the last produced mark.
491  *
492  * \param[in] input Input stream context pointer
493  *
494  * \remark
495  * Assumes 8 Bit input stream.
496  */
497 static void
antlr38BitRewindLast(pANTLR3_INT_STREAM is)498 antlr38BitRewindLast	(pANTLR3_INT_STREAM is)
499 {
500     is->rewind(is, is->lastMarker);
501 }
502 
503 /** \brief Rewind the lexer input to the state specified by the supplied mark.
504  *
505  * \param[in] input Input stream context pointer
506  *
507  * \remark
508  * Assumes 8 Bit input stream.
509  */
510 static void
antlr38BitRewind(pANTLR3_INT_STREAM is,ANTLR3_MARKER mark)511 antlr38BitRewind	(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
512 {
513     pANTLR3_LEX_STATE	state;
514     pANTLR3_INPUT_STREAM input;
515 
516     input   = ((pANTLR3_INPUT_STREAM) is->super);
517 
518     /* Perform any clean up of the marks
519      */
520     input->istream->release(input->istream, mark);
521 
522     /* Find the supplied mark state
523      */
524     state   = (pANTLR3_LEX_STATE)input->markers->get(input->markers, (ANTLR3_UINT32)(mark - 1));
525 
526     /* Seek input pointer to the requested point (note we supply the void *pointer
527      * to whatever is implementing the int stream to seek).
528      */
529     antlr38BitSeek(is, (ANTLR3_MARKER)(state->nextChar));
530 
531     /* Reset to the reset of the information in the mark
532      */
533     input->charPositionInLine	= state->charPositionInLine;
534     input->currentLine		= state->currentLine;
535     input->line			= state->line;
536     input->nextChar		= state->nextChar;
537 
538     /* And we are done
539      */
540 }
541 
542 /** \brief Rewind the lexer input to the state specified by the supplied mark.
543  *
544  * \param[in] input Input stream context pointer
545  *
546  * \remark
547  * Assumes 8 Bit input stream.
548  */
549 static void
antlr38BitRelease(pANTLR3_INT_STREAM is,ANTLR3_MARKER mark)550 antlr38BitRelease	(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
551 {
552     pANTLR3_INPUT_STREAM input;
553 
554     input   = ((pANTLR3_INPUT_STREAM) (is->super));
555 
556     /* We don't do much here in fact as we never free any higher marks in
557      * the hashtable as we just resuse any memory allocated for them.
558      */
559     input->markDepth	= (ANTLR3_UINT32)(mark - 1);
560 }
561 
562 /** \brief Rewind the lexer input to the state specified by the supplied mark.
563  *
564  * \param[in] input Input stream context pointer
565  *
566  * \remark
567  * Assumes 8 Bit input stream.
568  */
569 static void
antlr38BitSeek(pANTLR3_INT_STREAM is,ANTLR3_MARKER seekPoint)570 antlr38BitSeek	(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
571 {
572 	ANTLR3_INT32   count;
573 	pANTLR3_INPUT_STREAM input;
574 
575 	input   = ANTLR3_FUNC_PTR(((pANTLR3_INPUT_STREAM) is->super));
576 
577 	/* If the requested seek point is less than the current
578 	* input point, then we assume that we are resetting from a mark
579 	* and do not need to scan, but can just set to there.
580 	*/
581 	if	(seekPoint <= (ANTLR3_MARKER)(input->nextChar))
582 	{
583 		input->nextChar	= ((pANTLR3_UINT8) seekPoint);
584 	}
585 	else
586 	{
587 		count	= (ANTLR3_UINT32)(seekPoint - (ANTLR3_MARKER)(input->nextChar));
588 
589 		while (count--)
590 		{
591 			is->consume(is);
592 		}
593 	}
594 }
595 /** Return a substring of the 8 bit input stream in
596  *  newly allocated memory.
597  *
598  * \param input Input stream context pointer
599  * \param start Offset in input stream where the string starts
600  * \param stop  Offset in the input stream where the string ends.
601  */
602 static pANTLR3_STRING
antlr38BitSubstr(pANTLR3_INPUT_STREAM input,ANTLR3_MARKER start,ANTLR3_MARKER stop)603 antlr38BitSubstr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
604 {
605 	return  input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, (ANTLR3_UINT32)(stop - start + 1));
606 }
607 
608 /** \brief Return the line number as understood by the 8 bit input stream.
609  *
610  * \param input Input stream context pointer
611  * \return	Line number in input stream that we believe we are working on.
612  */
613 static ANTLR3_UINT32
antlr38BitGetLine(pANTLR3_INPUT_STREAM input)614 antlr38BitGetLine		(pANTLR3_INPUT_STREAM input)
615 {
616     return  input->line;
617 }
618 
619 /** Return a pointer into the input stream that points at the start
620  *  of the current input line as triggered by the end of line character installed
621  *  for the stream ('\n' unless told differently).
622  *
623  * \param[in] input
624  */
625 static void	  *
antlr38BitGetLineBuf(pANTLR3_INPUT_STREAM input)626 antlr38BitGetLineBuf	(pANTLR3_INPUT_STREAM input)
627 {
628     return  input->currentLine;
629 }
630 
631 /** Return the current offset in to the current line in the input stream.
632  *
633  * \param input Input stream context pointer
634  * \return      Current line offset
635  */
636 static ANTLR3_UINT32
antlr38BitGetCharPosition(pANTLR3_INPUT_STREAM input)637 antlr38BitGetCharPosition	(pANTLR3_INPUT_STREAM input)
638 {
639     return  input->charPositionInLine;
640 }
641 
642 /** Set the current line number as understood by the input stream.
643  *
644  * \param input Input stream context pointer
645  * \param line  Line number to tell the input stream we are on
646  *
647  * \remark
648  *  This function does not change any pointers, it just allows the programmer to set the
649  *  line number according to some external criterion, such as finding a lexed directive
650  *  like: #nnn "file.c" for instance, such that error reporting and so on in is in sync
651  *  with some original source format.
652  */
653 static void
antlr38BitSetLine(pANTLR3_INPUT_STREAM input,ANTLR3_UINT32 line)654 antlr38BitSetLine		(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line)
655 {
656     input->line	= line;
657 }
658 
659 /** Set the current offset in the current line to be a particular setting.
660  *
661  * \param[in] input    Input stream context pointer
662  * \param[in] position New setting for current offset.
663  *
664  * \remark
665  * This does not set the actual pointers in the input stream, it is purely for reporting
666  * purposes and so on as per antlr38BitSetLine();
667  */
668 static void
antlr38BitSetCharPosition(pANTLR3_INPUT_STREAM input,ANTLR3_UINT32 position)669 antlr38BitSetCharPosition	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position)
670 {
671     input->charPositionInLine = position;
672 }
673 
674 /** Set the newline trigger character in the input stream to the supplied parameter.
675  *
676  * \param[in] input	    Input stream context pointer
677  * \param[in] newlineChar   Character to set to be the newline trigger.
678  *
679  * \remark
680  *  - The supplied newLineChar is in UTF32 encoding (which means ASCII and latin1 etc
681  *    are the same encodings), but the input stream catered to by this function is 8 bit
682  *    only, so it is up to the programmer to ensure that the character supplied is valid.
683  */
684 static void
antlr38BitSetNewLineChar(pANTLR3_INPUT_STREAM input,ANTLR3_UINT32 newlineChar)685 antlr38BitSetNewLineChar	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar)
686 {
687     input->newlineChar	= newlineChar;
688 }
689 
690 
691 /// \brief Common function to setup function interface for a UTF16 or UCS2 input stream.
692 ///
693 /// \param input Input stream context pointer
694 ///
695 /// \remark
696 ///  - Strictly speaking, there is no such thing as a UCS2 input stream as the term
697 ///    tends to confuse the notions of character encoding, unicode and so on. UCS2 is
698 ///    essentially UTF16 without any surrogates and so the standard UTF16
699 ///    input stream is able to handle it without any special code.
700 ///
701 void
antlr3UTF16SetupStream(pANTLR3_INPUT_STREAM input,ANTLR3_BOOLEAN machineBigEndian,ANTLR3_BOOLEAN inputBigEndian)702 antlr3UTF16SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian)
703 {
704     // Build a string factory for this stream. This is a UTF16 string factory which is a standard
705     // part of the ANTLR3 string. The string factory is then passed through the whole chain
706     // of lexer->parser->tree->treeparser and so on.
707     //
708     input->strFactory	= antlr3StringFactoryNew(input->encoding);
709 
710     // Generic API that does not care about endianess.
711     //
712     input->istream->index	    =  antlr3UTF16Index;            // Calculate current index in input stream, UTF16 based
713     input->substr		    =  antlr3UTF16Substr;	    // Return a string from the input stream
714     input->istream->seek	    =  antlr3UTF16Seek;		    // How to seek to a specific point in the stream
715 
716     // We must install different UTF16 routines according to whether the input
717     // is the same endianess as the machine we are executing upon or not. If it is not
718     // then we must install methods that can convert the endianess on the fly as they go
719     //
720 
721     switch (machineBigEndian)
722     {
723         case    ANTLR3_TRUE:
724 
725             // Machine is Big Endian, if the input is also then install the
726             // methods that do not access input by bytes and reverse them.
727             // Otherwise install endian aware methods.
728             //
729             if  (inputBigEndian == ANTLR3_TRUE)
730             {
731                 // Input is machine compatible
732                 //
733                 input->istream->consume	    =  antlr3UTF16Consume;	    // Consume the next UTF16 character in the buffer
734                 input->istream->_LA         =  antlr3UTF16LA;		    // Return the UTF32 character at offset n (1 based)
735             }
736             else
737             {
738                 // Need to use methods that know that the input is little endian
739                 //
740                 input->istream->consume	    =  antlr3UTF16ConsumeLE;	    // Consume the next UTF16 character in the buffer
741                 input->istream->_LA         =  antlr3UTF16LALE;		    // Return the UTF32 character at offset n (1 based)
742             }
743             break;
744 
745         case    ANTLR3_FALSE:
746 
747             // Machine is Little Endian, if the input is also then install the
748             // methods that do not access input by bytes and reverse them.
749             // Otherwise install endian aware methods.
750             //
751             if  (inputBigEndian == ANTLR3_FALSE)
752             {
753                 // Input is machine compatible
754                 //
755                 input->istream->consume	    =  antlr3UTF16Consume;	    // Consume the next UTF16 character in the buffer
756                 input->istream->_LA         =  antlr3UTF16LA;		    // Return the UTF32 character at offset n (1 based)
757             }
758             else
759             {
760                 // Need to use methods that know that the input is Big Endian
761                 //
762                 input->istream->consume	    =  antlr3UTF16ConsumeBE;	    // Consume the next UTF16 character in the buffer
763                 input->istream->_LA         =  antlr3UTF16LABE;		    // Return the UTF32 character at offset n (1 based)
764             }
765             break;
766     }
767 
768 
769     input->charByteSize		    = 2;			    // Size in bytes of characters in this stream.
770 
771 }
772 
773 /// \brief Consume the next character in a UTF16 input stream
774 ///
775 /// \param input Input stream context pointer
776 ///
777 static void
antlr3UTF16Consume(pANTLR3_INT_STREAM is)778 antlr3UTF16Consume(pANTLR3_INT_STREAM is)
779 {
780 	pANTLR3_INPUT_STREAM input;
781         UTF32   ch;
782         UTF32   ch2;
783 
784 	input   = ((pANTLR3_INPUT_STREAM) (is->super));
785 
786         // Buffer size is always in bytes
787         //
788 	if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
789 	{
790 		// Indicate one more character in this line
791 		//
792 		input->charPositionInLine++;
793 
794 		if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
795 		{
796 			// Reset for start of a new line of input
797 			//
798 			input->line++;
799 			input->charPositionInLine	= 0;
800 			input->currentLine		= (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
801 		}
802 
803 		// Increment to next character position, accounting for any surrogates
804 		//
805                 // Next char in natural machine byte order
806                 //
807                 ch  = *((UTF16*)input->nextChar);
808 
809                 // We consumed one 16 bit character
810                 //
811 		input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
812 
813                 // If we have a surrogate pair then we need to consume
814                 // a following valid LO surrogate.
815                 //
816                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
817 
818                     // If the 16 bits following the high surrogate are in the source buffer...
819                     //
820                     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
821                     {
822                         // Next character is in natural machine byte order
823                         //
824                         ch2 = *((UTF16*)input->nextChar);
825 
826                         // If it's a valid low surrogate, consume it
827                         //
828                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
829                         {
830                             // We consumed one 16 bit character
831                             //
832 		            input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
833                         }
834                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
835                         // it.
836                         //
837                     }
838                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
839                     // it because the buffer ended
840                     //
841                 }
842                 // Note that we did not check for an invalid low surrogate here, or that fact that the
843                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
844                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
845                 //
846 	}
847 }
848 
849 /// \brief Return the input element assuming an 8 bit ascii input
850 ///
851 /// \param[in] input Input stream context pointer
852 /// \param[in] la 1 based offset of next input stream element
853 ///
854 /// \return Next input character in internal ANTLR3 encoding (UTF32)
855 ///
856 static ANTLR3_UCHAR
antlr3UTF16LA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)857 antlr3UTF16LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
858 {
859 	pANTLR3_INPUT_STREAM input;
860         UTF32   ch;
861         UTF32   ch2;
862         UTF16   * nextChar;
863 
864         // Find the input interface and where we are currently pointing to
865         // in the input stream
866         //
867 	input       = ((pANTLR3_INPUT_STREAM) (is->super));
868         nextChar    = input->nextChar;
869 
870         // If a positive offset then advance forward, else retreat
871         //
872         if  (la >= 0)
873         {
874             while   (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
875             {
876                 // Advance our copy of the input pointer
877                 //
878                 // Next char in natural machine byte order
879                 //
880                 ch  = *nextChar++;
881 
882                 // If we have a surrogate pair then we need to consume
883                 // a following valid LO surrogate.
884                 //
885                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
886                 {
887                     // If the 16 bits following the high surrogate are in the source buffer...
888                     //
889                     if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
890                     {
891                         // Next character is in natural machine byte order
892                         //
893                         ch2 = *nextChar;
894 
895                         // If it's a valid low surrogate, consume it
896                         //
897                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
898                         {
899                             // We consumed one 16 bit character
900                             //
901 		            nextChar++;
902                         }
903                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
904                         // it.
905                         //
906                     }
907                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
908                     // it because the buffer ended
909                     //
910                 }
911                 // Note that we did not check for an invalid low surrogate here, or that fact that the
912                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
913                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
914                 //
915             }
916         }
917         else
918         {
919             // We need to go backwards from our input point
920             //
921             while   (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
922             {
923                 // Get the previous 16 bit character
924                 //
925                 ch = *--nextChar;
926 
927                 // If we found a low surrogate then go back one more character if
928                 // the hi surrogate is there
929                 //
930                 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
931                 {
932                     ch2 = *(nextChar-1);
933                     if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
934                     {
935                         // Yes, there is a high surrogate to match it so decrement one more and point to that
936                         //
937                         nextChar--;
938                     }
939                 }
940             }
941         }
942 
943         // Our local copy of nextChar is now pointing to either the correct character or end of file
944         //
945         // Input buffer size is always in bytes
946         //
947 	if	( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
948 	{
949 		return	ANTLR3_CHARSTREAM_EOF;
950 	}
951 	else
952 	{
953             // Pick up the next 16 character (native machine byte order)
954             //
955             ch = *nextChar++;
956 
957             // If we have a surrogate pair then we need to consume
958             // a following valid LO surrogate.
959             //
960             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
961             {
962                 // If the 16 bits following the high surrogate are in the source buffer...
963                 //
964                 if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
965                 {
966                     // Next character is in natural machine byte order
967                     //
968                     ch2 = *nextChar;
969 
970                     // If it's a valid low surrogate, consume it
971                     //
972                     if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
973                     {
974                         // Construct the UTF32 code point
975                         //
976                         ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
977 			    + (ch2 - UNI_SUR_LOW_START) + halfBase;
978                     }
979                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
980                     // it.
981                     //
982                 }
983                 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
984                 // it because the buffer ended
985                 //
986             }
987         }
988         return ch;
989 }
990 
991 
992 /// \brief Calculate the current index in the output stream.
993 /// \param[in] input Input stream context pointer
994 ///
995 static ANTLR3_MARKER
antlr3UTF16Index(pANTLR3_INT_STREAM is)996 antlr3UTF16Index(pANTLR3_INT_STREAM is)
997 {
998     pANTLR3_INPUT_STREAM input;
999 
1000     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1001 
1002     return  (ANTLR3_MARKER)(input->nextChar);
1003 }
1004 
1005 /// \brief Rewind the lexer input to the state specified by the supplied mark.
1006 ///
1007 /// \param[in] input Input stream context pointer
1008 ///
1009 /// \remark
1010 /// Assumes UTF16 input stream.
1011 ///
1012 static void
antlr3UTF16Seek(pANTLR3_INT_STREAM is,ANTLR3_MARKER seekPoint)1013 antlr3UTF16Seek	(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
1014 {
1015 	pANTLR3_INPUT_STREAM input;
1016 
1017 	input   = ((pANTLR3_INPUT_STREAM) is->super);
1018 
1019 	// If the requested seek point is less than the current
1020 	// input point, then we assume that we are resetting from a mark
1021 	// and do not need to scan, but can just set to there as rewind will
1022         // reset line numbers and so on.
1023 	//
1024 	if	(seekPoint <= (ANTLR3_MARKER)(input->nextChar))
1025 	{
1026 		input->nextChar	= (void *)seekPoint;
1027 	}
1028 	else
1029 	{
1030             // Call consume until we reach the asked for seek point or EOF
1031             //
1032             while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar)
1033 	    {
1034 		is->consume(is);
1035 	    }
1036 	}
1037 }
1038 /// \brief Return a substring of the UTF16 input stream in
1039 ///  newly allocated memory.
1040 ///
1041 /// \param input Input stream context pointer
1042 /// \param start Offset in input stream where the string starts
1043 /// \param stop  Offset in the input stream where the string ends.
1044 ///
1045 static pANTLR3_STRING
antlr3UTF16Substr(pANTLR3_INPUT_STREAM input,ANTLR3_MARKER start,ANTLR3_MARKER stop)1046 antlr3UTF16Substr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
1047 {
1048     return  input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/2) + 1);
1049 }
1050 
1051 /// \brief Consume the next character in a UTF16 input stream when the input is Little Endian and the machine is not
1052 /// Note that the UTF16 routines do not do any substantial verification of the input stream as for performance
1053 /// sake, we assume it is validly encoded. So if a low surrogate is found at the curent input position then we
1054 /// just consume it. Surrogate pairs should be seen as Hi, Lo. So if we have a Lo first, then the input stream
1055 /// is fubar but we just ignore that.
1056 ///
1057 /// \param input Input stream context pointer
1058 ///
1059 static void
antlr3UTF16ConsumeLE(pANTLR3_INT_STREAM is)1060 antlr3UTF16ConsumeLE(pANTLR3_INT_STREAM is)
1061 {
1062 	pANTLR3_INPUT_STREAM input;
1063         UTF32   ch;
1064         UTF32   ch2;
1065 
1066 	input   = ((pANTLR3_INPUT_STREAM) (is->super));
1067 
1068         // Buffer size is always in bytes
1069         //
1070 	if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1071 	{
1072 		// Indicate one more character in this line
1073 		//
1074 		input->charPositionInLine++;
1075 
1076 		if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
1077 		{
1078 			// Reset for start of a new line of input
1079 			//
1080 			input->line++;
1081 			input->charPositionInLine	= 0;
1082 			input->currentLine		= (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1083 		}
1084 
1085 		// Increment to next character position, accounting for any surrogates
1086 		//
1087                 // Next char in litle endian form
1088                 //
1089                 ch  = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8);
1090 
1091                 // We consumed one 16 bit character
1092                 //
1093 		input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1094 
1095                 // If we have a surrogate pair then we need to consume
1096                 // a following valid LO surrogate.
1097                 //
1098                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
1099 
1100                     // If the 16 bits following the high surrogate are in the source buffer...
1101                     //
1102                     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1103                     {
1104                         ch2 = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8);
1105 
1106                         // If it's a valid low surrogate, consume it
1107                         //
1108                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1109                         {
1110                             // We consumed one 16 bit character
1111                             //
1112 		            input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1113                         }
1114                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1115                         // it.
1116                         //
1117                     }
1118                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1119                     // it because the buffer ended
1120                     //
1121                 }
1122                 // Note that we did not check for an invalid low surrogate here, or that fact that the
1123                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1124                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1125                 //
1126 	}
1127 }
1128 
1129 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
1130 ///
1131 /// \param[in] input Input stream context pointer
1132 /// \param[in] la 1 based offset of next input stream element
1133 ///
1134 /// \return Next input character in internal ANTLR3 encoding (UTF32)
1135 ///
1136 static ANTLR3_UCHAR
antlr3UTF16LALE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1137 antlr3UTF16LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1138 {
1139 	pANTLR3_INPUT_STREAM input;
1140         UTF32           ch;
1141         UTF32           ch2;
1142         pANTLR3_UCHAR   nextChar;
1143 
1144         // Find the input interface and where we are currently pointing to
1145         // in the input stream
1146         //
1147 	input       = ((pANTLR3_INPUT_STREAM) (is->super));
1148         nextChar    = input->nextChar;
1149 
1150         // If a positive offset then advance forward, else retreat
1151         //
1152         if  (la >= 0)
1153         {
1154             while   (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
1155             {
1156                 // Advance our copy of the input pointer
1157                 //
1158                 // Next char in Little Endian byte order
1159                 //
1160                 ch  = (*nextChar) + (*(nextChar+1) << 8);
1161                 nextChar += 2;
1162 
1163                 // If we have a surrogate pair then we need to consume
1164                 // a following valid LO surrogate.
1165                 //
1166                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1167                 {
1168                     // If the 16 bits following the high surrogate are in the source buffer...
1169                     //
1170                     if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1171                     {
1172                         // Next character is in little endian byte order
1173                         //
1174                         ch2 = (*nextChar) + (*(nextChar+1) << 8);
1175 
1176                         // If it's a valid low surrogate, consume it
1177                         //
1178                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1179                         {
1180                             // We consumed one 16 bit character
1181                             //
1182 		            nextChar += 2;
1183                         }
1184                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1185                         // it.
1186                         //
1187                     }
1188                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1189                     // it because the buffer ended
1190                     //
1191                 }
1192                 // Note that we did not check for an invalid low surrogate here, or that fact that the
1193                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1194                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1195                 //
1196             }
1197         }
1198         else
1199         {
1200             // We need to go backwards from our input point
1201             //
1202             while   (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
1203             {
1204                 // Get the previous 16 bit character
1205                 //
1206                 ch = (*nextChar - 2) + ((*nextChar -1) << 8);
1207                 nextChar -= 2;
1208 
1209                 // If we found a low surrogate then go back one more character if
1210                 // the hi surrogate is there
1211                 //
1212                 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
1213                 {
1214                     ch2 = (*nextChar - 2) + ((*nextChar -1) << 8);
1215                     if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
1216                     {
1217                         // Yes, there is a high surrogate to match it so decrement one more and point to that
1218                         //
1219                         nextChar -=2;
1220                     }
1221                 }
1222             }
1223         }
1224 
1225         // Our local copy of nextChar is now pointing to either the correct character or end of file
1226         //
1227         // Input buffer size is always in bytes
1228         //
1229 	if	( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1230 	{
1231 		return	ANTLR3_CHARSTREAM_EOF;
1232 	}
1233 	else
1234 	{
1235             // Pick up the next 16 character (little endian byte order)
1236             //
1237             ch = (*nextChar) + (*(nextChar+1) << 8);
1238             nextChar += 2;
1239 
1240             // If we have a surrogate pair then we need to consume
1241             // a following valid LO surrogate.
1242             //
1243             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1244             {
1245                 // If the 16 bits following the high surrogate are in the source buffer...
1246                 //
1247                 if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1248                 {
1249                     // Next character is in little endian byte order
1250                     //
1251                     ch2 = (*nextChar) + (*(nextChar+1) << 8);
1252 
1253                     // If it's a valid low surrogate, consume it
1254                     //
1255                     if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1256                     {
1257                         // Construct the UTF32 code point
1258                         //
1259                         ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
1260 			    + (ch2 - UNI_SUR_LOW_START) + halfBase;
1261                     }
1262                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1263                     // it.
1264                     //
1265                 }
1266                 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1267                 // it because the buffer ended
1268                 //
1269             }
1270         }
1271         return ch;
1272 }
1273 
1274 /// \brief Consume the next character in a UTF16 input stream when the input is Big Endian and the machine is not
1275 ///
1276 /// \param input Input stream context pointer
1277 ///
1278 static void
antlr3UTF16ConsumeBE(pANTLR3_INT_STREAM is)1279 antlr3UTF16ConsumeBE(pANTLR3_INT_STREAM is)
1280 {
1281 	pANTLR3_INPUT_STREAM input;
1282         UTF32   ch;
1283         UTF32   ch2;
1284 
1285 	input   = ((pANTLR3_INPUT_STREAM) (is->super));
1286 
1287         // Buffer size is always in bytes
1288         //
1289 	if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1290 	{
1291 		// Indicate one more character in this line
1292 		//
1293 		input->charPositionInLine++;
1294 
1295 		if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
1296 		{
1297 			// Reset for start of a new line of input
1298 			//
1299 			input->line++;
1300 			input->charPositionInLine	= 0;
1301 			input->currentLine		= (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1302 		}
1303 
1304 		// Increment to next character position, accounting for any surrogates
1305 		//
1306                 // Next char in big endian form
1307                 //
1308                 ch  = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8);
1309 
1310                 // We consumed one 16 bit character
1311                 //
1312 		input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1313 
1314                 // If we have a surrogate pair then we need to consume
1315                 // a following valid LO surrogate.
1316                 //
1317                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
1318 
1319                     // If the 16 bits following the high surrogate are in the source buffer...
1320                     //
1321                     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1322                     {
1323                         // Big endian
1324                         //
1325                         ch2 = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8);
1326 
1327                         // If it's a valid low surrogate, consume it
1328                         //
1329                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1330                         {
1331                             // We consumed one 16 bit character
1332                             //
1333 		            input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1334                         }
1335                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1336                         // it.
1337                         //
1338                     }
1339                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1340                     // it because the buffer ended
1341                     //
1342                 }
1343                 // Note that we did not check for an invalid low surrogate here, or that fact that the
1344                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1345                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1346                 //
1347 	}
1348 }
1349 
1350 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
1351 ///
1352 /// \param[in] input Input stream context pointer
1353 /// \param[in] la 1 based offset of next input stream element
1354 ///
1355 /// \return Next input character in internal ANTLR3 encoding (UTF32)
1356 ///
1357 static ANTLR3_UCHAR
antlr3UTF16LABE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1358 antlr3UTF16LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1359 {
1360 	pANTLR3_INPUT_STREAM input;
1361         UTF32           ch;
1362         UTF32           ch2;
1363         pANTLR3_UCHAR   nextChar;
1364 
1365         // Find the input interface and where we are currently pointing to
1366         // in the input stream
1367         //
1368 	input       = ((pANTLR3_INPUT_STREAM) (is->super));
1369         nextChar    = input->nextChar;
1370 
1371         // If a positive offset then advance forward, else retreat
1372         //
1373         if  (la >= 0)
1374         {
1375             while   (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
1376             {
1377                 // Advance our copy of the input pointer
1378                 //
1379                 // Next char in Big Endian byte order
1380                 //
1381                 ch  = ((*nextChar) << 8) + *(nextChar+1);
1382                 nextChar += 2;
1383 
1384                 // If we have a surrogate pair then we need to consume
1385                 // a following valid LO surrogate.
1386                 //
1387                 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1388                 {
1389                     // If the 16 bits following the high surrogate are in the source buffer...
1390                     //
1391                     if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1392                     {
1393                         // Next character is in big endian byte order
1394                         //
1395                         ch2 = ((*nextChar) << 8) + *(nextChar+1);
1396 
1397                         // If it's a valid low surrogate, consume it
1398                         //
1399                         if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1400                         {
1401                             // We consumed one 16 bit character
1402                             //
1403 		            nextChar += 2;
1404                         }
1405                         // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1406                         // it.
1407                         //
1408                     }
1409                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1410                     // it because the buffer ended
1411                     //
1412                 }
1413                 // Note that we did not check for an invalid low surrogate here, or that fact that the
1414                 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1415                 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1416                 //
1417             }
1418         }
1419         else
1420         {
1421             // We need to go backwards from our input point
1422             //
1423             while   (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
1424             {
1425                 // Get the previous 16 bit character
1426                 //
1427                 ch = ((*nextChar - 2) << 8) + (*nextChar -1);
1428                 nextChar -= 2;
1429 
1430                 // If we found a low surrogate then go back one more character if
1431                 // the hi surrogate is there
1432                 //
1433                 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
1434                 {
1435                     ch2 = ((*nextChar - 2) << 8) + (*nextChar -1);
1436                     if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
1437                     {
1438                         // Yes, there is a high surrogate to match it so decrement one more and point to that
1439                         //
1440                         nextChar -=2;
1441                     }
1442                 }
1443             }
1444         }
1445 
1446         // Our local copy of nextChar is now pointing to either the correct character or end of file
1447         //
1448         // Input buffer size is always in bytes
1449         //
1450 	if	( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1451 	{
1452 		return	ANTLR3_CHARSTREAM_EOF;
1453 	}
1454 	else
1455 	{
1456             // Pick up the next 16 character (big endian byte order)
1457             //
1458             ch = ((*nextChar) << 8) + *(nextChar+1);
1459             nextChar += 2;
1460 
1461             // If we have a surrogate pair then we need to consume
1462             // a following valid LO surrogate.
1463             //
1464             if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1465             {
1466                 // If the 16 bits following the high surrogate are in the source buffer...
1467                 //
1468                 if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1469                 {
1470                     // Next character is in big endian byte order
1471                     //
1472                     ch2 = ((*nextChar) << 8) + *(nextChar+1);
1473 
1474                     // If it's a valid low surrogate, consume it
1475                     //
1476                     if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1477                     {
1478                         // Construct the UTF32 code point
1479                         //
1480                         ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
1481 			    + (ch2 - UNI_SUR_LOW_START) + halfBase;
1482                     }
1483                     // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1484                     // it.
1485                     //
1486                 }
1487                 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1488                 // it because the buffer ended
1489                 //
1490             }
1491         }
1492         return ch;
1493 }
1494 
1495 /// \brief Common function to set up the function interface for a UTF32 input stream.
1496 ///
1497 /// \param input Input stream context pointer
1498 ///
1499 void
antlr3UTF32SetupStream(pANTLR3_INPUT_STREAM input,ANTLR3_BOOLEAN machineBigEndian,ANTLR3_BOOLEAN inputBigEndian)1500 antlr3UTF32SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian)
1501 {
1502     // Build a string factory for this stream. This is a UTF32 string factory which is a standard
1503     // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
1504     // and so on.
1505     //
1506     input->strFactory	= antlr3StringFactoryNew(input->encoding);
1507 
1508     // Generic API that does not care about endianess.
1509     //
1510     input->istream->index	    =  antlr3UTF32Index;            // Calculate current index in input stream, UTF16 based
1511     input->substr		    =  antlr3UTF32Substr;	    // Return a string from the input stream
1512     input->istream->seek	    =  antlr3UTF32Seek;		    // How to seek to a specific point in the stream
1513     input->istream->consume	    =  antlr3UTF32Consume;	    // Consume the next UTF32 character in the buffer
1514 
1515     // We must install different UTF32 LA routines according to whether the input
1516     // is the same endianess as the machine we are executing upon or not. If it is not
1517     // then we must install methods that can convert the endianess on the fly as they go
1518     //
1519     switch (machineBigEndian)
1520     {
1521         case    ANTLR3_TRUE:
1522 
1523             // Machine is Big Endian, if the input is also then install the
1524             // methods that do not access input by bytes and reverse them.
1525             // Otherwise install endian aware methods.
1526             //
1527             if  (inputBigEndian == ANTLR3_TRUE)
1528             {
1529                 // Input is machine compatible
1530                 //
1531                 input->istream->_LA         =  antlr3UTF32LA;		    // Return the UTF32 character at offset n (1 based)
1532             }
1533             else
1534             {
1535                 // Need to use methods that know that the input is little endian
1536                 //
1537                 input->istream->_LA         =  antlr3UTF32LALE;		    // Return the UTF32 character at offset n (1 based)
1538             }
1539             break;
1540 
1541         case    ANTLR3_FALSE:
1542 
1543             // Machine is Little Endian, if the input is also then install the
1544             // methods that do not access input by bytes and reverse them.
1545             // Otherwise install endian aware methods.
1546             //
1547             if  (inputBigEndian == ANTLR3_FALSE)
1548             {
1549                 // Input is machine compatible
1550                 //
1551                 input->istream->_LA         =  antlr3UTF32LA;		    // Return the UTF32 character at offset n (1 based)
1552             }
1553             else
1554             {
1555                 // Need to use methods that know that the input is Big Endian
1556                 //
1557                 input->istream->_LA         =  antlr3UTF32LABE;		    // Return the UTF32 character at offset n (1 based)
1558             }
1559             break;
1560     }
1561 
1562     input->charByteSize		    = 4;			    // Size in bytes of characters in this stream.
1563 }
1564 
1565 /** \brief Consume the next character in a UTF32 input stream
1566  *
1567  * \param input Input stream context pointer
1568  */
1569 static void
antlr3UTF32Consume(pANTLR3_INT_STREAM is)1570 antlr3UTF32Consume(pANTLR3_INT_STREAM is)
1571 {
1572     pANTLR3_INPUT_STREAM input;
1573 
1574     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1575 
1576     // SizeBuf is always in bytes
1577     //
1578     if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1579     {
1580 	/* Indicate one more character in this line
1581 	 */
1582 	input->charPositionInLine++;
1583 
1584 	if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar)) == input->newlineChar)
1585 	{
1586 	    /* Reset for start of a new line of input
1587 	     */
1588 	    input->line++;
1589 	    input->charPositionInLine	= 0;
1590 	    input->currentLine		= (void *)(((pANTLR3_UINT32)input->nextChar) + 1);
1591 	}
1592 
1593 	/* Increment to next character position
1594 	 */
1595 	input->nextChar = (void *)(((pANTLR3_UINT32)input->nextChar) + 1);
1596     }
1597 }
1598 
1599 /// \brief Calculate the current index in the output stream.
1600 /// \param[in] input Input stream context pointer
1601 ///
1602 static ANTLR3_MARKER
antlr3UTF32Index(pANTLR3_INT_STREAM is)1603 antlr3UTF32Index(pANTLR3_INT_STREAM is)
1604 {
1605     pANTLR3_INPUT_STREAM input;
1606 
1607     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1608 
1609     return  (ANTLR3_MARKER)(input->nextChar);
1610 }
1611 
1612 /// \brief Return a substring of the UTF16 input stream in
1613 ///  newly allocated memory.
1614 ///
1615 /// \param input Input stream context pointer
1616 /// \param start Offset in input stream where the string starts
1617 /// \param stop  Offset in the input stream where the string ends.
1618 ///
1619 static pANTLR3_STRING
antlr3UTF32Substr(pANTLR3_INPUT_STREAM input,ANTLR3_MARKER start,ANTLR3_MARKER stop)1620 antlr3UTF32Substr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
1621 {
1622     return  input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/4) + 1);
1623 }
1624 
1625 /// \brief Rewind the lexer input to the state specified by the supplied mark.
1626 ///
1627 /// \param[in] input Input stream context pointer
1628 ///
1629 /// \remark
1630 /// Assumes UTF32 input stream.
1631 ///
1632 static void
antlr3UTF32Seek(pANTLR3_INT_STREAM is,ANTLR3_MARKER seekPoint)1633 antlr3UTF32Seek	(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
1634 {
1635 	pANTLR3_INPUT_STREAM input;
1636 
1637 	input   = ((pANTLR3_INPUT_STREAM) is->super);
1638 
1639 	// If the requested seek point is less than the current
1640 	// input point, then we assume that we are resetting from a mark
1641 	// and do not need to scan, but can just set to there as rewind will
1642         // reset line numbers and so on.
1643 	//
1644 	if	(seekPoint <= (ANTLR3_MARKER)(input->nextChar))
1645 	{
1646 		input->nextChar	= (void *)seekPoint;
1647 	}
1648 	else
1649 	{
1650             // Call consume until we reach the asked for seek point or EOF
1651             //
1652             while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar)
1653 	    {
1654 		is->consume(is);
1655 	    }
1656 	}
1657 }
1658 
1659 /** \brief Return the input element assuming a UTF32 input in natural machine byte order
1660  *
1661  * \param[in] input Input stream context pointer
1662  * \param[in] la 1 based offset of next input stream element
1663  *
1664  * \return Next input character in internal ANTLR3 encoding (UTF32)
1665  */
1666 static ANTLR3_UCHAR
antlr3UTF32LA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1667 antlr3UTF32LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1668 {
1669     pANTLR3_INPUT_STREAM input;
1670 
1671     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1672 
1673     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1674     {
1675 		return	ANTLR3_CHARSTREAM_EOF;
1676     }
1677     else
1678     {
1679 		return	(ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1680     }
1681 }
1682 
1683 /** \brief Return the input element assuming a UTF32 input in little endian byte order
1684  *
1685  * \param[in] input Input stream context pointer
1686  * \param[in] la 1 based offset of next input stream element
1687  *
1688  * \return Next input character in internal ANTLR3 encoding (UTF32)
1689  */
1690 static ANTLR3_UCHAR
antlr3UTF32LALE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1691 antlr3UTF32LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1692 {
1693     pANTLR3_INPUT_STREAM input;
1694 
1695     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1696 
1697     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1698     {
1699 		return	ANTLR3_CHARSTREAM_EOF;
1700     }
1701     else
1702     {
1703         ANTLR3_UCHAR   c;
1704 
1705         c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1706 
1707         // Swap Endianess to Big Endian
1708         //
1709         return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24);
1710     }
1711 }
1712 
1713 /** \brief Return the input element assuming a UTF32 input in big endian byte order
1714  *
1715  * \param[in] input Input stream context pointer
1716  * \param[in] la 1 based offset of next input stream element
1717  *
1718  * \return Next input character in internal ANTLR3 encoding (UTF32)
1719  * \remark This is the same code as LE version but seprated in case there are better optimisations fo rendinan swap
1720  */
1721 static ANTLR3_UCHAR
antlr3UTF32LABE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1722 antlr3UTF32LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1723 {
1724     pANTLR3_INPUT_STREAM input;
1725 
1726     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1727 
1728     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1729     {
1730 		return	ANTLR3_CHARSTREAM_EOF;
1731     }
1732     else
1733     {
1734         ANTLR3_UCHAR   c;
1735 
1736         c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1737 
1738         // Swap Endianess to Little Endian
1739         //
1740         return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24);
1741     }
1742 }
1743 
1744 
1745 /// \brief Common function to setup function interface for a UTF8 input stream.
1746 ///
1747 /// \param input Input stream context pointer
1748 ///
1749 void
antlr3UTF8SetupStream(pANTLR3_INPUT_STREAM input)1750 antlr3UTF8SetupStream	(pANTLR3_INPUT_STREAM input)
1751 {
1752     // Build a string factory for this stream. This is a UTF16 string factory which is a standard
1753     // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
1754     // and so on.
1755     //
1756     input->strFactory	= antlr3StringFactoryNew(input->encoding);
1757 
1758     // Generic API that does not care about endianess.
1759     //
1760     input->istream->consume	= antlr3UTF8Consume;	// Consume the next UTF32 character in the buffer
1761     input->istream->_LA         = antlr3UTF8LA;         // Return the UTF32 character at offset n (1 based)
1762     input->charByteSize		= 0;	                // Size in bytes of characters in this stream.
1763 }
1764 
1765 // ------------------------------------------------------
1766 // Following is from Unicode.org (see antlr3convertutf.c)
1767 //
1768 
1769 /// Index into the table below with the first byte of a UTF-8 sequence to
1770 /// get the number of trailing bytes that are supposed to follow it.
1771 /// Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
1772 /// left as-is for anyone who may want to do such conversion, which was
1773 /// allowed in earlier algorithms.
1774 ///
1775 static const ANTLR3_UINT32 trailingBytesForUTF8[256] = {
1776     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1777     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1778     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1779     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1780     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1781     0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1782     1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1783     2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
1784 };
1785 
1786 /// Magic values subtracted from a buffer value during UTF8 conversion.
1787 /// This table contains as many values as there might be trailing bytes
1788 /// in a UTF-8 sequence.
1789 ///
1790 static const UTF32 offsetsFromUTF8[6] =
1791     {   0x00000000UL, 0x00003080UL, 0x000E2080UL,
1792 	0x03C82080UL, 0xFA082080UL, 0x82082080UL
1793     };
1794 
1795 // End of Unicode.org tables
1796 // -------------------------
1797 
1798 
1799 /** \brief Consume the next character in a UTF8 input stream
1800  *
1801  * \param input Input stream context pointer
1802  */
1803 static void
antlr3UTF8Consume(pANTLR3_INT_STREAM is)1804 antlr3UTF8Consume(pANTLR3_INT_STREAM is)
1805 {
1806     pANTLR3_INPUT_STREAM    input;
1807     ANTLR3_UINT32           extraBytesToRead;
1808     ANTLR3_UCHAR            ch;
1809     pANTLR3_UINT8           nextChar;
1810 
1811     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1812 
1813     nextChar = input->nextChar;
1814 
1815     if	(nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1816     {
1817 	// Indicate one more character in this line
1818 	//
1819 	input->charPositionInLine++;
1820 
1821         // Are there more bytes needed to make up the whole thing?
1822         //
1823         extraBytesToRead = trailingBytesForUTF8[*nextChar];
1824 
1825         if	(nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1826         {
1827             input->nextChar = (((pANTLR3_UINT8)input->data) + input->sizeBuf);
1828             return;
1829         }
1830 
1831         // Cases deliberately fall through (see note A in antlrconvertutf.c)
1832         // Legal UTF8 is only 4 bytes but 6 bytes could be used in old UTF8 so
1833         // we allow it.
1834         //
1835         ch  = 0;
1836        	switch (extraBytesToRead) {
1837 	    case 5: ch += *nextChar++; ch <<= 6;
1838 	    case 4: ch += *nextChar++; ch <<= 6;
1839 	    case 3: ch += *nextChar++; ch <<= 6;
1840 	    case 2: ch += *nextChar++; ch <<= 6;
1841 	    case 1: ch += *nextChar++; ch <<= 6;
1842 	    case 0: ch += *nextChar++;
1843 	}
1844 
1845         // Magically correct the input value
1846         //
1847 	ch -= offsetsFromUTF8[extraBytesToRead];
1848 	if  (ch == input->newlineChar)
1849 	{
1850 	    /* Reset for start of a new line of input
1851 	     */
1852 	    input->line++;
1853 	    input->charPositionInLine	= 0;
1854 	    input->currentLine		= (void *)nextChar;
1855 	}
1856 
1857         // Update input pointer
1858         //
1859         input->nextChar = nextChar;
1860     }
1861 }
1862 /** \brief Return the input element assuming a UTF8 input
1863  *
1864  * \param[in] input Input stream context pointer
1865  * \param[in] la 1 based offset of next input stream element
1866  *
1867  * \return Next input character in internal ANTLR3 encoding (UTF32)
1868  */
1869 static ANTLR3_UCHAR
antlr3UTF8LA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1870 antlr3UTF8LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1871 {
1872     pANTLR3_INPUT_STREAM    input;
1873     ANTLR3_UINT32           extraBytesToRead;
1874     ANTLR3_UCHAR            ch;
1875     pANTLR3_UINT8           nextChar;
1876 
1877     input   = ((pANTLR3_INPUT_STREAM) (is->super));
1878 
1879     nextChar = input->nextChar;
1880 
1881     // Do we need to traverse forwards or backwards?
1882     // - LA(0) is treated as LA(1) and we assume that the nextChar is
1883     //   already positioned.
1884     // - LA(n+) ; n>1 means we must traverse forward n-1 characters catering for UTF8 encoding
1885     // - LA(-n) means we must traverse backwards n chracters
1886     //
1887     if (la > 1) {
1888 
1889         // Make sure that we have at least one character left before trying to
1890         // loop through the buffer.
1891         //
1892         if	(nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1893         {
1894             // Now traverse n-1 characters forward
1895             //
1896             while (--la > 0)
1897             {
1898                 // Does the next character require trailing bytes?
1899                 // If so advance the pointer by that many bytes as well as advancing
1900                 // one position for what will be at least a single byte character.
1901                 //
1902                 nextChar += trailingBytesForUTF8[*nextChar] + 1;
1903 
1904                 // Does that calculation take us past the byte length of the buffer?
1905                 //
1906                 if	(nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1907                 {
1908                     return ANTLR3_CHARSTREAM_EOF;
1909                 }
1910             }
1911         }
1912         else
1913         {
1914             return ANTLR3_CHARSTREAM_EOF;
1915         }
1916     }
1917     else
1918     {
1919         // LA is negative so we decrease the pointer by n character positions
1920         //
1921         while   (nextChar > (pANTLR3_UINT8)input->data && la++ < 0)
1922         {
1923             // Traversing backwards in UTF8 means decermenting by one
1924             // then continuing to decrement while ever a character pattern
1925             // is flagged as being a trailing byte of an encoded code point.
1926             // Trailing UTF8 bytes always start with 10 in binary. We assumne that
1927             // the UTF8 is well formed and do not check boundary conditions
1928             //
1929             nextChar--;
1930             while ((*nextChar & 0xC0) == 0x80)
1931             {
1932                 nextChar--;
1933             }
1934         }
1935     }
1936 
1937     // nextChar is now pointing at the UTF8 encoded character that we need to
1938     // decode and return.
1939     //
1940     // Are there more bytes needed to make up the whole thing?
1941     //
1942     extraBytesToRead = trailingBytesForUTF8[*nextChar];
1943     if	(nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1944     {
1945         return ANTLR3_CHARSTREAM_EOF;
1946     }
1947 
1948     // Cases deliberately fall through (see note A in antlrconvertutf.c)
1949     //
1950     ch  = 0;
1951     switch (extraBytesToRead) {
1952             case 5: ch += *nextChar++; ch <<= 6;
1953             case 4: ch += *nextChar++; ch <<= 6;
1954             case 3: ch += *nextChar++; ch <<= 6;
1955             case 2: ch += *nextChar++; ch <<= 6;
1956             case 1: ch += *nextChar++; ch <<= 6;
1957             case 0: ch += *nextChar++;
1958     }
1959 
1960     // Magically correct the input value
1961     //
1962     ch -= offsetsFromUTF8[extraBytesToRead];
1963 
1964     return ch;
1965 }
1966 
1967 // EBCDIC to ASCII conversion table
1968 //
1969 // This for EBCDIC EDF04 translated to ISO-8859.1 which is the usually accepted POSIX
1970 // translation and the character tables are published all over the interweb.
1971 //
1972 const ANTLR3_UCHAR e2a[256] =
1973 {
1974     0x00, 0x01, 0x02, 0x03, 0x85, 0x09, 0x86, 0x7f,
1975     0x87, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1976     0x10, 0x11, 0x12, 0x13, 0x8f, 0x0a, 0x08, 0x97,
1977     0x18, 0x19, 0x9c, 0x9d, 0x1c, 0x1d, 0x1e, 0x1f,
1978     0x80, 0x81, 0x82, 0x83, 0x84, 0x92, 0x17, 0x1b,
1979     0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07,
1980     0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04,
1981     0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a,
1982     0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5,
1983     0xe7, 0xf1, 0x60, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
1984     0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef,
1985     0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x9f,
1986     0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5,
1987     0xc7, 0xd1, 0x5e, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
1988     0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf,
1989     0xcc, 0xa8, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
1990     0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
1991     0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1,
1992     0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70,
1993     0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4,
1994     0xb5, 0xaf, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
1995     0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0xdd, 0xde, 0xae,
1996     0xa2, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc,
1997     0xbd, 0xbe, 0xac, 0x5b, 0x5c, 0x5d, 0xb4, 0xd7,
1998     0xf9, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
1999     0x48, 0x49, 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5,
2000     0xa6, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
2001     0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xdb, 0xfa, 0xff,
2002     0xd9, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
2003     0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5,
2004     0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
2005     0x38, 0x39, 0xb3, 0x7b, 0xdc, 0x7d, 0xda, 0x7e
2006 };
2007 
2008 /// \brief Common function to setup function interface for a EBCDIC input stream.
2009 ///
2010 /// \param input Input stream context pointer
2011 ///
2012 void
antlr3EBCDICSetupStream(pANTLR3_INPUT_STREAM input)2013 antlr3EBCDICSetupStream	(pANTLR3_INPUT_STREAM input)
2014 {
2015     // EBCDIC streams can use the standard 8 bit string factory
2016     //
2017     input->strFactory	= antlr3StringFactoryNew(input->encoding);
2018 
2019     // Generic API that does not care about endianess.
2020     //
2021     input->istream->_LA         = antlr3EBCDICLA;       // Return the UTF32 character at offset n (1 based)
2022     input->charByteSize		= 1;	                // Size in bytes of characters in this stream.
2023 }
2024 
2025 /// \brief Return the input element assuming an 8 bit EBCDIC input
2026 ///
2027 /// \param[in] input Input stream context pointer
2028 /// \param[in] la 1 based offset of next input stream element
2029 ///
2030 /// \return Next input character in internal ANTLR3 encoding (UTF32) after translation
2031 ///         from EBCDIC to ASCII
2032 ///
2033 static ANTLR3_UCHAR
antlr3EBCDICLA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)2034 antlr3EBCDICLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
2035 {
2036     pANTLR3_INPUT_STREAM input;
2037 
2038     input   = ((pANTLR3_INPUT_STREAM) (is->super));
2039 
2040     if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
2041     {
2042         return	ANTLR3_CHARSTREAM_EOF;
2043     }
2044     else
2045     {
2046         // Translate the required character via the constant conversion table
2047         //
2048         return	e2a[(*((pANTLR3_UINT8)input->nextChar + la - 1))];
2049     }
2050 }