1 /// \file
2 /// Base functions to initialize and manipulate any input stream
3 ///
4
5 // [The "BSD licence"]
6 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
7 // http://www.temporal-wave.com
8 // http://www.linkedin.com/in/jimidle
9 //
10 // All rights reserved.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions
14 // are met:
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 // 3. The name of the author may not be used to endorse or promote products
21 // derived from this software without specific prior written permission.
22 //
23 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
24 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
25 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
26 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
27 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
28 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
32 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
34 #include <antlr3input.h>
35
36 // -----------------------------------
37 // Generic 8 bit input such as latin-1
38 //
39
40 // 8Bit INT Stream API
41 //
42 static void antlr38BitConsume (pANTLR3_INT_STREAM is);
43 static ANTLR3_UCHAR antlr38BitLA (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
44 static ANTLR3_UCHAR antlr38BitLA_ucase (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
45 static ANTLR3_MARKER antlr38BitIndex (pANTLR3_INT_STREAM is);
46 static ANTLR3_MARKER antlr38BitMark (pANTLR3_INT_STREAM is);
47 static void antlr38BitRewind (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
48 static void antlr38BitRewindLast (pANTLR3_INT_STREAM is);
49 static void antlr38BitRelease (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
50 static void antlr38BitSeek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
51 static pANTLR3_STRING antlr38BitGetSourceName (pANTLR3_INT_STREAM is);
52
53 // 8Bit Charstream API functions
54 //
55 static void antlr3InputClose (pANTLR3_INPUT_STREAM input);
56 static void antlr3InputReset (pANTLR3_INPUT_STREAM input);
57 static void antlr38BitReuse (pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name);
58 static void * antlr38BitLT (pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt);
59 static ANTLR3_UINT32 antlr38BitSize (pANTLR3_INPUT_STREAM input);
60 static pANTLR3_STRING antlr38BitSubstr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
61 static ANTLR3_UINT32 antlr38BitGetLine (pANTLR3_INPUT_STREAM input);
62 static void * antlr38BitGetLineBuf (pANTLR3_INPUT_STREAM input);
63 static ANTLR3_UINT32 antlr38BitGetCharPosition (pANTLR3_INPUT_STREAM input);
64 static void antlr38BitSetLine (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line);
65 static void antlr38BitSetCharPosition (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position);
66 static void antlr38BitSetNewLineChar (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar);
67 static void antlr38BitSetUcaseLA (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag);
68
69 // -----------------------------------
70 // UTF16 (also covers UCS2)
71 //
72 // INT Stream API
73 //
74 static void antlr3UTF16Consume (pANTLR3_INT_STREAM is);
75 static ANTLR3_UCHAR antlr3UTF16LA (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
76 static void antlr3UTF16ConsumeLE (pANTLR3_INT_STREAM is);
77 static ANTLR3_UCHAR antlr3UTF16LALE (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
78 static void antlr3UTF16ConsumeBE (pANTLR3_INT_STREAM is);
79 static ANTLR3_UCHAR antlr3UTF16LABE (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
80 static ANTLR3_MARKER antlr3UTF16Index (pANTLR3_INT_STREAM is);
81 static void antlr3UTF16Seek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
82
83 // UTF16 Charstream API functions
84 //
85 static pANTLR3_STRING antlr3UTF16Substr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
86
87 // -----------------------------------
88 // UTF32 (also covers UCS4)
89 //
90 // INT Stream API
91 //
92 static void antlr3UTF32Consume (pANTLR3_INT_STREAM is);
93 static ANTLR3_UCHAR antlr3UTF32LA (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
94 static ANTLR3_UCHAR antlr3UTF32LALE (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
95 static ANTLR3_UCHAR antlr3UTF32LABE (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
96 static ANTLR3_MARKER antlr3UTF32Index (pANTLR3_INT_STREAM is);
97 static void antlr3UTF32Seek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
98
99 // UTF32 Charstream API functions
100 //
101 static pANTLR3_STRING antlr3UTF32Substr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
102
103 // ------------------------------------
104 // UTF-8
105 //
106 static void antlr3UTF8Consume (pANTLR3_INT_STREAM is);
107 static ANTLR3_UCHAR antlr3UTF8LA (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
108
109 // ------------------------------------
110 // EBCDIC
111 //
112 static ANTLR3_UCHAR antlr3EBCDICLA (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
113
114 /// \brief Common function to setup function interface for an 8 bit input stream.
115 ///
116 /// \param input Input stream context pointer
117 ///
118 /// \remark
119 /// - Many of the 8 bit oriented file stream handling functions will be usable
120 /// by any or at least some, other input streams. Therefore it is perfectly acceptable
121 /// to call this function to install the 8Bit handler then override just those functions
122 /// that would not work for the particular input encoding, such as consume for instance.
123 ///
124 void
antlr38BitSetupStream(pANTLR3_INPUT_STREAM input)125 antlr38BitSetupStream (pANTLR3_INPUT_STREAM input)
126 {
127 // Build a string factory for this stream
128 //
129 input->strFactory = antlr3StringFactoryNew(input->encoding);
130
131 // Default stream API set up is for 8Bit, so we are done
132 //
133 }
134
135 void
antlr3GenericSetupStream(pANTLR3_INPUT_STREAM input)136 antlr3GenericSetupStream (pANTLR3_INPUT_STREAM input)
137 {
138 /* Install function pointers for an 8 bit input
139 */
140
141 /* Allocate stream interface
142 */
143 input->istream = antlr3IntStreamNew();
144 input->istream->type = ANTLR3_CHARSTREAM;
145 input->istream->super = input;
146
147 /* Intstream API
148 */
149 input->istream->consume = antlr38BitConsume; // Consume the next 8 bit character in the buffer
150 input->istream->_LA = antlr38BitLA; // Return the UTF32 character at offset n (1 based)
151 input->istream->index = antlr38BitIndex; // Current index (offset from first character
152 input->istream->mark = antlr38BitMark; // Record the current lex state for later restore
153 input->istream->rewind = antlr38BitRewind; // How to rewind the input
154 input->istream->rewindLast = antlr38BitRewindLast; // How to rewind the input
155 input->istream->seek = antlr38BitSeek; // How to seek to a specific point in the stream
156 input->istream->release = antlr38BitRelease; // Reset marks after mark n
157 input->istream->getSourceName = antlr38BitGetSourceName; // Return a string that names the input source
158
159 /* Charstream API
160 */
161 input->close = antlr3InputClose; // Close down the stream completely
162 input->free = antlr3InputClose; // Synonym for free
163 input->reset = antlr3InputReset; // Reset input to start
164 input->reuse = antlr38BitReuse; // Install a new input string and reset
165 input->_LT = antlr38BitLT; // Same as _LA for 8 bit file
166 input->size = antlr38BitSize; // Return the size of the input buffer
167 input->substr = antlr38BitSubstr; // Return a string from the input stream
168 input->getLine = antlr38BitGetLine; // Return the current line number in the input stream
169 input->getLineBuf = antlr38BitGetLineBuf; // Return a pointer to the start of the current line being consumed
170 input->getCharPositionInLine = antlr38BitGetCharPosition; // Return the offset into the current line of input
171 input->setLine = antlr38BitSetLine; // Set the input stream line number (does not set buffer pointers)
172 input->setCharPositionInLine = antlr38BitSetCharPosition; // Set the offset in to the current line (does not set any pointers)
173 input->SetNewLineChar = antlr38BitSetNewLineChar; // Set the value of the newline trigger character
174 input->setUcaseLA = antlr38BitSetUcaseLA; // Changes the LA function to return upper case always
175
176 input->charByteSize = 1; // Size in bytes of characters in this stream.
177
178 /* Initialize entries for tables etc
179 */
180 input->markers = NULL;
181
182 /* Set up the input stream brand new
183 */
184 input->reset(input);
185
186 /* Install default line separator character (it can be replaced
187 * by the grammar programmer later)
188 */
189 input->SetNewLineChar(input, (ANTLR3_UCHAR)'\n');
190 }
191
192 static pANTLR3_STRING
antlr38BitGetSourceName(pANTLR3_INT_STREAM is)193 antlr38BitGetSourceName(pANTLR3_INT_STREAM is)
194 {
195 return is->streamName;
196 }
197
198 /** \brief Close down an input stream and free any memory allocated by it.
199 *
200 * \param input Input stream context pointer
201 */
202 static void
antlr3InputClose(pANTLR3_INPUT_STREAM input)203 antlr3InputClose(pANTLR3_INPUT_STREAM input)
204 {
205 // Close any markers in the input stream
206 //
207 if (input->markers != NULL)
208 {
209 input->markers->free(input->markers);
210 input->markers = NULL;
211 }
212
213 // Close the string factory
214 //
215 if (input->strFactory != NULL)
216 {
217 input->strFactory->close(input->strFactory);
218 }
219
220 // Free the input stream buffer if we allocated it
221 //
222 if (input->isAllocated && input->data != NULL)
223 {
224 ANTLR3_FREE(input->data);
225 input->data = NULL;
226 }
227
228 input->istream->free(input->istream);
229
230 // Finally, free the space for the structure itself
231 //
232 ANTLR3_FREE(input);
233
234 // Done
235 //
236 }
237
238 static void
antlr38BitSetUcaseLA(pANTLR3_INPUT_STREAM input,ANTLR3_BOOLEAN flag)239 antlr38BitSetUcaseLA (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag)
240 {
241 if (flag)
242 {
243 // Return the upper case version of the characters
244 //
245 input->istream->_LA = antlr38BitLA_ucase;
246 }
247 else
248 {
249 // Return the raw characters as they are in the buffer
250 //
251 input->istream->_LA = antlr38BitLA;
252 }
253 }
254
255
256 /** \brief Reset a re-startable input stream to the start
257 *
258 * \param input Input stream context pointer
259 */
260 static void
antlr3InputReset(pANTLR3_INPUT_STREAM input)261 antlr3InputReset(pANTLR3_INPUT_STREAM input)
262 {
263
264 input->nextChar = input->data; /* Input at first character */
265 input->line = 1; /* starts at line 1 */
266 input->charPositionInLine = 0;
267 input->currentLine = input->data;
268 input->markDepth = 0; /* Reset markers */
269
270 /* Clear out up the markers table if it is there
271 */
272 if (input->markers != NULL)
273 {
274 input->markers->clear(input->markers);
275 }
276 else
277 {
278 /* Install a new markers table
279 */
280 input->markers = antlr3VectorNew(0);
281 }
282 }
283
284 /** Install a new source code in to a working input stream so that the
285 * input stream can be reused.
286 */
287 static void
antlr38BitReuse(pANTLR3_INPUT_STREAM input,pANTLR3_UINT8 inString,ANTLR3_UINT32 size,pANTLR3_UINT8 name)288 antlr38BitReuse(pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
289 {
290 input->isAllocated = ANTLR3_FALSE;
291 input->data = inString;
292 input->sizeBuf = size;
293
294 // Now we can set up the file name. As we are reusing the stream, there may already
295 // be a string that we can reuse for holding the filename.
296 //
297 if (input->istream->streamName == NULL)
298 {
299 input->istream->streamName = input->strFactory->newStr(input->strFactory, name == NULL ? (pANTLR3_UINT8)"-memory-" : name);
300 input->fileName = input->istream->streamName;
301 }
302 else
303 {
304 input->istream->streamName->set(input->istream->streamName, (name == NULL ? (const char *)"-memory-" : (const char *)name));
305 }
306
307 input->reset(input);
308 }
309
310 /** \brief Consume the next character in an 8 bit input stream
311 *
312 * \param input Input stream context pointer
313 */
314 static void
antlr38BitConsume(pANTLR3_INT_STREAM is)315 antlr38BitConsume(pANTLR3_INT_STREAM is)
316 {
317 pANTLR3_INPUT_STREAM input;
318
319 input = ((pANTLR3_INPUT_STREAM) (is->super));
320
321 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
322 {
323 /* Indicate one more character in this line
324 */
325 input->charPositionInLine++;
326
327 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar)) == input->newlineChar)
328 {
329 /* Reset for start of a new line of input
330 */
331 input->line++;
332 input->charPositionInLine = 0;
333 input->currentLine = (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
334 }
335
336 /* Increment to next character position
337 */
338 input->nextChar = (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
339 }
340 }
341
342 /** \brief Return the input element assuming an 8 bit ascii input
343 *
344 * \param[in] input Input stream context pointer
345 * \param[in] la 1 based offset of next input stream element
346 *
347 * \return Next input character in internal ANTLR3 encoding (UTF32)
348 */
349 static ANTLR3_UCHAR
antlr38BitLA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)350 antlr38BitLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
351 {
352 pANTLR3_INPUT_STREAM input;
353
354 input = ((pANTLR3_INPUT_STREAM) (is->super));
355
356 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
357 {
358 return ANTLR3_CHARSTREAM_EOF;
359 }
360 else
361 {
362 return (ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar + la - 1));
363 }
364 }
365
366 /** \brief Return the input element assuming an 8 bit input and
367 * always return the UPPER CASE character.
368 * Note that this is 8 bit and so we assume that the toupper
369 * function will use the correct locale for 8 bits.
370 *
371 * \param[in] input Input stream context pointer
372 * \param[in] la 1 based offset of next input stream element
373 *
374 * \return Next input character in internal ANTLR3 encoding (UTF32)
375 */
376 static ANTLR3_UCHAR
antlr38BitLA_ucase(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)377 antlr38BitLA_ucase (pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
378 {
379 pANTLR3_INPUT_STREAM input;
380
381 input = ((pANTLR3_INPUT_STREAM) (is->super));
382
383 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
384 {
385 return ANTLR3_CHARSTREAM_EOF;
386 }
387 else
388 {
389 return (ANTLR3_UCHAR)toupper((*((pANTLR3_UINT8)input->nextChar + la - 1)));
390 }
391 }
392
393
394 /** \brief Return the input element assuming an 8 bit ascii input
395 *
396 * \param[in] input Input stream context pointer
397 * \param[in] lt 1 based offset of next input stream element
398 *
399 * \return Next input character in internal ANTLR3 encoding (UTF32)
400 */
401 static void *
antlr38BitLT(pANTLR3_INPUT_STREAM input,ANTLR3_INT32 lt)402 antlr38BitLT(pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt)
403 {
404 /* Casting is horrible but it means no warnings and LT should never be called
405 * on a character stream anyway I think. If it is then, the void * will need to be
406 * cast back in a similar manner. Yuck! But this means that LT for Token streams and
407 * tree streams is correct.
408 */
409 return (ANTLR3_FUNC_PTR(input->istream->_LA(input->istream, lt)));
410 }
411
412 /** \brief Calculate the current index in the output stream.
413 * \param[in] input Input stream context pointer
414 */
415 static ANTLR3_MARKER
antlr38BitIndex(pANTLR3_INT_STREAM is)416 antlr38BitIndex(pANTLR3_INT_STREAM is)
417 {
418 pANTLR3_INPUT_STREAM input;
419
420 input = ((pANTLR3_INPUT_STREAM) (is->super));
421
422 return (ANTLR3_MARKER)(((pANTLR3_UINT8)input->nextChar));
423 }
424
425 /** \brief Return the size of the current input stream, as an 8Bit file
426 * which in this case is the total input. Other implementations may provide
427 * more sophisticated implementations to deal with non-recoverable streams
428 * and so on.
429 *
430 * \param[in] input Input stream context pointer
431 */
432 static ANTLR3_UINT32
antlr38BitSize(pANTLR3_INPUT_STREAM input)433 antlr38BitSize(pANTLR3_INPUT_STREAM input)
434 {
435 return input->sizeBuf;
436 }
437
438 /** \brief Mark the current input point in an 8Bit 8 bit stream
439 * such as a file stream, where all the input is available in the
440 * buffer.
441 *
442 * \param[in] is Input stream context pointer
443 */
444 static ANTLR3_MARKER
antlr38BitMark(pANTLR3_INT_STREAM is)445 antlr38BitMark (pANTLR3_INT_STREAM is)
446 {
447 pANTLR3_LEX_STATE state;
448 pANTLR3_INPUT_STREAM input;
449
450 input = ((pANTLR3_INPUT_STREAM) (is->super));
451
452 /* New mark point
453 */
454 ++input->markDepth;
455
456 /* See if we are revisiting a mark as we can just reuse the vector
457 * entry if we are, otherwise, we need a new one
458 */
459 if (input->markDepth > input->markers->count)
460 {
461 state = (pANTLR3_LEX_STATE)ANTLR3_MALLOC(sizeof(ANTLR3_LEX_STATE));
462 if (state == NULL)
463 {
464 // malloc failed
465 --input->markDepth;
466 return 0;
467 }
468
469 /* Add it to the table
470 */
471 input->markers->add(input->markers, state, ANTLR3_FREE_FUNC); /* No special structure, just free() on delete */
472 }
473 else
474 {
475 state = (pANTLR3_LEX_STATE)input->markers->get(input->markers, input->markDepth - 1);
476
477 /* Assume no errors for speed, it will just blow up if the table failed
478 * for some reasons, hence lots of unit tests on the tables ;-)
479 */
480 }
481
482 /* We have created or retrieved the state, so update it with the current
483 * elements of the lexer state.
484 */
485 state->charPositionInLine = input->charPositionInLine;
486 state->currentLine = input->currentLine;
487 state->line = input->line;
488 state->nextChar = input->nextChar;
489
490 is->lastMarker = input->markDepth;
491
492 /* And that's it
493 */
494 return input->markDepth;
495 }
496 /** \brief Rewind the lexer input to the state specified by the last produced mark.
497 *
498 * \param[in] input Input stream context pointer
499 *
500 * \remark
501 * Assumes 8 Bit input stream.
502 */
503 static void
antlr38BitRewindLast(pANTLR3_INT_STREAM is)504 antlr38BitRewindLast (pANTLR3_INT_STREAM is)
505 {
506 is->rewind(is, is->lastMarker);
507 }
508
509 /** \brief Rewind the lexer input to the state specified by the supplied mark.
510 *
511 * \param[in] input Input stream context pointer
512 *
513 * \remark
514 * Assumes 8 Bit input stream.
515 */
516 static void
antlr38BitRewind(pANTLR3_INT_STREAM is,ANTLR3_MARKER mark)517 antlr38BitRewind (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
518 {
519 pANTLR3_LEX_STATE state;
520 pANTLR3_INPUT_STREAM input;
521
522 input = ((pANTLR3_INPUT_STREAM) is->super);
523
524 /* Perform any clean up of the marks
525 */
526 input->istream->release(input->istream, mark);
527
528 /* Find the supplied mark state
529 */
530 state = (pANTLR3_LEX_STATE)input->markers->get(input->markers, (ANTLR3_UINT32)(mark - 1));
531 if (state == NULL) { return; }
532
533 /* Seek input pointer to the requested point (note we supply the void *pointer
534 * to whatever is implementing the int stream to seek).
535 */
536 antlr38BitSeek(is, (ANTLR3_MARKER)(state->nextChar));
537
538 /* Reset to the reset of the information in the mark
539 */
540 input->charPositionInLine = state->charPositionInLine;
541 input->currentLine = state->currentLine;
542 input->line = state->line;
543 input->nextChar = state->nextChar;
544
545 /* And we are done
546 */
547 }
548
549 /** \brief Rewind the lexer input to the state specified by the supplied mark.
550 *
551 * \param[in] input Input stream context pointer
552 *
553 * \remark
554 * Assumes 8 Bit input stream.
555 */
556 static void
antlr38BitRelease(pANTLR3_INT_STREAM is,ANTLR3_MARKER mark)557 antlr38BitRelease (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
558 {
559 pANTLR3_INPUT_STREAM input;
560
561 input = ((pANTLR3_INPUT_STREAM) (is->super));
562
563 /* We don't do much here in fact as we never free any higher marks in
564 * the hashtable as we just resuse any memory allocated for them.
565 */
566 input->markDepth = (ANTLR3_UINT32)(mark - 1);
567 }
568
569 /** \brief Rewind the lexer input to the state specified by the supplied mark.
570 *
571 * \param[in] input Input stream context pointer
572 *
573 * \remark
574 * Assumes 8 Bit input stream.
575 */
576 static void
antlr38BitSeek(pANTLR3_INT_STREAM is,ANTLR3_MARKER seekPoint)577 antlr38BitSeek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
578 {
579 ANTLR3_INT32 count;
580 pANTLR3_INPUT_STREAM input;
581
582 input = (pANTLR3_INPUT_STREAM)ANTLR3_FUNC_PTR(((pANTLR3_INPUT_STREAM) is->super));
583
584 /* If the requested seek point is less than the current
585 * input point, then we assume that we are resetting from a mark
586 * and do not need to scan, but can just set to there.
587 */
588 if (seekPoint <= (ANTLR3_MARKER)(input->nextChar))
589 {
590 input->nextChar = ((pANTLR3_UINT8) seekPoint);
591 }
592 else
593 {
594 count = (ANTLR3_UINT32)(seekPoint - (ANTLR3_MARKER)(input->nextChar));
595
596 while (count--)
597 {
598 is->consume(is);
599 }
600 }
601 }
602 /** Return a substring of the 8 bit input stream in
603 * newly allocated memory.
604 *
605 * \param input Input stream context pointer
606 * \param start Offset in input stream where the string starts
607 * \param stop Offset in the input stream where the string ends.
608 */
609 static pANTLR3_STRING
antlr38BitSubstr(pANTLR3_INPUT_STREAM input,ANTLR3_MARKER start,ANTLR3_MARKER stop)610 antlr38BitSubstr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
611 {
612 return input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, (ANTLR3_UINT32)(stop - start + 1));
613 }
614
615 /** \brief Return the line number as understood by the 8 bit input stream.
616 *
617 * \param input Input stream context pointer
618 * \return Line number in input stream that we believe we are working on.
619 */
620 static ANTLR3_UINT32
antlr38BitGetLine(pANTLR3_INPUT_STREAM input)621 antlr38BitGetLine (pANTLR3_INPUT_STREAM input)
622 {
623 return input->line;
624 }
625
626 /** Return a pointer into the input stream that points at the start
627 * of the current input line as triggered by the end of line character installed
628 * for the stream ('\n' unless told differently).
629 *
630 * \param[in] input
631 */
632 static void *
antlr38BitGetLineBuf(pANTLR3_INPUT_STREAM input)633 antlr38BitGetLineBuf (pANTLR3_INPUT_STREAM input)
634 {
635 return input->currentLine;
636 }
637
638 /** Return the current offset in to the current line in the input stream.
639 *
640 * \param input Input stream context pointer
641 * \return Current line offset
642 */
643 static ANTLR3_UINT32
antlr38BitGetCharPosition(pANTLR3_INPUT_STREAM input)644 antlr38BitGetCharPosition (pANTLR3_INPUT_STREAM input)
645 {
646 return input->charPositionInLine;
647 }
648
649 /** Set the current line number as understood by the input stream.
650 *
651 * \param input Input stream context pointer
652 * \param line Line number to tell the input stream we are on
653 *
654 * \remark
655 * This function does not change any pointers, it just allows the programmer to set the
656 * line number according to some external criterion, such as finding a lexed directive
657 * like: #nnn "file.c" for instance, such that error reporting and so on in is in sync
658 * with some original source format.
659 */
660 static void
antlr38BitSetLine(pANTLR3_INPUT_STREAM input,ANTLR3_UINT32 line)661 antlr38BitSetLine (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line)
662 {
663 input->line = line;
664 }
665
666 /** Set the current offset in the current line to be a particular setting.
667 *
668 * \param[in] input Input stream context pointer
669 * \param[in] position New setting for current offset.
670 *
671 * \remark
672 * This does not set the actual pointers in the input stream, it is purely for reporting
673 * purposes and so on as per antlr38BitSetLine();
674 */
675 static void
antlr38BitSetCharPosition(pANTLR3_INPUT_STREAM input,ANTLR3_UINT32 position)676 antlr38BitSetCharPosition (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position)
677 {
678 input->charPositionInLine = position;
679 }
680
681 /** Set the newline trigger character in the input stream to the supplied parameter.
682 *
683 * \param[in] input Input stream context pointer
684 * \param[in] newlineChar Character to set to be the newline trigger.
685 *
686 * \remark
687 * - The supplied newLineChar is in UTF32 encoding (which means ASCII and latin1 etc
688 * are the same encodings), but the input stream catered to by this function is 8 bit
689 * only, so it is up to the programmer to ensure that the character supplied is valid.
690 */
691 static void
antlr38BitSetNewLineChar(pANTLR3_INPUT_STREAM input,ANTLR3_UINT32 newlineChar)692 antlr38BitSetNewLineChar (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar)
693 {
694 input->newlineChar = newlineChar;
695 }
696
697
698 /// \brief Common function to setup function interface for a UTF16 or UCS2 input stream.
699 ///
700 /// \param input Input stream context pointer
701 ///
702 /// \remark
703 /// - Strictly speaking, there is no such thing as a UCS2 input stream as the term
704 /// tends to confuse the notions of character encoding, unicode and so on. UCS2 is
705 /// essentially UTF16 without any surrogates and so the standard UTF16
706 /// input stream is able to handle it without any special code.
707 ///
708 void
antlr3UTF16SetupStream(pANTLR3_INPUT_STREAM input,ANTLR3_BOOLEAN machineBigEndian,ANTLR3_BOOLEAN inputBigEndian)709 antlr3UTF16SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian)
710 {
711 // Build a string factory for this stream. This is a UTF16 string factory which is a standard
712 // part of the ANTLR3 string. The string factory is then passed through the whole chain
713 // of lexer->parser->tree->treeparser and so on.
714 //
715 input->strFactory = antlr3StringFactoryNew(input->encoding);
716
717 // Generic API that does not care about endianess.
718 //
719 input->istream->index = antlr3UTF16Index; // Calculate current index in input stream, UTF16 based
720 input->substr = antlr3UTF16Substr; // Return a string from the input stream
721 input->istream->seek = antlr3UTF16Seek; // How to seek to a specific point in the stream
722
723 // We must install different UTF16 routines according to whether the input
724 // is the same endianess as the machine we are executing upon or not. If it is not
725 // then we must install methods that can convert the endianess on the fly as they go
726 //
727
728 switch (machineBigEndian)
729 {
730 case ANTLR3_TRUE:
731
732 // Machine is Big Endian, if the input is also then install the
733 // methods that do not access input by bytes and reverse them.
734 // Otherwise install endian aware methods.
735 //
736 if (inputBigEndian == ANTLR3_TRUE)
737 {
738 // Input is machine compatible
739 //
740 input->istream->consume = antlr3UTF16Consume; // Consume the next UTF16 character in the buffer
741 input->istream->_LA = antlr3UTF16LA; // Return the UTF32 character at offset n (1 based)
742 }
743 else
744 {
745 // Need to use methods that know that the input is little endian
746 //
747 input->istream->consume = antlr3UTF16ConsumeLE; // Consume the next UTF16 character in the buffer
748 input->istream->_LA = antlr3UTF16LALE; // Return the UTF32 character at offset n (1 based)
749 }
750 break;
751
752 case ANTLR3_FALSE:
753
754 // Machine is Little Endian, if the input is also then install the
755 // methods that do not access input by bytes and reverse them.
756 // Otherwise install endian aware methods.
757 //
758 if (inputBigEndian == ANTLR3_FALSE)
759 {
760 // Input is machine compatible
761 //
762 input->istream->consume = antlr3UTF16Consume; // Consume the next UTF16 character in the buffer
763 input->istream->_LA = antlr3UTF16LA; // Return the UTF32 character at offset n (1 based)
764 }
765 else
766 {
767 // Need to use methods that know that the input is Big Endian
768 //
769 input->istream->consume = antlr3UTF16ConsumeBE; // Consume the next UTF16 character in the buffer
770 input->istream->_LA = antlr3UTF16LABE; // Return the UTF32 character at offset n (1 based)
771 }
772 break;
773 }
774
775
776 input->charByteSize = 2; // Size in bytes of characters in this stream.
777
778 }
779
780 /// \brief Consume the next character in a UTF16 input stream
781 ///
782 /// \param input Input stream context pointer
783 ///
784 static void
antlr3UTF16Consume(pANTLR3_INT_STREAM is)785 antlr3UTF16Consume(pANTLR3_INT_STREAM is)
786 {
787 pANTLR3_INPUT_STREAM input;
788 UTF32 ch;
789 UTF32 ch2;
790
791 input = ((pANTLR3_INPUT_STREAM) (is->super));
792
793 // Buffer size is always in bytes
794 //
795 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
796 {
797 // Indicate one more character in this line
798 //
799 input->charPositionInLine++;
800
801 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
802 {
803 // Reset for start of a new line of input
804 //
805 input->line++;
806 input->charPositionInLine = 0;
807 input->currentLine = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
808 }
809
810 // Increment to next character position, accounting for any surrogates
811 //
812 // Next char in natural machine byte order
813 //
814 ch = *((UTF16*)input->nextChar);
815
816 // We consumed one 16 bit character
817 //
818 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
819
820 // If we have a surrogate pair then we need to consume
821 // a following valid LO surrogate.
822 //
823 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
824
825 // If the 16 bits following the high surrogate are in the source buffer...
826 //
827 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
828 {
829 // Next character is in natural machine byte order
830 //
831 ch2 = *((UTF16*)input->nextChar);
832
833 // If it's a valid low surrogate, consume it
834 //
835 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
836 {
837 // We consumed one 16 bit character
838 //
839 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
840 }
841 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
842 // it.
843 //
844 }
845 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
846 // it because the buffer ended
847 //
848 }
849 // Note that we did not check for an invalid low surrogate here, or that fact that the
850 // lo surrogate was missing. We just picked out one 16 bit character unless the character
851 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
852 //
853 }
854 }
855
856 /// \brief Return the input element assuming a UTF16 input in machine byte order
857 ///
858 /// \param[in] input Input stream context pointer
859 /// \param[in] la 1 based offset of next input stream element
860 ///
861 /// \return Next input character in internal ANTLR3 encoding (UTF32)
862 ///
863 static ANTLR3_UCHAR
antlr3UTF16LA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)864 antlr3UTF16LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
865 {
866 pANTLR3_INPUT_STREAM input;
867 UTF32 ch;
868 UTF32 ch2;
869 UTF16 * nextChar;
870
871 // Find the input interface and where we are currently pointing to
872 // in the input stream
873 //
874 input = ((pANTLR3_INPUT_STREAM) (is->super));
875 nextChar = (UTF16*)input->nextChar;
876
877 // If a positive offset then advance forward, else retreat
878 //
879 if (la >= 0)
880 {
881 while (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
882 {
883 // Advance our copy of the input pointer
884 //
885 // Next char in natural machine byte order
886 //
887 ch = *nextChar++;
888
889 // If we have a surrogate pair then we need to consume
890 // a following valid LO surrogate.
891 //
892 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
893 {
894 // If the 16 bits following the high surrogate are in the source buffer...
895 //
896 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
897 {
898 // Next character is in natural machine byte order
899 //
900 ch2 = *nextChar;
901
902 // If it's a valid low surrogate, consume it
903 //
904 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
905 {
906 // We consumed one 16 bit character
907 //
908 nextChar++;
909 }
910 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
911 // it.
912 //
913 }
914 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
915 // it because the buffer ended
916 //
917 }
918 // Note that we did not check for an invalid low surrogate here, or that fact that the
919 // lo surrogate was missing. We just picked out one 16 bit character unless the character
920 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
921 //
922 }
923 }
924 else
925 {
926 // We need to go backwards from our input point
927 //
928 while (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
929 {
930 // Get the previous 16 bit character
931 //
932 ch = *--nextChar;
933
934 // If we found a low surrogate then go back one more character if
935 // the hi surrogate is there
936 //
937 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
938 {
939 ch2 = *(nextChar-1);
940 if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
941 {
942 // Yes, there is a high surrogate to match it so decrement one more and point to that
943 //
944 nextChar--;
945 }
946 }
947 }
948 }
949
950 // Our local copy of nextChar is now pointing to either the correct character or end of file
951 //
952 // Input buffer size is always in bytes
953 //
954 if ( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
955 {
956 return ANTLR3_CHARSTREAM_EOF;
957 }
958 else
959 {
960 // Pick up the next 16 character (native machine byte order)
961 //
962 ch = *nextChar++;
963
964 // If we have a surrogate pair then we need to consume
965 // a following valid LO surrogate.
966 //
967 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
968 {
969 // If the 16 bits following the high surrogate are in the source buffer...
970 //
971 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
972 {
973 // Next character is in natural machine byte order
974 //
975 ch2 = *nextChar;
976
977 // If it's a valid low surrogate, consume it
978 //
979 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
980 {
981 // Construct the UTF32 code point
982 //
983 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
984 + (ch2 - UNI_SUR_LOW_START) + halfBase;
985 }
986 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
987 // it.
988 //
989 }
990 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
991 // it because the buffer ended
992 //
993 }
994 }
995 return ch;
996 }
997
998
999 /// \brief Calculate the current index in the output stream.
1000 /// \param[in] input Input stream context pointer
1001 ///
1002 static ANTLR3_MARKER
antlr3UTF16Index(pANTLR3_INT_STREAM is)1003 antlr3UTF16Index(pANTLR3_INT_STREAM is)
1004 {
1005 pANTLR3_INPUT_STREAM input;
1006
1007 input = ((pANTLR3_INPUT_STREAM) (is->super));
1008
1009 return (ANTLR3_MARKER)(input->nextChar);
1010 }
1011
1012 /// \brief Rewind the lexer input to the state specified by the supplied mark.
1013 ///
1014 /// \param[in] input Input stream context pointer
1015 ///
1016 /// \remark
1017 /// Assumes UTF16 input stream.
1018 ///
1019 static void
antlr3UTF16Seek(pANTLR3_INT_STREAM is,ANTLR3_MARKER seekPoint)1020 antlr3UTF16Seek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
1021 {
1022 pANTLR3_INPUT_STREAM input;
1023
1024 input = ((pANTLR3_INPUT_STREAM) is->super);
1025
1026 // If the requested seek point is less than the current
1027 // input point, then we assume that we are resetting from a mark
1028 // and do not need to scan, but can just set to there as rewind will
1029 // reset line numbers and so on.
1030 //
1031 if (seekPoint <= (ANTLR3_MARKER)(input->nextChar))
1032 {
1033 input->nextChar = (void *)seekPoint;
1034 }
1035 else
1036 {
1037 // Call consume until we reach the asked for seek point or EOF
1038 //
1039 while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar)
1040 {
1041 is->consume(is);
1042 }
1043 }
1044 }
1045 /// \brief Return a substring of the UTF16 input stream in
1046 /// newly allocated memory.
1047 ///
1048 /// \param input Input stream context pointer
1049 /// \param start Offset in input stream where the string starts
1050 /// \param stop Offset in the input stream where the string ends.
1051 ///
1052 static pANTLR3_STRING
antlr3UTF16Substr(pANTLR3_INPUT_STREAM input,ANTLR3_MARKER start,ANTLR3_MARKER stop)1053 antlr3UTF16Substr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
1054 {
1055 return input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/2) + 1);
1056 }
1057
1058 /// \brief Consume the next character in a UTF16 input stream when the input is Little Endian and the machine is not
1059 /// Note that the UTF16 routines do not do any substantial verification of the input stream as for performance
1060 /// sake, we assume it is validly encoded. So if a low surrogate is found at the curent input position then we
1061 /// just consume it. Surrogate pairs should be seen as Hi, Lo. So if we have a Lo first, then the input stream
1062 /// is fubar but we just ignore that.
1063 ///
1064 /// \param input Input stream context pointer
1065 ///
1066 static void
antlr3UTF16ConsumeLE(pANTLR3_INT_STREAM is)1067 antlr3UTF16ConsumeLE(pANTLR3_INT_STREAM is)
1068 {
1069 pANTLR3_INPUT_STREAM input;
1070 UTF32 ch;
1071 UTF32 ch2;
1072
1073 input = ((pANTLR3_INPUT_STREAM) (is->super));
1074
1075 // Buffer size is always in bytes
1076 //
1077 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1078 {
1079 // Indicate one more character in this line
1080 //
1081 input->charPositionInLine++;
1082
1083 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
1084 {
1085 // Reset for start of a new line of input
1086 //
1087 input->line++;
1088 input->charPositionInLine = 0;
1089 input->currentLine = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1090 }
1091
1092 // Increment to next character position, accounting for any surrogates
1093 //
1094 // Next char in litle endian form
1095 //
1096 ch = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8);
1097
1098 // We consumed one 16 bit character
1099 //
1100 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1101
1102 // If we have a surrogate pair then we need to consume
1103 // a following valid LO surrogate.
1104 //
1105 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
1106
1107 // If the 16 bits following the high surrogate are in the source buffer...
1108 //
1109 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1110 {
1111 ch2 = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8);
1112
1113 // If it's a valid low surrogate, consume it
1114 //
1115 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1116 {
1117 // We consumed one 16 bit character
1118 //
1119 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1120 }
1121 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1122 // it.
1123 //
1124 }
1125 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1126 // it because the buffer ended
1127 //
1128 }
1129 // Note that we did not check for an invalid low surrogate here, or that fact that the
1130 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1131 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1132 //
1133 }
1134 }
1135
1136 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
1137 ///
1138 /// \param[in] input Input stream context pointer
1139 /// \param[in] la 1 based offset of next input stream element
1140 ///
1141 /// \return Next input character in internal ANTLR3 encoding (UTF32)
1142 ///
1143 static ANTLR3_UCHAR
antlr3UTF16LALE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1144 antlr3UTF16LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1145 {
1146 pANTLR3_INPUT_STREAM input;
1147 UTF32 ch;
1148 UTF32 ch2;
1149 pANTLR3_UCHAR nextChar;
1150
1151 // Find the input interface and where we are currently pointing to
1152 // in the input stream
1153 //
1154 input = ((pANTLR3_INPUT_STREAM) (is->super));
1155 nextChar = (pANTLR3_UCHAR)input->nextChar;
1156
1157 // If a positive offset then advance forward, else retreat
1158 //
1159 if (la >= 0)
1160 {
1161 while (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
1162 {
1163 // Advance our copy of the input pointer
1164 //
1165 // Next char in Little Endian byte order
1166 //
1167 ch = (*nextChar) + (*(nextChar+1) << 8);
1168 nextChar += 2;
1169
1170 // If we have a surrogate pair then we need to consume
1171 // a following valid LO surrogate.
1172 //
1173 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1174 {
1175 // If the 16 bits following the high surrogate are in the source buffer...
1176 //
1177 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1178 {
1179 // Next character is in little endian byte order
1180 //
1181 ch2 = (*nextChar) + (*(nextChar+1) << 8);
1182
1183 // If it's a valid low surrogate, consume it
1184 //
1185 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1186 {
1187 // We consumed one 16 bit character
1188 //
1189 nextChar += 2;
1190 }
1191 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1192 // it.
1193 //
1194 }
1195 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1196 // it because the buffer ended
1197 //
1198 }
1199 // Note that we did not check for an invalid low surrogate here, or that fact that the
1200 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1201 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1202 //
1203 }
1204 }
1205 else
1206 {
1207 // We need to go backwards from our input point
1208 //
1209 while (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
1210 {
1211 // Get the previous 16 bit character
1212 //
1213 ch = (*nextChar - 2) + ((*nextChar -1) << 8);
1214 nextChar -= 2;
1215
1216 // If we found a low surrogate then go back one more character if
1217 // the hi surrogate is there
1218 //
1219 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
1220 {
1221 ch2 = (*nextChar - 2) + ((*nextChar -1) << 8);
1222 if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
1223 {
1224 // Yes, there is a high surrogate to match it so decrement one more and point to that
1225 //
1226 nextChar -=2;
1227 }
1228 }
1229 }
1230 }
1231
1232 // Our local copy of nextChar is now pointing to either the correct character or end of file
1233 //
1234 // Input buffer size is always in bytes
1235 //
1236 if ( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1237 {
1238 return ANTLR3_CHARSTREAM_EOF;
1239 }
1240 else
1241 {
1242 // Pick up the next 16 character (little endian byte order)
1243 //
1244 ch = (*nextChar) + (*(nextChar+1) << 8);
1245 nextChar += 2;
1246
1247 // If we have a surrogate pair then we need to consume
1248 // a following valid LO surrogate.
1249 //
1250 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1251 {
1252 // If the 16 bits following the high surrogate are in the source buffer...
1253 //
1254 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1255 {
1256 // Next character is in little endian byte order
1257 //
1258 ch2 = (*nextChar) + (*(nextChar+1) << 8);
1259
1260 // If it's a valid low surrogate, consume it
1261 //
1262 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1263 {
1264 // Construct the UTF32 code point
1265 //
1266 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
1267 + (ch2 - UNI_SUR_LOW_START) + halfBase;
1268 }
1269 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1270 // it.
1271 //
1272 }
1273 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1274 // it because the buffer ended
1275 //
1276 }
1277 }
1278 return ch;
1279 }
1280
1281 /// \brief Consume the next character in a UTF16 input stream when the input is Big Endian and the machine is not
1282 ///
1283 /// \param input Input stream context pointer
1284 ///
1285 static void
antlr3UTF16ConsumeBE(pANTLR3_INT_STREAM is)1286 antlr3UTF16ConsumeBE(pANTLR3_INT_STREAM is)
1287 {
1288 pANTLR3_INPUT_STREAM input;
1289 UTF32 ch;
1290 UTF32 ch2;
1291
1292 input = ((pANTLR3_INPUT_STREAM) (is->super));
1293
1294 // Buffer size is always in bytes
1295 //
1296 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1297 {
1298 // Indicate one more character in this line
1299 //
1300 input->charPositionInLine++;
1301
1302 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
1303 {
1304 // Reset for start of a new line of input
1305 //
1306 input->line++;
1307 input->charPositionInLine = 0;
1308 input->currentLine = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1309 }
1310
1311 // Increment to next character position, accounting for any surrogates
1312 //
1313 // Next char in big endian form
1314 //
1315 ch = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8);
1316
1317 // We consumed one 16 bit character
1318 //
1319 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1320
1321 // If we have a surrogate pair then we need to consume
1322 // a following valid LO surrogate.
1323 //
1324 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
1325
1326 // If the 16 bits following the high surrogate are in the source buffer...
1327 //
1328 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1329 {
1330 // Big endian
1331 //
1332 ch2 = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8);
1333
1334 // If it's a valid low surrogate, consume it
1335 //
1336 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1337 {
1338 // We consumed one 16 bit character
1339 //
1340 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1341 }
1342 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1343 // it.
1344 //
1345 }
1346 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1347 // it because the buffer ended
1348 //
1349 }
1350 // Note that we did not check for an invalid low surrogate here, or that fact that the
1351 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1352 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1353 //
1354 }
1355 }
1356
1357 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
1358 ///
1359 /// \param[in] input Input stream context pointer
1360 /// \param[in] la 1 based offset of next input stream element
1361 ///
1362 /// \return Next input character in internal ANTLR3 encoding (UTF32)
1363 ///
1364 static ANTLR3_UCHAR
antlr3UTF16LABE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1365 antlr3UTF16LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1366 {
1367 pANTLR3_INPUT_STREAM input;
1368 UTF32 ch;
1369 UTF32 ch2;
1370 pANTLR3_UCHAR nextChar;
1371
1372 // Find the input interface and where we are currently pointing to
1373 // in the input stream
1374 //
1375 input = ((pANTLR3_INPUT_STREAM) (is->super));
1376 nextChar = (pANTLR3_UCHAR)input->nextChar;
1377
1378 // If a positive offset then advance forward, else retreat
1379 //
1380 if (la >= 0)
1381 {
1382 while (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
1383 {
1384 // Advance our copy of the input pointer
1385 //
1386 // Next char in Big Endian byte order
1387 //
1388 ch = ((*nextChar) << 8) + *(nextChar+1);
1389 nextChar += 2;
1390
1391 // If we have a surrogate pair then we need to consume
1392 // a following valid LO surrogate.
1393 //
1394 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1395 {
1396 // If the 16 bits following the high surrogate are in the source buffer...
1397 //
1398 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1399 {
1400 // Next character is in big endian byte order
1401 //
1402 ch2 = ((*nextChar) << 8) + *(nextChar+1);
1403
1404 // If it's a valid low surrogate, consume it
1405 //
1406 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1407 {
1408 // We consumed one 16 bit character
1409 //
1410 nextChar += 2;
1411 }
1412 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1413 // it.
1414 //
1415 }
1416 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1417 // it because the buffer ended
1418 //
1419 }
1420 // Note that we did not check for an invalid low surrogate here, or that fact that the
1421 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1422 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1423 //
1424 }
1425 }
1426 else
1427 {
1428 // We need to go backwards from our input point
1429 //
1430 while (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
1431 {
1432 // Get the previous 16 bit character
1433 //
1434 ch = ((*nextChar - 2) << 8) + (*nextChar -1);
1435 nextChar -= 2;
1436
1437 // If we found a low surrogate then go back one more character if
1438 // the hi surrogate is there
1439 //
1440 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
1441 {
1442 ch2 = ((*nextChar - 2) << 8) + (*nextChar -1);
1443 if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
1444 {
1445 // Yes, there is a high surrogate to match it so decrement one more and point to that
1446 //
1447 nextChar -=2;
1448 }
1449 }
1450 }
1451 }
1452
1453 // Our local copy of nextChar is now pointing to either the correct character or end of file
1454 //
1455 // Input buffer size is always in bytes
1456 //
1457 if ( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1458 {
1459 return ANTLR3_CHARSTREAM_EOF;
1460 }
1461 else
1462 {
1463 // Pick up the next 16 character (big endian byte order)
1464 //
1465 ch = ((*nextChar) << 8) + *(nextChar+1);
1466 nextChar += 2;
1467
1468 // If we have a surrogate pair then we need to consume
1469 // a following valid LO surrogate.
1470 //
1471 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1472 {
1473 // If the 16 bits following the high surrogate are in the source buffer...
1474 //
1475 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1476 {
1477 // Next character is in big endian byte order
1478 //
1479 ch2 = ((*nextChar) << 8) + *(nextChar+1);
1480
1481 // If it's a valid low surrogate, consume it
1482 //
1483 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1484 {
1485 // Construct the UTF32 code point
1486 //
1487 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
1488 + (ch2 - UNI_SUR_LOW_START) + halfBase;
1489 }
1490 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1491 // it.
1492 //
1493 }
1494 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1495 // it because the buffer ended
1496 //
1497 }
1498 }
1499 return ch;
1500 }
1501
1502 /// \brief Common function to setup function interface for a UTF3 input stream.
1503 ///
1504 /// \param input Input stream context pointer
1505 ///
1506 void
antlr3UTF32SetupStream(pANTLR3_INPUT_STREAM input,ANTLR3_BOOLEAN machineBigEndian,ANTLR3_BOOLEAN inputBigEndian)1507 antlr3UTF32SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian)
1508 {
1509 // Build a string factory for this stream. This is a UTF32 string factory which is a standard
1510 // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
1511 // and so on.
1512 //
1513 input->strFactory = antlr3StringFactoryNew(input->encoding);
1514
1515 // Generic API that does not care about endianess.
1516 //
1517 input->istream->index = antlr3UTF32Index; // Calculate current index in input stream, UTF16 based
1518 input->substr = antlr3UTF32Substr; // Return a string from the input stream
1519 input->istream->seek = antlr3UTF32Seek; // How to seek to a specific point in the stream
1520 input->istream->consume = antlr3UTF32Consume; // Consume the next UTF32 character in the buffer
1521
1522 // We must install different UTF32 LA routines according to whether the input
1523 // is the same endianess as the machine we are executing upon or not. If it is not
1524 // then we must install methods that can convert the endianess on the fly as they go
1525 //
1526 switch (machineBigEndian)
1527 {
1528 case ANTLR3_TRUE:
1529
1530 // Machine is Big Endian, if the input is also then install the
1531 // methods that do not access input by bytes and reverse them.
1532 // Otherwise install endian aware methods.
1533 //
1534 if (inputBigEndian == ANTLR3_TRUE)
1535 {
1536 // Input is machine compatible
1537 //
1538 input->istream->_LA = antlr3UTF32LA; // Return the UTF32 character at offset n (1 based)
1539 }
1540 else
1541 {
1542 // Need to use methods that know that the input is little endian
1543 //
1544 input->istream->_LA = antlr3UTF32LALE; // Return the UTF32 character at offset n (1 based)
1545 }
1546 break;
1547
1548 case ANTLR3_FALSE:
1549
1550 // Machine is Little Endian, if the input is also then install the
1551 // methods that do not access input by bytes and reverse them.
1552 // Otherwise install endian aware methods.
1553 //
1554 if (inputBigEndian == ANTLR3_FALSE)
1555 {
1556 // Input is machine compatible
1557 //
1558 input->istream->_LA = antlr3UTF32LA; // Return the UTF32 character at offset n (1 based)
1559 }
1560 else
1561 {
1562 // Need to use methods that know that the input is Big Endian
1563 //
1564 input->istream->_LA = antlr3UTF32LABE; // Return the UTF32 character at offset n (1 based)
1565 }
1566 break;
1567 }
1568
1569 input->charByteSize = 4; // Size in bytes of characters in this stream.
1570 }
1571
1572 /** \brief Consume the next character in a UTF32 input stream
1573 *
1574 * \param input Input stream context pointer
1575 */
1576 static void
antlr3UTF32Consume(pANTLR3_INT_STREAM is)1577 antlr3UTF32Consume(pANTLR3_INT_STREAM is)
1578 {
1579 pANTLR3_INPUT_STREAM input;
1580
1581 input = ((pANTLR3_INPUT_STREAM) (is->super));
1582
1583 // SizeBuf is always in bytes
1584 //
1585 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1586 {
1587 /* Indicate one more character in this line
1588 */
1589 input->charPositionInLine++;
1590
1591 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar)) == input->newlineChar)
1592 {
1593 /* Reset for start of a new line of input
1594 */
1595 input->line++;
1596 input->charPositionInLine = 0;
1597 input->currentLine = (void *)(((pANTLR3_UINT32)input->nextChar) + 1);
1598 }
1599
1600 /* Increment to next character position
1601 */
1602 input->nextChar = (void *)(((pANTLR3_UINT32)input->nextChar) + 1);
1603 }
1604 }
1605
1606 /// \brief Calculate the current index in the output stream.
1607 /// \param[in] input Input stream context pointer
1608 ///
1609 static ANTLR3_MARKER
antlr3UTF32Index(pANTLR3_INT_STREAM is)1610 antlr3UTF32Index(pANTLR3_INT_STREAM is)
1611 {
1612 pANTLR3_INPUT_STREAM input;
1613
1614 input = ((pANTLR3_INPUT_STREAM) (is->super));
1615
1616 return (ANTLR3_MARKER)(input->nextChar);
1617 }
1618
1619 /// \brief Return a substring of the UTF16 input stream in
1620 /// newly allocated memory.
1621 ///
1622 /// \param input Input stream context pointer
1623 /// \param start Offset in input stream where the string starts
1624 /// \param stop Offset in the input stream where the string ends.
1625 ///
1626 static pANTLR3_STRING
antlr3UTF32Substr(pANTLR3_INPUT_STREAM input,ANTLR3_MARKER start,ANTLR3_MARKER stop)1627 antlr3UTF32Substr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
1628 {
1629 return input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/4) + 1);
1630 }
1631
1632 /// \brief Rewind the lexer input to the state specified by the supplied mark.
1633 ///
1634 /// \param[in] input Input stream context pointer
1635 ///
1636 /// \remark
1637 /// Assumes UTF32 input stream.
1638 ///
1639 static void
antlr3UTF32Seek(pANTLR3_INT_STREAM is,ANTLR3_MARKER seekPoint)1640 antlr3UTF32Seek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
1641 {
1642 pANTLR3_INPUT_STREAM input;
1643
1644 input = ((pANTLR3_INPUT_STREAM) is->super);
1645
1646 // If the requested seek point is less than the current
1647 // input point, then we assume that we are resetting from a mark
1648 // and do not need to scan, but can just set to there as rewind will
1649 // reset line numbers and so on.
1650 //
1651 if (seekPoint <= (ANTLR3_MARKER)(input->nextChar))
1652 {
1653 input->nextChar = (void *)seekPoint;
1654 }
1655 else
1656 {
1657 // Call consume until we reach the asked for seek point or EOF
1658 //
1659 while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar)
1660 {
1661 is->consume(is);
1662 }
1663 }
1664 }
1665
1666 /** \brief Return the input element assuming a UTF32 input in natural machine byte order
1667 *
1668 * \param[in] input Input stream context pointer
1669 * \param[in] la 1 based offset of next input stream element
1670 *
1671 * \return Next input character in internal ANTLR3 encoding (UTF32)
1672 */
1673 static ANTLR3_UCHAR
antlr3UTF32LA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1674 antlr3UTF32LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1675 {
1676 pANTLR3_INPUT_STREAM input;
1677
1678 input = ((pANTLR3_INPUT_STREAM) (is->super));
1679
1680 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1681 {
1682 return ANTLR3_CHARSTREAM_EOF;
1683 }
1684 else
1685 {
1686 return (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1687 }
1688 }
1689
1690 /** \brief Return the input element assuming a UTF32 input in little endian byte order
1691 *
1692 * \param[in] input Input stream context pointer
1693 * \param[in] la 1 based offset of next input stream element
1694 *
1695 * \return Next input character in internal ANTLR3 encoding (UTF32)
1696 */
1697 static ANTLR3_UCHAR
antlr3UTF32LALE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1698 antlr3UTF32LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1699 {
1700 pANTLR3_INPUT_STREAM input;
1701
1702 input = ((pANTLR3_INPUT_STREAM) (is->super));
1703
1704 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1705 {
1706 return ANTLR3_CHARSTREAM_EOF;
1707 }
1708 else
1709 {
1710 ANTLR3_UCHAR c;
1711
1712 c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1713
1714 // Swap Endianess to Big Endian
1715 //
1716 return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24);
1717 }
1718 }
1719
1720 /** \brief Return the input element assuming a UTF32 input in big endian byte order
1721 *
1722 * \param[in] input Input stream context pointer
1723 * \param[in] la 1 based offset of next input stream element
1724 *
1725 * \return Next input character in internal ANTLR3 encoding (UTF32)
1726 * \remark This is the same code as LE version but seprated in case there are better optimisations fo rendinan swap
1727 */
1728 static ANTLR3_UCHAR
antlr3UTF32LABE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1729 antlr3UTF32LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1730 {
1731 pANTLR3_INPUT_STREAM input;
1732
1733 input = ((pANTLR3_INPUT_STREAM) (is->super));
1734
1735 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1736 {
1737 return ANTLR3_CHARSTREAM_EOF;
1738 }
1739 else
1740 {
1741 ANTLR3_UCHAR c;
1742
1743 c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1744
1745 // Swap Endianess to Little Endian
1746 //
1747 return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24);
1748 }
1749 }
1750
1751
1752 /// \brief Common function to setup function interface for a UTF8 input stream.
1753 ///
1754 /// \param input Input stream context pointer
1755 ///
1756 void
antlr3UTF8SetupStream(pANTLR3_INPUT_STREAM input)1757 antlr3UTF8SetupStream (pANTLR3_INPUT_STREAM input)
1758 {
1759 // Build a string factory for this stream. This is a UTF16 string factory which is a standard
1760 // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
1761 // and so on.
1762 //
1763 input->strFactory = antlr3StringFactoryNew(input->encoding);
1764
1765 // Generic API that does not care about endianess.
1766 //
1767 input->istream->consume = antlr3UTF8Consume; // Consume the next UTF32 character in the buffer
1768 input->istream->_LA = antlr3UTF8LA; // Return the UTF32 character at offset n (1 based)
1769 input->charByteSize = 0; // Size in bytes of characters in this stream.
1770 }
1771
1772 // ------------------------------------------------------
1773 // Following is from Unicode.org (see antlr3convertutf.c)
1774 //
1775
1776 /// Index into the table below with the first byte of a UTF-8 sequence to
1777 /// get the number of trailing bytes that are supposed to follow it.
1778 /// Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
1779 /// left as-is for anyone who may want to do such conversion, which was
1780 /// allowed in earlier algorithms.
1781 ///
1782 static const ANTLR3_UINT32 trailingBytesForUTF8[256] = {
1783 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1784 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1785 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1786 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1787 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1788 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1789 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1790 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
1791 };
1792
1793 /// Magic values subtracted from a buffer value during UTF8 conversion.
1794 /// This table contains as many values as there might be trailing bytes
1795 /// in a UTF-8 sequence.
1796 ///
1797 static const UTF32 offsetsFromUTF8[6] =
1798 { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
1799 0x03C82080UL, 0xFA082080UL, 0x82082080UL
1800 };
1801
1802 // End of Unicode.org tables
1803 // -------------------------
1804
1805
1806 /** \brief Consume the next character in a UTF8 input stream
1807 *
1808 * \param input Input stream context pointer
1809 */
1810 static void
antlr3UTF8Consume(pANTLR3_INT_STREAM is)1811 antlr3UTF8Consume(pANTLR3_INT_STREAM is)
1812 {
1813 pANTLR3_INPUT_STREAM input;
1814 ANTLR3_UINT32 extraBytesToRead;
1815 ANTLR3_UCHAR ch;
1816 pANTLR3_UINT8 nextChar;
1817
1818 input = ((pANTLR3_INPUT_STREAM) (is->super));
1819
1820 nextChar = (pANTLR3_UINT8)input->nextChar;
1821
1822 if (nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1823 {
1824 // Indicate one more character in this line
1825 //
1826 input->charPositionInLine++;
1827
1828 // Are there more bytes needed to make up the whole thing?
1829 //
1830 extraBytesToRead = trailingBytesForUTF8[*nextChar];
1831
1832 if (nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1833 {
1834 input->nextChar = (((pANTLR3_UINT8)input->data) + input->sizeBuf);
1835 return;
1836 }
1837
1838 // Cases deliberately fall through (see note A in antlrconvertutf.c)
1839 // Legal UTF8 is only 4 bytes but 6 bytes could be used in old UTF8 so
1840 // we allow it.
1841 //
1842 ch = 0;
1843 switch (extraBytesToRead) {
1844 case 5: ch += *nextChar++; ch <<= 6;
1845 case 4: ch += *nextChar++; ch <<= 6;
1846 case 3: ch += *nextChar++; ch <<= 6;
1847 case 2: ch += *nextChar++; ch <<= 6;
1848 case 1: ch += *nextChar++; ch <<= 6;
1849 case 0: ch += *nextChar++;
1850 }
1851
1852 // Magically correct the input value
1853 //
1854 ch -= offsetsFromUTF8[extraBytesToRead];
1855 if (ch == input->newlineChar)
1856 {
1857 /* Reset for start of a new line of input
1858 */
1859 input->line++;
1860 input->charPositionInLine = 0;
1861 input->currentLine = (void *)nextChar;
1862 }
1863
1864 // Update input pointer
1865 //
1866 input->nextChar = nextChar;
1867 }
1868 }
1869 /** \brief Return the input element assuming a UTF8 input
1870 *
1871 * \param[in] input Input stream context pointer
1872 * \param[in] la 1 based offset of next input stream element
1873 *
1874 * \return Next input character in internal ANTLR3 encoding (UTF32)
1875 */
1876 static ANTLR3_UCHAR
antlr3UTF8LA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1877 antlr3UTF8LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1878 {
1879 pANTLR3_INPUT_STREAM input;
1880 ANTLR3_UINT32 extraBytesToRead;
1881 ANTLR3_UCHAR ch;
1882 pANTLR3_UINT8 nextChar;
1883
1884 input = ((pANTLR3_INPUT_STREAM) (is->super));
1885
1886 nextChar = (pANTLR3_UINT8)input->nextChar;
1887
1888 // Do we need to traverse forwards or backwards?
1889 // - LA(0) is treated as LA(1) and we assume that the nextChar is
1890 // already positioned.
1891 // - LA(n+) ; n>1 means we must traverse forward n-1 characters catering for UTF8 encoding
1892 // - LA(-n) means we must traverse backwards n chracters
1893 //
1894 if (la > 1) {
1895
1896 // Make sure that we have at least one character left before trying to
1897 // loop through the buffer.
1898 //
1899 if (nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1900 {
1901 // Now traverse n-1 characters forward
1902 //
1903 while (--la > 0)
1904 {
1905 // Does the next character require trailing bytes?
1906 // If so advance the pointer by that many bytes as well as advancing
1907 // one position for what will be at least a single byte character.
1908 //
1909 nextChar += trailingBytesForUTF8[*nextChar] + 1;
1910
1911 // Does that calculation take us past the byte length of the buffer?
1912 //
1913 if (nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1914 {
1915 return ANTLR3_CHARSTREAM_EOF;
1916 }
1917 }
1918 }
1919 else
1920 {
1921 return ANTLR3_CHARSTREAM_EOF;
1922 }
1923 }
1924 else
1925 {
1926 // LA is negative so we decrease the pointer by n character positions
1927 //
1928 while (nextChar > (pANTLR3_UINT8)input->data && la++ < 0)
1929 {
1930 // Traversing backwards in UTF8 means decermenting by one
1931 // then continuing to decrement while ever a character pattern
1932 // is flagged as being a trailing byte of an encoded code point.
1933 // Trailing UTF8 bytes always start with 10 in binary. We assumne that
1934 // the UTF8 is well formed and do not check boundary conditions
1935 //
1936 nextChar--;
1937 while ((*nextChar & 0xC0) == 0x80)
1938 {
1939 nextChar--;
1940 }
1941 }
1942 }
1943
1944 // nextChar is now pointing at the UTF8 encoded character that we need to
1945 // decode and return.
1946 //
1947 // Are there more bytes needed to make up the whole thing?
1948 //
1949 extraBytesToRead = trailingBytesForUTF8[*nextChar];
1950 if (nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1951 {
1952 return ANTLR3_CHARSTREAM_EOF;
1953 }
1954
1955 // Cases deliberately fall through (see note A in antlrconvertutf.c)
1956 //
1957 ch = 0;
1958 switch (extraBytesToRead) {
1959 case 5: ch += *nextChar++; ch <<= 6;
1960 case 4: ch += *nextChar++; ch <<= 6;
1961 case 3: ch += *nextChar++; ch <<= 6;
1962 case 2: ch += *nextChar++; ch <<= 6;
1963 case 1: ch += *nextChar++; ch <<= 6;
1964 case 0: ch += *nextChar++;
1965 }
1966
1967 // Magically correct the input value
1968 //
1969 ch -= offsetsFromUTF8[extraBytesToRead];
1970
1971 return ch;
1972 }
1973
1974 // EBCDIC to ASCII conversion table
1975 //
1976 // This for EBCDIC EDF04 translated to ISO-8859.1 which is the usually accepted POSIX
1977 // translation and the character tables are published all over the interweb.
1978 //
1979 const ANTLR3_UCHAR e2a[256] =
1980 {
1981 0x00, 0x01, 0x02, 0x03, 0x85, 0x09, 0x86, 0x7f,
1982 0x87, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1983 0x10, 0x11, 0x12, 0x13, 0x8f, 0x0a, 0x08, 0x97,
1984 0x18, 0x19, 0x9c, 0x9d, 0x1c, 0x1d, 0x1e, 0x1f,
1985 0x80, 0x81, 0x82, 0x83, 0x84, 0x92, 0x17, 0x1b,
1986 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07,
1987 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04,
1988 0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a,
1989 0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5,
1990 0xe7, 0xf1, 0x60, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
1991 0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef,
1992 0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x9f,
1993 0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5,
1994 0xc7, 0xd1, 0x5e, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
1995 0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf,
1996 0xcc, 0xa8, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
1997 0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
1998 0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1,
1999 0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70,
2000 0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4,
2001 0xb5, 0xaf, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
2002 0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0xdd, 0xde, 0xae,
2003 0xa2, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc,
2004 0xbd, 0xbe, 0xac, 0x5b, 0x5c, 0x5d, 0xb4, 0xd7,
2005 0xf9, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
2006 0x48, 0x49, 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5,
2007 0xa6, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
2008 0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xdb, 0xfa, 0xff,
2009 0xd9, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
2010 0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5,
2011 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
2012 0x38, 0x39, 0xb3, 0x7b, 0xdc, 0x7d, 0xda, 0x7e
2013 };
2014
2015 /// \brief Common function to setup function interface for a EBCDIC input stream.
2016 ///
2017 /// \param input Input stream context pointer
2018 ///
2019 void
antlr3EBCDICSetupStream(pANTLR3_INPUT_STREAM input)2020 antlr3EBCDICSetupStream (pANTLR3_INPUT_STREAM input)
2021 {
2022 // EBCDIC streams can use the standard 8 bit string factory
2023 //
2024 input->strFactory = antlr3StringFactoryNew(input->encoding);
2025
2026 // Generic API that does not care about endianess.
2027 //
2028 input->istream->_LA = antlr3EBCDICLA; // Return the UTF32 character at offset n (1 based)
2029 input->charByteSize = 1; // Size in bytes of characters in this stream.
2030 }
2031
2032 /// \brief Return the input element assuming an 8 bit EBCDIC input
2033 ///
2034 /// \param[in] input Input stream context pointer
2035 /// \param[in] la 1 based offset of next input stream element
2036 ///
2037 /// \return Next input character in internal ANTLR3 encoding (UTF32) after translation
2038 /// from EBCDIC to ASCII
2039 ///
2040 static ANTLR3_UCHAR
antlr3EBCDICLA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)2041 antlr3EBCDICLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
2042 {
2043 pANTLR3_INPUT_STREAM input;
2044
2045 input = ((pANTLR3_INPUT_STREAM) (is->super));
2046
2047 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
2048 {
2049 return ANTLR3_CHARSTREAM_EOF;
2050 }
2051 else
2052 {
2053 // Translate the required character via the constant conversion table
2054 //
2055 return e2a[(*((pANTLR3_UINT8)input->nextChar + la - 1))];
2056 }
2057 }