1 /// \file
2 /// Base functions to initialize and manipulate any input stream
3 ///
4
5 // [The "BSD licence"]
6 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
7 // http://www.temporal-wave.com
8 // http://www.linkedin.com/in/jimidle
9 //
10 // All rights reserved.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions
14 // are met:
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 // 3. The name of the author may not be used to endorse or promote products
21 // derived from this software without specific prior written permission.
22 //
23 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
24 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
25 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
26 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
27 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
28 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
32 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
34 #include <antlr3input.h>
35
36 // -----------------------------------
37 // Generic 8 bit input such as latin-1
38 //
39
40 // 8Bit INT Stream API
41 //
42 static void antlr38BitConsume (pANTLR3_INT_STREAM is);
43 static ANTLR3_UCHAR antlr38BitLA (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
44 static ANTLR3_UCHAR antlr38BitLA_ucase (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
45 static ANTLR3_MARKER antlr38BitIndex (pANTLR3_INT_STREAM is);
46 static ANTLR3_MARKER antlr38BitMark (pANTLR3_INT_STREAM is);
47 static void antlr38BitRewind (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
48 static void antlr38BitRewindLast (pANTLR3_INT_STREAM is);
49 static void antlr38BitRelease (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
50 static void antlr38BitSeek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
51 static pANTLR3_STRING antlr38BitGetSourceName (pANTLR3_INT_STREAM is);
52
53 // 8Bit Charstream API functions
54 //
55 static void antlr3InputClose (pANTLR3_INPUT_STREAM input);
56 static void antlr3InputReset (pANTLR3_INPUT_STREAM input);
57 static void antlr38BitReuse (pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name);
58 static void * antlr38BitLT (pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt);
59 static ANTLR3_UINT32 antlr38BitSize (pANTLR3_INPUT_STREAM input);
60 static pANTLR3_STRING antlr38BitSubstr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
61 static ANTLR3_UINT32 antlr38BitGetLine (pANTLR3_INPUT_STREAM input);
62 static void * antlr38BitGetLineBuf (pANTLR3_INPUT_STREAM input);
63 static ANTLR3_UINT32 antlr38BitGetCharPosition (pANTLR3_INPUT_STREAM input);
64 static void antlr38BitSetLine (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line);
65 static void antlr38BitSetCharPosition (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position);
66 static void antlr38BitSetNewLineChar (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar);
67 static void antlr38BitSetUcaseLA (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag);
68
69 // -----------------------------------
70 // UTF16 (also covers UCS2)
71 //
72 // INT Stream API
73 //
74 static void antlr3UTF16Consume (pANTLR3_INT_STREAM is);
75 static ANTLR3_UCHAR antlr3UTF16LA (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
76 static void antlr3UTF16ConsumeLE (pANTLR3_INT_STREAM is);
77 static ANTLR3_UCHAR antlr3UTF16LALE (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
78 static void antlr3UTF16ConsumeBE (pANTLR3_INT_STREAM is);
79 static ANTLR3_UCHAR antlr3UTF16LABE (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
80 static ANTLR3_MARKER antlr3UTF16Index (pANTLR3_INT_STREAM is);
81 static void antlr3UTF16Seek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
82
83 // UTF16 Charstream API functions
84 //
85 static pANTLR3_STRING antlr3UTF16Substr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
86
87 // -----------------------------------
// UTF32 (also covers UCS4)
89 //
90 // INT Stream API
91 //
92 static void antlr3UTF32Consume (pANTLR3_INT_STREAM is);
93 static ANTLR3_UCHAR antlr3UTF32LA (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
94 static ANTLR3_UCHAR antlr3UTF32LALE (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
95 static ANTLR3_UCHAR antlr3UTF32LABE (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
96 static ANTLR3_MARKER antlr3UTF32Index (pANTLR3_INT_STREAM is);
97 static void antlr3UTF32Seek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
98
// UTF32 Charstream API functions
100 //
101 static pANTLR3_STRING antlr3UTF32Substr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
102
103 // ------------------------------------
104 // UTF-8
105 //
106 static void antlr3UTF8Consume (pANTLR3_INT_STREAM is);
107 static ANTLR3_UCHAR antlr3UTF8LA (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
108
109 // ------------------------------------
110 // EBCDIC
111 //
112 static ANTLR3_UCHAR antlr3EBCDICLA (pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
113
114 /// \brief Common function to setup function interface for an 8 bit input stream.
115 ///
116 /// \param input Input stream context pointer
117 ///
118 /// \remark
119 /// - Many of the 8 bit oriented file stream handling functions will be usable
120 /// by any or at least some, other input streams. Therefore it is perfectly acceptable
121 /// to call this function to install the 8Bit handler then override just those functions
122 /// that would not work for the particular input encoding, such as consume for instance.
123 ///
124 void
antlr38BitSetupStream(pANTLR3_INPUT_STREAM input)125 antlr38BitSetupStream (pANTLR3_INPUT_STREAM input)
126 {
127 // Build a string factory for this stream
128 //
129 input->strFactory = antlr3StringFactoryNew(input->encoding);
130
131 // Default stream API set up is for 8Bit, so we are done
132 //
133 }
134
135 void
antlr3GenericSetupStream(pANTLR3_INPUT_STREAM input)136 antlr3GenericSetupStream (pANTLR3_INPUT_STREAM input)
137 {
138 /* Install function pointers for an 8 bit input
139 */
140
141 /* Allocate stream interface
142 */
143 input->istream = antlr3IntStreamNew();
144 input->istream->type = ANTLR3_CHARSTREAM;
145 input->istream->super = input;
146
147 /* Intstream API
148 */
149 input->istream->consume = antlr38BitConsume; // Consume the next 8 bit character in the buffer
150 input->istream->_LA = antlr38BitLA; // Return the UTF32 character at offset n (1 based)
151 input->istream->index = antlr38BitIndex; // Current index (offset from first character
152 input->istream->mark = antlr38BitMark; // Record the current lex state for later restore
153 input->istream->rewind = antlr38BitRewind; // How to rewind the input
154 input->istream->rewindLast = antlr38BitRewindLast; // How to rewind the input
155 input->istream->seek = antlr38BitSeek; // How to seek to a specific point in the stream
156 input->istream->release = antlr38BitRelease; // Reset marks after mark n
157 input->istream->getSourceName = antlr38BitGetSourceName; // Return a string that names the input source
158
159 /* Charstream API
160 */
161 input->close = antlr3InputClose; // Close down the stream completely
162 input->free = antlr3InputClose; // Synonym for free
163 input->reset = antlr3InputReset; // Reset input to start
164 input->reuse = antlr38BitReuse; // Install a new input string and reset
165 input->_LT = antlr38BitLT; // Same as _LA for 8 bit file
166 input->size = antlr38BitSize; // Return the size of the input buffer
167 input->substr = antlr38BitSubstr; // Return a string from the input stream
168 input->getLine = antlr38BitGetLine; // Return the current line number in the input stream
169 input->getLineBuf = antlr38BitGetLineBuf; // Return a pointer to the start of the current line being consumed
170 input->getCharPositionInLine = antlr38BitGetCharPosition; // Return the offset into the current line of input
171 input->setLine = antlr38BitSetLine; // Set the input stream line number (does not set buffer pointers)
172 input->setCharPositionInLine = antlr38BitSetCharPosition; // Set the offset in to the current line (does not set any pointers)
173 input->SetNewLineChar = antlr38BitSetNewLineChar; // Set the value of the newline trigger character
174 input->setUcaseLA = antlr38BitSetUcaseLA; // Changes the LA function to return upper case always
175
176 input->charByteSize = 1; // Size in bytes of characters in this stream.
177
178 /* Initialize entries for tables etc
179 */
180 input->markers = NULL;
181
182 /* Set up the input stream brand new
183 */
184 input->reset(input);
185
186 /* Install default line separator character (it can be replaced
187 * by the grammar programmer later)
188 */
189 input->SetNewLineChar(input, (ANTLR3_UCHAR)'\n');
190 }
191
192 static pANTLR3_STRING
antlr38BitGetSourceName(pANTLR3_INT_STREAM is)193 antlr38BitGetSourceName(pANTLR3_INT_STREAM is)
194 {
195 return is->streamName;
196 }
197
198 /** \brief Close down an input stream and free any memory allocated by it.
199 *
200 * \param input Input stream context pointer
201 */
202 static void
antlr3InputClose(pANTLR3_INPUT_STREAM input)203 antlr3InputClose(pANTLR3_INPUT_STREAM input)
204 {
205 // Close any markers in the input stream
206 //
207 if (input->markers != NULL)
208 {
209 input->markers->free(input->markers);
210 input->markers = NULL;
211 }
212
213 // Close the string factory
214 //
215 if (input->strFactory != NULL)
216 {
217 input->strFactory->close(input->strFactory);
218 }
219
220 // Free the input stream buffer if we allocated it
221 //
222 if (input->isAllocated && input->data != NULL)
223 {
224 ANTLR3_FREE(input->data);
225 input->data = NULL;
226 }
227
228 input->istream->free(input->istream);
229
230 // Finally, free the space for the structure itself
231 //
232 ANTLR3_FREE(input);
233
234 // Done
235 //
236 }
237
238 static void
antlr38BitSetUcaseLA(pANTLR3_INPUT_STREAM input,ANTLR3_BOOLEAN flag)239 antlr38BitSetUcaseLA (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag)
240 {
241 if (flag)
242 {
243 // Return the upper case version of the characters
244 //
245 input->istream->_LA = antlr38BitLA_ucase;
246 }
247 else
248 {
249 // Return the raw characters as they are in the buffer
250 //
251 input->istream->_LA = antlr38BitLA;
252 }
253 }
254
255
256 /** \brief Reset a re-startable input stream to the start
257 *
258 * \param input Input stream context pointer
259 */
260 static void
antlr3InputReset(pANTLR3_INPUT_STREAM input)261 antlr3InputReset(pANTLR3_INPUT_STREAM input)
262 {
263
264 input->nextChar = input->data; /* Input at first character */
265 input->line = 1; /* starts at line 1 */
266 input->charPositionInLine = -1;
267 input->currentLine = input->data;
268 input->markDepth = 0; /* Reset markers */
269
270 /* Clear out up the markers table if it is there
271 */
272 if (input->markers != NULL)
273 {
274 input->markers->clear(input->markers);
275 }
276 else
277 {
278 /* Install a new markers table
279 */
280 input->markers = antlr3VectorNew(0);
281 }
282 }
283
284 /** Install a new source code in to a working input stream so that the
285 * input stream can be reused.
286 */
287 static void
antlr38BitReuse(pANTLR3_INPUT_STREAM input,pANTLR3_UINT8 inString,ANTLR3_UINT32 size,pANTLR3_UINT8 name)288 antlr38BitReuse(pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
289 {
290 input->isAllocated = ANTLR3_FALSE;
291 input->data = inString;
292 input->sizeBuf = size;
293
294 // Now we can set up the file name. As we are reusing the stream, there may already
295 // be a string that we can reuse for holding the filename.
296 //
297 if (input->istream->streamName == NULL)
298 {
299 input->istream->streamName = input->strFactory->newStr(input->strFactory, name == NULL ? (pANTLR3_UINT8)"-memory-" : name);
300 input->fileName = input->istream->streamName;
301 }
302 else
303 {
304 input->istream->streamName->set(input->istream->streamName, (name == NULL ? (const char *)"-memory-" : (const char *)name));
305 }
306
307 input->reset(input);
308 }
309
310 /** \brief Consume the next character in an 8 bit input stream
311 *
312 * \param input Input stream context pointer
313 */
314 static void
antlr38BitConsume(pANTLR3_INT_STREAM is)315 antlr38BitConsume(pANTLR3_INT_STREAM is)
316 {
317 pANTLR3_INPUT_STREAM input;
318
319 input = ((pANTLR3_INPUT_STREAM) (is->super));
320
321 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
322 {
323 /* Indicate one more character in this line
324 */
325 input->charPositionInLine++;
326
327 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar)) == input->newlineChar)
328 {
329 /* Reset for start of a new line of input
330 */
331 input->line++;
332 input->charPositionInLine = 0;
333 input->currentLine = (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
334 }
335
336 /* Increment to next character position
337 */
338 input->nextChar = (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
339 }
340 }
341
342 /** \brief Return the input element assuming an 8 bit ascii input
343 *
344 * \param[in] input Input stream context pointer
345 * \param[in] la 1 based offset of next input stream element
346 *
347 * \return Next input character in internal ANTLR3 encoding (UTF32)
348 */
349 static ANTLR3_UCHAR
antlr38BitLA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)350 antlr38BitLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
351 {
352 pANTLR3_INPUT_STREAM input;
353
354 input = ((pANTLR3_INPUT_STREAM) (is->super));
355
356 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
357 {
358 return ANTLR3_CHARSTREAM_EOF;
359 }
360 else
361 {
362 return (ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar + la - 1));
363 }
364 }
365
366 /** \brief Return the input element assuming an 8 bit input and
367 * always return the UPPER CASE character.
368 * Note that this is 8 bit and so we assume that the toupper
369 * function will use the correct locale for 8 bits.
370 *
371 * \param[in] input Input stream context pointer
372 * \param[in] la 1 based offset of next input stream element
373 *
374 * \return Next input character in internal ANTLR3 encoding (UTF32)
375 */
376 static ANTLR3_UCHAR
antlr38BitLA_ucase(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)377 antlr38BitLA_ucase (pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
378 {
379 pANTLR3_INPUT_STREAM input;
380
381 input = ((pANTLR3_INPUT_STREAM) (is->super));
382
383 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
384 {
385 return ANTLR3_CHARSTREAM_EOF;
386 }
387 else
388 {
389 return (ANTLR3_UCHAR)toupper((*((pANTLR3_UINT8)input->nextChar + la - 1)));
390 }
391 }
392
393
394 /** \brief Return the input element assuming an 8 bit ascii input
395 *
396 * \param[in] input Input stream context pointer
397 * \param[in] lt 1 based offset of next input stream element
398 *
399 * \return Next input character in internal ANTLR3 encoding (UTF32)
400 */
401 static void *
antlr38BitLT(pANTLR3_INPUT_STREAM input,ANTLR3_INT32 lt)402 antlr38BitLT(pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt)
403 {
404 /* Casting is horrible but it means no warnings and LT should never be called
405 * on a character stream anyway I think. If it is then, the void * will need to be
406 * cast back in a similar manner. Yuck! But this means that LT for Token streams and
407 * tree streams is correct.
408 */
409 return (ANTLR3_FUNC_PTR(input->istream->_LA(input->istream, lt)));
410 }
411
412 /** \brief Calculate the current index in the output stream.
413 * \param[in] input Input stream context pointer
414 */
415 static ANTLR3_MARKER
antlr38BitIndex(pANTLR3_INT_STREAM is)416 antlr38BitIndex(pANTLR3_INT_STREAM is)
417 {
418 pANTLR3_INPUT_STREAM input;
419
420 input = ((pANTLR3_INPUT_STREAM) (is->super));
421
422 return (ANTLR3_MARKER)(((pANTLR3_UINT8)input->nextChar));
423 }
424
425 /** \brief Return the size of the current input stream, as an 8Bit file
426 * which in this case is the total input. Other implementations may provide
427 * more sophisticated implementations to deal with non-recoverable streams
428 * and so on.
429 *
430 * \param[in] input Input stream context pointer
431 */
432 static ANTLR3_UINT32
antlr38BitSize(pANTLR3_INPUT_STREAM input)433 antlr38BitSize(pANTLR3_INPUT_STREAM input)
434 {
435 return input->sizeBuf;
436 }
437
438 /** \brief Mark the current input point in an 8Bit 8 bit stream
439 * such as a file stream, where all the input is available in the
440 * buffer.
441 *
442 * \param[in] is Input stream context pointer
443 */
444 static ANTLR3_MARKER
antlr38BitMark(pANTLR3_INT_STREAM is)445 antlr38BitMark (pANTLR3_INT_STREAM is)
446 {
447 pANTLR3_LEX_STATE state;
448 pANTLR3_INPUT_STREAM input;
449
450 input = ((pANTLR3_INPUT_STREAM) (is->super));
451
452 /* New mark point
453 */
454 input->markDepth++;
455
456 /* See if we are revisiting a mark as we can just reuse the vector
457 * entry if we are, otherwise, we need a new one
458 */
459 if (input->markDepth > input->markers->count)
460 {
461 state = ANTLR3_MALLOC(sizeof(ANTLR3_LEX_STATE));
462
463 /* Add it to the table
464 */
465 input->markers->add(input->markers, state, ANTLR3_FREE_FUNC); /* No special structure, just free() on delete */
466 }
467 else
468 {
469 state = (pANTLR3_LEX_STATE)input->markers->get(input->markers, input->markDepth - 1);
470
471 /* Assume no errors for speed, it will just blow up if the table failed
472 * for some reasons, hence lots of unit tests on the tables ;-)
473 */
474 }
475
476 /* We have created or retrieved the state, so update it with the current
477 * elements of the lexer state.
478 */
479 state->charPositionInLine = input->charPositionInLine;
480 state->currentLine = input->currentLine;
481 state->line = input->line;
482 state->nextChar = input->nextChar;
483
484 is->lastMarker = input->markDepth;
485
486 /* And that's it
487 */
488 return input->markDepth;
489 }
490 /** \brief Rewind the lexer input to the state specified by the last produced mark.
491 *
492 * \param[in] input Input stream context pointer
493 *
494 * \remark
495 * Assumes 8 Bit input stream.
496 */
497 static void
antlr38BitRewindLast(pANTLR3_INT_STREAM is)498 antlr38BitRewindLast (pANTLR3_INT_STREAM is)
499 {
500 is->rewind(is, is->lastMarker);
501 }
502
503 /** \brief Rewind the lexer input to the state specified by the supplied mark.
504 *
505 * \param[in] input Input stream context pointer
506 *
507 * \remark
508 * Assumes 8 Bit input stream.
509 */
510 static void
antlr38BitRewind(pANTLR3_INT_STREAM is,ANTLR3_MARKER mark)511 antlr38BitRewind (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
512 {
513 pANTLR3_LEX_STATE state;
514 pANTLR3_INPUT_STREAM input;
515
516 input = ((pANTLR3_INPUT_STREAM) is->super);
517
518 /* Perform any clean up of the marks
519 */
520 input->istream->release(input->istream, mark);
521
522 /* Find the supplied mark state
523 */
524 state = (pANTLR3_LEX_STATE)input->markers->get(input->markers, (ANTLR3_UINT32)(mark - 1));
525
526 /* Seek input pointer to the requested point (note we supply the void *pointer
527 * to whatever is implementing the int stream to seek).
528 */
529 antlr38BitSeek(is, (ANTLR3_MARKER)(state->nextChar));
530
531 /* Reset to the reset of the information in the mark
532 */
533 input->charPositionInLine = state->charPositionInLine;
534 input->currentLine = state->currentLine;
535 input->line = state->line;
536 input->nextChar = state->nextChar;
537
538 /* And we are done
539 */
540 }
541
542 /** \brief Rewind the lexer input to the state specified by the supplied mark.
543 *
544 * \param[in] input Input stream context pointer
545 *
546 * \remark
547 * Assumes 8 Bit input stream.
548 */
549 static void
antlr38BitRelease(pANTLR3_INT_STREAM is,ANTLR3_MARKER mark)550 antlr38BitRelease (pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
551 {
552 pANTLR3_INPUT_STREAM input;
553
554 input = ((pANTLR3_INPUT_STREAM) (is->super));
555
556 /* We don't do much here in fact as we never free any higher marks in
557 * the hashtable as we just resuse any memory allocated for them.
558 */
559 input->markDepth = (ANTLR3_UINT32)(mark - 1);
560 }
561
562 /** \brief Rewind the lexer input to the state specified by the supplied mark.
563 *
564 * \param[in] input Input stream context pointer
565 *
566 * \remark
567 * Assumes 8 Bit input stream.
568 */
569 static void
antlr38BitSeek(pANTLR3_INT_STREAM is,ANTLR3_MARKER seekPoint)570 antlr38BitSeek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
571 {
572 ANTLR3_INT32 count;
573 pANTLR3_INPUT_STREAM input;
574
575 input = ANTLR3_FUNC_PTR(((pANTLR3_INPUT_STREAM) is->super));
576
577 /* If the requested seek point is less than the current
578 * input point, then we assume that we are resetting from a mark
579 * and do not need to scan, but can just set to there.
580 */
581 if (seekPoint <= (ANTLR3_MARKER)(input->nextChar))
582 {
583 input->nextChar = ((pANTLR3_UINT8) seekPoint);
584 }
585 else
586 {
587 count = (ANTLR3_UINT32)(seekPoint - (ANTLR3_MARKER)(input->nextChar));
588
589 while (count--)
590 {
591 is->consume(is);
592 }
593 }
594 }
595 /** Return a substring of the 8 bit input stream in
596 * newly allocated memory.
597 *
598 * \param input Input stream context pointer
599 * \param start Offset in input stream where the string starts
600 * \param stop Offset in the input stream where the string ends.
601 */
602 static pANTLR3_STRING
antlr38BitSubstr(pANTLR3_INPUT_STREAM input,ANTLR3_MARKER start,ANTLR3_MARKER stop)603 antlr38BitSubstr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
604 {
605 return input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, (ANTLR3_UINT32)(stop - start + 1));
606 }
607
608 /** \brief Return the line number as understood by the 8 bit input stream.
609 *
610 * \param input Input stream context pointer
611 * \return Line number in input stream that we believe we are working on.
612 */
613 static ANTLR3_UINT32
antlr38BitGetLine(pANTLR3_INPUT_STREAM input)614 antlr38BitGetLine (pANTLR3_INPUT_STREAM input)
615 {
616 return input->line;
617 }
618
619 /** Return a pointer into the input stream that points at the start
620 * of the current input line as triggered by the end of line character installed
621 * for the stream ('\n' unless told differently).
622 *
623 * \param[in] input
624 */
625 static void *
antlr38BitGetLineBuf(pANTLR3_INPUT_STREAM input)626 antlr38BitGetLineBuf (pANTLR3_INPUT_STREAM input)
627 {
628 return input->currentLine;
629 }
630
631 /** Return the current offset in to the current line in the input stream.
632 *
633 * \param input Input stream context pointer
634 * \return Current line offset
635 */
636 static ANTLR3_UINT32
antlr38BitGetCharPosition(pANTLR3_INPUT_STREAM input)637 antlr38BitGetCharPosition (pANTLR3_INPUT_STREAM input)
638 {
639 return input->charPositionInLine;
640 }
641
642 /** Set the current line number as understood by the input stream.
643 *
644 * \param input Input stream context pointer
645 * \param line Line number to tell the input stream we are on
646 *
647 * \remark
648 * This function does not change any pointers, it just allows the programmer to set the
649 * line number according to some external criterion, such as finding a lexed directive
650 * like: #nnn "file.c" for instance, such that error reporting and so on in is in sync
651 * with some original source format.
652 */
653 static void
antlr38BitSetLine(pANTLR3_INPUT_STREAM input,ANTLR3_UINT32 line)654 antlr38BitSetLine (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line)
655 {
656 input->line = line;
657 }
658
659 /** Set the current offset in the current line to be a particular setting.
660 *
661 * \param[in] input Input stream context pointer
662 * \param[in] position New setting for current offset.
663 *
664 * \remark
665 * This does not set the actual pointers in the input stream, it is purely for reporting
666 * purposes and so on as per antlr38BitSetLine();
667 */
668 static void
antlr38BitSetCharPosition(pANTLR3_INPUT_STREAM input,ANTLR3_UINT32 position)669 antlr38BitSetCharPosition (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position)
670 {
671 input->charPositionInLine = position;
672 }
673
674 /** Set the newline trigger character in the input stream to the supplied parameter.
675 *
676 * \param[in] input Input stream context pointer
677 * \param[in] newlineChar Character to set to be the newline trigger.
678 *
679 * \remark
680 * - The supplied newLineChar is in UTF32 encoding (which means ASCII and latin1 etc
681 * are the same encodings), but the input stream catered to by this function is 8 bit
682 * only, so it is up to the programmer to ensure that the character supplied is valid.
683 */
684 static void
antlr38BitSetNewLineChar(pANTLR3_INPUT_STREAM input,ANTLR3_UINT32 newlineChar)685 antlr38BitSetNewLineChar (pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar)
686 {
687 input->newlineChar = newlineChar;
688 }
689
690
691 /// \brief Common function to setup function interface for a UTF16 or UCS2 input stream.
692 ///
693 /// \param input Input stream context pointer
694 ///
695 /// \remark
696 /// - Strictly speaking, there is no such thing as a UCS2 input stream as the term
697 /// tends to confuse the notions of character encoding, unicode and so on. UCS2 is
698 /// essentially UTF16 without any surrogates and so the standard UTF16
699 /// input stream is able to handle it without any special code.
700 ///
701 void
antlr3UTF16SetupStream(pANTLR3_INPUT_STREAM input,ANTLR3_BOOLEAN machineBigEndian,ANTLR3_BOOLEAN inputBigEndian)702 antlr3UTF16SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian)
703 {
704 // Build a string factory for this stream. This is a UTF16 string factory which is a standard
705 // part of the ANTLR3 string. The string factory is then passed through the whole chain
706 // of lexer->parser->tree->treeparser and so on.
707 //
708 input->strFactory = antlr3StringFactoryNew(input->encoding);
709
710 // Generic API that does not care about endianess.
711 //
712 input->istream->index = antlr3UTF16Index; // Calculate current index in input stream, UTF16 based
713 input->substr = antlr3UTF16Substr; // Return a string from the input stream
714 input->istream->seek = antlr3UTF16Seek; // How to seek to a specific point in the stream
715
716 // We must install different UTF16 routines according to whether the input
717 // is the same endianess as the machine we are executing upon or not. If it is not
718 // then we must install methods that can convert the endianess on the fly as they go
719 //
720
721 switch (machineBigEndian)
722 {
723 case ANTLR3_TRUE:
724
725 // Machine is Big Endian, if the input is also then install the
726 // methods that do not access input by bytes and reverse them.
727 // Otherwise install endian aware methods.
728 //
729 if (inputBigEndian == ANTLR3_TRUE)
730 {
731 // Input is machine compatible
732 //
733 input->istream->consume = antlr3UTF16Consume; // Consume the next UTF16 character in the buffer
734 input->istream->_LA = antlr3UTF16LA; // Return the UTF32 character at offset n (1 based)
735 }
736 else
737 {
738 // Need to use methods that know that the input is little endian
739 //
740 input->istream->consume = antlr3UTF16ConsumeLE; // Consume the next UTF16 character in the buffer
741 input->istream->_LA = antlr3UTF16LALE; // Return the UTF32 character at offset n (1 based)
742 }
743 break;
744
745 case ANTLR3_FALSE:
746
747 // Machine is Little Endian, if the input is also then install the
748 // methods that do not access input by bytes and reverse them.
749 // Otherwise install endian aware methods.
750 //
751 if (inputBigEndian == ANTLR3_FALSE)
752 {
753 // Input is machine compatible
754 //
755 input->istream->consume = antlr3UTF16Consume; // Consume the next UTF16 character in the buffer
756 input->istream->_LA = antlr3UTF16LA; // Return the UTF32 character at offset n (1 based)
757 }
758 else
759 {
760 // Need to use methods that know that the input is Big Endian
761 //
762 input->istream->consume = antlr3UTF16ConsumeBE; // Consume the next UTF16 character in the buffer
763 input->istream->_LA = antlr3UTF16LABE; // Return the UTF32 character at offset n (1 based)
764 }
765 break;
766 }
767
768
769 input->charByteSize = 2; // Size in bytes of characters in this stream.
770
771 }
772
773 /// \brief Consume the next character in a UTF16 input stream
774 ///
775 /// \param input Input stream context pointer
776 ///
777 static void
antlr3UTF16Consume(pANTLR3_INT_STREAM is)778 antlr3UTF16Consume(pANTLR3_INT_STREAM is)
779 {
780 pANTLR3_INPUT_STREAM input;
781 UTF32 ch;
782 UTF32 ch2;
783
784 input = ((pANTLR3_INPUT_STREAM) (is->super));
785
786 // Buffer size is always in bytes
787 //
788 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
789 {
790 // Indicate one more character in this line
791 //
792 input->charPositionInLine++;
793
794 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
795 {
796 // Reset for start of a new line of input
797 //
798 input->line++;
799 input->charPositionInLine = 0;
800 input->currentLine = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
801 }
802
803 // Increment to next character position, accounting for any surrogates
804 //
805 // Next char in natural machine byte order
806 //
807 ch = *((UTF16*)input->nextChar);
808
809 // We consumed one 16 bit character
810 //
811 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
812
813 // If we have a surrogate pair then we need to consume
814 // a following valid LO surrogate.
815 //
816 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
817
818 // If the 16 bits following the high surrogate are in the source buffer...
819 //
820 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
821 {
822 // Next character is in natural machine byte order
823 //
824 ch2 = *((UTF16*)input->nextChar);
825
826 // If it's a valid low surrogate, consume it
827 //
828 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
829 {
830 // We consumed one 16 bit character
831 //
832 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
833 }
834 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
835 // it.
836 //
837 }
838 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
839 // it because the buffer ended
840 //
841 }
842 // Note that we did not check for an invalid low surrogate here, or that fact that the
843 // lo surrogate was missing. We just picked out one 16 bit character unless the character
844 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
845 //
846 }
847 }
848
849 /// \brief Return the input element assuming an 8 bit ascii input
850 ///
851 /// \param[in] input Input stream context pointer
852 /// \param[in] la 1 based offset of next input stream element
853 ///
854 /// \return Next input character in internal ANTLR3 encoding (UTF32)
855 ///
856 static ANTLR3_UCHAR
antlr3UTF16LA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)857 antlr3UTF16LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
858 {
859 pANTLR3_INPUT_STREAM input;
860 UTF32 ch;
861 UTF32 ch2;
862 UTF16 * nextChar;
863
864 // Find the input interface and where we are currently pointing to
865 // in the input stream
866 //
867 input = ((pANTLR3_INPUT_STREAM) (is->super));
868 nextChar = input->nextChar;
869
870 // If a positive offset then advance forward, else retreat
871 //
872 if (la >= 0)
873 {
874 while (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
875 {
876 // Advance our copy of the input pointer
877 //
878 // Next char in natural machine byte order
879 //
880 ch = *nextChar++;
881
882 // If we have a surrogate pair then we need to consume
883 // a following valid LO surrogate.
884 //
885 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
886 {
887 // If the 16 bits following the high surrogate are in the source buffer...
888 //
889 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
890 {
891 // Next character is in natural machine byte order
892 //
893 ch2 = *nextChar;
894
895 // If it's a valid low surrogate, consume it
896 //
897 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
898 {
899 // We consumed one 16 bit character
900 //
901 nextChar++;
902 }
903 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
904 // it.
905 //
906 }
907 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
908 // it because the buffer ended
909 //
910 }
911 // Note that we did not check for an invalid low surrogate here, or that fact that the
912 // lo surrogate was missing. We just picked out one 16 bit character unless the character
913 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
914 //
915 }
916 }
917 else
918 {
919 // We need to go backwards from our input point
920 //
921 while (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
922 {
923 // Get the previous 16 bit character
924 //
925 ch = *--nextChar;
926
927 // If we found a low surrogate then go back one more character if
928 // the hi surrogate is there
929 //
930 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
931 {
932 ch2 = *(nextChar-1);
933 if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
934 {
935 // Yes, there is a high surrogate to match it so decrement one more and point to that
936 //
937 nextChar--;
938 }
939 }
940 }
941 }
942
943 // Our local copy of nextChar is now pointing to either the correct character or end of file
944 //
945 // Input buffer size is always in bytes
946 //
947 if ( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
948 {
949 return ANTLR3_CHARSTREAM_EOF;
950 }
951 else
952 {
953 // Pick up the next 16 character (native machine byte order)
954 //
955 ch = *nextChar++;
956
957 // If we have a surrogate pair then we need to consume
958 // a following valid LO surrogate.
959 //
960 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
961 {
962 // If the 16 bits following the high surrogate are in the source buffer...
963 //
964 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
965 {
966 // Next character is in natural machine byte order
967 //
968 ch2 = *nextChar;
969
970 // If it's a valid low surrogate, consume it
971 //
972 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
973 {
974 // Construct the UTF32 code point
975 //
976 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
977 + (ch2 - UNI_SUR_LOW_START) + halfBase;
978 }
979 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
980 // it.
981 //
982 }
983 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
984 // it because the buffer ended
985 //
986 }
987 }
988 return ch;
989 }
990
991
992 /// \brief Calculate the current index in the output stream.
993 /// \param[in] input Input stream context pointer
994 ///
995 static ANTLR3_MARKER
antlr3UTF16Index(pANTLR3_INT_STREAM is)996 antlr3UTF16Index(pANTLR3_INT_STREAM is)
997 {
998 pANTLR3_INPUT_STREAM input;
999
1000 input = ((pANTLR3_INPUT_STREAM) (is->super));
1001
1002 return (ANTLR3_MARKER)(input->nextChar);
1003 }
1004
1005 /// \brief Rewind the lexer input to the state specified by the supplied mark.
1006 ///
1007 /// \param[in] input Input stream context pointer
1008 ///
1009 /// \remark
1010 /// Assumes UTF16 input stream.
1011 ///
1012 static void
antlr3UTF16Seek(pANTLR3_INT_STREAM is,ANTLR3_MARKER seekPoint)1013 antlr3UTF16Seek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
1014 {
1015 pANTLR3_INPUT_STREAM input;
1016
1017 input = ((pANTLR3_INPUT_STREAM) is->super);
1018
1019 // If the requested seek point is less than the current
1020 // input point, then we assume that we are resetting from a mark
1021 // and do not need to scan, but can just set to there as rewind will
1022 // reset line numbers and so on.
1023 //
1024 if (seekPoint <= (ANTLR3_MARKER)(input->nextChar))
1025 {
1026 input->nextChar = (void *)seekPoint;
1027 }
1028 else
1029 {
1030 // Call consume until we reach the asked for seek point or EOF
1031 //
1032 while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar)
1033 {
1034 is->consume(is);
1035 }
1036 }
1037 }
1038 /// \brief Return a substring of the UTF16 input stream in
1039 /// newly allocated memory.
1040 ///
1041 /// \param input Input stream context pointer
1042 /// \param start Offset in input stream where the string starts
1043 /// \param stop Offset in the input stream where the string ends.
1044 ///
1045 static pANTLR3_STRING
antlr3UTF16Substr(pANTLR3_INPUT_STREAM input,ANTLR3_MARKER start,ANTLR3_MARKER stop)1046 antlr3UTF16Substr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
1047 {
1048 return input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/2) + 1);
1049 }
1050
1051 /// \brief Consume the next character in a UTF16 input stream when the input is Little Endian and the machine is not
1052 /// Note that the UTF16 routines do not do any substantial verification of the input stream as for performance
1053 /// sake, we assume it is validly encoded. So if a low surrogate is found at the curent input position then we
1054 /// just consume it. Surrogate pairs should be seen as Hi, Lo. So if we have a Lo first, then the input stream
1055 /// is fubar but we just ignore that.
1056 ///
1057 /// \param input Input stream context pointer
1058 ///
1059 static void
antlr3UTF16ConsumeLE(pANTLR3_INT_STREAM is)1060 antlr3UTF16ConsumeLE(pANTLR3_INT_STREAM is)
1061 {
1062 pANTLR3_INPUT_STREAM input;
1063 UTF32 ch;
1064 UTF32 ch2;
1065
1066 input = ((pANTLR3_INPUT_STREAM) (is->super));
1067
1068 // Buffer size is always in bytes
1069 //
1070 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1071 {
1072 // Indicate one more character in this line
1073 //
1074 input->charPositionInLine++;
1075
1076 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
1077 {
1078 // Reset for start of a new line of input
1079 //
1080 input->line++;
1081 input->charPositionInLine = 0;
1082 input->currentLine = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1083 }
1084
1085 // Increment to next character position, accounting for any surrogates
1086 //
1087 // Next char in litle endian form
1088 //
1089 ch = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8);
1090
1091 // We consumed one 16 bit character
1092 //
1093 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1094
1095 // If we have a surrogate pair then we need to consume
1096 // a following valid LO surrogate.
1097 //
1098 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
1099
1100 // If the 16 bits following the high surrogate are in the source buffer...
1101 //
1102 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1103 {
1104 ch2 = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8);
1105
1106 // If it's a valid low surrogate, consume it
1107 //
1108 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1109 {
1110 // We consumed one 16 bit character
1111 //
1112 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1113 }
1114 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1115 // it.
1116 //
1117 }
1118 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1119 // it because the buffer ended
1120 //
1121 }
1122 // Note that we did not check for an invalid low surrogate here, or that fact that the
1123 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1124 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1125 //
1126 }
1127 }
1128
1129 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
1130 ///
1131 /// \param[in] input Input stream context pointer
1132 /// \param[in] la 1 based offset of next input stream element
1133 ///
1134 /// \return Next input character in internal ANTLR3 encoding (UTF32)
1135 ///
1136 static ANTLR3_UCHAR
antlr3UTF16LALE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1137 antlr3UTF16LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1138 {
1139 pANTLR3_INPUT_STREAM input;
1140 UTF32 ch;
1141 UTF32 ch2;
1142 pANTLR3_UCHAR nextChar;
1143
1144 // Find the input interface and where we are currently pointing to
1145 // in the input stream
1146 //
1147 input = ((pANTLR3_INPUT_STREAM) (is->super));
1148 nextChar = input->nextChar;
1149
1150 // If a positive offset then advance forward, else retreat
1151 //
1152 if (la >= 0)
1153 {
1154 while (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
1155 {
1156 // Advance our copy of the input pointer
1157 //
1158 // Next char in Little Endian byte order
1159 //
1160 ch = (*nextChar) + (*(nextChar+1) << 8);
1161 nextChar += 2;
1162
1163 // If we have a surrogate pair then we need to consume
1164 // a following valid LO surrogate.
1165 //
1166 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1167 {
1168 // If the 16 bits following the high surrogate are in the source buffer...
1169 //
1170 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1171 {
1172 // Next character is in little endian byte order
1173 //
1174 ch2 = (*nextChar) + (*(nextChar+1) << 8);
1175
1176 // If it's a valid low surrogate, consume it
1177 //
1178 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1179 {
1180 // We consumed one 16 bit character
1181 //
1182 nextChar += 2;
1183 }
1184 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1185 // it.
1186 //
1187 }
1188 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1189 // it because the buffer ended
1190 //
1191 }
1192 // Note that we did not check for an invalid low surrogate here, or that fact that the
1193 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1194 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1195 //
1196 }
1197 }
1198 else
1199 {
1200 // We need to go backwards from our input point
1201 //
1202 while (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
1203 {
1204 // Get the previous 16 bit character
1205 //
1206 ch = (*nextChar - 2) + ((*nextChar -1) << 8);
1207 nextChar -= 2;
1208
1209 // If we found a low surrogate then go back one more character if
1210 // the hi surrogate is there
1211 //
1212 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
1213 {
1214 ch2 = (*nextChar - 2) + ((*nextChar -1) << 8);
1215 if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
1216 {
1217 // Yes, there is a high surrogate to match it so decrement one more and point to that
1218 //
1219 nextChar -=2;
1220 }
1221 }
1222 }
1223 }
1224
1225 // Our local copy of nextChar is now pointing to either the correct character or end of file
1226 //
1227 // Input buffer size is always in bytes
1228 //
1229 if ( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1230 {
1231 return ANTLR3_CHARSTREAM_EOF;
1232 }
1233 else
1234 {
1235 // Pick up the next 16 character (little endian byte order)
1236 //
1237 ch = (*nextChar) + (*(nextChar+1) << 8);
1238 nextChar += 2;
1239
1240 // If we have a surrogate pair then we need to consume
1241 // a following valid LO surrogate.
1242 //
1243 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1244 {
1245 // If the 16 bits following the high surrogate are in the source buffer...
1246 //
1247 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1248 {
1249 // Next character is in little endian byte order
1250 //
1251 ch2 = (*nextChar) + (*(nextChar+1) << 8);
1252
1253 // If it's a valid low surrogate, consume it
1254 //
1255 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1256 {
1257 // Construct the UTF32 code point
1258 //
1259 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
1260 + (ch2 - UNI_SUR_LOW_START) + halfBase;
1261 }
1262 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1263 // it.
1264 //
1265 }
1266 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1267 // it because the buffer ended
1268 //
1269 }
1270 }
1271 return ch;
1272 }
1273
1274 /// \brief Consume the next character in a UTF16 input stream when the input is Big Endian and the machine is not
1275 ///
1276 /// \param input Input stream context pointer
1277 ///
1278 static void
antlr3UTF16ConsumeBE(pANTLR3_INT_STREAM is)1279 antlr3UTF16ConsumeBE(pANTLR3_INT_STREAM is)
1280 {
1281 pANTLR3_INPUT_STREAM input;
1282 UTF32 ch;
1283 UTF32 ch2;
1284
1285 input = ((pANTLR3_INPUT_STREAM) (is->super));
1286
1287 // Buffer size is always in bytes
1288 //
1289 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1290 {
1291 // Indicate one more character in this line
1292 //
1293 input->charPositionInLine++;
1294
1295 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
1296 {
1297 // Reset for start of a new line of input
1298 //
1299 input->line++;
1300 input->charPositionInLine = 0;
1301 input->currentLine = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1302 }
1303
1304 // Increment to next character position, accounting for any surrogates
1305 //
1306 // Next char in big endian form
1307 //
1308 ch = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8);
1309
1310 // We consumed one 16 bit character
1311 //
1312 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1313
1314 // If we have a surrogate pair then we need to consume
1315 // a following valid LO surrogate.
1316 //
1317 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
1318
1319 // If the 16 bits following the high surrogate are in the source buffer...
1320 //
1321 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1322 {
1323 // Big endian
1324 //
1325 ch2 = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8);
1326
1327 // If it's a valid low surrogate, consume it
1328 //
1329 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1330 {
1331 // We consumed one 16 bit character
1332 //
1333 input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1334 }
1335 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1336 // it.
1337 //
1338 }
1339 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1340 // it because the buffer ended
1341 //
1342 }
1343 // Note that we did not check for an invalid low surrogate here, or that fact that the
1344 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1345 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1346 //
1347 }
1348 }
1349
1350 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
1351 ///
1352 /// \param[in] input Input stream context pointer
1353 /// \param[in] la 1 based offset of next input stream element
1354 ///
1355 /// \return Next input character in internal ANTLR3 encoding (UTF32)
1356 ///
1357 static ANTLR3_UCHAR
antlr3UTF16LABE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1358 antlr3UTF16LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1359 {
1360 pANTLR3_INPUT_STREAM input;
1361 UTF32 ch;
1362 UTF32 ch2;
1363 pANTLR3_UCHAR nextChar;
1364
1365 // Find the input interface and where we are currently pointing to
1366 // in the input stream
1367 //
1368 input = ((pANTLR3_INPUT_STREAM) (is->super));
1369 nextChar = input->nextChar;
1370
1371 // If a positive offset then advance forward, else retreat
1372 //
1373 if (la >= 0)
1374 {
1375 while (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
1376 {
1377 // Advance our copy of the input pointer
1378 //
1379 // Next char in Big Endian byte order
1380 //
1381 ch = ((*nextChar) << 8) + *(nextChar+1);
1382 nextChar += 2;
1383
1384 // If we have a surrogate pair then we need to consume
1385 // a following valid LO surrogate.
1386 //
1387 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1388 {
1389 // If the 16 bits following the high surrogate are in the source buffer...
1390 //
1391 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1392 {
1393 // Next character is in big endian byte order
1394 //
1395 ch2 = ((*nextChar) << 8) + *(nextChar+1);
1396
1397 // If it's a valid low surrogate, consume it
1398 //
1399 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1400 {
1401 // We consumed one 16 bit character
1402 //
1403 nextChar += 2;
1404 }
1405 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1406 // it.
1407 //
1408 }
1409 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1410 // it because the buffer ended
1411 //
1412 }
1413 // Note that we did not check for an invalid low surrogate here, or that fact that the
1414 // lo surrogate was missing. We just picked out one 16 bit character unless the character
1415 // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1416 //
1417 }
1418 }
1419 else
1420 {
1421 // We need to go backwards from our input point
1422 //
1423 while (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
1424 {
1425 // Get the previous 16 bit character
1426 //
1427 ch = ((*nextChar - 2) << 8) + (*nextChar -1);
1428 nextChar -= 2;
1429
1430 // If we found a low surrogate then go back one more character if
1431 // the hi surrogate is there
1432 //
1433 if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
1434 {
1435 ch2 = ((*nextChar - 2) << 8) + (*nextChar -1);
1436 if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
1437 {
1438 // Yes, there is a high surrogate to match it so decrement one more and point to that
1439 //
1440 nextChar -=2;
1441 }
1442 }
1443 }
1444 }
1445
1446 // Our local copy of nextChar is now pointing to either the correct character or end of file
1447 //
1448 // Input buffer size is always in bytes
1449 //
1450 if ( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1451 {
1452 return ANTLR3_CHARSTREAM_EOF;
1453 }
1454 else
1455 {
1456 // Pick up the next 16 character (big endian byte order)
1457 //
1458 ch = ((*nextChar) << 8) + *(nextChar+1);
1459 nextChar += 2;
1460
1461 // If we have a surrogate pair then we need to consume
1462 // a following valid LO surrogate.
1463 //
1464 if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1465 {
1466 // If the 16 bits following the high surrogate are in the source buffer...
1467 //
1468 if ((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1469 {
1470 // Next character is in big endian byte order
1471 //
1472 ch2 = ((*nextChar) << 8) + *(nextChar+1);
1473
1474 // If it's a valid low surrogate, consume it
1475 //
1476 if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1477 {
1478 // Construct the UTF32 code point
1479 //
1480 ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
1481 + (ch2 - UNI_SUR_LOW_START) + halfBase;
1482 }
1483 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1484 // it.
1485 //
1486 }
1487 // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1488 // it because the buffer ended
1489 //
1490 }
1491 }
1492 return ch;
1493 }
1494
1495 /// \brief Common function to setup function interface for a UTF3 input stream.
1496 ///
1497 /// \param input Input stream context pointer
1498 ///
1499 void
antlr3UTF32SetupStream(pANTLR3_INPUT_STREAM input,ANTLR3_BOOLEAN machineBigEndian,ANTLR3_BOOLEAN inputBigEndian)1500 antlr3UTF32SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian)
1501 {
1502 // Build a string factory for this stream. This is a UTF32 string factory which is a standard
1503 // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
1504 // and so on.
1505 //
1506 input->strFactory = antlr3StringFactoryNew(input->encoding);
1507
1508 // Generic API that does not care about endianess.
1509 //
1510 input->istream->index = antlr3UTF32Index; // Calculate current index in input stream, UTF16 based
1511 input->substr = antlr3UTF32Substr; // Return a string from the input stream
1512 input->istream->seek = antlr3UTF32Seek; // How to seek to a specific point in the stream
1513 input->istream->consume = antlr3UTF32Consume; // Consume the next UTF32 character in the buffer
1514
1515 // We must install different UTF32 LA routines according to whether the input
1516 // is the same endianess as the machine we are executing upon or not. If it is not
1517 // then we must install methods that can convert the endianess on the fly as they go
1518 //
1519 switch (machineBigEndian)
1520 {
1521 case ANTLR3_TRUE:
1522
1523 // Machine is Big Endian, if the input is also then install the
1524 // methods that do not access input by bytes and reverse them.
1525 // Otherwise install endian aware methods.
1526 //
1527 if (inputBigEndian == ANTLR3_TRUE)
1528 {
1529 // Input is machine compatible
1530 //
1531 input->istream->_LA = antlr3UTF32LA; // Return the UTF32 character at offset n (1 based)
1532 }
1533 else
1534 {
1535 // Need to use methods that know that the input is little endian
1536 //
1537 input->istream->_LA = antlr3UTF32LALE; // Return the UTF32 character at offset n (1 based)
1538 }
1539 break;
1540
1541 case ANTLR3_FALSE:
1542
1543 // Machine is Little Endian, if the input is also then install the
1544 // methods that do not access input by bytes and reverse them.
1545 // Otherwise install endian aware methods.
1546 //
1547 if (inputBigEndian == ANTLR3_FALSE)
1548 {
1549 // Input is machine compatible
1550 //
1551 input->istream->_LA = antlr3UTF32LA; // Return the UTF32 character at offset n (1 based)
1552 }
1553 else
1554 {
1555 // Need to use methods that know that the input is Big Endian
1556 //
1557 input->istream->_LA = antlr3UTF32LABE; // Return the UTF32 character at offset n (1 based)
1558 }
1559 break;
1560 }
1561
1562 input->charByteSize = 4; // Size in bytes of characters in this stream.
1563 }
1564
1565 /** \brief Consume the next character in a UTF32 input stream
1566 *
1567 * \param input Input stream context pointer
1568 */
1569 static void
antlr3UTF32Consume(pANTLR3_INT_STREAM is)1570 antlr3UTF32Consume(pANTLR3_INT_STREAM is)
1571 {
1572 pANTLR3_INPUT_STREAM input;
1573
1574 input = ((pANTLR3_INPUT_STREAM) (is->super));
1575
1576 // SizeBuf is always in bytes
1577 //
1578 if ((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1579 {
1580 /* Indicate one more character in this line
1581 */
1582 input->charPositionInLine++;
1583
1584 if ((ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar)) == input->newlineChar)
1585 {
1586 /* Reset for start of a new line of input
1587 */
1588 input->line++;
1589 input->charPositionInLine = 0;
1590 input->currentLine = (void *)(((pANTLR3_UINT32)input->nextChar) + 1);
1591 }
1592
1593 /* Increment to next character position
1594 */
1595 input->nextChar = (void *)(((pANTLR3_UINT32)input->nextChar) + 1);
1596 }
1597 }
1598
1599 /// \brief Calculate the current index in the output stream.
1600 /// \param[in] input Input stream context pointer
1601 ///
1602 static ANTLR3_MARKER
antlr3UTF32Index(pANTLR3_INT_STREAM is)1603 antlr3UTF32Index(pANTLR3_INT_STREAM is)
1604 {
1605 pANTLR3_INPUT_STREAM input;
1606
1607 input = ((pANTLR3_INPUT_STREAM) (is->super));
1608
1609 return (ANTLR3_MARKER)(input->nextChar);
1610 }
1611
1612 /// \brief Return a substring of the UTF16 input stream in
1613 /// newly allocated memory.
1614 ///
1615 /// \param input Input stream context pointer
1616 /// \param start Offset in input stream where the string starts
1617 /// \param stop Offset in the input stream where the string ends.
1618 ///
1619 static pANTLR3_STRING
antlr3UTF32Substr(pANTLR3_INPUT_STREAM input,ANTLR3_MARKER start,ANTLR3_MARKER stop)1620 antlr3UTF32Substr (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
1621 {
1622 return input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/4) + 1);
1623 }
1624
1625 /// \brief Rewind the lexer input to the state specified by the supplied mark.
1626 ///
1627 /// \param[in] input Input stream context pointer
1628 ///
1629 /// \remark
1630 /// Assumes UTF32 input stream.
1631 ///
1632 static void
antlr3UTF32Seek(pANTLR3_INT_STREAM is,ANTLR3_MARKER seekPoint)1633 antlr3UTF32Seek (pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
1634 {
1635 pANTLR3_INPUT_STREAM input;
1636
1637 input = ((pANTLR3_INPUT_STREAM) is->super);
1638
1639 // If the requested seek point is less than the current
1640 // input point, then we assume that we are resetting from a mark
1641 // and do not need to scan, but can just set to there as rewind will
1642 // reset line numbers and so on.
1643 //
1644 if (seekPoint <= (ANTLR3_MARKER)(input->nextChar))
1645 {
1646 input->nextChar = (void *)seekPoint;
1647 }
1648 else
1649 {
1650 // Call consume until we reach the asked for seek point or EOF
1651 //
1652 while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar)
1653 {
1654 is->consume(is);
1655 }
1656 }
1657 }
1658
1659 /** \brief Return the input element assuming a UTF32 input in natural machine byte order
1660 *
1661 * \param[in] input Input stream context pointer
1662 * \param[in] la 1 based offset of next input stream element
1663 *
1664 * \return Next input character in internal ANTLR3 encoding (UTF32)
1665 */
1666 static ANTLR3_UCHAR
antlr3UTF32LA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1667 antlr3UTF32LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1668 {
1669 pANTLR3_INPUT_STREAM input;
1670
1671 input = ((pANTLR3_INPUT_STREAM) (is->super));
1672
1673 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1674 {
1675 return ANTLR3_CHARSTREAM_EOF;
1676 }
1677 else
1678 {
1679 return (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1680 }
1681 }
1682
1683 /** \brief Return the input element assuming a UTF32 input in little endian byte order
1684 *
1685 * \param[in] input Input stream context pointer
1686 * \param[in] la 1 based offset of next input stream element
1687 *
1688 * \return Next input character in internal ANTLR3 encoding (UTF32)
1689 */
1690 static ANTLR3_UCHAR
antlr3UTF32LALE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1691 antlr3UTF32LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1692 {
1693 pANTLR3_INPUT_STREAM input;
1694
1695 input = ((pANTLR3_INPUT_STREAM) (is->super));
1696
1697 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1698 {
1699 return ANTLR3_CHARSTREAM_EOF;
1700 }
1701 else
1702 {
1703 ANTLR3_UCHAR c;
1704
1705 c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1706
1707 // Swap Endianess to Big Endian
1708 //
1709 return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24);
1710 }
1711 }
1712
1713 /** \brief Return the input element assuming a UTF32 input in big endian byte order
1714 *
1715 * \param[in] input Input stream context pointer
1716 * \param[in] la 1 based offset of next input stream element
1717 *
1718 * \return Next input character in internal ANTLR3 encoding (UTF32)
1719 * \remark This is the same code as LE version but seprated in case there are better optimisations fo rendinan swap
1720 */
1721 static ANTLR3_UCHAR
antlr3UTF32LABE(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1722 antlr3UTF32LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1723 {
1724 pANTLR3_INPUT_STREAM input;
1725
1726 input = ((pANTLR3_INPUT_STREAM) (is->super));
1727
1728 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1729 {
1730 return ANTLR3_CHARSTREAM_EOF;
1731 }
1732 else
1733 {
1734 ANTLR3_UCHAR c;
1735
1736 c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1737
1738 // Swap Endianess to Little Endian
1739 //
1740 return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24);
1741 }
1742 }
1743
1744
1745 /// \brief Common function to setup function interface for a UTF8 input stream.
1746 ///
1747 /// \param input Input stream context pointer
1748 ///
1749 void
antlr3UTF8SetupStream(pANTLR3_INPUT_STREAM input)1750 antlr3UTF8SetupStream (pANTLR3_INPUT_STREAM input)
1751 {
1752 // Build a string factory for this stream. This is a UTF16 string factory which is a standard
1753 // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
1754 // and so on.
1755 //
1756 input->strFactory = antlr3StringFactoryNew(input->encoding);
1757
1758 // Generic API that does not care about endianess.
1759 //
1760 input->istream->consume = antlr3UTF8Consume; // Consume the next UTF32 character in the buffer
1761 input->istream->_LA = antlr3UTF8LA; // Return the UTF32 character at offset n (1 based)
1762 input->charByteSize = 0; // Size in bytes of characters in this stream.
1763 }
1764
1765 // ------------------------------------------------------
1766 // Following is from Unicode.org (see antlr3convertutf.c)
1767 //
1768
1769 /// Index into the table below with the first byte of a UTF-8 sequence to
1770 /// get the number of trailing bytes that are supposed to follow it.
1771 /// Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
1772 /// left as-is for anyone who may want to do such conversion, which was
1773 /// allowed in earlier algorithms.
1774 ///
1775 static const ANTLR3_UINT32 trailingBytesForUTF8[256] = {
1776 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1777 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1778 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1779 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1780 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1781 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1782 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1783 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
1784 };
1785
1786 /// Magic values subtracted from a buffer value during UTF8 conversion.
1787 /// This table contains as many values as there might be trailing bytes
1788 /// in a UTF-8 sequence.
1789 ///
1790 static const UTF32 offsetsFromUTF8[6] =
1791 { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
1792 0x03C82080UL, 0xFA082080UL, 0x82082080UL
1793 };
1794
1795 // End of Unicode.org tables
1796 // -------------------------
1797
1798
1799 /** \brief Consume the next character in a UTF8 input stream
1800 *
1801 * \param input Input stream context pointer
1802 */
1803 static void
antlr3UTF8Consume(pANTLR3_INT_STREAM is)1804 antlr3UTF8Consume(pANTLR3_INT_STREAM is)
1805 {
1806 pANTLR3_INPUT_STREAM input;
1807 ANTLR3_UINT32 extraBytesToRead;
1808 ANTLR3_UCHAR ch;
1809 pANTLR3_UINT8 nextChar;
1810
1811 input = ((pANTLR3_INPUT_STREAM) (is->super));
1812
1813 nextChar = input->nextChar;
1814
1815 if (nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1816 {
1817 // Indicate one more character in this line
1818 //
1819 input->charPositionInLine++;
1820
1821 // Are there more bytes needed to make up the whole thing?
1822 //
1823 extraBytesToRead = trailingBytesForUTF8[*nextChar];
1824
1825 if (nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1826 {
1827 input->nextChar = (((pANTLR3_UINT8)input->data) + input->sizeBuf);
1828 return;
1829 }
1830
1831 // Cases deliberately fall through (see note A in antlrconvertutf.c)
1832 // Legal UTF8 is only 4 bytes but 6 bytes could be used in old UTF8 so
1833 // we allow it.
1834 //
1835 ch = 0;
1836 switch (extraBytesToRead) {
1837 case 5: ch += *nextChar++; ch <<= 6;
1838 case 4: ch += *nextChar++; ch <<= 6;
1839 case 3: ch += *nextChar++; ch <<= 6;
1840 case 2: ch += *nextChar++; ch <<= 6;
1841 case 1: ch += *nextChar++; ch <<= 6;
1842 case 0: ch += *nextChar++;
1843 }
1844
1845 // Magically correct the input value
1846 //
1847 ch -= offsetsFromUTF8[extraBytesToRead];
1848 if (ch == input->newlineChar)
1849 {
1850 /* Reset for start of a new line of input
1851 */
1852 input->line++;
1853 input->charPositionInLine = 0;
1854 input->currentLine = (void *)nextChar;
1855 }
1856
1857 // Update input pointer
1858 //
1859 input->nextChar = nextChar;
1860 }
1861 }
1862 /** \brief Return the input element assuming a UTF8 input
1863 *
1864 * \param[in] input Input stream context pointer
1865 * \param[in] la 1 based offset of next input stream element
1866 *
1867 * \return Next input character in internal ANTLR3 encoding (UTF32)
1868 */
1869 static ANTLR3_UCHAR
antlr3UTF8LA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)1870 antlr3UTF8LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1871 {
1872 pANTLR3_INPUT_STREAM input;
1873 ANTLR3_UINT32 extraBytesToRead;
1874 ANTLR3_UCHAR ch;
1875 pANTLR3_UINT8 nextChar;
1876
1877 input = ((pANTLR3_INPUT_STREAM) (is->super));
1878
1879 nextChar = input->nextChar;
1880
1881 // Do we need to traverse forwards or backwards?
1882 // - LA(0) is treated as LA(1) and we assume that the nextChar is
1883 // already positioned.
1884 // - LA(n+) ; n>1 means we must traverse forward n-1 characters catering for UTF8 encoding
1885 // - LA(-n) means we must traverse backwards n chracters
1886 //
1887 if (la > 1) {
1888
1889 // Make sure that we have at least one character left before trying to
1890 // loop through the buffer.
1891 //
1892 if (nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1893 {
1894 // Now traverse n-1 characters forward
1895 //
1896 while (--la > 0)
1897 {
1898 // Does the next character require trailing bytes?
1899 // If so advance the pointer by that many bytes as well as advancing
1900 // one position for what will be at least a single byte character.
1901 //
1902 nextChar += trailingBytesForUTF8[*nextChar] + 1;
1903
1904 // Does that calculation take us past the byte length of the buffer?
1905 //
1906 if (nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1907 {
1908 return ANTLR3_CHARSTREAM_EOF;
1909 }
1910 }
1911 }
1912 else
1913 {
1914 return ANTLR3_CHARSTREAM_EOF;
1915 }
1916 }
1917 else
1918 {
1919 // LA is negative so we decrease the pointer by n character positions
1920 //
1921 while (nextChar > (pANTLR3_UINT8)input->data && la++ < 0)
1922 {
1923 // Traversing backwards in UTF8 means decermenting by one
1924 // then continuing to decrement while ever a character pattern
1925 // is flagged as being a trailing byte of an encoded code point.
1926 // Trailing UTF8 bytes always start with 10 in binary. We assumne that
1927 // the UTF8 is well formed and do not check boundary conditions
1928 //
1929 nextChar--;
1930 while ((*nextChar & 0xC0) == 0x80)
1931 {
1932 nextChar--;
1933 }
1934 }
1935 }
1936
1937 // nextChar is now pointing at the UTF8 encoded character that we need to
1938 // decode and return.
1939 //
1940 // Are there more bytes needed to make up the whole thing?
1941 //
1942 extraBytesToRead = trailingBytesForUTF8[*nextChar];
1943 if (nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1944 {
1945 return ANTLR3_CHARSTREAM_EOF;
1946 }
1947
1948 // Cases deliberately fall through (see note A in antlrconvertutf.c)
1949 //
1950 ch = 0;
1951 switch (extraBytesToRead) {
1952 case 5: ch += *nextChar++; ch <<= 6;
1953 case 4: ch += *nextChar++; ch <<= 6;
1954 case 3: ch += *nextChar++; ch <<= 6;
1955 case 2: ch += *nextChar++; ch <<= 6;
1956 case 1: ch += *nextChar++; ch <<= 6;
1957 case 0: ch += *nextChar++;
1958 }
1959
1960 // Magically correct the input value
1961 //
1962 ch -= offsetsFromUTF8[extraBytesToRead];
1963
1964 return ch;
1965 }
1966
1967 // EBCDIC to ASCII conversion table
1968 //
1969 // This for EBCDIC EDF04 translated to ISO-8859.1 which is the usually accepted POSIX
1970 // translation and the character tables are published all over the interweb.
1971 //
1972 const ANTLR3_UCHAR e2a[256] =
1973 {
1974 0x00, 0x01, 0x02, 0x03, 0x85, 0x09, 0x86, 0x7f,
1975 0x87, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1976 0x10, 0x11, 0x12, 0x13, 0x8f, 0x0a, 0x08, 0x97,
1977 0x18, 0x19, 0x9c, 0x9d, 0x1c, 0x1d, 0x1e, 0x1f,
1978 0x80, 0x81, 0x82, 0x83, 0x84, 0x92, 0x17, 0x1b,
1979 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07,
1980 0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04,
1981 0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a,
1982 0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5,
1983 0xe7, 0xf1, 0x60, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
1984 0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef,
1985 0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x9f,
1986 0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5,
1987 0xc7, 0xd1, 0x5e, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
1988 0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf,
1989 0xcc, 0xa8, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
1990 0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
1991 0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1,
1992 0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70,
1993 0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4,
1994 0xb5, 0xaf, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
1995 0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0xdd, 0xde, 0xae,
1996 0xa2, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc,
1997 0xbd, 0xbe, 0xac, 0x5b, 0x5c, 0x5d, 0xb4, 0xd7,
1998 0xf9, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
1999 0x48, 0x49, 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5,
2000 0xa6, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
2001 0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xdb, 0xfa, 0xff,
2002 0xd9, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
2003 0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5,
2004 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
2005 0x38, 0x39, 0xb3, 0x7b, 0xdc, 0x7d, 0xda, 0x7e
2006 };
2007
2008 /// \brief Common function to setup function interface for a EBCDIC input stream.
2009 ///
2010 /// \param input Input stream context pointer
2011 ///
2012 void
antlr3EBCDICSetupStream(pANTLR3_INPUT_STREAM input)2013 antlr3EBCDICSetupStream (pANTLR3_INPUT_STREAM input)
2014 {
2015 // EBCDIC streams can use the standard 8 bit string factory
2016 //
2017 input->strFactory = antlr3StringFactoryNew(input->encoding);
2018
2019 // Generic API that does not care about endianess.
2020 //
2021 input->istream->_LA = antlr3EBCDICLA; // Return the UTF32 character at offset n (1 based)
2022 input->charByteSize = 1; // Size in bytes of characters in this stream.
2023 }
2024
2025 /// \brief Return the input element assuming an 8 bit EBCDIC input
2026 ///
2027 /// \param[in] input Input stream context pointer
2028 /// \param[in] la 1 based offset of next input stream element
2029 ///
2030 /// \return Next input character in internal ANTLR3 encoding (UTF32) after translation
2031 /// from EBCDIC to ASCII
2032 ///
2033 static ANTLR3_UCHAR
antlr3EBCDICLA(pANTLR3_INT_STREAM is,ANTLR3_INT32 la)2034 antlr3EBCDICLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
2035 {
2036 pANTLR3_INPUT_STREAM input;
2037
2038 input = ((pANTLR3_INPUT_STREAM) (is->super));
2039
2040 if (( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
2041 {
2042 return ANTLR3_CHARSTREAM_EOF;
2043 }
2044 else
2045 {
2046 // Translate the required character via the constant conversion table
2047 //
2048 return e2a[(*((pANTLR3_UINT8)input->nextChar + la - 1))];
2049 }
2050 }