/** \file
 * \brief The ANTLR3 C filestream is used when the source character stream
 * is a filesystem based input set and all the characters in the filestream
 * can be loaded into memory at once, after which the lexer can get to work.
 *
 * A number of initializers are provided in order that various character
 * sets can be supported from input files. The ANTLR3 C runtime expects
 * to deal with UTF32 characters only (the reasons for this are to
 * do with the simplification of C code when using this form of Unicode
 * encoding, though this is not a panacea). More information on this can be
 * found by consulting:
 *   - http://www.unicode.org/versions/Unicode4.0.0/ch02.pdf#G11178
 * where a well grounded discussion of the available encoding formats
 * may be found.
 *
 */
17
18 // [The "BSD licence"]
19 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
20 // http://www.temporal-wave.com
21 // http://www.linkedin.com/in/jimidle
22 //
23 // All rights reserved.
24 //
25 // Redistribution and use in source and binary forms, with or without
26 // modification, are permitted provided that the following conditions
27 // are met:
28 // 1. Redistributions of source code must retain the above copyright
29 // notice, this list of conditions and the following disclaimer.
30 // 2. Redistributions in binary form must reproduce the above copyright
31 // notice, this list of conditions and the following disclaimer in the
32 // documentation and/or other materials provided with the distribution.
33 // 3. The name of the author may not be used to endorse or promote products
34 // derived from this software without specific prior written permission.
35 //
36 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
37 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
39 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
40 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
42 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
43 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
44 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
45 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46
47 #include <antlr3.h>
48
49 static void setupInputStream (pANTLR3_INPUT_STREAM input);
50 static pANTLR3_INPUT_STREAM antlr3CreateFileStream (pANTLR3_UINT8 fileName);
51 static pANTLR3_INPUT_STREAM antlr3CreateStringStream (pANTLR3_UINT8 data);
52
53 ANTLR3_API pANTLR3_INPUT_STREAM
antlr3FileStreamNew(pANTLR3_UINT8 fileName,ANTLR3_UINT32 encoding)54 antlr3FileStreamNew(pANTLR3_UINT8 fileName, ANTLR3_UINT32 encoding)
55 {
56 pANTLR3_INPUT_STREAM input;
57
58 // First order of business is to read the file into some buffer space
59 // as just straight 8 bit bytes. Then we will work out the encoding and
60 // byte order and adjust the API functions that are installed for the
61 // default 8Bit stream accordingly.
62 //
63 input = antlr3CreateFileStream(fileName);
64 if (input == NULL)
65 {
66 return NULL;
67 }
68
69 // We have the data in memory now so we can deal with it according to
70 // the encoding scheme we were given by the user.
71 //
72 input->encoding = encoding;
73
74 // Now we need to work out the endian type and install any
75 // API functions that differ from 8Bit
76 //
77 setupInputStream(input);
78
79 // Now we can set up the file name
80 //
81 input->istream->streamName = input->strFactory->newStr8(input->strFactory, fileName);
82 input->fileName = input->istream->streamName;
83
84 return input;
85 }
86
87
88 ANTLR3_API pANTLR3_INPUT_STREAM
antlr3StringStreamNew(pANTLR3_UINT8 data,ANTLR3_UINT32 encoding,ANTLR3_UINT32 size,pANTLR3_UINT8 name)89 antlr3StringStreamNew(pANTLR3_UINT8 data, ANTLR3_UINT32 encoding, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
90 {
91 pANTLR3_INPUT_STREAM input;
92
93 // First order of business is to set up the stream and install the data pointer.
94 // Then we will work out the encoding and byte order and adjust the API functions that are installed for the
95 // default 8Bit stream accordingly.
96 //
97 input = antlr3CreateStringStream(data);
98 if (input == NULL)
99 {
100 return NULL;
101 }
102
103 // Size (in bytes) of the given 'string'
104 //
105 input->sizeBuf = size;
106
107 // We have the data in memory now so we can deal with it according to
108 // the encoding scheme we were given by the user.
109 //
110 input->encoding = encoding;
111
112 // Now we need to work out the endian type and install any
113 // API functions that differ from 8Bit
114 //
115 setupInputStream(input);
116
117 // Now we can set up the file name
118 //
119 input->istream->streamName = input->strFactory->newStr8(input->strFactory, name);
120 input->fileName = input->istream->streamName;
121
122 return input;
123 }
124
125
126 /// Determine endianess of the input stream and install the
127 /// API required for the encoding in that format.
128 ///
129 static void
setupInputStream(pANTLR3_INPUT_STREAM input)130 setupInputStream(pANTLR3_INPUT_STREAM input)
131 {
132 ANTLR3_BOOLEAN isBigEndian;
133
134 // Used to determine the endianness of the machine we are currently
135 // running on.
136 //
137 ANTLR3_UINT16 bomTest = 0xFEFF;
138
139 // What endianess is the machine we are running on? If the incoming
140 // encoding endianess is the same as this machine's natural byte order
141 // then we can use more efficient API calls.
142 //
143 if (*((pANTLR3_UINT8)(&bomTest)) == 0xFE)
144 {
145 isBigEndian = ANTLR3_TRUE;
146 }
147 else
148 {
149 isBigEndian = ANTLR3_FALSE;
150 }
151
152 // What encoding did the user tell us {s}he thought it was? I am going
153 // to get sick of the questions on antlr-interest, I know I am.
154 //
155 switch (input->encoding)
156 {
157 case ANTLR3_ENC_UTF8:
158
159 // See if there is a BOM at the start of this UTF-8 sequence
160 // and just eat it if there is. Windows .TXT files have this for instance
161 // as it identifies UTF-8 even though it is of no consequence for byte order
162 // as UTF-8 does not have a byte order.
163 //
164 if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xEF
165 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xBB
166 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2)) == 0xBF
167 )
168 {
169 // The UTF8 BOM is present so skip it
170 //
171 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 3);
172 }
173
174 // Install the UTF8 input routines
175 //
176 antlr3UTF8SetupStream(input);
177 break;
178
179 case ANTLR3_ENC_UTF16:
180
181 // See if there is a BOM at the start of the input. If not then
182 // we assume that the byte order is the natural order of this
183 // machine (or it is really UCS2). If there is a BOM we determine if the encoding
184 // is the same as the natural order of this machine.
185 //
186 if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFE
187 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFF
188 )
189 {
190 // BOM Present, indicates Big Endian
191 //
192 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);
193
194 antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
195 }
196 else if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFF
197 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFE
198 )
199 {
200 // BOM present, indicates Little Endian
201 //
202 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);
203
204 antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
205 }
206 else
207 {
208 // No BOM present, assume local computer byte order
209 //
210 antlr3UTF16SetupStream(input, isBigEndian, isBigEndian);
211 }
212 break;
213
214 case ANTLR3_ENC_UTF32:
215
216 // See if there is a BOM at the start of the input. If not then
217 // we assume that the byte order is the natural order of this
218 // machine. If there is we determine if the encoding
219 // is the same as the natural order of this machine.
220 //
221 if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0x00
222 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00
223 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2)) == 0xFE
224 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+3)) == 0xFF
225 )
226 {
227 // BOM Present, indicates Big Endian
228 //
229 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);
230
231 antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
232 }
233 else if ( (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar)) == 0xFF
234 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0xFE
235 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00
236 && (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1)) == 0x00
237 )
238 {
239 // BOM present, indicates Little Endian
240 //
241 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);
242
243 antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
244 }
245 else
246 {
247 // No BOM present, assume local computer byte order
248 //
249 antlr3UTF32SetupStream(input, isBigEndian, isBigEndian);
250 }
251 break;
252
253 case ANTLR3_ENC_UTF16BE:
254
255 // Encoding is definately Big Endian with no BOM
256 //
257 antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
258 break;
259
260 case ANTLR3_ENC_UTF16LE:
261
262 // Encoding is definately Little Endian with no BOM
263 //
264 antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
265 break;
266
267 case ANTLR3_ENC_UTF32BE:
268
269 // Encoding is definately Big Endian with no BOM
270 //
271 antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
272 break;
273
274 case ANTLR3_ENC_UTF32LE:
275
276 // Encoding is definately Little Endian with no BOM
277 //
278 antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
279 break;
280
281 case ANTLR3_ENC_EBCDIC:
282
283 // EBCDIC is basically the same as ASCII but with an on the
284 // fly translation to ASCII
285 //
286 antlr3EBCDICSetupStream(input);
287 break;
288
289 case ANTLR3_ENC_8BIT:
290 default:
291
292 // Standard 8bit/ASCII
293 //
294 antlr38BitSetupStream(input);
295 break;
296 }
297 }
298
299 /** \brief Use the contents of an operating system file as the input
300 * for an input stream.
301 *
302 * \param fileName Name of operating system file to read.
303 * \return
304 * - Pointer to new input stream context upon success
305 * - One of the ANTLR3_ERR_ defines on error.
306 */
307 static pANTLR3_INPUT_STREAM
antlr3CreateFileStream(pANTLR3_UINT8 fileName)308 antlr3CreateFileStream(pANTLR3_UINT8 fileName)
309 {
310 // Pointer to the input stream we are going to create
311 //
312 pANTLR3_INPUT_STREAM input;
313 ANTLR3_UINT32 status;
314
315 if (fileName == NULL)
316 {
317 return NULL;
318 }
319
320 // Allocate memory for the input stream structure
321 //
322 input = (pANTLR3_INPUT_STREAM)
323 ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));
324
325 if (input == NULL)
326 {
327 return NULL;
328 }
329
330 // Structure was allocated correctly, now we can read the file.
331 //
332 status = antlr3read8Bit(input, fileName);
333
334 // Call the common 8 bit input stream handler
335 // initialization.
336 //
337 antlr3GenericSetupStream(input);
338
339 // However if the file was not there or something then we
340 // need to close. Have to wait until here as we cannot call
341 // close until the API is installed of course.
342 //
343 if (status != ANTLR3_SUCCESS)
344 {
345 input->close(input);
346 return NULL;
347 }
348
349 return input;
350 }
351
352 ANTLR3_API ANTLR3_UINT32
antlr3read8Bit(pANTLR3_INPUT_STREAM input,pANTLR3_UINT8 fileName)353 antlr3read8Bit(pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 fileName)
354 {
355 ANTLR3_FDSC infile;
356 ANTLR3_UINT32 fSize;
357
358 /* Open the OS file in read binary mode
359 */
360 infile = antlr3Fopen(fileName, "rb");
361
362 /* Check that it was there
363 */
364 if (infile == NULL)
365 {
366 return (ANTLR3_UINT32)ANTLR3_ERR_NOFILE;
367 }
368
369 /* It was there, so we can read the bytes now
370 */
371 fSize = antlr3Fsize(fileName); /* Size of input file */
372
373 /* Allocate buffer for this input set
374 */
375 input->data = ANTLR3_MALLOC((size_t)fSize);
376 input->sizeBuf = fSize;
377
378 if (input->data == NULL)
379 {
380 return (ANTLR3_UINT32)ANTLR3_ERR_NOMEM;
381 }
382
383 input->isAllocated = ANTLR3_TRUE;
384
385 /* Now we read the file. Characters are not converted to
386 * the internal ANTLR encoding until they are read from the buffer
387 */
388 antlr3Fread(infile, fSize, input->data);
389
390 /* And close the file handle
391 */
392 antlr3Fclose(infile);
393
394 return ANTLR3_SUCCESS;
395 }
396
397 /** \brief Open an operating system file and return the descriptor
398 * We just use the common open() and related functions here.
399 * Later we might find better ways on systems
400 * such as Windows and OpenVMS for instance. But the idea is to read the
401 * while file at once anyway, so it may be irrelevant.
402 */
403 ANTLR3_API ANTLR3_FDSC
antlr3Fopen(pANTLR3_UINT8 filename,const char * mode)404 antlr3Fopen(pANTLR3_UINT8 filename, const char * mode)
405 {
406 return (ANTLR3_FDSC)fopen((const char *)filename, mode);
407 }
408
409 /** \brief Close an operating system file and free any handles
410 * etc.
411 */
412 ANTLR3_API void
antlr3Fclose(ANTLR3_FDSC fd)413 antlr3Fclose(ANTLR3_FDSC fd)
414 {
415 fclose(fd);
416 }
417 ANTLR3_API ANTLR3_UINT32
antlr3Fsize(pANTLR3_UINT8 fileName)418 antlr3Fsize(pANTLR3_UINT8 fileName)
419 {
420 struct _stat statbuf;
421
422 _stat((const char *)fileName, &statbuf);
423
424 return (ANTLR3_UINT32)statbuf.st_size;
425 }
426
427 ANTLR3_API ANTLR3_UINT32
antlr3Fread(ANTLR3_FDSC fdsc,ANTLR3_UINT32 count,void * data)428 antlr3Fread(ANTLR3_FDSC fdsc, ANTLR3_UINT32 count, void * data)
429 {
430 return (ANTLR3_UINT32)fread(data, (size_t)count, 1, fdsc);
431 }
432
433
434 /** \brief Use the supplied 'string' as input to the stream
435 *
436 * \param data Pointer to the input data
437 * \return
438 * - Pointer to new input stream context upon success
439 * - NULL defines on error.
440 */
441 static pANTLR3_INPUT_STREAM
antlr3CreateStringStream(pANTLR3_UINT8 data)442 antlr3CreateStringStream(pANTLR3_UINT8 data)
443 {
444 // Pointer to the input stream we are going to create
445 //
446 pANTLR3_INPUT_STREAM input;
447
448 if (data == NULL)
449 {
450 return NULL;
451 }
452
453 // Allocate memory for the input stream structure
454 //
455 input = (pANTLR3_INPUT_STREAM)
456 ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));
457
458 if (input == NULL)
459 {
460 return NULL;
461 }
462
463 // Structure was allocated correctly, now we can install the pointer
464 //
465 input->data = data;
466 input->isAllocated = ANTLR3_FALSE;
467
468 // Call the common 8 bit input stream handler
469 // initialization.
470 //
471 antlr3GenericSetupStream(input);
472
473 return input;
474 }