• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package com.fasterxml.jackson.core.json;
2 
3 import java.io.*;
4 
5 import com.fasterxml.jackson.core.*;
6 import com.fasterxml.jackson.core.format.InputAccessor;
7 import com.fasterxml.jackson.core.format.MatchStrength;
8 import com.fasterxml.jackson.core.io.*;
9 import com.fasterxml.jackson.core.sym.ByteQuadsCanonicalizer;
10 import com.fasterxml.jackson.core.sym.CharsToNameCanonicalizer;
11 
12 /**
13  * This class is used to determine the encoding of byte stream
14  * that is to contain JSON content. Rules are fairly simple, and
15  * defined in JSON specification (RFC-4627 or newer), except
16  * for BOM handling, which is a property of underlying
17  * streams.
18  */
19 public final class ByteSourceJsonBootstrapper
20 {
21     public final static byte UTF8_BOM_1 = (byte) 0xEF;
22     public final static byte UTF8_BOM_2 = (byte) 0xBB;
23     public final static byte UTF8_BOM_3 = (byte) 0xBF;
24 
25     /*
26     /**********************************************************
27     /* Configuration
28     /**********************************************************
29      */
30 
31     private final IOContext _context;
32 
33     private final InputStream _in;
34 
35     /*
36     /**********************************************************
37     /* Input buffering
38     /**********************************************************
39      */
40 
41     private final byte[] _inputBuffer;
42 
43     private int _inputPtr;
44 
45     private int _inputEnd;
46 
47     /**
48      * Flag that indicates whether buffer above is to be recycled
49      * after being used or not.
50      */
51     private final boolean _bufferRecyclable;
52 
53     /*
54     /**********************************************************
55     /* Input location
56     /**********************************************************
57      */
58 
59     /**
60      * Current number of input units (bytes or chars) that were processed in
61      * previous blocks,
62      * before contents of current input buffer.
63      *<p>
64      * Note: includes possible BOMs, if those were part of the input.
65      */
66 //    private int _inputProcessed;
67 
68     /*
69     /**********************************************************
70     /* Data gathered
71     /**********************************************************
72      */
73 
74     /**
75      * Whether input has been detected to be in Big-Endian encoding or not.
76      */
77     private boolean _bigEndian = true;
78 
79     private int _bytesPerChar; // 0 means "dunno yet"
80 
81     /*
82     /**********************************************************
83     /* Life-cycle
84     /**********************************************************
85      */
86 
/**
 * Creates a bootstrapper that reads from a stream, buffering through
 * a read buffer allocated from the given context. The buffer starts
 * out empty and is marked as recyclable.
 *
 * @param ctxt context to allocate the read buffer from
 * @param in stream to read content from
 */
public ByteSourceJsonBootstrapper(IOContext ctxt, InputStream in) {
    _context = ctxt;
    _in = in;
    _inputBuffer = ctxt.allocReadIOBuffer();
    // Nothing has been read yet: both pointers start at the beginning
    _inputPtr = 0;
    _inputEnd = 0;
    _bufferRecyclable = true;
}
95 
/**
 * Creates a bootstrapper over a caller-supplied byte array; there is
 * no stream to read more from, and the buffer is marked as not
 * recyclable.
 *
 * @param ctxt context that will receive the detected encoding
 * @param inputBuffer array that contains the content
 * @param inputStart offset of the first content byte in {@code inputBuffer}
 * @param inputLen number of content bytes, starting at {@code inputStart}
 */
public ByteSourceJsonBootstrapper(IOContext ctxt, byte[] inputBuffer, int inputStart, int inputLen) {
    _context = ctxt;
    _in = null;
    _inputBuffer = inputBuffer;
    _inputPtr = inputStart;
    // end is an absolute offset into the array, not a length
    _inputEnd = inputStart + inputLen;
    _bufferRecyclable = false;
}
106 
107     /*
108     /**********************************************************
109     /*  Encoding detection during bootstrapping
110     /**********************************************************
111      */
112 
113     /**
114      * Method that should be called after constructing an instace.
115      * It will figure out encoding that content uses, to allow
116      * for instantiating a proper scanner object.
117      */
detectEncoding()118     public JsonEncoding detectEncoding() throws IOException
119     {
120         boolean foundEncoding = false;
121 
122         // First things first: BOM handling
123         /* Note: we can require 4 bytes to be read, since no
124          * combination of BOM + valid JSON content can have
125          * shorter length (shortest valid JSON content is single
126          * digit char, but BOMs are chosen such that combination
127          * is always at least 4 chars long)
128          */
129         if (ensureLoaded(4)) {
130             int quad =  (_inputBuffer[_inputPtr] << 24)
131                 | ((_inputBuffer[_inputPtr+1] & 0xFF) << 16)
132                 | ((_inputBuffer[_inputPtr+2] & 0xFF) << 8)
133                 | (_inputBuffer[_inputPtr+3] & 0xFF);
134 
135             if (handleBOM(quad)) {
136                 foundEncoding = true;
137             } else {
138                 /* If no BOM, need to auto-detect based on first char;
139                  * this works since it must be 7-bit ascii (wrt. unicode
140                  * compatible encodings, only ones JSON can be transferred
141                  * over)
142                  */
143                 // UTF-32?
144                 if (checkUTF32(quad)) {
145                     foundEncoding = true;
146                 } else if (checkUTF16(quad >>> 16)) {
147                     foundEncoding = true;
148                 }
149             }
150         } else if (ensureLoaded(2)) {
151             int i16 = ((_inputBuffer[_inputPtr] & 0xFF) << 8)
152                 | (_inputBuffer[_inputPtr+1] & 0xFF);
153             if (checkUTF16(i16)) {
154                 foundEncoding = true;
155             }
156         }
157 
158         JsonEncoding enc;
159 
160         /* Not found yet? As per specs, this means it must be UTF-8. */
161         if (!foundEncoding) {
162             enc = JsonEncoding.UTF8;
163         } else {
164             switch (_bytesPerChar) {
165             case 1: enc = JsonEncoding.UTF8;
166                 break;
167             case 2: enc = _bigEndian ? JsonEncoding.UTF16_BE : JsonEncoding.UTF16_LE;
168                 break;
169             case 4: enc = _bigEndian ? JsonEncoding.UTF32_BE : JsonEncoding.UTF32_LE;
170                 break;
171             default: throw new RuntimeException("Internal error"); // should never get here
172             }
173         }
174         _context.setEncoding(enc);
175         return enc;
176     }
177 
178     /**
179      * Helper method that may be called to see if given {@link DataInput}
180      * has BOM marker, and if so, to skip it.
181      * @throws IOException
182      *
183      * @since 2.8
184      */
skipUTF8BOM(DataInput input)185     public static int skipUTF8BOM(DataInput input) throws IOException
186     {
187         int b = input.readUnsignedByte();
188         if (b != 0xEF) {
189             return b;
190         }
191         // since this is not legal byte in JSON otherwise, except
192         // that we do get BOM; if not, report error
193         b = input.readUnsignedByte();
194         if (b != 0xBB) {
195             throw new IOException("Unexpected byte 0x"+Integer.toHexString(b)
196                 +" following 0xEF; should get 0xBB as part of UTF-8 BOM");
197         }
198         b = input.readUnsignedByte();
199         if (b != 0xBF) {
200             throw new IOException("Unexpected byte 0x"+Integer.toHexString(b)
201                 +" following 0xEF 0xBB; should get 0xBF as part of UTF-8 BOM");
202         }
203         return input.readUnsignedByte();
204     }
205 
206     /*
207     /**********************************************************
208     /* Constructing a Reader
209     /**********************************************************
210      */
211 
212     @SuppressWarnings("resource")
constructReader()213     public Reader constructReader() throws IOException
214     {
215         JsonEncoding enc = _context.getEncoding();
216         switch (enc.bits()) {
217         case 8: // only in non-common case where we don't want to do direct mapping
218         case 16:
219             {
220                 // First: do we have a Stream? If not, need to create one:
221                 InputStream in = _in;
222 
223                 if (in == null) {
224                     in = new ByteArrayInputStream(_inputBuffer, _inputPtr, _inputEnd);
225                 } else {
226                     /* Also, if we have any read but unused input (usually true),
227                      * need to merge that input in:
228                      */
229                     if (_inputPtr < _inputEnd) {
230                         in = new MergedStream(_context, in, _inputBuffer, _inputPtr, _inputEnd);
231                     }
232                 }
233                 return new InputStreamReader(in, enc.getJavaName());
234             }
235         case 32:
236             return new UTF32Reader(_context, _in, _inputBuffer, _inputPtr, _inputEnd,
237                     _context.getEncoding().isBigEndian());
238         }
239         throw new RuntimeException("Internal error"); // should never get here
240     }
241 
constructParser(int parserFeatures, ObjectCodec codec, ByteQuadsCanonicalizer rootByteSymbols, CharsToNameCanonicalizer rootCharSymbols, int factoryFeatures)242     public JsonParser constructParser(int parserFeatures, ObjectCodec codec,
243             ByteQuadsCanonicalizer rootByteSymbols, CharsToNameCanonicalizer rootCharSymbols,
244             int factoryFeatures) throws IOException
245     {
246         int prevInputPtr = _inputPtr;
247         JsonEncoding enc = detectEncoding();
248         int bytesProcessed = _inputPtr - prevInputPtr;
249 
250         if (enc == JsonEncoding.UTF8) {
251             /* and without canonicalization, byte-based approach is not performant; just use std UTF-8 reader
252              * (which is ok for larger input; not so hot for smaller; but this is not a common case)
253              */
254             if (JsonFactory.Feature.CANONICALIZE_FIELD_NAMES.enabledIn(factoryFeatures)) {
255                 ByteQuadsCanonicalizer can = rootByteSymbols.makeChild(factoryFeatures);
256                 return new UTF8StreamJsonParser(_context, parserFeatures, _in, codec, can,
257                         _inputBuffer, _inputPtr, _inputEnd, bytesProcessed, _bufferRecyclable);
258             }
259         }
260         return new ReaderBasedJsonParser(_context, parserFeatures, constructReader(), codec,
261                 rootCharSymbols.makeChild(factoryFeatures));
262     }
263 
264     /*
265     /**********************************************************
266     /*  Encoding detection for data format auto-detection
267     /**********************************************************
268      */
269 
270     /**
271      * Current implementation is not as thorough as other functionality
272      * ({@link com.fasterxml.jackson.core.json.ByteSourceJsonBootstrapper});
273      * supports UTF-8, for example. But it should work, for now, and can
274      * be improved as necessary.
275      */
hasJSONFormat(InputAccessor acc)276     public static MatchStrength hasJSONFormat(InputAccessor acc) throws IOException
277     {
278         // Ideally we should see "[" or "{"; but if not, we'll accept double-quote (String)
279         // in future could also consider accepting non-standard matches?
280 
281         if (!acc.hasMoreBytes()) {
282             return MatchStrength.INCONCLUSIVE;
283         }
284         byte b = acc.nextByte();
285         // Very first thing, a UTF-8 BOM?
286         if (b == UTF8_BOM_1) { // yes, looks like UTF-8 BOM
287             if (!acc.hasMoreBytes()) {
288                 return MatchStrength.INCONCLUSIVE;
289             }
290             if (acc.nextByte() != UTF8_BOM_2) {
291                 return MatchStrength.NO_MATCH;
292             }
293             if (!acc.hasMoreBytes()) {
294                 return MatchStrength.INCONCLUSIVE;
295             }
296             if (acc.nextByte() != UTF8_BOM_3) {
297                 return MatchStrength.NO_MATCH;
298             }
299             if (!acc.hasMoreBytes()) {
300                 return MatchStrength.INCONCLUSIVE;
301             }
302             b = acc.nextByte();
303         }
304         // Then possible leading space
305         int ch = skipSpace(acc, b);
306         if (ch < 0) {
307             return MatchStrength.INCONCLUSIVE;
308         }
309         // First, let's see if it looks like a structured type:
310         if (ch == '{') { // JSON object?
311             // Ideally we need to find either double-quote or closing bracket
312             ch = skipSpace(acc);
313             if (ch < 0) {
314                 return MatchStrength.INCONCLUSIVE;
315             }
316             if (ch == '"' || ch == '}') {
317                 return MatchStrength.SOLID_MATCH;
318             }
319             // ... should we allow non-standard? Let's not yet... can add if need be
320             return MatchStrength.NO_MATCH;
321         }
322         MatchStrength strength;
323 
324         if (ch == '[') {
325             ch = skipSpace(acc);
326             if (ch < 0) {
327                 return MatchStrength.INCONCLUSIVE;
328             }
329             // closing brackets is easy; but for now, let's also accept opening...
330             if (ch == ']' || ch == '[') {
331                 return MatchStrength.SOLID_MATCH;
332             }
333             return MatchStrength.SOLID_MATCH;
334         } else {
335             // plain old value is not very convincing...
336             strength = MatchStrength.WEAK_MATCH;
337         }
338 
339         if (ch == '"') { // string value
340             return strength;
341         }
342         if (ch <= '9' && ch >= '0') { // number
343             return strength;
344         }
345         if (ch == '-') { // negative number
346             ch = skipSpace(acc);
347             if (ch < 0) {
348                 return MatchStrength.INCONCLUSIVE;
349             }
350             return (ch <= '9' && ch >= '0') ? strength : MatchStrength.NO_MATCH;
351         }
352         // or one of literals
353         if (ch == 'n') { // null
354             return tryMatch(acc, "ull", strength);
355         }
356         if (ch == 't') { // true
357             return tryMatch(acc, "rue", strength);
358         }
359         if (ch == 'f') { // false
360             return tryMatch(acc, "alse", strength);
361         }
362         return MatchStrength.NO_MATCH;
363     }
364 
tryMatch(InputAccessor acc, String matchStr, MatchStrength fullMatchStrength)365     private static MatchStrength tryMatch(InputAccessor acc, String matchStr, MatchStrength fullMatchStrength)
366         throws IOException
367     {
368         for (int i = 0, len = matchStr.length(); i < len; ++i) {
369             if (!acc.hasMoreBytes()) {
370                 return MatchStrength.INCONCLUSIVE;
371             }
372             if (acc.nextByte() != matchStr.charAt(i)) {
373                 return MatchStrength.NO_MATCH;
374             }
375         }
376         return fullMatchStrength;
377     }
378 
skipSpace(InputAccessor acc)379     private static int skipSpace(InputAccessor acc) throws IOException
380     {
381         if (!acc.hasMoreBytes()) {
382             return -1;
383         }
384         return skipSpace(acc, acc.nextByte());
385     }
386 
skipSpace(InputAccessor acc, byte b)387     private static int skipSpace(InputAccessor acc, byte b) throws IOException
388     {
389         while (true) {
390             int ch = (int) b & 0xFF;
391             if (!(ch == ' ' || ch == '\r' || ch == '\n' || ch == '\t')) {
392                 return ch;
393             }
394             if (!acc.hasMoreBytes()) {
395                 return -1;
396             }
397             b = acc.nextByte();
398         }
399     }
400 
401     /*
402     /**********************************************************
403     /* Internal methods, parsing
404     /**********************************************************
405      */
406 
407     /**
408      * @return True if a BOM was succesfully found, and encoding
409      *   thereby recognized.
410      */
handleBOM(int quad)411     private boolean handleBOM(int quad) throws IOException
412     {
413         /* Handling of (usually) optional BOM (required for
414          * multi-byte formats); first 32-bit charsets:
415          */
416         switch (quad) {
417         case 0x0000FEFF:
418             _bigEndian = true;
419             _inputPtr += 4;
420             _bytesPerChar = 4;
421             return true;
422         case 0xFFFE0000: // UCS-4, LE?
423             _inputPtr += 4;
424             _bytesPerChar = 4;
425             _bigEndian = false;
426             return true;
427         case 0x0000FFFE: // UCS-4, in-order...
428             reportWeirdUCS4("2143"); // throws exception
429             break; // never gets here
430         case 0xFEFF0000: // UCS-4, in-order...
431             reportWeirdUCS4("3412"); // throws exception
432             break; // never gets here
433         default:
434         }
435         // Ok, if not, how about 16-bit encoding BOMs?
436         int msw = quad >>> 16;
437         if (msw == 0xFEFF) { // UTF-16, BE
438             _inputPtr += 2;
439             _bytesPerChar = 2;
440             _bigEndian = true;
441             return true;
442         }
443         if (msw == 0xFFFE) { // UTF-16, LE
444             _inputPtr += 2;
445             _bytesPerChar = 2;
446             _bigEndian = false;
447             return true;
448         }
449         // And if not, then UTF-8 BOM?
450         if ((quad >>> 8) == 0xEFBBBF) { // UTF-8
451             _inputPtr += 3;
452             _bytesPerChar = 1;
453             _bigEndian = true; // doesn't really matter
454             return true;
455         }
456         return false;
457     }
458 
checkUTF32(int quad)459     private boolean checkUTF32(int quad) throws IOException
460     {
461         /* Handling of (usually) optional BOM (required for
462          * multi-byte formats); first 32-bit charsets:
463          */
464         if ((quad >> 8) == 0) { // 0x000000?? -> UTF32-BE
465             _bigEndian = true;
466         } else if ((quad & 0x00FFFFFF) == 0) { // 0x??000000 -> UTF32-LE
467             _bigEndian = false;
468         } else if ((quad & ~0x00FF0000) == 0) { // 0x00??0000 -> UTF32-in-order
469             reportWeirdUCS4("3412");
470         } else if ((quad & ~0x0000FF00) == 0) { // 0x0000??00 -> UTF32-in-order
471             reportWeirdUCS4("2143");
472         } else {
473             // Can not be valid UTF-32 encoded JSON...
474             return false;
475         }
476         // Not BOM (just regular content), nothing to skip past:
477         //_inputPtr += 4;
478         _bytesPerChar = 4;
479         return true;
480     }
481 
checkUTF16(int i16)482     private boolean checkUTF16(int i16)
483     {
484         if ((i16 & 0xFF00) == 0) { // UTF-16BE
485             _bigEndian = true;
486         } else if ((i16 & 0x00FF) == 0) { // UTF-16LE
487             _bigEndian = false;
488         } else { // nope, not  UTF-16
489             return false;
490         }
491         // Not BOM (just regular content), nothing to skip past:
492         //_inputPtr += 2;
493         _bytesPerChar = 2;
494         return true;
495     }
496 
497     /*
498     /**********************************************************
499     /* Internal methods, problem reporting
500     /**********************************************************
501      */
502 
reportWeirdUCS4(String type)503     private void reportWeirdUCS4(String type) throws IOException {
504         throw new CharConversionException("Unsupported UCS-4 endianness ("+type+") detected");
505     }
506 
507     /*
508     /**********************************************************
509     /* Internal methods, raw input access
510     /**********************************************************
511      */
512 
ensureLoaded(int minimum)513     protected boolean ensureLoaded(int minimum) throws IOException {
514         /* Let's assume here buffer has enough room -- this will always
515          * be true for the limited used this method gets
516          */
517         int gotten = (_inputEnd - _inputPtr);
518         while (gotten < minimum) {
519             int count;
520 
521             if (_in == null) { // block source
522                 count = -1;
523             } else {
524                 count = _in.read(_inputBuffer, _inputEnd, _inputBuffer.length - _inputEnd);
525             }
526             if (count < 1) {
527                 return false;
528             }
529             _inputEnd += count;
530             gotten += count;
531         }
532         return true;
533     }
534 }
535