• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package com.fasterxml.jackson.core.io;
2 
3 import java.util.Arrays;
4 
5 public final class CharTypes
6 {
7     private final static char[] HC = "0123456789ABCDEF".toCharArray();
8     private final static byte[] HB;
9     static {
10         int len = HC.length;
11         HB = new byte[len];
12         for (int i = 0; i < len; ++i) {
13             HB[i] = (byte) HC[i];
14         }
15     }
16 
17 
18     /**
19      * Lookup table used for determining which input characters
20      * need special handling when contained in text segment.
21      */
22     private final static int[] sInputCodes;
23     static {
24         /* 96 would do for most cases (backslash is ASCII 94)
25          * but if we want to do lookups by raw bytes it's better
26          * to have full table
27          */
28         final int[] table = new int[256];
29         // Control chars and non-space white space are not allowed unquoted
30         for (int i = 0; i < 32; ++i) {
31             table[i] = -1;
32         }
33         // And then string end and quote markers are special too
34         table['"'] = 1;
35         table['\\'] = 1;
36         sInputCodes = table;
37     }
38 
39     /**
40      * Additionally we can combine UTF-8 decoding info into similar
41      * data table.
42      */
43     private final static int[] sInputCodesUTF8;
44     static {
45         final int[] table = new int[sInputCodes.length];
System.arraycopy(sInputCodes, 0, table, 0, table.length)46         System.arraycopy(sInputCodes, 0, table, 0, table.length);
47         for (int c = 128; c < 256; ++c) {
48             int code;
49 
50             // We'll add number of bytes needed for decoding
51             if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF)
52                 code = 2;
53             } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF)
54                 code = 3;
55             } else if ((c & 0xF8) == 0xF0) {
56                 // 4 bytes; double-char with surrogates and all...
57                 code = 4;
58             } else {
59                 // And -1 seems like a good "universal" error marker...
60                 code = -1;
61             }
62             table[c] = code;
63         }
64         sInputCodesUTF8 = table;
65     }
66 
67     /**
68      * To support non-default (and -standard) unquoted field names mode,
69      * need to have alternate checking.
70      * Basically this is list of 8-bit ASCII characters that are legal
71      * as part of Javascript identifier
72      */
73     private final static int[] sInputCodesJsNames;
74     static {
75         final int[] table = new int[256];
76         // Default is "not a name char", mark ones that are
Arrays.fill(table, -1)77         Arrays.fill(table, -1);
78         // Assume rules with JS same as Java (change if/as needed)
79         for (int i = 33; i < 256; ++i) {
80             if (Character.isJavaIdentifierPart((char) i)) {
81                 table[i] = 0;
82             }
83         }
84         /* As per [JACKSON-267], '@', '#' and '*' are also to be accepted as well.
85          * And '-' (for hyphenated names); and '+' for sake of symmetricity...
86          */
87         table['@'] = 0;
88         table['#'] = 0;
89         table['*'] = 0;
90         table['-'] = 0;
91         table['+'] = 0;
92         sInputCodesJsNames = table;
93     }
94 
95     /**
96      * This table is similar to Latin-1, except that it marks all "high-bit"
97      * code as ok. They will be validated at a later point, when decoding
98      * name
99      */
100     private final static int[] sInputCodesUtf8JsNames;
101     static {
102         final int[] table = new int[256];
103         // start with 8-bit JS names
System.arraycopy(sInputCodesJsNames, 0, table, 0, table.length)104         System.arraycopy(sInputCodesJsNames, 0, table, 0, table.length);
Arrays.fill(table, 128, 128, 0)105         Arrays.fill(table, 128, 128, 0);
106         sInputCodesUtf8JsNames = table;
107     }
108 
109     /**
110      * Decoding table used to quickly determine characters that are
111      * relevant within comment content.
112      */
113     private final static int[] sInputCodesComment;
114     static {
115         final int[] buf = new int[256];
116         // but first: let's start with UTF-8 multi-byte markers:
System.arraycopy(sInputCodesUTF8, 128, buf, 128, 128)117         System.arraycopy(sInputCodesUTF8, 128, buf, 128, 128);
118 
119         // default (0) means "ok" (skip); -1 invalid, others marked by char itself
Arrays.fill(buf, 0, 32, -1)120         Arrays.fill(buf, 0, 32, -1); // invalid white space
121         buf['\t'] = 0; // tab is still fine
122         buf['\n'] = '\n'; // lf/cr need to be observed, ends cpp comment
123         buf['\r'] = '\r';
124         buf['*'] = '*'; // end marker for c-style comments
125         sInputCodesComment = buf;
126     }
127 
128     /**
129      * Decoding table used for skipping white space and comments.
130      *
131      * @since 2.3
132      */
133     private final static int[] sInputCodesWS;
134     static {
135         // but first: let's start with UTF-8 multi-byte markers:
136         final int[] buf = new int[256];
System.arraycopy(sInputCodesUTF8, 128, buf, 128, 128)137         System.arraycopy(sInputCodesUTF8, 128, buf, 128, 128);
138 
139         // default (0) means "not whitespace" (end); 1 "whitespace", -1 invalid,
140         // 2-4 UTF-8 multi-bytes, others marked by char itself
141         //
Arrays.fill(buf, 0, 32, -1)142         Arrays.fill(buf, 0, 32, -1); // invalid white space
143         buf[' '] = 1;
144         buf['\t'] = 1;
145         buf['\n'] = '\n'; // lf/cr need to be observed, ends cpp comment
146         buf['\r'] = '\r';
147         buf['/'] = '/'; // start marker for c/cpp comments
148         buf['#'] = '#'; // start marker for YAML comments
149         sInputCodesWS = buf;
150     }
151 
152     /**
153      * Lookup table used for determining which output characters in
154      * 7-bit ASCII range need to be quoted.
155      */
156     private final static int[] sOutputEscapes128;
157     static {
158         int[] table = new int[128];
159         // Control chars need generic escape sequence
160         for (int i = 0; i < 32; ++i) {
161             // 04-Mar-2011, tatu: Used to use "-(i + 1)", replaced with constant
162             table[i] = CharacterEscapes.ESCAPE_STANDARD;
163         }
164         // Others (and some within that range too) have explicit shorter sequences
165         table['"'] = '"';
166         table['\\'] = '\\';
167         // Escaping of slash is optional, so let's not add it
168         table[0x08] = 'b';
169         table[0x09] = 't';
170         table[0x0C] = 'f';
171         table[0x0A] = 'n';
172         table[0x0D] = 'r';
173         sOutputEscapes128 = table;
174     }
175 
176     /**
177      * Lookup table for the first 256 Unicode characters (ASCII / UTF-8)
178      * range. For actual hex digits, contains corresponding value;
179      * for others -1.
180      *<p>
181      * NOTE: before 2.10.1, was of size 128, extended for simpler handling
182      */
183     private final static int[] sHexValues = new int[256];
184     static {
Arrays.fill(sHexValues, -1)185         Arrays.fill(sHexValues, -1);
186         for (int i = 0; i < 10; ++i) {
187             sHexValues['0' + i] = i;
188         }
189         for (int i = 0; i < 6; ++i) {
190             sHexValues['a' + i] = 10 + i;
191             sHexValues['A' + i] = 10 + i;
192         }
193     }
194 
getInputCodeLatin1()195     public static int[] getInputCodeLatin1() { return sInputCodes; }
getInputCodeUtf8()196     public static int[] getInputCodeUtf8() { return sInputCodesUTF8; }
197 
getInputCodeLatin1JsNames()198     public static int[] getInputCodeLatin1JsNames() { return sInputCodesJsNames; }
getInputCodeUtf8JsNames()199     public static int[] getInputCodeUtf8JsNames() { return sInputCodesUtf8JsNames; }
200 
getInputCodeComment()201     public static int[] getInputCodeComment() { return sInputCodesComment; }
getInputCodeWS()202     public static int[] getInputCodeWS() { return sInputCodesWS; }
203 
204     /**
205      * Accessor for getting a read-only encoding table for first 128 Unicode
206      * code points (single-byte UTF-8 characters).
207      * Value of 0 means "no escaping"; other positive values that value is character
208      * to use after backslash; and negative values that generic (backslash - u)
209      * escaping is to be used.
210      */
get7BitOutputEscapes()211     public static int[] get7BitOutputEscapes() { return sOutputEscapes128; }
212 
213     /**
214      * Alternative to {@link #get7BitOutputEscapes()} when a non-standard quote character
215      * is used.
216      *
217      * @since 2.10
218      */
get7BitOutputEscapes(int quoteChar)219     public static int[] get7BitOutputEscapes(int quoteChar) {
220         if (quoteChar == '"') {
221             return sOutputEscapes128;
222         }
223         return AltEscapes.instance.escapesFor(quoteChar);
224     }
225 
charToHex(int ch)226     public static int charToHex(int ch)
227     {
228         // 08-Nov-2019, tatu: As per [core#540] and [core#578], changed to
229         //   force masking here so caller need not do that.
230         return sHexValues[ch & 0xFF];
231     }
232 
appendQuoted(StringBuilder sb, String content)233     public static void appendQuoted(StringBuilder sb, String content)
234     {
235         final int[] escCodes = sOutputEscapes128;
236         int escLen = escCodes.length;
237         for (int i = 0, len = content.length(); i < len; ++i) {
238             char c = content.charAt(i);
239             if (c >= escLen || escCodes[c] == 0) {
240                 sb.append(c);
241                 continue;
242             }
243             sb.append('\\');
244             int escCode = escCodes[c];
245             if (escCode < 0) { // generic quoting (hex value)
246                 // The only negative value sOutputEscapes128 returns
247                 // is CharacterEscapes.ESCAPE_STANDARD, which mean
248                 // appendQuotes should encode using the Unicode encoding;
249                 // not sure if this is the right way to encode for
250                 // CharacterEscapes.ESCAPE_CUSTOM or other (future)
251                 // CharacterEscapes.ESCAPE_XXX values.
252 
253                 // We know that it has to fit in just 2 hex chars
254                 sb.append('u');
255                 sb.append('0');
256                 sb.append('0');
257                 int value = c;  // widening
258                 sb.append(HC[value >> 4]);
259                 sb.append(HC[value & 0xF]);
260             } else { // "named", i.e. prepend with slash
261                 sb.append((char) escCode);
262             }
263         }
264     }
265 
copyHexChars()266     public static char[] copyHexChars() {
267         return (char[]) HC.clone();
268     }
269 
copyHexBytes()270     public static byte[] copyHexBytes() {
271         return (byte[]) HB.clone();
272     }
273 
274     // @since 2.10
275     private static class AltEscapes {
276         public final static AltEscapes instance = new AltEscapes();
277 
278         private int[][] _altEscapes = new int[128][];
279 
escapesFor(int quoteChar)280         public int[] escapesFor(int quoteChar) {
281             int[] esc = _altEscapes[quoteChar];
282             if (esc == null) {
283                 esc = Arrays.copyOf(sOutputEscapes128, 128);
284                 // Only add escape setting if character does not already have it
285                 if (esc[quoteChar] == 0) {
286                     esc[quoteChar] = CharacterEscapes.ESCAPE_STANDARD;
287                 }
288                 _altEscapes[quoteChar] = esc;
289             }
290             return esc;
291         }
292     }
293 }
294 
295