1 package com.fasterxml.jackson.core.io; 2 3 import java.util.Arrays; 4 5 public final class CharTypes 6 { 7 private final static char[] HC = "0123456789ABCDEF".toCharArray(); 8 private final static byte[] HB; 9 static { 10 int len = HC.length; 11 HB = new byte[len]; 12 for (int i = 0; i < len; ++i) { 13 HB[i] = (byte) HC[i]; 14 } 15 } 16 17 18 /** 19 * Lookup table used for determining which input characters 20 * need special handling when contained in text segment. 21 */ 22 private final static int[] sInputCodes; 23 static { 24 /* 96 would do for most cases (backslash is ASCII 94) 25 * but if we want to do lookups by raw bytes it's better 26 * to have full table 27 */ 28 final int[] table = new int[256]; 29 // Control chars and non-space white space are not allowed unquoted 30 for (int i = 0; i < 32; ++i) { 31 table[i] = -1; 32 } 33 // And then string end and quote markers are special too 34 table['"'] = 1; 35 table['\\'] = 1; 36 sInputCodes = table; 37 } 38 39 /** 40 * Additionally we can combine UTF-8 decoding info into similar 41 * data table. 42 */ 43 private final static int[] sInputCodesUTF8; 44 static { 45 final int[] table = new int[sInputCodes.length]; System.arraycopy(sInputCodes, 0, table, 0, table.length)46 System.arraycopy(sInputCodes, 0, table, 0, table.length); 47 for (int c = 128; c < 256; ++c) { 48 int code; 49 50 // We'll add number of bytes needed for decoding 51 if ((c & 0xE0) == 0xC0) { // 2 bytes (0x0080 - 0x07FF) 52 code = 2; 53 } else if ((c & 0xF0) == 0xE0) { // 3 bytes (0x0800 - 0xFFFF) 54 code = 3; 55 } else if ((c & 0xF8) == 0xF0) { 56 // 4 bytes; double-char with surrogates and all... 57 code = 4; 58 } else { 59 // And -1 seems like a good "universal" error marker... 60 code = -1; 61 } 62 table[c] = code; 63 } 64 sInputCodesUTF8 = table; 65 } 66 67 /** 68 * To support non-default (and -standard) unquoted field names mode, 69 * need to have alternate checking. 70 * Basically this is list of 8-bit ASCII characters that are legal 71 * as part of Javascript identifier 72 */ 73 private final static int[] sInputCodesJsNames; 74 static { 75 final int[] table = new int[256]; 76 // Default is "not a name char", mark ones that are Arrays.fill(table, -1)77 Arrays.fill(table, -1); 78 // Assume rules with JS same as Java (change if/as needed) 79 for (int i = 33; i < 256; ++i) { 80 if (Character.isJavaIdentifierPart((char) i)) { 81 table[i] = 0; 82 } 83 } 84 /* As per [JACKSON-267], '@', '#' and '*' are also to be accepted as well. 85 * And '-' (for hyphenated names); and '+' for sake of symmetricity... 86 */ 87 table['@'] = 0; 88 table['#'] = 0; 89 table['*'] = 0; 90 table['-'] = 0; 91 table['+'] = 0; 92 sInputCodesJsNames = table; 93 } 94 95 /** 96 * This table is similar to Latin-1, except that it marks all "high-bit" 97 * code as ok. They will be validated at a later point, when decoding 98 * name 99 */ 100 private final static int[] sInputCodesUtf8JsNames; 101 static { 102 final int[] table = new int[256]; 103 // start with 8-bit JS names System.arraycopy(sInputCodesJsNames, 0, table, 0, table.length)104 System.arraycopy(sInputCodesJsNames, 0, table, 0, table.length); Arrays.fill(table, 128, 128, 0)105 Arrays.fill(table, 128, 128, 0); 106 sInputCodesUtf8JsNames = table; 107 } 108 109 /** 110 * Decoding table used to quickly determine characters that are 111 * relevant within comment content. 112 */ 113 private final static int[] sInputCodesComment; 114 static { 115 final int[] buf = new int[256]; 116 // but first: let's start with UTF-8 multi-byte markers: System.arraycopy(sInputCodesUTF8, 128, buf, 128, 128)117 System.arraycopy(sInputCodesUTF8, 128, buf, 128, 128); 118 119 // default (0) means "ok" (skip); -1 invalid, others marked by char itself Arrays.fill(buf, 0, 32, -1)120 Arrays.fill(buf, 0, 32, -1); // invalid white space 121 buf['\t'] = 0; // tab is still fine 122 buf['\n'] = '\n'; // lf/cr need to be observed, ends cpp comment 123 buf['\r'] = '\r'; 124 buf['*'] = '*'; // end marker for c-style comments 125 sInputCodesComment = buf; 126 } 127 128 /** 129 * Decoding table used for skipping white space and comments. 130 * 131 * @since 2.3 132 */ 133 private final static int[] sInputCodesWS; 134 static { 135 // but first: let's start with UTF-8 multi-byte markers: 136 final int[] buf = new int[256]; System.arraycopy(sInputCodesUTF8, 128, buf, 128, 128)137 System.arraycopy(sInputCodesUTF8, 128, buf, 128, 128); 138 139 // default (0) means "not whitespace" (end); 1 "whitespace", -1 invalid, 140 // 2-4 UTF-8 multi-bytes, others marked by char itself 141 // Arrays.fill(buf, 0, 32, -1)142 Arrays.fill(buf, 0, 32, -1); // invalid white space 143 buf[' '] = 1; 144 buf['\t'] = 1; 145 buf['\n'] = '\n'; // lf/cr need to be observed, ends cpp comment 146 buf['\r'] = '\r'; 147 buf['/'] = '/'; // start marker for c/cpp comments 148 buf['#'] = '#'; // start marker for YAML comments 149 sInputCodesWS = buf; 150 } 151 152 /** 153 * Lookup table used for determining which output characters in 154 * 7-bit ASCII range need to be quoted. 155 */ 156 private final static int[] sOutputEscapes128; 157 static { 158 int[] table = new int[128]; 159 // Control chars need generic escape sequence 160 for (int i = 0; i < 32; ++i) { 161 // 04-Mar-2011, tatu: Used to use "-(i + 1)", replaced with constant 162 table[i] = CharacterEscapes.ESCAPE_STANDARD; 163 } 164 // Others (and some within that range too) have explicit shorter sequences 165 table['"'] = '"'; 166 table['\\'] = '\\'; 167 // Escaping of slash is optional, so let's not add it 168 table[0x08] = 'b'; 169 table[0x09] = 't'; 170 table[0x0C] = 'f'; 171 table[0x0A] = 'n'; 172 table[0x0D] = 'r'; 173 sOutputEscapes128 = table; 174 } 175 176 /** 177 * Lookup table for the first 256 Unicode characters (ASCII / UTF-8) 178 * range. For actual hex digits, contains corresponding value; 179 * for others -1. 180 *<p> 181 * NOTE: before 2.10.1, was of size 128, extended for simpler handling 182 */ 183 private final static int[] sHexValues = new int[256]; 184 static { Arrays.fill(sHexValues, -1)185 Arrays.fill(sHexValues, -1); 186 for (int i = 0; i < 10; ++i) { 187 sHexValues['0' + i] = i; 188 } 189 for (int i = 0; i < 6; ++i) { 190 sHexValues['a' + i] = 10 + i; 191 sHexValues['A' + i] = 10 + i; 192 } 193 } 194 getInputCodeLatin1()195 public static int[] getInputCodeLatin1() { return sInputCodes; } getInputCodeUtf8()196 public static int[] getInputCodeUtf8() { return sInputCodesUTF8; } 197 getInputCodeLatin1JsNames()198 public static int[] getInputCodeLatin1JsNames() { return sInputCodesJsNames; } getInputCodeUtf8JsNames()199 public static int[] getInputCodeUtf8JsNames() { return sInputCodesUtf8JsNames; } 200 getInputCodeComment()201 public static int[] getInputCodeComment() { return sInputCodesComment; } getInputCodeWS()202 public static int[] getInputCodeWS() { return sInputCodesWS; } 203 204 /** 205 * Accessor for getting a read-only encoding table for first 128 Unicode 206 * code points (single-byte UTF-8 characters). 207 * Value of 0 means "no escaping"; other positive values that value is character 208 * to use after backslash; and negative values that generic (backslash - u) 209 * escaping is to be used. 210 */ get7BitOutputEscapes()211 public static int[] get7BitOutputEscapes() { return sOutputEscapes128; } 212 213 /** 214 * Alternative to {@link #get7BitOutputEscapes()} when a non-standard quote character 215 * is used. 216 * 217 * @since 2.10 218 */ get7BitOutputEscapes(int quoteChar)219 public static int[] get7BitOutputEscapes(int quoteChar) { 220 if (quoteChar == '"') { 221 return sOutputEscapes128; 222 } 223 return AltEscapes.instance.escapesFor(quoteChar); 224 } 225 charToHex(int ch)226 public static int charToHex(int ch) 227 { 228 // 08-Nov-2019, tatu: As per [core#540] and [core#578], changed to 229 // force masking here so caller need not do that. 230 return sHexValues[ch & 0xFF]; 231 } 232 appendQuoted(StringBuilder sb, String content)233 public static void appendQuoted(StringBuilder sb, String content) 234 { 235 final int[] escCodes = sOutputEscapes128; 236 int escLen = escCodes.length; 237 for (int i = 0, len = content.length(); i < len; ++i) { 238 char c = content.charAt(i); 239 if (c >= escLen || escCodes[c] == 0) { 240 sb.append(c); 241 continue; 242 } 243 sb.append('\\'); 244 int escCode = escCodes[c]; 245 if (escCode < 0) { // generic quoting (hex value) 246 // The only negative value sOutputEscapes128 returns 247 // is CharacterEscapes.ESCAPE_STANDARD, which mean 248 // appendQuotes should encode using the Unicode encoding; 249 // not sure if this is the right way to encode for 250 // CharacterEscapes.ESCAPE_CUSTOM or other (future) 251 // CharacterEscapes.ESCAPE_XXX values. 252 253 // We know that it has to fit in just 2 hex chars 254 sb.append('u'); 255 sb.append('0'); 256 sb.append('0'); 257 int value = c; // widening 258 sb.append(HC[value >> 4]); 259 sb.append(HC[value & 0xF]); 260 } else { // "named", i.e. prepend with slash 261 sb.append((char) escCode); 262 } 263 } 264 } 265 copyHexChars()266 public static char[] copyHexChars() { 267 return (char[]) HC.clone(); 268 } 269 copyHexBytes()270 public static byte[] copyHexBytes() { 271 return (byte[]) HB.clone(); 272 } 273 274 // @since 2.10 275 private static class AltEscapes { 276 public final static AltEscapes instance = new AltEscapes(); 277 278 private int[][] _altEscapes = new int[128][]; 279 escapesFor(int quoteChar)280 public int[] escapesFor(int quoteChar) { 281 int[] esc = _altEscapes[quoteChar]; 282 if (esc == null) { 283 esc = Arrays.copyOf(sOutputEscapes128, 128); 284 // Only add escape setting if character does not already have it 285 if (esc[quoteChar] == 0) { 286 esc[quoteChar] = CharacterEscapes.ESCAPE_STANDARD; 287 } 288 _altEscapes[quoteChar] = esc; 289 } 290 return esc; 291 } 292 } 293 } 294 295