// Copyright (c) 2012 Jeff Ichnowski
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
//     * Redistributions of source code must retain the above
//       copyright notice, this list of conditions and the following
//       disclaimer.
//
//     * Redistributions in binary form must reproduce the above
//       copyright notice, this list of conditions and the following
//       disclaimer in the documentation and/or other materials
//       provided with the distribution.
//
//     * Neither the name of the OWASP nor the names of its
//       contributors may be used to endorse or promote products
//       derived from this software without specific prior written
//       permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
// HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
// STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
// OF THE POSSIBILITY OF SUCH DAMAGE.
package org.owasp.encoder;

import java.nio.CharBuffer;
import java.nio.charset.CoderResult;

/**
 * JavaEncoder -- Encoder for Java based strings. Useful in Java code
 * generators to generate efficiently encoded strings for arbitrary data.
 * This encoder uses the minimal sequence of characters required to encode
 * a character (e.g. standard backslash escapes, such as "\n", "\\", "\'",
 * octal escapes, and unicode escapes). This encoder does NOT check UTF-16
 * surrogate pair sequences. The target output context supports mismatched
 * UTF-16 pairs (e.g. it will compile, run, etc... with them).
 *
 * @author Jeff Ichnowski
 */
class JavaEncoder extends Encoder {

    /**
     * The length of a Unicode escape, e.g. "\\u1234".
     */
    static final int U_ESCAPE_LENGTH = 6;
    /**
     * The length of an octal escape sequence, e.g. "\377".
     */
    static final int OCT_ESCAPE_LENGTH = 4;
    /**
     * Number of bits to shift for each octal unit.
     */
    static final int OCT_SHIFT = 3;
    /**
     * The bit-mask for an octal unit.
     */
    static final int OCT_MASK = 7;

    /**
     * Returns the largest number of output characters that {@code n} input
     * characters can expand to, which is the case where every character
     * becomes a six character unicode escape.
     *
     * @param n the number of input characters
     * @return {@code n * U_ESCAPE_LENGTH}
     */
    @Override
    protected int maxEncodedLength(int n) {
        return U_ESCAPE_LENGTH * n;
    }

    /**
     * Finds the first character in the given range that cannot be copied
     * to the output verbatim: anything outside {@code ' '..'~'}, or a
     * backslash, single quote or double quote.
     *
     * @param input the string to scan
     * @param off index of the first character to examine
     * @param len number of characters to examine
     * @return the index of the first character needing an escape, or
     * {@code off + len} when no character in the range needs one
     */
    @Override
    protected int firstEncodedOffset(String input, int off, int len) {
        final int end = off + len;
        int idx = off;
        while (idx < end) {
            final char c = input.charAt(idx);
            final boolean plainRange = c >= ' ' && c <= '~';
            if (!plainRange || c == '\\' || c == '\'' || c == '"') {
                return idx;
            }
            idx++;
        }
        return end;
    }

    /**
     * Encodes characters from the array backing {@code input} into the
     * array backing {@code output}, choosing for each character the
     * shortest escape this encoder knows.
     *
     * <p>Characters in {@code ' '..'~'} are copied as-is, except backslash
     * and the two quote characters, which get a backslash prefix. The
     * control characters {@code \b \t \n \f \r} use their letter escapes.
     * Remaining characters up to {@code '\377'} use an octal escape, and
     * everything above that uses a {@code \}{@code uXXXX} escape.</p>
     *
     * <p>When the output has too little room for the next encoded
     * character the method returns {@code overflow(...)} with the current
     * positions; in every other case it returns {@code underflow(...)}.
     * In both cases the input index passed along is that of the first
     * character that was not encoded.</p>
     *
     * @param input array-backed buffer holding the characters to encode
     * @param output array-backed buffer receiving the encoded characters
     * @param endOfInput {@code true} when no further input will follow
     * the contents of {@code input}
     * @return the value produced by {@code overflow} or {@code underflow}
     */
    @Override
    protected CoderResult encodeArrays(CharBuffer input, CharBuffer output, boolean endOfInput) {
        final char[] src = input.array();
        final char[] dst = output.array();
        int readPos = input.arrayOffset() + input.position();
        final int readEnd = input.arrayOffset() + input.limit();
        int writePos = output.arrayOffset() + output.position();
        final int writeEnd = output.arrayOffset() + output.limit();

        for (; readPos < readEnd; ++readPos) {
            final char c = src[readPos];

            // Printable ASCII: verbatim, or backslash-prefixed for \ ' "
            if (c >= ' ' && c <= '~') {
                final boolean quoted = c == '\\' || c == '\'' || c == '"';
                final int needed = quoted ? 2 : 1;
                if (writePos + needed > writeEnd) {
                    return overflow(input, readPos, output, writePos);
                }
                if (quoted) {
                    dst[writePos++] = '\\';
                }
                dst[writePos++] = c;
                continue;
            }

            // Control characters that have a dedicated letter escape.
            final char letter = simpleEscapeLetter(c);
            if (letter != '\0') {
                if (writePos + 2 > writeEnd) {
                    return overflow(input, readPos, output, writePos);
                }
                dst[writePos++] = '\\';
                dst[writePos++] = letter;
                continue;
            }

            // Everything above '\377' needs the six character unicode form.
            if (c > '\377') {
                if (writePos + U_ESCAPE_LENGTH > writeEnd) {
                    return overflow(input, readPos, output, writePos);
                }
                dst[writePos++] = '\\';
                dst[writePos++] = 'u';
                dst[writePos++] = HEX[c >>> (3 * HEX_SHIFT)];
                dst[writePos++] = HEX[(c >>> (2 * HEX_SHIFT)) & HEX_MASK];
                dst[writePos++] = HEX[(c >>> HEX_SHIFT) & HEX_MASK];
                dst[writePos++] = HEX[c & HEX_MASK];
                continue;
            }

            // "Short" octal escapes ('\0' to '\37') cannot be followed by
            // '0' to '7', so using one requires a look at the next character.
            if (c <= '\37') {
                final boolean octalDigitFollows;
                if (readPos + 1 < readEnd) {
                    final char next = src[readPos + 1];
                    octalDigitFollows = next >= '0' && next <= '7';
                } else if (endOfInput) {
                    octalDigitFollows = false;
                } else {
                    // The following character has not arrived yet; stop
                    // here and leave this one unconsumed.
                    return underflow(input, readPos, output, writePos);
                }

                if (!octalDigitFollows) {
                    if (c <= '\7') {
                        // one octal digit
                        if (writePos + 2 > writeEnd) {
                            return overflow(input, readPos, output, writePos);
                        }
                        dst[writePos++] = '\\';
                        dst[writePos++] = (char) (c + '0');
                    } else {
                        // two octal digits
                        if (writePos + 3 > writeEnd) {
                            return overflow(input, readPos, output, writePos);
                        }
                        dst[writePos++] = '\\';
                        dst[writePos++] = (char) ((c >>> OCT_SHIFT) + '0');
                        dst[writePos++] = (char) ((c & OCT_MASK) + '0');
                    }
                    continue;
                }
            }

            // Full three digit octal escape.
            if (writePos + OCT_ESCAPE_LENGTH > writeEnd) {
                return overflow(input, readPos, output, writePos);
            }
            dst[writePos++] = '\\';
            dst[writePos++] = (char) ((c >>> (2 * OCT_SHIFT)) + '0');
            dst[writePos++] = (char) (((c >>> OCT_SHIFT) & OCT_MASK) + '0');
            dst[writePos++] = (char) ((c & OCT_MASK) + '0');
        }

        return underflow(input, readPos, output, writePos);
    }

    /**
     * Maps a control character to the letter of its two character
     * backslash escape.
     *
     * @param ch the character to look up
     * @return {@code 'b'}, {@code 't'}, {@code 'n'}, {@code 'f'} or
     * {@code 'r'} for backspace, tab, line feed, form feed and carriage
     * return respectively; {@code '\0'} for any other character
     */
    private static char simpleEscapeLetter(char ch) {
        switch (ch) {
            case '\b':
                return 'b';
            case '\t':
                return 't';
            case '\n':
                return 'n';
            case '\f':
                return 'f';
            case '\r':
                return 'r';
            default:
                return '\0';
        }
    }
}