// =================================================================================================
// ADOBE SYSTEMS INCORPORATED
// Copyright 2006 Adobe Systems Incorporated
// All Rights Reserved
//
// NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
// of the Adobe license agreement accompanying it.
// =================================================================================================



package com.adobe.xmp.impl;

import java.io.UnsupportedEncodingException;


/**
 * Utility that repairs byte buffers which are nominally UTF-8 but contain stray
 * Latin-1/Cp1252 encoded bytes.
 *
 * @since   12.10.2006
 */
public class Latin1Converter
{
	/** Scanner state: not inside a multi-byte UTF-8 sequence. */
	private static final int STATE_START = 0;
	/** Scanner state: a lead byte has been read and continuation bytes are expected. */
	private static final int STATE_UTF8CHAR = 11;
	/** Length in bytes of the longest well-formed UTF-8 sequence. */
	private static final int MAX_UTF8_LENGTH = 4;


	/**
	 * Private constructor, the class only offers static methods.
	 */
	private Latin1Converter()
	{
		// EMPTY
	}


	/**
	 * A converter that processes a byte buffer containing a mix of UTF-8 and Latin-1/Cp1252 chars.
	 * The result is a buffer where the Latin-1/Cp1252 bytes have been converted to UTF-8.
	 * <p>
	 * <em>Explanation of the processing:</em> The encoding is taken from
	 * {@link ByteBuffer#getEncoding()}. Only buffers reported as "UTF-8" are processed;
	 * any other buffer is returned unchanged (the very same instance).
	 * <p>
	 * In the case UTF-8 is reported, every byte that is not part of a complete multi-byte
	 * sequence (a lead byte 0xC2..0xF4 followed by the number of continuation bytes
	 * 0x80..0xBF that the lead byte announces) is assumed to be a Latin-1/Cp1252 encoded
	 * byte and is replaced by the corresponding UTF-8 byte sequence.
	 * The bytes 0xC0, 0xC1 and 0xF5..0xFF can never start a well-formed UTF-8 sequence and are
	 * therefore always treated as Latin-1/Cp1252. Only the lead byte and the number of
	 * continuation bytes are checked; overlong forms and surrogates that use a permitted
	 * lead byte are passed through as they are.
	 * <p>
	 * The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code
	 * page 1252. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined
	 * by Windows 1252. These are in XML's RestrictedChar set, so we map them to a
	 * space.
	 * <p>
	 * The official Latin-1 characters in the range 0xA0..0xFF are converted into
	 * the Unicode Latin Supplement range U+00A0 - U+00FF.
	 * <p>
	 * <em>Example:</em> If an Euro-symbol (€) appears in the byte buffer (0xE2, 0x82, 0xAC),
	 * it will be left as is. But if only the first two bytes are appearing,
	 * followed by an ASCII char a (0xE2 - 0x82 - 0x41), it will be converted to
	 * 0xC3, 0xA2 (â) - 0xE2, 0x80, 0x9A (‚) - 0x41 (a).
	 *
	 * @param buffer a byte buffer containing UTF-8 possibly mixed with Latin-1/Cp1252 bytes
	 * @return Returns a new buffer with the converted content if the input is reported as
	 *         UTF-8, otherwise the unchanged input buffer.
	 */
	public static ByteBuffer convert(ByteBuffer buffer)
	{
		if (!"UTF-8".equals(buffer.getEncoding()))
		{
			// Latin-1 fixing applies only to UTF-8
			return buffer;
		}

		final int length = buffer.length();

		// the bytes of the multi-byte sequence that is currently being collected
		byte[] readAheadBuffer = new byte[MAX_UTF8_LENGTH];
		// the number of bytes read ahead
		int readAhead = 0;
		// continuation bytes that are still expected for the current sequence
		int expectedBytes = 0;
		// output buffer with estimated length; computed in long to avoid int overflow
		ByteBuffer out = new ByteBuffer((int) Math.min((long) Integer.MAX_VALUE, length * 4L / 3));

		int state = STATE_START;
		for (int i = 0; i < length; i++)
		{
			int b = buffer.charAt(i);

			switch (state)
			{
				default:
				case STATE_START:
					if (b < 0x80)
					{
						// plain ASCII is copied as is
						out.append((byte) b);
					}
					else
					{
						expectedBytes = continuationBytes(b);
						if (expectedBytes > 0)
						{
							// start of a UTF-8 sequence, hold it back until it is complete
							readAheadBuffer[readAhead++] = (byte) b;
							state = STATE_UTF8CHAR;
						}
						else
						{
							// not a valid UTF-8 start byte, assume it to be Latin-1/Cp1252
							out.append(convertToUTF8((byte) b));
						}
					}
					break;

				case STATE_UTF8CHAR:
					if ((b & 0xC0) == 0x80)
					{
						// valid continuation byte, add to readAheadBuffer
						readAheadBuffer[readAhead++] = (byte) b;
						expectedBytes--;

						if (expectedBytes == 0)
						{
							// sequence is complete, copy it unchanged
							out.append(readAheadBuffer, 0, readAhead);
							readAhead = 0;
							state = STATE_START;
						}
					}
					else
					{
						// broken UTF-8 sequence:
						// 1. convert the first byte of the sequence as Latin-1/Cp1252
						out.append(convertToUTF8(readAheadBuffer[0]));

						// 2. continue processing at the second byte of the sequence;
						//    the loop increment moves i from the first to the second byte
						i = i - readAhead;
						readAhead = 0;
						state = STATE_START;
					}
					break;
			}
		}

		// loop ends with "half" UTF-8 char --> assume that the bytes are Latin-1/Cp1252
		if (state == STATE_UTF8CHAR)
		{
			for (int j = 0; j < readAhead; j++)
			{
				out.append(convertToUTF8(readAheadBuffer[j]));
			}
		}

		return out;
	}


	/**
	 * Determines how many continuation bytes have to follow a UTF-8 lead byte.
	 *
	 * @param b an unsigned byte value (0..255)
	 * @return Returns 1, 2 or 3 for the lead bytes 0xC2..0xDF, 0xE0..0xEF and 0xF0..0xF4,
	 *         and 0 for every value that cannot start a multi-byte UTF-8 sequence.
	 */
	private static int continuationBytes(int b)
	{
		if (b >= 0xC2 && b <= 0xDF)
		{
			return 1;
		}
		else if (b >= 0xE0 && b <= 0xEF)
		{
			return 2;
		}
		else if (b >= 0xF0 && b <= 0xF4)
		{
			return 3;
		}
		else
		{
			return 0;
		}
	}


	/**
	 * Converts a Cp1252 char (contains all Latin-1 chars above 0x80) into a
	 * UTF-8 byte sequence. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are
	 * formally undefined by Windows 1252 and therefore replaced by a space
	 * (0x20). Bytes below 0x80 are returned unchanged.
	 *
	 * @param ch
	 *            an Cp1252 / Latin-1 byte
	 * @return Returns a byte array containing a UTF-8 byte sequence.
	 */
	private static byte[] convertToUTF8(byte ch)
	{
		int c = ch & 0xFF;
		if (c < 0x80)
		{
			// ASCII is identical in Cp1252 and UTF-8
			return new byte[] { ch };
		}

		if (c == 0x81 || c == 0x8D || c == 0x8F || c == 0x90 || c == 0x9D)
		{
			return new byte[] { 0x20 }; // space for undefined
		}

		try
		{
			// interpret byte as Windows Cp1252 char
			return new String(new byte[] { ch }, "cp1252").getBytes("UTF-8");
		}
		catch (UnsupportedEncodingException e)
		{
			// The charset is not available on this JVM: fall back to Latin-1, where the
			// byte value equals the code point (U+0080..U+00FF), and encode it by hand as
			// a two-byte UTF-8 sequence, so that no raw high byte reaches the output.
			return new byte[] { (byte) (0xC0 | (c >> 6)), (byte) (0x80 | (c & 0x3F)) };
		}
	}
}