• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // =================================================================================================
2 // ADOBE SYSTEMS INCORPORATED
3 // Copyright 2006 Adobe Systems Incorporated
4 // All Rights Reserved
5 //
6 // NOTICE:  Adobe permits you to use, modify, and distribute this file in accordance with the terms
7 // of the Adobe license agreement accompanying it.
8 // =================================================================================================
9 
10 
11 
12 package com.adobe.xmp.impl;
13 
14 import java.io.UnsupportedEncodingException;
15 
16 
17 /**
18  * @since   12.10.2006
19  */
20 public class Latin1Converter
21 {
22 	/** */
23 	private static final int STATE_START = 0;
24 	/** */
25 	private static final int STATE_UTF8CHAR = 11;
26 
27 
28 	/**
29 	 * Private constructor
30 	 */
Latin1Converter()31 	private Latin1Converter()
32 	{
33 		// EMPTY
34 	}
35 
36 
37 	/**
38 	 * A converter that processes a byte buffer containing a mix of UTF8 and Latin-1/Cp1252 chars.
39 	 * The result is a buffer where those chars have been converted to UTF-8;
40 	 * that means it contains only valid UTF-8 chars.
41 	 * <p>
42 	 * <em>Explanation of the processing:</em> First the encoding of the buffer is detected looking
43 	 * at the first four bytes (that works only if the buffer starts with an ASCII-char,
44 	 * like xmls &apos;&lt;&apos;). UTF-16/32 flavours do not require further proccessing.
45 	 * <p>
46 	 * In the case, UTF-8 is detected, it assumes wrong UTF8 chars to be a sequence of
47 	 * Latin-1/Cp1252 encoded bytes and converts the chars to their corresponding UTF-8 byte
48 	 * sequence.
49 	 * <p>
50 	 * The 0x80..0x9F range is undefined in Latin-1, but is defined in Windows code
51 	 * page 1252. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are formally undefined
52 	 * by Windows 1252. These are in XML's RestrictedChar set, so we map them to a
53 	 * space.
54 	 * <p>
55 	 * The official Latin-1 characters in the range 0xA0..0xFF are converted into
56 	 * the Unicode Latin Supplement range U+00A0 - U+00FF.
57 	 * <p>
58 	 * <em>Example:</em> If an Euro-symbol (€) appears in the byte buffer (0xE2, 0x82, 0xAC),
59 	 * it will be left as is. But if only the first two bytes are appearing,
60 	 * followed by an ASCII char a (0xE2 - 0x82 - 0x41), it will be converted to
61 	 * 0xC3, 0xA2 (â) - 0xE2, 0x80, 0x9A (‚) - 0x41 (a).
62 	 *
63 	 * @param buffer a byte buffer contain
64 	 * @return Returns a new buffer containing valid UTF-8
65 	 */
convert(ByteBuffer buffer)66 	public static ByteBuffer convert(ByteBuffer buffer)
67 	{
68 		if ("UTF-8".equals(buffer.getEncoding()))
69 		{
70 			// the buffer containing one UTF-8 char (up to 8 bytes)
71 			byte[] readAheadBuffer = new byte[8];
72 			// the number of bytes read ahead.
73 			int readAhead  = 0;
74 			// expected UTF8 bytesto come
75 			int expectedBytes = 0;
76 			// output buffer with estimated length
77 			ByteBuffer out = new ByteBuffer(buffer.length() * 4 / 3);
78 
79 			int state = STATE_START;
80 			for (int i = 0; i < buffer.length(); i++)
81 			{
82 				int b = buffer.charAt(i);
83 
84 				switch (state)
85 				{
86 					default:
87 					case STATE_START:
88 						if (b < 0x7F)
89 						{
90 							out.append((byte) b);
91 						}
92 						else if (b >= 0xC0)
93 						{
94 							// start of UTF8 sequence
95 							expectedBytes = -1;
96 							int test = b;
97 							for (; expectedBytes < 8  &&  (test & 0x80) == 0x80; test = test << 1)
98 							{
99 								expectedBytes++;
100 							}
101 							readAheadBuffer[readAhead++] = (byte) b;
102 							state = STATE_UTF8CHAR;
103 						}
104 						else //  implicitly:  b >= 0x80  &&  b < 0xC0
105 						{
106 							// invalid UTF8 start char, assume to be Latin-1
107 							byte[] utf8 = convertToUTF8((byte) b);
108 							out.append(utf8);
109 						}
110 						break;
111 
112 					case STATE_UTF8CHAR:
113 						if (expectedBytes > 0  &&  (b & 0xC0) == 0x80)
114 						{
115 							// valid UTF8 char, add to readAheadBuffer
116 							readAheadBuffer[readAhead++] = (byte) b;
117 							expectedBytes--;
118 
119 							if (expectedBytes == 0)
120 							{
121 								out.append(readAheadBuffer, 0, readAhead);
122 								readAhead = 0;
123 
124 								state = STATE_START;
125 							}
126 						}
127 						else
128 						{
129 							// invalid UTF8 char:
130 							// 1. convert first of seq to UTF8
131 							byte[] utf8 = convertToUTF8(readAheadBuffer[0]);
132 							out.append(utf8);
133 
134 							// 2. continue processing at second byte of sequence
135 							i = i - readAhead;
136 							readAhead = 0;
137 
138 							state = STATE_START;
139 						}
140 						break;
141 				}
142 			}
143 
144 			// loop ends with "half" Utf8 char --> assume that the bytes are Latin-1
145 			if (state == STATE_UTF8CHAR)
146 			{
147 				for (int j = 0; j < readAhead; j++)
148 				{
149 					byte b = readAheadBuffer[j];
150 					byte[] utf8 = convertToUTF8(b);
151 					out.append(utf8);
152 				}
153 			}
154 
155 			return out;
156 		}
157 		else
158 		{
159 			// Latin-1 fixing applies only to UTF-8
160 			return buffer;
161 		}
162 	}
163 
164 
165 	/**
166 	 * Converts a Cp1252 char (contains all Latin-1 chars above 0x80) into a
167 	 * UTF-8 byte sequence. The bytes 0x81, 0x8D, 0x8F, 0x90, and 0x9D are
168 	 * formally undefined by Windows 1252 and therefore replaced by a space
169 	 * (0x20).
170 	 *
171 	 * @param ch
172 	 *            an Cp1252 / Latin-1 byte
173 	 * @return Returns a byte array containing a UTF-8 byte sequence.
174 	 */
convertToUTF8(byte ch)175 	private static byte[] convertToUTF8(byte ch)
176 	{
177 		int c = ch & 0xFF;
178 		try
179 		{
180 			if (c >= 0x80)
181 			{
182 				if (c == 0x81  ||  c == 0x8D  ||  c == 0x8F  ||  c == 0x90  ||  c == 0x9D)
183 				{
184 					return new byte[] { 0x20 }; // space for undefined
185 				}
186 
187 				// interpret byte as Windows Cp1252 char
188 				return new String(new byte[] { ch }, "cp1252").getBytes("UTF-8");
189 			}
190 		}
191 		catch (UnsupportedEncodingException e)
192 		{
193 			// EMPTY
194 		}
195 		return new byte[] { ch };
196 	}
197 }
198