1 /**************************************************************** 2 * Licensed to the Apache Software Foundation (ASF) under one * 3 * or more contributor license agreements. See the NOTICE file * 4 * distributed with this work for additional information * 5 * regarding copyright ownership. The ASF licenses this file * 6 * to you under the Apache License, Version 2.0 (the * 7 * "License"); you may not use this file except in compliance * 8 * with the License. You may obtain a copy of the License at * 9 * * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, * 13 * software distributed under the License is distributed on an * 14 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * 15 * KIND, either express or implied. See the License for the * 16 * specific language governing permissions and limitations * 17 * under the License. * 18 ****************************************************************/ 19 20 package org.apache.james.mime4j; 21 22 import java.io.IOException; 23 import java.io.InputStream; 24 import java.util.BitSet; 25 import java.util.LinkedList; 26 27 import org.apache.commons.logging.Log; 28 import org.apache.commons.logging.LogFactory; 29 import org.apache.james.mime4j.decoder.Base64InputStream; 30 import org.apache.james.mime4j.decoder.QuotedPrintableInputStream; 31 32 /** 33 * <p> 34 * Parses MIME (or RFC822) message streams of bytes or characters and reports 35 * parsing events to a <code>ContentHandler</code> instance. 36 * </p> 37 * <p> 38 * Typical usage:<br/> 39 * <pre> 40 * ContentHandler handler = new MyHandler(); 41 * MimeStreamParser parser = new MimeStreamParser(); 42 * parser.setContentHandler(handler); 43 * parser.parse(new BufferedInputStream(new FileInputStream("mime.msg"))); 44 * </pre> 45 * <strong>NOTE:</strong> All lines must end with CRLF 46 * (<code>\r\n</code>). If you are unsure of the line endings in your stream 47 * you should wrap it in a {@link org.apache.james.mime4j.EOLConvertingInputStream} instance. 48 * 49 * 50 * @version $Id: MimeStreamParser.java,v 1.8 2005/02/11 10:12:02 ntherning Exp $ 51 */ 52 public class MimeStreamParser { 53 private static final Log log = LogFactory.getLog(MimeStreamParser.class); 54 55 private static BitSet fieldChars = null; 56 57 private RootInputStream rootStream = null; 58 private LinkedList bodyDescriptors = new LinkedList(); 59 private ContentHandler handler = null; 60 private boolean raw = false; 61 62 static { 63 fieldChars = new BitSet(); 64 for (int i = 0x21; i <= 0x39; i++) { 65 fieldChars.set(i); 66 } 67 for (int i = 0x3b; i <= 0x7e; i++) { 68 fieldChars.set(i); 69 } 70 } 71 72 /** 73 * Creates a new <code>MimeStreamParser</code> instance. 74 */ MimeStreamParser()75 public MimeStreamParser() { 76 } 77 78 /** 79 * Parses a stream of bytes containing a MIME message. 80 * 81 * @param is the stream to parse. 82 * @throws IOException on I/O errors. 83 */ parse(InputStream is)84 public void parse(InputStream is) throws IOException { 85 rootStream = new RootInputStream(is); 86 parseMessage(rootStream); 87 } 88 89 /** 90 * Determines if this parser is currently in raw mode. 91 * 92 * @return <code>true</code> if in raw mode, <code>false</code> 93 * otherwise. 94 * @see #setRaw(boolean) 95 */ isRaw()96 public boolean isRaw() { 97 return raw; 98 } 99 100 /** 101 * Enables or disables raw mode. In raw mode all future entities 102 * (messages or body parts) in the stream will be reported to the 103 * {@link ContentHandler#raw(InputStream)} handler method only. 104 * The stream will contain the entire unparsed entity contents 105 * including header fields and whatever is in the body. 106 * 107 * @param raw <code>true</code> enables raw mode, <code>false</code> 108 * disables it. 109 */ setRaw(boolean raw)110 public void setRaw(boolean raw) { 111 this.raw = raw; 112 } 113 114 /** 115 * Finishes the parsing and stops reading lines. 116 * NOTE: No more lines will be parsed but the parser 117 * will still call 118 * {@link ContentHandler#endMultipart()}, 119 * {@link ContentHandler#endBodyPart()}, 120 * {@link ContentHandler#endMessage()}, etc to match previous calls 121 * to 122 * {@link ContentHandler#startMultipart(BodyDescriptor)}, 123 * {@link ContentHandler#startBodyPart()}, 124 * {@link ContentHandler#startMessage()}, etc. 125 */ stop()126 public void stop() { 127 rootStream.truncate(); 128 } 129 130 /** 131 * Parses an entity which consists of a header followed by a body containing 132 * arbitrary data, body parts or an embedded message. 133 * 134 * @param is the stream to parse. 135 * @throws IOException on I/O errors. 136 */ parseEntity(InputStream is)137 private void parseEntity(InputStream is) throws IOException { 138 BodyDescriptor bd = parseHeader(is); 139 140 if (bd.isMultipart()) { 141 bodyDescriptors.addFirst(bd); 142 143 handler.startMultipart(bd); 144 145 MimeBoundaryInputStream tempIs = 146 new MimeBoundaryInputStream(is, bd.getBoundary()); 147 handler.preamble(new CloseShieldInputStream(tempIs)); 148 tempIs.consume(); 149 150 while (tempIs.hasMoreParts()) { 151 tempIs = new MimeBoundaryInputStream(is, bd.getBoundary()); 152 parseBodyPart(tempIs); 153 tempIs.consume(); 154 if (tempIs.parentEOF()) { 155 if (log.isWarnEnabled()) { 156 log.warn("Line " + rootStream.getLineNumber() 157 + ": Body part ended prematurely. " 158 + "Higher level boundary detected or " 159 + "EOF reached."); 160 } 161 break; 162 } 163 } 164 165 handler.epilogue(new CloseShieldInputStream(is)); 166 167 handler.endMultipart(); 168 169 bodyDescriptors.removeFirst(); 170 171 } else if (bd.isMessage()) { 172 if (bd.isBase64Encoded()) { 173 log.warn("base64 encoded message/rfc822 detected"); 174 is = new EOLConvertingInputStream( 175 new Base64InputStream(is)); 176 } else if (bd.isQuotedPrintableEncoded()) { 177 log.warn("quoted-printable encoded message/rfc822 detected"); 178 is = new EOLConvertingInputStream( 179 new QuotedPrintableInputStream(is)); 180 } 181 bodyDescriptors.addFirst(bd); 182 parseMessage(is); 183 bodyDescriptors.removeFirst(); 184 } else { 185 handler.body(bd, new CloseShieldInputStream(is)); 186 } 187 188 /* 189 * Make sure the stream has been consumed. 190 */ 191 while (is.read() != -1) { 192 } 193 } 194 parseMessage(InputStream is)195 private void parseMessage(InputStream is) throws IOException { 196 if (raw) { 197 handler.raw(new CloseShieldInputStream(is)); 198 } else { 199 handler.startMessage(); 200 parseEntity(is); 201 handler.endMessage(); 202 } 203 } 204 parseBodyPart(InputStream is)205 private void parseBodyPart(InputStream is) throws IOException { 206 if (raw) { 207 handler.raw(new CloseShieldInputStream(is)); 208 } else { 209 handler.startBodyPart(); 210 parseEntity(is); 211 handler.endBodyPart(); 212 } 213 } 214 215 /** 216 * Parses a header. 217 * 218 * @param is the stream to parse. 219 * @return a <code>BodyDescriptor</code> describing the body following 220 * the header. 221 */ parseHeader(InputStream is)222 private BodyDescriptor parseHeader(InputStream is) throws IOException { 223 BodyDescriptor bd = new BodyDescriptor(bodyDescriptors.isEmpty() 224 ? null : (BodyDescriptor) bodyDescriptors.getFirst()); 225 226 handler.startHeader(); 227 228 int lineNumber = rootStream.getLineNumber(); 229 230 StringBuffer sb = new StringBuffer(); 231 int curr = 0; 232 int prev = 0; 233 while ((curr = is.read()) != -1) { 234 if (curr == '\n' && (prev == '\n' || prev == 0)) { 235 /* 236 * [\r]\n[\r]\n or an immediate \r\n have been seen. 237 */ 238 sb.deleteCharAt(sb.length() - 1); 239 break; 240 } 241 sb.append((char) curr); 242 prev = curr == '\r' ? prev : curr; 243 } 244 245 if (curr == -1 && log.isWarnEnabled()) { 246 log.warn("Line " + rootStream.getLineNumber() 247 + ": Unexpected end of headers detected. " 248 + "Boundary detected in header or EOF reached."); 249 } 250 251 int start = 0; 252 int pos = 0; 253 int startLineNumber = lineNumber; 254 while (pos < sb.length()) { 255 while (pos < sb.length() && sb.charAt(pos) != '\r') { 256 pos++; 257 } 258 if (pos < sb.length() - 1 && sb.charAt(pos + 1) != '\n') { 259 pos++; 260 continue; 261 } 262 263 if (pos >= sb.length() - 2 || fieldChars.get(sb.charAt(pos + 2))) { 264 265 /* 266 * field should be the complete field data excluding the 267 * trailing \r\n. 268 */ 269 String field = sb.substring(start, pos); 270 start = pos + 2; 271 272 /* 273 * Check for a valid field. 274 */ 275 int index = field.indexOf(':'); 276 boolean valid = false; 277 if (index != -1 && fieldChars.get(field.charAt(0))) { 278 valid = true; 279 String fieldName = field.substring(0, index).trim(); 280 for (int i = 0; i < fieldName.length(); i++) { 281 if (!fieldChars.get(fieldName.charAt(i))) { 282 valid = false; 283 break; 284 } 285 } 286 287 if (valid) { 288 handler.field(field); 289 bd.addField(fieldName, field.substring(index + 1)); 290 } 291 } 292 293 if (!valid && log.isWarnEnabled()) { 294 log.warn("Line " + startLineNumber 295 + ": Ignoring invalid field: '" + field.trim() + "'"); 296 } 297 298 startLineNumber = lineNumber; 299 } 300 301 pos += 2; 302 lineNumber++; 303 } 304 305 handler.endHeader(); 306 307 return bd; 308 } 309 310 /** 311 * Sets the <code>ContentHandler</code> to use when reporting 312 * parsing events. 313 * 314 * @param h the <code>ContentHandler</code>. 315 */ setContentHandler(ContentHandler h)316 public void setContentHandler(ContentHandler h) { 317 this.handler = h; 318 } 319 320 } 321