1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "net/tools/balsa/balsa_frame.h"
6
7 #include <assert.h>
8 #if __SSE2__
9 #include <emmintrin.h>
10 #endif // __SSE2__
11
12 #include <limits>
13 #include <string>
14 #include <utility>
15 #include <vector>
16
17 #include "base/logging.h"
18 #include "base/port.h"
19 #include "base/strings/string_piece.h"
20 #include "net/tools/balsa/balsa_enums.h"
21 #include "net/tools/balsa/balsa_headers.h"
22 #include "net/tools/balsa/balsa_visitor_interface.h"
23 #include "net/tools/balsa/buffer_interface.h"
24 #include "net/tools/balsa/simple_buffer.h"
25 #include "net/tools/balsa/split.h"
26 #include "net/tools/balsa/string_piece_utils.h"
27
28 #if defined(COMPILER_MSVC)
29 #include <string.h>
30 #define strncasecmp _strnicmp
31 #else
32 #include <strings.h>
33 #endif
34
35 namespace net {
36
// Header names that influence how an HTTP message is framed. The framer has
// to recognise these while walking the header block, so both the lower-case
// spelling and its length are kept at hand.
static const char kContentLength[] = "content-length";
static const char kTransferEncoding[] = "transfer-encoding";

// String lengths of the names above; the "- 1" drops the terminating NUL
// that sizeof() counts.
static const size_t kContentLengthSize = sizeof(kContentLength) - 1;
static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1;
43
BalsaFrame()44 BalsaFrame::BalsaFrame()
45 : last_char_was_slash_r_(false),
46 saw_non_newline_char_(false),
47 start_was_space_(true),
48 chunk_length_character_extracted_(false),
49 is_request_(true),
50 request_was_head_(false),
51 max_header_length_(16 * 1024),
52 max_request_uri_length_(2048),
53 visitor_(&do_nothing_visitor_),
54 chunk_length_remaining_(0),
55 content_length_remaining_(0),
56 last_slash_n_loc_(NULL),
57 last_recorded_slash_n_loc_(NULL),
58 last_slash_n_idx_(0),
59 term_chars_(0),
60 parse_state_(BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE),
61 last_error_(BalsaFrameEnums::NO_ERROR),
62 headers_(NULL) {
63 }
64
~BalsaFrame()65 BalsaFrame::~BalsaFrame() {}
66
Reset()67 void BalsaFrame::Reset() {
68 last_char_was_slash_r_ = false;
69 saw_non_newline_char_ = false;
70 start_was_space_ = true;
71 chunk_length_character_extracted_ = false;
72 // is_request_ = true; // not reset between messages.
73 // request_was_head_ = false; // not reset between messages.
74 // max_header_length_ = 4096; // not reset between messages.
75 // max_request_uri_length_ = 2048; // not reset between messages.
76 // visitor_ = &do_nothing_visitor_; // not reset between messages.
77 chunk_length_remaining_ = 0;
78 content_length_remaining_ = 0;
79 last_slash_n_loc_ = NULL;
80 last_recorded_slash_n_loc_ = NULL;
81 last_slash_n_idx_ = 0;
82 term_chars_ = 0;
83 parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE;
84 last_error_ = BalsaFrameEnums::NO_ERROR;
85 lines_.clear();
86 if (headers_ != NULL) {
87 headers_->Clear();
88 }
89 }
90
ParseStateToString(BalsaFrameEnums::ParseState error_code)91 const char* BalsaFrameEnums::ParseStateToString(
92 BalsaFrameEnums::ParseState error_code) {
93 switch (error_code) {
94 case PARSE_ERROR:
95 return "PARSE_ERROR";
96 case READING_HEADER_AND_FIRSTLINE:
97 return "READING_HEADER_AND_FIRSTLINE";
98 case READING_CHUNK_LENGTH:
99 return "READING_CHUNK_LENGTH";
100 case READING_CHUNK_EXTENSION:
101 return "READING_CHUNK_EXTENSION";
102 case READING_CHUNK_DATA:
103 return "READING_CHUNK_DATA";
104 case READING_CHUNK_TERM:
105 return "READING_CHUNK_TERM";
106 case READING_LAST_CHUNK_TERM:
107 return "READING_LAST_CHUNK_TERM";
108 case READING_TRAILER:
109 return "READING_TRAILER";
110 case READING_UNTIL_CLOSE:
111 return "READING_UNTIL_CLOSE";
112 case READING_CONTENT:
113 return "READING_CONTENT";
114 case MESSAGE_FULLY_READ:
115 return "MESSAGE_FULLY_READ";
116 case NUM_STATES:
117 return "UNKNOWN_STATE";
118 }
119 return "UNKNOWN_STATE";
120 }
121
ErrorCodeToString(BalsaFrameEnums::ErrorCode error_code)122 const char* BalsaFrameEnums::ErrorCodeToString(
123 BalsaFrameEnums::ErrorCode error_code) {
124 switch (error_code) {
125 case NO_ERROR:
126 return "NO_ERROR";
127 case NO_STATUS_LINE_IN_RESPONSE:
128 return "NO_STATUS_LINE_IN_RESPONSE";
129 case NO_REQUEST_LINE_IN_REQUEST:
130 return "NO_REQUEST_LINE_IN_REQUEST";
131 case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION:
132 return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION";
133 case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD:
134 return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD";
135 case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE:
136 return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE";
137 case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI:
138 return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI";
139 case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE:
140 return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE";
141 case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION:
142 return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION";
143 case FAILED_CONVERTING_STATUS_CODE_TO_INT:
144 return "FAILED_CONVERTING_STATUS_CODE_TO_INT";
145 case REQUEST_URI_TOO_LONG:
146 return "REQUEST_URI_TOO_LONG";
147 case HEADERS_TOO_LONG:
148 return "HEADERS_TOO_LONG";
149 case UNPARSABLE_CONTENT_LENGTH:
150 return "UNPARSABLE_CONTENT_LENGTH";
151 case MAYBE_BODY_BUT_NO_CONTENT_LENGTH:
152 return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH";
153 case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH:
154 return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH";
155 case HEADER_MISSING_COLON:
156 return "HEADER_MISSING_COLON";
157 case INVALID_CHUNK_LENGTH:
158 return "INVALID_CHUNK_LENGTH";
159 case CHUNK_LENGTH_OVERFLOW:
160 return "CHUNK_LENGTH_OVERFLOW";
161 case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO:
162 return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO";
163 case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT:
164 return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT";
165 case MULTIPLE_CONTENT_LENGTH_KEYS:
166 return "MULTIPLE_CONTENT_LENGTH_KEYS";
167 case MULTIPLE_TRANSFER_ENCODING_KEYS:
168 return "MULTIPLE_TRANSFER_ENCODING_KEYS";
169 case UNKNOWN_TRANSFER_ENCODING:
170 return "UNKNOWN_TRANSFER_ENCODING";
171 case INVALID_HEADER_FORMAT:
172 return "INVALID_HEADER_FORMAT";
173 case INTERNAL_LOGIC_ERROR:
174 return "INTERNAL_LOGIC_ERROR";
175 case NUM_ERROR_CODES:
176 return "UNKNOWN_ERROR";
177 }
178 return "UNKNOWN_ERROR";
179 }
180
181 // Summary:
182 // Parses the first line of either a request or response.
183 // Note that in the case of a detected warning, error_code will be set
184 // but the function will not return false.
185 // Exactly zero or one warning or error (but not both) may be detected
186 // by this function.
187 // Note that this function will not write the data of the first-line
188 // into the header's buffer (that should already have been done elsewhere).
189 //
190 // Pre-conditions:
191 // begin != end
192 // *begin should be a character which is > ' '. This implies that there
193 // is at least one non-whitespace characters between [begin, end).
194 // headers is a valid pointer to a BalsaHeaders class.
195 // error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value.
196 // Entire first line must exist between [begin, end)
197 // Exactly zero or one newlines -may- exist between [begin, end)
198 // [begin, end) should exist in the header's buffer.
199 //
200 // Side-effects:
201 // headers will be modified
202 // error_code may be modified if either a warning or error is detected
203 //
204 // Returns:
205 // True if no error (as opposed to warning) is detected.
206 // False if an error (as opposed to warning) is detected.
207
208 //
209 // If there is indeed non-whitespace in the line, then the following
210 // will take care of this for you:
211 // while (*begin <= ' ') ++begin;
212 // ProcessFirstLine(begin, end, is_request, &headers, &error_code);
213 //
ParseHTTPFirstLine(const char * begin,const char * end,bool is_request,size_t max_request_uri_length,BalsaHeaders * headers,BalsaFrameEnums::ErrorCode * error_code)214 bool ParseHTTPFirstLine(const char* begin,
215 const char* end,
216 bool is_request,
217 size_t max_request_uri_length,
218 BalsaHeaders* headers,
219 BalsaFrameEnums::ErrorCode* error_code) {
220 const char* current = begin;
221 // HTTP firstlines all have the following structure:
222 // LWS NONWS LWS NONWS LWS NONWS NOTCRLF CRLF
223 // [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n"
224 // ws1 nws1 ws2 nws2 ws3 nws3 ws4
225 // | [-------) [-------) [----------------)
226 // REQ: method request_uri version
227 // RESP: version statuscode reason
228 //
229 // The first NONWS->LWS component we'll call firstline_a.
230 // The second firstline_b, and the third firstline_c.
231 //
232 // firstline_a goes from nws1 to (but not including) ws2
233 // firstline_b goes from nws2 to (but not including) ws3
234 // firstline_c goes from nws3 to (but not including) ws4
235 //
236 // In the code:
237 // ws1 == whitespace_1_idx_
238 // nws1 == non_whitespace_1_idx_
239 // ws2 == whitespace_2_idx_
240 // nws2 == non_whitespace_2_idx_
241 // ws3 == whitespace_3_idx_
242 // nws3 == non_whitespace_3_idx_
243 // ws4 == whitespace_4_idx_
244
245 // Kill all whitespace (including '\r\n') at the end of the line.
246 --end;
247 if (*end != '\n') {
248 *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
249 LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
250 << headers->OriginalHeadersForDebugging();
251 return false;
252 }
253 while (begin < end && *end <= ' ') {
254 --end;
255 }
256 DCHECK(*end != '\n');
257 if (*end == '\n') {
258 *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
259 LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
260 << headers->OriginalHeadersForDebugging();
261 return false;
262 }
263 ++end;
264
265 // The two following statements should not be possible.
266 if (end == begin) {
267 *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
268 LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
269 << headers->OriginalHeadersForDebugging();
270 return false;
271 }
272
273 // whitespace_1_idx_
274 headers->whitespace_1_idx_ = current - begin;
275 // This loop is commented out as it is never used in current code. This is
276 // true only because we don't begin parsing the headers at all until we've
277 // encountered a non whitespace character at the beginning of the stream, at
278 // which point we begin our demarcation of header-start. If we did -not- do
279 // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop
280 // would be necessary for the proper functioning of this parsing.
281 // This is left here as this function may (in the future) be refactored out
282 // of the BalsaFrame class so that it may be shared between code in
283 // BalsaFrame and BalsaHeaders (where it would be used in some variant of the
284 // set_first_line() function (at which point it would be necessary).
285 #if 0
286 while (*current <= ' ') {
287 ++current;
288 }
289 #endif
290 // non_whitespace_1_idx_
291 headers->non_whitespace_1_idx_ = current - begin;
292 do {
293 // The first time through, we're guaranteed that the current character
294 // won't be a whitespace (else the loop above wouldn't have terminated).
295 // That implies that we're guaranteed to get at least one non-whitespace
296 // character if we get into this loop at all.
297 ++current;
298 if (current == end) {
299 headers->whitespace_2_idx_ = current - begin;
300 headers->non_whitespace_2_idx_ = current - begin;
301 headers->whitespace_3_idx_ = current - begin;
302 headers->non_whitespace_3_idx_ = current - begin;
303 headers->whitespace_4_idx_ = current - begin;
304 // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD for request
305 // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response
306 *error_code =
307 static_cast<BalsaFrameEnums::ErrorCode>(
308 BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION +
309 is_request);
310 if (!is_request) { // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION
311 return false;
312 }
313 goto output_exhausted;
314 }
315 } while (*current > ' ');
316 // whitespace_2_idx_
317 headers->whitespace_2_idx_ = current - begin;
318 do {
319 ++current;
320 // Note that due to the loop which consumes all of the whitespace
321 // at the end of the line, current can never == end while in this function.
322 } while (*current <= ' ');
323 // non_whitespace_2_idx_
324 headers->non_whitespace_2_idx_ = current - begin;
325 do {
326 ++current;
327 if (current == end) {
328 headers->whitespace_3_idx_ = current - begin;
329 headers->non_whitespace_3_idx_ = current - begin;
330 headers->whitespace_4_idx_ = current - begin;
331 // FAILED_TO_FIND_START_OF_REQUEST_REQUEST_URI for request
332 // FAILED_TO_FIND_START_OF_RESPONSE_STATUSCODE for response
333 *error_code =
334 static_cast<BalsaFrameEnums::ErrorCode>(
335 BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE
336 + is_request);
337 goto output_exhausted;
338 }
339 } while (*current > ' ');
340 // whitespace_3_idx_
341 headers->whitespace_3_idx_ = current - begin;
342 do {
343 ++current;
344 // Note that due to the loop which consumes all of the whitespace
345 // at the end of the line, current can never == end while in this function.
346 } while (*current <= ' ');
347 // non_whitespace_3_idx_
348 headers->non_whitespace_3_idx_ = current - begin;
349 headers->whitespace_4_idx_ = end - begin;
350
351 output_exhausted:
352 // Note that we don't fail the parse immediately when parsing of the
353 // firstline fails. Depending on the protocol type, we may want to accept
354 // a firstline with only one or two elements, e.g., for HTTP/0.9:
355 // GET\r\n
356 // or
357 // GET /\r\n
358 // should be parsed without issue (though the visitor should know that
359 // parsing the entire line was not exactly as it should be).
360 //
361 // Eventually, these errors may be removed alltogether, as the visitor can
362 // detect them on its own by examining the size of the various fields.
363 // headers->set_first_line(non_whitespace_1_idx_, current);
364
365 if (is_request) {
366 if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) >
367 max_request_uri_length) {
368 // For requests, we need at least the method. We could assume that a
369 // blank URI means "/". If version isn't stated, it should be assumed
370 // to be HTTP/0.9 by the visitor.
371 *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG;
372 return false;
373 }
374 } else {
375 headers->parsed_response_code_ = 0;
376 {
377 const char* parsed_response_code_current =
378 begin + headers->non_whitespace_2_idx_;
379 const char* parsed_response_code_end = begin + headers->whitespace_3_idx_;
380 const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;
381
382 // Convert a string of [0-9]* into an int.
383 // Note that this allows for the conversion of response codes which
384 // are outside the bounds of normal HTTP response codes (no checking
385 // is done to ensure that these are valid-- they're merely parsed)!
386 while (parsed_response_code_current < parsed_response_code_end) {
387 if (*parsed_response_code_current < '0' ||
388 *parsed_response_code_current > '9') {
389 *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
390 return false;
391 }
392 size_t status_code_x_10 = headers->parsed_response_code_ * 10;
393 uint8 c = *parsed_response_code_current - '0';
394 if ((headers->parsed_response_code_ > kMaxDiv10) ||
395 (std::numeric_limits<size_t>::max() - status_code_x_10) < c) {
396 // overflow.
397 *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
398 return false;
399 }
400 headers->parsed_response_code_ = status_code_x_10 + c;
401 ++parsed_response_code_current;
402 }
403 }
404 }
405 return true;
406 }
407
408 // begin - beginning of the firstline
409 // end - end of the firstline
410 //
411 // A precondition for this function is that there is non-whitespace between
412 // [begin, end). If this precondition is not met, the function will not perform
413 // as expected (and bad things may happen, and it will eat your first, second,
414 // and third unborn children!).
415 //
416 // Another precondition for this function is that [begin, end) includes
417 // at most one newline, which must be at the end of the line.
ProcessFirstLine(const char * begin,const char * end)418 void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) {
419 BalsaFrameEnums::ErrorCode previous_error = last_error_;
420 if (!ParseHTTPFirstLine(begin,
421 end,
422 is_request_,
423 max_request_uri_length_,
424 headers_,
425 &last_error_)) {
426 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
427 visitor_->HandleHeaderError(this);
428 return;
429 }
430 if (previous_error != last_error_) {
431 visitor_->HandleHeaderWarning(this);
432 }
433
434 if (is_request_) {
435 size_t version_length =
436 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_;
437 visitor_->ProcessRequestFirstLine(
438 begin + headers_->non_whitespace_1_idx_,
439 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
440 begin + headers_->non_whitespace_1_idx_,
441 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
442 begin + headers_->non_whitespace_2_idx_,
443 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
444 begin + headers_->non_whitespace_3_idx_,
445 version_length);
446 if (version_length == 0)
447 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
448 } else {
449 visitor_->ProcessResponseFirstLine(
450 begin + headers_->non_whitespace_1_idx_,
451 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
452 begin + headers_->non_whitespace_1_idx_,
453 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
454 begin + headers_->non_whitespace_2_idx_,
455 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
456 begin + headers_->non_whitespace_3_idx_,
457 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_);
458 }
459 }
460
461 // 'stream_begin' points to the first character of the headers buffer.
462 // 'line_begin' points to the first character of the line.
463 // 'current' points to a char which is ':'.
464 // 'line_end' points to the position of '\n' + 1.
465 // 'line_begin' points to the position of first character of line.
CleanUpKeyValueWhitespace(const char * stream_begin,const char * line_begin,const char * current,const char * line_end,HeaderLineDescription * current_header_line)466 void BalsaFrame::CleanUpKeyValueWhitespace(
467 const char* stream_begin,
468 const char* line_begin,
469 const char* current,
470 const char* line_end,
471 HeaderLineDescription* current_header_line) {
472 const char* colon_loc = current;
473 DCHECK_LT(colon_loc, line_end);
474 DCHECK_EQ(':', *colon_loc);
475 DCHECK_EQ(':', *current);
476 DCHECK_GE(' ', *line_end)
477 << "\"" << std::string(line_begin, line_end) << "\"";
478
479 // TODO(fenix): Investigate whether or not the bounds tests in the
480 // while loops here are redundant, and if so, remove them.
481 --current;
482 while (current > line_begin && *current <= ' ') --current;
483 current += (current != colon_loc);
484 current_header_line->key_end_idx = current - stream_begin;
485
486 current = colon_loc;
487 DCHECK_EQ(':', *current);
488 ++current;
489 while (current < line_end && *current <= ' ') ++current;
490 current_header_line->value_begin_idx = current - stream_begin;
491
492 DCHECK_GE(current_header_line->key_end_idx,
493 current_header_line->first_char_idx);
494 DCHECK_GE(current_header_line->value_begin_idx,
495 current_header_line->key_end_idx);
496 DCHECK_GE(current_header_line->last_char_idx,
497 current_header_line->value_begin_idx);
498 }
499
FindColonsAndParseIntoKeyValue()500 inline void BalsaFrame::FindColonsAndParseIntoKeyValue() {
501 DCHECK(!lines_.empty());
502 const char* stream_begin = headers_->OriginalHeaderStreamBegin();
503 // The last line is always just a newline (and is uninteresting).
504 const Lines::size_type lines_size_m1 = lines_.size() - 1;
505 #if __SSE2__
506 const __v16qi colons = { ':', ':', ':', ':', ':', ':', ':', ':',
507 ':', ':', ':', ':', ':', ':', ':', ':'};
508 const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16;
509 #endif // __SSE2__
510 const char* current = stream_begin + lines_[1].first;
511 // This code is a bit more subtle than it may appear at first glance.
512 // This code looks for a colon in the current line... but it also looks
513 // beyond the current line. If there is no colon in the current line, then
514 // for each subsequent line (until the colon which -has- been found is
515 // associated with a line), no searching for a colon will be performed. In
516 // this way, we minimize the amount of bytes we have scanned for a colon.
517 for (Lines::size_type i = 1; i < lines_size_m1;) {
518 const char* line_begin = stream_begin + lines_[i].first;
519
520 // Here we handle possible continuations. Note that we do not replace
521 // the '\n' in the line before a continuation (at least, as of now),
522 // which implies that any code which looks for a value must deal with
523 // "\r\n", etc -within- the line (and not just at the end of it).
524 for (++i; i < lines_size_m1; ++i) {
525 const char c = *(stream_begin + lines_[i].first);
526 if (c > ' ') {
527 // Not a continuation, so stop. Note that if the 'original' i = 1,
528 // and the next line is not a continuation, we'll end up with i = 2
529 // when we break. This handles the incrementing of i for the outer
530 // loop.
531 break;
532 }
533 }
534 const char* line_end = stream_begin + lines_[i - 1].second;
535 DCHECK_LT(line_begin - stream_begin, line_end - stream_begin);
536
537 // We cleanup the whitespace at the end of the line before doing anything
538 // else of interest as it allows us to do nothing when irregularly formatted
539 // headers are parsed (e.g. those with only keys, only values, or no colon).
540 //
541 // We're guaranteed to have *line_end > ' ' while line_end >= line_begin.
542 --line_end;
543 DCHECK_EQ('\n', *line_end)
544 << "\"" << std::string(line_begin, line_end) << "\"";
545 while (*line_end <= ' ' && line_end > line_begin) {
546 --line_end;
547 }
548 ++line_end;
549 DCHECK_GE(' ', *line_end);
550 DCHECK_LT(line_begin, line_end);
551
552 // We use '0' for the block idx, because we're always writing to the first
553 // block from the framer (we do this because the framer requires that the
554 // entire header sequence be in a contiguous buffer).
555 headers_->header_lines_.push_back(
556 HeaderLineDescription(line_begin - stream_begin,
557 line_end - stream_begin,
558 line_end - stream_begin,
559 line_end - stream_begin,
560 0));
561 if (current >= line_end) {
562 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
563 visitor_->HandleHeaderWarning(this);
564 // Then the next colon will not be found within this header line-- time
565 // to try again with another header-line.
566 continue;
567 } else if (current < line_begin) {
568 // When this condition is true, the last detected colon was part of a
569 // previous line. We reset to the beginning of the line as we don't care
570 // about the presence of any colon before the beginning of the current
571 // line.
572 current = line_begin;
573 }
574 #if __SSE2__
575 while (current < header_lines_end_m16) {
576 __m128i header_bytes =
577 _mm_loadu_si128(reinterpret_cast<const __m128i *>(current));
578 __m128i colon_cmp =
579 _mm_cmpeq_epi8(header_bytes, reinterpret_cast<__m128i>(colons));
580 int colon_msk = _mm_movemask_epi8(colon_cmp);
581 if (colon_msk == 0) {
582 current += 16;
583 continue;
584 }
585 current += (ffs(colon_msk) - 1);
586 if (current > line_end) {
587 break;
588 }
589 goto found_colon;
590 }
591 #endif // __SSE2__
592 for (; current < line_end; ++current) {
593 if (*current != ':') {
594 continue;
595 }
596 goto found_colon;
597 }
598 // If we've gotten to here, then there was no colon
599 // in the line. The arguments we passed into the construction
600 // for the HeaderLineDescription object should be OK-- it assumes
601 // that the entire content is 'key' by default (which is true, as
602 // there was no colon, there can be no value). Note that this is a
603 // construct which is technically not allowed by the spec.
604 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
605 visitor_->HandleHeaderWarning(this);
606 continue;
607 found_colon:
608 DCHECK_EQ(*current, ':');
609 DCHECK_LE(current - stream_begin, line_end - stream_begin);
610 DCHECK_LE(stream_begin - stream_begin, current - stream_begin);
611
612 HeaderLineDescription& current_header_line = headers_->header_lines_.back();
613 current_header_line.key_end_idx = current - stream_begin;
614 current_header_line.value_begin_idx = current_header_line.key_end_idx;
615 if (current < line_end) {
616 ++current_header_line.key_end_idx;
617
618 CleanUpKeyValueWhitespace(stream_begin,
619 line_begin,
620 current,
621 line_end,
622 ¤t_header_line);
623 }
624 }
625 }
626
ProcessContentLengthLine(HeaderLines::size_type line_idx,BalsaHeadersEnums::ContentLengthStatus * status,size_t * length)627 void BalsaFrame::ProcessContentLengthLine(
628 HeaderLines::size_type line_idx,
629 BalsaHeadersEnums::ContentLengthStatus* status,
630 size_t* length) {
631 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
632 const char* stream_begin = headers_->OriginalHeaderStreamBegin();
633 const char* line_end = stream_begin + header_line.last_char_idx;
634 const char* value_begin = (stream_begin + header_line.value_begin_idx);
635
636 if (value_begin >= line_end) {
637 // There is no non-whitespace value data.
638 #if DEBUGFRAMER
639 LOG(INFO) << "invalid content-length -- no non-whitespace value data";
640 #endif
641 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
642 return;
643 }
644
645 *length = 0;
646 while (value_begin < line_end) {
647 if (*value_begin < '0' || *value_begin > '9') {
648 // bad! content-length found, and couldn't parse all of it!
649 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
650 #if DEBUGFRAMER
651 LOG(INFO) << "invalid content-length - non numeric character detected";
652 #endif // DEBUGFRAMER
653 return;
654 }
655 const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;
656 size_t length_x_10 = *length * 10;
657 const unsigned char c = *value_begin - '0';
658 if (*length > kMaxDiv10 ||
659 (std::numeric_limits<size_t>::max() - length_x_10) < c) {
660 *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW;
661 #if DEBUGFRAMER
662 LOG(INFO) << "content-length overflow";
663 #endif // DEBUGFRAMER
664 return;
665 }
666 *length = length_x_10 + c;
667 ++value_begin;
668 }
669 #if DEBUGFRAMER
670 LOG(INFO) << "content_length parsed: " << *length;
671 #endif // DEBUGFRAMER
672 *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH;
673 }
674
ProcessTransferEncodingLine(HeaderLines::size_type line_idx)675 void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) {
676 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
677 const char* stream_begin = headers_->OriginalHeaderStreamBegin();
678 const char* line_end = stream_begin + header_line.last_char_idx;
679 const char* value_begin = stream_begin + header_line.value_begin_idx;
680 size_t value_length = line_end - value_begin;
681
682 if ((value_length == 7) &&
683 !strncasecmp(value_begin, "chunked", 7)) {
684 headers_->transfer_encoding_is_chunked_ = true;
685 } else if ((value_length == 8) &&
686 !strncasecmp(value_begin, "identity", 8)) {
687 headers_->transfer_encoding_is_chunked_ = false;
688 } else {
689 last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING;
690 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
691 visitor_->HandleHeaderError(this);
692 return;
693 }
694 }
695
696 namespace {
SplitStringPiece(base::StringPiece original,char delim,base::StringPiece * before,base::StringPiece * after)697 bool SplitStringPiece(base::StringPiece original, char delim,
698 base::StringPiece* before, base::StringPiece* after) {
699 const char* p = original.data();
700 const char* end = p + original.size();
701
702 while (p != end) {
703 if (*p == delim) {
704 ++p;
705 } else {
706 const char* start = p;
707 while (++p != end && *p != delim) {
708 // Skip to the next occurence of the delimiter.
709 }
710 *before = base::StringPiece(start, p - start);
711 if (p != end)
712 *after = base::StringPiece(p + 1, end - (p + 1));
713 else
714 *after = base::StringPiece("");
715 StringPieceUtils::RemoveWhitespaceContext(before);
716 StringPieceUtils::RemoveWhitespaceContext(after);
717 return true;
718 }
719 }
720
721 *before = original;
722 *after = "";
723 return false;
724 }
725
726 // TODO(phython): Fix this function to properly deal with quoted values.
727 // E.g. ";;foo", "\";;\"", or \"aa;
728 // The last example, the semi-colon is a separator between extensions.
ProcessChunkExtensionsManual(base::StringPiece all_extensions,BalsaHeaders * extensions)729 void ProcessChunkExtensionsManual(base::StringPiece all_extensions,
730 BalsaHeaders* extensions) {
731 base::StringPiece extension;
732 base::StringPiece remaining;
733 StringPieceUtils::RemoveWhitespaceContext(&all_extensions);
734 SplitStringPiece(all_extensions, ';', &extension, &remaining);
735 while (!extension.empty()) {
736 base::StringPiece key;
737 base::StringPiece value;
738 SplitStringPiece(extension, '=', &key, &value);
739 if (!value.empty()) {
740 // Strip quotation marks if they exist.
741 if (!value.empty() && value[0] == '"')
742 value.remove_prefix(1);
743 if (!value.empty() && value[value.length() - 1] == '"')
744 value.remove_suffix(1);
745 }
746
747 extensions->AppendHeader(key, value);
748
749 StringPieceUtils::RemoveWhitespaceContext(&remaining);
750 SplitStringPiece(remaining, ';', &extension, &remaining);
751 }
752 }
753
754 } // anonymous namespace
755
ProcessChunkExtensions(const char * input,size_t size,BalsaHeaders * extensions)756 void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size,
757 BalsaHeaders* extensions) {
758 ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions);
759 }
760
ProcessHeaderLines()761 void BalsaFrame::ProcessHeaderLines() {
762 HeaderLines::size_type content_length_idx = 0;
763 HeaderLines::size_type transfer_encoding_idx = 0;
764
765 DCHECK(!lines_.empty());
766 #if DEBUGFRAMER
767 LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n";
768 #endif // DEBUGFRAMER
769
770 // There is no need to attempt to process headers if no header lines exist.
771 // There are at least two lines in the message which are not header lines.
772 // These two non-header lines are the first line of the message, and the
773 // last line of the message (which is an empty line).
774 // Thus, we test to see if we have more than two lines total before attempting
775 // to parse any header lines.
776 if (lines_.size() > 2) {
777 const char* stream_begin = headers_->OriginalHeaderStreamBegin();
778
779 // Then, for the rest of the header data, we parse these into key-value
780 // pairs.
781 FindColonsAndParseIntoKeyValue();
782 // At this point, we've parsed all of the headers. Time to look for those
783 // headers which we require for framing.
784 const HeaderLines::size_type
785 header_lines_size = headers_->header_lines_.size();
786 for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) {
787 const HeaderLineDescription& current_header_line =
788 headers_->header_lines_[i];
789 const char* key_begin =
790 (stream_begin + current_header_line.first_char_idx);
791 const char* key_end = (stream_begin + current_header_line.key_end_idx);
792 const size_t key_len = key_end - key_begin;
793 const char c = *key_begin;
794 #if DEBUGFRAMER
795 LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len)
796 << " c: '" << c << "' key_len: " << key_len;
797 #endif // DEBUGFRAMER
798 // If a header begins with either lowercase or uppercase 'c' or 't', then
799 // the header may be one of content-length, connection, content-encoding
800 // or transfer-encoding. These headers are special, as they change the way
801 // that the message is framed, and so the framer is required to search
802 // for them.
803
804
805 if (c == 'c' || c == 'C') {
806 if ((key_len == kContentLengthSize) &&
807 0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) {
808 BalsaHeadersEnums::ContentLengthStatus content_length_status =
809 BalsaHeadersEnums::NO_CONTENT_LENGTH;
810 size_t length = 0;
811 ProcessContentLengthLine(i, &content_length_status, &length);
812 if (content_length_idx != 0) { // then we've already seen one!
813 if ((headers_->content_length_status_ != content_length_status) ||
814 ((headers_->content_length_status_ ==
815 BalsaHeadersEnums::VALID_CONTENT_LENGTH) &&
816 length != headers_->content_length_)) {
817 last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS;
818 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
819 visitor_->HandleHeaderError(this);
820 return;
821 }
822 continue;
823 } else {
824 content_length_idx = i + 1;
825 headers_->content_length_status_ = content_length_status;
826 headers_->content_length_ = length;
827 content_length_remaining_ = length;
828 }
829
830 }
831 } else if (c == 't' || c == 'T') {
832 if ((key_len == kTransferEncodingSize) &&
833 0 == strncasecmp(key_begin, kTransferEncoding,
834 kTransferEncodingSize)) {
835 if (transfer_encoding_idx != 0) {
836 last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS;
837 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
838 visitor_->HandleHeaderError(this);
839 return;
840 }
841 transfer_encoding_idx = i + 1;
842 }
843 } else if (i == 0 && (key_len == 0 || c == ' ')) {
844 last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT;
845 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
846 visitor_->HandleHeaderError(this);
847 return;
848 }
849 }
850 if (headers_->transfer_encoding_is_chunked_) {
851 headers_->content_length_ = 0;
852 headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH;
853 content_length_remaining_ = 0;
854 }
855 if (transfer_encoding_idx != 0) {
856 ProcessTransferEncodingLine(transfer_encoding_idx - 1);
857 }
858 }
859 }
860
AssignParseStateAfterHeadersHaveBeenParsed()861 void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() {
862 // For responses, can't have a body if the request was a HEAD, or if it is
863 // one of these response-codes. rfc2616 section 4.3
864 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
865 if (is_request_ ||
866 !(request_was_head_ ||
867 (headers_->parsed_response_code_ >= 100 &&
868 headers_->parsed_response_code_ < 200) ||
869 (headers_->parsed_response_code_ == 204) ||
870 (headers_->parsed_response_code_ == 304))) {
871 // Then we can have a body.
872 if (headers_->transfer_encoding_is_chunked_) {
873 // Note that
874 // if ( Transfer-Encoding: chunked && Content-length: )
875 // then Transfer-Encoding: chunked trumps.
876 // This is as specified in the spec.
877 // rfc2616 section 4.4.3
878 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
879 } else {
880 // Errors parsing content-length definitely can cause
881 // protocol errors/warnings
882 switch (headers_->content_length_status_) {
883 // If we have a content-length, and it is parsed
884 // properly, there are two options.
885 // 1) zero content, in which case the message is done, and
886 // 2) nonzero content, in which case we have to
887 // consume the body.
888 case BalsaHeadersEnums::VALID_CONTENT_LENGTH:
889 if (headers_->content_length_ == 0) {
890 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
891 } else {
892 parse_state_ = BalsaFrameEnums::READING_CONTENT;
893 }
894 break;
895 case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW:
896 case BalsaHeadersEnums::INVALID_CONTENT_LENGTH:
897 // If there were characters left-over after parsing the
898 // content length, we should flag an error and stop.
899 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
900 last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH;
901 visitor_->HandleHeaderError(this);
902 break;
903 // We can have: no transfer-encoding, no content length, and no
904 // connection: close...
905 // Unfortunately, this case doesn't seem to be covered in the spec.
906 // We'll assume that the safest thing to do here is what the google
907 // binaries before 2008 already do, which is to assume that
908 // everything until the connection is closed is body.
909 case BalsaHeadersEnums::NO_CONTENT_LENGTH:
910 if (is_request_) {
911 base::StringPiece method = headers_->request_method();
912 // POSTs and PUTs should have a detectable body length. If they
913 // do not we consider it an error.
914 if ((method.size() == 4 &&
915 strncmp(method.data(), "POST", 4) == 0) ||
916 (method.size() == 3 &&
917 strncmp(method.data(), "PUT", 3) == 0)) {
918 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
919 last_error_ =
920 BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH;
921 visitor_->HandleHeaderError(this);
922 break;
923 }
924 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
925 } else {
926 parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE;
927 last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH;
928 visitor_->HandleHeaderWarning(this);
929 }
930 break;
931 // The COV_NF_... statements here provide hints to the apparatus
932 // which computes coverage reports/ratios that this code is never
933 // intended to be executed, and should technically be impossible.
934 // COV_NF_START
935 default:
936 LOG(FATAL) << "Saw a content_length_status: "
937 << headers_->content_length_status_ << " which is unknown.";
938 // COV_NF_END
939 }
940 }
941 }
942 }
943
ProcessHeaders(const char * message_start,size_t message_length)944 size_t BalsaFrame::ProcessHeaders(const char* message_start,
945 size_t message_length) {
946 const char* const original_message_start = message_start;
947 const char* const message_end = message_start + message_length;
948 const char* message_current = message_start;
949 const char* checkpoint = message_start;
950
951 if (message_length == 0) {
952 goto bottom;
953 }
954
955 while (message_current < message_end) {
956 size_t base_idx = headers_->GetReadableBytesFromHeaderStream();
957
958 // Yes, we could use strchr (assuming null termination), or
959 // memchr, but as it turns out that is slower than this tight loop
960 // for the input that we see.
961 if (!saw_non_newline_char_) {
962 do {
963 const char c = *message_current;
964 if (c != '\r' && c != '\n') {
965 if (c <= ' ') {
966 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
967 last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST;
968 visitor_->HandleHeaderError(this);
969 goto bottom;
970 } else {
971 saw_non_newline_char_ = true;
972 checkpoint = message_start = message_current;
973 goto read_real_message;
974 }
975 }
976 ++message_current;
977 } while (message_current < message_end);
978 goto bottom; // this is necessary to skip 'last_char_was_slash_r' checks
979 } else {
980 read_real_message:
981 // Note that SSE2 can be enabled on certain piii platforms.
982 #if __SSE2__
983 {
984 const char* const message_end_m16 = message_end - 16;
985 __v16qi newlines = { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
986 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' };
987 while (message_current < message_end_m16) {
988 // What this does (using compiler intrinsics):
989 //
990 // Load 16 '\n's into an xmm register
991 // Load 16 bytes of currennt message into an xmm register
992 // Do byte-wise equals on those two xmm registers
993 // Take the first bit of each byte, and put that into the first
994 // 16 bits of a mask
995 // If the mask is zero, no '\n' found. increment by 16 and try again
996 // Else scan forward to find the first set bit.
997 // Increment current by the index of the first set bit
998 // (ffs returns index of first set bit + 1)
999 __m128i msg_bytes =
1000 _mm_loadu_si128(const_cast<__m128i *>(
1001 reinterpret_cast<const __m128i *>(message_current)));
1002 __m128i newline_cmp =
1003 _mm_cmpeq_epi8(msg_bytes, reinterpret_cast<__m128i>(newlines));
1004 int newline_msk = _mm_movemask_epi8(newline_cmp);
1005 if (newline_msk == 0) {
1006 message_current += 16;
1007 continue;
1008 }
1009 message_current += (ffs(newline_msk) - 1);
1010 const size_t relative_idx = message_current - message_start;
1011 const size_t message_current_idx = 1 + base_idx + relative_idx;
1012 lines_.push_back(std::make_pair(last_slash_n_idx_,
1013 message_current_idx));
1014 if (lines_.size() == 1) {
1015 headers_->WriteFromFramer(checkpoint,
1016 1 + message_current - checkpoint);
1017 checkpoint = message_current + 1;
1018 const char* begin = headers_->OriginalHeaderStreamBegin();
1019 #if DEBUGFRAMER
1020 LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1021 LOG(INFO) << "is_request_: " << is_request_;
1022 #endif
1023 ProcessFirstLine(begin, begin + lines_[0].second);
1024 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1025 goto process_lines;
1026 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1027 goto bottom;
1028 }
1029 const size_t chars_since_last_slash_n = (message_current_idx -
1030 last_slash_n_idx_);
1031 last_slash_n_idx_ = message_current_idx;
1032 if (chars_since_last_slash_n > 2) {
1033 // We have a slash-n, but the last slash n was
1034 // more than 2 characters away from this. Thus, we know
1035 // that this cannot be an end-of-header.
1036 ++message_current;
1037 continue;
1038 }
1039 if ((chars_since_last_slash_n == 1) ||
1040 (((message_current > message_start) &&
1041 (*(message_current - 1) == '\r')) ||
1042 (last_char_was_slash_r_))) {
1043 goto process_lines;
1044 }
1045 ++message_current;
1046 }
1047 }
1048 #endif // __SSE2__
1049 while (message_current < message_end) {
1050 if (*message_current != '\n') {
1051 ++message_current;
1052 continue;
1053 }
1054 const size_t relative_idx = message_current - message_start;
1055 const size_t message_current_idx = 1 + base_idx + relative_idx;
1056 lines_.push_back(std::make_pair(last_slash_n_idx_,
1057 message_current_idx));
1058 if (lines_.size() == 1) {
1059 headers_->WriteFromFramer(checkpoint,
1060 1 + message_current - checkpoint);
1061 checkpoint = message_current + 1;
1062 const char* begin = headers_->OriginalHeaderStreamBegin();
1063 #if DEBUGFRAMER
1064 LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1065 LOG(INFO) << "is_request_: " << is_request_;
1066 #endif
1067 ProcessFirstLine(begin, begin + lines_[0].second);
1068 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1069 goto process_lines;
1070 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1071 goto bottom;
1072 }
1073 const size_t chars_since_last_slash_n = (message_current_idx -
1074 last_slash_n_idx_);
1075 last_slash_n_idx_ = message_current_idx;
1076 if (chars_since_last_slash_n > 2) {
1077 // false positive.
1078 ++message_current;
1079 continue;
1080 }
1081 if ((chars_since_last_slash_n == 1) ||
1082 (((message_current > message_start) &&
1083 (*(message_current - 1) == '\r')) ||
1084 (last_char_was_slash_r_))) {
1085 goto process_lines;
1086 }
1087 ++message_current;
1088 }
1089 }
1090 continue;
1091 process_lines:
1092 ++message_current;
1093 DCHECK(message_current >= message_start);
1094 if (message_current > message_start) {
1095 headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1096 }
1097
1098 // Check if we have exceeded maximum headers length
1099 // Although we check for this limit before and after we call this function
1100 // we check it here as well to make sure that in case the visitor changed
1101 // the max_header_length_ (for example after processing the first line)
1102 // we handle it gracefully.
1103 if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) {
1104 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1105 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1106 visitor_->HandleHeaderError(this);
1107 goto bottom;
1108 }
1109
1110 // Since we know that we won't be writing any more bytes of the header,
1111 // we tell that to the headers object. The headers object may make
1112 // more efficient allocation decisions when this is signaled.
1113 headers_->DoneWritingFromFramer();
1114 {
1115 const char* readable_ptr = NULL;
1116 size_t readable_size = 0;
1117 headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size);
1118 visitor_->ProcessHeaderInput(readable_ptr, readable_size);
1119 }
1120
1121 // Ok, now that we've written everything into our header buffer, it is
1122 // time to process the header lines (extract proper values for headers
1123 // which are important for framing).
1124 ProcessHeaderLines();
1125 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1126 goto bottom;
1127 }
1128 AssignParseStateAfterHeadersHaveBeenParsed();
1129 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1130 goto bottom;
1131 }
1132 visitor_->ProcessHeaders(*headers_);
1133 visitor_->HeaderDone();
1134 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) {
1135 visitor_->MessageDone();
1136 }
1137 goto bottom;
1138 }
1139 // If we've gotten to here, it means that we've consumed all of the
1140 // available input. We need to record whether or not the last character we
1141 // saw was a '\r' so that a subsequent call to ProcessInput correctly finds
1142 // a header framing that is split across the two calls.
1143 last_char_was_slash_r_ = (*(message_end - 1) == '\r');
1144 DCHECK(message_current >= message_start);
1145 if (message_current > message_start) {
1146 headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1147 }
1148 bottom:
1149 return message_current - original_message_start;
1150 }
1151
1152
BytesSafeToSplice() const1153 size_t BalsaFrame::BytesSafeToSplice() const {
1154 switch (parse_state_) {
1155 case BalsaFrameEnums::READING_CHUNK_DATA:
1156 return chunk_length_remaining_;
1157 case BalsaFrameEnums::READING_UNTIL_CLOSE:
1158 return std::numeric_limits<size_t>::max();
1159 case BalsaFrameEnums::READING_CONTENT:
1160 return content_length_remaining_;
1161 default:
1162 return 0;
1163 }
1164 }
1165
BytesSpliced(size_t bytes_spliced)1166 void BalsaFrame::BytesSpliced(size_t bytes_spliced) {
1167 switch (parse_state_) {
1168 case BalsaFrameEnums::READING_CHUNK_DATA:
1169 if (chunk_length_remaining_ >= bytes_spliced) {
1170 chunk_length_remaining_ -= bytes_spliced;
1171 if (chunk_length_remaining_ == 0) {
1172 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1173 }
1174 return;
1175 } else {
1176 last_error_ =
1177 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1178 goto error_exit;
1179 }
1180
1181 case BalsaFrameEnums::READING_UNTIL_CLOSE:
1182 return;
1183
1184 case BalsaFrameEnums::READING_CONTENT:
1185 if (content_length_remaining_ >= bytes_spliced) {
1186 content_length_remaining_ -= bytes_spliced;
1187 if (content_length_remaining_ == 0) {
1188 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1189 visitor_->MessageDone();
1190 }
1191 return;
1192 } else {
1193 last_error_ =
1194 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1195 goto error_exit;
1196 }
1197
1198 default:
1199 last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO;
1200 goto error_exit;
1201 }
1202
1203 error_exit:
1204 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1205 visitor_->HandleBodyError(this);
1206 };
1207
1208 // You may note that the state-machine contained within this function has both
1209 // switch and goto labels for nearly the same thing. For instance, the
1210 // following two labels refer to the same code block:
1211 // label_reading_chunk_data:
1212 // case BalsaFrameEnums::READING_CHUNK_DATA:
1213 // The 'case' statement is required for the switch statement which occurs when
1214 // ProcessInput is invoked. The goto label is required as the state-machine
1215 // does not use a computed goto in any subsequent operations.
1216 //
1217 // Since several states exit the state machine for various reasons, there is
1218 // also one label at the bottom of the function. When it is appropriate to
1219 // return from the function, that part of the state machine instead issues a
// goto bottom; This results in less code duplication, and makes debugging
// easier (as you can add a statement to a section of code which is guaranteed
// to be invoked when the function is exiting).
ProcessInput(const char * input,size_t size)1223 size_t BalsaFrame::ProcessInput(const char* input, size_t size) {
1224 const char* current = input;
1225 const char* on_entry = current;
1226 const char* end = current + size;
1227 #if DEBUGFRAMER
1228 LOG(INFO) << "\n=============="
1229 << BalsaFrameEnums::ParseStateToString(parse_state_)
1230 << "===============\n";
1231 #endif // DEBUGFRAMER
1232
1233 DCHECK(headers_ != NULL);
1234 if (headers_ == NULL) return 0;
1235
1236 if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1237 const size_t header_length = headers_->GetReadableBytesFromHeaderStream();
1238 // Yes, we still have to check this here as the user can change the
1239 // max_header_length amount!
1240 // Also it is possible that we have reached the maximum allowed header size,
1241 // and we have more to consume (remember we are still inside
1242 // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error.
1243 if (header_length > max_header_length_ ||
1244 (header_length == max_header_length_ && size > 0)) {
1245 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1246 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1247 visitor_->HandleHeaderError(this);
1248 goto bottom;
1249 }
1250 size_t bytes_to_process = max_header_length_ - header_length;
1251 if (bytes_to_process > size) {
1252 bytes_to_process = size;
1253 }
1254 current += ProcessHeaders(input, bytes_to_process);
1255 // If we are still reading headers check if we have crossed the headers
1256 // limit. Note that we check for >= as opposed to >. This is because if
1257 // header_length_after equals max_header_length_ and we are still in the
1258 // parse_state_ BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for
1259 // sure that the headers limit will be crossed later on
1260 if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1261 // Note that headers_ is valid only if we are still reading headers.
1262 const size_t header_length_after =
1263 headers_->GetReadableBytesFromHeaderStream();
1264 if (header_length_after >= max_header_length_) {
1265 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1266 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1267 visitor_->HandleHeaderError(this);
1268 }
1269 }
1270 goto bottom;
1271 } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ ||
1272 parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1273 // Can do nothing more 'till we're reset.
1274 goto bottom;
1275 }
1276
1277 while (current < end) {
1278 switch (parse_state_) {
1279 label_reading_chunk_length:
1280 case BalsaFrameEnums::READING_CHUNK_LENGTH:
1281 // In this state we read the chunk length.
1282 // Note that once we hit a character which is not in:
1283 // [0-9;A-Fa-f\n], we transition to a different state.
1284 //
1285 {
1286 // If we used strtol, etc, we'd have to buffer this line.
1287 // This is more annoying than simply doing the conversion
1288 // here. This code accounts for overflow.
1289 static const signed char buf[] = {
1290 // %0 %1 %2 %3 %4 %5 %6 %7 %8 \t \n %b %c \r %e %f
1291 -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1,
1292 // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f
1293 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1294 // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f
1295 -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1296 // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f
1297 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -2, -1, -1, -1, -1,
1298 // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f
1299 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1300 // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f
1301 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1302 // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f
1303 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1304 // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f
1305 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1306 };
1307 // valid cases:
1308 // "09123\n" // -> 09123
1309 // "09123\r\n" // -> 09123
1310 // "09123 \n" // -> 09123
1311 // "09123 \r\n" // -> 09123
1312 // "09123 12312\n" // -> 09123
1313 // "09123 12312\r\n" // -> 09123
1314 // "09123; foo=bar\n" // -> 09123
1315 // "09123; foo=bar\r\n" // -> 09123
1316 // "FFFFFFFFFFFFFFFF\r\n" // -> FFFFFFFFFFFFFFFF
1317 // "FFFFFFFFFFFFFFFF 22\r\n" // -> FFFFFFFFFFFFFFFF
1318 // invalid cases:
1319 // "[ \t]+[^\n]*\n"
1320 // "FFFFFFFFFFFFFFFFF\r\n" (would overflow)
1321 // "\r\n"
1322 // "\n"
1323 while (current < end) {
1324 const char c = *current;
1325 ++current;
1326 const signed char addition = buf[static_cast<int>(c)];
1327 if (addition >= 0) {
1328 chunk_length_character_extracted_ = true;
1329 size_t length_x_16 = chunk_length_remaining_ * 16;
1330 const size_t kMaxDiv16 = std::numeric_limits<size_t>::max() / 16;
1331 if ((chunk_length_remaining_ > kMaxDiv16) ||
1332 ((std::numeric_limits<size_t>::max() - length_x_16) <
1333 static_cast<size_t>(addition))) {
1334 // overflow -- asked for a chunk-length greater than 2^64 - 1!!
1335 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1336 last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW;
1337 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1338 visitor_->HandleChunkingError(this);
1339 goto bottom;
1340 }
1341 chunk_length_remaining_ = length_x_16 + addition;
1342 continue;
1343 }
1344
1345 if (!chunk_length_character_extracted_ || addition == -1) {
1346 // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no
1347 // characters were converted, or an unexpected character was
1348 // seen.
1349 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1350 last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH;
1351 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1352 visitor_->HandleChunkingError(this);
1353 goto bottom;
1354 }
1355
1356 --current;
1357 parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION;
1358 visitor_->ProcessChunkLength(chunk_length_remaining_);
1359 goto label_reading_chunk_extension;
1360 }
1361 }
1362 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1363 goto bottom; // case BalsaFrameEnums::READING_CHUNK_LENGTH
1364
1365 label_reading_chunk_extension:
1366 case BalsaFrameEnums::READING_CHUNK_EXTENSION:
1367 {
1368 // TODO(phython): Convert this scanning to be 16 bytes at a time if
1369 // there is data to be read.
1370 const char* extensions_start = current;
1371 size_t extensions_length = 0;
1372 while (current < end) {
1373 const char c = *current;
1374 if (c == '\r' || c == '\n') {
1375 extensions_length =
1376 (extensions_start == current) ?
1377 0 :
1378 current - extensions_start - 1;
1379 }
1380
1381 ++current;
1382 if (c == '\n') {
1383 chunk_length_character_extracted_ = false;
1384 visitor_->ProcessChunkExtensions(
1385 extensions_start, extensions_length);
1386 if (chunk_length_remaining_ != 0) {
1387 parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA;
1388 goto label_reading_chunk_data;
1389 }
1390 HeaderFramingFound('\n');
1391 parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM;
1392 goto label_reading_last_chunk_term;
1393 }
1394 }
1395 visitor_->ProcessChunkExtensions(
1396 extensions_start, extensions_length);
1397 }
1398
1399 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1400 goto bottom; // case BalsaFrameEnums::READING_CHUNK_EXTENSION
1401
1402 label_reading_chunk_data:
1403 case BalsaFrameEnums::READING_CHUNK_DATA:
1404 while (current < end) {
1405 if (chunk_length_remaining_ == 0) {
1406 break;
1407 }
1408 // read in the chunk
1409 size_t bytes_remaining = end - current;
1410 size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ?
1411 chunk_length_remaining_ : bytes_remaining;
1412 const char* tmp_current = current + consumed_bytes;
1413 visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry);
1414 visitor_->ProcessBodyData(current, consumed_bytes);
1415 on_entry = current = tmp_current;
1416 chunk_length_remaining_ -= consumed_bytes;
1417 }
1418 if (chunk_length_remaining_ == 0) {
1419 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1420 goto label_reading_chunk_term;
1421 }
1422 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1423 goto bottom; // case BalsaFrameEnums::READING_CHUNK_DATA
1424
1425 label_reading_chunk_term:
1426 case BalsaFrameEnums::READING_CHUNK_TERM:
1427 while (current < end) {
1428 const char c = *current;
1429 ++current;
1430
1431 if (c == '\n') {
1432 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
1433 goto label_reading_chunk_length;
1434 }
1435 }
1436 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1437 goto bottom; // case BalsaFrameEnums::READING_CHUNK_TERM
1438
1439 label_reading_last_chunk_term:
1440 case BalsaFrameEnums::READING_LAST_CHUNK_TERM:
1441 while (current < end) {
1442 const char c = *current;
1443
1444 if (!HeaderFramingFound(c)) {
1445 // If not, however, since the spec only suggests that the
1446 // client SHOULD indicate the presence of trailers, we get to
1447 // *test* that they did or didn't.
1448 // If all of the bytes we've seen since:
1449 // OPTIONAL_WS 0 OPTIONAL_STUFF CRLF
1450 // are either '\r', or '\n', then we can assume that we don't yet
1451 // know if we need to parse headers, or if the next byte will make
1452 // the HeaderFramingFound condition (above) true.
1453 if (HeaderFramingMayBeFound()) {
1454 // If true, then we have seen only characters '\r' or '\n'.
1455 ++current;
1456
1457 // Lets try again! There is no state change here.
1458 continue;
1459 } else {
1460 // If (!HeaderFramingMayBeFound()), then we know that we must be
1461 // reading the first non CRLF character of a trailer.
1462 parse_state_ = BalsaFrameEnums::READING_TRAILER;
1463 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1464 on_entry = current;
1465 goto label_reading_trailer;
1466 }
1467 } else {
1468 // If we've found a "\r\n\r\n", then the message
1469 // is done.
1470 ++current;
1471 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1472 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1473 visitor_->MessageDone();
1474 goto bottom;
1475 }
1476 break; // from while loop
1477 }
1478 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1479 goto bottom; // case BalsaFrameEnums::READING_LAST_CHUNK_TERM
1480
1481 label_reading_trailer:
1482 case BalsaFrameEnums::READING_TRAILER:
1483 while (current < end) {
1484 const char c = *current;
1485 ++current;
1486 // TODO(fenix): If we ever care about trailers as part of framing,
1487 // deal with them here (see below for part of the 'solution')
1488 // if (LineFramingFound(c)) {
1489 // trailer_lines_.push_back(make_pair(start_of_line_,
1490 // trailer_length_ - 1));
1491 // start_of_line_ = trailer_length_;
1492 // }
1493 if (HeaderFramingFound(c)) {
1494 // ProcessTrailers(visitor_, &trailers_);
1495 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1496 visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1497 visitor_->MessageDone();
1498 goto bottom;
1499 }
1500 }
1501 visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1502 break; // case BalsaFrameEnums::READING_TRAILER
1503
1504 // Note that there is no label:
1505 // 'label_reading_until_close'
1506 // here. This is because the state-machine exists immediately after
1507 // reading the headers instead of transitioning here (as it would
1508 // do if it was consuming all the data it could, all the time).
1509 case BalsaFrameEnums::READING_UNTIL_CLOSE:
1510 {
1511 const size_t bytes_remaining = end - current;
1512 if (bytes_remaining > 0) {
1513 visitor_->ProcessBodyInput(current, bytes_remaining);
1514 visitor_->ProcessBodyData(current, bytes_remaining);
1515 current += bytes_remaining;
1516 }
1517 }
1518 goto bottom; // case BalsaFrameEnums::READING_UNTIL_CLOSE
1519
1520 // label_reading_content:
1521 case BalsaFrameEnums::READING_CONTENT:
1522 #if DEBUGFRAMER
1523 LOG(INFO) << "ReadingContent: " << content_length_remaining_;
1524 #endif // DEBUGFRAMER
1525 while (content_length_remaining_ && current < end) {
1526 // read in the content
1527 const size_t bytes_remaining = end - current;
1528 const size_t consumed_bytes =
1529 (content_length_remaining_ < bytes_remaining) ?
1530 content_length_remaining_ : bytes_remaining;
1531 visitor_->ProcessBodyInput(current, consumed_bytes);
1532 visitor_->ProcessBodyData(current, consumed_bytes);
1533 current += consumed_bytes;
1534 content_length_remaining_ -= consumed_bytes;
1535 }
1536 if (content_length_remaining_ == 0) {
1537 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1538 visitor_->MessageDone();
1539 }
1540 goto bottom; // case BalsaFrameEnums::READING_CONTENT
1541
1542 default:
1543 // The state-machine should never be in a state that isn't handled
1544 // above. This is a glaring logic error, and we should do something
1545 // drastic to ensure that this gets looked-at and fixed.
1546 LOG(FATAL) << "Unknown state: " << parse_state_ // COV_NF_LINE
1547 << " memory corruption?!"; // COV_NF_LINE
1548 }
1549 }
1550 bottom:
1551 #if DEBUGFRAMER
1552 LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n"
1553 << std::string(input, current)
1554 << "\n$$$$$$$$$$$$$$"
1555 << BalsaFrameEnums::ParseStateToString(parse_state_)
1556 << "$$$$$$$$$$$$$$$"
1557 << " consumed: " << (current - input);
1558 if (Error()) {
1559 LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode());
1560 }
1561 #endif // DEBUGFRAMER
1562 return current - input;
1563 }
1564
1565 } // namespace net
1566