1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "net/tools/flip_server/balsa_frame.h"
6
7 #include <assert.h>
8 #include <emmintrin.h>
9 #include <strings.h>
10
11 #include <limits>
12 #include <iostream>
13 #include <string>
14 #include <utility>
15 #include <vector>
16
17 #include "base/logging.h"
18 #include "base/port.h"
19 #include "base/string_piece.h"
20 #include "net/tools/flip_server/balsa_enums.h"
21 #include "net/tools/flip_server/balsa_headers.h"
22 #include "net/tools/flip_server/balsa_visitor_interface.h"
23 #include "net/tools/flip_server/buffer_interface.h"
24 #include "net/tools/flip_server/simple_buffer.h"
25 #include "net/tools/flip_server/split.h"
26 #include "net/tools/flip_server/string_piece_utils.h"
27
28 namespace net {
29
// Names of the headers which can affect the way the HTTP message is framed,
// and so must be processed specially. They are spelled in lower case here and
// are matched against header keys with strncasecmp(), i.e. without regard to
// case. Each *Size constant is the length of the corresponding name, not
// counting the terminating NUL (hence the "- 1").
static const char kContentLength[] = "content-length";
static const size_t kContentLengthSize = sizeof(kContentLength) - 1;
static const char kTransferEncoding[] = "transfer-encoding";
static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1;
36
Reset()37 void BalsaFrame::Reset() {
38 last_char_was_slash_r_ = false;
39 saw_non_newline_char_ = false;
40 start_was_space_ = true;
41 chunk_length_character_extracted_ = false;
42 // is_request_ = true; // not reset between messages.
43 // request_was_head_ = false; // not reset between messages.
44 // max_header_length_ = 4096; // not reset between messages.
45 // max_request_uri_length_ = 2048; // not reset between messages.
46 // visitor_ = &do_nothing_visitor_; // not reset between messages.
47 chunk_length_remaining_ = 0;
48 content_length_remaining_ = 0;
49 last_slash_n_loc_ = NULL;
50 last_recorded_slash_n_loc_ = NULL;
51 last_slash_n_idx_ = 0;
52 term_chars_ = 0;
53 parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE;
54 last_error_ = BalsaFrameEnums::NO_ERROR;
55 lines_.clear();
56 if (headers_ != NULL) {
57 headers_->Clear();
58 }
59 }
60
ParseStateToString(BalsaFrameEnums::ParseState error_code)61 const char* BalsaFrameEnums::ParseStateToString(
62 BalsaFrameEnums::ParseState error_code) {
63 switch (error_code) {
64 case ERROR:
65 return "ERROR";
66 case READING_HEADER_AND_FIRSTLINE:
67 return "READING_HEADER_AND_FIRSTLINE";
68 case READING_CHUNK_LENGTH:
69 return "READING_CHUNK_LENGTH";
70 case READING_CHUNK_EXTENSION:
71 return "READING_CHUNK_EXTENSION";
72 case READING_CHUNK_DATA:
73 return "READING_CHUNK_DATA";
74 case READING_CHUNK_TERM:
75 return "READING_CHUNK_TERM";
76 case READING_LAST_CHUNK_TERM:
77 return "READING_LAST_CHUNK_TERM";
78 case READING_TRAILER:
79 return "READING_TRAILER";
80 case READING_UNTIL_CLOSE:
81 return "READING_UNTIL_CLOSE";
82 case READING_CONTENT:
83 return "READING_CONTENT";
84 case MESSAGE_FULLY_READ:
85 return "MESSAGE_FULLY_READ";
86 case NUM_STATES:
87 return "UNKNOWN_STATE";
88 }
89 return "UNKNOWN_STATE";
90 }
91
ErrorCodeToString(BalsaFrameEnums::ErrorCode error_code)92 const char* BalsaFrameEnums::ErrorCodeToString(
93 BalsaFrameEnums::ErrorCode error_code) {
94 switch (error_code) {
95 case NO_ERROR:
96 return "NO_ERROR";
97 case NO_STATUS_LINE_IN_RESPONSE:
98 return "NO_STATUS_LINE_IN_RESPONSE";
99 case NO_REQUEST_LINE_IN_REQUEST:
100 return "NO_REQUEST_LINE_IN_REQUEST";
101 case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION:
102 return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION";
103 case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD:
104 return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD";
105 case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE:
106 return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE";
107 case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI:
108 return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI";
109 case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE:
110 return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE";
111 case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION:
112 return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION";
113 case FAILED_CONVERTING_STATUS_CODE_TO_INT:
114 return "FAILED_CONVERTING_STATUS_CODE_TO_INT";
115 case REQUEST_URI_TOO_LONG:
116 return "REQUEST_URI_TOO_LONG";
117 case HEADERS_TOO_LONG:
118 return "HEADERS_TOO_LONG";
119 case UNPARSABLE_CONTENT_LENGTH:
120 return "UNPARSABLE_CONTENT_LENGTH";
121 case MAYBE_BODY_BUT_NO_CONTENT_LENGTH:
122 return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH";
123 case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH:
124 return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH";
125 case HEADER_MISSING_COLON:
126 return "HEADER_MISSING_COLON";
127 case INVALID_CHUNK_LENGTH:
128 return "INVALID_CHUNK_LENGTH";
129 case CHUNK_LENGTH_OVERFLOW:
130 return "CHUNK_LENGTH_OVERFLOW";
131 case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO:
132 return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO";
133 case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT:
134 return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT";
135 case MULTIPLE_CONTENT_LENGTH_KEYS:
136 return "MULTIPLE_CONTENT_LENGTH_KEYS";
137 case MULTIPLE_TRANSFER_ENCODING_KEYS:
138 return "MULTIPLE_TRANSFER_ENCODING_KEYS";
139 case UNKNOWN_TRANSFER_ENCODING:
140 return "UNKNOWN_TRANSFER_ENCODING";
141 case INVALID_HEADER_FORMAT:
142 return "INVALID_HEADER_FORMAT";
143 case INTERNAL_LOGIC_ERROR:
144 return "INTERNAL_LOGIC_ERROR";
145 case NUM_ERROR_CODES:
146 return "UNKNOWN_ERROR";
147 }
148 return "UNKNOWN_ERROR";
149 }
150
// Summary:
//     Parses the first line of either a request or response, recording in
//     |headers| the offsets (relative to |begin|) at which each of the three
//     first-line tokens starts and ends. For a response, the status code is
//     additionally converted to an integer.
//     Note that in the case of a detected warning, error_code will be set
//     but the function will not return false.
//     Exactly zero or one warning or error (but not both) may be detected
//     by this function.
//     Note that this function will not write the data of the first-line
//     into the header's buffer (that should already have been done elsewhere).
//
// Pre-conditions:
//     begin != end
//     *begin should be a character which is > ' '. This implies that there
//     is at least one non-whitespace character between [begin, end).
//     headers is a valid pointer to a BalsaHeaders class.
//     error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value.
//     Entire first line must exist between [begin, end), and *(end - 1) must
//     be '\n' (anything else is reported as INTERNAL_LOGIC_ERROR).
//     Exactly zero or one newlines -may- exist between [begin, end)
//     [begin, end) should exist in the header's buffer.
//
// Parameters:
//     begin, end - the half-open character range holding the first line.
//     is_request - true to parse a request line, false for a status line.
//     max_request_uri_length - longest request URI accepted (requests only).
//     headers - receives the token offsets and, for responses, the parsed
//         response code.
//     error_code - receives the warning or error detected, if any.
//
// Side-effects:
//     headers will be modified
//     error_code may be modified if either a warning or error is detected
//
// Returns:
//     True if no error (as opposed to warning) is detected.
//     False if an error (as opposed to warning) is detected.
//
// If there is indeed non-whitespace in the line, then the following
// will take care of the '*begin > ' '' precondition for you:
//  while (*begin <= ' ') ++begin;
//  ParseHTTPFirstLine(begin, end, is_request, max_request_uri_length,
//                     &headers, &error_code);
//
bool ParseHTTPFirstLine(const char* begin,
                        const char* end,
                        bool is_request,
                        size_t max_request_uri_length,
                        BalsaHeaders* headers,
                        BalsaFrameEnums::ErrorCode* error_code) {
  const char* current = begin;
  // HTTP firstlines all have the following structure:
  //     LWS         NONWS  LWS    NONWS   LWS    NONWS   NOTCRLF  CRLF
  // [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n"
  //  ws1        nws1    ws2    nws2    ws3    nws3             ws4
  //  |          [-------)      [-------)      [----------------)
  //    REQ:     method         request_uri    version
  //   RESP:     version        statuscode     reason
  //
  //   The first NONWS->LWS component we'll call firstline_a.
  //   The second firstline_b, and the third firstline_c.
  //
  //   firstline_a goes from nws1 to (but not including) ws2
  //   firstline_b goes from nws2 to (but not including) ws3
  //   firstline_c goes from nws3 to (but not including) ws4
  //
  // In the code:
  //    ws1 == whitespace_1_idx_
  //   nws1 == non_whitespace_1_idx_
  //    ws2 == whitespace_2_idx_
  //   nws2 == non_whitespace_2_idx_
  //    ws3 == whitespace_3_idx_
  //   nws3 == non_whitespace_3_idx_
  //    ws4 == whitespace_4_idx_

  // Kill all whitespace (including '\r\n') at the end of the line.
  // The line is required to end in '\n'; step back onto it first.
  --end;
  if (*end != '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  // Walk backwards over every character that compares <= ' ', stopping on the
  // last non-whitespace character (or at |begin|).
  // NOTE(review): where plain char is signed, bytes >= 0x80 also compare
  // <= ' ' in this function -- confirm that treating them as whitespace is
  // intended.
  while (begin < end && *end <= ' ') {
    --end;
  }
  DCHECK(*end != '\n');
  if (*end == '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  // From here on |end| is one past the last non-whitespace character.
  ++end;

  // The following condition should not be possible.
  if (end == begin) {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }

  // whitespace_1_idx_
  headers->whitespace_1_idx_ = current - begin;
  // This loop is commented out as it is never used in current code.  This is
  // true only because we don't begin parsing the headers at all until we've
  // encountered a non whitespace character at the beginning of the stream, at
  // which point we begin our demarcation of header-start.  If we did -not- do
  // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop
  // would be necessary for the proper functioning of this parsing.
  // This is left here as this function may (in the future) be refactored out
  // of the BalsaFrame class so that it may be shared between code in
  // BalsaFrame and BalsaHeaders (where it would be used in some variant of the
  // set_first_line() function), at which point it would be necessary.
#if 0
  while (*current <= ' ') {
    ++current;
  }
#endif
  // non_whitespace_1_idx_
  headers->non_whitespace_1_idx_ = current - begin;
  do {
    // The first time through, the current character is not whitespace: the
    // skipping loop above is disabled, so this relies on the '*begin > ' ''
    // precondition. That implies that we're guaranteed to get at least one
    // non-whitespace character if we get into this loop at all.
    ++current;
    if (current == end) {
      // The line holds a single token only; collapse every remaining index
      // onto the end of that token so the other two tokens are empty.
      headers->whitespace_2_idx_ = current - begin;
      headers->non_whitespace_2_idx_ = current - begin;
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD   for request
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response
      // (Adding is_request selects the request variant; this presumably
      // relies on it immediately following the response variant in the enum
      // -- verify against the enum declaration.)
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION +
            is_request);
      if (!is_request) {  // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION
        return false;
      }
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_2_idx_
  headers->whitespace_2_idx_ = current - begin;
  do {
    ++current;
    // Note that due to the loop which consumes all of the whitespace
    // at the end of the line, current can never == end while in this function.
  } while (*current <= ' ');
  // non_whitespace_2_idx_
  headers->non_whitespace_2_idx_ = current - begin;
  do {
    ++current;
    if (current == end) {
      // The line holds two tokens only; the third token is empty.
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI for request
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE for response
      // (Same "+ is_request" selection as above -- verify the enum order.)
      // Unlike the single-token case, this is only a warning for both
      // requests and responses: parsing continues below.
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE
            + is_request);
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_3_idx_
  headers->whitespace_3_idx_ = current - begin;
  do {
    ++current;
    // Note that due to the loop which consumes all of the whitespace
    // at the end of the line, current can never == end while in this function.
  } while (*current <= ' ');
  // non_whitespace_3_idx_
  headers->non_whitespace_3_idx_ = current - begin;
  // The third token runs to the (whitespace-trimmed) end of the line, so it
  // may itself contain whitespace (e.g. a multi-word reason phrase).
  headers->whitespace_4_idx_ = end - begin;

 output_exhausted:
  // Note that we don't fail the parse immediately when parsing of the
  // firstline fails.  Depending on the protocol type, we may want to accept
  // a firstline with only one or two elements, e.g., for HTTP/0.9:
  //   GET\r\n
  // or
  //   GET /\r\n
  // should be parsed without issue (though the visitor should know that
  // parsing the entire line was not exactly as it should be).
  //
  // Eventually, these errors may be removed altogether, as the visitor can
  // detect them on its own by examining the size of the various fields.
  // headers->set_first_line(non_whitespace_1_idx_, current);

  if (is_request) {
    // The second token of a request line is the request URI; reject it if it
    // is longer than the configured maximum.
    if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) >
        max_request_uri_length) {
      // For requests, we need at least the method.  We could assume that a
      // blank URI means "/".  If version isn't stated, it should be assumed
      // to be HTTP/0.9 by the visitor.
      *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG;
      return false;
    }
  } else {
    // The second token of a status line is the status code; convert it.
    headers->parsed_response_code_ = 0;
    {
      const char* parsed_response_code_current =
        begin + headers->non_whitespace_2_idx_;
      const char* parsed_response_code_end = begin + headers->whitespace_3_idx_;
      const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;

      // Convert a string of [0-9]* into an int.
      // Note that this allows for the conversion of response codes which
      // are outside the bounds of normal HTTP response codes (no checking
      // is done to ensure that these are valid-- they're merely parsed)!
      while (parsed_response_code_current < parsed_response_code_end) {
        if (*parsed_response_code_current < '0' ||
            *parsed_response_code_current > '9') {
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        size_t status_code_x_10 = headers->parsed_response_code_ * 10;
        uint8 c = *parsed_response_code_current - '0';
        // Reject the digit if either the multiplication by ten or the
        // addition of the digit would exceed the size_t range.
        if ((headers->parsed_response_code_ > kMaxDiv10) ||
            (std::numeric_limits<size_t>::max() - status_code_x_10) < c) {
          // overflow.
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        headers->parsed_response_code_ = status_code_x_10 + c;
        ++parsed_response_code_current;
      }
    }
  }
  return true;
}
377
378 // begin - beginning of the firstline
379 // end - end of the firstline
380 //
381 // A precondition for this function is that there is non-whitespace between
382 // [begin, end). If this precondition is not met, the function will not perform
383 // as expected (and bad things may happen, and it will eat your first, second,
384 // and third unborn children!).
385 //
386 // Another precondition for this function is that [begin, end) includes
387 // at most one newline, which must be at the end of the line.
ProcessFirstLine(const char * begin,const char * end)388 void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) {
389 BalsaFrameEnums::ErrorCode previous_error = last_error_;
390 if (!ParseHTTPFirstLine(begin,
391 end,
392 is_request_,
393 max_request_uri_length_,
394 headers_,
395 &last_error_)) {
396 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
397 visitor_->HandleHeaderError(this);
398 return;
399 }
400 if (previous_error != last_error_) {
401 visitor_->HandleHeaderWarning(this);
402 }
403
404 if (is_request_) {
405 int version_length =
406 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_;
407 visitor_->ProcessRequestFirstLine(
408 begin + headers_->non_whitespace_1_idx_,
409 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
410 begin + headers_->non_whitespace_1_idx_,
411 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
412 begin + headers_->non_whitespace_2_idx_,
413 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
414 begin + headers_->non_whitespace_3_idx_,
415 version_length);
416 if (version_length == 0)
417 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
418 } else {
419 visitor_->ProcessResponseFirstLine(
420 begin + headers_->non_whitespace_1_idx_,
421 headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
422 begin + headers_->non_whitespace_1_idx_,
423 headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
424 begin + headers_->non_whitespace_2_idx_,
425 headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
426 begin + headers_->non_whitespace_3_idx_,
427 headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_);
428 }
429 }
430
431 // 'stream_begin' points to the first character of the headers buffer.
432 // 'line_begin' points to the first character of the line.
433 // 'current' points to a char which is ':'.
434 // 'line_end' points to the position of '\n' + 1.
435 // 'line_begin' points to the position of first character of line.
CleanUpKeyValueWhitespace(const char * stream_begin,const char * line_begin,const char * current,const char * line_end,HeaderLineDescription * current_header_line)436 void BalsaFrame::CleanUpKeyValueWhitespace(
437 const char* stream_begin,
438 const char* line_begin,
439 const char* current,
440 const char* line_end,
441 HeaderLineDescription* current_header_line) {
442 const char* colon_loc = current;
443 DCHECK_LT(colon_loc, line_end);
444 DCHECK_EQ(':', *colon_loc);
445 DCHECK_EQ(':', *current);
446 DCHECK_GE(' ', *line_end)
447 << "\"" << std::string(line_begin, line_end) << "\"";
448
449 // TODO(fenix): Investigate whether or not the bounds tests in the
450 // while loops here are redundant, and if so, remove them.
451 --current;
452 while (current > line_begin && *current <= ' ') --current;
453 current += (current != colon_loc);
454 current_header_line->key_end_idx = current - stream_begin;
455
456 current = colon_loc;
457 DCHECK_EQ(':', *current);
458 ++current;
459 while (current < line_end && *current <= ' ') ++current;
460 current_header_line->value_begin_idx = current - stream_begin;
461
462 DCHECK_GE(current_header_line->key_end_idx,
463 current_header_line->first_char_idx);
464 DCHECK_GE(current_header_line->value_begin_idx,
465 current_header_line->key_end_idx);
466 DCHECK_GE(current_header_line->last_char_idx,
467 current_header_line->value_begin_idx);
468 }
469
FindColonsAndParseIntoKeyValue()470 inline void BalsaFrame::FindColonsAndParseIntoKeyValue() {
471 DCHECK(!lines_.empty());
472 const char* stream_begin = headers_->OriginalHeaderStreamBegin();
473 // The last line is always just a newline (and is uninteresting).
474 const Lines::size_type lines_size_m1 = lines_.size() - 1;
475 #if __SSE2__
476 const __v16qi colons = { ':', ':', ':', ':', ':', ':', ':', ':',
477 ':', ':', ':', ':', ':', ':', ':', ':'};
478 const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16;
479 #endif // __SSE2__
480 const char* current = stream_begin + lines_[1].first;
481 // This code is a bit more subtle than it may appear at first glance.
482 // This code looks for a colon in the current line... but it also looks
483 // beyond the current line. If there is no colon in the current line, then
484 // for each subsequent line (until the colon which -has- been found is
485 // associated with a line), no searching for a colon will be performed. In
486 // this way, we minimize the amount of bytes we have scanned for a colon.
487 for (Lines::size_type i = 1; i < lines_size_m1;) {
488 const char* line_begin = stream_begin + lines_[i].first;
489
490 // Here we handle possible continuations. Note that we do not replace
491 // the '\n' in the line before a continuation (at least, as of now),
492 // which implies that any code which looks for a value must deal with
493 // "\r\n", etc -within- the line (and not just at the end of it).
494 for (++i; i < lines_size_m1; ++i) {
495 const char c = *(stream_begin + lines_[i].first);
496 if (c > ' ') {
497 // Not a continuation, so stop. Note that if the 'original' i = 1,
498 // and the next line is not a continuation, we'll end up with i = 2
499 // when we break. This handles the incrementing of i for the outer
500 // loop.
501 break;
502 }
503 }
504 const char* line_end = stream_begin + lines_[i - 1].second;
505 DCHECK_LT(line_begin - stream_begin, line_end - stream_begin);
506
507 // We cleanup the whitespace at the end of the line before doing anything
508 // else of interest as it allows us to do nothing when irregularly formatted
509 // headers are parsed (e.g. those with only keys, only values, or no colon).
510 //
511 // We're guaranteed to have *line_end > ' ' while line_end >= line_begin.
512 --line_end;
513 DCHECK_EQ('\n', *line_end)
514 << "\"" << std::string(line_begin, line_end) << "\"";
515 while (*line_end <= ' ' && line_end > line_begin) {
516 --line_end;
517 }
518 ++line_end;
519 DCHECK_GE(' ', *line_end);
520 DCHECK_LT(line_begin, line_end);
521
522 // We use '0' for the block idx, because we're always writing to the first
523 // block from the framer (we do this because the framer requires that the
524 // entire header sequence be in a contiguous buffer).
525 headers_->header_lines_.push_back(
526 HeaderLineDescription(line_begin - stream_begin,
527 line_end - stream_begin,
528 line_end - stream_begin,
529 line_end - stream_begin,
530 0));
531 if (current >= line_end) {
532 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
533 visitor_->HandleHeaderWarning(this);
534 // Then the next colon will not be found within this header line-- time
535 // to try again with another header-line.
536 continue;
537 } else if (current < line_begin) {
538 // When this condition is true, the last detected colon was part of a
539 // previous line. We reset to the beginning of the line as we don't care
540 // about the presence of any colon before the beginning of the current
541 // line.
542 current = line_begin;
543 }
544 #if __SSE2__
545 while (current < header_lines_end_m16) {
546 __m128i header_bytes =
547 _mm_loadu_si128(reinterpret_cast<const __m128i *>(current));
548 __m128i colon_cmp =
549 _mm_cmpeq_epi8(header_bytes, reinterpret_cast<__m128i>(colons));
550 int colon_msk = _mm_movemask_epi8(colon_cmp);
551 if (colon_msk == 0) {
552 current += 16;
553 continue;
554 }
555 current += (ffs(colon_msk) - 1);
556 if (current > line_end) {
557 break;
558 }
559 goto found_colon;
560 }
561 #endif // __SSE2__
562 for (; current < line_end; ++current) {
563 if (*current != ':') {
564 continue;
565 }
566 goto found_colon;
567 }
568 // If we've gotten to here, then there was no colon
569 // in the line. The arguments we passed into the construction
570 // for the HeaderLineDescription object should be OK-- it assumes
571 // that the entire content is 'key' by default (which is true, as
572 // there was no colon, there can be no value). Note that this is a
573 // construct which is technically not allowed by the spec.
574 last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
575 visitor_->HandleHeaderWarning(this);
576 continue;
577 found_colon:
578 DCHECK_EQ(*current, ':');
579 DCHECK_LE(current - stream_begin, line_end - stream_begin);
580 DCHECK_LE(stream_begin - stream_begin, current - stream_begin);
581
582 HeaderLineDescription& current_header_line = headers_->header_lines_.back();
583 current_header_line.key_end_idx = current - stream_begin;
584 current_header_line.value_begin_idx = current_header_line.key_end_idx;
585 if (current < line_end) {
586 ++current_header_line.key_end_idx;
587
588 CleanUpKeyValueWhitespace(stream_begin,
589 line_begin,
590 current,
591 line_end,
592 ¤t_header_line);
593 }
594 }
595 }
596
ProcessContentLengthLine(HeaderLines::size_type line_idx,BalsaHeadersEnums::ContentLengthStatus * status,size_t * length)597 void BalsaFrame::ProcessContentLengthLine(
598 HeaderLines::size_type line_idx,
599 BalsaHeadersEnums::ContentLengthStatus* status,
600 size_t* length) {
601 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
602 const char* stream_begin = headers_->OriginalHeaderStreamBegin();
603 const char* line_end = stream_begin + header_line.last_char_idx;
604 const char* value_begin = (stream_begin + header_line.value_begin_idx);
605
606 if (value_begin >= line_end) {
607 // There is no non-whitespace value data.
608 #if DEBUGFRAMER
609 LOG(INFO) << "invalid content-length -- no non-whitespace value data";
610 #endif
611 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
612 return;
613 }
614
615 *length = 0;
616 while (value_begin < line_end) {
617 if (*value_begin < '0' || *value_begin > '9') {
618 // bad! content-length found, and couldn't parse all of it!
619 *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
620 #if DEBUGFRAMER
621 LOG(INFO) << "invalid content-length - non numeric character detected";
622 #endif // DEBUGFRAMER
623 return;
624 }
625 const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;
626 size_t length_x_10 = *length * 10;
627 const unsigned char c = *value_begin - '0';
628 if (*length > kMaxDiv10 ||
629 (std::numeric_limits<size_t>::max() - length_x_10) < c) {
630 *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW;
631 #if DEBUGFRAMER
632 LOG(INFO) << "content-length overflow";
633 #endif // DEBUGFRAMER
634 return;
635 }
636 *length = length_x_10 + c;
637 ++value_begin;
638 }
639 #if DEBUGFRAMER
640 LOG(INFO) << "content_length parsed: " << *length;
641 #endif // DEBUGFRAMER
642 *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH;
643 }
644
ProcessTransferEncodingLine(HeaderLines::size_type line_idx)645 void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) {
646 const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
647 const char* stream_begin = headers_->OriginalHeaderStreamBegin();
648 const char* line_end = stream_begin + header_line.last_char_idx;
649 const char* value_begin = stream_begin + header_line.value_begin_idx;
650 size_t value_length = line_end - value_begin;
651
652 if ((value_length == 7) &&
653 !strncasecmp(value_begin, "chunked", 7)) {
654 headers_->transfer_encoding_is_chunked_ = true;
655 } else if ((value_length == 8) &&
656 !strncasecmp(value_begin, "identity", 8)) {
657 headers_->transfer_encoding_is_chunked_ = false;
658 } else {
659 last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING;
660 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
661 visitor_->HandleHeaderError(this);
662 return;
663 }
664 }
665
666 namespace {
SplitStringPiece(base::StringPiece original,char delim,base::StringPiece * before,base::StringPiece * after)667 bool SplitStringPiece(base::StringPiece original, char delim,
668 base::StringPiece* before, base::StringPiece* after) {
669 const char* p = original.data();
670 const char* end = p + original.size();
671
672 while (p != end) {
673 if (*p == delim) {
674 ++p;
675 } else {
676 const char* start = p;
677 while (++p != end && *p != delim) {
678 // Skip to the next occurence of the delimiter.
679 }
680 *before = base::StringPiece(start, p - start);
681 if (p != end)
682 *after = base::StringPiece(p + 1, end - (p + 1));
683 else
684 *after = base::StringPiece("");
685 StringPieceUtils::RemoveWhitespaceContext(before);
686 StringPieceUtils::RemoveWhitespaceContext(after);
687 return true;
688 }
689 }
690
691 *before = original;
692 *after = "";
693 return false;
694 }
695
696 // TODO(phython): Fix this function to properly deal with quoted values.
697 // E.g. ";;foo", "\";;\"", or \"aa;
698 // The last example, the semi-colon is a separator between extensions.
ProcessChunkExtensionsManual(base::StringPiece all_extensions,BalsaHeaders * extensions)699 void ProcessChunkExtensionsManual(base::StringPiece all_extensions,
700 BalsaHeaders* extensions) {
701 base::StringPiece extension;
702 base::StringPiece remaining;
703 StringPieceUtils::RemoveWhitespaceContext(&all_extensions);
704 SplitStringPiece(all_extensions, ';', &extension, &remaining);
705 while (!extension.empty()) {
706 base::StringPiece key;
707 base::StringPiece value;
708 SplitStringPiece(extension, '=', &key, &value);
709 if (!value.empty()) {
710 // Strip quotation marks if they exist.
711 if (!value.empty() && value[0] == '"')
712 value.remove_prefix(1);
713 if (!value.empty() && value[value.length() - 1] == '"')
714 value.remove_suffix(1);
715 }
716
717 extensions->AppendHeader(key, value);
718
719 StringPieceUtils::RemoveWhitespaceContext(&remaining);
720 SplitStringPiece(remaining, ';', &extension, &remaining);
721 }
722 }
723
724 // TODO(phython): Fix this function to properly deal with quoted values.
725 // E.g. ";;foo", "\";;\"", or \"aa;
726 // The last example, the semi-colon is a separator between extensions.
ProcessChunkExtensionsGoogle3(const char * input,size_t size,BalsaHeaders * extensions)727 void ProcessChunkExtensionsGoogle3(const char* input, size_t size,
728 BalsaHeaders* extensions) {
729 std::vector<base::StringPiece> key_values;
730 SplitStringPieceToVector(base::StringPiece(input, size), ";",
731 &key_values, true);
732 for (unsigned int i = 0; i < key_values.size(); ++i) {
733 base::StringPiece key = key_values[i].substr(0, key_values[i].find('='));
734 base::StringPiece value;
735 if (key.length() < key_values[i].length()) {
736 value = key_values[i].substr(key.length() + 1);
737 // Remove any leading and trailing whitespace.
738 StringPieceUtils::RemoveWhitespaceContext(&value);
739
740 // Strip quotation marks if they exist.
741 if (!value.empty() && value[0] == '"')
742 value.remove_prefix(1);
743 if (!value.empty() && value[value.length() - 1] == '"')
744 value.remove_suffix(1);
745 }
746
747 // Strip the key whitespace after checking that there is a value.
748 StringPieceUtils::RemoveWhitespaceContext(&key);
749 extensions->AppendHeader(key, value);
750 }
751 }
752
753 } // anonymous namespace
754
ProcessChunkExtensions(const char * input,size_t size,BalsaHeaders * extensions)755 void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size,
756 BalsaHeaders* extensions) {
757 #if 0
758 ProcessChunkExtensionsGoogle3(input, size, extensions);
759 #else
760 ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions);
761 #endif
762 }
763
ProcessHeaderLines()764 void BalsaFrame::ProcessHeaderLines() {
765 HeaderLines::size_type content_length_idx = 0;
766 HeaderLines::size_type transfer_encoding_idx = 0;
767
768 DCHECK(!lines_.empty());
769 #if DEBUGFRAMER
770 LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n";
771 #endif // DEBUGFRAMER
772
773 // There is no need to attempt to process headers if no header lines exist.
774 // There are at least two lines in the message which are not header lines.
775 // These two non-header lines are the first line of the message, and the
776 // last line of the message (which is an empty line).
777 // Thus, we test to see if we have more than two lines total before attempting
778 // to parse any header lines.
779 if (lines_.size() > 2) {
780 const char* stream_begin = headers_->OriginalHeaderStreamBegin();
781
782 // Then, for the rest of the header data, we parse these into key-value
783 // pairs.
784 FindColonsAndParseIntoKeyValue();
785 // At this point, we've parsed all of the headers. Time to look for those
786 // headers which we require for framing.
787 const HeaderLines::size_type
788 header_lines_size = headers_->header_lines_.size();
789 for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) {
790 const HeaderLineDescription& current_header_line =
791 headers_->header_lines_[i];
792 const char* key_begin =
793 (stream_begin + current_header_line.first_char_idx);
794 const char* key_end = (stream_begin + current_header_line.key_end_idx);
795 const size_t key_len = key_end - key_begin;
796 const char c = *key_begin;
797 #if DEBUGFRAMER
798 LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len)
799 << " c: '" << c << "' key_len: " << key_len;
800 #endif // DEBUGFRAMER
801 // If a header begins with either lowercase or uppercase 'c' or 't', then
802 // the header may be one of content-length, connection, content-encoding
803 // or transfer-encoding. These headers are special, as they change the way
804 // that the message is framed, and so the framer is required to search
805 // for them.
806
807
808 if (c == 'c' || c == 'C') {
809 if ((key_len == kContentLengthSize) &&
810 0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) {
811 BalsaHeadersEnums::ContentLengthStatus content_length_status =
812 BalsaHeadersEnums::NO_CONTENT_LENGTH;
813 size_t length = 0;
814 ProcessContentLengthLine(i, &content_length_status, &length);
815 if (content_length_idx != 0) { // then we've already seen one!
816 if ((headers_->content_length_status_ != content_length_status) ||
817 ((headers_->content_length_status_ ==
818 BalsaHeadersEnums::VALID_CONTENT_LENGTH) &&
819 length != headers_->content_length_)) {
820 last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS;
821 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
822 visitor_->HandleHeaderError(this);
823 return;
824 }
825 continue;
826 } else {
827 content_length_idx = i + 1;
828 headers_->content_length_status_ = content_length_status;
829 headers_->content_length_ = length;
830 content_length_remaining_ = length;
831 }
832
833 }
834 } else if (c == 't' || c == 'T') {
835 if ((key_len == kTransferEncodingSize) &&
836 0 == strncasecmp(key_begin, kTransferEncoding,
837 kTransferEncodingSize)) {
838 if (transfer_encoding_idx != 0) {
839 last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS;
840 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
841 visitor_->HandleHeaderError(this);
842 return;
843 }
844 transfer_encoding_idx = i + 1;
845 }
846 } else if (i == 0 && (key_len == 0 || c == ' ')) {
847 last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT;
848 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
849 visitor_->HandleHeaderError(this);
850 return;
851 }
852 }
853 if (headers_->transfer_encoding_is_chunked_) {
854 headers_->content_length_ = 0;
855 headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH;
856 content_length_remaining_ = 0;
857 }
858 if (transfer_encoding_idx != 0) {
859 ProcessTransferEncodingLine(transfer_encoding_idx - 1);
860 }
861 }
862 }
863
AssignParseStateAfterHeadersHaveBeenParsed()864 void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() {
865 // For responses, can't have a body if the request was a HEAD, or if it is
866 // one of these response-codes. rfc2616 section 4.3
867 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
868 if (is_request_ ||
869 !(request_was_head_ ||
870 (headers_->parsed_response_code_ >= 100 &&
871 headers_->parsed_response_code_ < 200) ||
872 (headers_->parsed_response_code_ == 204) ||
873 (headers_->parsed_response_code_ == 304))) {
874 // Then we can have a body.
875 if (headers_->transfer_encoding_is_chunked_) {
876 // Note that
877 // if ( Transfer-Encoding: chunked && Content-length: )
878 // then Transfer-Encoding: chunked trumps.
879 // This is as specified in the spec.
880 // rfc2616 section 4.4.3
881 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
882 } else {
883 // Errors parsing content-length definitely can cause
884 // protocol errors/warnings
885 switch (headers_->content_length_status_) {
886 // If we have a content-length, and it is parsed
887 // properly, there are two options.
888 // 1) zero content, in which case the message is done, and
889 // 2) nonzero content, in which case we have to
890 // consume the body.
891 case BalsaHeadersEnums::VALID_CONTENT_LENGTH:
892 if (headers_->content_length_ == 0) {
893 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
894 } else {
895 parse_state_ = BalsaFrameEnums::READING_CONTENT;
896 }
897 break;
898 case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW:
899 case BalsaHeadersEnums::INVALID_CONTENT_LENGTH:
900 // If there were characters left-over after parsing the
901 // content length, we should flag an error and stop.
902 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
903 last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH;
904 visitor_->HandleHeaderError(this);
905 break;
906 // We can have: no transfer-encoding, no content length, and no
907 // connection: close...
908 // Unfortunately, this case doesn't seem to be covered in the spec.
909 // We'll assume that the safest thing to do here is what the google
910 // binaries before 2008 already do, which is to assume that
911 // everything until the connection is closed is body.
912 case BalsaHeadersEnums::NO_CONTENT_LENGTH:
913 if (is_request_) {
914 base::StringPiece method = headers_->request_method();
915 // POSTs and PUTs should have a detectable body length. If they
916 // do not we consider it an error.
917 if ((method.size() == 4 &&
918 strncmp(method.data(), "POST", 4) == 0) ||
919 (method.size() == 3 &&
920 strncmp(method.data(), "PUT", 3) == 0)) {
921 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
922 last_error_ =
923 BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH;
924 visitor_->HandleHeaderError(this);
925 break;
926 }
927 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
928 } else {
929 parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE;
930 last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH;
931 visitor_->HandleHeaderWarning(this);
932 }
933 break;
934 // The COV_NF_... statements here provide hints to the apparatus
935 // which computes coverage reports/ratios that this code is never
936 // intended to be executed, and should technically be impossible.
937 // COV_NF_START
938 default:
939 LOG(FATAL) << "Saw a content_length_status: "
940 << headers_->content_length_status_ << " which is unknown.";
941 // COV_NF_END
942 }
943 }
944 }
945 }
946
ProcessHeaders(const char * message_start,size_t message_length)947 size_t BalsaFrame::ProcessHeaders(const char* message_start,
948 size_t message_length) {
949 const char* const original_message_start = message_start;
950 const char* const message_end = message_start + message_length;
951 const char* message_current = message_start;
952 const char* checkpoint = message_start;
953
954 if (message_length == 0) {
955 goto bottom;
956 }
957
958 while (message_current < message_end) {
959 size_t base_idx = headers_->GetReadableBytesFromHeaderStream();
960
961 // Yes, we could use strchr (assuming null termination), or
962 // memchr, but as it turns out that is slower than this tight loop
963 // for the input that we see.
964 if (!saw_non_newline_char_) {
965 do {
966 const char c = *message_current;
967 if (c != '\r' && c != '\n') {
968 if (c <= ' ') {
969 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
970 last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST;
971 visitor_->HandleHeaderError(this);
972 goto bottom;
973 } else {
974 saw_non_newline_char_ = true;
975 checkpoint = message_start = message_current;
976 goto read_real_message;
977 }
978 }
979 ++message_current;
980 } while (message_current < message_end);
981 goto bottom; // this is necessary to skip 'last_char_was_slash_r' checks
982 } else {
983 read_real_message:
984 // Note that SSE2 can be enabled on certain piii platforms.
985 #if __SSE2__
986 {
987 const char* const message_end_m16 = message_end - 16;
988 __v16qi newlines = { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
989 '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' };
990 while (message_current < message_end_m16) {
991 // What this does (using compiler intrinsics):
992 //
993 // Load 16 '\n's into an xmm register
994 // Load 16 bytes of currennt message into an xmm register
995 // Do byte-wise equals on those two xmm registers
996 // Take the first bit of each byte, and put that into the first
997 // 16 bits of a mask
998 // If the mask is zero, no '\n' found. increment by 16 and try again
999 // Else scan forward to find the first set bit.
1000 // Increment current by the index of the first set bit
1001 // (ffs returns index of first set bit + 1)
1002 __m128i msg_bytes =
1003 _mm_loadu_si128(const_cast<__m128i *>(
1004 reinterpret_cast<const __m128i *>(message_current)));
1005 __m128i newline_cmp =
1006 _mm_cmpeq_epi8(msg_bytes, reinterpret_cast<__m128i>(newlines));
1007 int newline_msk = _mm_movemask_epi8(newline_cmp);
1008 if (newline_msk == 0) {
1009 message_current += 16;
1010 continue;
1011 }
1012 message_current += (ffs(newline_msk) - 1);
1013 const size_t relative_idx = message_current - message_start;
1014 const size_t message_current_idx = 1 + base_idx + relative_idx;
1015 lines_.push_back(std::make_pair(last_slash_n_idx_,
1016 message_current_idx));
1017 if (lines_.size() == 1) {
1018 headers_->WriteFromFramer(checkpoint,
1019 1 + message_current - checkpoint);
1020 checkpoint = message_current + 1;
1021 const char* begin = headers_->OriginalHeaderStreamBegin();
1022 #if DEBUGFRAMER
1023 LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1024 LOG(INFO) << "is_request_: " << is_request_;
1025 #endif
1026 ProcessFirstLine(begin, begin + lines_[0].second);
1027 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1028 goto process_lines;
1029 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1030 goto bottom;
1031 }
1032 const size_t chars_since_last_slash_n = (message_current_idx -
1033 last_slash_n_idx_);
1034 last_slash_n_idx_ = message_current_idx;
1035 if (chars_since_last_slash_n > 2) {
1036 // We have a slash-n, but the last slash n was
1037 // more than 2 characters away from this. Thus, we know
1038 // that this cannot be an end-of-header.
1039 ++message_current;
1040 continue;
1041 }
1042 if ((chars_since_last_slash_n == 1) ||
1043 (((message_current > message_start) &&
1044 (*(message_current - 1) == '\r')) ||
1045 (last_char_was_slash_r_))) {
1046 goto process_lines;
1047 }
1048 ++message_current;
1049 }
1050 }
1051 #endif // __SSE2__
1052 while (message_current < message_end) {
1053 if (*message_current != '\n') {
1054 ++message_current;
1055 continue;
1056 }
1057 const size_t relative_idx = message_current - message_start;
1058 const size_t message_current_idx = 1 + base_idx + relative_idx;
1059 lines_.push_back(std::make_pair(last_slash_n_idx_,
1060 message_current_idx));
1061 if (lines_.size() == 1) {
1062 headers_->WriteFromFramer(checkpoint,
1063 1 + message_current - checkpoint);
1064 checkpoint = message_current + 1;
1065 const char* begin = headers_->OriginalHeaderStreamBegin();
1066 #if DEBUGFRAMER
1067 LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1068 LOG(INFO) << "is_request_: " << is_request_;
1069 #endif
1070 ProcessFirstLine(begin, begin + lines_[0].second);
1071 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1072 goto process_lines;
1073 else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1074 goto bottom;
1075 }
1076 const size_t chars_since_last_slash_n = (message_current_idx -
1077 last_slash_n_idx_);
1078 last_slash_n_idx_ = message_current_idx;
1079 if (chars_since_last_slash_n > 2) {
1080 // false positive.
1081 ++message_current;
1082 continue;
1083 }
1084 if ((chars_since_last_slash_n == 1) ||
1085 (((message_current > message_start) &&
1086 (*(message_current - 1) == '\r')) ||
1087 (last_char_was_slash_r_))) {
1088 goto process_lines;
1089 }
1090 ++message_current;
1091 }
1092 }
1093 continue;
1094 process_lines:
1095 ++message_current;
1096 DCHECK(message_current >= message_start);
1097 if (message_current > message_start) {
1098 headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1099 }
1100
1101 // Check if we have exceeded maximum headers length
1102 // Although we check for this limit before and after we call this function
1103 // we check it here as well to make sure that in case the visitor changed
1104 // the max_header_length_ (for example after processing the first line)
1105 // we handle it gracefully.
1106 if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) {
1107 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1108 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1109 visitor_->HandleHeaderError(this);
1110 goto bottom;
1111 }
1112
1113 // Since we know that we won't be writing any more bytes of the header,
1114 // we tell that to the headers object. The headers object may make
1115 // more efficient allocation decisions when this is signaled.
1116 headers_->DoneWritingFromFramer();
1117 {
1118 const char* readable_ptr = NULL;
1119 size_t readable_size = 0;
1120 headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size);
1121 visitor_->ProcessHeaderInput(readable_ptr, readable_size);
1122 }
1123
1124 // Ok, now that we've written everything into our header buffer, it is
1125 // time to process the header lines (extract proper values for headers
1126 // which are important for framing).
1127 ProcessHeaderLines();
1128 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1129 goto bottom;
1130 }
1131 AssignParseStateAfterHeadersHaveBeenParsed();
1132 if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1133 goto bottom;
1134 }
1135 visitor_->ProcessHeaders(*headers_);
1136 visitor_->HeaderDone();
1137 if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) {
1138 visitor_->MessageDone();
1139 }
1140 goto bottom;
1141 }
1142 // If we've gotten to here, it means that we've consumed all of the
1143 // available input. We need to record whether or not the last character we
1144 // saw was a '\r' so that a subsequent call to ProcessInput correctly finds
1145 // a header framing that is split across the two calls.
1146 last_char_was_slash_r_ = (*(message_end - 1) == '\r');
1147 DCHECK(message_current >= message_start);
1148 if (message_current > message_start) {
1149 headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1150 }
1151 bottom:
1152 return message_current - original_message_start;
1153 }
1154
1155
BytesSafeToSplice() const1156 size_t BalsaFrame::BytesSafeToSplice() const {
1157 switch (parse_state_) {
1158 case BalsaFrameEnums::READING_CHUNK_DATA:
1159 return chunk_length_remaining_;
1160 case BalsaFrameEnums::READING_UNTIL_CLOSE:
1161 return std::numeric_limits<size_t>::max();
1162 case BalsaFrameEnums::READING_CONTENT:
1163 return content_length_remaining_;
1164 default:
1165 return 0;
1166 }
1167 }
1168
BytesSpliced(size_t bytes_spliced)1169 void BalsaFrame::BytesSpliced(size_t bytes_spliced) {
1170 switch (parse_state_) {
1171 case BalsaFrameEnums::READING_CHUNK_DATA:
1172 if (chunk_length_remaining_ >= bytes_spliced) {
1173 chunk_length_remaining_ -= bytes_spliced;
1174 if (chunk_length_remaining_ == 0) {
1175 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1176 }
1177 return;
1178 } else {
1179 last_error_ =
1180 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1181 goto error_exit;
1182 }
1183
1184 case BalsaFrameEnums::READING_UNTIL_CLOSE:
1185 return;
1186
1187 case BalsaFrameEnums::READING_CONTENT:
1188 if (content_length_remaining_ >= bytes_spliced) {
1189 content_length_remaining_ -= bytes_spliced;
1190 if (content_length_remaining_ == 0) {
1191 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1192 visitor_->MessageDone();
1193 }
1194 return;
1195 } else {
1196 last_error_ =
1197 BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1198 goto error_exit;
1199 }
1200
1201 default:
1202 last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO;
1203 goto error_exit;
1204 }
1205
1206 error_exit:
1207 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1208 visitor_->HandleBodyError(this);
1209 };
1210
1211 // You may note that the state-machine contained within this function has both
1212 // switch and goto labels for nearly the same thing. For instance, the
1213 // following two labels refer to the same code block:
1214 // label_reading_chunk_data:
1215 // case BalsaFrameEnums::READING_CHUNK_DATA:
1216 // The 'case' statement is required for the switch statement which occurs when
1217 // ProcessInput is invoked. The goto label is required as the state-machine
1218 // does not use a computed goto in any subsequent operations.
1219 //
1220 // Since several states exit the state machine for various reasons, there is
1221 // also one label at the bottom of the function. When it is appropriate to
1222 // return from the function, that part of the state machine instead issues a
1223 // goto bottom; This results in less code duplication, and makes debugging
1224 // easier (as you can add a statement to a section of code which is guaranteed
1225 // to be invoked when the function is exiting.
ProcessInput(const char * input,size_t size)1226 size_t BalsaFrame::ProcessInput(const char* input, size_t size) {
1227 const char* current = input;
1228 const char* on_entry = current;
1229 const char* end = current + size;
1230 #if DEBUGFRAMER
1231 LOG(INFO) << "\n=============="
1232 << BalsaFrameEnums::ParseStateToString(parse_state_)
1233 << "===============\n";
1234 #endif // DEBUGFRAMER
1235
1236 DCHECK(headers_ != NULL);
1237 if (headers_ == NULL) return 0;
1238
1239 if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1240 const size_t header_length = headers_->GetReadableBytesFromHeaderStream();
1241 // Yes, we still have to check this here as the user can change the
1242 // max_header_length amount!
1243 // Also it is possible that we have reached the maximum allowed header size,
1244 // and we have more to consume (remember we are still inside
1245 // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error.
1246 if (header_length > max_header_length_ ||
1247 (header_length == max_header_length_ && size > 0)) {
1248 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1249 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1250 visitor_->HandleHeaderError(this);
1251 goto bottom;
1252 }
1253 size_t bytes_to_process = max_header_length_ - header_length;
1254 if (bytes_to_process > size) {
1255 bytes_to_process = size;
1256 }
1257 current += ProcessHeaders(input, bytes_to_process);
1258 // If we are still reading headers check if we have crossed the headers
1259 // limit. Note that we check for >= as opposed to >. This is because if
1260 // header_length_after equals max_header_length_ and we are still in the
1261 // parse_state_ BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for
1262 // sure that the headers limit will be crossed later on
1263 if ((parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE)) {
1264 // Note that headers_ is valid only if we are still reading headers.
1265 const size_t header_length_after =
1266 headers_->GetReadableBytesFromHeaderStream();
1267 if (header_length_after >= max_header_length_) {
1268 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1269 last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1270 visitor_->HandleHeaderError(this);
1271 }
1272 }
1273 goto bottom;
1274 } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ ||
1275 parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1276 // Can do nothing more 'till we're reset.
1277 goto bottom;
1278 }
1279
1280 while (current < end) {
1281 switch (parse_state_) {
1282 label_reading_chunk_length:
1283 case BalsaFrameEnums::READING_CHUNK_LENGTH:
1284 // In this state we read the chunk length.
1285 // Note that once we hit a character which is not in:
1286 // [0-9;A-Fa-f\n], we transition to a different state.
1287 //
1288 {
1289 // If we used strtol, etc, we'd have to buffer this line.
1290 // This is more annoying than simply doing the conversion
1291 // here. This code accounts for overflow.
1292 static const signed char buf[] = {
1293 // %0 %1 %2 %3 %4 %5 %6 %7 %8 \t \n %b %c \r %e %f
1294 -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1,
1295 // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f
1296 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1297 // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f
1298 -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1299 // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f
1300 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -2, -1, -1, -1, -1,
1301 // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f
1302 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1303 // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f
1304 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1305 // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f
1306 -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1307 // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f
1308 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1309 };
1310 // valid cases:
1311 // "09123\n" // -> 09123
1312 // "09123\r\n" // -> 09123
1313 // "09123 \n" // -> 09123
1314 // "09123 \r\n" // -> 09123
1315 // "09123 12312\n" // -> 09123
1316 // "09123 12312\r\n" // -> 09123
1317 // "09123; foo=bar\n" // -> 09123
1318 // "09123; foo=bar\r\n" // -> 09123
1319 // "FFFFFFFFFFFFFFFF\r\n" // -> FFFFFFFFFFFFFFFF
1320 // "FFFFFFFFFFFFFFFF 22\r\n" // -> FFFFFFFFFFFFFFFF
1321 // invalid cases:
1322 // "[ \t]+[^\n]*\n"
1323 // "FFFFFFFFFFFFFFFFF\r\n" (would overflow)
1324 // "\r\n"
1325 // "\n"
1326 while (current < end) {
1327 const char c = *current;
1328 ++current;
1329 const signed char addition = buf[static_cast<int>(c)];
1330 if (addition >= 0) {
1331 chunk_length_character_extracted_ = true;
1332 size_t length_x_16 = chunk_length_remaining_ * 16;
1333 const size_t kMaxDiv16 = std::numeric_limits<size_t>::max() / 16;
1334 if ((chunk_length_remaining_ > kMaxDiv16) ||
1335 ((std::numeric_limits<size_t>::max() - length_x_16) <
1336 static_cast<size_t>(addition))) {
1337 // overflow -- asked for a chunk-length greater than 2^64 - 1!!
1338 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1339 last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW;
1340 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1341 visitor_->HandleChunkingError(this);
1342 goto bottom;
1343 }
1344 chunk_length_remaining_ = length_x_16 + addition;
1345 continue;
1346 }
1347
1348 if (!chunk_length_character_extracted_ || addition == -1) {
1349 // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no
1350 // characters were converted, or an unexpected character was
1351 // seen.
1352 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1353 last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH;
1354 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1355 visitor_->HandleChunkingError(this);
1356 goto bottom;
1357 }
1358
1359 --current;
1360 parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION;
1361 visitor_->ProcessChunkLength(chunk_length_remaining_);
1362 goto label_reading_chunk_extension;
1363 }
1364 }
1365 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1366 goto bottom; // case BalsaFrameEnums::READING_CHUNK_LENGTH
1367
1368 label_reading_chunk_extension:
1369 case BalsaFrameEnums::READING_CHUNK_EXTENSION:
1370 {
1371 // TODO(phython): Convert this scanning to be 16 bytes at a time if
1372 // there is data to be read.
1373 const char* extensions_start = current;
1374 size_t extensions_length = 0;
1375 while (current < end) {
1376 const char c = *current;
1377 if (c == '\r' || c == '\n') {
1378 extensions_length =
1379 (extensions_start == current) ?
1380 0 :
1381 current - extensions_start - 1;
1382 }
1383
1384 ++current;
1385 if (c == '\n') {
1386 chunk_length_character_extracted_ = false;
1387 visitor_->ProcessChunkExtensions(
1388 extensions_start, extensions_length);
1389 if (chunk_length_remaining_ != 0) {
1390 parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA;
1391 goto label_reading_chunk_data;
1392 }
1393 HeaderFramingFound('\n');
1394 parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM;
1395 goto label_reading_last_chunk_term;
1396 }
1397 }
1398 visitor_->ProcessChunkExtensions(
1399 extensions_start, extensions_length);
1400 }
1401
1402 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1403 goto bottom; // case BalsaFrameEnums::READING_CHUNK_EXTENSION
1404
1405 label_reading_chunk_data:
1406 case BalsaFrameEnums::READING_CHUNK_DATA:
1407 while (current < end) {
1408 if (chunk_length_remaining_ == 0) {
1409 break;
1410 }
1411 // read in the chunk
1412 size_t bytes_remaining = end - current;
1413 size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ?
1414 chunk_length_remaining_ : bytes_remaining;
1415 const char* tmp_current = current + consumed_bytes;
1416 visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry);
1417 visitor_->ProcessBodyData(current, consumed_bytes);
1418 on_entry = current = tmp_current;
1419 chunk_length_remaining_ -= consumed_bytes;
1420 }
1421 if (chunk_length_remaining_ == 0) {
1422 parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1423 goto label_reading_chunk_term;
1424 }
1425 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1426 goto bottom; // case BalsaFrameEnums::READING_CHUNK_DATA
1427
1428 label_reading_chunk_term:
1429 case BalsaFrameEnums::READING_CHUNK_TERM:
1430 while (current < end) {
1431 const char c = *current;
1432 ++current;
1433
1434 if (c == '\n') {
1435 parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
1436 goto label_reading_chunk_length;
1437 }
1438 }
1439 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1440 goto bottom; // case BalsaFrameEnums::READING_CHUNK_TERM
1441
1442 label_reading_last_chunk_term:
1443 case BalsaFrameEnums::READING_LAST_CHUNK_TERM:
1444 while (current < end) {
1445 const char c = *current;
1446
1447 if (!HeaderFramingFound(c)) {
1448 // If not, however, since the spec only suggests that the
1449 // client SHOULD indicate the presence of trailers, we get to
1450 // *test* that they did or didn't.
1451 // If all of the bytes we've seen since:
1452 // OPTIONAL_WS 0 OPTIONAL_STUFF CRLF
1453 // are either '\r', or '\n', then we can assume that we don't yet
1454 // know if we need to parse headers, or if the next byte will make
1455 // the HeaderFramingFound condition (above) true.
1456 if (HeaderFramingMayBeFound()) {
1457 // If true, then we have seen only characters '\r' or '\n'.
1458 ++current;
1459
1460 // Lets try again! There is no state change here.
1461 continue;
1462 } else {
1463 // If (!HeaderFramingMayBeFound()), then we know that we must be
1464 // reading the first non CRLF character of a trailer.
1465 parse_state_ = BalsaFrameEnums::READING_TRAILER;
1466 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1467 on_entry = current;
1468 goto label_reading_trailer;
1469 }
1470 } else {
1471 // If we've found a "\r\n\r\n", then the message
1472 // is done.
1473 ++current;
1474 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1475 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1476 visitor_->MessageDone();
1477 goto bottom;
1478 }
1479 break; // from while loop
1480 }
1481 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1482 goto bottom; // case BalsaFrameEnums::READING_LAST_CHUNK_TERM
1483
1484 label_reading_trailer:
1485 case BalsaFrameEnums::READING_TRAILER:
1486 while (current < end) {
1487 const char c = *current;
1488 ++current;
1489 // TODO(fenix): If we ever care about trailers as part of framing,
1490 // deal with them here (see below for part of the 'solution')
1491 // if (LineFramingFound(c)) {
1492 // trailer_lines_.push_back(make_pair(start_of_line_,
1493 // trailer_length_ - 1));
1494 // start_of_line_ = trailer_length_;
1495 // }
1496 if (HeaderFramingFound(c)) {
1497 // ProcessTrailers(visitor_, &trailers_);
1498 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1499 visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1500 visitor_->MessageDone();
1501 goto bottom;
1502 }
1503 }
1504 visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1505 break; // case BalsaFrameEnums::READING_TRAILER
1506
1507 // Note that there is no label:
1508 // 'label_reading_until_close'
1509 // here. This is because the state-machine exists immediately after
1510 // reading the headers instead of transitioning here (as it would
1511 // do if it was consuming all the data it could, all the time).
1512 case BalsaFrameEnums::READING_UNTIL_CLOSE:
1513 {
1514 const size_t bytes_remaining = end - current;
1515 if (bytes_remaining > 0) {
1516 visitor_->ProcessBodyInput(current, bytes_remaining);
1517 visitor_->ProcessBodyData(current, bytes_remaining);
1518 current += bytes_remaining;
1519 }
1520 }
1521 goto bottom; // case BalsaFrameEnums::READING_UNTIL_CLOSE
1522
1523 // label_reading_content:
1524 case BalsaFrameEnums::READING_CONTENT:
1525 #if DEBUGFRAMER
1526 LOG(INFO) << "ReadingContent: " << content_length_remaining_;
1527 #endif // DEBUGFRAMER
1528 while (content_length_remaining_ && current < end) {
1529 // read in the content
1530 const size_t bytes_remaining = end - current;
1531 const size_t consumed_bytes =
1532 (content_length_remaining_ < bytes_remaining) ?
1533 content_length_remaining_ : bytes_remaining;
1534 visitor_->ProcessBodyInput(current, consumed_bytes);
1535 visitor_->ProcessBodyData(current, consumed_bytes);
1536 current += consumed_bytes;
1537 content_length_remaining_ -= consumed_bytes;
1538 }
1539 if (content_length_remaining_ == 0) {
1540 parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1541 visitor_->MessageDone();
1542 }
1543 goto bottom; // case BalsaFrameEnums::READING_CONTENT
1544
1545 default:
1546 // The state-machine should never be in a state that isn't handled
1547 // above. This is a glaring logic error, and we should do something
1548 // drastic to ensure that this gets looked-at and fixed.
1549 LOG(FATAL) << "Unknown state: " << parse_state_ // COV_NF_LINE
1550 << " memory corruption?!"; // COV_NF_LINE
1551 }
1552 }
1553 bottom:
1554 #if DEBUGFRAMER
1555 LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n"
1556 << std::string(input, current)
1557 << "\n$$$$$$$$$$$$$$"
1558 << BalsaFrameEnums::ParseStateToString(parse_state_)
1559 << "$$$$$$$$$$$$$$$"
1560 << " consumed: " << (current - input);
1561 if (Error()) {
1562 LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode());
1563 }
1564 #endif // DEBUGFRAMER
1565 return current - input;
1566 }
1567
// Out-of-class definitions for BalsaFrame's static const members. No
// initializers appear here, so the values have to come from the in-class
// declarations (not visible in this part of the file).
const uint32 BalsaFrame::kValidTerm1;
const uint32 BalsaFrame::kValidTerm1Mask;
const uint32 BalsaFrame::kValidTerm2;
const uint32 BalsaFrame::kValidTerm2Mask;
1572
1573 } // namespace net
1574
1575