• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/tools/balsa/balsa_frame.h"
6 
7 #include <assert.h>
8 #if __SSE2__
9 #include <emmintrin.h>
10 #endif  // __SSE2__
11 
12 #include <limits>
13 #include <string>
14 #include <utility>
15 #include <vector>
16 
17 #include "base/logging.h"
18 #include "base/port.h"
19 #include "base/strings/string_piece.h"
20 #include "net/tools/balsa/balsa_enums.h"
21 #include "net/tools/balsa/balsa_headers.h"
22 #include "net/tools/balsa/balsa_visitor_interface.h"
23 #include "net/tools/balsa/buffer_interface.h"
24 #include "net/tools/balsa/simple_buffer.h"
25 #include "net/tools/balsa/split.h"
26 #include "net/tools/balsa/string_piece_utils.h"
27 
28 #if defined(COMPILER_MSVC)
29 #include <string.h>
30 #define strncasecmp _strnicmp
31 #else
32 #include <strings.h>
33 #endif
34 
35 namespace net {
36 
37 // Constants holding some header names for headers which can affect the way the
38 // HTTP message is framed, and so must be processed specially:
// Header-name literal and its length.  sizeof() counts the terminating NUL,
// hence the "- 1" to obtain the number of characters in the name.
39 static const char kContentLength[] = "content-length";
40 static const size_t kContentLengthSize = sizeof(kContentLength) - 1;
// Same pattern for the transfer-encoding header name.
41 static const char kTransferEncoding[] = "transfer-encoding";
42 static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1;
43 
// Constructs a framer in its initial state.
//
// Defaults established here: request mode (is_request_ = true), a maximum
// header length of 16 * 1024, a maximum request-URI length of 2048, the
// visitor pointing at the member do_nothing_visitor_, and no headers object
// attached (headers_ = NULL).  Parsing starts in
// READING_HEADER_AND_FIRSTLINE with last_error_ = NO_ERROR.
BalsaFrame()44 BalsaFrame::BalsaFrame()
45     : last_char_was_slash_r_(false),
46       saw_non_newline_char_(false),
47       start_was_space_(true),
48       chunk_length_character_extracted_(false),
49       is_request_(true),
50       request_was_head_(false),
51       max_header_length_(16 * 1024),
52       max_request_uri_length_(2048),
53       visitor_(&do_nothing_visitor_),
54       chunk_length_remaining_(0),
55       content_length_remaining_(0),
56       last_slash_n_loc_(NULL),
57       last_recorded_slash_n_loc_(NULL),
58       last_slash_n_idx_(0),
59       term_chars_(0),
60       parse_state_(BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE),
61       last_error_(BalsaFrameEnums::NO_ERROR),
62       headers_(NULL) {
63 }
64 
// Empty destructor body: no explicit cleanup is performed here.
~BalsaFrame()65 BalsaFrame::~BalsaFrame() {}
66 
Reset()67 void BalsaFrame::Reset() {
68   last_char_was_slash_r_ = false;
69   saw_non_newline_char_ = false;
70   start_was_space_ = true;
71   chunk_length_character_extracted_ = false;
72   // is_request_ = true;               // not reset between messages.
73   // request_was_head_ = false;        // not reset between messages.
74   // max_header_length_ = 4096;        // not reset between messages.
75   // max_request_uri_length_ = 2048;   // not reset between messages.
76   // visitor_ = &do_nothing_visitor_;  // not reset between messages.
77   chunk_length_remaining_ = 0;
78   content_length_remaining_ = 0;
79   last_slash_n_loc_ = NULL;
80   last_recorded_slash_n_loc_ = NULL;
81   last_slash_n_idx_ = 0;
82   term_chars_ = 0;
83   parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE;
84   last_error_ = BalsaFrameEnums::NO_ERROR;
85   lines_.clear();
86   if (headers_ != NULL) {
87     headers_->Clear();
88   }
89 }
90 
ParseStateToString(BalsaFrameEnums::ParseState error_code)91 const char* BalsaFrameEnums::ParseStateToString(
92     BalsaFrameEnums::ParseState error_code) {
93   switch (error_code) {
94     case PARSE_ERROR:
95       return "PARSE_ERROR";
96     case READING_HEADER_AND_FIRSTLINE:
97       return "READING_HEADER_AND_FIRSTLINE";
98     case READING_CHUNK_LENGTH:
99       return "READING_CHUNK_LENGTH";
100     case READING_CHUNK_EXTENSION:
101       return "READING_CHUNK_EXTENSION";
102     case READING_CHUNK_DATA:
103       return "READING_CHUNK_DATA";
104     case READING_CHUNK_TERM:
105       return "READING_CHUNK_TERM";
106     case READING_LAST_CHUNK_TERM:
107       return "READING_LAST_CHUNK_TERM";
108     case READING_TRAILER:
109       return "READING_TRAILER";
110     case READING_UNTIL_CLOSE:
111       return "READING_UNTIL_CLOSE";
112     case READING_CONTENT:
113       return "READING_CONTENT";
114     case MESSAGE_FULLY_READ:
115       return "MESSAGE_FULLY_READ";
116     case NUM_STATES:
117       return "UNKNOWN_STATE";
118   }
119   return "UNKNOWN_STATE";
120 }
121 
ErrorCodeToString(BalsaFrameEnums::ErrorCode error_code)122 const char* BalsaFrameEnums::ErrorCodeToString(
123     BalsaFrameEnums::ErrorCode error_code) {
124   switch (error_code) {
125     case NO_ERROR:
126       return "NO_ERROR";
127     case NO_STATUS_LINE_IN_RESPONSE:
128       return "NO_STATUS_LINE_IN_RESPONSE";
129     case NO_REQUEST_LINE_IN_REQUEST:
130       return "NO_REQUEST_LINE_IN_REQUEST";
131     case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION:
132       return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION";
133     case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD:
134       return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD";
135     case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE:
136       return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE";
137     case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI:
138       return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI";
139     case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE:
140       return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE";
141     case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION:
142       return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION";
143     case FAILED_CONVERTING_STATUS_CODE_TO_INT:
144       return "FAILED_CONVERTING_STATUS_CODE_TO_INT";
145     case REQUEST_URI_TOO_LONG:
146       return "REQUEST_URI_TOO_LONG";
147     case HEADERS_TOO_LONG:
148       return "HEADERS_TOO_LONG";
149     case UNPARSABLE_CONTENT_LENGTH:
150       return "UNPARSABLE_CONTENT_LENGTH";
151     case MAYBE_BODY_BUT_NO_CONTENT_LENGTH:
152       return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH";
153     case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH:
154       return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH";
155     case HEADER_MISSING_COLON:
156       return "HEADER_MISSING_COLON";
157     case INVALID_CHUNK_LENGTH:
158       return "INVALID_CHUNK_LENGTH";
159     case CHUNK_LENGTH_OVERFLOW:
160       return "CHUNK_LENGTH_OVERFLOW";
161     case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO:
162       return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO";
163     case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT:
164       return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT";
165     case MULTIPLE_CONTENT_LENGTH_KEYS:
166       return "MULTIPLE_CONTENT_LENGTH_KEYS";
167     case MULTIPLE_TRANSFER_ENCODING_KEYS:
168       return "MULTIPLE_TRANSFER_ENCODING_KEYS";
169     case UNKNOWN_TRANSFER_ENCODING:
170       return "UNKNOWN_TRANSFER_ENCODING";
171     case INVALID_HEADER_FORMAT:
172       return "INVALID_HEADER_FORMAT";
173     case INTERNAL_LOGIC_ERROR:
174       return "INTERNAL_LOGIC_ERROR";
175     case NUM_ERROR_CODES:
176       return "UNKNOWN_ERROR";
177   }
178   return "UNKNOWN_ERROR";
179 }
180 
181 // Summary:
182 //     Parses the first line of either a request or response.
183 //     Note that in the case of a detected warning, error_code will be set
184 //   but the function will not return false.
185 //     Exactly zero or one warning or error (but not both) may be detected
186 //   by this function.
187 //     Note that this function will not write the data of the first-line
188 //   into the header's buffer (that should already have been done elsewhere).
189 //
190 // Pre-conditions:
191 //     begin != end
192 //     *begin should be a character which is > ' '. This implies that there
193 //   is at least one non-whitespace character between [begin, end).
194 //   headers is a valid pointer to a BalsaHeaders class.
195 //     error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value.
196 //     Entire first line must exist between [begin, end)
197 //     Exactly zero or one newlines -may- exist between [begin, end)
198 //     [begin, end) should exist in the header's buffer.
199 //
200 // Side-effects:
201 //   headers will be modified
202 //   error_code may be modified if either a warning or error is detected
203 //
204 // Returns:
205 //   True if no error (as opposed to warning) is detected.
206 //   False if an error (as opposed to warning) is detected.
207 
208 //
209 // If there is indeed non-whitespace in the line, then the following
210 // will take care of this for you:
211 //  while (*begin <= ' ') ++begin;
212 //  ParseHTTPFirstLine(begin, end, is_request, max_request_uri_length,
//                     &headers, &error_code);
213 //
// Parses the HTTP request/status line held in [begin, end) and records the
// boundaries of its (up to three) tokens in |headers| as offsets from |begin|.
// For responses the second token is additionally converted to an integer and
// stored in headers->parsed_response_code_.
ParseHTTPFirstLine(const char * begin,const char * end,bool is_request,size_t max_request_uri_length,BalsaHeaders * headers,BalsaFrameEnums::ErrorCode * error_code)214 bool ParseHTTPFirstLine(const char* begin,
215                         const char* end,
216                         bool is_request,
217                         size_t max_request_uri_length,
218                         BalsaHeaders* headers,
219                         BalsaFrameEnums::ErrorCode* error_code) {
220   const char* current = begin;
221   // HTTP firstlines all have the following structure:
222   //  LWS         NONWS  LWS    NONWS   LWS    NONWS   NOTCRLF  CRLF
223   //  [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n"
224   //  ws1        nws1    ws2    nws2    ws3    nws3             ws4
225   //  |          [-------)      [-------)      [----------------)
226   //    REQ:     method         request_uri    version
227   //   RESP:     version        statuscode     reason
228   //
229   //   The first NONWS->LWS component we'll call firstline_a.
230   //   The second firstline_b, and the third firstline_c.
231   //
232   //   firstline_a goes from nws1 to (but not including) ws2
233   //   firstline_b goes from nws2 to (but not including) ws3
234   //   firstline_c goes from nws3 to (but not including) ws4
235   //
236   // In the code:
237   //    ws1 == whitespace_1_idx_
238   //   nws1 == non_whitespace_1_idx_
239   //    ws2 == whitespace_2_idx_
240   //   nws2 == non_whitespace_2_idx_
241   //    ws3 == whitespace_3_idx_
242   //   nws3 == non_whitespace_3_idx_
243   //    ws4 == whitespace_4_idx_
244 
245   // Kill all whitespace (including '\r\n') at the end of the line.
  // The last character of the range must be '\n'; anything else means the
  // caller handed over something that is not a complete line.
  // NOTE(review): the "<= ' '" / "> ' '" tests in this function compare plain
  // char.  Where char is signed, bytes >= 0x80 are negative and therefore
  // classified as whitespace -- confirm that this is intended.
246   --end;
247   if (*end != '\n') {
248     *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
249     LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
250                 << headers->OriginalHeadersForDebugging();
251     return false;
252   }
253   while (begin < end && *end <= ' ') {
254     --end;
255   }
  // If the loop stopped at begin and *begin is still '\n', the range held no
  // non-whitespace at all, which violates the documented precondition.
256   DCHECK(*end != '\n');
257   if (*end == '\n') {
258     *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
259     LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
260                 << headers->OriginalHeadersForDebugging();
261     return false;
262   }
  // Make end exclusive again: it now points one past the last
  // non-whitespace character of the line.
263   ++end;
264 
265   // The following condition should not be possible.
266   if (end == begin) {
267     *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
268     LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
269                 << headers->OriginalHeadersForDebugging();
270     return false;
271   }
272 
273   // whitespace_1_idx_
274   headers->whitespace_1_idx_ = current - begin;
275   // This loop is commented out as it is never used in current code.  This is
276   // true only because we don't begin parsing the headers at all until we've
277   // encountered a non whitespace character at the beginning of the stream, at
278   // which point we begin our demarcation of header-start.  If we did -not- do
279   // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop
280   // would be necessary for the proper functioning of this parsing.
281   // This is left here as this function may (in the future) be refactored out
282   // of the BalsaFrame class so that it may be shared between code in
283   // BalsaFrame and BalsaHeaders (where it would be used in some variant of the
284   // set_first_line() function (at which point it would be necessary).
285 #if 0
286   while (*current <= ' ') {
287     ++current;
288   }
289 #endif
290   // non_whitespace_1_idx_
291   headers->non_whitespace_1_idx_ = current - begin;
  // Scan the first token (request method / response version).
292   do {
293     // The first time through, we're guaranteed that the current character
294     // won't be a whitespace (else the loop above wouldn't have terminated).
295     // That implies that we're guaranteed to get at least one non-whitespace
296     // character if we get into this loop at all.
    // (With the loop above compiled out, this relies instead on the
    // precondition that *begin > ' '.)
297     ++current;
298     if (current == end) {
      // The line consists of a single token: collapse every remaining
      // boundary onto the end of the line.
299       headers->whitespace_2_idx_ = current - begin;
300       headers->non_whitespace_2_idx_ = current - begin;
301       headers->whitespace_3_idx_ = current - begin;
302       headers->non_whitespace_3_idx_ = current - begin;
303       headers->whitespace_4_idx_ = current - begin;
304       // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD   for request
305       // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response
      // is_request (0 or 1) is added to the response enumerator to select
      // the request variant; this presumably relies on the two enumerators
      // being adjacent in BalsaFrameEnums -- confirm against the enum.
306       *error_code =
307         static_cast<BalsaFrameEnums::ErrorCode>(
308             BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION +
309             is_request);
      // A one-token response is a hard error; a one-token request only sets
      // error_code and continues to the common tail below.
310       if (!is_request) {  // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION
311         return false;
312       }
313       goto output_exhausted;
314     }
315   } while (*current > ' ');
316   // whitespace_2_idx_
317   headers->whitespace_2_idx_ = current - begin;
318   do {
319     ++current;
320     // Note that due to the loop which consumes all of the whitespace
321     // at the end of the line, current can never == end while in this function.
322   } while (*current <= ' ');
323   // non_whitespace_2_idx_
324   headers->non_whitespace_2_idx_ = current - begin;
  // Scan the second token (request URI / response status code).
325   do {
326     ++current;
327     if (current == end) {
      // The line has only two tokens: the third token is empty and sits at
      // the end of the line.
328       headers->whitespace_3_idx_ = current - begin;
329       headers->non_whitespace_3_idx_ = current - begin;
330       headers->whitespace_4_idx_ = current - begin;
331       // FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI for request
332       // FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE for response
      // (same "+ is_request" selection as above; presumably relies on
      // enumerator adjacency -- confirm.)  Neither case returns false here.
333       *error_code =
334         static_cast<BalsaFrameEnums::ErrorCode>(
335             BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE
336                                  + is_request);
337       goto output_exhausted;
338     }
339   } while (*current > ' ');
340   // whitespace_3_idx_
341   headers->whitespace_3_idx_ = current - begin;
342   do {
343     ++current;
344     // Note that due to the loop which consumes all of the whitespace
345     // at the end of the line, current can never == end while in this function.
346   } while (*current <= ' ');
347   // non_whitespace_3_idx_
  // The third token (request version / response reason phrase) runs to the
  // trimmed end of the line and may itself contain whitespace.
348   headers->non_whitespace_3_idx_ = current - begin;
349   headers->whitespace_4_idx_ = end - begin;
350 
351  output_exhausted:
352   // Note that we don't fail the parse immediately when parsing of the
353   // firstline fails.  Depending on the protocol type, we may want to accept
354   // a firstline with only one or two elements, e.g., for HTTP/0.9:
355   //   GET\r\n
356   // or
357   //   GET /\r\n
358   // should be parsed without issue (though the visitor should know that
359   // parsing the entire line was not exactly as it should be).
360   //
361   // Eventually, these errors may be removed altogether, as the visitor can
362   // detect them on its own by examining the size of the various fields.
363   // headers->set_first_line(non_whitespace_1_idx_, current);
364 
365   if (is_request) {
    // Length of the second token (the request URI) versus the configured cap.
366     if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) >
367         max_request_uri_length) {
368       // For requests, we need at least the method.  We could assume that a
369       // blank URI means "/".  If version isn't stated, it should be assumed
370       // to be HTTP/0.9 by the visitor.
371       *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG;
372       return false;
373     }
374   } else {
375     headers->parsed_response_code_ = 0;
376     {
377       const char* parsed_response_code_current =
378         begin + headers->non_whitespace_2_idx_;
379       const char* parsed_response_code_end = begin + headers->whitespace_3_idx_;
380       const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;
381 
382       // Convert a string of [0-9]* into an int.
383       // Note that this allows for the conversion of response codes which
384       // are outside the bounds of normal HTTP response codes (no checking
385       // is done to ensure that these are valid-- they're merely parsed)!
386       while (parsed_response_code_current < parsed_response_code_end) {
387         if (*parsed_response_code_current < '0' ||
388             *parsed_response_code_current > '9') {
389           *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
390           return false;
391         }
        // Overflow guard: reject when value * 10 would wrap (value >
        // kMaxDiv10) or when adding the digit would exceed the size_t maximum.
392         size_t status_code_x_10 = headers->parsed_response_code_ * 10;
393         uint8 c = *parsed_response_code_current - '0';
394         if ((headers->parsed_response_code_ > kMaxDiv10) ||
395             (std::numeric_limits<size_t>::max() - status_code_x_10) < c) {
396           // overflow.
397           *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
398           return false;
399         }
400         headers->parsed_response_code_ = status_code_x_10 + c;
401         ++parsed_response_code_current;
402       }
403     }
404   }
405   return true;
406 }
407 
408 // begin - beginning of the firstline
409 // end - end of the firstline
410 //
411 // A precondition for this function is that there is non-whitespace between
412 // [begin, end). If this precondition is not met, the function will not perform
413 // as expected (and bad things may happen, and it will eat your first, second,
414 // and third unborn children!).
415 //
416 // Another precondition for this function is that [begin, end) includes
417 // at most one newline, which must be at the end of the line.
ProcessFirstLine(const char * begin,const char * end)418 void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) {
419   BalsaFrameEnums::ErrorCode previous_error = last_error_;
420   if (!ParseHTTPFirstLine(begin,
421                           end,
422                           is_request_,
423                           max_request_uri_length_,
424                           headers_,
425                           &last_error_)) {
426     parse_state_ = BalsaFrameEnums::PARSE_ERROR;
427     visitor_->HandleHeaderError(this);
428     return;
429   }
430   if (previous_error != last_error_) {
431     visitor_->HandleHeaderWarning(this);
432   }
433 
434   if (is_request_) {
435     size_t version_length =
436         headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_;
437     visitor_->ProcessRequestFirstLine(
438         begin + headers_->non_whitespace_1_idx_,
439         headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
440         begin + headers_->non_whitespace_1_idx_,
441         headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
442         begin + headers_->non_whitespace_2_idx_,
443         headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
444         begin + headers_->non_whitespace_3_idx_,
445         version_length);
446     if (version_length == 0)
447       parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
448   } else {
449     visitor_->ProcessResponseFirstLine(
450         begin + headers_->non_whitespace_1_idx_,
451         headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
452         begin + headers_->non_whitespace_1_idx_,
453         headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
454         begin + headers_->non_whitespace_2_idx_,
455         headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
456         begin + headers_->non_whitespace_3_idx_,
457         headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_);
458   }
459 }
460 
461 // 'stream_begin' points to the first character of the headers buffer.
462 // 'line_begin' points to the first character of the line.
463 // 'current' points to a char which is ':'.
464 // 'line_end' points to the position of '\n' + 1.
465 // 'current_header_line' receives the computed key_end_idx and value_begin_idx.
CleanUpKeyValueWhitespace(const char * stream_begin,const char * line_begin,const char * current,const char * line_end,HeaderLineDescription * current_header_line)466 void BalsaFrame::CleanUpKeyValueWhitespace(
467     const char* stream_begin,
468     const char* line_begin,
469     const char* current,
470     const char* line_end,
471     HeaderLineDescription* current_header_line) {
472   const char* colon_loc = current;
473   DCHECK_LT(colon_loc, line_end);
474   DCHECK_EQ(':', *colon_loc);
475   DCHECK_EQ(':', *current);
476   DCHECK_GE(' ', *line_end)
477     << "\"" << std::string(line_begin, line_end) << "\"";
478 
479   // TODO(fenix): Investigate whether or not the bounds tests in the
480   // while loops here are redundant, and if so, remove them.
481   --current;
482   while (current > line_begin && *current <= ' ') --current;
483   current += (current != colon_loc);
484   current_header_line->key_end_idx = current - stream_begin;
485 
486   current = colon_loc;
487   DCHECK_EQ(':', *current);
488   ++current;
489   while (current < line_end && *current <= ' ') ++current;
490   current_header_line->value_begin_idx = current - stream_begin;
491 
492   DCHECK_GE(current_header_line->key_end_idx,
493             current_header_line->first_char_idx);
494   DCHECK_GE(current_header_line->value_begin_idx,
495             current_header_line->key_end_idx);
496   DCHECK_GE(current_header_line->last_char_idx,
497             current_header_line->value_begin_idx);
498 }
499 
// Walks the recorded line table (lines_), merges continuation lines into the
// header line they follow, and appends one HeaderLineDescription per logical
// header line to headers_->header_lines_.  For each such line the first ':'
// is located and the key/value boundaries are filled in; a line without a
// colon triggers a HEADER_MISSING_COLON warning on the visitor and keeps the
// description it was created with.
//
// Entries 1 .. lines_.size() - 2 of lines_ are processed: entry 0 is skipped
// (presumably the first line -- confirm against the caller) and the final
// entry is the terminating blank line.
// NOTE(review): lines_[1] is read below while only !lines_.empty() is
// DCHECKed; the caller presumably guarantees lines_.size() >= 2 -- confirm.
FindColonsAndParseIntoKeyValue()500 inline void BalsaFrame::FindColonsAndParseIntoKeyValue() {
501   DCHECK(!lines_.empty());
502   const char* stream_begin = headers_->OriginalHeaderStreamBegin();
503   // The last line is always just a newline (and is uninteresting).
504   const Lines::size_type lines_size_m1 = lines_.size() - 1;
505 #if __SSE2__
  // Sixteen copies of ':' to compare against 16 header bytes at a time.
506   const __v16qi colons = { ':', ':', ':', ':', ':', ':', ':', ':',
507                            ':', ':', ':', ':', ':', ':', ':', ':'};
  // The vector loop below only runs while current is before this pointer, so
  // that each 16-byte load stays inside the header stream (assuming
  // OriginalHeaderStreamEnd() is one past the last byte -- confirm).
508   const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16;
509 #endif  // __SSE2__
  // 'current' is the colon-search cursor.  It is shared across iterations of
  // the outer loop and may run ahead of the line being processed.
510   const char* current = stream_begin + lines_[1].first;
511   // This code is a bit more subtle than it may appear at first glance.
512   // This code looks for a colon in the current line... but it also looks
513   // beyond the current line. If there is no colon in the current line, then
514   // for each subsequent line (until the colon which -has- been found is
515   // associated with a line), no searching for a colon will be performed. In
516   // this way, we minimize the amount of bytes we have scanned for a colon.
  // Note: i is advanced by the inner continuation loop, not by the for header.
517   for (Lines::size_type i = 1; i < lines_size_m1;) {
518     const char* line_begin = stream_begin + lines_[i].first;
519 
520     // Here we handle possible continuations.  Note that we do not replace
521     // the '\n' in the line before a continuation (at least, as of now),
522     // which implies that any code which looks for a value must deal with
523     // "\r\n", etc -within- the line (and not just at the end of it).
    // A following line whose first character is <= ' ' is treated as a
    // continuation of this header line.
    // NOTE(review): this is a plain-char comparison; where char is signed, a
    // line starting with a byte >= 0x80 also counts as a continuation --
    // confirm that this is intended.
524     for (++i; i < lines_size_m1; ++i) {
525       const char c = *(stream_begin + lines_[i].first);
526       if (c > ' ') {
527         // Not a continuation, so stop.  Note that if the 'original' i = 1,
528         // and the next line is not a continuation, we'll end up with i = 2
529         // when we break. This handles the incrementing of i for the outer
530         // loop.
531         break;
532       }
533     }
    // End of the last physical line that belongs to this logical header line.
534     const char* line_end = stream_begin + lines_[i - 1].second;
535     DCHECK_LT(line_begin - stream_begin, line_end - stream_begin);
536 
537     // We cleanup the whitespace at the end of the line before doing anything
538     // else of interest as it allows us to do nothing when irregularly formatted
539     // headers are parsed (e.g. those with only keys, only values, or no colon).
540     //
541     // We're guaranteed to have *line_end > ' ' while line_end >= line_begin.
542     --line_end;
543     DCHECK_EQ('\n', *line_end)
544       << "\"" << std::string(line_begin, line_end) << "\"";
545     while (*line_end <= ' ' && line_end > line_begin) {
546       --line_end;
547     }
    // line_end now points at the first trailing-whitespace character, i.e. one
    // past the last non-whitespace character of the logical line.
548     ++line_end;
549     DCHECK_GE(' ', *line_end);
550     DCHECK_LT(line_begin, line_end);
551 
552     // We use '0' for the block idx, because we're always writing to the first
553     // block from the framer (we do this because the framer requires that the
554     // entire header sequence be in a contiguous buffer).
    // The three identical line_end offsets are provisional; the first two of
    // them are presumably key_end_idx and value_begin_idx (see the
    // "no colon" comment further down) and are overwritten once a colon is
    // found -- confirm against the HeaderLineDescription constructor.
555     headers_->header_lines_.push_back(
556         HeaderLineDescription(line_begin - stream_begin,
557                               line_end - stream_begin,
558                               line_end - stream_begin,
559                               line_end - stream_begin,
560                               0));
561     if (current >= line_end) {
      // The cursor already sits at or beyond the end of this line (an earlier
      // scan ran ahead), so this line holds no colon.
562       last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
563       visitor_->HandleHeaderWarning(this);
564       // Then the next colon will not be found within this header line-- time
565       // to try again with another header-line.
566       continue;
567     } else if (current < line_begin) {
568       // When this condition is true, the last detected colon was part of a
569       // previous line.  We reset to the beginning of the line as we don't care
570       // about the presence of any colon before the beginning of the current
571       // line.
572       current = line_begin;
573     }
574 #if __SSE2__
    // Vectorised search: load 16 bytes, compare each with ':', and collapse
    // the per-byte results into a 16-bit mask.
575     while (current < header_lines_end_m16) {
576       __m128i header_bytes =
577         _mm_loadu_si128(reinterpret_cast<const __m128i *>(current));
578       __m128i colon_cmp =
579         _mm_cmpeq_epi8(header_bytes, reinterpret_cast<__m128i>(colons));
580       int colon_msk = _mm_movemask_epi8(colon_cmp);
581       if (colon_msk == 0) {
        // No colon in these 16 bytes; keep going, possibly past line_end.
582         current += 16;
583         continue;
584       }
      // ffs() returns the 1-based index of the lowest set bit, i.e. the first
      // colon within the 16 bytes just examined.
585       current += (ffs(colon_msk) - 1);
586       if (current > line_end) {
        // The colon belongs to a later line.  The scalar loop below will not
        // iterate, so this line is reported as missing a colon, and 'current'
        // stays on that colon for a later iteration of the outer loop.
587         break;
588       }
589       goto found_colon;
590     }
591 #endif  // __SSE2__
    // Scalar search over whatever part of the line has not been covered.
592     for (; current < line_end; ++current) {
593       if (*current != ':') {
594         continue;
595       }
596       goto found_colon;
597     }
598     // If we've gotten to here, then there was no colon
599     // in the line. The arguments we passed into the construction
600     // for the HeaderLineDescription object should be OK-- it assumes
601     // that the entire content is 'key' by default (which is true, as
602     // there was no colon, there can be no value). Note that this is a
603     // construct which is technically not allowed by the spec.
604     last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
605     visitor_->HandleHeaderWarning(this);
606     continue;
607  found_colon:
608     DCHECK_EQ(*current, ':');
609     DCHECK_LE(current - stream_begin, line_end - stream_begin);
610     DCHECK_LE(stream_begin - stream_begin, current - stream_begin);
611 
    // Provisionally place both boundaries at the colon; when the colon lies
    // inside the line they are refined by CleanUpKeyValueWhitespace(), which
    // overwrites key_end_idx and value_begin_idx.
612     HeaderLineDescription& current_header_line = headers_->header_lines_.back();
613     current_header_line.key_end_idx = current - stream_begin;
614     current_header_line.value_begin_idx = current_header_line.key_end_idx;
615     if (current < line_end) {
616       ++current_header_line.key_end_idx;
617 
618       CleanUpKeyValueWhitespace(stream_begin,
619                                 line_begin,
620                                 current,
621                                 line_end,
622                                 &current_header_line);
623     }
624   }
625 }
626 
ProcessContentLengthLine(HeaderLines::size_type line_idx,BalsaHeadersEnums::ContentLengthStatus * status,size_t * length)627 void BalsaFrame::ProcessContentLengthLine(
628     HeaderLines::size_type line_idx,
629     BalsaHeadersEnums::ContentLengthStatus* status,
630     size_t* length) {
631   const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
632   const char* stream_begin = headers_->OriginalHeaderStreamBegin();
633   const char* line_end = stream_begin + header_line.last_char_idx;
634   const char* value_begin = (stream_begin + header_line.value_begin_idx);
635 
636   if (value_begin >= line_end) {
637     // There is no non-whitespace value data.
638 #if DEBUGFRAMER
639       LOG(INFO) << "invalid content-length -- no non-whitespace value data";
640 #endif
641     *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
642     return;
643   }
644 
645   *length = 0;
646   while (value_begin < line_end) {
647     if (*value_begin < '0' || *value_begin > '9') {
648       // bad! content-length found, and couldn't parse all of it!
649       *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
650 #if DEBUGFRAMER
651       LOG(INFO) << "invalid content-length - non numeric character detected";
652 #endif  // DEBUGFRAMER
653       return;
654     }
655     const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;
656     size_t length_x_10 = *length * 10;
657     const unsigned char c = *value_begin - '0';
658     if (*length > kMaxDiv10 ||
659         (std::numeric_limits<size_t>::max() - length_x_10) < c) {
660       *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW;
661 #if DEBUGFRAMER
662       LOG(INFO) << "content-length overflow";
663 #endif  // DEBUGFRAMER
664       return;
665     }
666     *length = length_x_10 + c;
667     ++value_begin;
668   }
669 #if DEBUGFRAMER
670   LOG(INFO) << "content_length parsed: " << *length;
671 #endif  // DEBUGFRAMER
672   *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH;
673 }
674 
ProcessTransferEncodingLine(HeaderLines::size_type line_idx)675 void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) {
676   const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
677   const char* stream_begin = headers_->OriginalHeaderStreamBegin();
678   const char* line_end = stream_begin + header_line.last_char_idx;
679   const char* value_begin = stream_begin + header_line.value_begin_idx;
680   size_t value_length = line_end - value_begin;
681 
682   if ((value_length == 7) &&
683       !strncasecmp(value_begin, "chunked", 7)) {
684     headers_->transfer_encoding_is_chunked_ = true;
685   } else if ((value_length == 8) &&
686       !strncasecmp(value_begin, "identity", 8)) {
687     headers_->transfer_encoding_is_chunked_ = false;
688   } else {
689     last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING;
690     parse_state_ = BalsaFrameEnums::PARSE_ERROR;
691     visitor_->HandleHeaderError(this);
692     return;
693   }
694 }
695 
696 namespace {
SplitStringPiece(base::StringPiece original,char delim,base::StringPiece * before,base::StringPiece * after)697 bool SplitStringPiece(base::StringPiece original, char delim,
698                       base::StringPiece* before, base::StringPiece* after) {
699   const char* p = original.data();
700   const char* end = p + original.size();
701 
702   while (p != end) {
703     if (*p == delim) {
704       ++p;
705     } else {
706       const char* start = p;
707       while (++p != end && *p != delim) {
708         // Skip to the next occurence of the delimiter.
709       }
710       *before = base::StringPiece(start, p - start);
711       if (p != end)
712         *after = base::StringPiece(p + 1, end - (p + 1));
713       else
714         *after = base::StringPiece("");
715       StringPieceUtils::RemoveWhitespaceContext(before);
716       StringPieceUtils::RemoveWhitespaceContext(after);
717       return true;
718     }
719   }
720 
721   *before = original;
722   *after = "";
723   return false;
724 }
725 
726 // TODO(phython): Fix this function to properly deal with quoted values.
727 // E.g. ";;foo", "\";;\"", or \"aa;
728 // The last example, the semi-colon is a separator between extensions.
ProcessChunkExtensionsManual(base::StringPiece all_extensions,BalsaHeaders * extensions)729 void ProcessChunkExtensionsManual(base::StringPiece all_extensions,
730                                   BalsaHeaders* extensions) {
731   base::StringPiece extension;
732   base::StringPiece remaining;
733   StringPieceUtils::RemoveWhitespaceContext(&all_extensions);
734   SplitStringPiece(all_extensions, ';', &extension, &remaining);
735   while (!extension.empty()) {
736     base::StringPiece key;
737     base::StringPiece value;
738     SplitStringPiece(extension, '=', &key, &value);
739     if (!value.empty()) {
740       // Strip quotation marks if they exist.
741       if (!value.empty() && value[0] == '"')
742         value.remove_prefix(1);
743       if (!value.empty() && value[value.length() - 1] == '"')
744         value.remove_suffix(1);
745     }
746 
747     extensions->AppendHeader(key, value);
748 
749     StringPieceUtils::RemoveWhitespaceContext(&remaining);
750     SplitStringPiece(remaining, ';', &extension, &remaining);
751   }
752 }
753 
754 }  // anonymous namespace
755 
ProcessChunkExtensions(const char * input,size_t size,BalsaHeaders * extensions)756 void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size,
757                                         BalsaHeaders* extensions) {
758   ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions);
759 }
760 
ProcessHeaderLines()761 void BalsaFrame::ProcessHeaderLines() {
762   HeaderLines::size_type content_length_idx = 0;
763   HeaderLines::size_type transfer_encoding_idx = 0;
764 
765   DCHECK(!lines_.empty());
766 #if DEBUGFRAMER
767   LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n";
768 #endif  // DEBUGFRAMER
769 
770   // There is no need to attempt to process headers if no header lines exist.
771   // There are at least two lines in the message which are not header lines.
772   // These two non-header lines are the first line of the message, and the
773   // last line of the message (which is an empty line).
774   // Thus, we test to see if we have more than two lines total before attempting
775   // to parse any header lines.
776   if (lines_.size() > 2) {
777     const char* stream_begin = headers_->OriginalHeaderStreamBegin();
778 
779     // Then, for the rest of the header data, we parse these into key-value
780     // pairs.
781     FindColonsAndParseIntoKeyValue();
782     // At this point, we've parsed all of the headers.  Time to look for those
783     // headers which we require for framing.
784     const HeaderLines::size_type
785       header_lines_size = headers_->header_lines_.size();
786     for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) {
787       const HeaderLineDescription& current_header_line =
788         headers_->header_lines_[i];
789       const char* key_begin =
790         (stream_begin + current_header_line.first_char_idx);
791       const char* key_end = (stream_begin + current_header_line.key_end_idx);
792       const size_t key_len = key_end - key_begin;
793       const char c = *key_begin;
794 #if DEBUGFRAMER
795       LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len)
796                 << " c: '" << c << "' key_len: " << key_len;
797 #endif  // DEBUGFRAMER
798       // If a header begins with either lowercase or uppercase 'c' or 't', then
799       // the header may be one of content-length, connection, content-encoding
800       // or transfer-encoding. These headers are special, as they change the way
801       // that the message is framed, and so the framer is required to search
802       // for them.
803 
804 
805       if (c == 'c' || c == 'C') {
806         if ((key_len == kContentLengthSize) &&
807             0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) {
808           BalsaHeadersEnums::ContentLengthStatus content_length_status =
809             BalsaHeadersEnums::NO_CONTENT_LENGTH;
810           size_t length = 0;
811           ProcessContentLengthLine(i, &content_length_status, &length);
812           if (content_length_idx != 0) {  // then we've already seen one!
813             if ((headers_->content_length_status_ != content_length_status) ||
814                 ((headers_->content_length_status_ ==
815                   BalsaHeadersEnums::VALID_CONTENT_LENGTH) &&
816                  length != headers_->content_length_)) {
817               last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS;
818               parse_state_ = BalsaFrameEnums::PARSE_ERROR;
819               visitor_->HandleHeaderError(this);
820               return;
821             }
822             continue;
823           } else {
824             content_length_idx = i + 1;
825             headers_->content_length_status_ = content_length_status;
826             headers_->content_length_ = length;
827             content_length_remaining_ = length;
828           }
829 
830         }
831       } else if (c == 't' || c == 'T') {
832         if ((key_len == kTransferEncodingSize) &&
833             0 == strncasecmp(key_begin, kTransferEncoding,
834                              kTransferEncodingSize)) {
835           if (transfer_encoding_idx != 0) {
836             last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS;
837             parse_state_ = BalsaFrameEnums::PARSE_ERROR;
838             visitor_->HandleHeaderError(this);
839             return;
840           }
841           transfer_encoding_idx = i + 1;
842         }
843       } else if (i == 0 && (key_len == 0 || c == ' ')) {
844         last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT;
845         parse_state_ = BalsaFrameEnums::PARSE_ERROR;
846         visitor_->HandleHeaderError(this);
847         return;
848       }
849     }
850     if (headers_->transfer_encoding_is_chunked_) {
851       headers_->content_length_ = 0;
852       headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH;
853       content_length_remaining_ = 0;
854     }
855     if (transfer_encoding_idx != 0) {
856       ProcessTransferEncodingLine(transfer_encoding_idx - 1);
857     }
858   }
859 }
860 
AssignParseStateAfterHeadersHaveBeenParsed()861 void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() {
862   // For responses, can't have a body if the request was a HEAD, or if it is
863   // one of these response-codes.  rfc2616 section 4.3
864   parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
865   if (is_request_ ||
866       !(request_was_head_ ||
867         (headers_->parsed_response_code_ >= 100 &&
868          headers_->parsed_response_code_ < 200) ||
869         (headers_->parsed_response_code_ == 204) ||
870         (headers_->parsed_response_code_ == 304))) {
871     // Then we can have a body.
872     if (headers_->transfer_encoding_is_chunked_) {
873       // Note that
874       // if ( Transfer-Encoding: chunked &&  Content-length: )
875       // then Transfer-Encoding: chunked trumps.
876       // This is as specified in the spec.
877       // rfc2616 section 4.4.3
878       parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
879     } else {
880       // Errors parsing content-length definitely can cause
881       // protocol errors/warnings
882       switch (headers_->content_length_status_) {
883         // If we have a content-length, and it is parsed
884         // properly, there are two options.
885         // 1) zero content, in which case the message is done, and
886         // 2) nonzero content, in which case we have to
887         //    consume the body.
888         case BalsaHeadersEnums::VALID_CONTENT_LENGTH:
889           if (headers_->content_length_ == 0) {
890             parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
891           } else {
892             parse_state_ = BalsaFrameEnums::READING_CONTENT;
893           }
894           break;
895         case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW:
896         case BalsaHeadersEnums::INVALID_CONTENT_LENGTH:
897           // If there were characters left-over after parsing the
898           // content length, we should flag an error and stop.
899           parse_state_ = BalsaFrameEnums::PARSE_ERROR;
900           last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH;
901           visitor_->HandleHeaderError(this);
902           break;
903           // We can have: no transfer-encoding, no content length, and no
904           // connection: close...
905           // Unfortunately, this case doesn't seem to be covered in the spec.
906           // We'll assume that the safest thing to do here is what the google
907           // binaries before 2008 already do, which is to assume that
908           // everything until the connection is closed is body.
909         case BalsaHeadersEnums::NO_CONTENT_LENGTH:
910           if (is_request_) {
911             base::StringPiece method = headers_->request_method();
912             // POSTs and PUTs should have a detectable body length.  If they
913             // do not we consider it an error.
914             if ((method.size() == 4 &&
915                  strncmp(method.data(), "POST", 4) == 0) ||
916                 (method.size() == 3 &&
917                  strncmp(method.data(), "PUT", 3) == 0)) {
918               parse_state_ = BalsaFrameEnums::PARSE_ERROR;
919               last_error_ =
920                   BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH;
921               visitor_->HandleHeaderError(this);
922               break;
923             }
924             parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
925           } else {
926             parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE;
927             last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH;
928             visitor_->HandleHeaderWarning(this);
929           }
930           break;
931           // The COV_NF_... statements here provide hints to the apparatus
932           // which computes coverage reports/ratios that this code is never
933           // intended to be executed, and should technically be impossible.
934           // COV_NF_START
935         default:
936           LOG(FATAL) << "Saw a content_length_status: "
937            << headers_->content_length_status_ << " which is unknown.";
938           // COV_NF_END
939       }
940     }
941   }
942 }
943 
ProcessHeaders(const char * message_start,size_t message_length)944 size_t BalsaFrame::ProcessHeaders(const char* message_start,
945                                   size_t message_length) {
946   const char* const original_message_start = message_start;
947   const char* const message_end = message_start + message_length;
948   const char* message_current = message_start;
949   const char* checkpoint = message_start;
950 
951   if (message_length == 0) {
952     goto bottom;
953   }
954 
955   while (message_current < message_end) {
956     size_t base_idx = headers_->GetReadableBytesFromHeaderStream();
957 
958     // Yes, we could use strchr (assuming null termination), or
959     // memchr, but as it turns out that is slower than this tight loop
960     // for the input that we see.
961     if (!saw_non_newline_char_) {
962       do {
963         const char c = *message_current;
964         if (c != '\r' && c != '\n') {
965           if (c <= ' ') {
966             parse_state_ = BalsaFrameEnums::PARSE_ERROR;
967             last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST;
968             visitor_->HandleHeaderError(this);
969             goto bottom;
970           } else {
971             saw_non_newline_char_ = true;
972             checkpoint = message_start = message_current;
973             goto read_real_message;
974           }
975         }
976         ++message_current;
977       } while (message_current < message_end);
978       goto bottom;  // this is necessary to skip 'last_char_was_slash_r' checks
979     } else {
980  read_real_message:
981       // Note that SSE2 can be enabled on certain piii platforms.
982 #if __SSE2__
983       {
984         const char* const message_end_m16 = message_end - 16;
985         __v16qi newlines = { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
986                              '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' };
987         while (message_current < message_end_m16) {
988           // What this does (using compiler intrinsics):
989           //
990           // Load 16 '\n's into an xmm register
991           // Load 16 bytes of currennt message into an xmm register
992           // Do byte-wise equals on those two xmm registers
993           // Take the first bit of each byte, and put that into the first
994           //   16 bits of a mask
995           // If the mask is zero, no '\n' found. increment by 16 and try again
996           // Else scan forward to find the first set bit.
997           // Increment current by the index of the first set bit
998           //   (ffs returns index of first set bit + 1)
999           __m128i msg_bytes =
1000             _mm_loadu_si128(const_cast<__m128i *>(
1001                     reinterpret_cast<const __m128i *>(message_current)));
1002           __m128i newline_cmp =
1003             _mm_cmpeq_epi8(msg_bytes, reinterpret_cast<__m128i>(newlines));
1004           int newline_msk = _mm_movemask_epi8(newline_cmp);
1005           if (newline_msk == 0) {
1006             message_current += 16;
1007             continue;
1008           }
1009           message_current += (ffs(newline_msk) - 1);
1010           const size_t relative_idx = message_current - message_start;
1011           const size_t message_current_idx = 1 + base_idx + relative_idx;
1012           lines_.push_back(std::make_pair(last_slash_n_idx_,
1013                                           message_current_idx));
1014           if (lines_.size() == 1) {
1015             headers_->WriteFromFramer(checkpoint,
1016                                       1 + message_current - checkpoint);
1017             checkpoint = message_current + 1;
1018             const char* begin = headers_->OriginalHeaderStreamBegin();
1019 #if DEBUGFRAMER
1020           LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1021           LOG(INFO) << "is_request_: " << is_request_;
1022 #endif
1023             ProcessFirstLine(begin, begin + lines_[0].second);
1024             if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1025               goto process_lines;
1026             else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1027               goto bottom;
1028           }
1029           const size_t chars_since_last_slash_n = (message_current_idx -
1030                                                    last_slash_n_idx_);
1031           last_slash_n_idx_ = message_current_idx;
1032           if (chars_since_last_slash_n > 2) {
1033             // We have a slash-n, but the last slash n was
1034             // more than 2 characters away from this. Thus, we know
1035             // that this cannot be an end-of-header.
1036             ++message_current;
1037             continue;
1038           }
1039           if ((chars_since_last_slash_n == 1) ||
1040               (((message_current > message_start) &&
1041                 (*(message_current - 1) == '\r')) ||
1042                (last_char_was_slash_r_))) {
1043             goto process_lines;
1044           }
1045           ++message_current;
1046         }
1047       }
1048 #endif  // __SSE2__
1049       while (message_current < message_end) {
1050         if (*message_current != '\n') {
1051           ++message_current;
1052           continue;
1053         }
1054         const size_t relative_idx = message_current - message_start;
1055         const size_t message_current_idx = 1 + base_idx + relative_idx;
1056         lines_.push_back(std::make_pair(last_slash_n_idx_,
1057                                         message_current_idx));
1058         if (lines_.size() == 1) {
1059           headers_->WriteFromFramer(checkpoint,
1060                                     1 + message_current - checkpoint);
1061           checkpoint = message_current + 1;
1062           const char* begin = headers_->OriginalHeaderStreamBegin();
1063 #if DEBUGFRAMER
1064           LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1065           LOG(INFO) << "is_request_: " << is_request_;
1066 #endif
1067           ProcessFirstLine(begin, begin + lines_[0].second);
1068           if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1069             goto process_lines;
1070           else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1071             goto bottom;
1072         }
1073         const size_t chars_since_last_slash_n = (message_current_idx -
1074                                                  last_slash_n_idx_);
1075         last_slash_n_idx_ = message_current_idx;
1076         if (chars_since_last_slash_n > 2) {
1077           // false positive.
1078           ++message_current;
1079           continue;
1080         }
1081         if ((chars_since_last_slash_n == 1) ||
1082             (((message_current > message_start) &&
1083               (*(message_current - 1) == '\r')) ||
1084              (last_char_was_slash_r_))) {
1085           goto process_lines;
1086         }
1087         ++message_current;
1088       }
1089     }
1090     continue;
1091  process_lines:
1092     ++message_current;
1093     DCHECK(message_current >= message_start);
1094     if (message_current > message_start) {
1095       headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1096     }
1097 
1098     // Check if we have exceeded maximum headers length
1099     // Although we check for this limit before and after we call this function
1100     // we check it here as well to make sure that in case the visitor changed
1101     // the max_header_length_ (for example after processing the first line)
1102     // we handle it gracefully.
1103     if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) {
1104       parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1105       last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1106       visitor_->HandleHeaderError(this);
1107       goto bottom;
1108     }
1109 
1110     // Since we know that we won't be writing any more bytes of the header,
1111     // we tell that to the headers object. The headers object may make
1112     // more efficient allocation decisions when this is signaled.
1113     headers_->DoneWritingFromFramer();
1114     {
1115       const char* readable_ptr = NULL;
1116       size_t readable_size = 0;
1117       headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size);
1118       visitor_->ProcessHeaderInput(readable_ptr, readable_size);
1119     }
1120 
1121     // Ok, now that we've written everything into our header buffer, it is
1122     // time to process the header lines (extract proper values for headers
1123     // which are important for framing).
1124     ProcessHeaderLines();
1125     if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1126       goto bottom;
1127     }
1128     AssignParseStateAfterHeadersHaveBeenParsed();
1129     if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1130       goto bottom;
1131     }
1132     visitor_->ProcessHeaders(*headers_);
1133     visitor_->HeaderDone();
1134     if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) {
1135       visitor_->MessageDone();
1136     }
1137     goto bottom;
1138   }
1139   // If we've gotten to here, it means that we've consumed all of the
1140   // available input. We need to record whether or not the last character we
1141   // saw was a '\r' so that a subsequent call to ProcessInput correctly finds
1142   // a header framing that is split across the two calls.
1143   last_char_was_slash_r_ = (*(message_end - 1) == '\r');
1144   DCHECK(message_current >= message_start);
1145   if (message_current > message_start) {
1146     headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1147   }
1148  bottom:
1149   return message_current - original_message_start;
1150 }
1151 
1152 
BytesSafeToSplice() const1153 size_t BalsaFrame::BytesSafeToSplice() const {
1154   switch (parse_state_) {
1155     case BalsaFrameEnums::READING_CHUNK_DATA:
1156       return chunk_length_remaining_;
1157     case BalsaFrameEnums::READING_UNTIL_CLOSE:
1158       return std::numeric_limits<size_t>::max();
1159     case BalsaFrameEnums::READING_CONTENT:
1160       return content_length_remaining_;
1161     default:
1162       return 0;
1163   }
1164 }
1165 
BytesSpliced(size_t bytes_spliced)1166 void BalsaFrame::BytesSpliced(size_t bytes_spliced) {
1167   switch (parse_state_) {
1168     case BalsaFrameEnums::READING_CHUNK_DATA:
1169       if (chunk_length_remaining_ >= bytes_spliced) {
1170         chunk_length_remaining_ -= bytes_spliced;
1171         if (chunk_length_remaining_ == 0) {
1172           parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1173         }
1174         return;
1175       } else {
1176         last_error_ =
1177           BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1178         goto error_exit;
1179       }
1180 
1181     case BalsaFrameEnums::READING_UNTIL_CLOSE:
1182       return;
1183 
1184     case BalsaFrameEnums::READING_CONTENT:
1185       if (content_length_remaining_ >= bytes_spliced) {
1186         content_length_remaining_ -= bytes_spliced;
1187         if (content_length_remaining_ == 0) {
1188           parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1189           visitor_->MessageDone();
1190         }
1191         return;
1192       } else {
1193         last_error_ =
1194           BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1195         goto error_exit;
1196       }
1197 
1198     default:
1199       last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO;
1200       goto error_exit;
1201   }
1202 
1203  error_exit:
1204   parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1205   visitor_->HandleBodyError(this);
1206 };
1207 
1208 // You may note that the state-machine contained within this function has both
1209 // switch and goto labels for nearly the same thing. For instance, the
1210 // following two labels refer to the same code block:
1211 //   label_reading_chunk_data:
1212 //   case BalsaFrameEnums::READING_CHUNK_DATA:
1213 // The 'case' statement is required for the switch statement which occurs when
1214 // ProcessInput is invoked. The goto label is required as the state-machine
1215 // does not use a computed goto in any subsequent operations.
1216 //
1217 // Since several states exit the state machine for various reasons, there is
1218 // also one label at the bottom of the function. When it is appropriate to
1219 // return from the function, that part of the state machine instead issues a
1220 // goto bottom; This results in less code duplication, and makes debugging
1221 // easier (as you can add a statement to a section of code which is guaranteed
// to be invoked when the function is exiting).
ProcessInput(const char * input,size_t size)1223 size_t BalsaFrame::ProcessInput(const char* input, size_t size) {
1224   const char* current = input;
1225   const char* on_entry = current;
1226   const char* end = current + size;
1227 #if DEBUGFRAMER
1228   LOG(INFO) << "\n=============="
1229             << BalsaFrameEnums::ParseStateToString(parse_state_)
1230             << "===============\n";
1231 #endif  // DEBUGFRAMER
1232 
1233   DCHECK(headers_ != NULL);
1234   if (headers_ == NULL) return 0;
1235 
1236   if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1237     const size_t header_length = headers_->GetReadableBytesFromHeaderStream();
1238     // Yes, we still have to check this here as the user can change the
1239     // max_header_length amount!
1240     // Also it is possible that we have reached the maximum allowed header size,
1241     // and we have more to consume (remember we are still inside
1242     // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error.
1243     if (header_length > max_header_length_ ||
1244         (header_length == max_header_length_ && size > 0)) {
1245       parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1246       last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1247       visitor_->HandleHeaderError(this);
1248       goto bottom;
1249     }
1250     size_t bytes_to_process = max_header_length_ - header_length;
1251     if (bytes_to_process > size) {
1252       bytes_to_process = size;
1253     }
1254     current += ProcessHeaders(input, bytes_to_process);
1255     // If we are still reading headers check if we have crossed the headers
1256     // limit. Note that we check for >= as opposed to >. This is because if
1257     // header_length_after equals max_header_length_ and we are still in the
1258     // parse_state_  BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for
1259     // sure that the headers limit will be crossed later on
1260     if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1261       // Note that headers_ is valid only if we are still reading headers.
1262       const size_t header_length_after =
1263           headers_->GetReadableBytesFromHeaderStream();
1264       if (header_length_after >= max_header_length_) {
1265         parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1266         last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1267         visitor_->HandleHeaderError(this);
1268       }
1269     }
1270     goto bottom;
1271   } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ ||
1272              parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1273     // Can do nothing more 'till we're reset.
1274     goto bottom;
1275   }
1276 
1277   while (current < end) {
1278     switch (parse_state_) {
1279  label_reading_chunk_length:
1280       case BalsaFrameEnums::READING_CHUNK_LENGTH:
1281         // In this state we read the chunk length.
1282         // Note that once we hit a character which is not in:
1283         // [0-9;A-Fa-f\n], we transition to a different state.
1284         //
1285         {
1286           // If we used strtol, etc, we'd have to buffer this line.
1287           // This is more annoying than simply doing the conversion
1288           // here. This code accounts for overflow.
1289           static const signed char buf[] = {
1290             // %0  %1  %2  %3  %4  %5  %6  %7  %8  \t  \n  %b  %c  \r  %e  %f
1291                -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1,
1292             // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f
1293                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1294             // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f
1295                -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1296             // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f
1297                 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -2, -1, -1, -1, -1,
1298             // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f
1299                -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1300             // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f
1301                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1302             // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f
1303                -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1304             // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f
1305                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1306           };
1307           // valid cases:
1308           //  "09123\n"                      // -> 09123
1309           //  "09123\r\n"                    // -> 09123
1310           //  "09123  \n"                    // -> 09123
1311           //  "09123  \r\n"                  // -> 09123
1312           //  "09123  12312\n"               // -> 09123
1313           //  "09123  12312\r\n"             // -> 09123
1314           //  "09123; foo=bar\n"             // -> 09123
1315           //  "09123; foo=bar\r\n"           // -> 09123
1316           //  "FFFFFFFFFFFFFFFF\r\n"         // -> FFFFFFFFFFFFFFFF
1317           //  "FFFFFFFFFFFFFFFF 22\r\n"      // -> FFFFFFFFFFFFFFFF
1318           // invalid cases:
1319           // "[ \t]+[^\n]*\n"
1320           // "FFFFFFFFFFFFFFFFF\r\n"  (would overflow)
1321           // "\r\n"
1322           // "\n"
1323           while (current < end) {
1324             const char c = *current;
1325             ++current;
1326             const signed char addition = buf[static_cast<int>(c)];
1327             if (addition >= 0) {
1328               chunk_length_character_extracted_ = true;
1329               size_t length_x_16 = chunk_length_remaining_ * 16;
1330               const size_t kMaxDiv16 = std::numeric_limits<size_t>::max() / 16;
1331               if ((chunk_length_remaining_ > kMaxDiv16) ||
1332                   ((std::numeric_limits<size_t>::max() - length_x_16) <
1333                    static_cast<size_t>(addition))) {
1334                 // overflow -- asked for a chunk-length greater than 2^64 - 1!!
1335                 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1336                 last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW;
1337                 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1338                 visitor_->HandleChunkingError(this);
1339                 goto bottom;
1340               }
1341               chunk_length_remaining_ = length_x_16 + addition;
1342               continue;
1343             }
1344 
1345             if (!chunk_length_character_extracted_ || addition == -1) {
1346               // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no
1347               // characters were converted, or an unexpected character was
1348               // seen.
1349               parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1350               last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH;
1351               visitor_->ProcessBodyInput(on_entry, current - on_entry);
1352               visitor_->HandleChunkingError(this);
1353               goto bottom;
1354             }
1355 
1356             --current;
1357             parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION;
1358             visitor_->ProcessChunkLength(chunk_length_remaining_);
1359             goto label_reading_chunk_extension;
1360           }
1361         }
1362         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1363         goto bottom;  // case BalsaFrameEnums::READING_CHUNK_LENGTH
1364 
1365  label_reading_chunk_extension:
1366       case BalsaFrameEnums::READING_CHUNK_EXTENSION:
1367         {
1368           // TODO(phython): Convert this scanning to be 16 bytes at a time if
1369           // there is data to be read.
1370           const char* extensions_start = current;
1371           size_t extensions_length = 0;
1372           while (current < end) {
1373             const char c = *current;
1374             if (c == '\r' || c == '\n') {
1375               extensions_length =
1376                   (extensions_start == current) ?
1377                   0 :
1378                   current - extensions_start - 1;
1379             }
1380 
1381             ++current;
1382             if (c == '\n') {
1383               chunk_length_character_extracted_ = false;
1384               visitor_->ProcessChunkExtensions(
1385                   extensions_start, extensions_length);
1386               if (chunk_length_remaining_ != 0) {
1387                 parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA;
1388                 goto label_reading_chunk_data;
1389               }
1390               HeaderFramingFound('\n');
1391               parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM;
1392               goto label_reading_last_chunk_term;
1393             }
1394           }
1395           visitor_->ProcessChunkExtensions(
1396               extensions_start, extensions_length);
1397         }
1398 
1399         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1400         goto bottom;  // case BalsaFrameEnums::READING_CHUNK_EXTENSION
1401 
1402  label_reading_chunk_data:
1403       case BalsaFrameEnums::READING_CHUNK_DATA:
1404         while (current < end) {
1405           if (chunk_length_remaining_ == 0) {
1406             break;
1407           }
1408           // read in the chunk
1409           size_t bytes_remaining = end - current;
1410           size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ?
1411             chunk_length_remaining_ : bytes_remaining;
1412           const char* tmp_current = current + consumed_bytes;
1413           visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry);
1414           visitor_->ProcessBodyData(current, consumed_bytes);
1415           on_entry = current = tmp_current;
1416           chunk_length_remaining_ -= consumed_bytes;
1417         }
1418         if (chunk_length_remaining_ == 0) {
1419           parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1420           goto label_reading_chunk_term;
1421         }
1422         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1423         goto bottom;  // case BalsaFrameEnums::READING_CHUNK_DATA
1424 
1425  label_reading_chunk_term:
1426       case BalsaFrameEnums::READING_CHUNK_TERM:
1427         while (current < end) {
1428           const char c = *current;
1429           ++current;
1430 
1431           if (c == '\n') {
1432             parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
1433             goto label_reading_chunk_length;
1434           }
1435         }
1436         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1437         goto bottom;  // case BalsaFrameEnums::READING_CHUNK_TERM
1438 
1439  label_reading_last_chunk_term:
1440       case BalsaFrameEnums::READING_LAST_CHUNK_TERM:
1441         while (current < end) {
1442           const char c = *current;
1443 
1444           if (!HeaderFramingFound(c)) {
1445             // If not, however, since the spec only suggests that the
1446             // client SHOULD indicate the presence of trailers, we get to
1447             // *test* that they did or didn't.
1448             // If all of the bytes we've seen since:
1449             //   OPTIONAL_WS 0 OPTIONAL_STUFF CRLF
1450             // are either '\r', or '\n', then we can assume that we don't yet
1451             // know if we need to parse headers, or if the next byte will make
1452             // the HeaderFramingFound condition (above) true.
1453             if (HeaderFramingMayBeFound()) {
1454               // If true, then we have seen only characters '\r' or '\n'.
1455               ++current;
1456 
1457               // Let's try again! There is no state change here.
1458               continue;
1459             } else {
1460               // If (!HeaderFramingMayBeFound()), then we know that we must be
1461               // reading the first non CRLF character of a trailer.
1462               parse_state_ = BalsaFrameEnums::READING_TRAILER;
1463               visitor_->ProcessBodyInput(on_entry, current - on_entry);
1464               on_entry = current;
1465               goto label_reading_trailer;
1466             }
1467           } else {
1468             // If we've found a "\r\n\r\n", then the message
1469             // is done.
1470             ++current;
1471             parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1472             visitor_->ProcessBodyInput(on_entry, current - on_entry);
1473             visitor_->MessageDone();
1474             goto bottom;
1475           }
1476           break;  // from while loop
1477         }
1478         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1479         goto bottom;  // case BalsaFrameEnums::READING_LAST_CHUNK_TERM
1480 
1481  label_reading_trailer:
1482       case BalsaFrameEnums::READING_TRAILER:
1483         while (current < end) {
1484           const char c = *current;
1485           ++current;
1486           // TODO(fenix): If we ever care about trailers as part of framing,
1487           // deal with them here (see below for part of the 'solution')
1488           // if (LineFramingFound(c)) {
1489           // trailer_lines_.push_back(make_pair(start_of_line_,
1490           //                                   trailer_length_ - 1));
1491           // start_of_line_ = trailer_length_;
1492           // }
1493           if (HeaderFramingFound(c)) {
1494             // ProcessTrailers(visitor_, &trailers_);
1495             parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1496             visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1497             visitor_->MessageDone();
1498             goto bottom;
1499           }
1500         }
1501         visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1502         break;  // case BalsaFrameEnums::READING_TRAILER
1503 
1504         // Note that there is no label:
1505         //   'label_reading_until_close'
1506         // here. This is because the state-machine exits immediately after
1507         // reading the headers instead of transitioning here (as it would
1508         // do if it was consuming all the data it could, all the time).
1509       case BalsaFrameEnums::READING_UNTIL_CLOSE:
1510         {
1511           const size_t bytes_remaining = end - current;
1512           if (bytes_remaining > 0) {
1513             visitor_->ProcessBodyInput(current, bytes_remaining);
1514             visitor_->ProcessBodyData(current, bytes_remaining);
1515             current += bytes_remaining;
1516           }
1517         }
1518         goto bottom;  // case BalsaFrameEnums::READING_UNTIL_CLOSE
1519 
1520         // label_reading_content:
1521       case BalsaFrameEnums::READING_CONTENT:
1522 #if DEBUGFRAMER
1523         LOG(INFO) << "ReadingContent: " << content_length_remaining_;
1524 #endif  // DEBUGFRAMER
1525         while (content_length_remaining_ && current < end) {
1526           // read in the content
1527           const size_t bytes_remaining = end - current;
1528           const size_t consumed_bytes =
1529             (content_length_remaining_ < bytes_remaining) ?
1530             content_length_remaining_ : bytes_remaining;
1531           visitor_->ProcessBodyInput(current, consumed_bytes);
1532           visitor_->ProcessBodyData(current, consumed_bytes);
1533           current += consumed_bytes;
1534           content_length_remaining_ -= consumed_bytes;
1535         }
1536         if (content_length_remaining_ == 0) {
1537           parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1538           visitor_->MessageDone();
1539         }
1540         goto bottom;  // case BalsaFrameEnums::READING_CONTENT
1541 
1542       default:
1543         // The state-machine should never be in a state that isn't handled
1544         // above.  This is a glaring logic error, and we should do something
1545         // drastic to ensure that this gets looked-at and fixed.
1546         LOG(FATAL) << "Unknown state: " << parse_state_  // COV_NF_LINE
1547           << " memory corruption?!";                     // COV_NF_LINE
1548     }
1549   }
1550  bottom:
1551 #if DEBUGFRAMER
1552   LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n"
1553     << std::string(input, current)
1554     << "\n$$$$$$$$$$$$$$"
1555     << BalsaFrameEnums::ParseStateToString(parse_state_)
1556     << "$$$$$$$$$$$$$$$"
1557     << " consumed: " << (current - input);
1558   if (Error()) {
1559     LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode());
1560   }
1561 #endif  // DEBUGFRAMER
1562   return current - input;
1563 }
1564 
1565 }  // namespace net
1566