• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/tools/balsa/balsa_frame.h"
6 
7 // Visual C++ defines _M_IX86_FP as 2 if the /arch:SSE2 compiler option is
8 // specified.
9 #if !defined(__SSE2__) && _M_IX86_FP == 2
10 #define __SSE2__ 1
11 #endif
12 
13 #include <assert.h>
14 #if __SSE2__
15 #include <emmintrin.h>
16 #endif  // __SSE2__
17 
18 #include <limits>
19 #include <string>
20 #include <utility>
21 #include <vector>
22 
23 #include "base/logging.h"
24 #include "base/port.h"
25 #include "base/strings/string_piece.h"
26 #include "net/tools/balsa/balsa_enums.h"
27 #include "net/tools/balsa/balsa_headers.h"
28 #include "net/tools/balsa/balsa_visitor_interface.h"
29 #include "net/tools/balsa/buffer_interface.h"
30 #include "net/tools/balsa/simple_buffer.h"
31 #include "net/tools/balsa/split.h"
32 #include "net/tools/balsa/string_piece_utils.h"
33 
34 #if defined(COMPILER_MSVC)
35 #include <intrin.h>
36 #include <string.h>
37 
38 #pragma intrinsic(_BitScanForward)
39 
ffs(int i)40 static int ffs(int i) {
41   unsigned long index;
42   return _BitScanForward(&index, i) ? index + 1 : 0;
43 }
44 
45 #define strncasecmp _strnicmp
46 #else
47 #include <strings.h>
48 #endif
49 
50 namespace net {
51 
// Names of the headers which influence how an HTTP message is framed and
// therefore need dedicated processing.  Each name is stored in lower case
// together with its length (excluding the terminating NUL).
static const char kContentLength[] = "content-length";
static const char kTransferEncoding[] = "transfer-encoding";
static const size_t kContentLengthSize = sizeof(kContentLength) - 1;
static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1;
58 
BalsaFrame()59 BalsaFrame::BalsaFrame()
60     : last_char_was_slash_r_(false),
61       saw_non_newline_char_(false),
62       start_was_space_(true),
63       chunk_length_character_extracted_(false),
64       is_request_(true),
65       request_was_head_(false),
66       max_header_length_(16 * 1024),
67       max_request_uri_length_(2048),
68       visitor_(&do_nothing_visitor_),
69       chunk_length_remaining_(0),
70       content_length_remaining_(0),
71       last_slash_n_loc_(NULL),
72       last_recorded_slash_n_loc_(NULL),
73       last_slash_n_idx_(0),
74       term_chars_(0),
75       parse_state_(BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE),
76       last_error_(BalsaFrameEnums::NO_ERROR),
77       headers_(NULL) {
78 }
79 
~BalsaFrame()80 BalsaFrame::~BalsaFrame() {}
81 
Reset()82 void BalsaFrame::Reset() {
83   last_char_was_slash_r_ = false;
84   saw_non_newline_char_ = false;
85   start_was_space_ = true;
86   chunk_length_character_extracted_ = false;
87   // is_request_ = true;               // not reset between messages.
88   // request_was_head_ = false;        // not reset between messages.
89   // max_header_length_ = 4096;        // not reset between messages.
90   // max_request_uri_length_ = 2048;   // not reset between messages.
91   // visitor_ = &do_nothing_visitor_;  // not reset between messages.
92   chunk_length_remaining_ = 0;
93   content_length_remaining_ = 0;
94   last_slash_n_loc_ = NULL;
95   last_recorded_slash_n_loc_ = NULL;
96   last_slash_n_idx_ = 0;
97   term_chars_ = 0;
98   parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE;
99   last_error_ = BalsaFrameEnums::NO_ERROR;
100   lines_.clear();
101   if (headers_ != NULL) {
102     headers_->Clear();
103   }
104 }
105 
ParseStateToString(BalsaFrameEnums::ParseState error_code)106 const char* BalsaFrameEnums::ParseStateToString(
107     BalsaFrameEnums::ParseState error_code) {
108   switch (error_code) {
109     case PARSE_ERROR:
110       return "PARSE_ERROR";
111     case READING_HEADER_AND_FIRSTLINE:
112       return "READING_HEADER_AND_FIRSTLINE";
113     case READING_CHUNK_LENGTH:
114       return "READING_CHUNK_LENGTH";
115     case READING_CHUNK_EXTENSION:
116       return "READING_CHUNK_EXTENSION";
117     case READING_CHUNK_DATA:
118       return "READING_CHUNK_DATA";
119     case READING_CHUNK_TERM:
120       return "READING_CHUNK_TERM";
121     case READING_LAST_CHUNK_TERM:
122       return "READING_LAST_CHUNK_TERM";
123     case READING_TRAILER:
124       return "READING_TRAILER";
125     case READING_UNTIL_CLOSE:
126       return "READING_UNTIL_CLOSE";
127     case READING_CONTENT:
128       return "READING_CONTENT";
129     case MESSAGE_FULLY_READ:
130       return "MESSAGE_FULLY_READ";
131     case NUM_STATES:
132       return "UNKNOWN_STATE";
133   }
134   return "UNKNOWN_STATE";
135 }
136 
ErrorCodeToString(BalsaFrameEnums::ErrorCode error_code)137 const char* BalsaFrameEnums::ErrorCodeToString(
138     BalsaFrameEnums::ErrorCode error_code) {
139   switch (error_code) {
140     case NO_ERROR:
141       return "NO_ERROR";
142     case NO_STATUS_LINE_IN_RESPONSE:
143       return "NO_STATUS_LINE_IN_RESPONSE";
144     case NO_REQUEST_LINE_IN_REQUEST:
145       return "NO_REQUEST_LINE_IN_REQUEST";
146     case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION:
147       return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION";
148     case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD:
149       return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD";
150     case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE:
151       return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE";
152     case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI:
153       return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI";
154     case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE:
155       return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE";
156     case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION:
157       return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION";
158     case FAILED_CONVERTING_STATUS_CODE_TO_INT:
159       return "FAILED_CONVERTING_STATUS_CODE_TO_INT";
160     case REQUEST_URI_TOO_LONG:
161       return "REQUEST_URI_TOO_LONG";
162     case HEADERS_TOO_LONG:
163       return "HEADERS_TOO_LONG";
164     case UNPARSABLE_CONTENT_LENGTH:
165       return "UNPARSABLE_CONTENT_LENGTH";
166     case MAYBE_BODY_BUT_NO_CONTENT_LENGTH:
167       return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH";
168     case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH:
169       return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH";
170     case HEADER_MISSING_COLON:
171       return "HEADER_MISSING_COLON";
172     case INVALID_CHUNK_LENGTH:
173       return "INVALID_CHUNK_LENGTH";
174     case CHUNK_LENGTH_OVERFLOW:
175       return "CHUNK_LENGTH_OVERFLOW";
176     case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO:
177       return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO";
178     case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT:
179       return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT";
180     case MULTIPLE_CONTENT_LENGTH_KEYS:
181       return "MULTIPLE_CONTENT_LENGTH_KEYS";
182     case MULTIPLE_TRANSFER_ENCODING_KEYS:
183       return "MULTIPLE_TRANSFER_ENCODING_KEYS";
184     case UNKNOWN_TRANSFER_ENCODING:
185       return "UNKNOWN_TRANSFER_ENCODING";
186     case INVALID_HEADER_FORMAT:
187       return "INVALID_HEADER_FORMAT";
188     case INTERNAL_LOGIC_ERROR:
189       return "INTERNAL_LOGIC_ERROR";
190     case NUM_ERROR_CODES:
191       return "UNKNOWN_ERROR";
192   }
193   return "UNKNOWN_ERROR";
194 }
195 
196 // Summary:
197 //     Parses the first line of either a request or response.
198 //     Note that in the case of a detected warning, error_code will be set
199 //   but the function will not return false.
200 //     Exactly zero or one warning or error (but not both) may be detected
201 //   by this function.
202 //     Note that this function will not write the data of the first-line
203 //   into the header's buffer (that should already have been done elsewhere).
204 //
205 // Pre-conditions:
206 //     begin != end
207 //     *begin should be a character which is > ' '. This implies that there
208 //   is at least one non-whitespace characters between [begin, end).
209 //   headers is a valid pointer to a BalsaHeaders class.
210 //     error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value.
211 //     Entire first line must exist between [begin, end)
212 //     Exactly zero or one newlines -may- exist between [begin, end)
213 //     [begin, end) should exist in the header's buffer.
214 //
215 // Side-effects:
216 //   headers will be modified
217 //   error_code may be modified if either a warning or error is detected
218 //
219 // Returns:
220 //   True if no error (as opposed to warning) is detected.
221 //   False if an error (as opposed to warning) is detected.
222 
223 //
224 // If there is indeed non-whitespace in the line, then the following
225 // will take care of this for you:
226 //  while (*begin <= ' ') ++begin;
227 //  ProcessFirstLine(begin, end, is_request, &headers, &error_code);
228 //
ParseHTTPFirstLine(const char * begin,const char * end,bool is_request,size_t max_request_uri_length,BalsaHeaders * headers,BalsaFrameEnums::ErrorCode * error_code)229 bool ParseHTTPFirstLine(const char* begin,
230                         const char* end,
231                         bool is_request,
232                         size_t max_request_uri_length,
233                         BalsaHeaders* headers,
234                         BalsaFrameEnums::ErrorCode* error_code) {
235   const char* current = begin;
236   // HTTP firstlines all have the following structure:
237   //  LWS         NONWS  LWS    NONWS   LWS    NONWS   NOTCRLF  CRLF
238   //  [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n"
239   //  ws1        nws1    ws2    nws2    ws3    nws3             ws4
240   //  |          [-------)      [-------)      [----------------)
241   //    REQ:     method         request_uri    version
242   //   RESP:     version        statuscode     reason
243   //
244   //   The first NONWS->LWS component we'll call firstline_a.
245   //   The second firstline_b, and the third firstline_c.
246   //
247   //   firstline_a goes from nws1 to (but not including) ws2
248   //   firstline_b goes from nws2 to (but not including) ws3
249   //   firstline_c goes from nws3 to (but not including) ws4
250   //
251   // In the code:
252   //    ws1 == whitespace_1_idx_
253   //   nws1 == non_whitespace_1_idx_
254   //    ws2 == whitespace_2_idx_
255   //   nws2 == non_whitespace_2_idx_
256   //    ws3 == whitespace_3_idx_
257   //   nws3 == non_whitespace_3_idx_
258   //    ws4 == whitespace_4_idx_
259 
260   // Kill all whitespace (including '\r\n') at the end of the line.
261   --end;
262   if (*end != '\n') {
263     *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
264     LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
265                 << headers->OriginalHeadersForDebugging();
266     return false;
267   }
268   while (begin < end && *end <= ' ') {
269     --end;
270   }
271   DCHECK(*end != '\n');
272   if (*end == '\n') {
273     *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
274     LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
275                 << headers->OriginalHeadersForDebugging();
276     return false;
277   }
278   ++end;
279 
280   // The two following statements should not be possible.
281   if (end == begin) {
282     *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
283     LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
284                 << headers->OriginalHeadersForDebugging();
285     return false;
286   }
287 
288   // whitespace_1_idx_
289   headers->whitespace_1_idx_ = current - begin;
290   // This loop is commented out as it is never used in current code.  This is
291   // true only because we don't begin parsing the headers at all until we've
292   // encountered a non whitespace character at the beginning of the stream, at
293   // which point we begin our demarcation of header-start.  If we did -not- do
294   // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop
295   // would be necessary for the proper functioning of this parsing.
296   // This is left here as this function may (in the future) be refactored out
297   // of the BalsaFrame class so that it may be shared between code in
298   // BalsaFrame and BalsaHeaders (where it would be used in some variant of the
299   // set_first_line() function (at which point it would be necessary).
300 #if 0
301   while (*current <= ' ') {
302     ++current;
303   }
304 #endif
305   // non_whitespace_1_idx_
306   headers->non_whitespace_1_idx_ = current - begin;
307   do {
308     // The first time through, we're guaranteed that the current character
309     // won't be a whitespace (else the loop above wouldn't have terminated).
310     // That implies that we're guaranteed to get at least one non-whitespace
311     // character if we get into this loop at all.
312     ++current;
313     if (current == end) {
314       headers->whitespace_2_idx_ = current - begin;
315       headers->non_whitespace_2_idx_ = current - begin;
316       headers->whitespace_3_idx_ = current - begin;
317       headers->non_whitespace_3_idx_ = current - begin;
318       headers->whitespace_4_idx_ = current - begin;
319       // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD   for request
320       // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response
321       *error_code =
322         static_cast<BalsaFrameEnums::ErrorCode>(
323             BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION +
324             is_request);
325       if (!is_request) {  // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION
326         return false;
327       }
328       goto output_exhausted;
329     }
330   } while (*current > ' ');
331   // whitespace_2_idx_
332   headers->whitespace_2_idx_ = current - begin;
333   do {
334     ++current;
335     // Note that due to the loop which consumes all of the whitespace
336     // at the end of the line, current can never == end while in this function.
337   } while (*current <= ' ');
338   // non_whitespace_2_idx_
339   headers->non_whitespace_2_idx_ = current - begin;
340   do {
341     ++current;
342     if (current == end) {
343       headers->whitespace_3_idx_ = current - begin;
344       headers->non_whitespace_3_idx_ = current - begin;
345       headers->whitespace_4_idx_ = current - begin;
346       // FAILED_TO_FIND_START_OF_REQUEST_REQUEST_URI for request
347       // FAILED_TO_FIND_START_OF_RESPONSE_STATUSCODE for response
348       *error_code =
349         static_cast<BalsaFrameEnums::ErrorCode>(
350             BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE
351                                  + is_request);
352       goto output_exhausted;
353     }
354   } while (*current > ' ');
355   // whitespace_3_idx_
356   headers->whitespace_3_idx_ = current - begin;
357   do {
358     ++current;
359     // Note that due to the loop which consumes all of the whitespace
360     // at the end of the line, current can never == end while in this function.
361   } while (*current <= ' ');
362   // non_whitespace_3_idx_
363   headers->non_whitespace_3_idx_ = current - begin;
364   headers->whitespace_4_idx_ = end - begin;
365 
366  output_exhausted:
367   // Note that we don't fail the parse immediately when parsing of the
368   // firstline fails.  Depending on the protocol type, we may want to accept
369   // a firstline with only one or two elements, e.g., for HTTP/0.9:
370   //   GET\r\n
371   // or
372   //   GET /\r\n
373   // should be parsed without issue (though the visitor should know that
374   // parsing the entire line was not exactly as it should be).
375   //
376   // Eventually, these errors may be removed alltogether, as the visitor can
377   // detect them on its own by examining the size of the various fields.
378   // headers->set_first_line(non_whitespace_1_idx_, current);
379 
380   if (is_request) {
381     if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) >
382         max_request_uri_length) {
383       // For requests, we need at least the method.  We could assume that a
384       // blank URI means "/".  If version isn't stated, it should be assumed
385       // to be HTTP/0.9 by the visitor.
386       *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG;
387       return false;
388     }
389   } else {
390     headers->parsed_response_code_ = 0;
391     {
392       const char* parsed_response_code_current =
393         begin + headers->non_whitespace_2_idx_;
394       const char* parsed_response_code_end = begin + headers->whitespace_3_idx_;
395       const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;
396 
397       // Convert a string of [0-9]* into an int.
398       // Note that this allows for the conversion of response codes which
399       // are outside the bounds of normal HTTP response codes (no checking
400       // is done to ensure that these are valid-- they're merely parsed)!
401       while (parsed_response_code_current < parsed_response_code_end) {
402         if (*parsed_response_code_current < '0' ||
403             *parsed_response_code_current > '9') {
404           *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
405           return false;
406         }
407         size_t status_code_x_10 = headers->parsed_response_code_ * 10;
408         uint8 c = *parsed_response_code_current - '0';
409         if ((headers->parsed_response_code_ > kMaxDiv10) ||
410             (std::numeric_limits<size_t>::max() - status_code_x_10) < c) {
411           // overflow.
412           *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
413           return false;
414         }
415         headers->parsed_response_code_ = status_code_x_10 + c;
416         ++parsed_response_code_current;
417       }
418     }
419   }
420   return true;
421 }
422 
423 // begin - beginning of the firstline
424 // end - end of the firstline
425 //
426 // A precondition for this function is that there is non-whitespace between
427 // [begin, end). If this precondition is not met, the function will not perform
428 // as expected (and bad things may happen, and it will eat your first, second,
429 // and third unborn children!).
430 //
431 // Another precondition for this function is that [begin, end) includes
432 // at most one newline, which must be at the end of the line.
ProcessFirstLine(const char * begin,const char * end)433 void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) {
434   BalsaFrameEnums::ErrorCode previous_error = last_error_;
435   if (!ParseHTTPFirstLine(begin,
436                           end,
437                           is_request_,
438                           max_request_uri_length_,
439                           headers_,
440                           &last_error_)) {
441     parse_state_ = BalsaFrameEnums::PARSE_ERROR;
442     visitor_->HandleHeaderError(this);
443     return;
444   }
445   if (previous_error != last_error_) {
446     visitor_->HandleHeaderWarning(this);
447   }
448 
449   if (is_request_) {
450     size_t version_length =
451         headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_;
452     visitor_->ProcessRequestFirstLine(
453         begin + headers_->non_whitespace_1_idx_,
454         headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
455         begin + headers_->non_whitespace_1_idx_,
456         headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
457         begin + headers_->non_whitespace_2_idx_,
458         headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
459         begin + headers_->non_whitespace_3_idx_,
460         version_length);
461     if (version_length == 0)
462       parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
463   } else {
464     visitor_->ProcessResponseFirstLine(
465         begin + headers_->non_whitespace_1_idx_,
466         headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
467         begin + headers_->non_whitespace_1_idx_,
468         headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
469         begin + headers_->non_whitespace_2_idx_,
470         headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
471         begin + headers_->non_whitespace_3_idx_,
472         headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_);
473   }
474 }
475 
476 // 'stream_begin' points to the first character of the headers buffer.
477 // 'line_begin' points to the first character of the line.
478 // 'current' points to a char which is ':'.
479 // 'line_end' points to the position of '\n' + 1.
480 // 'line_begin' points to the position of first character of line.
CleanUpKeyValueWhitespace(const char * stream_begin,const char * line_begin,const char * current,const char * line_end,HeaderLineDescription * current_header_line)481 void BalsaFrame::CleanUpKeyValueWhitespace(
482     const char* stream_begin,
483     const char* line_begin,
484     const char* current,
485     const char* line_end,
486     HeaderLineDescription* current_header_line) {
487   const char* colon_loc = current;
488   DCHECK_LT(colon_loc, line_end);
489   DCHECK_EQ(':', *colon_loc);
490   DCHECK_EQ(':', *current);
491   DCHECK_GE(' ', *line_end)
492     << "\"" << std::string(line_begin, line_end) << "\"";
493 
494   // TODO(fenix): Investigate whether or not the bounds tests in the
495   // while loops here are redundant, and if so, remove them.
496   --current;
497   while (current > line_begin && *current <= ' ') --current;
498   current += (current != colon_loc);
499   current_header_line->key_end_idx = current - stream_begin;
500 
501   current = colon_loc;
502   DCHECK_EQ(':', *current);
503   ++current;
504   while (current < line_end && *current <= ' ') ++current;
505   current_header_line->value_begin_idx = current - stream_begin;
506 
507   DCHECK_GE(current_header_line->key_end_idx,
508             current_header_line->first_char_idx);
509   DCHECK_GE(current_header_line->value_begin_idx,
510             current_header_line->key_end_idx);
511   DCHECK_GE(current_header_line->last_char_idx,
512             current_header_line->value_begin_idx);
513 }
514 
FindColonsAndParseIntoKeyValue()515 inline void BalsaFrame::FindColonsAndParseIntoKeyValue() {
516   DCHECK(!lines_.empty());
517   const char* stream_begin = headers_->OriginalHeaderStreamBegin();
518   // The last line is always just a newline (and is uninteresting).
519   const Lines::size_type lines_size_m1 = lines_.size() - 1;
520 #if __SSE2__
521   const __m128i colons = _mm_set1_epi8(':');
522   const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16;
523 #endif  // __SSE2__
524   const char* current = stream_begin + lines_[1].first;
525   // This code is a bit more subtle than it may appear at first glance.
526   // This code looks for a colon in the current line... but it also looks
527   // beyond the current line. If there is no colon in the current line, then
528   // for each subsequent line (until the colon which -has- been found is
529   // associated with a line), no searching for a colon will be performed. In
530   // this way, we minimize the amount of bytes we have scanned for a colon.
531   for (Lines::size_type i = 1; i < lines_size_m1;) {
532     const char* line_begin = stream_begin + lines_[i].first;
533 
534     // Here we handle possible continuations.  Note that we do not replace
535     // the '\n' in the line before a continuation (at least, as of now),
536     // which implies that any code which looks for a value must deal with
537     // "\r\n", etc -within- the line (and not just at the end of it).
538     for (++i; i < lines_size_m1; ++i) {
539       const char c = *(stream_begin + lines_[i].first);
540       if (c > ' ') {
541         // Not a continuation, so stop.  Note that if the 'original' i = 1,
542         // and the next line is not a continuation, we'll end up with i = 2
543         // when we break. This handles the incrementing of i for the outer
544         // loop.
545         break;
546       }
547     }
548     const char* line_end = stream_begin + lines_[i - 1].second;
549     DCHECK_LT(line_begin - stream_begin, line_end - stream_begin);
550 
551     // We cleanup the whitespace at the end of the line before doing anything
552     // else of interest as it allows us to do nothing when irregularly formatted
553     // headers are parsed (e.g. those with only keys, only values, or no colon).
554     //
555     // We're guaranteed to have *line_end > ' ' while line_end >= line_begin.
556     --line_end;
557     DCHECK_EQ('\n', *line_end)
558       << "\"" << std::string(line_begin, line_end) << "\"";
559     while (*line_end <= ' ' && line_end > line_begin) {
560       --line_end;
561     }
562     ++line_end;
563     DCHECK_GE(' ', *line_end);
564     DCHECK_LT(line_begin, line_end);
565 
566     // We use '0' for the block idx, because we're always writing to the first
567     // block from the framer (we do this because the framer requires that the
568     // entire header sequence be in a contiguous buffer).
569     headers_->header_lines_.push_back(
570         HeaderLineDescription(line_begin - stream_begin,
571                               line_end - stream_begin,
572                               line_end - stream_begin,
573                               line_end - stream_begin,
574                               0));
575     if (current >= line_end) {
576       last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
577       visitor_->HandleHeaderWarning(this);
578       // Then the next colon will not be found within this header line-- time
579       // to try again with another header-line.
580       continue;
581     } else if (current < line_begin) {
582       // When this condition is true, the last detected colon was part of a
583       // previous line.  We reset to the beginning of the line as we don't care
584       // about the presence of any colon before the beginning of the current
585       // line.
586       current = line_begin;
587     }
588 #if __SSE2__
589     while (current < header_lines_end_m16) {
590       __m128i header_bytes =
591         _mm_loadu_si128(reinterpret_cast<const __m128i *>(current));
592       __m128i colon_cmp = _mm_cmpeq_epi8(header_bytes, colons);
593       int colon_msk = _mm_movemask_epi8(colon_cmp);
594       if (colon_msk == 0) {
595         current += 16;
596         continue;
597       }
598       current += (ffs(colon_msk) - 1);
599       if (current > line_end) {
600         break;
601       }
602       goto found_colon;
603     }
604 #endif  // __SSE2__
605     for (; current < line_end; ++current) {
606       if (*current != ':') {
607         continue;
608       }
609       goto found_colon;
610     }
611     // If we've gotten to here, then there was no colon
612     // in the line. The arguments we passed into the construction
613     // for the HeaderLineDescription object should be OK-- it assumes
614     // that the entire content is 'key' by default (which is true, as
615     // there was no colon, there can be no value). Note that this is a
616     // construct which is technically not allowed by the spec.
617     last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
618     visitor_->HandleHeaderWarning(this);
619     continue;
620  found_colon:
621     DCHECK_EQ(*current, ':');
622     DCHECK_LE(current - stream_begin, line_end - stream_begin);
623     DCHECK_LE(stream_begin - stream_begin, current - stream_begin);
624 
625     HeaderLineDescription& current_header_line = headers_->header_lines_.back();
626     current_header_line.key_end_idx = current - stream_begin;
627     current_header_line.value_begin_idx = current_header_line.key_end_idx;
628     if (current < line_end) {
629       ++current_header_line.key_end_idx;
630 
631       CleanUpKeyValueWhitespace(stream_begin,
632                                 line_begin,
633                                 current,
634                                 line_end,
635                                 &current_header_line);
636     }
637   }
638 }
639 
ProcessContentLengthLine(HeaderLines::size_type line_idx,BalsaHeadersEnums::ContentLengthStatus * status,size_t * length)640 void BalsaFrame::ProcessContentLengthLine(
641     HeaderLines::size_type line_idx,
642     BalsaHeadersEnums::ContentLengthStatus* status,
643     size_t* length) {
644   const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
645   const char* stream_begin = headers_->OriginalHeaderStreamBegin();
646   const char* line_end = stream_begin + header_line.last_char_idx;
647   const char* value_begin = (stream_begin + header_line.value_begin_idx);
648 
649   if (value_begin >= line_end) {
650     // There is no non-whitespace value data.
651 #if DEBUGFRAMER
652       LOG(INFO) << "invalid content-length -- no non-whitespace value data";
653 #endif
654     *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
655     return;
656   }
657 
658   *length = 0;
659   while (value_begin < line_end) {
660     if (*value_begin < '0' || *value_begin > '9') {
661       // bad! content-length found, and couldn't parse all of it!
662       *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
663 #if DEBUGFRAMER
664       LOG(INFO) << "invalid content-length - non numeric character detected";
665 #endif  // DEBUGFRAMER
666       return;
667     }
668     const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;
669     size_t length_x_10 = *length * 10;
670     const unsigned char c = *value_begin - '0';
671     if (*length > kMaxDiv10 ||
672         (std::numeric_limits<size_t>::max() - length_x_10) < c) {
673       *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW;
674 #if DEBUGFRAMER
675       LOG(INFO) << "content-length overflow";
676 #endif  // DEBUGFRAMER
677       return;
678     }
679     *length = length_x_10 + c;
680     ++value_begin;
681   }
682 #if DEBUGFRAMER
683   LOG(INFO) << "content_length parsed: " << *length;
684 #endif  // DEBUGFRAMER
685   *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH;
686 }
687 
ProcessTransferEncodingLine(HeaderLines::size_type line_idx)688 void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) {
689   const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
690   const char* stream_begin = headers_->OriginalHeaderStreamBegin();
691   const char* line_end = stream_begin + header_line.last_char_idx;
692   const char* value_begin = stream_begin + header_line.value_begin_idx;
693   size_t value_length = line_end - value_begin;
694 
695   if ((value_length == 7) &&
696       !strncasecmp(value_begin, "chunked", 7)) {
697     headers_->transfer_encoding_is_chunked_ = true;
698   } else if ((value_length == 8) &&
699       !strncasecmp(value_begin, "identity", 8)) {
700     headers_->transfer_encoding_is_chunked_ = false;
701   } else {
702     last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING;
703     parse_state_ = BalsaFrameEnums::PARSE_ERROR;
704     visitor_->HandleHeaderError(this);
705     return;
706   }
707 }
708 
709 namespace {
SplitStringPiece(base::StringPiece original,char delim,base::StringPiece * before,base::StringPiece * after)710 bool SplitStringPiece(base::StringPiece original, char delim,
711                       base::StringPiece* before, base::StringPiece* after) {
712   const char* p = original.data();
713   const char* end = p + original.size();
714 
715   while (p != end) {
716     if (*p == delim) {
717       ++p;
718     } else {
719       const char* start = p;
720       while (++p != end && *p != delim) {
721         // Skip to the next occurence of the delimiter.
722       }
723       *before = base::StringPiece(start, p - start);
724       if (p != end)
725         *after = base::StringPiece(p + 1, end - (p + 1));
726       else
727         *after = base::StringPiece("");
728       StringPieceUtils::RemoveWhitespaceContext(before);
729       StringPieceUtils::RemoveWhitespaceContext(after);
730       return true;
731     }
732   }
733 
734   *before = original;
735   *after = "";
736   return false;
737 }
738 
739 // TODO(phython): Fix this function to properly deal with quoted values.
740 // E.g. ";;foo", "\";;\"", or \"aa;
741 // The last example, the semi-colon is a separator between extensions.
ProcessChunkExtensionsManual(base::StringPiece all_extensions,BalsaHeaders * extensions)742 void ProcessChunkExtensionsManual(base::StringPiece all_extensions,
743                                   BalsaHeaders* extensions) {
744   base::StringPiece extension;
745   base::StringPiece remaining;
746   StringPieceUtils::RemoveWhitespaceContext(&all_extensions);
747   SplitStringPiece(all_extensions, ';', &extension, &remaining);
748   while (!extension.empty()) {
749     base::StringPiece key;
750     base::StringPiece value;
751     SplitStringPiece(extension, '=', &key, &value);
752     if (!value.empty()) {
753       // Strip quotation marks if they exist.
754       if (!value.empty() && value[0] == '"')
755         value.remove_prefix(1);
756       if (!value.empty() && value[value.length() - 1] == '"')
757         value.remove_suffix(1);
758     }
759 
760     extensions->AppendHeader(key, value);
761 
762     StringPieceUtils::RemoveWhitespaceContext(&remaining);
763     SplitStringPiece(remaining, ';', &extension, &remaining);
764   }
765 }
766 
767 }  // anonymous namespace
768 
ProcessChunkExtensions(const char * input,size_t size,BalsaHeaders * extensions)769 void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size,
770                                         BalsaHeaders* extensions) {
771   ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions);
772 }
773 
ProcessHeaderLines()774 void BalsaFrame::ProcessHeaderLines() {
775   HeaderLines::size_type content_length_idx = 0;
776   HeaderLines::size_type transfer_encoding_idx = 0;
777 
778   DCHECK(!lines_.empty());
779 #if DEBUGFRAMER
780   LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n";
781 #endif  // DEBUGFRAMER
782 
783   // There is no need to attempt to process headers if no header lines exist.
784   // There are at least two lines in the message which are not header lines.
785   // These two non-header lines are the first line of the message, and the
786   // last line of the message (which is an empty line).
787   // Thus, we test to see if we have more than two lines total before attempting
788   // to parse any header lines.
789   if (lines_.size() > 2) {
790     const char* stream_begin = headers_->OriginalHeaderStreamBegin();
791 
792     // Then, for the rest of the header data, we parse these into key-value
793     // pairs.
794     FindColonsAndParseIntoKeyValue();
795     // At this point, we've parsed all of the headers.  Time to look for those
796     // headers which we require for framing.
797     const HeaderLines::size_type
798       header_lines_size = headers_->header_lines_.size();
799     for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) {
800       const HeaderLineDescription& current_header_line =
801         headers_->header_lines_[i];
802       const char* key_begin =
803         (stream_begin + current_header_line.first_char_idx);
804       const char* key_end = (stream_begin + current_header_line.key_end_idx);
805       const size_t key_len = key_end - key_begin;
806       const char c = *key_begin;
807 #if DEBUGFRAMER
808       LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len)
809                 << " c: '" << c << "' key_len: " << key_len;
810 #endif  // DEBUGFRAMER
811       // If a header begins with either lowercase or uppercase 'c' or 't', then
812       // the header may be one of content-length, connection, content-encoding
813       // or transfer-encoding. These headers are special, as they change the way
814       // that the message is framed, and so the framer is required to search
815       // for them.
816 
817 
818       if (c == 'c' || c == 'C') {
819         if ((key_len == kContentLengthSize) &&
820             0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) {
821           BalsaHeadersEnums::ContentLengthStatus content_length_status =
822             BalsaHeadersEnums::NO_CONTENT_LENGTH;
823           size_t length = 0;
824           ProcessContentLengthLine(i, &content_length_status, &length);
825           if (content_length_idx != 0) {  // then we've already seen one!
826             if ((headers_->content_length_status_ != content_length_status) ||
827                 ((headers_->content_length_status_ ==
828                   BalsaHeadersEnums::VALID_CONTENT_LENGTH) &&
829                  length != headers_->content_length_)) {
830               last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS;
831               parse_state_ = BalsaFrameEnums::PARSE_ERROR;
832               visitor_->HandleHeaderError(this);
833               return;
834             }
835             continue;
836           } else {
837             content_length_idx = i + 1;
838             headers_->content_length_status_ = content_length_status;
839             headers_->content_length_ = length;
840             content_length_remaining_ = length;
841           }
842 
843         }
844       } else if (c == 't' || c == 'T') {
845         if ((key_len == kTransferEncodingSize) &&
846             0 == strncasecmp(key_begin, kTransferEncoding,
847                              kTransferEncodingSize)) {
848           if (transfer_encoding_idx != 0) {
849             last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS;
850             parse_state_ = BalsaFrameEnums::PARSE_ERROR;
851             visitor_->HandleHeaderError(this);
852             return;
853           }
854           transfer_encoding_idx = i + 1;
855         }
856       } else if (i == 0 && (key_len == 0 || c == ' ')) {
857         last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT;
858         parse_state_ = BalsaFrameEnums::PARSE_ERROR;
859         visitor_->HandleHeaderError(this);
860         return;
861       }
862     }
863     if (headers_->transfer_encoding_is_chunked_) {
864       headers_->content_length_ = 0;
865       headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH;
866       content_length_remaining_ = 0;
867     }
868     if (transfer_encoding_idx != 0) {
869       ProcessTransferEncodingLine(transfer_encoding_idx - 1);
870     }
871   }
872 }
873 
AssignParseStateAfterHeadersHaveBeenParsed()874 void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() {
875   // For responses, can't have a body if the request was a HEAD, or if it is
876   // one of these response-codes.  rfc2616 section 4.3
877   parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
878   if (is_request_ ||
879       !(request_was_head_ ||
880         (headers_->parsed_response_code_ >= 100 &&
881          headers_->parsed_response_code_ < 200) ||
882         (headers_->parsed_response_code_ == 204) ||
883         (headers_->parsed_response_code_ == 304))) {
884     // Then we can have a body.
885     if (headers_->transfer_encoding_is_chunked_) {
886       // Note that
887       // if ( Transfer-Encoding: chunked &&  Content-length: )
888       // then Transfer-Encoding: chunked trumps.
889       // This is as specified in the spec.
890       // rfc2616 section 4.4.3
891       parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
892     } else {
893       // Errors parsing content-length definitely can cause
894       // protocol errors/warnings
895       switch (headers_->content_length_status_) {
896         // If we have a content-length, and it is parsed
897         // properly, there are two options.
898         // 1) zero content, in which case the message is done, and
899         // 2) nonzero content, in which case we have to
900         //    consume the body.
901         case BalsaHeadersEnums::VALID_CONTENT_LENGTH:
902           if (headers_->content_length_ == 0) {
903             parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
904           } else {
905             parse_state_ = BalsaFrameEnums::READING_CONTENT;
906           }
907           break;
908         case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW:
909         case BalsaHeadersEnums::INVALID_CONTENT_LENGTH:
910           // If there were characters left-over after parsing the
911           // content length, we should flag an error and stop.
912           parse_state_ = BalsaFrameEnums::PARSE_ERROR;
913           last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH;
914           visitor_->HandleHeaderError(this);
915           break;
916           // We can have: no transfer-encoding, no content length, and no
917           // connection: close...
918           // Unfortunately, this case doesn't seem to be covered in the spec.
919           // We'll assume that the safest thing to do here is what the google
920           // binaries before 2008 already do, which is to assume that
921           // everything until the connection is closed is body.
922         case BalsaHeadersEnums::NO_CONTENT_LENGTH:
923           if (is_request_) {
924             base::StringPiece method = headers_->request_method();
925             // POSTs and PUTs should have a detectable body length.  If they
926             // do not we consider it an error.
927             if ((method.size() == 4 &&
928                  strncmp(method.data(), "POST", 4) == 0) ||
929                 (method.size() == 3 &&
930                  strncmp(method.data(), "PUT", 3) == 0)) {
931               parse_state_ = BalsaFrameEnums::PARSE_ERROR;
932               last_error_ =
933                   BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH;
934               visitor_->HandleHeaderError(this);
935               break;
936             }
937             parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
938           } else {
939             parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE;
940             last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH;
941             visitor_->HandleHeaderWarning(this);
942           }
943           break;
944           // The COV_NF_... statements here provide hints to the apparatus
945           // which computes coverage reports/ratios that this code is never
946           // intended to be executed, and should technically be impossible.
947           // COV_NF_START
948         default:
949           LOG(FATAL) << "Saw a content_length_status: "
950            << headers_->content_length_status_ << " which is unknown.";
951           // COV_NF_END
952       }
953     }
954   }
955 }
956 
ProcessHeaders(const char * message_start,size_t message_length)957 size_t BalsaFrame::ProcessHeaders(const char* message_start,
958                                   size_t message_length) {
959   const char* const original_message_start = message_start;
960   const char* const message_end = message_start + message_length;
961   const char* message_current = message_start;
962   const char* checkpoint = message_start;
963 
964   if (message_length == 0) {
965     goto bottom;
966   }
967 
968   while (message_current < message_end) {
969     size_t base_idx = headers_->GetReadableBytesFromHeaderStream();
970 
971     // Yes, we could use strchr (assuming null termination), or
972     // memchr, but as it turns out that is slower than this tight loop
973     // for the input that we see.
974     if (!saw_non_newline_char_) {
975       do {
976         const char c = *message_current;
977         if (c != '\r' && c != '\n') {
978           if (c <= ' ') {
979             parse_state_ = BalsaFrameEnums::PARSE_ERROR;
980             last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST;
981             visitor_->HandleHeaderError(this);
982             goto bottom;
983           } else {
984             saw_non_newline_char_ = true;
985             checkpoint = message_start = message_current;
986             goto read_real_message;
987           }
988         }
989         ++message_current;
990       } while (message_current < message_end);
991       goto bottom;  // this is necessary to skip 'last_char_was_slash_r' checks
992     } else {
993  read_real_message:
994       // Note that SSE2 can be enabled on certain piii platforms.
995 #if __SSE2__
996       {
997         const char* const message_end_m16 = message_end - 16;
998         __m128i newlines = _mm_set1_epi8('\n');
999         while (message_current < message_end_m16) {
1000           // What this does (using compiler intrinsics):
1001           //
1002           // Load 16 '\n's into an xmm register
1003           // Load 16 bytes of currennt message into an xmm register
1004           // Do byte-wise equals on those two xmm registers
1005           // Take the first bit of each byte, and put that into the first
1006           //   16 bits of a mask
1007           // If the mask is zero, no '\n' found. increment by 16 and try again
1008           // Else scan forward to find the first set bit.
1009           // Increment current by the index of the first set bit
1010           //   (ffs returns index of first set bit + 1)
1011           __m128i msg_bytes =
1012             _mm_loadu_si128(const_cast<__m128i *>(
1013                     reinterpret_cast<const __m128i *>(message_current)));
1014           __m128i newline_cmp = _mm_cmpeq_epi8(msg_bytes, newlines);
1015           int newline_msk = _mm_movemask_epi8(newline_cmp);
1016           if (newline_msk == 0) {
1017             message_current += 16;
1018             continue;
1019           }
1020           message_current += (ffs(newline_msk) - 1);
1021           const size_t relative_idx = message_current - message_start;
1022           const size_t message_current_idx = 1 + base_idx + relative_idx;
1023           lines_.push_back(std::make_pair(last_slash_n_idx_,
1024                                           message_current_idx));
1025           if (lines_.size() == 1) {
1026             headers_->WriteFromFramer(checkpoint,
1027                                       1 + message_current - checkpoint);
1028             checkpoint = message_current + 1;
1029             const char* begin = headers_->OriginalHeaderStreamBegin();
1030 #if DEBUGFRAMER
1031           LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1032           LOG(INFO) << "is_request_: " << is_request_;
1033 #endif
1034             ProcessFirstLine(begin, begin + lines_[0].second);
1035             if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1036               goto process_lines;
1037             else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1038               goto bottom;
1039           }
1040           const size_t chars_since_last_slash_n = (message_current_idx -
1041                                                    last_slash_n_idx_);
1042           last_slash_n_idx_ = message_current_idx;
1043           if (chars_since_last_slash_n > 2) {
1044             // We have a slash-n, but the last slash n was
1045             // more than 2 characters away from this. Thus, we know
1046             // that this cannot be an end-of-header.
1047             ++message_current;
1048             continue;
1049           }
1050           if ((chars_since_last_slash_n == 1) ||
1051               (((message_current > message_start) &&
1052                 (*(message_current - 1) == '\r')) ||
1053                (last_char_was_slash_r_))) {
1054             goto process_lines;
1055           }
1056           ++message_current;
1057         }
1058       }
1059 #endif  // __SSE2__
1060       while (message_current < message_end) {
1061         if (*message_current != '\n') {
1062           ++message_current;
1063           continue;
1064         }
1065         const size_t relative_idx = message_current - message_start;
1066         const size_t message_current_idx = 1 + base_idx + relative_idx;
1067         lines_.push_back(std::make_pair(last_slash_n_idx_,
1068                                         message_current_idx));
1069         if (lines_.size() == 1) {
1070           headers_->WriteFromFramer(checkpoint,
1071                                     1 + message_current - checkpoint);
1072           checkpoint = message_current + 1;
1073           const char* begin = headers_->OriginalHeaderStreamBegin();
1074 #if DEBUGFRAMER
1075           LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1076           LOG(INFO) << "is_request_: " << is_request_;
1077 #endif
1078           ProcessFirstLine(begin, begin + lines_[0].second);
1079           if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1080             goto process_lines;
1081           else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1082             goto bottom;
1083         }
1084         const size_t chars_since_last_slash_n = (message_current_idx -
1085                                                  last_slash_n_idx_);
1086         last_slash_n_idx_ = message_current_idx;
1087         if (chars_since_last_slash_n > 2) {
1088           // false positive.
1089           ++message_current;
1090           continue;
1091         }
1092         if ((chars_since_last_slash_n == 1) ||
1093             (((message_current > message_start) &&
1094               (*(message_current - 1) == '\r')) ||
1095              (last_char_was_slash_r_))) {
1096           goto process_lines;
1097         }
1098         ++message_current;
1099       }
1100     }
1101     continue;
1102  process_lines:
1103     ++message_current;
1104     DCHECK(message_current >= message_start);
1105     if (message_current > message_start) {
1106       headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1107     }
1108 
1109     // Check if we have exceeded maximum headers length
1110     // Although we check for this limit before and after we call this function
1111     // we check it here as well to make sure that in case the visitor changed
1112     // the max_header_length_ (for example after processing the first line)
1113     // we handle it gracefully.
1114     if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) {
1115       parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1116       last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1117       visitor_->HandleHeaderError(this);
1118       goto bottom;
1119     }
1120 
1121     // Since we know that we won't be writing any more bytes of the header,
1122     // we tell that to the headers object. The headers object may make
1123     // more efficient allocation decisions when this is signaled.
1124     headers_->DoneWritingFromFramer();
1125     {
1126       const char* readable_ptr = NULL;
1127       size_t readable_size = 0;
1128       headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size);
1129       visitor_->ProcessHeaderInput(readable_ptr, readable_size);
1130     }
1131 
1132     // Ok, now that we've written everything into our header buffer, it is
1133     // time to process the header lines (extract proper values for headers
1134     // which are important for framing).
1135     ProcessHeaderLines();
1136     if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1137       goto bottom;
1138     }
1139     AssignParseStateAfterHeadersHaveBeenParsed();
1140     if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1141       goto bottom;
1142     }
1143     visitor_->ProcessHeaders(*headers_);
1144     visitor_->HeaderDone();
1145     if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) {
1146       visitor_->MessageDone();
1147     }
1148     goto bottom;
1149   }
1150   // If we've gotten to here, it means that we've consumed all of the
1151   // available input. We need to record whether or not the last character we
1152   // saw was a '\r' so that a subsequent call to ProcessInput correctly finds
1153   // a header framing that is split across the two calls.
1154   last_char_was_slash_r_ = (*(message_end - 1) == '\r');
1155   DCHECK(message_current >= message_start);
1156   if (message_current > message_start) {
1157     headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1158   }
1159  bottom:
1160   return message_current - original_message_start;
1161 }
1162 
1163 
BytesSafeToSplice() const1164 size_t BalsaFrame::BytesSafeToSplice() const {
1165   switch (parse_state_) {
1166     case BalsaFrameEnums::READING_CHUNK_DATA:
1167       return chunk_length_remaining_;
1168     case BalsaFrameEnums::READING_UNTIL_CLOSE:
1169       return std::numeric_limits<size_t>::max();
1170     case BalsaFrameEnums::READING_CONTENT:
1171       return content_length_remaining_;
1172     default:
1173       return 0;
1174   }
1175 }
1176 
BytesSpliced(size_t bytes_spliced)1177 void BalsaFrame::BytesSpliced(size_t bytes_spliced) {
1178   switch (parse_state_) {
1179     case BalsaFrameEnums::READING_CHUNK_DATA:
1180       if (chunk_length_remaining_ >= bytes_spliced) {
1181         chunk_length_remaining_ -= bytes_spliced;
1182         if (chunk_length_remaining_ == 0) {
1183           parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1184         }
1185         return;
1186       } else {
1187         last_error_ =
1188           BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1189         goto error_exit;
1190       }
1191 
1192     case BalsaFrameEnums::READING_UNTIL_CLOSE:
1193       return;
1194 
1195     case BalsaFrameEnums::READING_CONTENT:
1196       if (content_length_remaining_ >= bytes_spliced) {
1197         content_length_remaining_ -= bytes_spliced;
1198         if (content_length_remaining_ == 0) {
1199           parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1200           visitor_->MessageDone();
1201         }
1202         return;
1203       } else {
1204         last_error_ =
1205           BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1206         goto error_exit;
1207       }
1208 
1209     default:
1210       last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO;
1211       goto error_exit;
1212   }
1213 
1214  error_exit:
1215   parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1216   visitor_->HandleBodyError(this);
1217 };
1218 
1219 // You may note that the state-machine contained within this function has both
1220 // switch and goto labels for nearly the same thing. For instance, the
1221 // following two labels refer to the same code block:
1222 //   label_reading_chunk_data:
1223 //   case BalsaFrameEnums::READING_CHUNK_DATA:
1224 // The 'case' statement is required for the switch statement which occurs when
1225 // ProcessInput is invoked. The goto label is required as the state-machine
1226 // does not use a computed goto in any subsequent operations.
1227 //
1228 // Since several states exit the state machine for various reasons, there is
1229 // also one label at the bottom of the function. When it is appropriate to
1230 // return from the function, that part of the state machine instead issues a
1231 // goto bottom; This results in less code duplication, and makes debugging
1232 // easier (as you can add a statement to a section of code which is guaranteed
// to be invoked when the function is exiting).
ProcessInput(const char * input,size_t size)1234 size_t BalsaFrame::ProcessInput(const char* input, size_t size) {
1235   const char* current = input;
1236   const char* on_entry = current;
1237   const char* end = current + size;
1238 #if DEBUGFRAMER
1239   LOG(INFO) << "\n=============="
1240             << BalsaFrameEnums::ParseStateToString(parse_state_)
1241             << "===============\n";
1242 #endif  // DEBUGFRAMER
1243 
1244   DCHECK(headers_ != NULL);
1245   if (headers_ == NULL) return 0;
1246 
1247   if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1248     const size_t header_length = headers_->GetReadableBytesFromHeaderStream();
1249     // Yes, we still have to check this here as the user can change the
1250     // max_header_length amount!
1251     // Also it is possible that we have reached the maximum allowed header size,
1252     // and we have more to consume (remember we are still inside
1253     // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error.
1254     if (header_length > max_header_length_ ||
1255         (header_length == max_header_length_ && size > 0)) {
1256       parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1257       last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1258       visitor_->HandleHeaderError(this);
1259       goto bottom;
1260     }
1261     size_t bytes_to_process = max_header_length_ - header_length;
1262     if (bytes_to_process > size) {
1263       bytes_to_process = size;
1264     }
1265     current += ProcessHeaders(input, bytes_to_process);
1266     // If we are still reading headers check if we have crossed the headers
1267     // limit. Note that we check for >= as opposed to >. This is because if
1268     // header_length_after equals max_header_length_ and we are still in the
1269     // parse_state_  BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for
1270     // sure that the headers limit will be crossed later on
1271     if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1272       // Note that headers_ is valid only if we are still reading headers.
1273       const size_t header_length_after =
1274           headers_->GetReadableBytesFromHeaderStream();
1275       if (header_length_after >= max_header_length_) {
1276         parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1277         last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1278         visitor_->HandleHeaderError(this);
1279       }
1280     }
1281     goto bottom;
1282   } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ ||
1283              parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1284     // Can do nothing more 'till we're reset.
1285     goto bottom;
1286   }
1287 
1288   while (current < end) {
1289     switch (parse_state_) {
1290  label_reading_chunk_length:
1291       case BalsaFrameEnums::READING_CHUNK_LENGTH:
1292         // In this state we read the chunk length.
1293         // Note that once we hit a character which is not in:
1294         // [0-9;A-Fa-f\n], we transition to a different state.
1295         //
1296         {
1297           // If we used strtol, etc, we'd have to buffer this line.
1298           // This is more annoying than simply doing the conversion
1299           // here. This code accounts for overflow.
1300           static const signed char buf[] = {
1301             // %0  %1  %2  %3  %4  %5  %6  %7  %8  \t  \n  %b  %c  \r  %e  %f
1302                -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1,
1303             // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f
1304                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1305             // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f
1306                -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1307             // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f
1308                 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -2, -1, -1, -1, -1,
1309             // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f
1310                -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1311             // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f
1312                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1313             // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f
1314                -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1315             // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f
1316                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1317           };
1318           // valid cases:
1319           //  "09123\n"                      // -> 09123
1320           //  "09123\r\n"                    // -> 09123
1321           //  "09123  \n"                    // -> 09123
1322           //  "09123  \r\n"                  // -> 09123
1323           //  "09123  12312\n"               // -> 09123
1324           //  "09123  12312\r\n"             // -> 09123
1325           //  "09123; foo=bar\n"             // -> 09123
1326           //  "09123; foo=bar\r\n"           // -> 09123
1327           //  "FFFFFFFFFFFFFFFF\r\n"         // -> FFFFFFFFFFFFFFFF
1328           //  "FFFFFFFFFFFFFFFF 22\r\n"      // -> FFFFFFFFFFFFFFFF
1329           // invalid cases:
1330           // "[ \t]+[^\n]*\n"
1331           // "FFFFFFFFFFFFFFFFF\r\n"  (would overflow)
1332           // "\r\n"
1333           // "\n"
1334           while (current < end) {
1335             const char c = *current;
1336             ++current;
1337             const signed char addition = buf[static_cast<int>(c)];
1338             if (addition >= 0) {
1339               chunk_length_character_extracted_ = true;
1340               size_t length_x_16 = chunk_length_remaining_ * 16;
1341               const size_t kMaxDiv16 = std::numeric_limits<size_t>::max() / 16;
1342               if ((chunk_length_remaining_ > kMaxDiv16) ||
1343                   ((std::numeric_limits<size_t>::max() - length_x_16) <
1344                    static_cast<size_t>(addition))) {
1345                 // overflow -- asked for a chunk-length greater than 2^64 - 1!!
1346                 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1347                 last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW;
1348                 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1349                 visitor_->HandleChunkingError(this);
1350                 goto bottom;
1351               }
1352               chunk_length_remaining_ = length_x_16 + addition;
1353               continue;
1354             }
1355 
1356             if (!chunk_length_character_extracted_ || addition == -1) {
1357               // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no
1358               // characters were converted, or an unexpected character was
1359               // seen.
1360               parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1361               last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH;
1362               visitor_->ProcessBodyInput(on_entry, current - on_entry);
1363               visitor_->HandleChunkingError(this);
1364               goto bottom;
1365             }
1366 
1367             --current;
1368             parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION;
1369             visitor_->ProcessChunkLength(chunk_length_remaining_);
1370             goto label_reading_chunk_extension;
1371           }
1372         }
1373         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1374         goto bottom;  // case BalsaFrameEnums::READING_CHUNK_LENGTH
1375 
1376  label_reading_chunk_extension:
1377       case BalsaFrameEnums::READING_CHUNK_EXTENSION:
1378         {
1379           // TODO(phython): Convert this scanning to be 16 bytes at a time if
1380           // there is data to be read.
1381           const char* extensions_start = current;
1382           size_t extensions_length = 0;
1383           while (current < end) {
1384             const char c = *current;
1385             if (c == '\r' || c == '\n') {
1386               extensions_length =
1387                   (extensions_start == current) ?
1388                   0 :
1389                   current - extensions_start - 1;
1390             }
1391 
1392             ++current;
1393             if (c == '\n') {
1394               chunk_length_character_extracted_ = false;
1395               visitor_->ProcessChunkExtensions(
1396                   extensions_start, extensions_length);
1397               if (chunk_length_remaining_ != 0) {
1398                 parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA;
1399                 goto label_reading_chunk_data;
1400               }
1401               HeaderFramingFound('\n');
1402               parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM;
1403               goto label_reading_last_chunk_term;
1404             }
1405           }
1406           visitor_->ProcessChunkExtensions(
1407               extensions_start, extensions_length);
1408         }
1409 
1410         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1411         goto bottom;  // case BalsaFrameEnums::READING_CHUNK_EXTENSION
1412 
1413  label_reading_chunk_data:
1414       case BalsaFrameEnums::READING_CHUNK_DATA:
1415         while (current < end) {
1416           if (chunk_length_remaining_ == 0) {
1417             break;
1418           }
1419           // read in the chunk
1420           size_t bytes_remaining = end - current;
1421           size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ?
1422             chunk_length_remaining_ : bytes_remaining;
1423           const char* tmp_current = current + consumed_bytes;
1424           visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry);
1425           visitor_->ProcessBodyData(current, consumed_bytes);
1426           on_entry = current = tmp_current;
1427           chunk_length_remaining_ -= consumed_bytes;
1428         }
1429         if (chunk_length_remaining_ == 0) {
1430           parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1431           goto label_reading_chunk_term;
1432         }
1433         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1434         goto bottom;  // case BalsaFrameEnums::READING_CHUNK_DATA
1435 
1436  label_reading_chunk_term:
1437       case BalsaFrameEnums::READING_CHUNK_TERM:
1438         while (current < end) {
1439           const char c = *current;
1440           ++current;
1441 
1442           if (c == '\n') {
1443             parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
1444             goto label_reading_chunk_length;
1445           }
1446         }
1447         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1448         goto bottom;  // case BalsaFrameEnums::READING_CHUNK_TERM
1449 
1450  label_reading_last_chunk_term:
1451       case BalsaFrameEnums::READING_LAST_CHUNK_TERM:
1452         while (current < end) {
1453           const char c = *current;
1454 
1455           if (!HeaderFramingFound(c)) {
1456             // If not, however, since the spec only suggests that the
1457             // client SHOULD indicate the presence of trailers, we get to
1458             // *test* that they did or didn't.
1459             // If all of the bytes we've seen since:
1460             //   OPTIONAL_WS 0 OPTIONAL_STUFF CRLF
1461             // are either '\r', or '\n', then we can assume that we don't yet
1462             // know if we need to parse headers, or if the next byte will make
1463             // the HeaderFramingFound condition (above) true.
1464             if (HeaderFramingMayBeFound()) {
1465               // If true, then we have seen only characters '\r' or '\n'.
1466               ++current;
1467 
1468               // Let's try again! There is no state change here.
1469               continue;
1470             } else {
1471               // If (!HeaderFramingMayBeFound()), then we know that we must be
1472               // reading the first non CRLF character of a trailer.
1473               parse_state_ = BalsaFrameEnums::READING_TRAILER;
1474               visitor_->ProcessBodyInput(on_entry, current - on_entry);
1475               on_entry = current;
1476               goto label_reading_trailer;
1477             }
1478           } else {
1479             // If we've found a "\r\n\r\n", then the message
1480             // is done.
1481             ++current;
1482             parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1483             visitor_->ProcessBodyInput(on_entry, current - on_entry);
1484             visitor_->MessageDone();
1485             goto bottom;
1486           }
1487           break;  // from while loop
1488         }
1489         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1490         goto bottom;  // case BalsaFrameEnums::READING_LAST_CHUNK_TERM
1491 
1492  label_reading_trailer:
1493       case BalsaFrameEnums::READING_TRAILER:
1494         while (current < end) {
1495           const char c = *current;
1496           ++current;
1497           // TODO(fenix): If we ever care about trailers as part of framing,
1498           // deal with them here (see below for part of the 'solution')
1499           // if (LineFramingFound(c)) {
1500           // trailer_lines_.push_back(make_pair(start_of_line_,
1501           //                                   trailer_length_ - 1));
1502           // start_of_line_ = trailer_length_;
1503           // }
1504           if (HeaderFramingFound(c)) {
1505             // ProcessTrailers(visitor_, &trailers_);
1506             parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1507             visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1508             visitor_->MessageDone();
1509             goto bottom;
1510           }
1511         }
1512         visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1513         break;  // case BalsaFrameEnums::READING_TRAILER
1514 
1515         // Note that there is no label:
1516         //   'label_reading_until_close'
1517         // here. This is because the state-machine exits immediately after
1518         // reading the headers instead of transitioning here (as it would
1519         // do if it was consuming all the data it could, all the time).
1520       case BalsaFrameEnums::READING_UNTIL_CLOSE:
1521         {
1522           const size_t bytes_remaining = end - current;
1523           if (bytes_remaining > 0) {
1524             visitor_->ProcessBodyInput(current, bytes_remaining);
1525             visitor_->ProcessBodyData(current, bytes_remaining);
1526             current += bytes_remaining;
1527           }
1528         }
1529         goto bottom;  // case BalsaFrameEnums::READING_UNTIL_CLOSE
1530 
1531         // label_reading_content:
1532       case BalsaFrameEnums::READING_CONTENT:
1533 #if DEBUGFRAMER
1534         LOG(INFO) << "ReadingContent: " << content_length_remaining_;
1535 #endif  // DEBUGFRAMER
1536         while (content_length_remaining_ && current < end) {
1537           // read in the content
1538           const size_t bytes_remaining = end - current;
1539           const size_t consumed_bytes =
1540             (content_length_remaining_ < bytes_remaining) ?
1541             content_length_remaining_ : bytes_remaining;
1542           visitor_->ProcessBodyInput(current, consumed_bytes);
1543           visitor_->ProcessBodyData(current, consumed_bytes);
1544           current += consumed_bytes;
1545           content_length_remaining_ -= consumed_bytes;
1546         }
1547         if (content_length_remaining_ == 0) {
1548           parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1549           visitor_->MessageDone();
1550         }
1551         goto bottom;  // case BalsaFrameEnums::READING_CONTENT
1552 
1553       default:
1554         // The state-machine should never be in a state that isn't handled
1555         // above.  This is a glaring logic error, and we should do something
1556         // drastic to ensure that this gets looked-at and fixed.
1557         LOG(FATAL) << "Unknown state: " << parse_state_  // COV_NF_LINE
1558           << " memory corruption?!";                     // COV_NF_LINE
1559     }
1560   }
1561  bottom:
1562 #if DEBUGFRAMER
1563   LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n"
1564     << std::string(input, current)
1565     << "\n$$$$$$$$$$$$$$"
1566     << BalsaFrameEnums::ParseStateToString(parse_state_)
1567     << "$$$$$$$$$$$$$$$"
1568     << " consumed: " << (current - input);
1569   if (Error()) {
1570     LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode());
1571   }
1572 #endif  // DEBUGFRAMER
1573   return current - input;
1574 }
1575 
1576 }  // namespace net
1577