• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/tools/flip_server/balsa_frame.h"
6 
7 #include <assert.h>
8 #include <emmintrin.h>
9 #include <strings.h>
10 
11 #include <limits>
12 #include <iostream>
13 #include <string>
14 #include <utility>
15 #include <vector>
16 
17 #include "base/logging.h"
18 #include "base/port.h"
19 #include "base/string_piece.h"
20 #include "net/tools/flip_server/balsa_enums.h"
21 #include "net/tools/flip_server/balsa_headers.h"
22 #include "net/tools/flip_server/balsa_visitor_interface.h"
23 #include "net/tools/flip_server/buffer_interface.h"
24 #include "net/tools/flip_server/simple_buffer.h"
25 #include "net/tools/flip_server/split.h"
26 #include "net/tools/flip_server/string_piece_utils.h"
27 
28 namespace net {
29 
// Lower-case names of the headers that influence how an HTTP message is
// framed (and therefore need special processing), followed by the length of
// each name excluding its terminating NUL.
static const char kContentLength[] = "content-length";
static const char kTransferEncoding[] = "transfer-encoding";
static const size_t kContentLengthSize = sizeof(kContentLength) - 1;
static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1;
36 
Reset()37 void BalsaFrame::Reset() {
38   last_char_was_slash_r_ = false;
39   saw_non_newline_char_ = false;
40   start_was_space_ = true;
41   chunk_length_character_extracted_ = false;
42   // is_request_ = true;               // not reset between messages.
43   // request_was_head_ = false;        // not reset between messages.
44   // max_header_length_ = 4096;        // not reset between messages.
45   // max_request_uri_length_ = 2048;   // not reset between messages.
46   // visitor_ = &do_nothing_visitor_;  // not reset between messages.
47   chunk_length_remaining_ = 0;
48   content_length_remaining_ = 0;
49   last_slash_n_loc_ = NULL;
50   last_recorded_slash_n_loc_ = NULL;
51   last_slash_n_idx_ = 0;
52   term_chars_ = 0;
53   parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE;
54   last_error_ = BalsaFrameEnums::NO_ERROR;
55   lines_.clear();
56   if (headers_ != NULL) {
57     headers_->Clear();
58   }
59 }
60 
ParseStateToString(BalsaFrameEnums::ParseState error_code)61 const char* BalsaFrameEnums::ParseStateToString(
62     BalsaFrameEnums::ParseState error_code) {
63   switch (error_code) {
64     case ERROR:
65       return "ERROR";
66     case READING_HEADER_AND_FIRSTLINE:
67       return "READING_HEADER_AND_FIRSTLINE";
68     case READING_CHUNK_LENGTH:
69       return "READING_CHUNK_LENGTH";
70     case READING_CHUNK_EXTENSION:
71       return "READING_CHUNK_EXTENSION";
72     case READING_CHUNK_DATA:
73       return "READING_CHUNK_DATA";
74     case READING_CHUNK_TERM:
75       return "READING_CHUNK_TERM";
76     case READING_LAST_CHUNK_TERM:
77       return "READING_LAST_CHUNK_TERM";
78     case READING_TRAILER:
79       return "READING_TRAILER";
80     case READING_UNTIL_CLOSE:
81       return "READING_UNTIL_CLOSE";
82     case READING_CONTENT:
83       return "READING_CONTENT";
84     case MESSAGE_FULLY_READ:
85       return "MESSAGE_FULLY_READ";
86     case NUM_STATES:
87       return "UNKNOWN_STATE";
88   }
89   return "UNKNOWN_STATE";
90 }
91 
ErrorCodeToString(BalsaFrameEnums::ErrorCode error_code)92 const char* BalsaFrameEnums::ErrorCodeToString(
93     BalsaFrameEnums::ErrorCode error_code) {
94   switch (error_code) {
95     case NO_ERROR:
96       return "NO_ERROR";
97     case NO_STATUS_LINE_IN_RESPONSE:
98       return "NO_STATUS_LINE_IN_RESPONSE";
99     case NO_REQUEST_LINE_IN_REQUEST:
100       return "NO_REQUEST_LINE_IN_REQUEST";
101     case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION:
102       return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION";
103     case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD:
104       return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD";
105     case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE:
106       return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE";
107     case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI:
108       return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI";
109     case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE:
110       return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE";
111     case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION:
112       return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION";
113     case FAILED_CONVERTING_STATUS_CODE_TO_INT:
114       return "FAILED_CONVERTING_STATUS_CODE_TO_INT";
115     case REQUEST_URI_TOO_LONG:
116       return "REQUEST_URI_TOO_LONG";
117     case HEADERS_TOO_LONG:
118       return "HEADERS_TOO_LONG";
119     case UNPARSABLE_CONTENT_LENGTH:
120       return "UNPARSABLE_CONTENT_LENGTH";
121     case MAYBE_BODY_BUT_NO_CONTENT_LENGTH:
122       return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH";
123     case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH:
124       return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH";
125     case HEADER_MISSING_COLON:
126       return "HEADER_MISSING_COLON";
127     case INVALID_CHUNK_LENGTH:
128       return "INVALID_CHUNK_LENGTH";
129     case CHUNK_LENGTH_OVERFLOW:
130       return "CHUNK_LENGTH_OVERFLOW";
131     case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO:
132       return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO";
133     case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT:
134       return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT";
135     case MULTIPLE_CONTENT_LENGTH_KEYS:
136       return "MULTIPLE_CONTENT_LENGTH_KEYS";
137     case MULTIPLE_TRANSFER_ENCODING_KEYS:
138       return "MULTIPLE_TRANSFER_ENCODING_KEYS";
139     case UNKNOWN_TRANSFER_ENCODING:
140       return "UNKNOWN_TRANSFER_ENCODING";
141     case INVALID_HEADER_FORMAT:
142       return "INVALID_HEADER_FORMAT";
143     case INTERNAL_LOGIC_ERROR:
144       return "INTERNAL_LOGIC_ERROR";
145     case NUM_ERROR_CODES:
146       return "UNKNOWN_ERROR";
147   }
148   return "UNKNOWN_ERROR";
149 }
150 
151 // Summary:
152 //     Parses the first line of either a request or response.
153 //     Note that in the case of a detected warning, error_code will be set
154 //   but the function will not return false.
155 //     Exactly zero or one warning or error (but not both) may be detected
156 //   by this function.
157 //     Note that this function will not write the data of the first-line
158 //   into the header's buffer (that should already have been done elsewhere).
159 //
160 // Pre-conditions:
161 //     begin != end
162 //     *begin should be a character which is > ' '. This implies that there
163 //   is at least one non-whitespace characters between [begin, end).
164 //   headers is a valid pointer to a BalsaHeaders class.
165 //     error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value.
166 //     Entire first line must exist between [begin, end)
167 //     Exactly zero or one newlines -may- exist between [begin, end)
168 //     [begin, end) should exist in the header's buffer.
169 //
170 // Side-effects:
171 //   headers will be modified
172 //   error_code may be modified if either a warning or error is detected
173 //
174 // Returns:
175 //   True if no error (as opposed to warning) is detected.
176 //   False if an error (as opposed to warning) is detected.
177 
//
// If there is indeed non-whitespace in the line, then the following
// will take care of the *begin precondition for you:
//  while (*begin <= ' ') ++begin;
//  ParseHTTPFirstLine(begin, end, is_request, max_request_uri_length,
//                     &headers, &error_code);
//
bool ParseHTTPFirstLine(const char* begin,
                        const char* end,
                        bool is_request,
                        size_t max_request_uri_length,
                        BalsaHeaders* headers,
                        BalsaFrameEnums::ErrorCode* error_code) {
  // All indices stored into |headers| below are offsets relative to |begin|.
  const char* current = begin;
  // HTTP firstlines all have the following structure:
  //  LWS         NONWS  LWS    NONWS   LWS    NONWS   NOTCRLF  CRLF
  //  [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n"
  //  ws1        nws1    ws2    nws2    ws3    nws3             ws4
  //  |          [-------)      [-------)      [----------------)
  //    REQ:     method         request_uri    version
  //   RESP:     version        statuscode     reason
  //
  //   The first NONWS->LWS component we'll call firstline_a.
  //   The second firstline_b, and the third firstline_c.
  //
  //   firstline_a goes from nws1 to (but not including) ws2
  //   firstline_b goes from nws2 to (but not including) ws3
  //   firstline_c goes from nws3 to (but not including) ws4
  //
  // In the code:
  //    ws1 == whitespace_1_idx_
  //   nws1 == non_whitespace_1_idx_
  //    ws2 == whitespace_2_idx_
  //   nws2 == non_whitespace_2_idx_
  //    ws3 == whitespace_3_idx_
  //   nws3 == non_whitespace_3_idx_
  //    ws4 == whitespace_4_idx_

  // Kill all whitespace (including '\r\n') at the end of the line.
  // The line must end in '\n'; anything else means the caller violated the
  // preconditions.
  --end;
  if (*end != '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  // Walk backwards over every character <= ' ' (stopping at |begin|).
  while (begin < end && *end <= ' ') {
    --end;
  }
  // If we stopped on a '\n' the line contained nothing but whitespace.
  DCHECK(*end != '\n');
  if (*end == '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  // |end| is now one past the last non-whitespace character of the line.
  ++end;

  // The following condition should not be possible.
  if (end == begin) {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }

  // whitespace_1_idx_
  headers->whitespace_1_idx_ = current - begin;
  // This loop is commented out as it is never used in current code.  This is
  // true only because we don't begin parsing the headers at all until we've
  // encountered a non whitespace character at the beginning of the stream, at
  // which point we begin our demarcation of header-start.  If we did -not- do
  // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop
  // would be necessary for the proper functioning of this parsing.
  // This is left here as this function may (in the future) be refactored out
  // of the BalsaFrame class so that it may be shared between code in
  // BalsaFrame and BalsaHeaders (where it would be used in some variant of the
  // set_first_line() function (at which point it would be necessary).
#if 0
  while (*current <= ' ') {
    ++current;
  }
#endif
  // non_whitespace_1_idx_
  headers->non_whitespace_1_idx_ = current - begin;
  do {
    // The first time through, the current character is required by the
    // function's preconditions to be non-whitespace (*begin > ' '), since the
    // whitespace-skipping loop above is compiled out.  That implies that we
    // get at least one non-whitespace character if we get into this loop at
    // all.
    ++current;
    if (current == end) {
      // The line consists of a single token: collapse all remaining indices
      // onto the end of the line.
      headers->whitespace_2_idx_ = current - begin;
      headers->non_whitespace_2_idx_ = current - begin;
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD   for request
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response
      // (the addition of is_request relies on the request enumerator
      // immediately following the response one).
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION +
            is_request);
      if (!is_request) {  // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION
        return false;
      }
      // For requests this is only a warning; continue with the checks below.
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_2_idx_
  headers->whitespace_2_idx_ = current - begin;
  do {
    ++current;
    // Note that due to the loop which consumes all of the whitespace
    // at the end of the line, current can never == end while in this function.
  } while (*current <= ' ');
  // non_whitespace_2_idx_
  headers->non_whitespace_2_idx_ = current - begin;
  do {
    ++current;
    if (current == end) {
      // The line consists of exactly two tokens: collapse the remaining
      // indices onto the end of the line.
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE for response.
      // For requests, adding is_request presumably selects
      // FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI (relies on the enum
      // ordering -- confirm against the ErrorCode declaration).
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE
                                 + is_request);
      // A warning for both requests and responses; keep going.
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_3_idx_
  headers->whitespace_3_idx_ = current - begin;
  do {
    ++current;
    // Note that due to the loop which consumes all of the whitespace
    // at the end of the line, current can never == end while in this function.
  } while (*current <= ' ');
  // non_whitespace_3_idx_
  headers->non_whitespace_3_idx_ = current - begin;
  // The third component runs to the (whitespace-trimmed) end of the line.
  headers->whitespace_4_idx_ = end - begin;

 output_exhausted:
  // Note that we don't fail the parse immediately when parsing of the
  // firstline fails.  Depending on the protocol type, we may want to accept
  // a firstline with only one or two elements, e.g., for HTTP/0.9:
  //   GET\r\n
  // or
  //   GET /\r\n
  // should be parsed without issue (though the visitor should know that
  // parsing the entire line was not exactly as it should be).
  //
  // Eventually, these errors may be removed altogether, as the visitor can
  // detect them on its own by examining the size of the various fields.
  // headers->set_first_line(non_whitespace_1_idx_, current);

  if (is_request) {
    // For requests, we need at least the method.  We could assume that a
    // blank URI means "/".  If version isn't stated, it should be assumed
    // to be HTTP/0.9 by the visitor.
    //
    // The only hard failure for a request is a request-URI (second token)
    // longer than max_request_uri_length.
    if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) >
        max_request_uri_length) {
      *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG;
      return false;
    }
  } else {
    // For responses, convert the second token (the status code) to a number.
    headers->parsed_response_code_ = 0;
    {
      const char* parsed_response_code_current =
        begin + headers->non_whitespace_2_idx_;
      const char* parsed_response_code_end = begin + headers->whitespace_3_idx_;
      const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;

      // Convert a string of [0-9]* into an int.
      // Note that this allows for the conversion of response codes which
      // are outside the bounds of normal HTTP response codes (no checking
      // is done to ensure that these are valid-- they're merely parsed)!
      while (parsed_response_code_current < parsed_response_code_end) {
        if (*parsed_response_code_current < '0' ||
            *parsed_response_code_current > '9') {
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        size_t status_code_x_10 = headers->parsed_response_code_ * 10;
        uint8 c = *parsed_response_code_current - '0';
        // Reject the digit if either the multiplication by 10 or the
        // addition of the digit would exceed the size_t range.
        if ((headers->parsed_response_code_ > kMaxDiv10) ||
            (std::numeric_limits<size_t>::max() - status_code_x_10) < c) {
          // overflow.
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        headers->parsed_response_code_ = status_code_x_10 + c;
        ++parsed_response_code_current;
      }
    }
  }
  return true;
}
377 
378 // begin - beginning of the firstline
379 // end - end of the firstline
380 //
381 // A precondition for this function is that there is non-whitespace between
382 // [begin, end). If this precondition is not met, the function will not perform
383 // as expected (and bad things may happen, and it will eat your first, second,
384 // and third unborn children!).
385 //
386 // Another precondition for this function is that [begin, end) includes
387 // at most one newline, which must be at the end of the line.
ProcessFirstLine(const char * begin,const char * end)388 void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) {
389   BalsaFrameEnums::ErrorCode previous_error = last_error_;
390   if (!ParseHTTPFirstLine(begin,
391                           end,
392                           is_request_,
393                           max_request_uri_length_,
394                           headers_,
395                           &last_error_)) {
396     parse_state_ = BalsaFrameEnums::PARSE_ERROR;
397     visitor_->HandleHeaderError(this);
398     return;
399   }
400   if (previous_error != last_error_) {
401     visitor_->HandleHeaderWarning(this);
402   }
403 
404   if (is_request_) {
405     int version_length =
406         headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_;
407     visitor_->ProcessRequestFirstLine(
408         begin + headers_->non_whitespace_1_idx_,
409         headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
410         begin + headers_->non_whitespace_1_idx_,
411         headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
412         begin + headers_->non_whitespace_2_idx_,
413         headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
414         begin + headers_->non_whitespace_3_idx_,
415         version_length);
416     if (version_length == 0)
417       parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
418   } else {
419     visitor_->ProcessResponseFirstLine(
420         begin + headers_->non_whitespace_1_idx_,
421         headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
422         begin + headers_->non_whitespace_1_idx_,
423         headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
424         begin + headers_->non_whitespace_2_idx_,
425         headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
426         begin + headers_->non_whitespace_3_idx_,
427         headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_);
428   }
429 }
430 
431 // 'stream_begin' points to the first character of the headers buffer.
432 // 'line_begin' points to the first character of the line.
433 // 'current' points to a char which is ':'.
434 // 'line_end' points to the position of '\n' + 1.
435 // 'line_begin' points to the position of first character of line.
CleanUpKeyValueWhitespace(const char * stream_begin,const char * line_begin,const char * current,const char * line_end,HeaderLineDescription * current_header_line)436 void BalsaFrame::CleanUpKeyValueWhitespace(
437     const char* stream_begin,
438     const char* line_begin,
439     const char* current,
440     const char* line_end,
441     HeaderLineDescription* current_header_line) {
442   const char* colon_loc = current;
443   DCHECK_LT(colon_loc, line_end);
444   DCHECK_EQ(':', *colon_loc);
445   DCHECK_EQ(':', *current);
446   DCHECK_GE(' ', *line_end)
447     << "\"" << std::string(line_begin, line_end) << "\"";
448 
449   // TODO(fenix): Investigate whether or not the bounds tests in the
450   // while loops here are redundant, and if so, remove them.
451   --current;
452   while (current > line_begin && *current <= ' ') --current;
453   current += (current != colon_loc);
454   current_header_line->key_end_idx = current - stream_begin;
455 
456   current = colon_loc;
457   DCHECK_EQ(':', *current);
458   ++current;
459   while (current < line_end && *current <= ' ') ++current;
460   current_header_line->value_begin_idx = current - stream_begin;
461 
462   DCHECK_GE(current_header_line->key_end_idx,
463             current_header_line->first_char_idx);
464   DCHECK_GE(current_header_line->value_begin_idx,
465             current_header_line->key_end_idx);
466   DCHECK_GE(current_header_line->last_char_idx,
467             current_header_line->value_begin_idx);
468 }
469 
// Walks the recorded header lines (lines_[1] up to, but not including, the
// final entry), merges continuation lines into the line they continue, and
// appends one HeaderLineDescription per logical header line to
// headers_->header_lines_.  For each line the first ':' is located and the
// key end / value begin offsets are filled in; lines without a colon are left
// as "all key" and reported through visitor_->HandleHeaderWarning() with
// last_error_ set to HEADER_MISSING_COLON.
inline void BalsaFrame::FindColonsAndParseIntoKeyValue() {
  DCHECK(!lines_.empty());
  const char* stream_begin = headers_->OriginalHeaderStreamBegin();
  // The last line is always just a newline (and is uninteresting).
  const Lines::size_type lines_size_m1 = lines_.size() - 1;
#if __SSE2__
  // 16 copies of ':' to compare against 16 header bytes at a time.
  const __v16qi colons = { ':', ':', ':', ':', ':', ':', ':', ':',
                           ':', ':', ':', ':', ':', ':', ':', ':'};
  // Last position from which a full 16-byte load stays inside the headers.
  const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16;
#endif  // __SSE2__
  // NOTE(review): this indexes lines_[1] although only !lines_.empty() is
  // checked above; presumably callers guarantee at least two entries (first
  // line plus terminating blank line) -- confirm.
  const char* current = stream_begin + lines_[1].first;
  // This code is a bit more subtle than it may appear at first glance.
  // This code looks for a colon in the current line... but it also looks
  // beyond the current line. If there is no colon in the current line, then
  // for each subsequent line (until the colon which -has- been found is
  // associated with a line), no searching for a colon will be performed. In
  // this way, we minimize the amount of bytes we have scanned for a colon.
  for (Lines::size_type i = 1; i < lines_size_m1;) {
    const char* line_begin = stream_begin + lines_[i].first;

    // Here we handle possible continuations.  Note that we do not replace
    // the '\n' in the line before a continuation (at least, as of now),
    // which implies that any code which looks for a value must deal with
    // "\r\n", etc -within- the line (and not just at the end of it).
    for (++i; i < lines_size_m1; ++i) {
      const char c = *(stream_begin + lines_[i].first);
      if (c > ' ') {
        // Not a continuation, so stop.  Note that if the 'original' i = 1,
        // and the next line is not a continuation, we'll end up with i = 2
        // when we break. This handles the incrementing of i for the outer
        // loop.
        break;
      }
    }
    // The logical line ends where its last physical (continuation) line ends.
    const char* line_end = stream_begin + lines_[i - 1].second;
    DCHECK_LT(line_begin - stream_begin, line_end - stream_begin);

    // We cleanup the whitespace at the end of the line before doing anything
    // else of interest as it allows us to do nothing when irregularly formatted
    // headers are parsed (e.g. those with only keys, only values, or no colon).
    //
    // We're guaranteed to have *line_end > ' ' while line_end >= line_begin.
    --line_end;
    DCHECK_EQ('\n', *line_end)
      << "\"" << std::string(line_begin, line_end) << "\"";
    while (*line_end <= ' ' && line_end > line_begin) {
      --line_end;
    }
    // line_end now points one past the last non-whitespace character.
    ++line_end;
    DCHECK_GE(' ', *line_end);
    DCHECK_LT(line_begin, line_end);

    // We use '0' for the block idx, because we're always writing to the first
    // block from the framer (we do this because the framer requires that the
    // entire header sequence be in a contiguous buffer).
    //
    // The key end, value begin and last-char offsets all start out at
    // line_end, i.e. the whole line is treated as key until a colon is found.
    headers_->header_lines_.push_back(
        HeaderLineDescription(line_begin - stream_begin,
                              line_end - stream_begin,
                              line_end - stream_begin,
                              line_end - stream_begin,
                              0));
    if (current >= line_end) {
      // An earlier scan already ran past this line without finding a colon
      // inside it.
      last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
      visitor_->HandleHeaderWarning(this);
      // Then the next colon will not be found within this header line-- time
      // to try again with another header-line.
      continue;
    } else if (current < line_begin) {
      // When this condition is true, the last detected colon was part of a
      // previous line.  We reset to the beginning of the line as we don't care
      // about the presence of any colon before the beginning of the current
      // line.
      current = line_begin;
    }
#if __SSE2__
    // Scan 16 bytes at a time for a ':' while a full load still fits.
    while (current < header_lines_end_m16) {
      __m128i header_bytes =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(current));
      __m128i colon_cmp =
        _mm_cmpeq_epi8(header_bytes, reinterpret_cast<__m128i>(colons));
      // One bit per byte; bit k is set when byte k equals ':'.
      int colon_msk = _mm_movemask_epi8(colon_cmp);
      if (colon_msk == 0) {
        current += 16;
        continue;
      }
      // Advance to the first matching byte in this 16-byte group.
      current += (ffs(colon_msk) - 1);
      if (current > line_end) {
        // The colon lies beyond this line.  'current' is left on it so that
        // later iterations can use it without rescanning.
        break;
      }
      goto found_colon;
    }
#endif  // __SSE2__
    // Byte-at-a-time scan of whatever part of this line is still unscanned.
    for (; current < line_end; ++current) {
      if (*current != ':') {
        continue;
      }
      goto found_colon;
    }
    // If we've gotten to here, then there was no colon
    // in the line. The arguments we passed into the construction
    // for the HeaderLineDescription object should be OK-- it assumes
    // that the entire content is 'key' by default (which is true, as
    // there was no colon, there can be no value). Note that this is a
    // construct which is technically not allowed by the spec.
    last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
    visitor_->HandleHeaderWarning(this);
    continue;
 found_colon:
    DCHECK_EQ(*current, ':');
    DCHECK_LE(current - stream_begin, line_end - stream_begin);
    DCHECK_LE(stream_begin - stream_begin, current - stream_begin);

    // Start with both the key end and the value begin at the colon itself.
    HeaderLineDescription& current_header_line = headers_->header_lines_.back();
    current_header_line.key_end_idx = current - stream_begin;
    current_header_line.value_begin_idx = current_header_line.key_end_idx;
    if (current < line_end) {
      ++current_header_line.key_end_idx;

      // Recomputes both offsets with the whitespace around the colon removed.
      CleanUpKeyValueWhitespace(stream_begin,
                                line_begin,
                                current,
                                line_end,
                                &current_header_line);
    }
  }
}
596 
ProcessContentLengthLine(HeaderLines::size_type line_idx,BalsaHeadersEnums::ContentLengthStatus * status,size_t * length)597 void BalsaFrame::ProcessContentLengthLine(
598     HeaderLines::size_type line_idx,
599     BalsaHeadersEnums::ContentLengthStatus* status,
600     size_t* length) {
601   const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
602   const char* stream_begin = headers_->OriginalHeaderStreamBegin();
603   const char* line_end = stream_begin + header_line.last_char_idx;
604   const char* value_begin = (stream_begin + header_line.value_begin_idx);
605 
606   if (value_begin >= line_end) {
607     // There is no non-whitespace value data.
608 #if DEBUGFRAMER
609       LOG(INFO) << "invalid content-length -- no non-whitespace value data";
610 #endif
611     *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
612     return;
613   }
614 
615   *length = 0;
616   while (value_begin < line_end) {
617     if (*value_begin < '0' || *value_begin > '9') {
618       // bad! content-length found, and couldn't parse all of it!
619       *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
620 #if DEBUGFRAMER
621       LOG(INFO) << "invalid content-length - non numeric character detected";
622 #endif  // DEBUGFRAMER
623       return;
624     }
625     const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;
626     size_t length_x_10 = *length * 10;
627     const unsigned char c = *value_begin - '0';
628     if (*length > kMaxDiv10 ||
629         (std::numeric_limits<size_t>::max() - length_x_10) < c) {
630       *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW;
631 #if DEBUGFRAMER
632       LOG(INFO) << "content-length overflow";
633 #endif  // DEBUGFRAMER
634       return;
635     }
636     *length = length_x_10 + c;
637     ++value_begin;
638   }
639 #if DEBUGFRAMER
640   LOG(INFO) << "content_length parsed: " << *length;
641 #endif  // DEBUGFRAMER
642   *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH;
643 }
644 
ProcessTransferEncodingLine(HeaderLines::size_type line_idx)645 void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) {
646   const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
647   const char* stream_begin = headers_->OriginalHeaderStreamBegin();
648   const char* line_end = stream_begin + header_line.last_char_idx;
649   const char* value_begin = stream_begin + header_line.value_begin_idx;
650   size_t value_length = line_end - value_begin;
651 
652   if ((value_length == 7) &&
653       !strncasecmp(value_begin, "chunked", 7)) {
654     headers_->transfer_encoding_is_chunked_ = true;
655   } else if ((value_length == 8) &&
656       !strncasecmp(value_begin, "identity", 8)) {
657     headers_->transfer_encoding_is_chunked_ = false;
658   } else {
659     last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING;
660     parse_state_ = BalsaFrameEnums::PARSE_ERROR;
661     visitor_->HandleHeaderError(this);
662     return;
663   }
664 }
665 
666 namespace {
SplitStringPiece(base::StringPiece original,char delim,base::StringPiece * before,base::StringPiece * after)667 bool SplitStringPiece(base::StringPiece original, char delim,
668                       base::StringPiece* before, base::StringPiece* after) {
669   const char* p = original.data();
670   const char* end = p + original.size();
671 
672   while (p != end) {
673     if (*p == delim) {
674       ++p;
675     } else {
676       const char* start = p;
677       while (++p != end && *p != delim) {
678         // Skip to the next occurence of the delimiter.
679       }
680       *before = base::StringPiece(start, p - start);
681       if (p != end)
682         *after = base::StringPiece(p + 1, end - (p + 1));
683       else
684         *after = base::StringPiece("");
685       StringPieceUtils::RemoveWhitespaceContext(before);
686       StringPieceUtils::RemoveWhitespaceContext(after);
687       return true;
688     }
689   }
690 
691   *before = original;
692   *after = "";
693   return false;
694 }
695 
696 // TODO(phython): Fix this function to properly deal with quoted values.
697 // E.g. ";;foo", "\";;\"", or \"aa;
698 // The last example, the semi-colon is a separator between extensions.
ProcessChunkExtensionsManual(base::StringPiece all_extensions,BalsaHeaders * extensions)699 void ProcessChunkExtensionsManual(base::StringPiece all_extensions,
700                                   BalsaHeaders* extensions) {
701   base::StringPiece extension;
702   base::StringPiece remaining;
703   StringPieceUtils::RemoveWhitespaceContext(&all_extensions);
704   SplitStringPiece(all_extensions, ';', &extension, &remaining);
705   while (!extension.empty()) {
706     base::StringPiece key;
707     base::StringPiece value;
708     SplitStringPiece(extension, '=', &key, &value);
709     if (!value.empty()) {
710       // Strip quotation marks if they exist.
711       if (!value.empty() && value[0] == '"')
712         value.remove_prefix(1);
713       if (!value.empty() && value[value.length() - 1] == '"')
714         value.remove_suffix(1);
715     }
716 
717     extensions->AppendHeader(key, value);
718 
719     StringPieceUtils::RemoveWhitespaceContext(&remaining);
720     SplitStringPiece(remaining, ';', &extension, &remaining);
721   }
722 }
723 
724 // TODO(phython): Fix this function to properly deal with quoted values.
725 // E.g. ";;foo", "\";;\"", or \"aa;
726 // The last example, the semi-colon is a separator between extensions.
ProcessChunkExtensionsGoogle3(const char * input,size_t size,BalsaHeaders * extensions)727 void ProcessChunkExtensionsGoogle3(const char* input, size_t size,
728                                    BalsaHeaders* extensions) {
729   std::vector<base::StringPiece> key_values;
730   SplitStringPieceToVector(base::StringPiece(input, size), ";",
731                            &key_values, true);
732   for (unsigned int i = 0; i < key_values.size(); ++i) {
733     base::StringPiece key = key_values[i].substr(0, key_values[i].find('='));
734     base::StringPiece value;
735     if (key.length() < key_values[i].length()) {
736       value = key_values[i].substr(key.length() + 1);
737       // Remove any leading and trailing whitespace.
738       StringPieceUtils::RemoveWhitespaceContext(&value);
739 
740       // Strip quotation marks if they exist.
741       if (!value.empty() && value[0] == '"')
742         value.remove_prefix(1);
743       if (!value.empty() && value[value.length() - 1] == '"')
744         value.remove_suffix(1);
745     }
746 
747     // Strip the key whitespace after checking that there is a value.
748     StringPieceUtils::RemoveWhitespaceContext(&key);
749     extensions->AppendHeader(key, value);
750   }
751 }
752 
753 }  // anonymous namespace
754 
ProcessChunkExtensions(const char * input,size_t size,BalsaHeaders * extensions)755 void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size,
756                                         BalsaHeaders* extensions) {
757 #if 0
758   ProcessChunkExtensionsGoogle3(input, size, extensions);
759 #else
760   ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions);
761 #endif
762 }
763 
ProcessHeaderLines()764 void BalsaFrame::ProcessHeaderLines() {
765   HeaderLines::size_type content_length_idx = 0;
766   HeaderLines::size_type transfer_encoding_idx = 0;
767 
768   DCHECK(!lines_.empty());
769 #if DEBUGFRAMER
770   LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n";
771 #endif  // DEBUGFRAMER
772 
773   // There is no need to attempt to process headers if no header lines exist.
774   // There are at least two lines in the message which are not header lines.
775   // These two non-header lines are the first line of the message, and the
776   // last line of the message (which is an empty line).
777   // Thus, we test to see if we have more than two lines total before attempting
778   // to parse any header lines.
779   if (lines_.size() > 2) {
780     const char* stream_begin = headers_->OriginalHeaderStreamBegin();
781 
782     // Then, for the rest of the header data, we parse these into key-value
783     // pairs.
784     FindColonsAndParseIntoKeyValue();
785     // At this point, we've parsed all of the headers.  Time to look for those
786     // headers which we require for framing.
787     const HeaderLines::size_type
788       header_lines_size = headers_->header_lines_.size();
789     for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) {
790       const HeaderLineDescription& current_header_line =
791         headers_->header_lines_[i];
792       const char* key_begin =
793         (stream_begin + current_header_line.first_char_idx);
794       const char* key_end = (stream_begin + current_header_line.key_end_idx);
795       const size_t key_len = key_end - key_begin;
796       const char c = *key_begin;
797 #if DEBUGFRAMER
798       LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len)
799                 << " c: '" << c << "' key_len: " << key_len;
800 #endif  // DEBUGFRAMER
801       // If a header begins with either lowercase or uppercase 'c' or 't', then
802       // the header may be one of content-length, connection, content-encoding
803       // or transfer-encoding. These headers are special, as they change the way
804       // that the message is framed, and so the framer is required to search
805       // for them.
806 
807 
808       if (c == 'c' || c == 'C') {
809         if ((key_len == kContentLengthSize) &&
810             0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) {
811           BalsaHeadersEnums::ContentLengthStatus content_length_status =
812             BalsaHeadersEnums::NO_CONTENT_LENGTH;
813           size_t length = 0;
814           ProcessContentLengthLine(i, &content_length_status, &length);
815           if (content_length_idx != 0) {  // then we've already seen one!
816             if ((headers_->content_length_status_ != content_length_status) ||
817                 ((headers_->content_length_status_ ==
818                   BalsaHeadersEnums::VALID_CONTENT_LENGTH) &&
819                  length != headers_->content_length_)) {
820               last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS;
821               parse_state_ = BalsaFrameEnums::PARSE_ERROR;
822               visitor_->HandleHeaderError(this);
823               return;
824             }
825             continue;
826           } else {
827             content_length_idx = i + 1;
828             headers_->content_length_status_ = content_length_status;
829             headers_->content_length_ = length;
830             content_length_remaining_ = length;
831           }
832 
833         }
834       } else if (c == 't' || c == 'T') {
835         if ((key_len == kTransferEncodingSize) &&
836             0 == strncasecmp(key_begin, kTransferEncoding,
837                              kTransferEncodingSize)) {
838           if (transfer_encoding_idx != 0) {
839             last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS;
840             parse_state_ = BalsaFrameEnums::PARSE_ERROR;
841             visitor_->HandleHeaderError(this);
842             return;
843           }
844           transfer_encoding_idx = i + 1;
845         }
846       } else if (i == 0 && (key_len == 0 || c == ' ')) {
847         last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT;
848         parse_state_ = BalsaFrameEnums::PARSE_ERROR;
849         visitor_->HandleHeaderError(this);
850         return;
851       }
852     }
853     if (headers_->transfer_encoding_is_chunked_) {
854       headers_->content_length_ = 0;
855       headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH;
856       content_length_remaining_ = 0;
857     }
858     if (transfer_encoding_idx != 0) {
859       ProcessTransferEncodingLine(transfer_encoding_idx - 1);
860     }
861   }
862 }
863 
AssignParseStateAfterHeadersHaveBeenParsed()864 void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() {
865   // For responses, can't have a body if the request was a HEAD, or if it is
866   // one of these response-codes.  rfc2616 section 4.3
867   parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
868   if (is_request_ ||
869       !(request_was_head_ ||
870         (headers_->parsed_response_code_ >= 100 &&
871          headers_->parsed_response_code_ < 200) ||
872         (headers_->parsed_response_code_ == 204) ||
873         (headers_->parsed_response_code_ == 304))) {
874     // Then we can have a body.
875     if (headers_->transfer_encoding_is_chunked_) {
876       // Note that
877       // if ( Transfer-Encoding: chunked &&  Content-length: )
878       // then Transfer-Encoding: chunked trumps.
879       // This is as specified in the spec.
880       // rfc2616 section 4.4.3
881       parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
882     } else {
883       // Errors parsing content-length definitely can cause
884       // protocol errors/warnings
885       switch (headers_->content_length_status_) {
886         // If we have a content-length, and it is parsed
887         // properly, there are two options.
888         // 1) zero content, in which case the message is done, and
889         // 2) nonzero content, in which case we have to
890         //    consume the body.
891         case BalsaHeadersEnums::VALID_CONTENT_LENGTH:
892           if (headers_->content_length_ == 0) {
893             parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
894           } else {
895             parse_state_ = BalsaFrameEnums::READING_CONTENT;
896           }
897           break;
898         case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW:
899         case BalsaHeadersEnums::INVALID_CONTENT_LENGTH:
900           // If there were characters left-over after parsing the
901           // content length, we should flag an error and stop.
902           parse_state_ = BalsaFrameEnums::PARSE_ERROR;
903           last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH;
904           visitor_->HandleHeaderError(this);
905           break;
906           // We can have: no transfer-encoding, no content length, and no
907           // connection: close...
908           // Unfortunately, this case doesn't seem to be covered in the spec.
909           // We'll assume that the safest thing to do here is what the google
910           // binaries before 2008 already do, which is to assume that
911           // everything until the connection is closed is body.
912         case BalsaHeadersEnums::NO_CONTENT_LENGTH:
913           if (is_request_) {
914             base::StringPiece method = headers_->request_method();
915             // POSTs and PUTs should have a detectable body length.  If they
916             // do not we consider it an error.
917             if ((method.size() == 4 &&
918                  strncmp(method.data(), "POST", 4) == 0) ||
919                 (method.size() == 3 &&
920                  strncmp(method.data(), "PUT", 3) == 0)) {
921               parse_state_ = BalsaFrameEnums::PARSE_ERROR;
922               last_error_ =
923                   BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH;
924               visitor_->HandleHeaderError(this);
925               break;
926             }
927             parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
928           } else {
929             parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE;
930             last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH;
931             visitor_->HandleHeaderWarning(this);
932           }
933           break;
934           // The COV_NF_... statements here provide hints to the apparatus
935           // which computes coverage reports/ratios that this code is never
936           // intended to be executed, and should technically be impossible.
937           // COV_NF_START
938         default:
939           LOG(FATAL) << "Saw a content_length_status: "
940            << headers_->content_length_status_ << " which is unknown.";
941           // COV_NF_END
942       }
943     }
944   }
945 }
946 
ProcessHeaders(const char * message_start,size_t message_length)947 size_t BalsaFrame::ProcessHeaders(const char* message_start,
948                                   size_t message_length) {
949   const char* const original_message_start = message_start;
950   const char* const message_end = message_start + message_length;
951   const char* message_current = message_start;
952   const char* checkpoint = message_start;
953 
954   if (message_length == 0) {
955     goto bottom;
956   }
957 
958   while (message_current < message_end) {
959     size_t base_idx = headers_->GetReadableBytesFromHeaderStream();
960 
961     // Yes, we could use strchr (assuming null termination), or
962     // memchr, but as it turns out that is slower than this tight loop
963     // for the input that we see.
964     if (!saw_non_newline_char_) {
965       do {
966         const char c = *message_current;
967         if (c != '\r' && c != '\n') {
968           if (c <= ' ') {
969             parse_state_ = BalsaFrameEnums::PARSE_ERROR;
970             last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST;
971             visitor_->HandleHeaderError(this);
972             goto bottom;
973           } else {
974             saw_non_newline_char_ = true;
975             checkpoint = message_start = message_current;
976             goto read_real_message;
977           }
978         }
979         ++message_current;
980       } while (message_current < message_end);
981       goto bottom;  // this is necessary to skip 'last_char_was_slash_r' checks
982     } else {
983  read_real_message:
984       // Note that SSE2 can be enabled on certain piii platforms.
985 #if __SSE2__
986       {
987         const char* const message_end_m16 = message_end - 16;
988         __v16qi newlines = { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
989                              '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' };
990         while (message_current < message_end_m16) {
991           // What this does (using compiler intrinsics):
992           //
993           // Load 16 '\n's into an xmm register
994           // Load 16 bytes of currennt message into an xmm register
995           // Do byte-wise equals on those two xmm registers
996           // Take the first bit of each byte, and put that into the first
997           //   16 bits of a mask
998           // If the mask is zero, no '\n' found. increment by 16 and try again
999           // Else scan forward to find the first set bit.
1000           // Increment current by the index of the first set bit
1001           //   (ffs returns index of first set bit + 1)
1002           __m128i msg_bytes =
1003             _mm_loadu_si128(const_cast<__m128i *>(
1004                     reinterpret_cast<const __m128i *>(message_current)));
1005           __m128i newline_cmp =
1006             _mm_cmpeq_epi8(msg_bytes, reinterpret_cast<__m128i>(newlines));
1007           int newline_msk = _mm_movemask_epi8(newline_cmp);
1008           if (newline_msk == 0) {
1009             message_current += 16;
1010             continue;
1011           }
1012           message_current += (ffs(newline_msk) - 1);
1013           const size_t relative_idx = message_current - message_start;
1014           const size_t message_current_idx = 1 + base_idx + relative_idx;
1015           lines_.push_back(std::make_pair(last_slash_n_idx_,
1016                                           message_current_idx));
1017           if (lines_.size() == 1) {
1018             headers_->WriteFromFramer(checkpoint,
1019                                       1 + message_current - checkpoint);
1020             checkpoint = message_current + 1;
1021             const char* begin = headers_->OriginalHeaderStreamBegin();
1022 #if DEBUGFRAMER
1023           LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1024           LOG(INFO) << "is_request_: " << is_request_;
1025 #endif
1026             ProcessFirstLine(begin, begin + lines_[0].second);
1027             if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1028               goto process_lines;
1029             else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1030               goto bottom;
1031           }
1032           const size_t chars_since_last_slash_n = (message_current_idx -
1033                                                    last_slash_n_idx_);
1034           last_slash_n_idx_ = message_current_idx;
1035           if (chars_since_last_slash_n > 2) {
1036             // We have a slash-n, but the last slash n was
1037             // more than 2 characters away from this. Thus, we know
1038             // that this cannot be an end-of-header.
1039             ++message_current;
1040             continue;
1041           }
1042           if ((chars_since_last_slash_n == 1) ||
1043               (((message_current > message_start) &&
1044                 (*(message_current - 1) == '\r')) ||
1045                (last_char_was_slash_r_))) {
1046             goto process_lines;
1047           }
1048           ++message_current;
1049         }
1050       }
1051 #endif  // __SSE2__
1052       while (message_current < message_end) {
1053         if (*message_current != '\n') {
1054           ++message_current;
1055           continue;
1056         }
1057         const size_t relative_idx = message_current - message_start;
1058         const size_t message_current_idx = 1 + base_idx + relative_idx;
1059         lines_.push_back(std::make_pair(last_slash_n_idx_,
1060                                         message_current_idx));
1061         if (lines_.size() == 1) {
1062           headers_->WriteFromFramer(checkpoint,
1063                                     1 + message_current - checkpoint);
1064           checkpoint = message_current + 1;
1065           const char* begin = headers_->OriginalHeaderStreamBegin();
1066 #if DEBUGFRAMER
1067           LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1068           LOG(INFO) << "is_request_: " << is_request_;
1069 #endif
1070           ProcessFirstLine(begin, begin + lines_[0].second);
1071           if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1072             goto process_lines;
1073           else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1074             goto bottom;
1075         }
1076         const size_t chars_since_last_slash_n = (message_current_idx -
1077                                                  last_slash_n_idx_);
1078         last_slash_n_idx_ = message_current_idx;
1079         if (chars_since_last_slash_n > 2) {
1080           // false positive.
1081           ++message_current;
1082           continue;
1083         }
1084         if ((chars_since_last_slash_n == 1) ||
1085             (((message_current > message_start) &&
1086               (*(message_current - 1) == '\r')) ||
1087              (last_char_was_slash_r_))) {
1088           goto process_lines;
1089         }
1090         ++message_current;
1091       }
1092     }
1093     continue;
1094  process_lines:
1095     ++message_current;
1096     DCHECK(message_current >= message_start);
1097     if (message_current > message_start) {
1098       headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1099     }
1100 
1101     // Check if we have exceeded maximum headers length
1102     // Although we check for this limit before and after we call this function
1103     // we check it here as well to make sure that in case the visitor changed
1104     // the max_header_length_ (for example after processing the first line)
1105     // we handle it gracefully.
1106     if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) {
1107       parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1108       last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1109       visitor_->HandleHeaderError(this);
1110       goto bottom;
1111     }
1112 
1113     // Since we know that we won't be writing any more bytes of the header,
1114     // we tell that to the headers object. The headers object may make
1115     // more efficient allocation decisions when this is signaled.
1116     headers_->DoneWritingFromFramer();
1117     {
1118       const char* readable_ptr = NULL;
1119       size_t readable_size = 0;
1120       headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size);
1121       visitor_->ProcessHeaderInput(readable_ptr, readable_size);
1122     }
1123 
1124     // Ok, now that we've written everything into our header buffer, it is
1125     // time to process the header lines (extract proper values for headers
1126     // which are important for framing).
1127     ProcessHeaderLines();
1128     if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1129       goto bottom;
1130     }
1131     AssignParseStateAfterHeadersHaveBeenParsed();
1132     if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1133       goto bottom;
1134     }
1135     visitor_->ProcessHeaders(*headers_);
1136     visitor_->HeaderDone();
1137     if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) {
1138       visitor_->MessageDone();
1139     }
1140     goto bottom;
1141   }
1142   // If we've gotten to here, it means that we've consumed all of the
1143   // available input. We need to record whether or not the last character we
1144   // saw was a '\r' so that a subsequent call to ProcessInput correctly finds
1145   // a header framing that is split across the two calls.
1146   last_char_was_slash_r_ = (*(message_end - 1) == '\r');
1147   DCHECK(message_current >= message_start);
1148   if (message_current > message_start) {
1149     headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1150   }
1151  bottom:
1152   return message_current - original_message_start;
1153 }
1154 
1155 
BytesSafeToSplice() const1156 size_t BalsaFrame::BytesSafeToSplice() const {
1157   switch (parse_state_) {
1158     case BalsaFrameEnums::READING_CHUNK_DATA:
1159       return chunk_length_remaining_;
1160     case BalsaFrameEnums::READING_UNTIL_CLOSE:
1161       return std::numeric_limits<size_t>::max();
1162     case BalsaFrameEnums::READING_CONTENT:
1163       return content_length_remaining_;
1164     default:
1165       return 0;
1166   }
1167 }
1168 
BytesSpliced(size_t bytes_spliced)1169 void BalsaFrame::BytesSpliced(size_t bytes_spliced) {
1170   switch (parse_state_) {
1171     case BalsaFrameEnums::READING_CHUNK_DATA:
1172       if (chunk_length_remaining_ >= bytes_spliced) {
1173         chunk_length_remaining_ -= bytes_spliced;
1174         if (chunk_length_remaining_ == 0) {
1175           parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1176         }
1177         return;
1178       } else {
1179         last_error_ =
1180           BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1181         goto error_exit;
1182       }
1183 
1184     case BalsaFrameEnums::READING_UNTIL_CLOSE:
1185       return;
1186 
1187     case BalsaFrameEnums::READING_CONTENT:
1188       if (content_length_remaining_ >= bytes_spliced) {
1189         content_length_remaining_ -= bytes_spliced;
1190         if (content_length_remaining_ == 0) {
1191           parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1192           visitor_->MessageDone();
1193         }
1194         return;
1195       } else {
1196         last_error_ =
1197           BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1198         goto error_exit;
1199       }
1200 
1201     default:
1202       last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO;
1203       goto error_exit;
1204   }
1205 
1206  error_exit:
1207   parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1208   visitor_->HandleBodyError(this);
1209 };
1210 
1211 // You may note that the state-machine contained within this function has both
1212 // switch and goto labels for nearly the same thing. For instance, the
1213 // following two labels refer to the same code block:
1214 //   label_reading_chunk_data:
1215 //   case BalsaFrameEnums::READING_CHUNK_DATA:
1216 // The 'case' statement is required for the switch statement which occurs when
1217 // ProcessInput is invoked. The goto label is required as the state-machine
1218 // does not use a computed goto in any subsequent operations.
1219 //
1220 // Since several states exit the state machine for various reasons, there is
1221 // also one label at the bottom of the function. When it is appropriate to
1222 // return from the function, that part of the state machine instead issues a
1223 // goto bottom; This results in less code duplication, and makes debugging
1224 // easier (as you can add a statement to a section of code which is guaranteed
1225 // to be invoked when the function is exiting).
ProcessInput(const char * input,size_t size)1226 size_t BalsaFrame::ProcessInput(const char* input, size_t size) {
1227   const char* current = input;
1228   const char* on_entry = current;
1229   const char* end = current + size;
1230 #if DEBUGFRAMER
1231   LOG(INFO) << "\n=============="
1232             << BalsaFrameEnums::ParseStateToString(parse_state_)
1233             << "===============\n";
1234 #endif  // DEBUGFRAMER
1235 
1236   DCHECK(headers_ != NULL);
1237   if (headers_ == NULL) return 0;
1238 
1239   if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1240     const size_t header_length = headers_->GetReadableBytesFromHeaderStream();
1241     // Yes, we still have to check this here as the user can change the
1242     // max_header_length amount!
1243     // Also it is possible that we have reached the maximum allowed header size,
1244     // and we have more to consume (remember we are still inside
1245     // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error.
1246     if (header_length > max_header_length_ ||
1247         (header_length == max_header_length_ && size > 0)) {
1248       parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1249       last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1250       visitor_->HandleHeaderError(this);
1251       goto bottom;
1252     }
1253     size_t bytes_to_process = max_header_length_ - header_length;
1254     if (bytes_to_process > size) {
1255       bytes_to_process = size;
1256     }
1257     current += ProcessHeaders(input, bytes_to_process);
1258     // If we are still reading headers check if we have crossed the headers
1259     // limit. Note that we check for >= as opposed to >. This is because if
1260     // header_length_after equals max_header_length_ and we are still in the
1261     // parse_state_  BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for
1262     // sure that the headers limit will be crossed later on
1263     if ((parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE)) {
1264       // Note that headers_ is valid only if we are still reading headers.
1265       const size_t header_length_after =
1266           headers_->GetReadableBytesFromHeaderStream();
1267       if (header_length_after >= max_header_length_) {
1268         parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1269         last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1270         visitor_->HandleHeaderError(this);
1271       }
1272     }
1273     goto bottom;
1274   } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ ||
1275              parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1276     // Can do nothing more 'till we're reset.
1277     goto bottom;
1278   }
1279 
1280   while (current < end) {
1281     switch (parse_state_) {
1282  label_reading_chunk_length:
1283       case BalsaFrameEnums::READING_CHUNK_LENGTH:
1284         // In this state we read the chunk length.
1285         // Note that once we hit a character which is not in:
1286         // [0-9;A-Fa-f\n], we transition to a different state.
1287         //
1288         {
1289           // If we used strtol, etc, we'd have to buffer this line.
1290           // This is more annoying than simply doing the conversion
1291           // here. This code accounts for overflow.
1292           static const signed char buf[] = {
1293             // %0  %1  %2  %3  %4  %5  %6  %7  %8  \t  \n  %b  %c  \r  %e  %f
1294                -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1,
1295             // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f
1296                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1297             // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f
1298                -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1299             // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f
1300                 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -2, -1, -1, -1, -1,
1301             // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f
1302                -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1303             // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f
1304                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1305             // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f
1306                -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1307             // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f
1308                -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1309           };
1310           // valid cases:
1311           //  "09123\n"                      // -> 09123
1312           //  "09123\r\n"                    // -> 09123
1313           //  "09123  \n"                    // -> 09123
1314           //  "09123  \r\n"                  // -> 09123
1315           //  "09123  12312\n"               // -> 09123
1316           //  "09123  12312\r\n"             // -> 09123
1317           //  "09123; foo=bar\n"             // -> 09123
1318           //  "09123; foo=bar\r\n"           // -> 09123
1319           //  "FFFFFFFFFFFFFFFF\r\n"         // -> FFFFFFFFFFFFFFFF
1320           //  "FFFFFFFFFFFFFFFF 22\r\n"      // -> FFFFFFFFFFFFFFFF
1321           // invalid cases:
1322           // "[ \t]+[^\n]*\n"
1323           // "FFFFFFFFFFFFFFFFF\r\n"  (would overflow)
1324           // "\r\n"
1325           // "\n"
1326           while (current < end) {
1327             const char c = *current;
1328             ++current;
1329             const signed char addition = buf[static_cast<int>(c)];
1330             if (addition >= 0) {
1331               chunk_length_character_extracted_ = true;
1332               size_t length_x_16 = chunk_length_remaining_ * 16;
1333               const size_t kMaxDiv16 = std::numeric_limits<size_t>::max() / 16;
1334               if ((chunk_length_remaining_ > kMaxDiv16) ||
1335                   ((std::numeric_limits<size_t>::max() - length_x_16) <
1336                    static_cast<size_t>(addition))) {
1337                 // overflow -- asked for a chunk-length greater than 2^64 - 1!!
1338                 parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1339                 last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW;
1340                 visitor_->ProcessBodyInput(on_entry, current - on_entry);
1341                 visitor_->HandleChunkingError(this);
1342                 goto bottom;
1343               }
1344               chunk_length_remaining_ = length_x_16 + addition;
1345               continue;
1346             }
1347 
1348             if (!chunk_length_character_extracted_ || addition == -1) {
1349               // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no
1350               // characters were converted, or an unexpected character was
1351               // seen.
1352               parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1353               last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH;
1354               visitor_->ProcessBodyInput(on_entry, current - on_entry);
1355               visitor_->HandleChunkingError(this);
1356               goto bottom;
1357             }
1358 
1359             --current;
1360             parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION;
1361             visitor_->ProcessChunkLength(chunk_length_remaining_);
1362             goto label_reading_chunk_extension;
1363           }
1364         }
1365         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1366         goto bottom;  // case BalsaFrameEnums::READING_CHUNK_LENGTH
1367 
1368  label_reading_chunk_extension:
1369       case BalsaFrameEnums::READING_CHUNK_EXTENSION:
1370         {
1371           // TODO(phython): Convert this scanning to be 16 bytes at a time if
1372           // there is data to be read.
1373           const char* extensions_start = current;
1374           size_t extensions_length = 0;
1375           while (current < end) {
1376             const char c = *current;
1377             if (c == '\r' || c == '\n') {
1378               extensions_length =
1379                   (extensions_start == current) ?
1380                   0 :
1381                   current - extensions_start - 1;
1382             }
1383 
1384             ++current;
1385             if (c == '\n') {
1386               chunk_length_character_extracted_ = false;
1387               visitor_->ProcessChunkExtensions(
1388                   extensions_start, extensions_length);
1389               if (chunk_length_remaining_ != 0) {
1390                 parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA;
1391                 goto label_reading_chunk_data;
1392               }
1393               HeaderFramingFound('\n');
1394               parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM;
1395               goto label_reading_last_chunk_term;
1396             }
1397           }
1398           visitor_->ProcessChunkExtensions(
1399               extensions_start, extensions_length);
1400         }
1401 
1402         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1403         goto bottom;  // case BalsaFrameEnums::READING_CHUNK_EXTENSION
1404 
1405  label_reading_chunk_data:
1406       case BalsaFrameEnums::READING_CHUNK_DATA:
1407         while (current < end) {
1408           if (chunk_length_remaining_ == 0) {
1409             break;
1410           }
1411           // read in the chunk
1412           size_t bytes_remaining = end - current;
1413           size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ?
1414             chunk_length_remaining_ : bytes_remaining;
1415           const char* tmp_current = current + consumed_bytes;
1416           visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry);
1417           visitor_->ProcessBodyData(current, consumed_bytes);
1418           on_entry = current = tmp_current;
1419           chunk_length_remaining_ -= consumed_bytes;
1420         }
1421         if (chunk_length_remaining_ == 0) {
1422           parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1423           goto label_reading_chunk_term;
1424         }
1425         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1426         goto bottom;  // case BalsaFrameEnums::READING_CHUNK_DATA
1427 
1428  label_reading_chunk_term:
1429       case BalsaFrameEnums::READING_CHUNK_TERM:
1430         while (current < end) {
1431           const char c = *current;
1432           ++current;
1433 
1434           if (c == '\n') {
1435             parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
1436             goto label_reading_chunk_length;
1437           }
1438         }
1439         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1440         goto bottom;  // case BalsaFrameEnums::READING_CHUNK_TERM
1441 
1442  label_reading_last_chunk_term:
1443       case BalsaFrameEnums::READING_LAST_CHUNK_TERM:
1444         while (current < end) {
1445           const char c = *current;
1446 
1447           if (!HeaderFramingFound(c)) {
1448             // If not, however, since the spec only suggests that the
1449             // client SHOULD indicate the presence of trailers, we get to
1450             // *test* that they did or didn't.
1451             // If all of the bytes we've seen since:
1452             //   OPTIONAL_WS 0 OPTIONAL_STUFF CRLF
1453             // are either '\r', or '\n', then we can assume that we don't yet
1454             // know if we need to parse headers, or if the next byte will make
1455             // the HeaderFramingFound condition (above) true.
1456             if (HeaderFramingMayBeFound()) {
1457               // If true, then we have seen only characters '\r' or '\n'.
1458               ++current;
1459 
1460               // Let's try again! There is no state change here.
1461               continue;
1462             } else {
1463               // If (!HeaderFramingMayBeFound()), then we know that we must be
1464               // reading the first non CRLF character of a trailer.
1465               parse_state_ = BalsaFrameEnums::READING_TRAILER;
1466               visitor_->ProcessBodyInput(on_entry, current - on_entry);
1467               on_entry = current;
1468               goto label_reading_trailer;
1469             }
1470           } else {
1471             // If we've found a "\r\n\r\n", then the message
1472             // is done.
1473             ++current;
1474             parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1475             visitor_->ProcessBodyInput(on_entry, current - on_entry);
1476             visitor_->MessageDone();
1477             goto bottom;
1478           }
1479           break;  // from while loop
1480         }
1481         visitor_->ProcessBodyInput(on_entry, current - on_entry);
1482         goto bottom;  // case BalsaFrameEnums::READING_LAST_CHUNK_TERM
1483 
1484  label_reading_trailer:
1485       case BalsaFrameEnums::READING_TRAILER:
1486         while (current < end) {
1487           const char c = *current;
1488           ++current;
1489           // TODO(fenix): If we ever care about trailers as part of framing,
1490           // deal with them here (see below for part of the 'solution')
1491           // if (LineFramingFound(c)) {
1492           // trailer_lines_.push_back(make_pair(start_of_line_,
1493           //                                   trailer_length_ - 1));
1494           // start_of_line_ = trailer_length_;
1495           // }
1496           if (HeaderFramingFound(c)) {
1497             // ProcessTrailers(visitor_, &trailers_);
1498             parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1499             visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1500             visitor_->MessageDone();
1501             goto bottom;
1502           }
1503         }
1504         visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1505         break;  // case BalsaFrameEnums::READING_TRAILER
1506 
1507         // Note that there is no label:
1508         //   'label_reading_until_close'
1509         // here. This is because the state-machine exits immediately after
1510         // reading the headers instead of transitioning here (as it would
1511         // do if it was consuming all the data it could, all the time).
1512       case BalsaFrameEnums::READING_UNTIL_CLOSE:
1513         {
1514           const size_t bytes_remaining = end - current;
1515           if (bytes_remaining > 0) {
1516             visitor_->ProcessBodyInput(current, bytes_remaining);
1517             visitor_->ProcessBodyData(current, bytes_remaining);
1518             current += bytes_remaining;
1519           }
1520         }
1521         goto bottom;  // case BalsaFrameEnums::READING_UNTIL_CLOSE
1522 
1523         // label_reading_content:
1524       case BalsaFrameEnums::READING_CONTENT:
1525 #if DEBUGFRAMER
1526         LOG(INFO) << "ReadingContent: " << content_length_remaining_;
1527 #endif  // DEBUGFRAMER
1528         while (content_length_remaining_ && current < end) {
1529           // read in the content
1530           const size_t bytes_remaining = end - current;
1531           const size_t consumed_bytes =
1532             (content_length_remaining_ < bytes_remaining) ?
1533             content_length_remaining_ : bytes_remaining;
1534           visitor_->ProcessBodyInput(current, consumed_bytes);
1535           visitor_->ProcessBodyData(current, consumed_bytes);
1536           current += consumed_bytes;
1537           content_length_remaining_ -= consumed_bytes;
1538         }
1539         if (content_length_remaining_ == 0) {
1540           parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1541           visitor_->MessageDone();
1542         }
1543         goto bottom;  // case BalsaFrameEnums::READING_CONTENT
1544 
1545       default:
1546         // The state-machine should never be in a state that isn't handled
1547         // above.  This is a glaring logic error, and we should do something
1548         // drastic to ensure that this gets looked-at and fixed.
1549         LOG(FATAL) << "Unknown state: " << parse_state_  // COV_NF_LINE
1550           << " memory corruption?!";                     // COV_NF_LINE
1551     }
1552   }
1553  bottom:
1554 #if DEBUGFRAMER
1555   LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n"
1556     << std::string(input, current)
1557     << "\n$$$$$$$$$$$$$$"
1558     << BalsaFrameEnums::ParseStateToString(parse_state_)
1559     << "$$$$$$$$$$$$$$$"
1560     << " consumed: " << (current - input);
1561   if (Error()) {
1562     LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode());
1563   }
1564 #endif  // DEBUGFRAMER
1565   return current - input;
1566 }
1567 
// Namespace-scope definitions of BalsaFrame's static uint32 constants.
// No initializers appear here; presumably the values come from in-class
// initializers in balsa_frame.h (not visible from this file) -- TODO confirm.
// Before C++17, a static const data member that is odr-used (for example,
// bound to a const reference) still needs a definition like these.
1568 const uint32 BalsaFrame::kValidTerm1;
1569 const uint32 BalsaFrame::kValidTerm1Mask;
1570 const uint32 BalsaFrame::kValidTerm2;
1571 const uint32 BalsaFrame::kValidTerm2Mask;
1572 
1573 }  // namespace net
1574 
1575