1#!/usr/bin/env python 2# Copyright 2010 Google Inc. All Rights Reserved. 3# 4# Licensed under the Apache License, Version 2.0 (the "License"); 5# you may not use this file except in compliance with the License. 6# You may obtain a copy of the License at 7# 8# http://www.apache.org/licenses/LICENSE-2.0 9# 10# Unless required by applicable law or agreed to in writing, software 11# distributed under the License is distributed on an "AS IS" BASIS, 12# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13# See the License for the specific language governing permissions and 14# limitations under the License. 15 16"""View and edit HTTP Archives. 17 18To list all URLs in an archive: 19 $ ./httparchive.py ls archive.wpr 20 21To view the content of all URLs from example.com: 22 $ ./httparchive.py cat --host example.com archive.wpr 23 24To view the content of a particular URL: 25 $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr 26 27To view the content of all URLs: 28 $ ./httparchive.py cat archive.wpr 29 30To edit a particular URL: 31 $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr 32 33To print statistics of an archive: 34 $ ./httparchive.py stats archive.wpr 35 36To print statistics of a set of URLs: 37 $ ./httparchive.py stats --host www.example.com archive.wpr 38 39To merge multiple archives 40 $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ... 41""" 42 43import calendar 44import certutils 45import cPickle 46import difflib 47import email.utils 48import httplib 49import httpzlib 50import json 51import logging 52import optparse 53import os 54import StringIO 55import subprocess 56import sys 57import tempfile 58import time 59import urlparse 60from collections import defaultdict 61 62 63 64def LogRunTime(fn): 65 """Annotation which logs the run time of the function.""" 66 def wrapped(self, *args, **kwargs): 67 start_time = time.time() 68 try: 69 return fn(self, *args, **kwargs) 70 finally: 71 run_time = (time.time() - start_time) * 1000.0 72 logging.debug('%s: %dms', fn.__name__, run_time) 73 return wrapped 74 75 76class HttpArchiveException(Exception): 77 """Base class for all exceptions in httparchive.""" 78 pass 79 80 81class HttpArchive(dict): 82 """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values. 83 84 Attributes: 85 responses_by_host: dict of {hostname, {request: response}}. This must remain 86 in sync with the underlying dict of self. It is used as an optimization 87 so that get_requests() doesn't have to linearly search all requests in 88 the archive to find potential matches. 89 """ 90 91 def __init__(self): # pylint: disable=super-init-not-called 92 self.responses_by_host = defaultdict(dict) 93 94 def __setstate__(self, state): 95 """Influence how to unpickle. 96 97 Args: 98 state: a dictionary for __dict__ 99 """ 100 self.__dict__.update(state) 101 self.responses_by_host = defaultdict(dict) 102 for request in self: 103 self.responses_by_host[request.host][request] = self[request] 104 105 def __getstate__(self): 106 """Influence how to pickle. 107 108 Returns: 109 a dict to use for pickling 110 """ 111 state = self.__dict__.copy() 112 del state['responses_by_host'] 113 return state 114 115 def __setitem__(self, key, value): 116 super(HttpArchive, self).__setitem__(key, value) 117 if hasattr(self, 'responses_by_host'): 118 self.responses_by_host[key.host][key] = value 119 120 def __delitem__(self, key): 121 super(HttpArchive, self).__delitem__(key) 122 del self.responses_by_host[key.host][key] 123 124 def get(self, request, default=None): 125 """Return the archived response for a given request. 126 127 Does extra checking for handling some HTTP request headers. 128 129 Args: 130 request: instance of ArchivedHttpRequest 131 default: default value to return if request is not found 132 133 Returns: 134 Instance of ArchivedHttpResponse or default if no matching 135 response is found 136 """ 137 if request in self: 138 return self[request] 139 return self.get_conditional_response(request, default) 140 141 def get_conditional_response(self, request, default): 142 """Get the response based on the conditional HTTP request headers. 143 144 Args: 145 request: an ArchivedHttpRequest representing the original request. 146 default: default ArchivedHttpResponse 147 original request with matched headers removed. 148 149 Returns: 150 an ArchivedHttpResponse with a status of 200, 302 (not modified), or 151 412 (precondition failed) 152 """ 153 response = default 154 if request.is_conditional(): 155 stripped_request = request.create_request_without_conditions() 156 if stripped_request in self: 157 response = self[stripped_request] 158 if response.status == 200: 159 status = self.get_conditional_status(request, response) 160 if status != 200: 161 response = create_response(status) 162 return response 163 164 def get_conditional_status(self, request, response): 165 status = 200 166 last_modified = email.utils.parsedate( 167 response.update_date(response.get_header('last-modified'))) 168 response_etag = response.get_header('etag') 169 is_get_or_head = request.command.upper() in ('GET', 'HEAD') 170 171 match_value = request.headers.get('if-match', None) 172 if match_value: 173 if self.is_etag_match(match_value, response_etag): 174 status = 200 175 else: 176 status = 412 # precondition failed 177 none_match_value = request.headers.get('if-none-match', None) 178 if none_match_value: 179 if self.is_etag_match(none_match_value, response_etag): 180 status = 304 181 elif is_get_or_head: 182 status = 200 183 else: 184 status = 412 185 if is_get_or_head and last_modified: 186 for header in ('if-modified-since', 'if-unmodified-since'): 187 date = email.utils.parsedate(request.headers.get(header, None)) 188 if date: 189 if ((header == 'if-modified-since' and last_modified > date) or 190 (header == 'if-unmodified-since' and last_modified < date)): 191 if status != 412: 192 status = 200 193 else: 194 status = 304 # not modified 195 return status 196 197 @staticmethod 198 def is_etag_match(request_etag, response_etag): 199 """Determines whether the entity tags of the request/response matches. 200 201 Args: 202 request_etag: the value string of the "if-(none)-match:" 203 portion of the request header 204 response_etag: the etag value of the response 205 206 Returns: 207 True on match, False otherwise 208 """ 209 response_etag = response_etag.strip('" ') 210 for etag in request_etag.split(','): 211 etag = etag.strip('" ') 212 if etag in ('*', response_etag): 213 return True 214 return False 215 216 def get_requests(self, command=None, host=None, full_path=None, is_ssl=None, 217 use_query=True): 218 """Return a list of requests that match the given args.""" 219 if host: 220 return [r for r in self.responses_by_host[host] 221 if r.matches(command, None, full_path, is_ssl, 222 use_query=use_query)] 223 else: 224 return [r for r in self 225 if r.matches(command, host, full_path, is_ssl, 226 use_query=use_query)] 227 228 def ls(self, command=None, host=None, full_path=None): 229 """List all URLs that match given params.""" 230 return ''.join(sorted( 231 '%s\n' % r for r in self.get_requests(command, host, full_path))) 232 233 def cat(self, command=None, host=None, full_path=None): 234 """Print the contents of all URLs that match given params.""" 235 out = StringIO.StringIO() 236 for request in self.get_requests(command, host, full_path): 237 print >>out, str(request) 238 print >>out, 'Untrimmed request headers:' 239 for k in request.headers: 240 print >>out, ' %s: %s' % (k, request.headers[k]) 241 if request.request_body: 242 print >>out, request.request_body 243 print >>out, '---- Response Info', '-' * 51 244 response = self[request] 245 chunk_lengths = [len(x) for x in response.response_data] 246 print >>out, ('Status: %s\n' 247 'Reason: %s\n' 248 'Headers delay: %s\n' 249 'Untrimmed response headers:') % ( 250 response.status, response.reason, response.delays['headers']) 251 for k, v in response.original_headers: 252 print >>out, ' %s: %s' % (k, v) 253 print >>out, ('Chunk count: %s\n' 254 'Chunk lengths: %s\n' 255 'Chunk delays: %s') % ( 256 len(chunk_lengths), chunk_lengths, response.delays['data']) 257 body = response.get_data_as_text() 258 print >>out, '---- Response Data', '-' * 51 259 if body: 260 print >>out, body 261 else: 262 print >>out, '[binary data]' 263 print >>out, '=' * 70 264 return out.getvalue() 265 266 def stats(self, command=None, host=None, full_path=None): 267 """Print stats about the archive for all URLs that match given params.""" 268 matching_requests = self.get_requests(command, host, full_path) 269 if not matching_requests: 270 print 'Failed to find any requests matching given command, host, path.' 271 return 272 273 out = StringIO.StringIO() 274 stats = { 275 'Total': len(matching_requests), 276 'Domains': defaultdict(int), 277 'HTTP_response_code': defaultdict(int), 278 'content_type': defaultdict(int), 279 'Documents': defaultdict(int), 280 } 281 282 for request in matching_requests: 283 stats['Domains'][request.host] += 1 284 stats['HTTP_response_code'][self[request].status] += 1 285 286 content_type = self[request].get_header('content-type') 287 # Remove content type options for readability and higher level groupings. 288 str_content_type = str(content_type.split(';')[0] 289 if content_type else None) 290 stats['content_type'][str_content_type] += 1 291 292 # Documents are the main URL requested and not a referenced resource. 293 if str_content_type == 'text/html' and not 'referer' in request.headers: 294 stats['Documents'][request.host] += 1 295 296 print >>out, json.dumps(stats, indent=4) 297 return out.getvalue() 298 299 def merge(self, merged_archive=None, other_archives=None): 300 """Merge multiple archives into merged_archive by 'chaining' resources, 301 only resources that are not part of the accumlated archive are added""" 302 if not other_archives: 303 print 'No archives passed to merge' 304 return 305 306 # Note we already loaded 'replay_file'. 307 print 'Loaded %d responses' % len(self) 308 309 for archive in other_archives: 310 if not os.path.exists(archive): 311 print 'Error: Replay file "%s" does not exist' % archive 312 return 313 314 http_archive_other = HttpArchive.Load(archive) 315 print 'Loaded %d responses from %s' % (len(http_archive_other), archive) 316 for r in http_archive_other: 317 # Only resources that are not already part of the current archive 318 # get added. 319 if r not in self: 320 print '\t %s ' % r 321 self[r] = http_archive_other[r] 322 self.Persist('%s' % merged_archive) 323 324 def edit(self, command=None, host=None, full_path=None): 325 """Edits the single request which matches given params.""" 326 editor = os.getenv('EDITOR') 327 if not editor: 328 print 'You must set the EDITOR environmental variable.' 329 return 330 331 matching_requests = self.get_requests(command, host, full_path) 332 if not matching_requests: 333 print ('Failed to find any requests matching given command, host, ' 334 'full_path.') 335 return 336 337 if len(matching_requests) > 1: 338 print 'Found multiple matching requests. Please refine.' 339 print self.ls(command, host, full_path) 340 341 response = self[matching_requests[0]] 342 tmp_file = tempfile.NamedTemporaryFile(delete=False) 343 tmp_file.write(response.get_response_as_text()) 344 tmp_file.close() 345 subprocess.check_call([editor, tmp_file.name]) 346 response.set_response_from_text(''.join(open(tmp_file.name).readlines())) 347 os.remove(tmp_file.name) 348 349 def find_closest_request(self, request, use_path=False): 350 """Find the closest matching request in the archive to the given request. 351 352 Args: 353 request: an ArchivedHttpRequest 354 use_path: If True, closest matching request's path component must match. 355 (Note: this refers to the 'path' component within the URL, not the 356 'full path' which includes the query string component.) 357 358 If use_path=True, candidate will NOT match in example below 359 e.g. request = GET www.test.com/a?p=1 360 candidate = GET www.test.com/b?p=1 361 362 Even if use_path=False, urls with same paths are always favored. 363 For example, candidate1 is considered a better match than candidate2. 364 request = GET www.test.com/a?p=1&q=2&r=3 365 candidate1 = GET www.test.com/a?s=4 366 candidate2 = GET www.test.com/b?p=1&q=2&r=3 367 368 Returns: 369 If a close match is found, return the instance of ArchivedHttpRequest. 370 Otherwise, return None. 371 """ 372 # Start with strictest constraints. This trims search space considerably. 373 requests = self.get_requests(request.command, request.host, 374 request.full_path, is_ssl=request.is_ssl, 375 use_query=True) 376 # Relax constraint: use_query if there is no match. 377 if not requests: 378 requests = self.get_requests(request.command, request.host, 379 request.full_path, is_ssl=request.is_ssl, 380 use_query=False) 381 # Relax constraint: full_path if there is no match and use_path=False. 382 if not requests and not use_path: 383 requests = self.get_requests(request.command, request.host, 384 None, is_ssl=request.is_ssl, 385 use_query=False) 386 387 if not requests: 388 return None 389 390 if len(requests) == 1: 391 return requests[0] 392 393 matcher = difflib.SequenceMatcher(b=request.cmp_seq) 394 395 # quick_ratio() is cheap to compute, but ratio() is expensive. So we call 396 # quick_ratio() on all requests, sort them descending, and then loop through 397 # until we find a candidate whose ratio() is >= the next quick_ratio(). 398 # This works because quick_ratio() is guaranteed to be an upper bound on 399 # ratio(). 400 candidates = [] 401 for candidate in requests: 402 matcher.set_seq1(candidate.cmp_seq) 403 candidates.append((matcher.quick_ratio(), candidate)) 404 405 candidates.sort(reverse=True, key=lambda c: c[0]) 406 407 best_match = (0, None) 408 for i in xrange(len(candidates)): 409 matcher.set_seq1(candidates[i][1].cmp_seq) 410 best_match = max(best_match, (matcher.ratio(), candidates[i][1])) 411 if i + 1 < len(candidates) and best_match[0] >= candidates[i+1][0]: 412 break 413 return best_match[1] 414 415 def diff(self, request): 416 """Diff the given request to the closest matching request in the archive. 417 418 Args: 419 request: an ArchivedHttpRequest 420 Returns: 421 If a close match is found, return a textual diff between the requests. 422 Otherwise, return None. 423 """ 424 request_lines = request.formatted_request.split('\n') 425 closest_request = self.find_closest_request(request) 426 if closest_request: 427 closest_request_lines = closest_request.formatted_request.split('\n') 428 return '\n'.join(difflib.ndiff(closest_request_lines, request_lines)) 429 return None 430 431 def get_server_cert(self, host): 432 """Gets certificate from the server and stores it in archive""" 433 request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {}) 434 if request not in self: 435 self[request] = create_response(200, body=certutils.get_host_cert(host)) 436 return self[request].response_data[0] 437 438 def get_certificate(self, host): 439 request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {}) 440 if request not in self: 441 self[request] = create_response(200, body=self._generate_cert(host)) 442 return self[request].response_data[0] 443 444 @classmethod 445 def AssertWritable(cls, filename): 446 """Raises an IOError if filename is not writable.""" 447 persist_dir = os.path.dirname(os.path.abspath(filename)) 448 if not os.path.exists(persist_dir): 449 raise IOError('Directory does not exist: %s' % persist_dir) 450 if os.path.exists(filename): 451 if not os.access(filename, os.W_OK): 452 raise IOError('Need write permission on file: %s' % filename) 453 elif not os.access(persist_dir, os.W_OK): 454 raise IOError('Need write permission on directory: %s' % persist_dir) 455 456 @classmethod 457 def Load(cls, filename): 458 """Load an instance from filename.""" 459 return cPickle.load(open(filename, 'rb')) 460 461 def Persist(self, filename): 462 """Persist all state to filename.""" 463 try: 464 original_checkinterval = sys.getcheckinterval() 465 sys.setcheckinterval(2**31-1) # Lock out other threads so nothing can 466 # modify |self| during pickling. 467 pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL) 468 finally: 469 sys.setcheckinterval(original_checkinterval) 470 with open(filename, 'wb') as f: 471 f.write(pickled_self) 472 473 474class ArchivedHttpRequest(object): 475 """Record all the state that goes into a request. 476 477 ArchivedHttpRequest instances are considered immutable so they can 478 serve as keys for HttpArchive instances. 479 (The immutability is not enforced.) 480 481 Upon creation, the headers are "trimmed" (i.e. edited or dropped) 482 and saved to self.trimmed_headers to allow requests to match in a wider 483 variety of playback situations (e.g. using different user agents). 484 485 For unpickling, 'trimmed_headers' is recreated from 'headers'. That 486 allows for changes to the trim function and can help with debugging. 487 """ 488 CONDITIONAL_HEADERS = [ 489 'if-none-match', 'if-match', 490 'if-modified-since', 'if-unmodified-since'] 491 492 def __init__(self, command, host, full_path, request_body, headers, 493 is_ssl=False): 494 """Initialize an ArchivedHttpRequest. 495 496 Args: 497 command: a string (e.g. 'GET' or 'POST'). 498 host: a host name (e.g. 'www.google.com'). 499 full_path: a request path. Includes everything after the host & port in 500 the URL (e.g. '/search?q=dogs'). 501 request_body: a request body string for a POST or None. 502 headers: {key: value, ...} where key and value are strings. 503 is_ssl: a boolean which is True iff request is make via SSL. 504 """ 505 self.command = command 506 self.host = host 507 self.full_path = full_path 508 parsed_url = urlparse.urlparse(full_path) if full_path else None 509 self.path = parsed_url.path if parsed_url else None 510 self.request_body = request_body 511 self.headers = headers 512 self.is_ssl = is_ssl 513 self.trimmed_headers = self._TrimHeaders(headers) 514 self.formatted_request = self._GetFormattedRequest() 515 self.cmp_seq = self._GetCmpSeq(parsed_url.query if parsed_url else None) 516 517 def __str__(self): 518 scheme = 'https' if self.is_ssl else 'http' 519 return '%s %s://%s%s %s' % ( 520 self.command, scheme, self.host, self.full_path, self.trimmed_headers) 521 522 def __repr__(self): 523 return repr((self.command, self.host, self.full_path, self.request_body, 524 self.trimmed_headers, self.is_ssl)) 525 526 def __hash__(self): 527 """Return a integer hash to use for hashed collections including dict.""" 528 return hash(repr(self)) 529 530 def __eq__(self, other): 531 """Define the __eq__ method to match the hash behavior.""" 532 return repr(self) == repr(other) 533 534 def __setstate__(self, state): 535 """Influence how to unpickle. 536 537 "headers" are the original request headers. 538 "trimmed_headers" are the trimmed headers used for matching requests 539 during replay. 540 541 Args: 542 state: a dictionary for __dict__ 543 """ 544 if 'full_headers' in state: 545 # Fix older version of archive. 546 state['headers'] = state['full_headers'] 547 del state['full_headers'] 548 if 'headers' not in state: 549 raise HttpArchiveException( 550 'Archived HTTP request is missing "headers". The HTTP archive is' 551 ' likely from a previous version and must be re-recorded.') 552 if 'path' in state: 553 # before, 'path' and 'path_without_query' were used and 'path' was 554 # pickled. Now, 'path' has been renamed to 'full_path' and 555 # 'path_without_query' has been renamed to 'path'. 'full_path' is 556 # pickled, but 'path' is not. If we see 'path' here it means we are 557 # dealing with an older archive. 558 state['full_path'] = state['path'] 559 del state['path'] 560 state['trimmed_headers'] = self._TrimHeaders(dict(state['headers'])) 561 if 'is_ssl' not in state: 562 state['is_ssl'] = False 563 self.__dict__.update(state) 564 parsed_url = urlparse.urlparse(self.full_path) 565 self.path = parsed_url.path 566 self.formatted_request = self._GetFormattedRequest() 567 self.cmp_seq = self._GetCmpSeq(parsed_url.query) 568 569 def __getstate__(self): 570 """Influence how to pickle. 571 572 Returns: 573 a dict to use for pickling 574 """ 575 state = self.__dict__.copy() 576 del state['trimmed_headers'] 577 del state['path'] 578 del state['formatted_request'] 579 del state['cmp_seq'] 580 return state 581 582 def _GetFormattedRequest(self): 583 """Format request to make diffs easier to read. 584 585 Returns: 586 A string consisting of the request. Example: 587 'GET www.example.com/path\nHeader-Key: header value\n' 588 """ 589 parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)] 590 if self.request_body: 591 parts.append('%s\n' % self.request_body) 592 for k, v in self.trimmed_headers: 593 k = '-'.join(x.capitalize() for x in k.split('-')) 594 parts.append('%s: %s\n' % (k, v)) 595 return ''.join(parts) 596 597 def _GetCmpSeq(self, query=None): 598 """Compute a sequence out of query and header for difflib to compare. 599 For example: 600 [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')] 601 will be returned for a request with URL: 602 http://example.com/index.html?q1=a2&q2=a2 603 and header: 604 k1: v1 605 k2: v2 606 607 Args: 608 query: the query string in the URL. 609 610 Returns: 611 A sequence for difflib to compare. 612 """ 613 if not query: 614 return self.trimmed_headers 615 return sorted(urlparse.parse_qsl(query)) + self.trimmed_headers 616 617 def matches(self, command=None, host=None, full_path=None, is_ssl=None, 618 use_query=True): 619 """Returns true iff the request matches all parameters. 620 621 Args: 622 command: a string (e.g. 'GET' or 'POST'). 623 host: a host name (e.g. 'www.google.com'). 624 full_path: a request path with query string (e.g. '/search?q=dogs') 625 is_ssl: whether the request is secure. 626 use_query: 627 If use_query is True, request matching uses both the hierarchical path 628 and query string component. 629 If use_query is False, request matching only uses the hierarchical path 630 631 e.g. req1 = GET www.test.com/index?aaaa 632 req2 = GET www.test.com/index?bbbb 633 634 If use_query is True, req1.matches(req2) evaluates to False 635 If use_query is False, req1.matches(req2) evaluates to True 636 637 Returns: 638 True iff the request matches all parameters 639 """ 640 if command is not None and command != self.command: 641 return False 642 if is_ssl is not None and is_ssl != self.is_ssl: 643 return False 644 if host is not None and host != self.host: 645 return False 646 if full_path is None: 647 return True 648 if use_query: 649 return full_path == self.full_path 650 else: 651 return self.path == urlparse.urlparse(full_path).path 652 653 @classmethod 654 def _TrimHeaders(cls, headers): 655 """Removes headers that are known to cause problems during replay. 656 657 These headers are removed for the following reasons: 658 - accept: Causes problems with www.bing.com. During record, CSS is fetched 659 with *. During replay, it's text/css. 660 - accept-charset, accept-language, referer: vary between clients. 661 - cache-control: sometimes sent from Chrome with 'max-age=0' as value. 662 - connection, method, scheme, url, version: Cause problems with spdy. 663 - cookie: Extremely sensitive to request/response order. 664 - keep-alive: Doesn't affect the content of the request, only some 665 transient state of the transport layer. 666 - user-agent: Changes with every Chrome version. 667 - proxy-connection: Sent for proxy requests. 668 - x-chrome-variations, x-client-data: Unique to each Chrome binary. Used by 669 Google to collect statistics about Chrome's enabled features. 670 671 Another variant to consider is dropping only the value from the header. 672 However, this is particularly bad for the cookie header, because the 673 presence of the cookie depends on the responses we've seen when the request 674 is made. 675 676 Args: 677 headers: {header_key: header_value, ...} 678 679 Returns: 680 [(header_key, header_value), ...] # (with undesirable headers removed) 681 """ 682 # TODO(tonyg): Strip sdch from the request headers because we can't 683 # guarantee that the dictionary will be recorded, so replay may not work. 684 if 'accept-encoding' in headers: 685 accept_encoding = headers['accept-encoding'] 686 accept_encoding = accept_encoding.replace('sdch', '') 687 # Strip lzma so Opera's requests matches archives recorded using Chrome. 688 accept_encoding = accept_encoding.replace('lzma', '') 689 stripped_encodings = [e.strip() for e in accept_encoding.split(',')] 690 accept_encoding = ','.join(filter(bool, stripped_encodings)) 691 headers['accept-encoding'] = accept_encoding 692 undesirable_keys = [ 693 'accept', 'accept-charset', 'accept-language', 'cache-control', 694 'connection', 'cookie', 'keep-alive', 'method', 695 'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection', 696 'x-chrome-variations', 'x-client-data'] 697 return sorted([(k, v) for k, v in headers.items() 698 if k.lower() not in undesirable_keys]) 699 700 def is_conditional(self): 701 """Return list of headers that match conditional headers.""" 702 for header in self.CONDITIONAL_HEADERS: 703 if header in self.headers: 704 return True 705 return False 706 707 def create_request_without_conditions(self): 708 stripped_headers = dict((k, v) for k, v in self.headers.iteritems() 709 if k.lower() not in self.CONDITIONAL_HEADERS) 710 return ArchivedHttpRequest( 711 self.command, self.host, self.full_path, self.request_body, 712 stripped_headers, self.is_ssl) 713 714class ArchivedHttpResponse(object): 715 """All the data needed to recreate all HTTP response. 716 717 Upon creation, the headers are "trimmed" (i.e. edited or dropped). 718 The original headers are saved to self.original_headers, while the 719 trimmed ones are used to allow responses to match in a wider variety 720 of playback situations. 721 722 For pickling, 'original_headers' are stored in the archive. For unpickling 723 the headers are trimmed again. That allows for changes to the trim 724 function and can help with debugging. 725 """ 726 727 # CHUNK_EDIT_SEPARATOR is used to edit and view text content. 728 # It is not sent in responses. It is added by get_data_as_text() 729 # and removed by set_data(). 730 CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]' 731 732 # DELAY_EDIT_SEPARATOR is used to edit and view server delays. 733 DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- ' 734 'Delays are above. Response content is below.]\n') 735 736 def __init__(self, version, status, reason, headers, response_data, 737 delays=None): 738 """Initialize an ArchivedHttpResponse. 739 740 Args: 741 version: HTTP protocol version used by server. 742 10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib). 743 status: Status code returned by server (e.g. 200). 744 reason: Reason phrase returned by server (e.g. "OK"). 745 headers: list of (header, value) tuples. 746 response_data: list of content chunks. 747 Concatenating the chunks gives the complete contents 748 (i.e. the chunks do not have any lengths or delimiters). 749 Do not include the final, zero-length chunk that marks the end. 750 delays: dict of (ms) delays for 'connect', 'headers' and 'data'. 751 e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]} 752 connect - The time to connect to the server. 753 Each resource has a value because Replay's record mode captures it. 754 This includes the time for the SYN and SYN/ACK (1 rtt). 755 headers - The time elapsed between the TCP connect and the headers. 756 This typically includes all the server-time to generate a response. 757 data - If the response is chunked, these are the times for each chunk. 758 """ 759 self.version = version 760 self.status = status 761 self.reason = reason 762 self.original_headers = headers 763 self.headers = self._TrimHeaders(headers) 764 self.response_data = response_data 765 self.delays = delays 766 self.fix_delays() 767 768 def fix_delays(self): 769 """Initialize delays, or check the number of data delays.""" 770 expected_num_delays = len(self.response_data) 771 if not self.delays: 772 self.delays = { 773 'connect': 0, 774 'headers': 0, 775 'data': [0] * expected_num_delays 776 } 777 else: 778 num_delays = len(self.delays['data']) 779 if num_delays != expected_num_delays: 780 raise HttpArchiveException( 781 'Server delay length mismatch: %d (expected %d): %s', 782 num_delays, expected_num_delays, self.delays['data']) 783 784 @classmethod 785 def _TrimHeaders(cls, headers): 786 """Removes headers that are known to cause problems during replay. 787 788 These headers are removed for the following reasons: 789 - content-security-policy: Causes problems with script injection. 790 """ 791 undesirable_keys = ['content-security-policy'] 792 return [(k, v) for k, v in headers if k.lower() not in undesirable_keys] 793 794 def __repr__(self): 795 return repr((self.version, self.status, self.reason, sorted(self.headers), 796 self.response_data)) 797 798 def __hash__(self): 799 """Return a integer hash to use for hashed collections including dict.""" 800 return hash(repr(self)) 801 802 def __eq__(self, other): 803 """Define the __eq__ method to match the hash behavior.""" 804 return repr(self) == repr(other) 805 806 def __setstate__(self, state): 807 """Influence how to unpickle. 808 809 "original_headers" are the original request headers. 810 "headers" are the trimmed headers used for replaying responses. 811 812 Args: 813 state: a dictionary for __dict__ 814 """ 815 if 'server_delays' in state: 816 state['delays'] = { 817 'connect': 0, 818 'headers': 0, 819 'data': state['server_delays'] 820 } 821 del state['server_delays'] 822 elif 'delays' not in state: 823 state['delays'] = None 824 state['original_headers'] = state['headers'] 825 state['headers'] = self._TrimHeaders(state['original_headers']) 826 self.__dict__.update(state) 827 self.fix_delays() 828 829 def __getstate__(self): 830 """Influence how to pickle. 831 832 Returns: 833 a dict to use for pickling 834 """ 835 state = self.__dict__.copy() 836 state['headers'] = state['original_headers'] 837 del state['original_headers'] 838 return state 839 840 def get_header(self, key, default=None): 841 for k, v in self.headers: 842 if key.lower() == k.lower(): 843 return v 844 return default 845 846 def set_header(self, key, value): 847 for i, (k, v) in enumerate(self.headers): 848 if key == k: 849 self.headers[i] = (key, value) 850 return 851 self.headers.append((key, value)) 852 853 def remove_header(self, key): 854 for i, (k, v) in enumerate(self.headers): 855 if key.lower() == k.lower(): 856 self.headers.pop(i) 857 return 858 859 @staticmethod 860 def _get_epoch_seconds(date_str): 861 """Return the epoch seconds of a date header. 862 863 Args: 864 date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT") 865 Returns: 866 epoch seconds as a float 867 """ 868 date_tuple = email.utils.parsedate(date_str) 869 if date_tuple: 870 return calendar.timegm(date_tuple) 871 return None 872 873 def update_date(self, date_str, now=None): 874 """Return an updated date based on its delta from the "Date" header. 875 876 For example, if |date_str| is one week later than the "Date" header, 877 then the returned date string is one week later than the current date. 878 879 Args: 880 date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT") 881 Returns: 882 a date string 883 """ 884 date_seconds = self._get_epoch_seconds(self.get_header('date')) 885 header_seconds = self._get_epoch_seconds(date_str) 886 if date_seconds and header_seconds: 887 updated_seconds = header_seconds + (now or time.time()) - date_seconds 888 return email.utils.formatdate(updated_seconds, usegmt=True) 889 return date_str 890 891 def is_gzip(self): 892 return self.get_header('content-encoding') == 'gzip' 893 894 def is_compressed(self): 895 return self.get_header('content-encoding') in ('gzip', 'deflate') 896 897 def is_chunked(self): 898 return self.get_header('transfer-encoding') == 'chunked' 899 900 def get_data_as_chunks(self): 901 """Return content as a list of strings, each corresponding to a chunk. 902 903 Uncompresses the chunks, if needed. 904 """ 905 content_type = self.get_header('content-type') 906 if (not content_type or 907 not (content_type.startswith('text/') or 908 content_type == 'application/x-javascript' or 909 content_type.startswith('application/json'))): 910 return None 911 if self.is_compressed(): 912 return httpzlib.uncompress_chunks(self.response_data, self.is_gzip()) 913 else: 914 return self.response_data 915 916 def get_data_as_text(self): 917 """Return content as a single string. 918 919 Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR. 920 """ 921 return self.CHUNK_EDIT_SEPARATOR.join(self.get_data_as_chunks()) 922 923 def get_delays_as_text(self): 924 """Return delays as editable text.""" 925 return json.dumps(self.delays, indent=2) 926 927 def get_response_as_text(self): 928 """Returns response content as a single string. 929 930 Server delays are separated on a per-chunk basis. Delays are in seconds. 931 Response content begins after DELAY_EDIT_SEPARATOR 932 """ 933 data = self.get_data_as_text() 934 if data is None: 935 logging.warning('Data can not be represented as text.') 936 data = '' 937 delays = self.get_delays_as_text() 938 return self.DELAY_EDIT_SEPARATOR.join((delays, data)) 939 940 def set_data_from_chunks(self, text_chunks): 941 """Inverse of get_data_as_chunks(). 942 943 Compress, if needed. 944 """ 945 if self.is_compressed(): 946 self.response_data = httpzlib.compress_chunks(text_chunks, self.is_gzip()) 947 else: 948 self.response_data = text_chunks 949 if not self.is_chunked(): 950 content_length = sum(len(c) for c in self.response_data) 951 self.set_header('content-length', str(content_length)) 952 953 def set_data(self, text): 954 """Inverse of get_data_as_text(). 955 956 Split on CHUNK_EDIT_SEPARATOR and compress if needed. 957 """ 958 self.set_data_from_chunks(text.split(self.CHUNK_EDIT_SEPARATOR)) 959 960 def set_delays(self, delays_text): 961 """Inverse of get_delays_as_text(). 962 963 Args: 964 delays_text: JSON encoded text such as the following: 965 { 966 connect: 80, 967 headers: 80, 968 data: [6, 55, 0] 969 } 970 Times are in milliseconds. 971 Each data delay corresponds with one response_data value. 972 """ 973 try: 974 self.delays = json.loads(delays_text) 975 except (ValueError, KeyError) as e: 976 logging.critical('Unable to parse delays %s: %s', delays_text, e) 977 self.fix_delays() 978 979 def set_response_from_text(self, text): 980 """Inverse of get_response_as_text(). 981 982 Modifies the state of the archive according to the textual representation. 983 """ 984 try: 985 delays, data = text.split(self.DELAY_EDIT_SEPARATOR) 986 except ValueError: 987 logging.critical( 988 'Error parsing text representation. Skipping edits.') 989 return 990 self.set_delays(delays) 991 self.set_data(data) 992 993 994def create_response(status, reason=None, headers=None, body=None): 995 """Convenience method for creating simple ArchivedHttpResponse objects.""" 996 if reason is None: 997 reason = httplib.responses.get(status, 'Unknown') 998 if headers is None: 999 headers = [('content-type', 'text/plain')] 1000 if body is None: 1001 body = "%s %s" % (status, reason) 1002 return ArchivedHttpResponse(11, status, reason, headers, [body]) 1003 1004 1005def main(): 1006 class PlainHelpFormatter(optparse.IndentedHelpFormatter): 1007 def format_description(self, description): 1008 if description: 1009 return description + '\n' 1010 else: 1011 return '' 1012 1013 option_parser = optparse.OptionParser( 1014 usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)', 1015 formatter=PlainHelpFormatter(), 1016 description=__doc__, 1017 epilog='http://code.google.com/p/web-page-replay/') 1018 1019 option_parser.add_option('-c', '--command', default=None, 1020 action='store', 1021 type='string', 1022 help='Only show URLs matching this command.') 1023 option_parser.add_option('-o', '--host', default=None, 1024 action='store', 1025 type='string', 1026 help='Only show URLs matching this host.') 1027 option_parser.add_option('-p', '--full_path', default=None, 1028 action='store', 1029 type='string', 1030 help='Only show URLs matching this full path.') 1031 option_parser.add_option('-f', '--merged_file', default=None, 1032 action='store', 1033 type='string', 1034 help='The output file to use when using the merge command.') 1035 1036 options, args = option_parser.parse_args() 1037 1038 # Merge command expects an umlimited number of archives. 1039 if len(args) < 2: 1040 print 'args: %s' % args 1041 option_parser.error('Must specify a command and replay_file') 1042 1043 command = args[0] 1044 replay_file = args[1] 1045 1046 if not os.path.exists(replay_file): 1047 option_parser.error('Replay file "%s" does not exist' % replay_file) 1048 1049 http_archive = HttpArchive.Load(replay_file) 1050 if command == 'ls': 1051 print http_archive.ls(options.command, options.host, options.full_path) 1052 elif command == 'cat': 1053 print http_archive.cat(options.command, options.host, options.full_path) 1054 elif command == 'stats': 1055 print http_archive.stats(options.command, options.host, options.full_path) 1056 elif command == 'merge': 1057 if not options.merged_file: 1058 print 'Error: Must specify a merged file name (use --merged_file)' 1059 return 1060 http_archive.merge(options.merged_file, args[2:]) 1061 elif command == 'edit': 1062 http_archive.edit(options.command, options.host, options.full_path) 1063 http_archive.Persist(replay_file) 1064 else: 1065 option_parser.error('Unknown command "%s"' % command) 1066 return 0 1067 1068 1069if __name__ == '__main__': 1070 sys.exit(main()) 1071