#!/usr/bin/env python
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""View and edit HTTP Archives.

To list all URLs in an archive:
  $ ./httparchive.py ls archive.wpr

To view the content of all URLs from example.com:
  $ ./httparchive.py cat --host example.com archive.wpr

To view the content of a particular URL:
  $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr

To view the content of all URLs:
  $ ./httparchive.py cat archive.wpr

To edit a particular URL:
  $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr

To print statistics of an archive:
  $ ./httparchive.py stats archive.wpr

To print statistics of a set of URLs:
  $ ./httparchive.py stats --host www.example.com archive.wpr

To merge multiple archives:
  $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
"""

import calendar
import certutils
import cPickle
import difflib
import email.utils
import httplib
import httpzlib
import json
import logging
import optparse
import os
import StringIO
import subprocess
import sys
import tempfile
import time
import urlparse
from collections import defaultdict


def LogRunTime(fn):
  """Decorator that logs the run time of the function."""
  def wrapped(self, *args, **kwargs):
    start_time = time.time()
    try:
      return fn(self, *args, **kwargs)
    finally:
      run_time = (time.time() - start_time) * 1000.0
      logging.debug('%s: %dms', fn.__name__, run_time)
  return wrapped
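# Illustrative usage sketch (hypothetical; not applied anywhere in this
# module): LogRunTime wraps a method so each call's elapsed time is logged
# at DEBUG level.
#
#   class Loader(object):
#     @LogRunTime
#     def load(self, path):
#       ...  # logged as 'load: <elapsed>ms'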


class HttpArchiveException(Exception):
  """Base class for all exceptions in httparchive."""
  pass


class HttpArchive(dict):
  """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.

  Attributes:
    responses_by_host: dict of {hostname: {request: response}}. This must remain
        in sync with the underlying dict of self. It is used as an optimization
        so that get_requests() doesn't have to linearly search all requests in
        the archive to find potential matches.
  """

  def __init__(self):  # pylint: disable=super-init-not-called
    self.responses_by_host = defaultdict(dict)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    self.__dict__.update(state)
    self.responses_by_host = defaultdict(dict)
    for request in self:
      self.responses_by_host[request.host][request] = self[request]

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['responses_by_host']
    return state

  def __setitem__(self, key, value):
    super(HttpArchive, self).__setitem__(key, value)
    if hasattr(self, 'responses_by_host'):
      self.responses_by_host[key.host][key] = value

  def __delitem__(self, key):
    super(HttpArchive, self).__delitem__(key)
    del self.responses_by_host[key.host][key]

  def get(self, request, default=None):
    """Return the archived response for a given request.

    Does extra checking for handling some HTTP request headers.

    Args:
      request: instance of ArchivedHttpRequest
      default: default value to return if request is not found

    Returns:
      Instance of ArchivedHttpResponse or default if no matching
      response is found
    """
    if request in self:
      return self[request]
    return self.get_conditional_response(request, default)

  def get_conditional_response(self, request, default):
    """Get the response based on the conditional HTTP request headers.

    Args:
      request: an ArchivedHttpRequest representing the original request.
      default: the default ArchivedHttpResponse to return if no archived
          response matches the request with its conditional headers removed.

    Returns:
      an ArchivedHttpResponse with a status of 200, 304 (not modified), or
          412 (precondition failed)
    """
    response = default
    if request.is_conditional():
      stripped_request = request.create_request_without_conditions()
      if stripped_request in self:
        response = self[stripped_request]
        if response.status == 200:
          status = self.get_conditional_status(request, response)
          if status != 200:
            response = create_response(status)
    return response

  def get_conditional_status(self, request, response):
    status = 200
    last_modified = email.utils.parsedate(
        response.update_date(response.get_header('last-modified')))
    response_etag = response.get_header('etag')
    is_get_or_head = request.command.upper() in ('GET', 'HEAD')

    match_value = request.headers.get('if-match', None)
    if match_value:
      if self.is_etag_match(match_value, response_etag):
        status = 200
      else:
        status = 412  # precondition failed
    none_match_value = request.headers.get('if-none-match', None)
    if none_match_value:
      if self.is_etag_match(none_match_value, response_etag):
        status = 304
      elif is_get_or_head:
        status = 200
      else:
        status = 412
    if is_get_or_head and last_modified:
      for header in ('if-modified-since', 'if-unmodified-since'):
        date = email.utils.parsedate(request.headers.get(header, None))
        if date:
          if ((header == 'if-modified-since' and last_modified > date) or
              (header == 'if-unmodified-since' and last_modified < date)):
            if status != 412:
              status = 200
          else:
            status = 304  # not modified
    return status
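  # Illustrative flow (assumed request and values): a replayed GET carrying
  # 'if-none-match: "v1"' is looked up without its conditional headers; if the
  # archived 200 response carries etag "v1", get_conditional_status() returns
  # 304 and a minimal response built by create_response(304) is served instead
  # of the archived body.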

  @staticmethod
  def is_etag_match(request_etag, response_etag):
    """Determines whether the entity tags of the request and response match.

    Args:
      request_etag: the value string of the "if-(none)-match:"
                    portion of the request header
      response_etag: the etag value of the response

    Returns:
      True on match, False otherwise
    """
    response_etag = response_etag.strip('" ')
    for etag in request_etag.split(','):
      etag = etag.strip('" ')
      if etag in ('*', response_etag):
        return True
    return False
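  # Example (illustrative values): is_etag_match('"abc", "def"', '"abc"') and
  # is_etag_match('*', '"xyz"') both return True, while
  # is_etag_match('"abc"', '"xyz"') returns False; quotes and surrounding
  # spaces are stripped before comparison.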

  def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
                   use_query=True):
    """Return a list of requests that match the given args."""
    if host:
      return [r for r in self.responses_by_host[host]
              if r.matches(command, None, full_path, is_ssl,
                           use_query=use_query)]
    else:
      return [r for r in self
              if r.matches(command, host, full_path, is_ssl,
                           use_query=use_query)]

  def ls(self, command=None, host=None, full_path=None):
    """List all URLs that match given params."""
    return ''.join(sorted(
        '%s\n' % r for r in self.get_requests(command, host, full_path)))

  def cat(self, command=None, host=None, full_path=None):
    """Print the contents of all URLs that match given params."""
    out = StringIO.StringIO()
    for request in self.get_requests(command, host, full_path):
      print >>out, str(request)
      print >>out, 'Untrimmed request headers:'
      for k in request.headers:
        print >>out, '    %s: %s' % (k, request.headers[k])
      if request.request_body:
        print >>out, request.request_body
      print >>out, '---- Response Info', '-' * 51
      response = self[request]
      chunk_lengths = [len(x) for x in response.response_data]
      print >>out, ('Status: %s\n'
                    'Reason: %s\n'
                    'Headers delay: %s\n'
                    'Untrimmed response headers:') % (
          response.status, response.reason, response.delays['headers'])
      for k, v in response.original_headers:
        print >>out, '    %s: %s' % (k, v)
      print >>out, ('Chunk count: %s\n'
                    'Chunk lengths: %s\n'
                    'Chunk delays: %s') % (
          len(chunk_lengths), chunk_lengths, response.delays['data'])
      body = response.get_data_as_text()
      print >>out, '---- Response Data', '-' * 51
      if body:
        print >>out, body
      else:
        print >>out, '[binary data]'
      print >>out, '=' * 70
    return out.getvalue()

  def stats(self, command=None, host=None, full_path=None):
    """Print stats about the archive for all URLs that match given params."""
    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print 'Failed to find any requests matching given command, host, path.'
      return

    out = StringIO.StringIO()
    stats = {
        'Total': len(matching_requests),
        'Domains': defaultdict(int),
        'HTTP_response_code': defaultdict(int),
        'content_type': defaultdict(int),
        'Documents': defaultdict(int),
        }

    for request in matching_requests:
      stats['Domains'][request.host] += 1
      stats['HTTP_response_code'][self[request].status] += 1

      content_type = self[request].get_header('content-type')
      # Remove content type options for readability and higher level groupings.
      str_content_type = str(content_type.split(';')[0]
                            if content_type else None)
      stats['content_type'][str_content_type] += 1

      # Documents are the main URL requested and not a referenced resource.
      if str_content_type == 'text/html' and 'referer' not in request.headers:
        stats['Documents'][request.host] += 1

    print >>out, json.dumps(stats, indent=4)
    return out.getvalue()

  def merge(self, merged_archive=None, other_archives=None):
    """Merge multiple archives into merged_archive by 'chaining' resources.

    Only resources that are not already part of the accumulated archive are
    added.
    """
    if not other_archives:
      print 'No archives passed to merge'
      return

    # Note we already loaded 'replay_file'.
    print 'Loaded %d responses' % len(self)

    for archive in other_archives:
      if not os.path.exists(archive):
        print 'Error: Replay file "%s" does not exist' % archive
        return

      http_archive_other = HttpArchive.Load(archive)
      print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
      for r in http_archive_other:
        # Only resources that are not already part of the current archive
        # get added.
        if r not in self:
          print '\t %s ' % r
          self[r] = http_archive_other[r]
    self.Persist('%s' % merged_archive)

  def edit(self, command=None, host=None, full_path=None):
    """Edits the single request that matches given params."""
    editor = os.getenv('EDITOR')
    if not editor:
      print 'You must set the EDITOR environment variable.'
      return

    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print ('Failed to find any requests matching given command, host, '
             'full_path.')
      return

    if len(matching_requests) > 1:
      print 'Found multiple matching requests. Please refine.'
      print self.ls(command, host, full_path)

    response = self[matching_requests[0]]
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.write(response.get_response_as_text())
    tmp_file.close()
    subprocess.check_call([editor, tmp_file.name])
    response.set_response_from_text(''.join(open(tmp_file.name).readlines()))
    os.remove(tmp_file.name)

  def find_closest_request(self, request, use_path=False):
    """Find the closest matching request in the archive to the given request.

    Args:
      request: an ArchivedHttpRequest
      use_path: If True, closest matching request's path component must match.
        (Note: this refers to the 'path' component within the URL, not the
         'full path' which includes the query string component.)

        If use_path=True, candidate will NOT match in example below
        e.g. request   = GET www.test.com/a?p=1
             candidate = GET www.test.com/b?p=1

        Even if use_path=False, urls with same paths are always favored.
        For example, candidate1 is considered a better match than candidate2.
          request    = GET www.test.com/a?p=1&q=2&r=3
          candidate1 = GET www.test.com/a?s=4
          candidate2 = GET www.test.com/b?p=1&q=2&r=3

    Returns:
      If a close match is found, return the instance of ArchivedHttpRequest.
      Otherwise, return None.
    """
    # Start with strictest constraints. This trims search space considerably.
    requests = self.get_requests(request.command, request.host,
                                 request.full_path, is_ssl=request.is_ssl,
                                 use_query=True)
    # Relax constraint: use_query if there is no match.
    if not requests:
      requests = self.get_requests(request.command, request.host,
                                   request.full_path, is_ssl=request.is_ssl,
                                   use_query=False)
    # Relax constraint: full_path if there is no match and use_path=False.
    if not requests and not use_path:
      requests = self.get_requests(request.command, request.host,
                                   None, is_ssl=request.is_ssl,
                                   use_query=False)

    if not requests:
      return None

    if len(requests) == 1:
      return requests[0]

    matcher = difflib.SequenceMatcher(b=request.cmp_seq)

    # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
    # quick_ratio() on all requests, sort them descending, and then loop through
    # until we find a candidate whose ratio() is >= the next quick_ratio().
    # This works because quick_ratio() is guaranteed to be an upper bound on
    # ratio().
    candidates = []
    for candidate in requests:
      matcher.set_seq1(candidate.cmp_seq)
      candidates.append((matcher.quick_ratio(), candidate))

    candidates.sort(reverse=True, key=lambda c: c[0])

    best_match = (0, None)
    for i in xrange(len(candidates)):
      matcher.set_seq1(candidates[i][1].cmp_seq)
      best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
      if i + 1 < len(candidates) and best_match[0] >= candidates[i+1][0]:
        break
    return best_match[1]
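  # Worked example (illustrative numbers): with sorted quick_ratios
  # [0.9, 0.8, 0.5], if the first candidate's true ratio() is 0.85, then
  # 0.85 >= 0.8 (the next quick_ratio) and the loop stops early; no further
  # ratio() calls are needed because quick_ratio() bounds ratio() from above.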

  def diff(self, request):
    """Diff the given request to the closest matching request in the archive.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      If a close match is found, return a textual diff between the requests.
      Otherwise, return None.
    """
    request_lines = request.formatted_request.split('\n')
    closest_request = self.find_closest_request(request)
    if closest_request:
      closest_request_lines = closest_request.formatted_request.split('\n')
      return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
    return None

  def get_server_cert(self, host):
    """Gets certificate from the server and stores it in the archive."""
    request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=certutils.get_host_cert(host))
    return self[request].response_data[0]

  def get_certificate(self, host):
    request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=self._generate_cert(host))
    return self[request].response_data[0]

  @classmethod
  def AssertWritable(cls, filename):
    """Raises an IOError if filename is not writable."""
    persist_dir = os.path.dirname(os.path.abspath(filename))
    if not os.path.exists(persist_dir):
      raise IOError('Directory does not exist: %s' % persist_dir)
    if os.path.exists(filename):
      if not os.access(filename, os.W_OK):
        raise IOError('Need write permission on file: %s' % filename)
    elif not os.access(persist_dir, os.W_OK):
      raise IOError('Need write permission on directory: %s' % persist_dir)

  @classmethod
  def Load(cls, filename):
    """Load an instance from filename."""
    return cPickle.load(open(filename, 'rb'))

  def Persist(self, filename):
    """Persist all state to filename."""
    try:
      original_checkinterval = sys.getcheckinterval()
      sys.setcheckinterval(2**31-1)  # Lock out other threads so nothing can
                                     # modify |self| during pickling.
      pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
    finally:
      sys.setcheckinterval(original_checkinterval)
    with open(filename, 'wb') as f:
      f.write(pickled_self)
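  # Illustrative round trip (paths are assumed): archive =
  # HttpArchive.Load('archive.wpr'), mutate the dict, then
  # archive.Persist('archive.wpr') writes the pickled state back to disk.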


class ArchivedHttpRequest(object):
  """Record all the state that goes into a request.

  ArchivedHttpRequest instances are considered immutable so they can
  serve as keys for HttpArchive instances.
  (The immutability is not enforced.)

  Upon creation, the headers are "trimmed" (i.e. edited or dropped)
  and saved to self.trimmed_headers to allow requests to match in a wider
  variety of playback situations (e.g. using different user agents).

  For unpickling, 'trimmed_headers' is recreated from 'headers'. That
  allows for changes to the trim function and can help with debugging.
  """
  CONDITIONAL_HEADERS = [
      'if-none-match', 'if-match',
      'if-modified-since', 'if-unmodified-since']

  def __init__(self, command, host, full_path, request_body, headers,
               is_ssl=False):
    """Initialize an ArchivedHttpRequest.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path.  Includes everything after the host & port in
          the URL (e.g. '/search?q=dogs').
      request_body: a request body string for a POST or None.
      headers: {key: value, ...} where key and value are strings.
      is_ssl: a boolean which is True iff the request is made via SSL.
504    """
505    self.command = command
506    self.host = host
507    self.full_path = full_path
508    parsed_url = urlparse.urlparse(full_path) if full_path else None
509    self.path = parsed_url.path if parsed_url else None
510    self.request_body = request_body
511    self.headers = headers
512    self.is_ssl = is_ssl
513    self.trimmed_headers = self._TrimHeaders(headers)
514    self.formatted_request = self._GetFormattedRequest()
515    self.cmp_seq = self._GetCmpSeq(parsed_url.query if parsed_url else None)
516
517  def __str__(self):
518    scheme = 'https' if self.is_ssl else 'http'
519    return '%s %s://%s%s %s' % (
520        self.command, scheme, self.host, self.full_path, self.trimmed_headers)
521
522  def __repr__(self):
523    return repr((self.command, self.host, self.full_path, self.request_body,
524                 self.trimmed_headers, self.is_ssl))
525
526  def __hash__(self):
527    """Return a integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "headers" are the original request headers.
    "trimmed_headers" are the trimmed headers used for matching requests
    during replay.

    Args:
      state: a dictionary for __dict__
    """
    if 'full_headers' in state:
      # Fix older version of archive.
      state['headers'] = state['full_headers']
      del state['full_headers']
    if 'headers' not in state:
      raise HttpArchiveException(
          'Archived HTTP request is missing "headers". The HTTP archive is'
          ' likely from a previous version and must be re-recorded.')
    if 'path' in state:
      # before, 'path' and 'path_without_query' were used and 'path' was
      # pickled.  Now, 'path' has been renamed to 'full_path' and
      # 'path_without_query' has been renamed to 'path'.  'full_path' is
      # pickled, but 'path' is not.  If we see 'path' here it means we are
      # dealing with an older archive.
      state['full_path'] = state['path']
      del state['path']
    state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
    if 'is_ssl' not in state:
      state['is_ssl'] = False
    self.__dict__.update(state)
    parsed_url = urlparse.urlparse(self.full_path)
    self.path = parsed_url.path
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query)

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['trimmed_headers']
    del state['path']
    del state['formatted_request']
    del state['cmp_seq']
    return state

  def _GetFormattedRequest(self):
    """Format request to make diffs easier to read.

    Returns:
      A string consisting of the request. Example:
      'GET www.example.com/path\nHeader-Key: header value\n'
    """
    parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
    if self.request_body:
      parts.append('%s\n' % self.request_body)
    for k, v in self.trimmed_headers:
      k = '-'.join(x.capitalize() for x in k.split('-'))
      parts.append('%s: %s\n' % (k, v))
    return ''.join(parts)

  def _GetCmpSeq(self, query=None):
    """Compute a sequence out of query and header for difflib to compare.

    For example:
      [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')]
    will be returned for a request with URL:
      http://example.com/index.html?q1=a1&q2=a2
    and headers:
      k1: v1
      k2: v2

    Args:
      query: the query string in the URL.

    Returns:
      A sequence for difflib to compare.
    """
    if not query:
      return self.trimmed_headers
    return sorted(urlparse.parse_qsl(query)) + self.trimmed_headers

  def matches(self, command=None, host=None, full_path=None, is_ssl=None,
              use_query=True):
    """Returns true iff the request matches all parameters.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path with query string (e.g. '/search?q=dogs')
      is_ssl: whether the request is secure.
      use_query:
        If use_query is True, request matching uses both the hierarchical path
        and query string component.
        If use_query is False, request matching only uses the hierarchical path

        e.g. req1 = GET www.test.com/index?aaaa
             req2 = GET www.test.com/index?bbbb

        If use_query is True, req1.matches(req2) evaluates to False
        If use_query is False, req1.matches(req2) evaluates to True

    Returns:
      True iff the request matches all parameters
    """
    if command is not None and command != self.command:
      return False
    if is_ssl is not None and is_ssl != self.is_ssl:
      return False
    if host is not None and host != self.host:
      return False
    if full_path is None:
      return True
    if use_query:
      return full_path == self.full_path
    else:
      return self.path == urlparse.urlparse(full_path).path

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - accept: Causes problems with www.bing.com. During record, CSS is fetched
              with *. During replay, it's text/css.
    - accept-charset, accept-language, referer: vary between clients.
    - cache-control:  sometimes sent from Chrome with 'max-age=0' as value.
    - connection, method, scheme, url, version: Cause problems with spdy.
    - cookie: Extremely sensitive to request/response order.
    - keep-alive: Doesn't affect the content of the request, only some
      transient state of the transport layer.
    - user-agent: Changes with every Chrome version.
    - proxy-connection: Sent for proxy requests.
    - x-chrome-variations, x-client-data: Unique to each Chrome binary. Used by
      Google to collect statistics about Chrome's enabled features.

    Another variant to consider is dropping only the value from the header.
    However, this is particularly bad for the cookie header, because the
    presence of the cookie depends on the responses we've seen when the request
    is made.

    Args:
      headers: {header_key: header_value, ...}

    Returns:
      [(header_key, header_value), ...]  # (with undesirable headers removed)
    """
    # TODO(tonyg): Strip sdch from the request headers because we can't
    # guarantee that the dictionary will be recorded, so replay may not work.
    if 'accept-encoding' in headers:
      accept_encoding = headers['accept-encoding']
      accept_encoding = accept_encoding.replace('sdch', '')
      # Strip lzma so Opera's requests match archives recorded using Chrome.
      accept_encoding = accept_encoding.replace('lzma', '')
      stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
      accept_encoding = ','.join(filter(bool, stripped_encodings))
      headers['accept-encoding'] = accept_encoding
    undesirable_keys = [
        'accept', 'accept-charset', 'accept-language', 'cache-control',
        'connection', 'cookie', 'keep-alive', 'method',
        'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
        'x-chrome-variations', 'x-client-data']
    return sorted([(k, v) for k, v in headers.items()
                   if k.lower() not in undesirable_keys])
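  # Illustrative example (assumed header values): _TrimHeaders(
  #     {'Host': 'www.example.com', 'User-Agent': 'Chrome/30', 'Accept': '*/*'})
  # drops 'user-agent' and 'accept' and returns the surviving pairs sorted,
  # i.e. [('Host', 'www.example.com')].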

  def is_conditional(self):
    """Return True if the request has any conditional headers."""
    for header in self.CONDITIONAL_HEADERS:
      if header in self.headers:
        return True
    return False

  def create_request_without_conditions(self):
    stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
                            if k.lower() not in self.CONDITIONAL_HEADERS)
    return ArchivedHttpRequest(
        self.command, self.host, self.full_path, self.request_body,
        stripped_headers, self.is_ssl)


class ArchivedHttpResponse(object):
  """All the data needed to recreate an HTTP response.

  Upon creation, the headers are "trimmed" (i.e. edited or dropped).
  The original headers are saved to self.original_headers, while the
  trimmed ones are used to allow responses to match in a wider variety
  of playback situations.

  For pickling, 'original_headers' are stored in the archive.  For unpickling,
  the headers are trimmed again. That allows for changes to the trim
  function and can help with debugging.
  """

  # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
  # It is not sent in responses. It is added by get_data_as_text()
  # and removed by set_data().
  CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'

  # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
  DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
                          'Delays are above. Response content is below.]\n')

  def __init__(self, version, status, reason, headers, response_data,
               delays=None):
    """Initialize an ArchivedHttpResponse.

    Args:
      version: HTTP protocol version used by server.
          10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
      status: Status code returned by server (e.g. 200).
      reason: Reason phrase returned by server (e.g. "OK").
      headers: list of (header, value) tuples.
      response_data: list of content chunks.
          Concatenating the chunks gives the complete contents
          (i.e. the chunks do not have any lengths or delimiters).
          Do not include the final, zero-length chunk that marks the end.
      delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
          e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
          connect - The time to connect to the server.
            Each resource has a value because Replay's record mode captures it.
            This includes the time for the SYN and SYN/ACK (1 rtt).
          headers - The time elapsed between the TCP connect and the headers.
            This typically includes all the server-time to generate a response.
          data - If the response is chunked, these are the times for each chunk.
    """
    self.version = version
    self.status = status
    self.reason = reason
    self.original_headers = headers
    self.headers = self._TrimHeaders(headers)
    self.response_data = response_data
    self.delays = delays
    self.fix_delays()

  def fix_delays(self):
    """Initialize delays, or check the number of data delays."""
    expected_num_delays = len(self.response_data)
    if not self.delays:
      self.delays = {
          'connect': 0,
          'headers': 0,
          'data': [0] * expected_num_delays
          }
    else:
      num_delays = len(self.delays['data'])
      if num_delays != expected_num_delays:
        raise HttpArchiveException(
            'Server delay length mismatch: %d (expected %d): %s' % (
                num_delays, expected_num_delays, self.delays['data']))

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - content-security-policy: Causes problems with script injection.
    """
    undesirable_keys = ['content-security-policy']
    return [(k, v) for k, v in headers if k.lower() not in undesirable_keys]

  def __repr__(self):
    return repr((self.version, self.status, self.reason, sorted(self.headers),
                 self.response_data))

  def __hash__(self):
799    """Return a integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

809    "original_headers" are the original request headers.
810    "headers" are the trimmed headers used for replaying responses.
811
812    Args:
813      state: a dictionary for __dict__
814    """
815    if 'server_delays' in state:
816      state['delays'] = {
817          'connect': 0,
818          'headers': 0,
819          'data': state['server_delays']
820          }
821      del state['server_delays']
822    elif 'delays' not in state:
823      state['delays'] = None
824    state['original_headers'] = state['headers']
825    state['headers'] = self._TrimHeaders(state['original_headers'])
826    self.__dict__.update(state)
827    self.fix_delays()
828
829  def __getstate__(self):
830    """Influence how to pickle.
831
832    Returns:
833      a dict to use for pickling
834    """
835    state = self.__dict__.copy()
836    state['headers'] = state['original_headers']
837    del state['original_headers']
838    return state
839
840  def get_header(self, key, default=None):
841    for k, v in self.headers:
842      if key.lower() == k.lower():
843        return v
844    return default
845
846  def set_header(self, key, value):
847    for i, (k, v) in enumerate(self.headers):
848      if key == k:
849        self.headers[i] = (key, value)
850        return
851    self.headers.append((key, value))
852
853  def remove_header(self, key):
854    for i, (k, v) in enumerate(self.headers):
855      if key.lower() == k.lower():
856        self.headers.pop(i)
857        return
858
859  @staticmethod
860  def _get_epoch_seconds(date_str):
861    """Return the epoch seconds of a date header.
862
863    Args:
864      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
865    Returns:
866      epoch seconds as a float
867    """
868    date_tuple = email.utils.parsedate(date_str)
869    if date_tuple:
870      return calendar.timegm(date_tuple)
871    return None
872
  def update_date(self, date_str, now=None):
    """Return an updated date based on its delta from the "Date" header.

    For example, if |date_str| is one week later than the "Date" header,
    then the returned date string is one week later than the current date.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
      now: a time in epoch seconds to use as "the current time"; defaults to
          the actual current time.
    Returns:
      a date string
    """
    date_seconds = self._get_epoch_seconds(self.get_header('date'))
    header_seconds = self._get_epoch_seconds(date_str)
    if date_seconds and header_seconds:
      updated_seconds = header_seconds + (now or time.time()) - date_seconds
      return email.utils.formatdate(updated_seconds, usegmt=True)
    return date_str

  def is_gzip(self):
    return self.get_header('content-encoding') == 'gzip'

  def is_compressed(self):
    return self.get_header('content-encoding') in ('gzip', 'deflate')

  def is_chunked(self):
    return self.get_header('transfer-encoding') == 'chunked'

  def get_data_as_chunks(self):
    """Return content as a list of strings, each corresponding to a chunk.

    Uncompresses the chunks, if needed.
    """
    content_type = self.get_header('content-type')
    if (not content_type or
        not (content_type.startswith('text/') or
             content_type == 'application/x-javascript' or
             content_type.startswith('application/json'))):
      return None
    if self.is_compressed():
      return httpzlib.uncompress_chunks(self.response_data, self.is_gzip())
    else:
      return self.response_data

  def get_data_as_text(self):
    """Return content as a single string, or None if the content is not text.

    Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
    """
    chunks = self.get_data_as_chunks()
    if chunks is None:
      # Binary content (see get_data_as_chunks); callers check for None.
      return None
    return self.CHUNK_EDIT_SEPARATOR.join(chunks)

  def get_delays_as_text(self):
    """Return delays as editable text."""
    return json.dumps(self.delays, indent=2)

  def get_response_as_text(self):
    """Returns response content as a single string.

    Server delays are separated on a per-chunk basis. Delays are in
    milliseconds. Response content begins after DELAY_EDIT_SEPARATOR.
    """
    data = self.get_data_as_text()
    if data is None:
      logging.warning('Data can not be represented as text.')
      data = ''
    delays = self.get_delays_as_text()
    return self.DELAY_EDIT_SEPARATOR.join((delays, data))
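  # Sketch of the editable form produced above (values are illustrative):
  #   {
  #     "connect": 50,
  #     "headers": 150,
  #     "data": [0, 10]
  #   }
  #   [WEB_PAGE_REPLAY_EDIT_ARCHIVE --- Delays are above. Response content is below.]
  #   <chunk 1>[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]<chunk 2>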

  def set_data_from_chunks(self, text_chunks):
    """Inverse of get_data_as_chunks().

    Compress, if needed.
    """
    if self.is_compressed():
      self.response_data = httpzlib.compress_chunks(text_chunks, self.is_gzip())
    else:
      self.response_data = text_chunks
    if not self.is_chunked():
      content_length = sum(len(c) for c in self.response_data)
      self.set_header('content-length', str(content_length))

  def set_data(self, text):
    """Inverse of get_data_as_text().

    Split on CHUNK_EDIT_SEPARATOR and compress if needed.
    """
    self.set_data_from_chunks(text.split(self.CHUNK_EDIT_SEPARATOR))

  def set_delays(self, delays_text):
    """Inverse of get_delays_as_text().

    Args:
      delays_text: JSON encoded text such as the following:
          {
            "connect": 80,
            "headers": 80,
            "data": [6, 55, 0]
          }
        Times are in milliseconds.
        Each data delay corresponds with one response_data value.
    """
    try:
      self.delays = json.loads(delays_text)
    except (ValueError, KeyError) as e:
      logging.critical('Unable to parse delays %s: %s', delays_text, e)
    self.fix_delays()

  def set_response_from_text(self, text):
    """Inverse of get_response_as_text().

    Modifies the state of the archive according to the textual representation.
    """
    try:
      delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
    except ValueError:
      logging.critical(
          'Error parsing text representation. Skipping edits.')
      return
    self.set_delays(delays)
    self.set_data(data)


def create_response(status, reason=None, headers=None, body=None):
  """Convenience method for creating simple ArchivedHttpResponse objects."""
  if reason is None:
    reason = httplib.responses.get(status, 'Unknown')
  if headers is None:
    headers = [('content-type', 'text/plain')]
  if body is None:
    body = '%s %s' % (status, reason)
  return ArchivedHttpResponse(11, status, reason, headers, [body])
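# Illustrative usage (values follow from the defaults above): create_response(404)
# yields an ArchivedHttpResponse with reason 'Not Found', a text/plain
# content-type header, and a single body chunk '404 Not Found'.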


def main():
  class PlainHelpFormatter(optparse.IndentedHelpFormatter):
    def format_description(self, description):
      if description:
        return description + '\n'
      else:
        return ''

  option_parser = optparse.OptionParser(
      usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
      formatter=PlainHelpFormatter(),
      description=__doc__,
      epilog='http://code.google.com/p/web-page-replay/')

  option_parser.add_option('-c', '--command', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this command.')
  option_parser.add_option('-o', '--host', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this host.')
  option_parser.add_option('-p', '--full_path', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this full path.')
  option_parser.add_option('-f', '--merged_file', default=None,
      action='store',
      type='string',
      help='The output file to use when using the merge command.')

  options, args = option_parser.parse_args()

  # The merge command accepts an arbitrary number of archives, so only
  # require a command and one replay_file here.
  if len(args) < 2:
    print 'args: %s' % args
    option_parser.error('Must specify a command and replay_file')

  command = args[0]
  replay_file = args[1]

  if not os.path.exists(replay_file):
    option_parser.error('Replay file "%s" does not exist' % replay_file)

  http_archive = HttpArchive.Load(replay_file)
  if command == 'ls':
    print http_archive.ls(options.command, options.host, options.full_path)
  elif command == 'cat':
    print http_archive.cat(options.command, options.host, options.full_path)
  elif command == 'stats':
    print http_archive.stats(options.command, options.host, options.full_path)
  elif command == 'merge':
    if not options.merged_file:
      print 'Error: Must specify a merged file name (use --merged_file)'
      return
    http_archive.merge(options.merged_file, args[2:])
  elif command == 'edit':
    http_archive.edit(options.command, options.host, options.full_path)
    http_archive.Persist(replay_file)
  else:
    option_parser.error('Unknown command "%s"' % command)
  return 0


if __name__ == '__main__':
  sys.exit(main())