• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2#
3# Copyright 2009 Google Inc.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17
18"""A class to serve pages from zip files and use memcache for performance.
19
20This contains a class and a function to create an anonymous instance of the
21class to serve HTTP GET requests. Memcache is used to increase response speed
22and lower processing cycles used in serving. Credit to Guido van Rossum and
23his implementation of zipserve which served as a reference as I wrote this.
24
25  MemcachedZipHandler: Class that serves request
26  create_handler: method to create instance of MemcachedZipHandler
27"""
28
29__author__ = 'jmatt@google.com (Justin Mattson)'
30
31import email.Utils
32import logging
33import mimetypes
34import time
35import zipfile
36
37from google.appengine.api import memcache
38from google.appengine.ext import webapp
39from google.appengine.ext.webapp import util
40from time import localtime, strftime
41
def create_handler(zip_files, max_age=None, public=None):
  """Factory method to create a MemcachedZipHandler subclass for zip_files.

  Args:
    zip_files: A non-empty list. Each item is either an archive file name or
        a list whose first member is the archive file name and whose optional
        second member is the index entry used by
        MemcachedZipHandler.MapFileToArchive. Bare names are wrapped into
        one-element lists; the caller's list is not modified.
    max_age: The maximum client-side cache lifetime. When not None it
        overrides MemcachedZipHandler.MAX_AGE for the returned handler.
    public: Whether this should be declared public in the client-side cache.
        When not None it overrides MemcachedZipHandler.PUBLIC.

  Returns:
    A MemcachedZipHandler subclass (a class, not an instance) whose get()
    serves requests from the given archives.

  Raises:
    ValueError: if the zip_files argument is not a non-empty list
  """
  if not zip_files or not isinstance(zip_files, list):
    raise ValueError('File name arguments must be a list')

  # Normalize to list-of-lists form on a copy, so the argument the caller
  # passed in is left exactly as it was.
  archives = [entry if isinstance(entry, list) else [entry]
              for entry in zip_files]

  class HandlerWrapper(MemcachedZipHandler):
    """Binds the archive list and cache settings to MemcachedZipHandler.

    The handler is instantiated without arguments elsewhere, so the
    per-factory configuration is captured through this closure.
    """

    def get(self, name):
      self.zipfilenames = archives
      # The overrides must be set on the instance *before* TrueGet runs,
      # because SetCachingHeaders reads self.MAX_AGE / self.PUBLIC while the
      # response is being built.
      if max_age is not None:
        self.MAX_AGE = max_age
      if public is not None:
        self.PUBLIC = public
      self.TrueGet(name)

  return HandlerWrapper
83
84
class MemcachedZipHandler(webapp.RequestHandler):
  """Handles get requests for a given URL.

  Serves a GET request from a series of zip files. As files are served they are
  put into memcache, which is much faster than retrieving them from the zip
  source file again. It also uses considerably fewer CPU cycles.
  """
  zipfile_cache = {}                # class cache of source zip files
  MAX_AGE = 600                     # max client-side cache lifetime (sent as Cache-Control max-age)
  PUBLIC = True                     # public cache setting
  CACHE_PREFIX = 'cache://'         # memcache key prefix for actual URLs
  NEG_CACHE_PREFIX = 'noncache://'  # memcache key prefix for non-existent URL
  intlString = 'intl/'              # path prefix prepended when redirecting to a localized URL
  validLangs = ['en', 'de', 'es', 'fr','it','ja','zh-CN','zh-TW']  # language codes accepted after 'intl/'
99
100  def TrueGet(self, reqUri):
101    """The top-level entry point to serving requests.
102
103    Called 'True' get because it does the work when called from the wrapper
104    class' get method. Some logic is applied to the request to serve files
105    from an intl/<lang>/... directory or fall through to the default language.
106
107    Args:
108      name: URL requested
109
110    Returns:
111      None
112    """
113    langName = 'en'
114    resetLangCookie = False
115    urlLangName = None
116    retry = False
117    isValidIntl = False
118
119    # Try to retrieve the user's lang pref from the cookie. If there is no
120    # lang pref cookie in the request, add set-cookie to the response with the
121    # default value of 'en'.
122    try:
123      langName = self.request.cookies['android_developer_pref_lang']
124    except KeyError:
125      resetLangCookie = True
126      #logging.info('==========================EXCEPTION: NO LANG COOKIE FOUND, USING [%s]', langName)
127    logging.info('==========================REQ INIT name [%s] langName [%s]', reqUri, langName)
128
129    # Preprocess the req url. If it references a directory or the domain itself,
130    # append '/index.html' to the url and 302 redirect. Otherwise, continue
131    # processing the request below.
132    name = self.PreprocessUrl(reqUri, langName)
133    if name:
134      # Do some prep for handling intl requests. Parse the url and validate
135      # the intl/lang substring, extract the url lang code (urlLangName) and the
136      # the uri that follows the intl/lang substring(contentUri)
137      sections = name.split("/", 2)
138      contentUri = 0
139      isIntl = len(sections) > 1 and (sections[0] == "intl")
140      if isIntl:
141        isValidIntl = sections[1] in self.validLangs
142        if isValidIntl:
143          urlLangName = sections[1]
144          contentUri = sections[2]
145          if (langName != urlLangName):
146            # if the lang code in the request is different from that in
147            # the cookie, reset the cookie to the url lang value.
148            langName = urlLangName
149            resetLangCookie = True
150            #logging.info('INTL PREP resetting langName to urlLangName [%s]', langName)
151          #else:
152          #  logging.info('INTL PREP no need to reset langName')
153
154      # Send for processing
155      if self.isCleanUrl(name, langName, isValidIntl):
156        # handle a 'clean' request.
157        # Try to form a response using the actual request url.
158        if not self.CreateResponse(name, langName, isValidIntl, resetLangCookie):
159          # If CreateResponse returns False, there was no such document
160          # in the intl/lang tree. Before going to 404, see if there is an
161          # English-language version of the doc in the default
162          # default tree and return it, else go to 404.
163          self.CreateResponse(contentUri, langName, False, resetLangCookie)
164
165      elif isIntl:
166        # handle the case where we need to pass through an invalid intl req
167        # for processing (so as to get 404 as appropriate). This is needed
168        # because intl urls are passed through clean and retried in English,
169        # if necessary.
170        logging.info('  Handling an invalid intl request...')
171        self.CreateResponse(name, langName, isValidIntl, resetLangCookie)
172
173      else:
174        # handle the case where we have a non-clean url (usually a non-intl
175        # url) that we need to interpret in the context of any lang pref
176        # that is set. Prepend an intl/lang string to the request url and
177        # send it as a 302 redirect. After the redirect, the subsequent
178        # request will be handled as a clean url.
179        self.RedirToIntl(name, self.intlString, langName)
180
181  def isCleanUrl(self, name, langName, isValidIntl):
182    """Determine whether to pass an incoming url straight to processing.
183
184       Args:
185         name: The incoming URL
186
187       Returns:
188         boolean: Whether the URL should be sent straight to processing
189    """
190    if (langName == 'en') or isValidIntl or not ('.html' in name) or (not isValidIntl and not langName):
191      return True
192
193  def PreprocessUrl(self, name, langName):
194    """Any preprocessing work on the URL when it comes in.
195
196    Put any work related to interpretting the incoming URL here. For example,
197    this is used to redirect requests for a directory to the index.html file
198    in that directory. Subclasses should override this method to do different
199    preprocessing.
200
201    Args:
202      name: The incoming URL
203
204    Returns:
205      False if the request was redirected to '/index.html', or
206      The processed URL, otherwise
207    """
208    # determine if this is a request for a directory
209    final_path_segment = name
210    final_slash_offset = name.rfind('/')
211    if final_slash_offset != len(name) - 1:
212      final_path_segment = name[final_slash_offset + 1:]
213      if final_path_segment.find('.') == -1:
214        name = ''.join([name, '/'])
215
216    # if this is a directory or the domain itself, redirect to /index.html
217    if not name or (name[len(name) - 1:] == '/'):
218      uri = ''.join(['/', name, 'index.html'])
219      logging.info('--->PREPROCESSING REDIRECT [%s] to [%s] with langName [%s]', name, uri, langName)
220      self.redirect(uri, False)
221      return False
222    else:
223      return name
224
225  def RedirToIntl(self, name, intlString, langName):
226    """Redirect an incoming request to the appropriate intl uri.
227
228       Builds the intl/lang string from a base (en) string
229       and redirects (302) the request to look for a version
230       of the file in the language that matches the client-
231       supplied cookie value.
232
233    Args:
234      name: The incoming, preprocessed URL
235
236    Returns:
237      The lang-specific URL
238    """
239    builtIntlLangUri = ''.join([intlString, langName, '/', name, '?', self.request.query_string])
240    uri = ''.join(['/', builtIntlLangUri])
241    logging.info('-->>REDIRECTING %s to  %s', name, uri)
242    self.redirect(uri, False)
243    return uri
244
245  def CreateResponse(self, name, langName, isValidIntl, resetLangCookie):
246    """Process the url and form a response, if appropriate.
247
248       Attempts to retrieve the requested file (name) from cache,
249       negative cache, or store (zip) and form the response.
250       For intl requests that are not found (in the localized tree),
251       returns False rather than forming a response, so that
252       the request can be retried with the base url (this is the
253       fallthrough to default language).
254
255       For requests that are found, forms the headers and
256       adds the content to the response entity. If the request was
257       for an intl (localized) url, also resets the language cookie
258       to the language specified in the url if needed, to ensure that
259       the client language and response data remain harmonious.
260
261    Args:
262      name: The incoming, preprocessed URL
263      langName: The language id. Used as necessary to reset the
264                language cookie in the response.
265      isValidIntl: If present, indicates whether the request is
266                   for a language-specific url
267      resetLangCookie: Whether the response should reset the
268                       language cookie to 'langName'
269
270    Returns:
271      True: A response was successfully created for the request
272      False: No response was created.
273    """
274    # see if we have the page in the memcache
275    logging.info('PROCESSING %s langName [%s] isValidIntl [%s] resetLang [%s]',
276      name, langName, isValidIntl, resetLangCookie)
277    resp_data = self.GetFromCache(name)
278    if resp_data is None:
279      logging.info('  Cache miss for %s', name)
280      resp_data = self.GetFromNegativeCache(name)
281      if resp_data is None:
282        resp_data = self.GetFromStore(name)
283
284        # IF we have the file, put it in the memcache
285        # ELSE put it in the negative cache
286        if resp_data is not None:
287          self.StoreOrUpdateInCache(name, resp_data)
288        elif isValidIntl:
289          # couldn't find the intl doc. Try to fall through to English.
290          #logging.info('  Retrying with base uri...')
291          return False
292        else:
293          logging.info('  Adding %s to negative cache, serving 404', name)
294          self.StoreInNegativeCache(name)
295          self.Write404Error()
296          return True
297      else:
298        # found it in negative cache
299        self.Write404Error()
300        return True
301
302    # found content from cache or store
303    logging.info('FOUND CLEAN')
304    if resetLangCookie:
305      logging.info('  Resetting android_developer_pref_lang cookie to [%s]',
306      langName)
307      expireDate = time.mktime(localtime()) + 60 * 60 * 24 * 365 * 10
308      self.response.headers.add_header('Set-Cookie',
309      'android_developer_pref_lang=%s; path=/; expires=%s' %
310      (langName, strftime("%a, %d %b %Y %H:%M:%S", localtime(expireDate))))
311    mustRevalidate = False
312    if ('.html' in name):
313      # revalidate html files -- workaround for cache inconsistencies for
314      # negotiated responses
315      mustRevalidate = True
316      logging.info('  Adding [Vary: Cookie] to response...')
317      self.response.headers.add_header('Vary', 'Cookie')
318    content_type, encoding = mimetypes.guess_type(name)
319    if content_type:
320      self.response.headers['Content-Type'] = content_type
321      self.SetCachingHeaders(mustRevalidate)
322      self.response.out.write(resp_data)
323    elif (name == 'favicon.ico'):
324      self.response.headers['Content-Type'] = 'image/x-icon'
325      self.SetCachingHeaders(mustRevalidate)
326      self.response.out.write(resp_data)
327    elif name.endswith('.psd'):
328      self.response.headers['Content-Type'] = 'application/octet-stream'
329      self.SetCachingHeaders(mustRevalidate)
330      self.response.out.write(resp_data)
331    return True
332
333  def GetFromStore(self, file_path):
334    """Retrieve file from zip files.
335
336    Get the file from the source, it must not have been in the memcache. If
337    possible, we'll use the zip file index to quickly locate where the file
338    should be found. (See MapToFileArchive documentation for assumptions about
339    file ordering.) If we don't have an index or don't find the file where the
340    index says we should, look through all the zip files to find it.
341
342    Args:
343      file_path: the file that we're looking for
344
345    Returns:
346      The contents of the requested file
347    """
348    resp_data = None
349    file_itr = iter(self.zipfilenames)
350
351    # check the index, if we have one, to see what archive the file is in
352    archive_name = self.MapFileToArchive(file_path)
353    if not archive_name:
354      archive_name = file_itr.next()[0]
355
356    while resp_data is None and archive_name:
357      zip_archive = self.LoadZipFile(archive_name)
358      if zip_archive:
359
360        # we expect some lookups will fail, and that's okay, 404s will deal
361        # with that
362        try:
363          resp_data = zip_archive.read(file_path)
364        except (KeyError, RuntimeError), err:
365          # no op
366          x = False
367        if resp_data is not None:
368          logging.info('%s read from %s', file_path, archive_name)
369
370      try:
371        archive_name = file_itr.next()[0]
372      except (StopIteration), err:
373        archive_name = False
374
375    return resp_data
376
377  def LoadZipFile(self, zipfilename):
378    """Convenience method to load zip file.
379
380    Just a convenience method to load the zip file from the data store. This is
381    useful if we ever want to change data stores and also as a means of
382    dependency injection for testing. This method will look at our file cache
383    first, and then load and cache the file if there's a cache miss
384
385    Args:
386      zipfilename: the name of the zip file to load
387
388    Returns:
389      The zip file requested, or None if there is an I/O error
390    """
391    zip_archive = None
392    zip_archive = self.zipfile_cache.get(zipfilename)
393    if zip_archive is None:
394      try:
395        zip_archive = zipfile.ZipFile(zipfilename)
396        self.zipfile_cache[zipfilename] = zip_archive
397      except (IOError, RuntimeError), err:
398        logging.error('Can\'t open zipfile %s, cause: %s' % (zipfilename,
399                                                             err))
400    return zip_archive
401
402  def MapFileToArchive(self, file_path):
403    """Given a file name, determine what archive it should be in.
404
405    This method makes two critical assumptions.
406    (1) The zip files passed as an argument to the handler, if concatenated
407        in that same order, would result in a total ordering
408        of all the files. See (2) for ordering type.
409    (2) Upper case letters before lower case letters. The traversal of a
410        directory tree is depth first. A parent directory's files are added
411        before the files of any child directories
412
413    Args:
414      file_path: the file to be mapped to an archive
415
416    Returns:
417      The name of the archive where we expect the file to be
418    """
419    num_archives = len(self.zipfilenames)
420    while num_archives > 0:
421      target = self.zipfilenames[num_archives - 1]
422      if len(target) > 1:
423        if self.CompareFilenames(target[1], file_path) >= 0:
424          return target[0]
425      num_archives -= 1
426
427    return None
428
429  def CompareFilenames(self, file1, file2):
430    """Determines whether file1 is lexigraphically 'before' file2.
431
432    WARNING: This method assumes that paths are output in a depth-first,
433    with parent directories' files stored before childs'
434
435    We say that file1 is lexigraphically before file2 if the last non-matching
436    path segment of file1 is alphabetically before file2.
437
438    Args:
439      file1: the first file path
440      file2: the second file path
441
442    Returns:
443      A positive number if file1 is before file2
444      A negative number if file2 is before file1
445      0 if filenames are the same
446    """
447    f1_segments = file1.split('/')
448    f2_segments = file2.split('/')
449
450    segment_ptr = 0
451    while (segment_ptr < len(f1_segments) and
452           segment_ptr < len(f2_segments) and
453           f1_segments[segment_ptr] == f2_segments[segment_ptr]):
454      segment_ptr += 1
455
456    if len(f1_segments) == len(f2_segments):
457
458      # we fell off the end, the paths much be the same
459      if segment_ptr == len(f1_segments):
460        return 0
461
462      # we didn't fall of the end, compare the segments where they differ
463      if f1_segments[segment_ptr] < f2_segments[segment_ptr]:
464        return 1
465      elif f1_segments[segment_ptr] > f2_segments[segment_ptr]:
466        return -1
467      else:
468        return 0
469
470      # the number of segments differs, we either mismatched comparing
471      # directories, or comparing a file to a directory
472    else:
473
474      # IF we were looking at the last segment of one of the paths,
475      # the one with fewer segments is first because files come before
476      # directories
477      # ELSE we just need to compare directory names
478      if (segment_ptr + 1 == len(f1_segments) or
479          segment_ptr + 1 == len(f2_segments)):
480        return len(f2_segments) - len(f1_segments)
481      else:
482        if f1_segments[segment_ptr] < f2_segments[segment_ptr]:
483          return 1
484        elif f1_segments[segment_ptr] > f2_segments[segment_ptr]:
485          return -1
486        else:
487          return 0
488
489  def SetCachingHeaders(self, revalidate):
490    """Set caching headers for the request."""
491    max_age = self.MAX_AGE
492    #self.response.headers['Expires'] = email.Utils.formatdate(
493    #    time.time() + max_age, usegmt=True)
494	cache_control = []
495    if self.PUBLIC:
496      cache_control.append('public')
497    cache_control.append('max-age=%d' % max_age)
498    if revalidate:
499      cache_control.append('must-revalidate')
500    self.response.headers['Cache-Control'] = ', '.join(cache_control)
501
502  def GetFromCache(self, filename):
503    """Get file from memcache, if available.
504
505    Args:
506      filename: The URL of the file to return
507
508    Returns:
509      The content of the file
510    """
511    return memcache.get('%s%s' % (self.CACHE_PREFIX, filename))
512
513  def StoreOrUpdateInCache(self, filename, data):
514    """Store data in the cache.
515
516    Store a piece of data in the memcache. Memcache has a maximum item size of
517    1*10^6 bytes. If the data is too large, fail, but log the failure. Future
518    work will consider compressing the data before storing or chunking it
519
520    Args:
521      filename: the name of the file to store
522      data: the data of the file
523
524    Returns:
525      None
526    """
527    try:
528      if not memcache.add('%s%s' % (self.CACHE_PREFIX, filename), data):
529        memcache.replace('%s%s' % (self.CACHE_PREFIX, filename), data)
530    except (ValueError), err:
531      logging.warning('Data size too large to cache\n%s' % err)
532
533  def Write404Error(self):
534    """Ouptut a simple 404 response."""
535    self.error(404)
536    self.response.out.write(
537        ''.join(['<html><head><title>404: Not Found</title></head>',
538                 '<body><b><h2>Error 404</h2><br/>',
539                 'File not found</b></body></html>']))
540
541  def StoreInNegativeCache(self, filename):
542    """If a non-existant URL is accessed, cache this result as well.
543
544    Future work should consider setting a maximum negative cache size to
545    prevent it from from negatively impacting the real cache.
546
547    Args:
548      filename: URL to add ot negative cache
549
550    Returns:
551      None
552    """
553    memcache.add('%s%s' % (self.NEG_CACHE_PREFIX, filename), -1)
554
555  def GetFromNegativeCache(self, filename):
556    """Retrieve from negative cache.
557
558    Args:
559      filename: URL to retreive
560
561    Returns:
562      The file contents if present in the negative cache.
563    """
564    return memcache.get('%s%s' % (self.NEG_CACHE_PREFIX, filename))
565
def main():
  """Run a WSGI application that routes requests to MemcachedZipHandler."""
  routes = [('/([^/]+)/(.*)', MemcachedZipHandler)]
  util.run_wsgi_app(webapp.WSGIApplication(routes))


if __name__ == '__main__':
  main()
574