• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2#
3# Copyright 2009 Google Inc.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#   http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17
18"""A class to serve pages from zip files and use memcache for performance.
19
20This contains a class and a function to create an anonymous instance of the
21class to serve HTTP GET requests. Memcache is used to increase response speed
22and lower processing cycles used in serving. Credit to Guido van Rossum and
23his implementation of zipserve which served as a reference as I wrote this.
24
25  MemcachedZipHandler: Class that serves request
26  create_handler: method to create instance of MemcachedZipHandler
27"""
28
29__author__ = 'jmatt@google.com (Justin Mattson)'
30
31import email.Utils
32import logging
33import mimetypes
34import time
35import zipfile
36
37from google.appengine.api import memcache
38from google.appengine.ext import webapp
39from google.appengine.ext.webapp import util
40from time import localtime, strftime
41
def create_handler(zip_files, max_age=None, public=None):
  """Factory method to create a configured MemcachedZipHandler subclass.

  Args:
    zip_files: A non-empty list. Each item is either a zip file name or a
        list whose first member is a zip file name (list of lists format).
        See MapFileToArchive for how an optional second member is used.
    max_age: The maximum client-side cache lifetime; None keeps the class
        default (MAX_AGE).
    public: Whether this should be declared public in the client-side cache;
        None keeps the class default (PUBLIC).

  Returns:
    A MemcachedZipHandler subclass (the class itself, not an instance) whose
    get method serves requests from the given zip files.

  Raises:
    ValueError: if the zip_files argument is not a non-empty list
  """
  if not zip_files or not isinstance(zip_files, list):
    raise ValueError('File name arguments must be a list')

  # Normalise to list of lists format. This builds a new list so the list
  # object passed in by the caller is not modified.
  archives = [entry if isinstance(entry, list) else [entry]
              for entry in zip_files]

  class HandlerWrapper(MemcachedZipHandler):
    """Binds the factory arguments to a MemcachedZipHandler subclass.

    The configuration is captured from the enclosing create_handler call, so
    the class can be handed around without any constructor arguments.
    """

    def get(self, name):
      self.zipfilenames = archives
      # The overrides have to be applied before TrueGet runs, because the
      # caching headers are written while the response is being built.
      if max_age is not None:
        self.MAX_AGE = max_age
      if public is not None:
        self.PUBLIC = public
      self.TrueGet(name)

  return HandlerWrapper
83
84
85class MemcachedZipHandler(webapp.RequestHandler):
86  """Handles get requests for a given URL.
87
88  Serves a GET request from a series of zip files. As files are served they are
89  put into memcache, which is much faster than retrieving them from the zip
90  source file again. It also uses considerably fewer CPU cycles.
91  """
92  zipfile_cache = {}                # class cache of source zip files
93  MAX_AGE = 600                     # max client-side cache lifetime
94  PUBLIC = True                     # public cache setting
95  CACHE_PREFIX = 'cache://'         # memcache key prefix for actual URLs
96  NEG_CACHE_PREFIX = 'noncache://'  # memcache key prefix for non-existant URL
97  intlString = 'intl/'
98  validLangs = ['en', 'de', 'es', 'fr','it','ja','zh-CN','zh-TW']
99
100  def TrueGet(self, reqUri):
101    """The top-level entry point to serving requests.
102
103    Called 'True' get because it does the work when called from the wrapper
104    class' get method. Some logic is applied to the request to serve files
105    from an intl/<lang>/... directory or fall through to the default language.
106
107    Args:
108      name: URL requested
109
110    Returns:
111      None
112    """
113    langName = 'en'
114    resetLangCookie = False
115    urlLangName = None
116    retry = False
117    isValidIntl = False
118    isStripped = False
119
120    # Try to retrieve the user's lang pref from the cookie. If there is no
121    # lang pref cookie in the request, add set-cookie to the response with the
122    # default value of 'en'.
123    try:
124      langName = self.request.cookies['android_developer_pref_lang']
125    except KeyError:
126      resetLangCookie = True
127      #logging.info('==========================EXCEPTION: NO LANG COOKIE FOUND, USING [%s]', langName)
128    logging.info('==========================REQ INIT name [%s] langName [%s] resetLangCookie [%s]', reqUri, langName, resetLangCookie)
129
130    # Preprocess the req url. If it references a directory or the domain itself,
131    # append '/index.html' to the url and 302 redirect. Otherwise, continue
132    # processing the request below.
133    name = self.PreprocessUrl(reqUri, langName)
134    if name:
135      # Do some prep for handling intl requests. Parse the url and validate
136      # the intl/lang substring, extract the url lang code (urlLangName) and the
137      # the uri that follows the intl/lang substring(contentUri)
138      sections = name.split("/", 2)
139      contentUri = 0
140      isIntl = len(sections) > 1 and (sections[0] == "intl")
141      if isIntl:
142        isValidIntl = sections[1] in self.validLangs
143        if isValidIntl:
144          urlLangName = sections[1]
145          contentUri = sections[2]
146          logging.info('  Content URI is [%s]...', contentUri)
147          if (urlLangName != langName) or (langName == 'en'):
148            # if the lang code in the request is different from that in
149            # the cookie, or if the target lang is en, strip the
150            # intl/nn substring. It will later be redirected to
151            # the user's preferred language url.
152            # logging.info('  Handling a MISMATCHED intl request')
153            name = contentUri
154            isStripped = True
155            isValidIntl = False
156            isIntl = False
157
158      # Send for processing
159      if self.isCleanUrl(name, langName, isValidIntl, isStripped):
160        # handle a 'clean' request.
161        # Try to form a response using the actual request url.
162        # logging.info('  Request being handled as clean: [%s]', name)
163        if not self.CreateResponse(name, langName, isValidIntl, resetLangCookie):
164          # If CreateResponse returns False, there was no such document
165          # in the intl/lang tree. Before going to 404, see if there is an
166          # English-language version of the doc in the default
167          # default tree and return it, else go to 404.
168          self.CreateResponse(contentUri, langName, False, resetLangCookie)
169
170      elif isIntl:
171        # handle the case where we need to pass through an invalid intl req
172        # for processing (so as to get 404 as appropriate). This is needed
173        # because intl urls are passed through clean and retried in English,
174        # if necessary.
175        # logging.info('  Handling an invalid intl request...')
176        self.CreateResponse(name, langName, isValidIntl, resetLangCookie)
177
178      else:
179        # handle the case where we have a non-clean url (usually a non-intl
180        # url) that we need to interpret in the context of any lang pref
181        # that is set. Prepend an intl/lang string to the request url and
182        # send it as a 302 redirect. After the redirect, the subsequent
183        # request will be handled as a clean url.
184        self.RedirToIntl(name, self.intlString, langName)
185
186  def isCleanUrl(self, name, langName, isValidIntl, isStripped):
187    """Determine whether to pass an incoming url straight to processing.
188
189       Args:
190         name: The incoming URL
191
192       Returns:
193         boolean: Whether the URL should be sent straight to processing
194    """
195    # logging.info('  >>>> isCleanUrl name [%s] langName [%s] isValidIntl [%s]', name, langName, isValidIntl)
196    if (langName == 'en' and not isStripped) or isValidIntl or not ('.html' in name) or (not isValidIntl and not langName):
197      return True
198
199  def PreprocessUrl(self, name, langName):
200    """Any preprocessing work on the URL when it comes in.
201
202    Put any work related to interpreting the incoming URL here. For example,
203    this is used to redirect requests for a directory to the index.html file
204    in that directory. Subclasses should override this method to do different
205    preprocessing.
206
207    Args:
208      name: The incoming URL
209
210    Returns:
211      False if the request was redirected to '/index.html', or
212      The processed URL, otherwise
213    """
214    # determine if this is a request for a directory
215    final_path_segment = name
216    final_slash_offset = name.rfind('/')
217    if final_slash_offset != len(name) - 1:
218      final_path_segment = name[final_slash_offset + 1:]
219      if final_path_segment.find('.') == -1:
220        name = ''.join([name, '/'])
221
222    # if this is a directory or the domain itself, redirect to /index.html
223    if not name or (name[len(name) - 1:] == '/'):
224      uri = ''.join(['/', name, 'index.html'])
225      # logging.info('--->PREPROCESSING REDIRECT [%s] to [%s] with langName [%s]', name, uri, langName)
226      self.redirect(uri, False)
227      return False
228    else:
229      return name
230
231  def RedirToIntl(self, name, intlString, langName):
232    """Redirect an incoming request to the appropriate intl uri.
233
234       For non-en langName, builds the intl/lang string from a
235       base (en) string and redirects (302) the request to look for
236       a version of the file in langName. For en langName, simply
237       redirects a stripped uri string (intl/nn removed).
238
239    Args:
240      name: The incoming, preprocessed URL
241
242    Returns:
243      The lang-specific URL
244    """
245    if not (langName == 'en'):
246      builtIntlLangUri = ''.join([intlString, langName, '/', name, '?', self.request.query_string])
247    else:
248      builtIntlLangUri = name
249    uri = ''.join(['/', builtIntlLangUri])
250    logging.info('-->>REDIRECTING %s to  %s', name, uri)
251    self.redirect(uri, False)
252    return uri
253
254  def CreateResponse(self, name, langName, isValidIntl, resetLangCookie):
255    """Process the url and form a response, if appropriate.
256
257       Attempts to retrieve the requested file (name) from cache,
258       negative cache, or store (zip) and form the response.
259       For intl requests that are not found (in the localized tree),
260       returns False rather than forming a response, so that
261       the request can be retried with the base url (this is the
262       fallthrough to default language).
263
264       For requests that are found, forms the headers and
265       adds the content to the response entity. If the request was
266       for an intl (localized) url, also resets the language cookie
267       to the language specified in the url if needed, to ensure that
268       the client language and response data remain harmonious.
269
270    Args:
271      name: The incoming, preprocessed URL
272      langName: The language id. Used as necessary to reset the
273                language cookie in the response.
274      isValidIntl: If present, indicates whether the request is
275                   for a language-specific url
276      resetLangCookie: Whether the response should reset the
277                       language cookie to 'langName'
278
279    Returns:
280      True: A response was successfully created for the request
281      False: No response was created.
282    """
283    # see if we have the page in the memcache
284    logging.info('PROCESSING %s langName [%s] isValidIntl [%s] resetLang [%s]',
285      name, langName, isValidIntl, resetLangCookie)
286    resp_data = self.GetFromCache(name)
287    if resp_data is None:
288      logging.info('  Cache miss for %s', name)
289      resp_data = self.GetFromNegativeCache(name)
290      if resp_data is None:
291        resp_data = self.GetFromStore(name)
292
293        # IF we have the file, put it in the memcache
294        # ELSE put it in the negative cache
295        if resp_data is not None:
296          self.StoreOrUpdateInCache(name, resp_data)
297        elif isValidIntl:
298          # couldn't find the intl doc. Try to fall through to English.
299          #logging.info('  Retrying with base uri...')
300          return False
301        else:
302          logging.info('  Adding %s to negative cache, serving 404', name)
303          self.StoreInNegativeCache(name)
304          self.Write404Error()
305          return True
306      else:
307        # found it in negative cache
308        self.Write404Error()
309        return True
310
311    # found content from cache or store
312    logging.info('FOUND CLEAN')
313    if resetLangCookie:
314      logging.info('  Resetting android_developer_pref_lang cookie to [%s]',
315      langName)
316      expireDate = time.mktime(localtime()) + 60 * 60 * 24 * 365 * 10
317      self.response.headers.add_header('Set-Cookie',
318      'android_developer_pref_lang=%s; path=/; expires=%s' %
319      (langName, strftime("%a, %d %b %Y %H:%M:%S", localtime(expireDate))))
320    mustRevalidate = False
321    if ('.html' in name):
322      # revalidate html files -- workaround for cache inconsistencies for
323      # negotiated responses
324      mustRevalidate = True
325      #logging.info('  Adding [Vary: Cookie] to response...')
326      self.response.headers.add_header('Vary', 'Cookie')
327    content_type, encoding = mimetypes.guess_type(name)
328    if content_type:
329      self.response.headers['Content-Type'] = content_type
330      self.SetCachingHeaders(mustRevalidate)
331      self.response.out.write(resp_data)
332    elif (name == 'favicon.ico'):
333      self.response.headers['Content-Type'] = 'image/x-icon'
334      self.SetCachingHeaders(mustRevalidate)
335      self.response.out.write(resp_data)
336    elif name.endswith('.psd'):
337      self.response.headers['Content-Type'] = 'application/octet-stream'
338      self.SetCachingHeaders(mustRevalidate)
339      self.response.out.write(resp_data)
340    return True
341
342  def GetFromStore(self, file_path):
343    """Retrieve file from zip files.
344
345    Get the file from the source, it must not have been in the memcache. If
346    possible, we'll use the zip file index to quickly locate where the file
347    should be found. (See MapToFileArchive documentation for assumptions about
348    file ordering.) If we don't have an index or don't find the file where the
349    index says we should, look through all the zip files to find it.
350
351    Args:
352      file_path: the file that we're looking for
353
354    Returns:
355      The contents of the requested file
356    """
357    resp_data = None
358    file_itr = iter(self.zipfilenames)
359
360    # check the index, if we have one, to see what archive the file is in
361    archive_name = self.MapFileToArchive(file_path)
362    if not archive_name:
363      archive_name = file_itr.next()[0]
364
365    while resp_data is None and archive_name:
366      zip_archive = self.LoadZipFile(archive_name)
367      if zip_archive:
368
369        # we expect some lookups will fail, and that's okay, 404s will deal
370        # with that
371        try:
372          resp_data = zip_archive.read(file_path)
373        except (KeyError, RuntimeError), err:
374          # no op
375          x = False
376        if resp_data is not None:
377          logging.info('%s read from %s', file_path, archive_name)
378
379      try:
380        archive_name = file_itr.next()[0]
381      except (StopIteration), err:
382        archive_name = False
383
384    return resp_data
385
386  def LoadZipFile(self, zipfilename):
387    """Convenience method to load zip file.
388
389    Just a convenience method to load the zip file from the data store. This is
390    useful if we ever want to change data stores and also as a means of
391    dependency injection for testing. This method will look at our file cache
392    first, and then load and cache the file if there's a cache miss
393
394    Args:
395      zipfilename: the name of the zip file to load
396
397    Returns:
398      The zip file requested, or None if there is an I/O error
399    """
400    zip_archive = None
401    zip_archive = self.zipfile_cache.get(zipfilename)
402    if zip_archive is None:
403      try:
404        zip_archive = zipfile.ZipFile(zipfilename)
405        self.zipfile_cache[zipfilename] = zip_archive
406      except (IOError, RuntimeError), err:
407        logging.error('Can\'t open zipfile %s, cause: %s' % (zipfilename,
408                                                             err))
409    return zip_archive
410
411  def MapFileToArchive(self, file_path):
412    """Given a file name, determine what archive it should be in.
413
414    This method makes two critical assumptions.
415    (1) The zip files passed as an argument to the handler, if concatenated
416        in that same order, would result in a total ordering
417        of all the files. See (2) for ordering type.
418    (2) Upper case letters before lower case letters. The traversal of a
419        directory tree is depth first. A parent directory's files are added
420        before the files of any child directories
421
422    Args:
423      file_path: the file to be mapped to an archive
424
425    Returns:
426      The name of the archive where we expect the file to be
427    """
428    num_archives = len(self.zipfilenames)
429    while num_archives > 0:
430      target = self.zipfilenames[num_archives - 1]
431      if len(target) > 1:
432        if self.CompareFilenames(target[1], file_path) >= 0:
433          return target[0]
434      num_archives -= 1
435
436    return None
437
438  def CompareFilenames(self, file1, file2):
439    """Determines whether file1 is lexigraphically 'before' file2.
440
441    WARNING: This method assumes that paths are output in a depth-first,
442    with parent directories' files stored before childs'
443
444    We say that file1 is lexigraphically before file2 if the last non-matching
445    path segment of file1 is alphabetically before file2.
446
447    Args:
448      file1: the first file path
449      file2: the second file path
450
451    Returns:
452      A positive number if file1 is before file2
453      A negative number if file2 is before file1
454      0 if filenames are the same
455    """
456    f1_segments = file1.split('/')
457    f2_segments = file2.split('/')
458
459    segment_ptr = 0
460    while (segment_ptr < len(f1_segments) and
461           segment_ptr < len(f2_segments) and
462           f1_segments[segment_ptr] == f2_segments[segment_ptr]):
463      segment_ptr += 1
464
465    if len(f1_segments) == len(f2_segments):
466
467      # we fell off the end, the paths much be the same
468      if segment_ptr == len(f1_segments):
469        return 0
470
471      # we didn't fall of the end, compare the segments where they differ
472      if f1_segments[segment_ptr] < f2_segments[segment_ptr]:
473        return 1
474      elif f1_segments[segment_ptr] > f2_segments[segment_ptr]:
475        return -1
476      else:
477        return 0
478
479      # the number of segments differs, we either mismatched comparing
480      # directories, or comparing a file to a directory
481    else:
482
483      # IF we were looking at the last segment of one of the paths,
484      # the one with fewer segments is first because files come before
485      # directories
486      # ELSE we just need to compare directory names
487      if (segment_ptr + 1 == len(f1_segments) or
488          segment_ptr + 1 == len(f2_segments)):
489        return len(f2_segments) - len(f1_segments)
490      else:
491        if f1_segments[segment_ptr] < f2_segments[segment_ptr]:
492          return 1
493        elif f1_segments[segment_ptr] > f2_segments[segment_ptr]:
494          return -1
495        else:
496          return 0
497
498  def SetCachingHeaders(self, revalidate):
499    """Set caching headers for the request."""
500    max_age = self.MAX_AGE
501    #self.response.headers['Expires'] = email.Utils.formatdate(
502    #    time.time() + max_age, usegmt=True)
503    cache_control = []
504    if self.PUBLIC:
505      cache_control.append('public')
506    cache_control.append('max-age=%d' % max_age)
507    if revalidate:
508      cache_control.append('must-revalidate')
509    self.response.headers['Cache-Control'] = ', '.join(cache_control)
510
511  def GetFromCache(self, filename):
512    """Get file from memcache, if available.
513
514    Args:
515      filename: The URL of the file to return
516
517    Returns:
518      The content of the file
519    """
520    return memcache.get('%s%s' % (self.CACHE_PREFIX, filename))
521
522  def StoreOrUpdateInCache(self, filename, data):
523    """Store data in the cache.
524
525    Store a piece of data in the memcache. Memcache has a maximum item size of
526    1*10^6 bytes. If the data is too large, fail, but log the failure. Future
527    work will consider compressing the data before storing or chunking it
528
529    Args:
530      filename: the name of the file to store
531      data: the data of the file
532
533    Returns:
534      None
535    """
536    try:
537      if not memcache.add('%s%s' % (self.CACHE_PREFIX, filename), data):
538        memcache.replace('%s%s' % (self.CACHE_PREFIX, filename), data)
539    except (ValueError), err:
540      logging.warning('Data size too large to cache\n%s' % err)
541
542  def Write404Error(self):
543    """Ouptut a simple 404 response."""
544    self.error(404)
545    self.response.out.write(
546        ''.join(['<html><head><title>404: Not Found</title></head>',
547                 '<body><b><h2>Error 404</h2><br/>',
548                 'File not found</b></body></html>']))
549
550  def StoreInNegativeCache(self, filename):
551    """If a non-existant URL is accessed, cache this result as well.
552
553    Future work should consider setting a maximum negative cache size to
554    prevent it from from negatively impacting the real cache.
555
556    Args:
557      filename: URL to add ot negative cache
558
559    Returns:
560      None
561    """
562    memcache.add('%s%s' % (self.NEG_CACHE_PREFIX, filename), -1)
563
564  def GetFromNegativeCache(self, filename):
565    """Retrieve from negative cache.
566
567    Args:
568      filename: URL to retreive
569
570    Returns:
571      The file contents if present in the negative cache.
572    """
573    return memcache.get('%s%s' % (self.NEG_CACHE_PREFIX, filename))
574
def main():
  """Serve MemcachedZipHandler as a WSGI application.

  NOTE(review): the handler is registered directly, without going through
  create_handler, and no get method or zip file list is defined on it in this
  file -- confirm whether this entry point is still used.
  """
  routes = [('/([^/]+)/(.*)', MemcachedZipHandler)]
  util.run_wsgi_app(webapp.WSGIApplication(routes))


if __name__ == '__main__':
  main()
583