#!/usr/bin/env python
#
# Copyright 2009 Google Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""A class to serve pages from zip files and use memcache for performance.

This contains a class and a function to create an anonymous instance of the
class to serve HTTP GET requests. Memcache is used to increase response speed
and lower processing cycles used in serving. Credit to Guido van Rossum and
his implementation of zipserve which served as a reference as I wrote this.

  MemcachedZipHandler: Class that serves request
  create_handler: method to create instance of MemcachedZipHandler
"""

__author__ = 'jmatt@google.com (Justin Mattson)'

import email.Utils
import logging
import mimetypes
import time
import zipfile

from google.appengine.api import memcache
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util
from time import localtime, strftime


def create_handler(zip_files, max_age=None, public=None):
  """Factory method to create a MemcachedZipHandler subclass.

  Args:
    zip_files: A non-empty list of file names, or a list of lists whose first
        member is the zip file name and whose optional second member is the
        first file stored in that archive (see
        MemcachedZipHandler.MapFileToArchive for the list-of-lists format).
        The argument itself is not modified.
    max_age: The maximum client-side cache lifetime, in seconds. None keeps
        the MemcachedZipHandler default.
    public: Whether this should be declared public in the client-side cache.
        None keeps the MemcachedZipHandler default.

  Returns:
    A MemcachedZipHandler subclass bound to the given zip files, suitable for
    use in a webapp URL mapping.

  Raises:
    ValueError: if the zip_files argument is not a non-empty list
  """
  if not zip_files or not isinstance(zip_files, list):
    raise ValueError('File name arguments must be a list')

  # Normalise to the list-of-lists format on a copy, so the caller's list is
  # left untouched.
  archive_specs = []
  for entry in zip_files:
    if isinstance(entry, (list, tuple)):
      archive_specs.append(entry)
    else:
      archive_specs.append([entry])

  class HandlerWrapper(MemcachedZipHandler):
    """MemcachedZipHandler bound to the archives given to create_handler."""

    # Overrides must be class attributes to take effect: SetCachingHeaders
    # reads self.MAX_AGE / self.PUBLIC, so assigning bare names inside get()
    # would only create unused locals.
    if max_age is not None:
      MAX_AGE = max_age
    if public is not None:
      PUBLIC = public

    def get(self, name):
      self.zipfilenames = archive_specs
      self.TrueGet(name)

  return HandlerWrapper


class MemcachedZipHandler(webapp.RequestHandler):
  """Handles get requests for a given URL.

  Serves a GET request from a series of zip files. As files are served they are
  put into memcache, which is much faster than retrieving them from the zip
  source file again. It also uses considerably fewer CPU cycles.

  Instances expect a 'zipfilenames' attribute (list of [archive, first_file]
  lists) to be set before TrueGet is called; create_handler's wrapper does so.
  """
  zipfile_cache = {}                # class cache of source zip files
  MAX_AGE = 600                     # max client-side cache lifetime
  PUBLIC = True                     # public cache setting
  CACHE_PREFIX = 'cache://'         # memcache key prefix for actual URLs
  NEG_CACHE_PREFIX = 'noncache://'  # memcache key prefix for non-existent URL
  intlString = 'intl/'
  validLangs = ['en', 'de', 'es', 'fr', 'it', 'ja', 'zh-CN', 'zh-TW']

  def TrueGet(self, reqUri):
    """The top-level entry point to serving requests.

    Called 'True' get because it does the work when called from the wrapper
    class' get method. Some logic is applied to the request to serve files
    from an intl/<lang>/... directory or fall through to the default language.

    Args:
      reqUri: URL requested

    Returns:
      None
    """
    langName = 'en'
    resetLangCookie = False
    isValidIntl = False

    # Try to retrieve the user's lang pref from the cookie. If there is no
    # lang pref cookie in the request, add set-cookie to the response with the
    # default value of 'en'.
    # NOTE(review): the cookie value is used as-is (it is not checked against
    # validLangs) -- confirm whether unknown values should fall back to 'en'.
    try:
      langName = self.request.cookies['android_developer_pref_lang']
    except KeyError:
      resetLangCookie = True
    logging.info('==========================REQ INIT name [%s] langName [%s]', reqUri, langName)

    # Preprocess the req url. If it references a directory or the domain itself,
    # append '/index.html' to the url and 302 redirect. Otherwise, continue
    # processing the request below.
    name = self.PreprocessUrl(reqUri, langName)
    if not name:
      return

    # Do some prep for handling intl requests. Parse the url and validate
    # the intl/lang substring, extract the url lang code and the uri that
    # follows the intl/lang substring (contentUri).
    sections = name.split("/", 2)
    contentUri = None
    isIntl = len(sections) > 1 and (sections[0] == "intl")
    if isIntl:
      # Require the trailing content part too, so sections[2] always exists.
      isValidIntl = len(sections) > 2 and sections[1] in self.validLangs
      if isValidIntl:
        urlLangName = sections[1]
        contentUri = sections[2]
        if langName != urlLangName:
          # if the lang code in the request is different from that in
          # the cookie, reset the cookie to the url lang value.
          langName = urlLangName
          resetLangCookie = True

    # Send for processing
    if self.isCleanUrl(name, langName, isValidIntl):
      # handle a 'clean' request.
      # Try to form a response using the actual request url.
      if not self.CreateResponse(name, langName, isValidIntl, resetLangCookie):
        # If CreateResponse returns False, there was no such document
        # in the intl/lang tree. Before going to 404, see if there is an
        # English-language version of the doc in the default
        # tree and return it, else go to 404.
        self.CreateResponse(contentUri, langName, False, resetLangCookie)

    elif isIntl:
      # handle the case where we need to pass through an invalid intl req
      # for processing (so as to get 404 as appropriate). This is needed
      # because intl urls are passed through clean and retried in English,
      # if necessary.
      logging.info(' Handling an invalid intl request...')
      self.CreateResponse(name, langName, isValidIntl, resetLangCookie)

    else:
      # handle the case where we have a non-clean url (usually a non-intl
      # url) that we need to interpret in the context of any lang pref
      # that is set. Prepend an intl/lang string to the request url and
      # send it as a 302 redirect. After the redirect, the subsequent
      # request will be handled as a clean url.
      self.RedirToIntl(name, self.intlString, langName)

  def isCleanUrl(self, name, langName, isValidIntl):
    """Determine whether to pass an incoming url straight to processing.

    Args:
      name: The incoming URL
      langName: The language id in effect for the request
      isValidIntl: Whether the URL is a valid intl/<lang>/... URL

    Returns:
      boolean: Whether the URL should be sent straight to processing
    """
    return bool(langName == 'en' or
                isValidIntl or
                '.html' not in name or
                (not isValidIntl and not langName))

  def PreprocessUrl(self, name, langName):
    """Any preprocessing work on the URL when it comes in.

    Put any work related to interpreting the incoming URL here. For example,
    this is used to redirect requests for a directory to the index.html file
    in that directory. Subclasses should override this method to do different
    preprocessing.

    Args:
      name: The incoming URL
      langName: The language id in effect (used for logging only)

    Returns:
      False if the request was redirected to '/index.html', or
      The processed URL, otherwise
    """
    # A non-empty name whose last path segment has no '.' is treated as a
    # directory request; give it a trailing slash.
    if name and not name.endswith('/'):
      final_path_segment = name[name.rfind('/') + 1:]
      if '.' not in final_path_segment:
        name = ''.join([name, '/'])

    # if this is a directory or the domain itself, redirect to /index.html
    if not name or name.endswith('/'):
      uri = ''.join(['/', name, 'index.html'])
      logging.info('--->PREPROCESSING REDIRECT [%s] to [%s] with langName [%s]', name, uri, langName)
      self.redirect(uri, False)
      return False
    return name

  def RedirToIntl(self, name, intlString, langName):
    """Redirect an incoming request to the appropriate intl uri.

    Builds the intl/lang string from a base (en) string
    and redirects (302) the request to look for a version
    of the file in the language that matches the client-
    supplied cookie value.

    Args:
      name: The incoming, preprocessed URL
      intlString: The intl path prefix (e.g. 'intl/')
      langName: The language id to redirect to

    Returns:
      The lang-specific URL
    """
    uri_parts = ['/', intlString, langName, '/', name]
    query_string = self.request.query_string
    if query_string:
      # Only add the separator when there is a query to carry over, so the
      # redirect target does not end in a dangling '?'.
      uri_parts.extend(['?', query_string])
    uri = ''.join(uri_parts)
    logging.info('-->>REDIRECTING %s to %s', name, uri)
    self.redirect(uri, False)
    return uri

  def CreateResponse(self, name, langName, isValidIntl, resetLangCookie):
    """Process the url and form a response, if appropriate.

    Attempts to retrieve the requested file (name) from cache,
    negative cache, or store (zip) and form the response.
    For intl requests that are not found (in the localized tree),
    returns False rather than forming a response, so that
    the request can be retried with the base url (this is the
    fallthrough to default language).

    For requests that are found, forms the headers and
    adds the content to the response entity. If the request was
    for an intl (localized) url, also resets the language cookie
    to the language specified in the url if needed, to ensure that
    the client language and response data remain harmonious.

    Args:
      name: The incoming, preprocessed URL
      langName: The language id. Used as necessary to reset the
          language cookie in the response.
      isValidIntl: If present, indicates whether the request is
          for a language-specific url
      resetLangCookie: Whether the response should reset the
          language cookie to 'langName'

    Returns:
      True: A response was successfully created for the request
      False: No response was created.
    """
    # see if we have the page in the memcache
    logging.info('PROCESSING %s langName [%s] isValidIntl [%s] resetLang [%s]',
                 name, langName, isValidIntl, resetLangCookie)
    resp_data = self.GetFromCache(name)
    if resp_data is None:
      logging.info(' Cache miss for %s', name)
      if self.GetFromNegativeCache(name) is not None:
        # found it in negative cache
        self.Write404Error()
        return True

      resp_data = self.GetFromStore(name)
      if resp_data is not None:
        # we have the file, put it in the memcache
        self.StoreOrUpdateInCache(name, resp_data)
      elif isValidIntl:
        # couldn't find the intl doc. Let the caller fall through to English.
        return False
      else:
        logging.info(' Adding %s to negative cache, serving 404', name)
        self.StoreInNegativeCache(name)
        self.Write404Error()
        return True

    # found content from cache or store
    logging.info('FOUND CLEAN')
    if resetLangCookie:
      logging.info(' Resetting android_developer_pref_lang cookie to [%s]',
                   langName)
      # Ten years out, as an RFC 1123 GMT date (locale-independent and
      # with an explicit timezone).
      expireDate = time.time() + 60 * 60 * 24 * 365 * 10
      self.response.headers.add_header(
          'Set-Cookie',
          'android_developer_pref_lang=%s; path=/; expires=%s' %
          (langName, email.Utils.formatdate(expireDate, usegmt=True)))

    mustRevalidate = False
    if '.html' in name:
      # revalidate html files -- workaround for cache inconsistencies for
      # negotiated responses
      mustRevalidate = True
      logging.info(' Adding [Vary: Cookie] to response...')
      self.response.headers.add_header('Vary', 'Cookie')

    content_type, encoding = mimetypes.guess_type(name)
    if not content_type:
      if name == 'favicon.ico':
        content_type = 'image/x-icon'
      else:
        # .psd and any other type mimetypes does not know: serve as opaque
        # bytes rather than returning an empty 200 response.
        content_type = 'application/octet-stream'
    self.response.headers['Content-Type'] = content_type
    self.SetCachingHeaders(mustRevalidate)
    self.response.out.write(resp_data)
    return True

  def GetFromStore(self, file_path):
    """Retrieve file from zip files.

    Get the file from the source, it must not have been in the memcache. If
    possible, we'll use the zip file index to quickly locate where the file
    should be found. (See MapFileToArchive documentation for assumptions about
    file ordering.) If we don't have an index or don't find the file where the
    index says we should, look through all the zip files to find it.

    Args:
      file_path: the file that we're looking for

    Returns:
      The contents of the requested file, or None if no archive contains it
    """
    # Archive suggested by the index first (if any), then every other
    # archive in the configured order, each tried at most once.
    candidates = []
    indexed_archive = self.MapFileToArchive(file_path)
    if indexed_archive:
      candidates.append(indexed_archive)
    for entry in self.zipfilenames:
      if entry[0] and entry[0] not in candidates:
        candidates.append(entry[0])

    for archive_name in candidates:
      zip_archive = self.LoadZipFile(archive_name)
      if zip_archive is None:
        continue
      try:
        resp_data = zip_archive.read(file_path)
      except (KeyError, RuntimeError):
        # we expect some lookups will fail, and that's okay, 404s will deal
        # with that
        continue
      if resp_data is not None:
        logging.info('%s read from %s', file_path, archive_name)
        return resp_data

    return None

  def LoadZipFile(self, zipfilename):
    """Convenience method to load zip file.

    Just a convenience method to load the zip file from the data store. This is
    useful if we ever want to change data stores and also as a means of
    dependency injection for testing. This method will look at our file cache
    first, and then load and cache the file if there's a cache miss

    Args:
      zipfilename: the name of the zip file to load

    Returns:
      The zip file requested, or None if it cannot be opened (I/O error or
      corrupt archive)
    """
    zip_archive = self.zipfile_cache.get(zipfilename)
    if zip_archive is None:
      try:
        zip_archive = zipfile.ZipFile(zipfilename)
      except (IOError, RuntimeError, zipfile.BadZipfile) as err:
        logging.error('Can\'t open zipfile %s, cause: %s', zipfilename, err)
      else:
        self.zipfile_cache[zipfilename] = zip_archive
    return zip_archive

  def MapFileToArchive(self, file_path):
    """Given a file name, determine what archive it should be in.

    This method makes two critical assumptions.
    (1) The zip files passed as an argument to the handler, if concatenated
        in that same order, would result in a total ordering
        of all the files. See (2) for ordering type.
    (2) Upper case letters before lower case letters. The traversal of a
        directory tree is depth first. A parent directory's files are added
        before the files of any child directories

    Args:
      file_path: the file to be mapped to an archive

    Returns:
      The name of the archive where we expect the file to be, or None if no
      archive entry carries an index (second member) that covers the file
    """
    # Walk from the last archive backwards; the first one whose starting file
    # is at or before file_path is where the file should live.
    for target in reversed(self.zipfilenames):
      if len(target) > 1:
        if self.CompareFilenames(target[1], file_path) >= 0:
          return target[0]
    return None

  def CompareFilenames(self, file1, file2):
    """Determines whether file1 is lexicographically 'before' file2.

    WARNING: This method assumes that paths are output in a depth-first,
    with parent directories' files stored before childs'

    We say that file1 is lexicographically before file2 if the last
    non-matching path segment of file1 is alphabetically before file2.

    Args:
      file1: the first file path
      file2: the second file path

    Returns:
      A positive number if file1 is before file2
      A negative number if file2 is before file1
      0 if filenames are the same
    """
    f1_segments = file1.split('/')
    f2_segments = file2.split('/')
    f1_len = len(f1_segments)
    f2_len = len(f2_segments)

    # skip over the common leading segments
    segment_ptr = 0
    while (segment_ptr < f1_len and
           segment_ptr < f2_len and
           f1_segments[segment_ptr] == f2_segments[segment_ptr]):
      segment_ptr += 1

    if segment_ptr == f1_len and segment_ptr == f2_len:
      # we fell off the end of both, the paths must be the same
      return 0

    if segment_ptr == f1_len or segment_ptr == f2_len:
      # one path is a proper prefix of the other; the shorter one comes
      # first. (There is no differing segment to compare in this case.)
      return f2_len - f1_len

    if f1_len != f2_len and (segment_ptr + 1 == f1_len or
                             segment_ptr + 1 == f2_len):
      # we are looking at the last segment of one of the paths: the one with
      # fewer segments is first because files come before directories
      return f2_len - f1_len

    # compare the first segments that differ
    if f1_segments[segment_ptr] < f2_segments[segment_ptr]:
      return 1
    return -1

  def SetCachingHeaders(self, revalidate):
    """Set caching headers for the request.

    Args:
      revalidate: Whether to add 'must-revalidate' to Cache-Control
    """
    max_age = self.MAX_AGE
    #self.response.headers['Expires'] = email.Utils.formatdate(
    #    time.time() + max_age, usegmt=True)
    cache_control = []
    if self.PUBLIC:
      cache_control.append('public')
    cache_control.append('max-age=%d' % max_age)
    if revalidate:
      cache_control.append('must-revalidate')
    self.response.headers['Cache-Control'] = ', '.join(cache_control)

  def GetFromCache(self, filename):
    """Get file from memcache, if available.

    Args:
      filename: The URL of the file to return

    Returns:
      The content of the file, or None if it is not cached
    """
    return memcache.get('%s%s' % (self.CACHE_PREFIX, filename))

  def StoreOrUpdateInCache(self, filename, data):
    """Store data in the cache.

    Store a piece of data in the memcache. Memcache has a maximum item size of
    1*10^6 bytes. If the data is too large, fail, but log the failure. Future
    work will consider compressing the data before storing or chunking it

    Args:
      filename: the name of the file to store
      data: the data of the file

    Returns:
      None
    """
    try:
      # set() adds the key or overwrites an existing value in one call.
      memcache.set('%s%s' % (self.CACHE_PREFIX, filename), data)
    except ValueError as err:
      logging.warning('Data size too large to cache\n%s' % err)

  def Write404Error(self):
    """Output a simple 404 response."""
    self.error(404)
    self.response.out.write(
        ''.join(['<html><head><title>404: Not Found</title></head>',
                 '<body><b><h2>Error 404</h2><br/>',
                 'File not found</b></body></html>']))

  def StoreInNegativeCache(self, filename):
    """If a non-existent URL is accessed, cache this result as well.

    Future work should consider setting a maximum negative cache size to
    prevent it from negatively impacting the real cache.

    Args:
      filename: URL to add to negative cache

    Returns:
      None
    """
    memcache.add('%s%s' % (self.NEG_CACHE_PREFIX, filename), -1)

  def GetFromNegativeCache(self, filename):
    """Retrieve from negative cache.

    Args:
      filename: URL to retrieve

    Returns:
      The stored marker (-1) if the URL is in the negative cache, else None.
    """
    return memcache.get('%s%s' % (self.NEG_CACHE_PREFIX, filename))


def main():
  # NOTE(review): this maps the bare MemcachedZipHandler, which defines no
  # get() and has no zipfilenames; real deployments presumably route to a
  # class returned by create_handler() -- confirm before relying on main().
  application = webapp.WSGIApplication([('/([^/]+)/(.*)',
                                         MemcachedZipHandler)])
  util.run_wsgi_app(application)


if __name__ == '__main__':
  main()