• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python2.4
2#
3# Copyright (C) 2008 Google Inc.
4#
5# Licensed under the Apache License, Version 2.0 (the "License");
6# you may not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9#      http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS,
13# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17
"""Module to compress directories into a series of zip files.
19
20This module will take a directory and compress all its contents, including
21child directories into a series of zip files named N.zip where 'N' ranges from
220 to infinity. The zip files will all be below a certain specified maximum
23threshold.
24
The directory is compressed with a depth-first traversal, each directory's
file contents being compressed as it is visited, before the compression of any
child directory's contents. In this way the files within an archive are ordered
and the archives themselves are ordered.
29
30The class also constructs a 'main.py' file intended for use with Google App
31Engine with a custom App Engine program not currently distributed with this
32code base. The custom App Engine runtime can leverage the index files written
33out by this class to more quickly locate which zip file to serve a given URL
34from.
35"""
36
37__author__ = 'jmatt@google.com (Justin Mattson)'
38
39import optparse
40import os
41import stat
42import sys
43import zipfile
44import divide_and_compress_constants
45
46
def CreateOptionsParser():
  """Builds the command line parser used by this script.

  The parser understands four options: the source directory, the destination
  directory, the maximum archive size, and a flag that disables compression.

  Returns:
    A configured optparse.OptionParser object.
  """
  # Each entry is (option flags, keyword settings for add_option); the options
  # are registered in the order listed here.
  option_specs = [
      (('-s', '--sourcefiles'),
       dict(dest='sourcefiles',
            default=None,
            help='The directory containing the files to compress')),
      (('-d', '--destination'),
       dict(dest='destination',
            default=None,
            help=('Where to put the archive files, this should not be'
                  ' a child of where the source files exist.'))),
      (('-f', '--filesize'),
       dict(dest='filesize',
            default='1M',
            help=('Maximum size of archive files. A number followed by '
                  'a magnitude indicator either "B", "K", "M", or "G". '
                  'Examples:\n  1000000B == one million BYTES\n'
                  '  1.2M == one point two MEGABYTES\n'
                  '  1M == 1048576 BYTES'))),
      (('-n', '--nocompress'),
       dict(action='store_false',
            dest='compress',
            default=True,
            help=('Whether the archive files should be compressed, or '
                  'just a concatenation of the source files'))),
  ]

  parser = optparse.OptionParser()
  for flags, settings in option_specs:
    parser.add_option(*flags, **settings)
  return parser
70
71
def VerifyArguments(options, parser):
  """Checks that the required command line arguments were supplied.

  If either the source or the destination is absent (missing from the options
  object or set to None), the usage text is printed and the process exits
  with status -1.

  Args:
    options: The command line options passed.
    parser: The parser object used to parse the command string.
  """
  for required_name in ('sourcefiles', 'destination'):
    # A missing attribute is treated exactly like an unset (None) option.
    if getattr(options, required_name, None) is None:
      parser.print_help()
      sys.exit(-1)
86
87
def ParseSize(size_str):
  """Parse the file size argument from a string to a number of bytes.

  The string is a number followed by a single magnitude indicator: "B"
  (bytes), "K" (1024 bytes), "M" (1048576 bytes) or "G" (1073741824 bytes).
  The indicator is matched case-insensitively. Fractional results are
  truncated toward zero.

  Args:
    size_str: The string representation of the file size.

  Returns:
    The file size in bytes, as an integer.

  Raises:
    ValueError: Raises an error if the numeric or qualifier portions of the
      file size argument is invalid (including negative or non-finite
      numbers).
  """
  multipliers = {'B': 1, 'K': 1024, 'M': 1048576, 'G': 1073741824}

  if len(size_str) < 2:
    raise ValueError(('filesize argument not understood, please include'
                      ' a numeric value and magnitude indicator'))
  magnitude = size_str[-1].upper()
  if magnitude not in multipliers:
    raise ValueError(('filesize magnitude indicator not valid, must be "B",'
                      '"K","M", or "G"'))

  # float() itself raises ValueError for a non-numeric prefix.
  numeral = float(size_str[:-1])
  if numeral < 0:
    raise ValueError('filesize numeric value must not be negative')
  try:
    return int(numeral * multipliers[magnitude])
  except OverflowError:
    # int() raises OverflowError for infinities; report it as the documented
    # ValueError instead.
    raise ValueError('filesize numeric value must be a finite number')
116
117
class DirectoryZipper(object):
  """Class to compress a directory and all its sub-directories.

  Files are packed into archives named '0.zip', '1.zip', ... inside the output
  directory. Whenever adding a file pushes the current archive over the
  maximum size, that file is removed again and placed in the next archive.
  An index record is written to self.index_fp for the first file stored in
  each archive.
  """

  # Files larger than this many bytes trigger a warning when they are added.
  GAE_MAX_FILE_SIZE = 1048576

  def __init__(self, output_path, base_dir, archive_size, enable_compression):
    """DirectoryZipper constructor.

    Args:
      output_path: A string, the path to write the archives and index file to.
      base_dir: A string, the directory to compress.
      archive_size: A number, the maximum size, in bytes, of a single
        archive file.
      enable_compression: A boolean, whether or not compression should be
        enabled, if disabled, the files will be written into an uncompressed
        zip.
    """
    self.output_dir = output_path
    self.current_archive = '0.zip'
    self.base_path = base_dir
    self.max_size = archive_size
    self.compress = enable_compression

    # Set index_fp to None, because we don't know what it will be yet.
    self.index_fp = None

  def _CurrentArchivePath(self):
    """Returns the path of the archive currently being filled."""
    return os.path.join(self.output_dir, self.current_archive)

  def _CompressionType(self):
    """Returns the zipfile compression constant matching self.compress."""
    if self.compress:
      return zipfile.ZIP_DEFLATED
    return zipfile.ZIP_STORED

  def _AdvanceArchive(self):
    """Switches self.current_archive from 'N.zip' to 'N+1.zip'."""
    archive_number = int(os.path.splitext(self.current_archive)[0])
    self.current_archive = '%i.zip' % (archive_number + 1)

  def StartCompress(self):
    """Start compress of the directory.

    This will start the compression process and write the archives to the
    specified output directory. It will also produce a 'main.py' file in the
    output directory, consisting of the preamble from
    divide_and_compress_constants, one index record per archive, and the
    matching end piece.
    """
    self.index_fp = open(os.path.join(self.output_dir, 'main.py'), 'w')
    try:
      self.index_fp.write(divide_and_compress_constants.file_preamble)
      for dir_path, dir_names, file_names in os.walk(self.base_path):
        # Sorting in place makes os.walk descend into the sub-directories in
        # alphabetical order.
        dir_names.sort()
        self.CompressDirectory(1, dir_path, dir_names + file_names)
      self.index_fp.write(divide_and_compress_constants.file_endpiece)
    finally:
      # Close the index even when compression fails part way through.
      self.index_fp.close()

  def RemoveLastFile(self, archive_path=None):
    """Removes the last item in the archive.

    This removes the last item in the archive by reading the items out of the
    archive, adding them to a new archive, deleting the old archive, and
    moving the new archive to the location of the old archive.

    Args:
      archive_path: Path to the archive to modify. This archive should not be
        open elsewhere, since it will need to be deleted. Defaults to the
        current archive.
    """
    if archive_path is None:
      archive_path = self._CurrentArchivePath()

    # Move the old file and create a new one at its old location.
    root, ext = os.path.splitext(archive_path)
    old_archive = ''.join([root, '-old', ext])
    os.rename(archive_path, old_archive)

    old_fp = self.OpenZipFileAtPath(old_archive, mode='r')
    try:
      new_fp = self.OpenZipFileAtPath(archive_path,
                                      mode='w',
                                      compress=self._CompressionType())
      try:
        # Copy every member except the last one into the new archive.
        for zip_member in old_fp.infolist()[:-1]:
          new_fp.writestr(zip_member, old_fp.read(zip_member.filename))
      finally:
        new_fp.close()
    finally:
      old_fp.close()

    os.unlink(old_archive)

  def OpenZipFileAtPath(self, path, mode=None, compress=zipfile.ZIP_DEFLATED):
    """Opens the zip file at path; mainly a seam for dependency injection.

    Args:
      path: A string, the path of the zip file to open.
      mode: The zipfile mode. When None, 'a' is used if path already exists
        and 'w' otherwise.
      compress: The zipfile compression constant; ignored when mode is 'r'.

    Returns:
      An open zipfile.ZipFile object.
    """
    if mode is None:
      if os.path.exists(path):
        mode = 'a'
      else:
        mode = 'w'

    if mode == 'r':
      return zipfile.ZipFile(path, mode)
    return zipfile.ZipFile(path, mode, compress)

  def CompressDirectory(self, unused_id, dir_path, dir_contents):
    """Method to compress the given directory.

    This method compresses the files directly inside 'dir_path'. It will add
    to an existing zip file that still has space and create new ones as
    necessary to keep zip file sizes under the maximum specified size. This
    also writes out the mapping of files to archives to the self.index_fp
    file descriptor.

    Args:
      unused_id: An identifier passed by the directory walker, this is not
        used by this method.
      dir_path: A string, the path to the directory to compress.
      dir_contents: A list of the names inside dir_path. It is sorted in
        place so that entries are processed in alphabetical order.
    """
    dir_contents.sort()
    compress_bit = self._CompressionType()

    for filename in dir_contents:
      target_file = os.path.join(dir_path, filename)
      if not os.path.isfile(target_file):
        continue

      # A file is attempted at most twice: once in the current archive and,
      # if that archive turned out to be full, once more in a fresh one.
      while True:
        if not self.AddFileToArchive(target_file, compress_bit):
          # Nothing was added (e.g. a symlink to a file), so there is no
          # archive change to validate or index.
          break
        if self.ArchiveIsValid():
          # Write an index record if necessary.
          self.WriteIndexRecord()
          break
        if not self.FixArchive('SIZE'):
          # The file is too large for any archive; skip it.
          break
        # The current archive filled normally, retry in the next one.
        self._AdvanceArchive()

  def WriteIndexRecord(self):
    """Write an index record to the index file.

    Only write an index record if this is the first file to go into archive

    Returns:
      True if an archive record is written, False if it isn't.
    """
    archive = self.OpenZipFileAtPath(self._CurrentArchivePath(), 'r')
    try:
      archive_index = archive.infolist()
    finally:
      archive.close()

    if len(archive_index) != 1:
      return False
    self.index_fp.write(
        '[\'%s\', \'%s\'],\n' % (self.current_archive,
                                 archive_index[0].filename))
    return True

  def FixArchive(self, problem):
    """Make the archive compliant.

    For the 'SIZE' problem: if the archive holds a single file, that file is
    too large on its own, so the archive is deleted; otherwise the last file
    added is removed from the archive.

    Args:
      problem: A string, the reason the archive is invalid. Only 'SIZE' is
        handled.

    Returns:
      Whether the file(s) removed to fix the archive could conceivably be
      in an archive, but for some reason can't be added to this one. None is
      returned for a problem other than 'SIZE'.
    """
    if problem != 'SIZE':
      return None

    archive_path = self._CurrentArchivePath()
    archive_obj = self.OpenZipFileAtPath(archive_path, mode='r')
    try:
      members = archive_obj.infolist()
    finally:
      archive_obj.close()

    # IF there is a single file, that means its too large to compress,
    # delete the created archive
    # ELSE do normal finalization.
    if len(members) == 1:
      print('WARNING: %s is too large to store.' %
            os.path.join(self.base_path, members[0].filename))
      os.unlink(archive_path)
      return False

    self.RemoveLastFile(archive_path)
    print('Final archive size for %s is %i' % (
        self.current_archive, os.path.getsize(archive_path)))
    return True

  def AddFileToArchive(self, filepath, compress_bit):
    """Add the file at filepath to the current archive.

    The file is stored under its path with the self.base_path prefix
    removed. Symbolic links are never added.

    Args:
      filepath: A string, the path of the file to add.
      compress_bit: The zipfile compression constant (zipfile.ZIP_DEFLATED or
        zipfile.ZIP_STORED) to use for the archive.

    Returns:
      True if the file could be added (typically because this is a file) or
      False if it couldn't be added (typically because its a directory).
    """
    if not os.path.isfile(filepath) or os.path.islink(filepath):
      return False

    if os.path.getsize(filepath) > self.GAE_MAX_FILE_SIZE:
      print('Warning: %s is potentially too large to serve on GAE' % filepath)
    archive = self.OpenZipFileAtPath(self._CurrentArchivePath(),
                                     compress=compress_bit)
    try:
      # Add the file to the archive.
      archive.write(filepath, filepath[len(self.base_path):])
    finally:
      archive.close()
    return True

  def ArchiveIsValid(self):
    """Check whether the archive is valid.

    Currently this only checks whether the archive is under the required size.
    The thought is that eventually this will do additional validation

    Returns:
      True if the archive is valid, False if its not.
    """
    return os.path.getsize(self._CurrentArchivePath()) <= self.max_size
352
353
def main(argv):
  """Parses the command line and compresses the requested directory.

  Args:
    argv: The full argument list; the first element is skipped when parsing.
  """
  option_parser = CreateOptionsParser()
  options, unused_positional = option_parser.parse_args(args=argv[1:])
  VerifyArguments(options, option_parser)

  max_archive_bytes = ParseSize(options.filesize)
  DirectoryZipper(options.destination,
                  options.sourcefiles,
                  max_archive_bytes,
                  options.compress).StartCompress()


# Run the compressor only when executed as a script, not when imported.
if __name__ == '__main__':
  main(sys.argv)
367