• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
"""A dumb and slow but simple dbm clone.

For database spam, spam.dir contains the index (a text file),
spam.bak *may* contain a backup of the index (also a text file),
while spam.dat contains the data (a binary file).

XXX TO DO:

- seems to contain a bug when updating...

- reclaim free space (currently, space once occupied by deleted or expanded
items is never reused)

- support concurrent access (currently, if two processes take turns making
updates, they can mess up the index)

- support efficient access to large databases (currently, the whole index
is read when the database is opened, and some updates rewrite the whole index)

- support opening for read-only (flag = 'r'; currently, modifying a database
opened that way only emits a DeprecationWarning)

"""
23
import ast as _ast
import collections
import collections.abc
import io as _io
import os as _os
28
# Names exported by a star-import of this module.
__all__ = ["error", "open"]

# Values appended to the data file start at an offset that is a multiple
# of this many bytes.
_BLOCKSIZE = 512

# Exception type raised by this module (for example when a closed database
# is used); it is simply an alias of OSError.
error = OSError
34
class _Database(collections.abc.MutableMapping):
    """Mapping-style database stored in ``.dir``, ``.dat`` and ``.bak`` files.

    Keys are kept as bytes and values are returned as bytes; ``str`` keys
    and values are encoded as UTF-8 on the way in.  The complete index is
    held in memory (``self._index``) and is only written back to the
    directory file by ``_commit()``.
    """

    # The on-disk directory and data files can remain in mutually
    # inconsistent states for an arbitrarily long time (see comments
    # at the end of __setitem__).  This is only repaired when _commit()
    # gets called.  One place _commit() gets called is from __del__(),
    # and if that occurs at program shutdown time, module globals may
    # already have gotten rebound to None.  Since it's crucial that
    # _commit() finish successfully, we can't ignore shutdown races
    # here, and _commit() must not reference any globals.
    _os = _os       # for _commit()
    _io = _io       # for _commit()

    def __init__(self, filebasename, mode, flag='c'):
        """Open (and create if needed) the database named *filebasename*.

        *filebasename* is the path without extension; '.dir', '.dat' and
        '.bak' are appended to it.  *mode* is the permission mode passed to
        ``os.chmod()`` for files this object writes.  *flag* is 'r', 'w',
        'c' or 'n'; 'n' removes existing files first and 'r' marks the
        database read-only.
        """
        # The index is an in-memory dict, mirroring the directory file.
        # It is assigned before anything that can fail so that close(),
        # which is also __del__(), stays harmless on a half-built object.
        self._index = None  # maps keys to (pos, siz) pairs

        self._mode = mode
        self._readonly = (flag == 'r')

        # The directory file is a text file.  Each line looks like
        #    "%r, (%d, %d)\n" % (key, pos, siz)
        # where key is the string key, pos is the offset into the dat
        # file of the associated value's first byte, and siz is the number
        # of bytes in the associated value.
        self._dirfile = filebasename + '.dir'

        # The data file is a binary file pointed into by the directory
        # file, and holds the values associated with keys.  Each value
        # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
        # binary 8-bit string value.
        self._datfile = filebasename + '.dat'
        self._bakfile = filebasename + '.bak'

        # Handle the creation
        self._create(flag)
        self._update()

    @staticmethod
    def _encode_key(key):
        """Return *key* in the form used as a key of the in-memory index.

        ``str`` is encoded as UTF-8 and ``bytearray`` is copied to ``bytes``
        (a bytearray is unhashable, so it cannot be looked up in a dict).
        Any other object is returned unchanged.
        """
        if isinstance(key, str):
            return key.encode('utf-8')
        if isinstance(key, bytearray):
            return bytes(key)
        return key

    def _create(self, flag):
        """Make sure the data file exists, creating an empty one if needed.

        With flag 'n' the data, backup and directory files are removed
        first (missing files are ignored).  If the data file has to be
        created although *flag* is neither 'c' nor 'n', a
        DeprecationWarning is issued.
        """
        if flag == 'n':
            for filename in (self._datfile, self._bakfile, self._dirfile):
                try:
                    _os.remove(filename)
                except OSError:
                    pass
        # Mod by Jack: create data file if needed
        try:
            f = _io.open(self._datfile, 'r', encoding="Latin-1")
        except OSError:
            if flag not in ('c', 'n'):
                import warnings
                warnings.warn("The database file is missing, the "
                              "semantics of the 'c' flag will be used.",
                              DeprecationWarning, stacklevel=4)
            with _io.open(self._datfile, 'w', encoding="Latin-1") as f:
                self._chmod(self._datfile)
        else:
            f.close()

    # Read directory file into the in-memory index dict.
    def _update(self):
        """Rebuild ``self._index`` from the directory file.

        If the directory file cannot be opened the index stays empty and
        the database is flagged as modified (unless it is read-only), so
        that a later _commit() writes a directory file.
        """
        self._index = {}
        try:
            f = _io.open(self._dirfile, 'r', encoding="Latin-1")
        except OSError:
            self._modified = not self._readonly
        else:
            self._modified = False
            with f:
                for line in f:
                    line = line.rstrip()
                    key, pos_and_siz_pair = _ast.literal_eval(line)
                    # Keys are written Latin-1 decoded; undo that here.
                    key = key.encode('Latin-1')
                    self._index[key] = pos_and_siz_pair

    # Write the index dict to the directory file.  The original directory
    # file (if any) is renamed with a .bak extension first.  If a .bak
    # file currently exists, it's deleted.
    def _commit(self):
        """Write the in-memory index to the directory file.

        Does nothing when the database is closed or unmodified.
        """
        # CAUTION:  It's vital that _commit() succeed, and _commit() can
        # be called from __del__().  Therefore we must never reference a
        # global in this routine.
        if self._index is None or not self._modified:
            return  # nothing to do

        try:
            self._os.unlink(self._bakfile)
        except OSError:
            pass

        try:
            self._os.rename(self._dirfile, self._bakfile)
        except OSError:
            pass

        with self._io.open(self._dirfile, 'w', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            for key, pos_and_siz_pair in self._index.items():
                # Use Latin-1 since it has no qualms with any value in any
                # position; UTF-8, though, does care sometimes.
                entry = "%r, %r\n" % (key.decode('Latin-1'), pos_and_siz_pair)
                f.write(entry)

    sync = _commit

    def _verify_open(self):
        """Raise ``error`` if the database has already been closed."""
        if self._index is None:
            raise error('DBM object has already been closed')

    def __getitem__(self, key):
        """Return the bytes stored under *key*.

        Raises KeyError for an unknown key and ``error`` if the database
        is closed.
        """
        key = self._encode_key(key)
        self._verify_open()
        pos, siz = self._index[key]     # may raise KeyError
        with _io.open(self._datfile, 'rb') as f:
            f.seek(pos)
            dat = f.read(siz)
        return dat

    # Append val to the data file, starting at a _BLOCKSIZE-aligned
    # offset.  The data file is first padded with NUL bytes (if needed)
    # to get to an aligned offset.  Return pair
    #     (starting offset of val, len(val))
    def _addval(self, val):
        """Append *val* to the data file and return ``(offset, len(val))``."""
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(0, 2)    # whence=2: position at the end of the file
            pos = int(f.tell())
            # Round the end-of-file offset up to the next block boundary.
            npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
            f.write(b'\0'*(npos-pos))
            pos = npos
            f.write(val)
        return (pos, len(val))

    # Write val to the data file, starting at offset pos.  The caller
    # is responsible for ensuring that there's enough room starting at
    # pos to hold val, without overwriting some other value.  Return
    # pair (pos, len(val)).
    def _setval(self, pos, val):
        """Write *val* at offset *pos* and return ``(pos, len(val))``."""
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(pos)
            f.write(val)
        return (pos, len(val))

    # key is a new key whose associated value starts in the data file
    # at offset pos and with length siz.  Add an index record to
    # the in-memory index dict, and append one to the directory file.
    def _addkey(self, key, pos_and_siz_pair):
        """Record a new *key* in the index and in the directory file."""
        self._index[key] = pos_and_siz_pair
        with _io.open(self._dirfile, 'a', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            f.write("%r, %r\n" % (key.decode("Latin-1"), pos_and_siz_pair))

    def __setitem__(self, key, val):
        """Store *val* under *key*.

        Both may be ``str`` (encoded as UTF-8), ``bytes`` or ``bytearray``;
        anything else raises TypeError.  Raises ``error`` if the database
        is closed.  On a read-only database a DeprecationWarning is issued
        and the write still happens.
        """
        if self._readonly:
            import warnings
            warnings.warn('The database is opened for reading only',
                          DeprecationWarning, stacklevel=2)
        if not isinstance(key, (str, bytes, bytearray)):
            raise TypeError("keys must be bytes or strings")
        # Besides encoding str, this turns a bytearray into bytes so that
        # the key is hashable and usable in the index dict.
        key = self._encode_key(key)
        if isinstance(val, str):
            val = val.encode('utf-8')
        elif not isinstance(val, (bytes, bytearray)):
            raise TypeError("values must be bytes or strings")
        self._verify_open()
        self._modified = True
        if key not in self._index:
            self._addkey(key, self._addval(val))
        else:
            # See whether the new value is small enough to fit in the
            # (padded) space currently occupied by the old value.
            pos, siz = self._index[key]
            oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
            newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
            if newblocks <= oldblocks:
                self._index[key] = self._setval(pos, val)
            else:
                # The new value doesn't fit in the (padded) space used
                # by the old value.  The blocks used by the old value are
                # forever lost.
                self._index[key] = self._addval(val)

            # Note that _index may be out of synch with the directory
            # file now:  _setval() and _addval() don't update the directory
            # file.  This also means that the on-disk directory and data
            # files are in a mutually inconsistent state, and they'll
            # remain that way until _commit() is called.  Note that this
            # is a disaster (for the database) if the program crashes
            # (so that _commit() never gets called).

    def __delitem__(self, key):
        """Remove *key* from the index and rewrite the directory file.

        Raises KeyError for an unknown key and ``error`` if the database
        is closed.  On a read-only database a DeprecationWarning is issued
        and the deletion still happens.
        """
        if self._readonly:
            import warnings
            warnings.warn('The database is opened for reading only',
                          DeprecationWarning, stacklevel=2)
        key = self._encode_key(key)
        self._verify_open()
        self._modified = True
        # The blocks used by the associated value are lost.
        del self._index[key]
        # XXX It's unclear why we do a _commit() here (the code always
        # XXX has, so I'm not changing it).  __setitem__ doesn't try to
        # XXX keep the directory file in synch.  Why should we?  Or
        # XXX why shouldn't __setitem__?
        self._commit()

    def keys(self):
        """Return a list of all keys; raise ``error`` if closed."""
        try:
            return list(self._index)
        except TypeError:
            # self._index is None once the database has been closed.
            raise error('DBM object has already been closed') from None

    def items(self):
        """Return a list of ``(key, value)`` pairs; raise ``error`` if closed."""
        self._verify_open()
        return [(key, self[key]) for key in self._index.keys()]

    def __contains__(self, key):
        """Return whether *key* is in the index; raise ``error`` if closed."""
        key = self._encode_key(key)
        try:
            return key in self._index
        except TypeError:
            # Either the database is closed (index is None) or the key is
            # unhashable; only the former is reported as ``error``.
            if self._index is None:
                raise error('DBM object has already been closed') from None
            else:
                raise

    def iterkeys(self):
        """Return an iterator over the keys; raise ``error`` if closed."""
        try:
            return iter(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None
    __iter__ = iterkeys

    def __len__(self):
        """Return the number of keys; raise ``error`` if closed."""
        try:
            return len(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None

    def close(self):
        """Commit pending index changes and mark the database as closed."""
        try:
            self._commit()
        finally:
            # A None index is what the other methods treat as "closed".
            self._index = self._datfile = self._dirfile = self._bakfile = None

    __del__ = close

    def _chmod(self, file):
        """Apply ``self._mode`` to *file* where ``os.chmod`` is available."""
        if hasattr(self._os, 'chmod'):
            self._os.chmod(file, self._mode)

    def __enter__(self):
        """Return the database itself for use in a ``with`` statement."""
        return self

    def __exit__(self, *args):
        """Close the database when the ``with`` block is left."""
        self.close()
294
295
def open(file, flag='c', mode=0o666):
    """Open the database whose base name is *file* and return its object.

    *flag* is handed on to the database object.  It is expected to be one
    of 'r', 'w', 'c' or 'n'; any other value triggers a DeprecationWarning
    but the database is opened all the same.

    *mode* is the UNIX permission mode applied to files the database
    writes.  It defaults to octal 0o666 and, where the platform provides
    os.umask(), has the bits of the current umask cleared.

    """
    try:
        # umask can only be read by setting it, so restore it right away.
        current_umask = _os.umask(0)
        _os.umask(current_umask)
    except AttributeError:
        # No umask available here: keep the requested mode untouched.
        effective_mode = mode
    else:
        effective_mode = mode & ~current_umask

    known_flags = ('r', 'w', 'c', 'n')
    if flag not in known_flags:
        from warnings import warn
        warn("Flag must be one of 'r', 'w', 'c', or 'n'",
             DeprecationWarning, stacklevel=2)

    return _Database(file, effective_mode, flag=flag)
325