• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""A dumb and slow but simple dbm clone.
2
3For database spam, spam.dir contains the index (a text file),
4spam.bak *may* contain a backup of the index (also a text file),
5while spam.dat contains the data (a binary file).
6
7XXX TO DO:
8
9- seems to contain a bug when updating...
10
11- reclaim free space (currently, space once occupied by deleted or expanded
12items is never reused)
13
14- support concurrent access (currently, if two processes take turns making
15updates, they can mess up the index)
16
17- support efficient access to large databases (currently, the whole index
18is read when the database is opened, and some updates rewrite the whole index)
19
- support opening for read-only (flag = 'm'; note that flag = 'r' already
rejects item assignment and deletion)
21
22"""
23
24import ast as _ast
25import io as _io
26import os as _os
27import collections.abc
28
# Names exported by "from <module> import *": the exception type and the
# open() factory function.
__all__ = ["error", "open"]

# Alignment unit of the data file: every stored value starts at a byte
# offset that is a multiple of _BLOCKSIZE.
_BLOCKSIZE = 512

# Exception type raised by this module for database-level failures
# (an alias of OSError).
error = OSError
34
class _Database(collections.abc.MutableMapping):
    """Mutable mapping of bytes keys to bytes values kept in files on disk.

    For a database named spam, spam.dat holds the values (binary),
    spam.dir holds the index (text) and spam.bak holds the previous index
    whenever _commit() replaced it.  str keys and values are encoded as
    UTF-8 before use; bytearray keys are converted to bytes so that they
    can be used as keys of the in-memory index.
    """

    # The on-disk directory and data files can remain in mutually
    # inconsistent states for an arbitrarily long time (see comments
    # at the end of __setitem__).  This is only repaired when _commit()
    # gets called.  One place _commit() gets called is from __del__(),
    # and if that occurs at program shutdown time, module globals may
    # already have gotten rebound to None.  Since it's crucial that
    # _commit() finish successfully, we can't ignore shutdown races
    # here, and _commit() must not reference any globals.
    _os = _os       # for _commit()
    _io = _io       # for _commit()

    # Class-level defaults, so that close() (and therefore __del__()) is
    # safe on an instance whose __init__() failed before assigning them.
    _index = None
    _modified = False

    def __init__(self, filebasename, mode, flag='c'):
        """Open (and, depending on flag, create) the database files.

        filebasename is the path without extension, mode is the permission
        mode applied to files this object writes, and flag is one of
        'r', 'w', 'c' or 'n'.  Only 'r' makes the object read-only; only
        'c' and 'n' allow missing files to be created.
        """
        filebasename = self._os.fsencode(filebasename)
        self._mode = mode
        self._readonly = (flag == 'r')

        # The directory file is a text file.  Each line looks like
        #    "%r, (%d, %d)\n" % (key, pos, siz)
        # where key is the string key, pos is the offset into the dat
        # file of the associated value's first byte, and siz is the number
        # of bytes in the associated value.
        self._dirfile = filebasename + b'.dir'

        # The data file is a binary file pointed into by the directory
        # file, and holds the values associated with keys.  Each value
        # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
        # binary 8-bit string value.
        self._datfile = filebasename + b'.dat'
        self._bakfile = filebasename + b'.bak'

        # The index is an in-memory dict, mirroring the directory file.
        self._index = None  # maps keys to (pos, siz) pairs

        # Handle the creation
        self._create(flag)
        self._update(flag)

    @staticmethod
    def _coerce_key(key):
        """Return key in the form used for index lookups.

        str is encoded as UTF-8 and bytearray (which is unhashable) is
        copied into bytes; any other object is returned unchanged so the
        caller's lookup raises whatever it raised before.
        """
        if isinstance(key, str):
            return key.encode('utf-8')
        if isinstance(key, bytearray):
            return bytes(key)
        return key

    def _create(self, flag):
        """Make sure the data file exists.

        With flag 'n' any existing data, backup and directory files are
        removed first.  If the data file cannot be opened, it is created
        for flags 'c' and 'n'; for any other flag the OSError propagates.
        """
        if flag == 'n':
            for filename in (self._datfile, self._bakfile, self._dirfile):
                try:
                    _os.remove(filename)
                except OSError:
                    pass
        # Mod by Jack: create data file if needed
        try:
            f = _io.open(self._datfile, 'r', encoding="Latin-1")
        except OSError:
            if flag not in ('c', 'n'):
                raise
            with _io.open(self._datfile, 'w', encoding="Latin-1") as f:
                self._chmod(self._datfile)
        else:
            f.close()

    # Read directory file into the in-memory index dict.
    def _update(self, flag):
        """Load the directory file into self._index.

        If the directory file cannot be opened, an empty one is created
        for flags 'c' and 'n'; for any other flag the OSError propagates.
        """
        self._modified = False
        self._index = {}
        try:
            f = _io.open(self._dirfile, 'r', encoding="Latin-1")
        except OSError:
            if flag not in ('c', 'n'):
                raise
            with self._io.open(self._dirfile, 'w', encoding="Latin-1") as f:
                self._chmod(self._dirfile)
        else:
            with f:
                for line in f:
                    line = line.rstrip()
                    key, pos_and_siz_pair = _ast.literal_eval(line)
                    # Keys are written Latin-1-decoded; reverse that here.
                    key = key.encode('Latin-1')
                    self._index[key] = pos_and_siz_pair

    # Write the index dict to the directory file.  The original directory
    # file (if any) is renamed with a .bak extension first.  If a .bak
    # file currently exists, it's deleted.
    def _commit(self):
        """Rewrite the directory file from the in-memory index, if needed.

        Does nothing when the database is closed or has no pending
        modifications.
        """
        # CAUTION:  It's vital that _commit() succeed, and _commit() can
        # be called from __del__().  Therefore we must never reference a
        # global in this routine.
        if self._index is None or not self._modified:
            return  # nothing to do

        try:
            self._os.unlink(self._bakfile)
        except OSError:
            pass

        try:
            self._os.rename(self._dirfile, self._bakfile)
        except OSError:
            pass

        with self._io.open(self._dirfile, 'w', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            for key, pos_and_siz_pair in self._index.items():
                # Use Latin-1 since it has no qualms with any value in any
                # position; UTF-8, though, does care sometimes.
                entry = "%r, %r\n" % (key.decode('Latin-1'), pos_and_siz_pair)
                f.write(entry)
        self._modified = False

    sync = _commit

    def _verify_open(self):
        """Raise error if the database has already been closed."""
        if self._index is None:
            raise error('DBM object has already been closed')

    def __getitem__(self, key):
        """Return the bytes stored under key; raise KeyError if absent."""
        key = self._coerce_key(key)
        self._verify_open()
        pos, siz = self._index[key]     # may raise KeyError
        with _io.open(self._datfile, 'rb') as f:
            f.seek(pos)
            dat = f.read(siz)
        return dat

    # Append val to the data file, starting at a _BLOCKSIZE-aligned
    # offset.  The data file is first padded with NUL bytes (if needed)
    # to get to an aligned offset.  Return pair
    #     (starting offset of val, len(val))
    def _addval(self, val):
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(0, 2)
            pos = int(f.tell())
            npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
            f.write(b'\0'*(npos-pos))
            pos = npos
            f.write(val)
        return (pos, len(val))

    # Write val to the data file, starting at offset pos.  The caller
    # is responsible for ensuring that there's enough room starting at
    # pos to hold val, without overwriting some other value.  Return
    # pair (pos, len(val)).
    def _setval(self, pos, val):
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(pos)
            f.write(val)
        return (pos, len(val))

    # key is a new key whose associated value starts in the data file
    # at offset pos and with length siz.  Add an index record to
    # the in-memory index dict, and append one to the directory file.
    def _addkey(self, key, pos_and_siz_pair):
        self._index[key] = pos_and_siz_pair
        with _io.open(self._dirfile, 'a', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            f.write("%r, %r\n" % (key.decode("Latin-1"), pos_and_siz_pair))

    def __setitem__(self, key, val):
        """Store val under key.

        Raises error if the database is read-only or closed, and
        TypeError if key or val is not str, bytes or bytearray.
        """
        if self._readonly:
            raise error('The database is opened for reading only')
        # bytearray keys are turned into bytes here: a bytearray is
        # unhashable and could not be used as a key of self._index.
        key = self._coerce_key(key)
        if not isinstance(key, bytes):
            raise TypeError("keys must be bytes or strings")
        if isinstance(val, str):
            val = val.encode('utf-8')
        elif not isinstance(val, (bytes, bytearray)):
            raise TypeError("values must be bytes or strings")
        self._verify_open()
        self._modified = True
        if key not in self._index:
            self._addkey(key, self._addval(val))
        else:
            # See whether the new value is small enough to fit in the
            # (padded) space currently occupied by the old value.
            pos, siz = self._index[key]
            oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
            newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
            if newblocks <= oldblocks:
                self._index[key] = self._setval(pos, val)
            else:
                # The new value doesn't fit in the (padded) space used
                # by the old value.  The blocks used by the old value are
                # forever lost.
                self._index[key] = self._addval(val)

            # Note that _index may be out of synch with the directory
            # file now:  _setval() and _addval() don't update the directory
            # file.  This also means that the on-disk directory and data
            # files are in a mutually inconsistent state, and they'll
            # remain that way until _commit() is called.  Note that this
            # is a disaster (for the database) if the program crashes
            # (so that _commit() never gets called).

    def __delitem__(self, key):
        """Remove key and rewrite the directory file.

        Raises error if the database is read-only or closed, and KeyError
        if key is absent.
        """
        if self._readonly:
            raise error('The database is opened for reading only')
        key = self._coerce_key(key)
        self._verify_open()
        # The blocks used by the associated value are lost.
        del self._index[key]
        # Only flag the index as dirty once the deletion has succeeded, so
        # that a KeyError above does not trigger a needless rewrite later.
        self._modified = True
        # XXX It's unclear why we do a _commit() here (the code always
        # XXX has, so I'm not changing it).  __setitem__ doesn't try to
        # XXX keep the directory file in synch.  Why should we?  Or
        # XXX why shouldn't __setitem__?
        self._commit()

    def keys(self):
        """Return a list of all keys; raise error if the database is closed."""
        try:
            return list(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None

    def items(self):
        """Return a list of (key, value) pairs read from the data file."""
        self._verify_open()
        return [(key, self[key]) for key in self._index.keys()]

    def __contains__(self, key):
        """Return whether key is present; raise error if closed."""
        key = self._coerce_key(key)
        try:
            return key in self._index
        except TypeError:
            # Either the database is closed (_index is None) or the key
            # itself is unhashable; only the former is a database error.
            if self._index is None:
                raise error('DBM object has already been closed') from None
            else:
                raise

    def iterkeys(self):
        """Return an iterator over the keys; raise error if closed."""
        try:
            return iter(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None
    __iter__ = iterkeys

    def __len__(self):
        """Return the number of keys; raise error if closed."""
        try:
            return len(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None

    def close(self):
        """Commit pending index changes and mark the database as closed."""
        try:
            self._commit()
        finally:
            self._index = self._datfile = self._dirfile = self._bakfile = None

    __del__ = close

    def _chmod(self, file):
        """Apply the mode given at construction time to file."""
        self._os.chmod(file, self._mode)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()
291
292
def open(file, flag='c', mode=0o666):
    """Open the database file, filename, and return corresponding object.

    The flag argument must be one of:

      'r'  open an existing database for reading only; assigning or
           deleting items raises error
      'w'  open an existing database for reading and writing
      'c'  open for reading and writing, creating the database files if
           they do not exist (the default)
      'n'  always start from a new, empty database, removing existing
           files first

    Any other value raises ValueError.  With 'r' and 'w' a missing
    database file raises OSError.

    The optional mode argument is the UNIX mode of the file, used only when
    the database has to be created.  It defaults to octal code 0o666 (and
    will be modified by the prevailing umask).

    """

    # Reject bad flags first, so that an invalid call does not touch the
    # process umask at all.
    if flag not in ('r', 'w', 'c', 'n'):
        raise ValueError("Flag must be one of 'r', 'w', 'c', or 'n'")

    # Modify mode depending on the umask.  The umask can only be read by
    # setting it, hence the set-and-restore pair.
    try:
        um = _os.umask(0)
        _os.umask(um)
    except AttributeError:
        pass
    else:
        # Turn off any bits that are set in the umask
        mode = mode & (~um)
    return _Database(file, mode, flag=flag)
320