1"""A dumb and slow but simple dbm clone.
2
3For database spam, spam.dir contains the index (a text file),
4spam.bak *may* contain a backup of the index (also a text file),
5while spam.dat contains the data (a binary file).
6
7XXX TO DO:
8
9- seems to contain a bug when updating...
10
11- reclaim free space (currently, space once occupied by deleted or expanded
12items is never reused)
13
14- support concurrent access (currently, if two processes take turns making
15updates, they can mess up the index)
16
17- support efficient access to large databases (currently, the whole index
18is read when the database is opened, and some updates rewrite the whole index)
19
20- support opening for read-only (flag = 'm')
21
22"""

import ast as _ast
import io as _io
import os as _os
import collections.abc

__all__ = ["error", "open"]

_BLOCKSIZE = 512

error = OSError

class _Database(collections.abc.MutableMapping):

    # The on-disk directory and data files can remain in mutually
    # inconsistent states for an arbitrarily long time (see comments
    # at the end of __setitem__).  This is only repaired when _commit()
    # gets called.  One place _commit() gets called is from __del__(),
    # and if that occurs at program shutdown time, module globals may
    # already have gotten rebound to None.  Since it's crucial that
    # _commit() finish successfully, we can't ignore shutdown races
    # here, and _commit() must not reference any globals.
    _os = _os       # for _commit()
    _io = _io       # for _commit()

    def __init__(self, filebasename, mode, flag='c'):
        self._mode = mode
        self._readonly = (flag == 'r')

        # The directory file is a text file.  Each line looks like
        #    "%r, (%d, %d)\n" % (key, pos, siz)
        # where key is the string key, pos is the offset into the dat
        # file of the associated value's first byte, and siz is the number
        # of bytes in the associated value.
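        # Illustrative example: a key b'spam' whose 6-byte value starts at
        # offset 512 in the data file is recorded as the line
        #     'spam', (512, 6)
        # which is what _addkey() and _commit() below actually write.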
        self._dirfile = filebasename + '.dir'

        # The data file is a binary file pointed into by the directory
        # file, and holds the values associated with keys.  Each value
        # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
        # binary 8-bit string value.
        self._datfile = filebasename + '.dat'
        self._bakfile = filebasename + '.bak'

        # The index is an in-memory dict, mirroring the directory file.
        self._index = None  # maps keys to (pos, siz) pairs

        # Handle the creation
        self._create(flag)
        self._update(flag)

    def _create(self, flag):
        if flag == 'n':
            for filename in (self._datfile, self._bakfile, self._dirfile):
                try:
                    _os.remove(filename)
                except OSError:
                    pass
        # Mod by Jack: create data file if needed
        try:
            f = _io.open(self._datfile, 'r', encoding="Latin-1")
        except OSError:
            if flag not in ('c', 'n'):
                raise
            with _io.open(self._datfile, 'w', encoding="Latin-1") as f:
                self._chmod(self._datfile)
        else:
            f.close()

    # Read directory file into the in-memory index dict.
    def _update(self, flag):
        self._modified = False
        self._index = {}
        try:
            f = _io.open(self._dirfile, 'r', encoding="Latin-1")
        except OSError:
            if flag not in ('c', 'n'):
                raise
            self._modified = True
        else:
            with f:
                for line in f:
                    line = line.rstrip()
                    key, pos_and_siz_pair = _ast.literal_eval(line)
                    key = key.encode('Latin-1')
                    self._index[key] = pos_and_siz_pair

    # Write the index dict to the directory file.  The original directory
    # file (if any) is renamed with a .bak extension first.  If a .bak
    # file currently exists, it's deleted.
    def _commit(self):
        # CAUTION:  It's vital that _commit() succeed, and _commit() can
        # be called from __del__().  Therefore we must never reference a
        # global in this routine.
        if self._index is None or not self._modified:
            return  # nothing to do

        try:
            self._os.unlink(self._bakfile)
        except OSError:
            pass

        try:
            self._os.rename(self._dirfile, self._bakfile)
        except OSError:
            pass

        with self._io.open(self._dirfile, 'w', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            for key, pos_and_siz_pair in self._index.items():
                # Use Latin-1 since it has no qualms with any value in any
                # position; UTF-8, though, does care sometimes.
                entry = "%r, %r\n" % (key.decode('Latin-1'), pos_and_siz_pair)
                f.write(entry)

    sync = _commit

    def _verify_open(self):
        if self._index is None:
            raise error('DBM object has already been closed')

    def __getitem__(self, key):
        if isinstance(key, str):
            key = key.encode('utf-8')
        self._verify_open()
        pos, siz = self._index[key]     # may raise KeyError
        with _io.open(self._datfile, 'rb') as f:
            f.seek(pos)
            dat = f.read(siz)
        return dat

    # Append val to the data file, starting at a _BLOCKSIZE-aligned
    # offset.  The data file is first padded with NUL bytes (if needed)
    # to get to an aligned offset.  Return pair
    #     (starting offset of val, len(val))
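    # Worked example (illustrative numbers only): with _BLOCKSIZE = 512,
    # if the data file is currently 700 bytes long, the next value is
    # written at npos = ((700 + 511) // 512) * 512 = 1024, after 324
    # NUL padding bytes.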
    def _addval(self, val):
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(0, 2)
            pos = int(f.tell())
            npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
            f.write(b'\0'*(npos-pos))
            pos = npos
            f.write(val)
        return (pos, len(val))

    # Write val to the data file, starting at offset pos.  The caller
    # is responsible for ensuring that there's enough room starting at
    # pos to hold val, without overwriting some other value.  Return
    # pair (pos, len(val)).
    def _setval(self, pos, val):
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(pos)
            f.write(val)
        return (pos, len(val))

    # key is a new key whose associated value starts in the data file
    # at offset pos and with length siz.  Add an index record to
    # the in-memory index dict, and append one to the directory file.
    def _addkey(self, key, pos_and_siz_pair):
        self._index[key] = pos_and_siz_pair
        with _io.open(self._dirfile, 'a', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            f.write("%r, %r\n" % (key.decode("Latin-1"), pos_and_siz_pair))

    def __setitem__(self, key, val):
        if self._readonly:
            raise error('The database is opened for reading only')
        if isinstance(key, str):
            key = key.encode('utf-8')
        elif not isinstance(key, (bytes, bytearray)):
            raise TypeError("keys must be bytes or strings")
        if isinstance(val, str):
            val = val.encode('utf-8')
        elif not isinstance(val, (bytes, bytearray)):
            raise TypeError("values must be bytes or strings")
        self._verify_open()
        self._modified = True
        if key not in self._index:
            self._addkey(key, self._addval(val))
        else:
            # See whether the new value is small enough to fit in the
            # (padded) space currently occupied by the old value.
            pos, siz = self._index[key]
            oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
            newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
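            # Worked example (illustrative numbers only): with
            # _BLOCKSIZE = 512, an old value of siz = 100 occupies
            # oldblocks = 1 block; a new 600-byte value needs
            # newblocks = 2, so it doesn't fit and is appended instead.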
            if newblocks <= oldblocks:
                self._index[key] = self._setval(pos, val)
            else:
                # The new value doesn't fit in the (padded) space used
                # by the old value.  The blocks used by the old value are
                # forever lost.
                self._index[key] = self._addval(val)

            # Note that _index may be out of synch with the directory
            # file now:  _setval() and _addval() don't update the directory
            # file.  This also means that the on-disk directory and data
            # files are in a mutually inconsistent state, and they'll
            # remain that way until _commit() is called.  Note that this
            # is a disaster (for the database) if the program crashes
            # (so that _commit() never gets called).
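            # If durability matters before close(), sync() (the public
            # alias for _commit() defined above) can be called explicitly
            # to rewrite the directory file.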

    def __delitem__(self, key):
        if self._readonly:
            raise error('The database is opened for reading only')
        if isinstance(key, str):
            key = key.encode('utf-8')
        self._verify_open()
        self._modified = True
        # The blocks used by the associated value are lost.
        del self._index[key]
        # XXX It's unclear why we do a _commit() here (the code always
        # XXX has, so I'm not changing it).  __setitem__ doesn't try to
        # XXX keep the directory file in synch.  Why should we?  Or
        # XXX why shouldn't __setitem__?
        self._commit()

    def keys(self):
        try:
            return list(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None

    def items(self):
        self._verify_open()
        return [(key, self[key]) for key in self._index.keys()]

    def __contains__(self, key):
        if isinstance(key, str):
            key = key.encode('utf-8')
        try:
            return key in self._index
        except TypeError:
            if self._index is None:
                raise error('DBM object has already been closed') from None
            else:
                raise

    def iterkeys(self):
        try:
            return iter(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None
    __iter__ = iterkeys

    def __len__(self):
        try:
            return len(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None

    def close(self):
        try:
            self._commit()
        finally:
            self._index = self._datfile = self._dirfile = self._bakfile = None

    __del__ = close

    def _chmod(self, file):
        self._os.chmod(file, self._mode)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()


def open(file, flag='c', mode=0o666):
    """Open the database named by 'file' and return the corresponding object.

    The flag argument controls how the database is opened:  'r' opens an
    existing database read-only, 'w' opens an existing database for
    reading and writing, 'c' (the default) opens the database for update
    and creates it if it does not exist, and 'n' always creates a new,
    empty database.  Any other value raises ValueError.

    The optional mode argument is the UNIX mode of the file, used only when
    the database has to be created.  It defaults to octal code 0o666 (and
    will be modified by the prevailing umask).

    """

    # Modify mode depending on the umask
    try:
        um = _os.umask(0)
        _os.umask(um)
    except AttributeError:
        pass
    else:
        # Turn off any bits that are set in the umask
        mode = mode & (~um)
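        # Worked example (illustrative numbers only): with mode = 0o666
        # and a typical umask of 0o022, the files are created with
        # mode 0o666 & ~0o022 == 0o644.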
    if flag not in ('r', 'w', 'c', 'n'):
        raise ValueError("Flag must be one of 'r', 'w', 'c', or 'n'")
    return _Database(file, mode, flag=flag)