"""A dumb and slow but simple dbm clone.

For database spam, spam.dir contains the index (a text file),
spam.bak *may* contain a backup of the index (also a text file),
while spam.dat contains the data (a binary file).

XXX TO DO:

- seems to contain a bug when updating...

- reclaim free space (currently, space once occupied by deleted or expanded
items is never reused)

- support concurrent access (currently, if two processes take turns making
updates, they can mess up the index)

- support efficient access to large databases (currently, the whole index
is read when the database is opened, and some updates rewrite the whole index)

- support opening for read-only (flag = 'm')

"""

import ast as _ast
import io as _io
import os as _os
import collections
import collections.abc

__all__ = ["error", "open"]

_BLOCKSIZE = 512

error = OSError


class _Database(collections.abc.MutableMapping):
    """Mapping of bytes keys to bytes values backed by three files.

    The abstract base class is taken from collections.abc: the
    collections.MutableMapping alias was removed in Python 3.10.

    str keys and values are encoded as UTF-8 before use; values are
    always returned as bytes.
    """

    # The on-disk directory and data files can remain in mutually
    # inconsistent states for an arbitrarily long time (see comments
    # at the end of __setitem__).  This is only repaired when _commit()
    # gets called.  One place _commit() gets called is from __del__(),
    # and if that occurs at program shutdown time, module globals may
    # already have gotten rebound to None.  Since it's crucial that
    # _commit() finish successfully, we can't ignore shutdown races
    # here, and _commit() must not reference any globals.
    _os = _os       # for _commit()
    _io = _io       # for _commit()

    def __init__(self, filebasename, mode, flag='c'):
        """Open (creating if needed) the database named filebasename.

        mode is the permission mask passed to os.chmod() for files this
        object writes.  flag is the dbm-style open flag: 'n' removes any
        existing files first and 'r' marks the object read-only.
        """
        self._mode = mode
        self._readonly = (flag == 'r')

        # The directory file is a text file.  Each line looks like
        #    "%r, (%d, %d)\n" % (key, pos, siz)
        # where key is the string key, pos is the offset into the dat
        # file of the associated value's first byte, and siz is the number
        # of bytes in the associated value.
        self._dirfile = filebasename + '.dir'

        # The data file is a binary file pointed into by the directory
        # file, and holds the values associated with keys.  Each value
        # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
        # binary 8-bit string value.
        self._datfile = filebasename + '.dat'
        self._bakfile = filebasename + '.bak'

        # The index is an in-memory dict, mirroring the directory file.
        self._index = None  # maps keys to (pos, siz) pairs

        # Handle the creation
        self._create(flag)
        self._update()

    def _create(self, flag):
        """Make sure the data file exists, honouring the 'n' flag.

        With flag 'n' the data, backup and directory files are removed
        first (missing files are ignored).  If the data file cannot be
        opened it is created empty; for flags other than 'c' and 'n' a
        DeprecationWarning is issued before doing so.
        """
        if flag == 'n':
            for filename in (self._datfile, self._bakfile, self._dirfile):
                try:
                    _os.remove(filename)
                except OSError:
                    pass
        # Mod by Jack: create data file if needed
        try:
            f = _io.open(self._datfile, 'r', encoding="Latin-1")
        except OSError:
            if flag not in ('c', 'n'):
                import warnings
                warnings.warn("The database file is missing, the "
                              "semantics of the 'c' flag will be used.",
                              DeprecationWarning, stacklevel=4)
            # Opening for writing creates the empty file; the handle
            # itself is not needed.
            with _io.open(self._datfile, 'w', encoding="Latin-1"):
                self._chmod(self._datfile)
        else:
            f.close()

    # Read directory file into the in-memory index dict.
    def _update(self):
        """Rebuild self._index from the directory file.

        If the directory file cannot be opened the index stays empty and
        the object is flagged as modified unless it is read-only.
        """
        self._index = {}
        try:
            f = _io.open(self._dirfile, 'r', encoding="Latin-1")
        except OSError:
            self._modified = not self._readonly
        else:
            self._modified = False
            with f:
                for line in f:
                    line = line.rstrip()
                    key, pos_and_siz_pair = _ast.literal_eval(line)
                    key = key.encode('Latin-1')
                    self._index[key] = pos_and_siz_pair

    # Write the index dict to the directory file.  The original directory
    # file (if any) is renamed with a .bak extension first.  If a .bak
    # file currently exists, it's deleted.
    def _commit(self):
        """Rewrite the directory file from the in-memory index.

        Does nothing when the database is closed or unmodified.
        """
        # CAUTION:  It's vital that _commit() succeed, and _commit() can
        # be called from __del__().  Therefore we must never reference a
        # global in this routine.
        if self._index is None or not self._modified:
            return  # nothing to do

        try:
            self._os.unlink(self._bakfile)
        except OSError:
            pass

        try:
            self._os.rename(self._dirfile, self._bakfile)
        except OSError:
            pass

        with self._io.open(self._dirfile, 'w', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            for key, pos_and_siz_pair in self._index.items():
                # Use Latin-1 since it has no qualms with any value in any
                # position; UTF-8, though, does care sometimes.
                entry = "%r, %r\n" % (key.decode('Latin-1'), pos_and_siz_pair)
                f.write(entry)

    sync = _commit

    def _verify_open(self):
        """Raise error if the database has already been closed."""
        if self._index is None:
            raise error('DBM object has already been closed')

    def __getitem__(self, key):
        """Return the bytes stored under key.

        A str key is encoded as UTF-8 first.  Raises KeyError for a
        missing key and error if the database is closed.
        """
        if isinstance(key, str):
            key = key.encode('utf-8')
        self._verify_open()
        pos, siz = self._index[key]     # may raise KeyError
        with _io.open(self._datfile, 'rb') as f:
            f.seek(pos)
            dat = f.read(siz)
        return dat

    # Append val to the data file, starting at a _BLOCKSIZE-aligned
    # offset.  The data file is first padded with NUL bytes (if needed)
    # to get to an aligned offset.  Return pair
    #     (starting offset of val, len(val))
    def _addval(self, val):
        """Append val at the next aligned offset; return (pos, len(val))."""
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(0, 2)
            pos = int(f.tell())
            # Round the end-of-file offset up to a multiple of _BLOCKSIZE.
            npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
            f.write(b'\0'*(npos-pos))
            pos = npos
            f.write(val)
        return (pos, len(val))

    # Write val to the data file, starting at offset pos.  The caller
    # is responsible for ensuring that there's enough room starting at
    # pos to hold val, without overwriting some other value.  Return
    # pair (pos, len(val)).
    def _setval(self, pos, val):
        """Overwrite the data file at offset pos; return (pos, len(val))."""
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(pos)
            f.write(val)
        return (pos, len(val))

    # key is a new key whose associated value starts in the data file
    # at offset pos and with length siz.  Add an index record to
    # the in-memory index dict, and append one to the directory file.
    def _addkey(self, key, pos_and_siz_pair):
        """Record a new key in the index and append it to the directory file."""
        self._index[key] = pos_and_siz_pair
        with _io.open(self._dirfile, 'a', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            f.write("%r, %r\n" % (key.decode("Latin-1"), pos_and_siz_pair))

    def __setitem__(self, key, val):
        """Store val under key.

        str keys and values are encoded as UTF-8; anything other than
        str, bytes or bytearray raises TypeError.  On a read-only
        database a DeprecationWarning is issued and the write still
        proceeds.  Raises error if the database is closed.
        """
        if self._readonly:
            import warnings
            warnings.warn('The database is opened for reading only',
                          DeprecationWarning, stacklevel=2)
        if isinstance(key, str):
            key = key.encode('utf-8')
        elif not isinstance(key, (bytes, bytearray)):
            raise TypeError("keys must be bytes or strings")
        if isinstance(val, str):
            val = val.encode('utf-8')
        elif not isinstance(val, (bytes, bytearray)):
            raise TypeError("values must be bytes or strings")
        self._verify_open()
        self._modified = True
        if key not in self._index:
            self._addkey(key, self._addval(val))
        else:
            # See whether the new value is small enough to fit in the
            # (padded) space currently occupied by the old value.
            pos, siz = self._index[key]
            oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
            newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
            if newblocks <= oldblocks:
                self._index[key] = self._setval(pos, val)
            else:
                # The new value doesn't fit in the (padded) space used
                # by the old value.  The blocks used by the old value are
                # forever lost.
                self._index[key] = self._addval(val)

            # Note that _index may be out of synch with the directory
            # file now:  _setval() and _addval() don't update the directory
            # file.  This also means that the on-disk directory and data
            # files are in a mutually inconsistent state, and they'll
            # remain that way until _commit() is called.  Note that this
            # is a disaster (for the database) if the program crashes
            # (so that _commit() never gets called).

    def __delitem__(self, key):
        """Remove key from the index and commit the directory file.

        A str key is encoded as UTF-8 first.  On a read-only database a
        DeprecationWarning is issued and the deletion still proceeds.
        Raises KeyError for a missing key and error if closed.
        """
        if self._readonly:
            import warnings
            warnings.warn('The database is opened for reading only',
                          DeprecationWarning, stacklevel=2)
        if isinstance(key, str):
            key = key.encode('utf-8')
        self._verify_open()
        self._modified = True
        # The blocks used by the associated value are lost.
        del self._index[key]
        # XXX It's unclear why we do a _commit() here (the code always
        # XXX has, so I'm not changing it).  __setitem__ doesn't try to
        # XXX keep the directory file in synch.  Why should we?  Or
        # XXX why shouldn't __setitem__?
        self._commit()

    def keys(self):
        """Return a list of all keys; raise error if closed."""
        try:
            return list(self._index)
        except TypeError:
            # self._index is None once the database has been closed.
            raise error('DBM object has already been closed') from None

    def items(self):
        """Return a list of (key, value) pairs; raise error if closed."""
        self._verify_open()
        return [(key, self[key]) for key in self._index]

    def __contains__(self, key):
        """Return whether key is present (str keys are UTF-8 encoded)."""
        if isinstance(key, str):
            key = key.encode('utf-8')
        try:
            return key in self._index
        except TypeError:
            # Either the database is closed (index is None) or the key
            # itself is unhashable; only the former becomes error.
            if self._index is None:
                raise error('DBM object has already been closed') from None
            else:
                raise

    def iterkeys(self):
        """Return an iterator over the keys; raise error if closed."""
        try:
            return iter(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None
    __iter__ = iterkeys

    def __len__(self):
        """Return the number of keys; raise error if closed."""
        try:
            return len(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None

    def close(self):
        """Commit pending index changes and mark the database closed.

        Calling close() again is harmless: _commit() returns at once
        when the index is None.
        """
        try:
            self._commit()
        finally:
            self._index = self._datfile = self._dirfile = self._bakfile = None

    __del__ = close

    def _chmod(self, file):
        """Apply self._mode to file where os.chmod is available."""
        if hasattr(self._os, 'chmod'):
            self._os.chmod(file, self._mode)

    def __enter__(self):
        """Return self so the database can be used in a with statement."""
        return self

    def __exit__(self, *args):
        """Close the database when the with block exits."""
        self.close()


def open(file, flag='c', mode=0o666):
    """Open the database named file and return the corresponding object.

    file is the base name: the '.dir', '.dat' and '.bak' suffixes are
    appended to it.

    The flag argument follows the other DBM implementations ('r', 'w',
    'c' or 'n'); any other value triggers a DeprecationWarning.  'n'
    removes existing database files before creating new ones.  'r' marks
    the database read-only, but writes only emit a DeprecationWarning
    and are still carried out.  For every flag a missing data file is
    created (with a DeprecationWarning unless the flag is 'c' or 'n').

    The optional mode argument is the UNIX mode applied to the data file
    when it is created and to the directory file when it is written.  It
    defaults to octal code 0o666 and is modified by the prevailing umask.

    """

    # Modify mode depending on the umask
    try:
        # The umask can only be read by setting it, so set it to 0 and
        # immediately restore the previous value.
        um = _os.umask(0)
        _os.umask(um)
    except AttributeError:
        pass
    else:
        # Turn off any bits that are set in the umask
        mode = mode & (~um)
    if flag not in ('r', 'w', 'c', 'n'):
        import warnings
        warnings.warn("Flag must be one of 'r', 'w', 'c', or 'n'",
                      DeprecationWarning, stacklevel=2)
    return _Database(file, mode, flag=flag)