"""A dumb and slow but simple dbm clone.

For database spam, spam.dir contains the index (a text file),
spam.bak *may* contain a backup of the index (also a text file),
while spam.dat contains the data (a binary file).

XXX TO DO:

- seems to contain a bug when updating...

- reclaim free space (currently, space once occupied by deleted or expanded
items is never reused)

- support concurrent access (currently, if two processes take turns making
updates, they can mess up the index)

- support efficient access to large databases (currently, the whole index
is read when the database is opened, and some updates rewrite the whole index)

"""

import ast as _ast
import io as _io
import os as _os
import collections.abc

__all__ = ["error", "open"]

_BLOCKSIZE = 512

error = OSError


def _normalize_key(key):
    """Return *key* in the form used for index lookups.

    str keys are encoded as UTF-8 and bytearray keys are copied into
    (hashable) bytes objects.  Any other object is returned unchanged so
    that the caller's own lookup or type check decides what happens.
    """
    if isinstance(key, str):
        return key.encode('utf-8')
    if isinstance(key, bytearray):
        return bytes(key)
    return key


class _Database(collections.abc.MutableMapping):
    """Mapping of bytes keys to bytes values stored in three plain files.

    str keys and values are encoded as UTF-8 on the way in; values are
    always returned as bytes.
    """

    # The on-disk directory and data files can remain in mutually
    # inconsistent states for an arbitrarily long time (see comments
    # at the end of __setitem__).  This is only repaired when _commit()
    # gets called.  One place _commit() gets called is from __del__(),
    # and if that occurs at program shutdown time, module globals may
    # already have gotten rebound to None.  Since it's crucial that
    # _commit() finish successfully, we can't ignore shutdown races
    # here, and _commit() must not reference any globals.
    _os = _os       # for _commit()
    _io = _io       # for _commit()

    # Class-level default so that close()/__del__() on an instance whose
    # __init__ failed before assigning _index is a harmless no-op
    # instead of an AttributeError inside _commit().
    _index = None

    def __init__(self, filebasename, mode, flag='c'):
        """Open (and, depending on *flag*, create) the database files.

        filebasename -- path without extension; '.dir', '.dat' and '.bak'
                        are appended to it.
        mode         -- permission bits applied to the files via chmod.
        flag         -- 'r' (read-only), 'w', 'c' or 'n'; see open().
        """
        filebasename = self._os.fsencode(filebasename)
        self._mode = mode
        self._readonly = (flag == 'r')

        # The directory file is a text file.  Each line looks like
        #    "%r, (%d, %d)\n" % (key, pos, siz)
        # where key is the string key, pos is the offset into the dat
        # file of the associated value's first byte, and siz is the number
        # of bytes in the associated value.
        self._dirfile = filebasename + b'.dir'

        # The data file is a binary file pointed into by the directory
        # file, and holds the values associated with keys.  Each value
        # begins at a _BLOCKSIZE-aligned byte offset, and is a raw
        # binary 8-bit string value.
        self._datfile = filebasename + b'.dat'
        self._bakfile = filebasename + b'.bak'

        # The index is an in-memory dict, mirroring the directory file.
        self._index = None  # maps keys to (pos, siz) pairs

        # Handle the creation
        self._create(flag)
        self._update(flag)

    def _create(self, flag):
        """Make sure the data file exists, honouring *flag*.

        With 'n' any existing database files are removed first.  A missing
        data file is created for 'c' and 'n'; for any other flag the
        OSError from the failed open propagates.
        """
        if flag == 'n':
            for filename in (self._datfile, self._bakfile, self._dirfile):
                try:
                    _os.remove(filename)
                except OSError:
                    pass
        # Mod by Jack: create data file if needed
        try:
            f = _io.open(self._datfile, 'r', encoding="Latin-1")
        except OSError:
            if flag not in ('c', 'n'):
                raise
            with _io.open(self._datfile, 'w', encoding="Latin-1") as f:
                self._chmod(self._datfile)
        else:
            f.close()

    # Read directory file into the in-memory index dict.
    def _update(self, flag):
        """Load the directory file into self._index.

        A missing directory file is created (empty) for 'c' and 'n'; for
        any other flag the OSError from the failed open propagates.
        """
        self._modified = False
        self._index = {}
        try:
            f = _io.open(self._dirfile, 'r', encoding="Latin-1")
        except OSError:
            if flag not in ('c', 'n'):
                raise
            with self._io.open(self._dirfile, 'w', encoding="Latin-1") as f:
                self._chmod(self._dirfile)
        else:
            with f:
                for line in f:
                    line = line.rstrip()
                    key, pos_and_siz_pair = _ast.literal_eval(line)
                    # Keys are written Latin-1-decoded; reverse that here.
                    key = key.encode('Latin-1')
                    self._index[key] = pos_and_siz_pair

    # Write the index dict to the directory file.  The original directory
    # file (if any) is renamed with a .bak extension first.  If a .bak
    # file currently exists, it's deleted.
    def _commit(self):
        # CAUTION:  It's vital that _commit() succeed, and _commit() can
        # be called from __del__().  Therefore we must never reference a
        # global in this routine.
        if self._index is None or not self._modified:
            return  # nothing to do

        try:
            self._os.unlink(self._bakfile)
        except OSError:
            pass

        try:
            self._os.rename(self._dirfile, self._bakfile)
        except OSError:
            pass

        with self._io.open(self._dirfile, 'w', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            for key, pos_and_siz_pair in self._index.items():
                # Use Latin-1 since it has no qualms with any value in any
                # position; UTF-8, though, does care sometimes.
                entry = "%r, %r\n" % (key.decode('Latin-1'), pos_and_siz_pair)
                f.write(entry)
        self._modified = False

    sync = _commit

    def _verify_open(self):
        """Raise error if the database has already been closed."""
        if self._index is None:
            raise error('DBM object has already been closed')

    def __getitem__(self, key):
        """Return the value stored under *key* as bytes.

        Raises KeyError for an unknown key and error if the database has
        been closed.
        """
        key = _normalize_key(key)
        self._verify_open()
        pos, siz = self._index[key]     # may raise KeyError
        with _io.open(self._datfile, 'rb') as f:
            f.seek(pos)
            dat = f.read(siz)
        return dat

    # Append val to the data file, starting at a _BLOCKSIZE-aligned
    # offset.  The data file is first padded with NUL bytes (if needed)
    # to get to an aligned offset.  Return pair
    #     (starting offset of val, len(val))
    def _addval(self, val):
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(0, 2)
            pos = int(f.tell())
            npos = ((pos + _BLOCKSIZE - 1) // _BLOCKSIZE) * _BLOCKSIZE
            f.write(b'\0'*(npos-pos))
            pos = npos
            f.write(val)
        return (pos, len(val))

    # Write val to the data file, starting at offset pos.  The caller
    # is responsible for ensuring that there's enough room starting at
    # pos to hold val, without overwriting some other value.  Return
    # pair (pos, len(val)).
    def _setval(self, pos, val):
        with _io.open(self._datfile, 'rb+') as f:
            f.seek(pos)
            f.write(val)
        return (pos, len(val))

    # key is a new key whose associated value starts in the data file
    # at offset pos and with length siz.  Add an index record to
    # the in-memory index dict, and append one to the directory file.
    def _addkey(self, key, pos_and_siz_pair):
        self._index[key] = pos_and_siz_pair
        with _io.open(self._dirfile, 'a', encoding="Latin-1") as f:
            self._chmod(self._dirfile)
            f.write("%r, %r\n" % (key.decode("Latin-1"), pos_and_siz_pair))

    def __setitem__(self, key, val):
        """Store *val* under *key*.

        Keys and values may be str (encoded as UTF-8), bytes or bytearray.
        Raises error if the database is read-only or closed, and TypeError
        for any other key or value type.
        """
        if self._readonly:
            raise error('The database is opened for reading only')
        if not isinstance(key, (str, bytes, bytearray)):
            raise TypeError("keys must be bytes or strings")
        # bytearray is unhashable, so it must become bytes before it can
        # be looked up in (or stored into) the index dict.
        key = _normalize_key(key)
        if isinstance(val, str):
            val = val.encode('utf-8')
        elif not isinstance(val, (bytes, bytearray)):
            raise TypeError("values must be bytes or strings")
        self._verify_open()
        self._modified = True
        if key not in self._index:
            self._addkey(key, self._addval(val))
        else:
            # See whether the new value is small enough to fit in the
            # (padded) space currently occupied by the old value.
            pos, siz = self._index[key]
            oldblocks = (siz + _BLOCKSIZE - 1) // _BLOCKSIZE
            newblocks = (len(val) + _BLOCKSIZE - 1) // _BLOCKSIZE
            if newblocks <= oldblocks:
                self._index[key] = self._setval(pos, val)
            else:
                # The new value doesn't fit in the (padded) space used
                # by the old value.  The blocks used by the old value are
                # forever lost.
                self._index[key] = self._addval(val)

            # Note that _index may be out of synch with the directory
            # file now:  _setval() and _addval() don't update the directory
            # file.  This also means that the on-disk directory and data
            # files are in a mutually inconsistent state, and they'll
            # remain that way until _commit() is called.  Note that this
            # is a disaster (for the database) if the program crashes
            # (so that _commit() never gets called).

    def __delitem__(self, key):
        """Remove *key* and rewrite the directory file.

        Raises KeyError for an unknown key and error if the database is
        read-only or closed.
        """
        if self._readonly:
            raise error('The database is opened for reading only')
        key = _normalize_key(key)
        self._verify_open()
        # The blocks used by the associated value are lost.
        del self._index[key]
        # Only flag the index as dirty once the deletion has succeeded;
        # a KeyError above must not trigger a later rewrite of the
        # directory file (which would also replace the .bak file).
        self._modified = True
        # XXX It's unclear why we do a _commit() here (the code always
        # XXX has, so I'm not changing it).  __setitem__ doesn't try to
        # XXX keep the directory file in synch.  Why should we?  Or
        # XXX why shouldn't __setitem__?
        self._commit()

    def keys(self):
        """Return a list of all keys (bytes)."""
        try:
            return list(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None

    def items(self):
        """Return a list of (key, value) pairs, reading every value."""
        self._verify_open()
        return [(key, self[key]) for key in self._index]

    def __contains__(self, key):
        """Return True if *key* is present in the index."""
        key = _normalize_key(key)
        try:
            return key in self._index
        except TypeError:
            # Either the database is closed (_index is None) or the key
            # is unhashable; only the former is reported as error.
            if self._index is None:
                raise error('DBM object has already been closed') from None
            else:
                raise

    def iterkeys(self):
        """Return an iterator over the keys."""
        try:
            return iter(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None
    __iter__ = iterkeys

    def __len__(self):
        try:
            return len(self._index)
        except TypeError:
            raise error('DBM object has already been closed') from None

    def close(self):
        """Commit pending index changes and mark the database closed.

        Calling close() more than once is harmless.
        """
        try:
            self._commit()
        finally:
            self._index = self._datfile = self._dirfile = self._bakfile = None

    __del__ = close

    def _chmod(self, file):
        """Apply the mode given at construction time to *file*."""
        self._os.chmod(file, self._mode)

    def __enter__(self):
        return self

    def __exit__(self, *args):
        self.close()


def open(file, flag='c', mode=0o666):
    """Open the database file, filename, and return corresponding object.

    The flag argument controls how the database is opened:

    'r' -- open an existing database for reading only
    'w' -- open an existing database for reading and writing
    'c' -- open for reading and writing, creating the database if it
           does not exist (the default)
    'n' -- always create a new, empty database, open for reading and
           writing

    Any other value raises ValueError.

    The optional mode argument is the UNIX mode applied to the database
    files when they are created or rewritten.  It defaults to octal code
    0o666 (and will be modified by the prevailing umask).

    """

    # Modify mode depending on the umask
    try:
        um = _os.umask(0)
        _os.umask(um)
    except AttributeError:
        pass
    else:
        # Turn off any bits that are set in the umask
        mode = mode & (~um)
    if flag not in ('r', 'w', 'c', 'n'):
        raise ValueError("Flag must be one of 'r', 'w', 'c', or 'n'")
    return _Database(file, mode, flag=flag)