# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""File IO methods that wrap the C++ FileSystem API.

The C++ FileSystem API is SWIG wrapped in file_io.i. These functions call
those wrappers to accomplish basic File IO operations.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import binascii
import os
import uuid

import six

from tensorflow.python import pywrap_tensorflow
from tensorflow.python.framework import c_api_util
from tensorflow.python.framework import errors
from tensorflow.python.util import compat
from tensorflow.python.util import deprecation
from tensorflow.python.util.tf_export import tf_export

# A good default block size depends on the system in question.
# A somewhat conservative default is chosen here.
_DEFAULT_BLOCK_SIZE = 16 * 1024 * 1024


class FileIO(object):
  """FileIO class that exposes methods to read / write to / from files.

  The constructor takes the following arguments:
  name: name of the file
  mode: one of 'r', 'w', 'a', 'r+', 'w+', 'a+'. Append 'b' for bytes mode.

  Can be used as an iterator to iterate over lines in the file.

  The default buffer size of the BufferedInputStream used for reading
  the file line by line is 1024 * 512 bytes.
  """

  def __init__(self, name, mode):
    self.__name = name
    self.__mode = mode
    self._read_buf = None
    self._writable_file = None
    self._binary_mode = "b" in mode
    mode = mode.replace("b", "")
    if mode not in ("r", "w", "a", "r+", "w+", "a+"):
      raise errors.InvalidArgumentError(
          None, None, "mode is not 'r' or 'w' or 'a' or 'r+' or 'w+' or 'a+'")
    self._read_check_passed = mode in ("r", "r+", "a+", "w+")
    self._write_check_passed = mode in ("a", "w", "r+", "a+", "w+")

  @property
  def name(self):
    """Returns the file name."""
    return self.__name

  @property
  def mode(self):
    """Returns the mode in which the file was opened."""
    return self.__mode

  def _preread_check(self):
    if not self._read_buf:
      if not self._read_check_passed:
        raise errors.PermissionDeniedError(None, None,
                                           "File isn't open for reading")
      with errors.raise_exception_on_not_ok_status() as status:
        self._read_buf = pywrap_tensorflow.CreateBufferedInputStream(
            compat.as_bytes(self.__name), 1024 * 512, status)

  def _prewrite_check(self):
    if not self._writable_file:
      if not self._write_check_passed:
        raise errors.PermissionDeniedError(None, None,
                                           "File isn't open for writing")
      with errors.raise_exception_on_not_ok_status() as status:
        self._writable_file = pywrap_tensorflow.CreateWritableFile(
            compat.as_bytes(self.__name), compat.as_bytes(self.__mode), status)

  def _prepare_value(self, val):
    if self._binary_mode:
      return compat.as_bytes(val)
    else:
      return compat.as_str_any(val)

  def size(self):
    """Returns the size of the file."""
    return stat(self.__name).length

  def write(self, file_content):
    """Writes file_content to the file. Appends to the end of the file."""
    self._prewrite_check()
    with errors.raise_exception_on_not_ok_status() as status:
      pywrap_tensorflow.AppendToFile(
          compat.as_bytes(file_content), self._writable_file, status)

  def read(self, n=-1):
    """Returns the contents of a file as a string or bytes.

    Starts reading from the current position in the file.

    Args:
      n: Read 'n' bytes if n != -1. If n = -1, reads to end of file.

    Returns:
      'n' bytes of the file (or the whole file) in bytes mode, or 'n' bytes of
      the string if in string (regular) mode.
    """
    self._preread_check()
    with errors.raise_exception_on_not_ok_status() as status:
      if n == -1:
        length = self.size() - self.tell()
      else:
        length = n
      return self._prepare_value(
          pywrap_tensorflow.ReadFromStream(self._read_buf, length, status))
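
  # Usage sketch (illustrative, not part of the class API; the path is an
  # assumption):
  #
  #   f = FileIO("/tmp/example.txt", mode="r")
  #   head = f.read(64)   # first 64 chars (bytes in 'rb' mode)
  #   rest = f.read()     # from the current position through end of file
  #   f.close()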

  @deprecation.deprecated_args(
      None,
      "position is deprecated in favor of the offset argument.",
      "position")
  def seek(self, offset=None, whence=0, position=None):
    # TODO(jhseu): Delete later. Used to omit `position` from docs.
    # pylint: disable=g-doc-args
    """Seeks to the offset in the file.

    Args:
      offset: The byte count relative to the whence argument.
      whence: Valid values for whence are:
        0: start of the file (default)
        1: relative to the current position of the file
        2: relative to the end of file. offset is usually negative.
    """
    # pylint: enable=g-doc-args
    self._preread_check()
    # We needed to make offset a keyword argument for backwards-compatibility.
    # This check exists so that we can convert back to having offset be a
    # positional argument.
    # TODO(jhseu): Make `offset` a positional argument after `position` is
    # deleted.
    if offset is None and position is None:
      raise TypeError("seek(): offset argument required")
    if offset is not None and position is not None:
      raise TypeError("seek(): offset and position may not be set "
                      "simultaneously.")

    if position is not None:
      offset = position

    with errors.raise_exception_on_not_ok_status() as status:
      if whence == 0:
        pass
      elif whence == 1:
        offset += self.tell()
      elif whence == 2:
        offset += self.size()
      else:
        raise errors.InvalidArgumentError(
            None, None,
            "Invalid whence argument: {}. Valid values are 0, 1, or 2."
            .format(whence))
      ret_status = self._read_buf.Seek(offset)
      pywrap_tensorflow.Set_TF_Status_from_Status(status, ret_status)

  def readline(self):
    r"""Reads the next line from the file. Leaves the '\n' at the end."""
    self._preread_check()
    return self._prepare_value(self._read_buf.ReadLineAsString())

  def readlines(self):
    """Returns all lines from the file in a list."""
    self._preread_check()
    lines = []
    while True:
      s = self.readline()
      if not s:
        break
      lines.append(s)
    return lines

  def tell(self):
    """Returns the current position in the file."""
    if self._read_check_passed:
      self._preread_check()
      return self._read_buf.Tell()
    else:
      self._prewrite_check()

      with errors.raise_exception_on_not_ok_status() as status:
        return pywrap_tensorflow.TellFile(self._writable_file, status)

  def __enter__(self):
    """Make usable with "with" statement."""
    return self

  def __exit__(self, unused_type, unused_value, unused_traceback):
    """Make usable with "with" statement."""
    self.close()

  def __iter__(self):
    return self

  def next(self):
    retval = self.readline()
    if not retval:
      raise StopIteration()
    return retval

  def __next__(self):
    return self.next()

  def flush(self):
    """Flushes the Writable file.

    This only ensures that the data has made its way out of the process without
    any guarantees on whether it's written to disk. This means that the
    data would survive an application crash but not necessarily an OS crash.
    """
    if self._writable_file:
      with errors.raise_exception_on_not_ok_status() as status:
        ret_status = self._writable_file.Flush()
        pywrap_tensorflow.Set_TF_Status_from_Status(status, ret_status)

  def close(self):
    """Closes FileIO. Should be called for the WritableFile to be flushed."""
    self._read_buf = None
    if self._writable_file:
      with errors.raise_exception_on_not_ok_status() as status:
        ret_status = self._writable_file.Close()
        pywrap_tensorflow.Set_TF_Status_from_Status(status, ret_status)
    self._writable_file = None
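

# Usage sketch (illustrative; paths are assumptions). FileIO supports the
# context-manager and iterator protocols defined above:
#
#   with FileIO("/tmp/out.txt", mode="w") as f:
#     f.write("line 1\n")   # close() (via the with block) flushes the file
#
#   with FileIO("/tmp/out.txt", mode="r") as f:
#     for line in f:        # yields lines with the trailing '\n' kept
#       print(line)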


@tf_export(v1=["gfile.Exists"])
def file_exists(filename):
  """Determines whether a path exists or not.

  Args:
    filename: string, a path

  Returns:
    True if the path exists, whether it's a file or a directory.
    False if the path does not exist and there are no filesystem errors.

  Raises:
    errors.OpError: Propagates any errors reported by the FileSystem API.
  """
  return file_exists_v2(filename)


@tf_export("io.gfile.exists")
def file_exists_v2(path):
  """Determines whether a path exists or not.

  Args:
    path: string, a path

  Returns:
    True if the path exists, whether it's a file or a directory.
    False if the path does not exist and there are no filesystem errors.

  Raises:
    errors.OpError: Propagates any errors reported by the FileSystem API.
  """
  try:
    with errors.raise_exception_on_not_ok_status() as status:
      pywrap_tensorflow.FileExists(compat.as_bytes(path), status)
  except errors.NotFoundError:
    return False
  return True


@tf_export(v1=["gfile.Remove"])
def delete_file(filename):
  """Deletes the file located at 'filename'.

  Args:
    filename: string, a filename

  Raises:
    errors.OpError: Propagates any errors reported by the FileSystem API. E.g.,
      NotFoundError if the file does not exist.
  """
  delete_file_v2(filename)


@tf_export("io.gfile.remove")
def delete_file_v2(path):
  """Deletes the path located at 'path'.

  Args:
    path: string, a path

  Raises:
    errors.OpError: Propagates any errors reported by the FileSystem API. E.g.,
      NotFoundError if the path does not exist.
  """
  with errors.raise_exception_on_not_ok_status() as status:
    pywrap_tensorflow.DeleteFile(compat.as_bytes(path), status)


def read_file_to_string(filename, binary_mode=False):
  """Reads the entire contents of a file to a string.

  Args:
    filename: string, path to a file
    binary_mode: whether to open the file in binary mode or not. This changes
      the type of the object returned.

  Returns:
    contents of the file as a string or bytes.

  Raises:
    errors.OpError: Raises a variety of errors that are subtypes of
      errors.OpError, e.g. NotFoundError.
  """
  if binary_mode:
    f = FileIO(filename, mode="rb")
  else:
    f = FileIO(filename, mode="r")
  # Use the context manager so the read buffer is released promptly.
  with f:
    return f.read()


def write_string_to_file(filename, file_content):
  """Writes a string to a given file.

  Args:
    filename: string, path to a file
    file_content: string, contents that need to be written to the file

  Raises:
    errors.OpError: If there are errors during the operation.
  """
  with FileIO(filename, mode="w") as f:
    f.write(file_content)


@tf_export(v1=["gfile.Glob"])
def get_matching_files(filename):
  """Returns a list of files that match the given pattern(s).

  Args:
    filename: string or iterable of strings. The glob pattern(s).

  Returns:
    A list of strings containing filenames that match the given pattern(s).

  Raises:
    errors.OpError: If there are filesystem / directory listing errors.
  """
  return get_matching_files_v2(filename)


@tf_export("io.gfile.glob")
def get_matching_files_v2(pattern):
  """Returns a list of files that match the given pattern(s).

  Args:
    pattern: string or iterable of strings. The glob pattern(s).

  Returns:
    A list of strings containing filenames that match the given pattern(s).

  Raises:
    errors.OpError: If there are filesystem / directory listing errors.
  """
  with errors.raise_exception_on_not_ok_status() as status:
    if isinstance(pattern, six.string_types):
      return [
          # Convert the filenames to string from bytes.
          compat.as_str_any(matching_filename)
          for matching_filename in pywrap_tensorflow.GetMatchingFiles(
              compat.as_bytes(pattern), status)
      ]
    else:
      return [
          # Convert the filenames to string from bytes.
          compat.as_str_any(matching_filename)
          for single_filename in pattern
          for matching_filename in pywrap_tensorflow.GetMatchingFiles(
              compat.as_bytes(single_filename), status)
      ]
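

# Usage sketch (illustrative; patterns and paths are assumptions):
#
#   get_matching_files_v2("/tmp/data/*.txt")
#   # -> ['/tmp/data/a.txt', '/tmp/data/b.txt']
#
#   # An iterable of patterns is also accepted; the matches are concatenated:
#   get_matching_files_v2(["/tmp/a/*.tfrecord", "/tmp/b/*.tfrecord"])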


@tf_export(v1=["gfile.MkDir"])
def create_dir(dirname):
  """Creates a directory with the name 'dirname'.

  Args:
    dirname: string, name of the directory to be created

  Notes:
    The parent directories need to exist. Use recursive_create_dir instead if
    there is the possibility that the parent dirs don't exist.

  Raises:
    errors.OpError: If the operation fails.
  """
  create_dir_v2(dirname)


@tf_export("io.gfile.mkdir")
def create_dir_v2(path):
  """Creates a directory with the name given by 'path'.

  Args:
    path: string, name of the directory to be created

  Notes:
    The parent directories need to exist. Use recursive_create_dir instead if
    there is the possibility that the parent dirs don't exist.

  Raises:
    errors.OpError: If the operation fails.
  """
  with errors.raise_exception_on_not_ok_status() as status:
    pywrap_tensorflow.CreateDir(compat.as_bytes(path), status)


@tf_export(v1=["gfile.MakeDirs"])
def recursive_create_dir(dirname):
  """Creates a directory and all parent/intermediate directories.

  It succeeds if dirname already exists and is writable.

  Args:
    dirname: string, name of the directory to be created

  Raises:
    errors.OpError: If the operation fails.
  """
  recursive_create_dir_v2(dirname)


@tf_export("io.gfile.makedirs")
def recursive_create_dir_v2(path):
  """Creates a directory and all parent/intermediate directories.

  It succeeds if path already exists and is writable.

  Args:
    path: string, name of the directory to be created

  Raises:
    errors.OpError: If the operation fails.
  """
  with errors.raise_exception_on_not_ok_status() as status:
    pywrap_tensorflow.RecursivelyCreateDir(compat.as_bytes(path), status)


@tf_export(v1=["gfile.Copy"])
def copy(oldpath, newpath, overwrite=False):
  """Copies data from oldpath to newpath.

  Args:
    oldpath: string, name of the file whose contents need to be copied
    newpath: string, name of the file to copy to
    overwrite: boolean, if false it's an error for newpath to be occupied by
      an existing file.

  Raises:
    errors.OpError: If the operation fails.
  """
  copy_v2(oldpath, newpath, overwrite)


@tf_export("io.gfile.copy")
def copy_v2(src, dst, overwrite=False):
  """Copies data from src to dst.

  Args:
    src: string, name of the file whose contents need to be copied
    dst: string, name of the file to copy to
    overwrite: boolean, if false it's an error for `dst` to be occupied by an
      existing file.

  Raises:
    errors.OpError: If the operation fails.
  """
  with errors.raise_exception_on_not_ok_status() as status:
    pywrap_tensorflow.CopyFile(
        compat.as_bytes(src), compat.as_bytes(dst), overwrite, status)
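

# Usage sketch (illustrative; paths are assumptions):
#
#   recursive_create_dir_v2("/tmp/backups")  # behaves like `mkdir -p`
#   copy_v2("/tmp/data.txt", "/tmp/backups/data.txt", overwrite=True)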


@tf_export(v1=["gfile.Rename"])
def rename(oldname, newname, overwrite=False):
  """Rename or move a file / directory.

  Args:
    oldname: string, pathname for a file
    newname: string, pathname to which the file needs to be moved
    overwrite: boolean, if false it's an error for `newname` to be occupied by
      an existing file.

  Raises:
    errors.OpError: If the operation fails.
  """
  rename_v2(oldname, newname, overwrite)


@tf_export("io.gfile.rename")
def rename_v2(src, dst, overwrite=False):
  """Rename or move a file / directory.

  Args:
    src: string, pathname for a file
    dst: string, pathname to which the file needs to be moved
    overwrite: boolean, if false it's an error for `dst` to be occupied by
      an existing file.

  Raises:
    errors.OpError: If the operation fails.
  """
  with errors.raise_exception_on_not_ok_status() as status:
    pywrap_tensorflow.RenameFile(
        compat.as_bytes(src), compat.as_bytes(dst), overwrite, status)


def atomic_write_string_to_file(filename, contents, overwrite=True):
  """Writes to `filename` atomically.

  This means that when `filename` appears in the filesystem, it will contain
  all of `contents`. With write_string_to_file, it is possible for the file
  to appear in the filesystem with `contents` only partially written.

  Accomplished by writing to a temp file and then renaming it.

  Args:
    filename: string, pathname for a file
    contents: string, contents that need to be written to the file
    overwrite: boolean, if false it's an error for `filename` to be occupied by
      an existing file.
  """
  temp_pathname = filename + ".tmp" + uuid.uuid4().hex
  write_string_to_file(temp_pathname, contents)
  try:
    rename(temp_pathname, filename, overwrite)
  except errors.OpError:
    delete_file(temp_pathname)
    raise
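

# The write-then-rename pattern above is atomic only insofar as rename() is
# atomic on the underlying filesystem. Usage sketch (illustrative; the path
# and contents are assumptions):
#
#   atomic_write_string_to_file("/tmp/manifest.txt", serialized_contents)
#   # Concurrent readers observe either the old file or the complete new
#   # contents, never a partial write.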


@tf_export(v1=["gfile.DeleteRecursively"])
def delete_recursively(dirname):
  """Deletes everything under dirname recursively.

  Args:
    dirname: string, a path to a directory

  Raises:
    errors.OpError: If the operation fails.
  """
  delete_recursively_v2(dirname)


@tf_export("io.gfile.rmtree")
def delete_recursively_v2(path):
  """Deletes everything under path recursively.

  Args:
    path: string, a path

  Raises:
    errors.OpError: If the operation fails.
  """
  with errors.raise_exception_on_not_ok_status() as status:
    pywrap_tensorflow.DeleteRecursively(compat.as_bytes(path), status)


@tf_export(v1=["gfile.IsDirectory"])
def is_directory(dirname):
  """Returns whether the path is a directory or not.

  Args:
    dirname: string, path to a potential directory

  Returns:
    True, if the path is a directory; False otherwise
  """
  return is_directory_v2(dirname)


@tf_export("io.gfile.isdir")
def is_directory_v2(path):
  """Returns whether the path is a directory or not.

  Args:
    path: string, path to a potential directory

  Returns:
    True, if the path is a directory; False otherwise
  """
  status = c_api_util.ScopedTFStatus()
  return pywrap_tensorflow.IsDirectory(compat.as_bytes(path), status)


@tf_export(v1=["gfile.ListDirectory"])
def list_directory(dirname):
  """Returns a list of entries contained within a directory.

  The list is in arbitrary order. It does not contain the special entries "."
  and "..".

  Args:
    dirname: string, path to a directory

  Returns:
    [filename1, filename2, ... filenameN] as strings

  Raises:
    errors.NotFoundError if directory doesn't exist
  """
  return list_directory_v2(dirname)


@tf_export("io.gfile.listdir")
def list_directory_v2(path):
  """Returns a list of entries contained within a directory.

  The list is in arbitrary order. It does not contain the special entries "."
  and "..".

  Args:
    path: string, path to a directory

  Returns:
    [filename1, filename2, ... filenameN] as strings

  Raises:
    errors.NotFoundError if directory doesn't exist
  """
  if not is_directory(path):
    raise errors.NotFoundError(
        node_def=None,
        op=None,
        message="Could not find directory {}".format(path))
  with errors.raise_exception_on_not_ok_status() as status:
    # Convert each element to string, since the return values of the
    # vector of string should be interpreted as strings, not bytes.
    return [
        compat.as_str_any(filename)
        for filename in pywrap_tensorflow.GetChildren(
            compat.as_bytes(path), status)
    ]


@tf_export(v1=["gfile.Walk"])
def walk(top, in_order=True):
  """Recursive directory tree generator for directories.

  Args:
    top: string, a directory name
    in_order: bool, Traverse pre order if True, post order if False.

  Errors that happen while listing directories are ignored.

  Yields:
    Each yield is a 3-tuple: the pathname of a directory, followed by lists of
    all its subdirectories and leaf files.
    (dirname, [subdirname, subdirname, ...], [filename, filename, ...])
    as strings
  """
  return walk_v2(top, in_order)


@tf_export("io.gfile.walk")
def walk_v2(top, topdown=True, onerror=None):
  """Recursive directory tree generator for directories.

  Args:
    top: string, a directory name
    topdown: bool, Traverse pre order if True, post order if False.
    onerror: optional handler for errors. Should be a function, it will be
      called with the error as argument. Rethrowing the error aborts the walk.

  Errors that happen while listing directories are ignored unless `onerror`
  is specified.

  Yields:
    Each yield is a 3-tuple: the pathname of a directory, followed by lists of
    all its subdirectories and leaf files.
    (dirname, [subdirname, subdirname, ...], [filename, filename, ...])
    as strings
  """
  top = compat.as_str_any(top)
  try:
    listing = list_directory(top)
  except errors.NotFoundError as err:
    if onerror:
      onerror(err)
    # Skip this directory if the error handler did not re-raise; there is
    # nothing to list either way.
    return

  files = []
  subdirs = []
  for item in listing:
    full_path = os.path.join(top, item)
    if is_directory(full_path):
      subdirs.append(item)
    else:
      files.append(item)

  here = (top, subdirs, files)

  if topdown:
    yield here

  for subdir in subdirs:
    for subitem in walk_v2(os.path.join(top, subdir), topdown, onerror=onerror):
      yield subitem

  if not topdown:
    yield here
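

# Usage sketch (illustrative; the directory layout is an assumption):
#
#   for dirname, subdirs, files in walk_v2("/tmp/logs"):
#     print(dirname, subdirs, files)
#   # topdown=True yields '/tmp/logs' before its subdirectories (pre order);
#   # topdown=False yields it after all of them (post order).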


@tf_export(v1=["gfile.Stat"])
def stat(filename):
  """Returns file statistics for a given path.

  Args:
    filename: string, path to a file

  Returns:
    FileStatistics struct that contains information about the path

  Raises:
    errors.OpError: If the operation fails.
  """
  return stat_v2(filename)


@tf_export("io.gfile.stat")
def stat_v2(path):
  """Returns file statistics for a given path.

  Args:
    path: string, path to a file

  Returns:
    FileStatistics struct that contains information about the path

  Raises:
    errors.OpError: If the operation fails.
  """
  file_statistics = pywrap_tensorflow.FileStatistics()
  with errors.raise_exception_on_not_ok_status() as status:
    pywrap_tensorflow.Stat(compat.as_bytes(path), file_statistics, status)
    return file_statistics


def filecmp(filename_a, filename_b):
  """Compare two files, returning True if they are the same, False otherwise.

  We check size first and return False quickly if the files are different
  sizes. If they are the same size, we continue by generating a crc32 for the
  whole file.

  You might wonder: why not use Python's filecmp.cmp() instead? The answer is
  that the builtin library is not robust to the many different filesystems
  TensorFlow runs on, and so here we perform a similar comparison with
  the more robust FileIO.

  Args:
    filename_a: string path to the first file.
    filename_b: string path to the second file.

  Returns:
    True if the files are the same, False otherwise.
  """
  size_a = FileIO(filename_a, "rb").size()
  size_b = FileIO(filename_b, "rb").size()
  if size_a != size_b:
    return False

  # Size is the same. Do a full check.
  crc_a = file_crc32(filename_a)
  crc_b = file_crc32(filename_b)
  return crc_a == crc_b


def file_crc32(filename, block_size=_DEFAULT_BLOCK_SIZE):
  """Get the crc32 of the passed file.

  The crc32 of a file can be used for error checking; two files with the same
  crc32 are considered equivalent. Note that the entire file must be read
  to produce the crc32.

  Args:
    filename: string, path to a file
    block_size: Integer, process the files by reading blocks of `block_size`
      bytes. Use -1 to read the file in one pass.

  Returns:
    hexadecimal as string, the crc32 of the passed file.
  """
  crc = 0
  with FileIO(filename, mode="rb") as f:
    chunk = f.read(n=block_size)
    while chunk:
      crc = binascii.crc32(chunk, crc)
      chunk = f.read(n=block_size)
  return hex(crc & 0xFFFFFFFF)
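

# Usage sketch (illustrative; paths are assumptions):
#
#   if not filecmp("/tmp/a.ckpt", "/tmp/b.ckpt"):
#     print("files differ")
#
#   file_crc32("/tmp/a.ckpt")  # returns a hex string such as '0x1c291ca3'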