• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""File IO methods that wrap the C++ FileSystem API."""
16from __future__ import absolute_import
17from __future__ import division
18from __future__ import print_function
19
20import binascii
21import os
22import uuid
23
24import six
25
26from tensorflow.python.framework import errors
27from tensorflow.python.lib.io import _pywrap_file_io
28from tensorflow.python.util import compat
29from tensorflow.python.util import deprecation
30from tensorflow.python.util.tf_export import tf_export
31
32# A good default block size depends on the system in question.
33# A somewhat conservative default chosen here.
34_DEFAULT_BLOCK_SIZE = 16 * 1024 * 1024
35
36
class FileIO(object):
  """FileIO class that exposes methods to read / write to / from files.

  The constructor takes the following arguments:
  name: [path-like object](https://docs.python.org/3/glossary.html#term-path-like-object)
    giving the pathname of the file to be opened.
  mode: one of `r`, `w`, `a`, `r+`, `w+`, `a+`. Append `b` for bytes mode.

  Can be used as an iterator to iterate over lines in the file.

  The default buffer size used for the BufferedInputStream used for reading
  the file line by line is 1024 * 512 bytes.
  """

  def __init__(self, name, mode):
    """Creates the FileIO wrapper without opening any underlying stream.

    Args:
      name: path-like object, pathname of the file to operate on.
      mode: one of `r`, `w`, `a`, `r+`, `w+`, `a+`, optionally with `b`.

    Raises:
      errors.InvalidArgumentError: If `mode` (after stripping `b`) is not one
        of the supported modes.
    """
    # Name-mangled attributes; exposed read-only via the properties below.
    self.__name = name
    self.__mode = mode
    # The underlying C++ streams are created lazily by _preread_check /
    # _prewrite_check on first use.
    self._read_buf = None
    self._writable_file = None
    self._binary_mode = "b" in mode
    # Validate on the mode with the "b" flag stripped; binary-ness is tracked
    # separately via _binary_mode.
    mode = mode.replace("b", "")
    if mode not in ("r", "w", "a", "r+", "w+", "a+"):
      raise errors.InvalidArgumentError(
          None, None, "mode is not 'r' or 'w' or 'a' or 'r+' or 'w+' or 'a+'")
    # Precompute which operations this mode permits.
    self._read_check_passed = mode in ("r", "r+", "a+", "w+")
    self._write_check_passed = mode in ("a", "w", "r+", "a+", "w+")

  @property
  def name(self):
    """Returns the file name."""
    return self.__name

  @property
  def mode(self):
    """Returns the mode in which the file was opened."""
    return self.__mode

  def _preread_check(self):
    # Lazily creates the buffered input stream; raises if the mode does not
    # allow reading.
    if not self._read_buf:
      if not self._read_check_passed:
        raise errors.PermissionDeniedError(None, None,
                                           "File isn't open for reading")
      # 1024 * 512 is the buffer size used for line-by-line reading.
      self._read_buf = _pywrap_file_io.BufferedInputStream(
          compat.path_to_str(self.__name), 1024 * 512)

  def _prewrite_check(self):
    # Lazily creates the writable file; raises if the mode does not allow
    # writing. Note the original (unstripped) mode is forwarded here.
    if not self._writable_file:
      if not self._write_check_passed:
        raise errors.PermissionDeniedError(None, None,
                                           "File isn't open for writing")
      self._writable_file = _pywrap_file_io.WritableFile(
          compat.path_to_bytes(self.__name), compat.as_bytes(self.__mode))

  def _prepare_value(self, val):
    # Converts values returned by the C++ layer to bytes or str depending on
    # whether the file was opened in binary mode.
    if self._binary_mode:
      return compat.as_bytes(val)
    else:
      return compat.as_str_any(val)

  def size(self):
    """Returns the size of the file."""
    # Delegates to the module-level stat(); does not open the file.
    return stat(self.__name).length

  def write(self, file_content):
    """Writes file_content to the file. Appends to the end of the file."""
    self._prewrite_check()
    self._writable_file.append(compat.as_bytes(file_content))

  def read(self, n=-1):
    """Returns the contents of a file as a string.

    Starts reading from current position in file.

    Args:
      n: Read `n` bytes if `n != -1`. If `n = -1`, reads to end of file.

    Returns:
      `n` bytes of the file (or whole file) in bytes mode or `n` bytes of the
      string if in string (regular) mode.
    """
    self._preread_check()
    if n == -1:
      # Read everything remaining between the current offset and EOF.
      length = self.size() - self.tell()
    else:
      length = n
    return self._prepare_value(self._read_buf.read(length))

  @deprecation.deprecated_args(
      None, "position is deprecated in favor of the offset argument.",
      "position")
  def seek(self, offset=None, whence=0, position=None):
    # TODO(jhseu): Delete later. Used to omit `position` from docs.
    # pylint: disable=g-doc-args
    """Seeks to the offset in the file.

    Args:
      offset: The byte count relative to the whence argument.
      whence: Valid values for whence are:
        0: start of the file (default)
        1: relative to the current position of the file
        2: relative to the end of file. `offset` is usually negative.
    """
    # pylint: enable=g-doc-args
    self._preread_check()
    # We needed to make offset a keyword argument for backwards-compatibility.
    # This check exists so that we can convert back to having offset be a
    # positional argument.
    # TODO(jhseu): Make `offset` a positional argument after `position` is
    # deleted.
    if offset is None and position is None:
      raise TypeError("seek(): offset argument required")
    if offset is not None and position is not None:
      raise TypeError("seek(): offset and position may not be set "
                      "simultaneously.")

    if position is not None:
      offset = position

    # Translate whence-relative offsets into an absolute offset, since the
    # underlying buffered stream only supports absolute seeks.
    if whence == 0:
      pass
    elif whence == 1:
      offset += self.tell()
    elif whence == 2:
      offset += self.size()
    else:
      raise errors.InvalidArgumentError(
          None, None,
          "Invalid whence argument: {}. Valid values are 0, 1, or 2.".format(
              whence))
    self._read_buf.seek(offset)

  def readline(self):
    r"""Reads the next line, keeping \n. At EOF, returns ''."""
    self._preread_check()
    return self._prepare_value(self._read_buf.readline())

  def readlines(self):
    """Returns all lines from the file in a list."""
    self._preread_check()
    lines = []
    while True:
      s = self.readline()
      if not s:
        break
      lines.append(s)
    return lines

  def tell(self):
    """Returns the current position in the file."""
    # Readable files report the read offset; write-only files report the
    # write offset.
    if self._read_check_passed:
      self._preread_check()
      return self._read_buf.tell()
    else:
      self._prewrite_check()

      return self._writable_file.tell()

  def __enter__(self):
    """Make usable with "with" statement."""
    return self

  def __exit__(self, unused_type, unused_value, unused_traceback):
    """Make usable with "with" statement."""
    self.close()

  def __iter__(self):
    return self

  def __next__(self):
    retval = self.readline()
    if not retval:
      raise StopIteration()
    return retval

  def next(self):
    # Python 2 iterator protocol; delegates to __next__.
    return self.__next__()

  def flush(self):
    """Flushes the Writable file.

    This only ensures that the data has made its way out of the process without
    any guarantees on whether it's written to disk. This means that the
    data would survive an application crash but not necessarily an OS crash.
    """
    if self._writable_file:
      self._writable_file.flush()

  def close(self):
    r"""Closes the file.

    Should be called for the WritableFile to be flushed.

    In general, if you use the context manager pattern, you don't need to call
    this directly.

    >>> with tf.io.gfile.GFile("/tmp/x", "w") as f:
    ...   f.write("asdf\n")
    ...   f.write("qwer\n")
    >>> # implicit f.close() at the end of the block

    For cloud filesystems, forgetting to call `close()` might result in data
    loss as last write might not have been replicated.
    """
    # Dropping the read buffer releases it; the writable file is explicitly
    # closed so buffered writes are flushed.
    self._read_buf = None
    if self._writable_file:
      self._writable_file.close()
      self._writable_file = None

  def seekable(self):
    """Returns True as FileIO supports random access ops of seek()/tell()"""
    return True
248
249
@tf_export("io.gfile.exists")
def file_exists_v2(path):
  """Determines whether a path exists or not.

  >>> with open("/tmp/x", "w") as f:
  ...   f.write("asdf")
  ...
  4
  >>> tf.io.gfile.exists("/tmp/x")
  True

  You can also specify the URI scheme for selecting a different filesystem:

  >>> # for a GCS filesystem path:
  >>> # tf.io.gfile.exists("gs://bucket/file")
  >>> # for a local filesystem:
  >>> with open("/tmp/x", "w") as f:
  ...   f.write("asdf")
  ...
  4
  >>> tf.io.gfile.exists("file:///tmp/x")
  True

  This currently returns `True` for existing directories but don't rely on this
  behavior, especially if you are using cloud filesystems (e.g., GCS, S3,
  Hadoop):

  >>> tf.io.gfile.exists("/tmp")
  True

  Args:
    path: string, a path

  Returns:
    True if the path exists, whether it's a file or a directory.
    False if the path does not exist and there are no filesystem errors.

  Raises:
    errors.OpError: Propagates any errors reported by the FileSystem API.
  """
  # FileExists raises NotFoundError for a missing path; any other filesystem
  # error propagates to the caller unchanged.
  try:
    _pywrap_file_io.FileExists(compat.path_to_bytes(path))
    return True
  except errors.NotFoundError:
    return False
295
296
@tf_export(v1=["gfile.Exists"])
def file_exists(filename):
  # V1 alias for `file_exists_v2`; its docstring is copied onto this
  # function below.
  return file_exists_v2(filename)


file_exists.__doc__ = file_exists_v2.__doc__
303
304
@tf_export(v1=["gfile.Remove"])
def delete_file(filename):
  """Removes the file at 'filename'.

  A thin V1 wrapper that delegates to `delete_file_v2`.

  Args:
    filename: string, path of the file to delete

  Raises:
    errors.OpError: Propagates any errors reported by the FileSystem API,
      e.g. `NotFoundError` if the file does not exist.
  """
  delete_file_v2(filename)
317
318
@tf_export("io.gfile.remove")
def delete_file_v2(path):
  """Removes the entry located at 'path'.

  Args:
    path: string, path of the entry to delete

  Raises:
    errors.OpError: Propagates any errors reported by the FileSystem API,
      e.g. `NotFoundError` if the path does not exist.
  """
  # Delegate directly to the C++ FileSystem API.
  _pywrap_file_io.DeleteFile(compat.path_to_bytes(path))
331
332
def read_file_to_string(filename, binary_mode=False):
  """Reads the entire contents of a file to a string.

  Args:
    filename: string, path to a file
    binary_mode: whether to open the file in binary mode or not. This changes
      the type of the object returned (bytes instead of str).

  Returns:
    contents of the file as a string or bytes.

  Raises:
    errors.OpError: Raises variety of errors that are subtypes e.g.
    `NotFoundError` etc.
  """
  # Use the context manager so the underlying handle is closed promptly even
  # if read() raises; the previous version never closed the FileIO object.
  mode = "rb" if binary_mode else "r"
  with FileIO(filename, mode=mode) as f:
    return f.read()
353
354
def write_string_to_file(filename, file_content):
  """Writes `file_content` to the file at `filename`, replacing any contents.

  Args:
    filename: string, path to the destination file
    file_content: string, data to write

  Raises:
    errors.OpError: If there are errors during the operation.
  """
  # "w" mode truncates the file; the context manager guarantees the write is
  # flushed and the handle closed.
  with FileIO(filename, mode="w") as f:
    f.write(file_content)
367
368
@tf_export(v1=["gfile.Glob"])
def get_matching_files(filename):
  """Returns a list of files that match the given pattern(s).

  V1 wrapper; delegates to `get_matching_files_v2`.

  Args:
    filename: string or iterable of strings. The glob pattern(s).

  Returns:
    A list of strings containing filenames that match the given pattern(s).

  Raises:
  *  errors.OpError: If there are filesystem / directory listing errors.
  *  errors.NotFoundError: If pattern to be matched is an invalid directory.
  """
  return get_matching_files_v2(filename)
384
385
@tf_export("io.gfile.glob")
def get_matching_files_v2(pattern):
  r"""Returns a list of files that match the given pattern(s).

  The patterns are defined as strings. Supported patterns are defined
  here. Note that the pattern can be a Python iteratable of string patterns.

  The format definition of the pattern is:

  **pattern**: `{ term }`

  **term**:
    * `'*'`: matches any sequence of non-'/' characters
    * `'?'`: matches a single non-'/' character
    * `'[' [ '^' ] { match-list } ']'`: matches any single
      character (not) on the list
    * `c`: matches character `c`  where `c != '*', '?', '\\', '['`
    * `'\\' c`: matches character `c`

  **character range**:
    * `c`: matches character `c` while `c != '\\', '-', ']'`
    * `'\\' c`: matches character `c`
    * `lo '-' hi`: matches character `c` for `lo <= c <= hi`

  Examples:

  >>> tf.io.gfile.glob("*.py")
  ... # For example, ['__init__.py']

  >>> tf.io.gfile.glob("__init__.??")
  ... # As above

  >>> files = {"*.py"}
  >>> the_iterator = iter(files)
  >>> tf.io.gfile.glob(the_iterator)
  ... # As above

  See the C++ function `GetMatchingPaths` in
  [`core/platform/file_system.h`]
  (../../../core/platform/file_system.h)
  for implementation details.

  Args:
    pattern: string or iterable of strings. The glob pattern(s).

  Returns:
    A list of strings containing filenames that match the given pattern(s).

  Raises:
    errors.OpError: If there are filesystem / directory listing errors.
    errors.NotFoundError: If pattern to be matched is an invalid directory.
  """

  def _match_one(single_pattern):
    # The C++ API returns filenames as bytes; convert each to str.
    return [
        compat.as_str_any(matching_filename)
        for matching_filename in _pywrap_file_io.GetMatchingFiles(
            compat.as_bytes(single_pattern))
    ]

  if isinstance(pattern, six.string_types):
    return _match_one(pattern)

  # Iterable of patterns: concatenate matches in pattern order.
  matches = []
  for single_pattern in pattern:
    matches.extend(_match_one(single_pattern))
  return matches
453
454
@tf_export(v1=["gfile.MkDir"])
def create_dir(dirname):
  """Creates a single directory named `dirname`.

  V1 wrapper; delegates to `create_dir_v2`.

  Args:
    dirname: string, name of the directory to be created

  Notes: The parent directories need to exist. Use `tf.io.gfile.makedirs`
    instead if there is the possibility that the parent dirs don't exist.

  Raises:
    errors.OpError: If the operation fails.
  """
  create_dir_v2(dirname)
469
470
@tf_export("io.gfile.mkdir")
def create_dir_v2(path):
  """Creates a single directory at `path`.

  Args:
    path: string, name of the directory to be created

  Notes: The parent directories need to exist. Use `tf.io.gfile.makedirs`
    instead if there is the possibility that the parent dirs don't exist.

  Raises:
    errors.OpError: If the operation fails.
  """
  # Non-recursive creation: fails when a parent directory is missing.
  _pywrap_file_io.CreateDir(compat.path_to_bytes(path))
485
486
@tf_export(v1=["gfile.MakeDirs"])
def recursive_create_dir(dirname):
  """Creates a directory along with all missing parent directories.

  Succeeds without error if `dirname` already exists and is writable.

  Args:
    dirname: string, name of the directory to be created

  Raises:
    errors.OpError: If the operation fails.
  """
  recursive_create_dir_v2(dirname)
500
501
@tf_export("io.gfile.makedirs")
def recursive_create_dir_v2(path):
  """Creates a directory along with all missing parent directories.

  Succeeds without error if `path` already exists and is writable.

  Args:
    path: string, name of the directory to be created

  Raises:
    errors.OpError: If the operation fails.
  """
  # Equivalent of `mkdir -p` via the C++ FileSystem API.
  _pywrap_file_io.RecursivelyCreateDir(compat.path_to_bytes(path))
515
516
@tf_export("io.gfile.copy")
def copy_v2(src, dst, overwrite=False):
  """Copies data from `src` to `dst`.

  >>> with open("/tmp/x", "w") as f:
  ...   f.write("asdf")
  ...
  4
  >>> tf.io.gfile.exists("/tmp/x")
  True
  >>> tf.io.gfile.copy("/tmp/x", "/tmp/y")
  >>> tf.io.gfile.exists("/tmp/y")
  True
  >>> tf.io.gfile.remove("/tmp/y")

  You can also specify the URI scheme for selecting a different filesystem:

  >>> with open("/tmp/x", "w") as f:
  ...   f.write("asdf")
  ...
  4
  >>> tf.io.gfile.copy("/tmp/x", "file:///tmp/y")
  >>> tf.io.gfile.exists("/tmp/y")
  True
  >>> tf.io.gfile.remove("/tmp/y")

  Note that you need to always specify a file name, even if moving into a new
  directory. This is because some cloud filesystems don't have the concept of a
  directory.

  >>> with open("/tmp/x", "w") as f:
  ...   f.write("asdf")
  ...
  4
  >>> tf.io.gfile.mkdir("/tmp/new_dir")
  >>> tf.io.gfile.copy("/tmp/x", "/tmp/new_dir/y")
  >>> tf.io.gfile.exists("/tmp/new_dir/y")
  True
  >>> tf.io.gfile.rmtree("/tmp/new_dir")

  If you want to prevent errors if the path already exists, you can use
  `overwrite` argument:

  >>> with open("/tmp/x", "w") as f:
  ...   f.write("asdf")
  ...
  4
  >>> tf.io.gfile.copy("/tmp/x", "file:///tmp/y")
  >>> tf.io.gfile.copy("/tmp/x", "file:///tmp/y", overwrite=True)
  >>> tf.io.gfile.remove("/tmp/y")

  Note that the above will still result in an error if you try to overwrite a
  directory with a file.

  Note that you cannot copy a directory, only file arguments are supported.

  Args:
    src: string, name of the file whose contents need to be copied
    dst: string, name of the file to which to copy to
    overwrite: boolean, if false it's an error for `dst` to be occupied by an
      existing file.

  Raises:
    errors.OpError: If the operation fails.
  """
  # Delegate to the C++ FileSystem API; with overwrite=False an existing
  # `dst` is reported as an error by the underlying filesystem.
  _pywrap_file_io.CopyFile(
      compat.path_to_bytes(src), compat.path_to_bytes(dst), overwrite)
584
585
@tf_export(v1=["gfile.Copy"])
def copy(oldpath, newpath, overwrite=False):
  # V1 alias for `copy_v2`; its docstring is copied onto this function below.
  copy_v2(oldpath, newpath, overwrite)


copy.__doc__ = copy_v2.__doc__
592
593
@tf_export(v1=["gfile.Rename"])
def rename(oldname, newname, overwrite=False):
  """Renames or moves a file / directory.

  V1 wrapper; delegates to `rename_v2`.

  Args:
    oldname: string, pathname for a file
    newname: string, pathname to which the file needs to be moved
    overwrite: boolean, if false it's an error for `newname` to be occupied by
      an existing file.

  Raises:
    errors.OpError: If the operation fails.
  """
  rename_v2(oldname, newname, overwrite)
608
609
@tf_export("io.gfile.rename")
def rename_v2(src, dst, overwrite=False):
  """Renames or moves a file / directory.

  Args:
    src: string, pathname for a file
    dst: string, pathname to which the file needs to be moved
    overwrite: boolean, if false it's an error for `dst` to be occupied by an
      existing file.

  Raises:
    errors.OpError: If the operation fails.
  """
  # Delegate to the C++ FileSystem API; the rename may be atomic depending on
  # the underlying filesystem (see `has_atomic_move`).
  _pywrap_file_io.RenameFile(
      compat.path_to_bytes(src), compat.path_to_bytes(dst), overwrite)
625
626
def atomic_write_string_to_file(filename, contents, overwrite=True):
  """Writes to `filename` atomically.

  This means that when `filename` appears in the filesystem, it will contain
  all of `contents`. With write_string_to_file, it is possible for the file
  to appear in the filesystem with `contents` only partially written.

  Accomplished by writing to a temp file and then renaming it.

  Args:
    filename: string, pathname for a file
    contents: string, contents that need to be written to the file
    overwrite: boolean, if false it's an error for `filename` to be occupied by
      an existing file.
  """
  if has_atomic_move(filename):
    # Stage the contents in a uniquely named sibling file, then move it into
    # place; the atomic rename is what makes the write appear all-or-nothing.
    temp_pathname = filename + ".tmp" + uuid.uuid4().hex
    write_string_to_file(temp_pathname, contents)
    try:
      rename(temp_pathname, filename, overwrite)
    except errors.OpError:
      # Clean up the staged file before surfacing the failure.
      delete_file(temp_pathname)
      raise
  else:
    # The filesystem cannot move atomically; a temp-and-rename dance would
    # not help, so write directly.
    write_string_to_file(filename, contents)
652
653
@tf_export(v1=["gfile.DeleteRecursively"])
def delete_recursively(dirname):
  """Deletes everything under `dirname`, recursively.

  V1 wrapper; delegates to `delete_recursively_v2`.

  Args:
    dirname: string, a path to a directory

  Raises:
    errors.OpError: If the operation fails.
  """
  delete_recursively_v2(dirname)
665
666
@tf_export("io.gfile.rmtree")
def delete_recursively_v2(path):
  """Deletes everything under `path`, recursively.

  Args:
    path: string, a path

  Raises:
    errors.OpError: If the operation fails.
  """
  # Equivalent of `rm -r` via the C++ FileSystem API.
  _pywrap_file_io.DeleteRecursively(compat.path_to_bytes(path))
678
679
@tf_export(v1=["gfile.IsDirectory"])
def is_directory(dirname):
  """Reports whether `dirname` refers to a directory.

  V1 wrapper; delegates to `is_directory_v2`.

  Args:
    dirname: string, path to a potential directory

  Returns:
    True, if the path is a directory; False otherwise
  """
  return is_directory_v2(dirname)
691
692
@tf_export("io.gfile.isdir")
def is_directory_v2(path):
  """Reports whether `path` refers to a directory.

  Args:
    path: string, path to a potential directory

  Returns:
    True, if the path is a directory; False otherwise
  """
  # Any filesystem error (missing path, permission issues, ...) is treated as
  # "not a directory" rather than propagated.
  try:
    return _pywrap_file_io.IsDirectory(compat.path_to_bytes(path))
  except errors.OpError:
    return False
707
708
def has_atomic_move(path):
  """Checks whether the file system supports atomic moves.

  Returns whether or not the file system of the given path supports the atomic
  move operation for a file or folder.  If atomic move is supported, it is
  recommended to use a temp location for writing and then move to the final
  location.

  Args:
    path: string, path to a file

  Returns:
    True, if the path is on a file system that supports atomic move
    False, if the file system does not support atomic move. In such cases
           we need to be careful about using moves. In some cases it is safer
           not to use temporary locations in this case.
  """
  try:
    return _pywrap_file_io.HasAtomicMove(compat.path_to_bytes(path))
  except errors.OpError:
    # If the filesystem cannot answer the question, optimistically assume
    # atomic moves are supported.
    return True
731
732
@tf_export(v1=["gfile.ListDirectory"])
def list_directory(dirname):
  """Returns a list of entries contained within a directory.

  The list is in arbitrary order. It does not contain the special entries "."
  and "..".

  V1 wrapper; delegates to `list_directory_v2`.

  Args:
    dirname: string, path to a directory

  Returns:
    [filename1, filename2, ... filenameN] as strings

  Raises:
    errors.NotFoundError if directory doesn't exist
  """
  return list_directory_v2(dirname)
750
751
@tf_export("io.gfile.listdir")
def list_directory_v2(path):
  """Returns a list of entries contained within a directory.

  The list is in arbitrary order. It does not contain the special entries "."
  and "..".

  Args:
    path: string, path to a directory

  Returns:
    [filename1, filename2, ... filenameN] as strings

  Raises:
    errors.NotFoundError if directory doesn't exist
  """
  # Fail early with a clear error rather than letting GetChildren report a
  # lower-level failure.
  if not is_directory(path):
    raise errors.NotFoundError(
        node_def=None,
        op=None,
        message="Could not find directory {}".format(path))

  children = _pywrap_file_io.GetChildren(compat.path_to_bytes(path))
  # The C++ API hands back bytes; surface plain strings to callers.
  return [compat.as_str_any(child) for child in children]
780
781
@tf_export(v1=["gfile.Walk"])
def walk(top, in_order=True):
  """Recursive directory tree generator for directories.

  V1 wrapper; delegates to `walk_v2` (`in_order` maps to `topdown`).

  Args:
    top: string, a Directory name
    in_order: bool, Traverse in order if True, post order if False.  Errors that
      happen while listing directories are ignored.

  Yields:
    Each yield is a 3-tuple:  the pathname of a directory, followed by lists of
    all its subdirectories and leaf files. That is, each yield looks like:
    `(dirname, [subdirname, subdirname, ...], [filename, filename, ...])`.
    Each item is a string.
  """
  return walk_v2(top, in_order)
798
799
@tf_export("io.gfile.walk")
def walk_v2(top, topdown=True, onerror=None):
  """Recursive directory tree generator for directories.

  Args:
    top: string, a Directory name
    topdown: bool, Traverse pre order if True, post order if False.
    onerror: optional handler for errors. Should be a function, it will be
      called with the error as argument. Rethrowing the error aborts the walk.
      Errors that happen while listing directories are ignored.

  Yields:
    Each yield is a 3-tuple:  the pathname of a directory, followed by lists of
    all its subdirectories and leaf files. That is, each yield looks like:
    `(dirname, [subdirname, subdirname, ...], [filename, filename, ...])`.
    Each item is a string.
  """

  def _make_full_path(parent, item):
    # Since `os.path.join` discards paths before one that starts with the path
    # separator (https://docs.python.org/3/library/os.path.html#os.path.join),
    # we have to manually handle that case as `/` is a valid character on GCS.
    if item[0] == os.sep:
      return "".join([os.path.join(parent, ""), item])
    return os.path.join(parent, item)

  top = compat.as_str_any(compat.path_to_str(top))
  try:
    listing = list_directory(top)
  except errors.NotFoundError as err:
    if onerror:
      onerror(err)
    # Whether or not `onerror` was called, this directory cannot be listed,
    # so skip it. (Previously, when `onerror` was provided and did not
    # re-raise, execution fell through and used the unbound `listing`,
    # raising a NameError. This mirrors os.walk's onerror semantics.)
    return

  # Partition the directory entries into subdirectories and leaf files.
  files = []
  subdirs = []
  for item in listing:
    full_path = _make_full_path(top, item)
    if is_directory(full_path):
      subdirs.append(item)
    else:
      files.append(item)

  here = (top, subdirs, files)

  # Pre-order: report this directory before recursing into it.
  if topdown:
    yield here

  for subdir in subdirs:
    for subitem in walk_v2(
        _make_full_path(top, subdir), topdown, onerror=onerror):
      yield subitem

  # Post-order: report this directory after all of its children.
  if not topdown:
    yield here
856
857
@tf_export(v1=["gfile.Stat"])
def stat(filename):
  """Returns file statistics for a given path.

  V1 wrapper; delegates to `stat_v2`.

  Args:
    filename: string, path to a file

  Returns:
    FileStatistics struct that contains information about the path

  Raises:
    errors.OpError: If the operation fails.
  """
  return stat_v2(filename)
872
873
@tf_export("io.gfile.stat")
def stat_v2(path):
  """Returns file statistics for a given path.

  Args:
    path: string, path to a file

  Returns:
    FileStatistics struct that contains information about the path

  Raises:
    errors.OpError: If the operation fails.
  """
  # Note: this binding takes a str path (not bytes like most others here).
  return _pywrap_file_io.Stat(compat.path_to_str(path))
888
889
def filecmp(filename_a, filename_b):
  """Compare two files, returning True if they are the same, False otherwise.

  We check size first and return False quickly if the files are different sizes.
  If they are the same size, we continue to generating a crc for the whole file.

  You might wonder: why not use Python's `filecmp.cmp()` instead? The answer is
  that the builtin library is not robust to the many different filesystems
  TensorFlow runs on, and so we here perform a similar comparison with
  the more robust FileIO.

  Args:
    filename_a: string path to the first file.
    filename_b: string path to the second file.

  Returns:
    True if the files are the same, False otherwise.
  """
  # Cheap check first: files of different sizes cannot have equal contents.
  size_a = FileIO(filename_a, "rb").size()
  size_b = FileIO(filename_b, "rb").size()
  if size_a != size_b:
    return False

  # Sizes match, so fall back to comparing CRC32 checksums of the full
  # contents of both files.
  return file_crc32(filename_a) == file_crc32(filename_b)
917
918
def file_crc32(filename, block_size=_DEFAULT_BLOCK_SIZE):
  """Get the crc32 of the passed file.

  The crc32 of a file can be used for error checking; two files with the same
  crc32 are considered equivalent. Note that the entire file must be read
  to produce the crc32.

  Args:
    filename: string, path to a file
    block_size: Integer, process the files by reading blocks of `block_size`
      bytes. Use -1 to read the file as once.

  Returns:
    hexadecimal as string, the crc32 of the passed file.
  """
  crc = 0
  with FileIO(filename, mode="rb") as f:
    # Fold each block into the running checksum; read() returns an empty
    # bytes object at EOF, which terminates the loop.
    while True:
      chunk = f.read(n=block_size)
      if not chunk:
        break
      crc = binascii.crc32(chunk, crc)
  return hex(crc & 0xFFFFFFFF)
941