• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #ifndef TENSORFLOW_CORE_LIB_IO_RECORD_READER_H_
17 #define TENSORFLOW_CORE_LIB_IO_RECORD_READER_H_
18 
19 #include "tensorflow/core/lib/core/errors.h"
20 #include "tensorflow/core/lib/core/stringpiece.h"
21 #include "tensorflow/core/lib/io/inputstream_interface.h"
22 #if !defined(IS_SLIM_BUILD)
23 #include "tensorflow/core/lib/io/zlib_compression_options.h"
24 #include "tensorflow/core/lib/io/zlib_inputstream.h"
25 #endif  // IS_SLIM_BUILD
26 #include "tensorflow/core/platform/macros.h"
27 #include "tensorflow/core/platform/types.h"
28 
29 namespace tensorflow {
30 
31 class RandomAccessFile;
32 
33 namespace io {
34 
35 class RecordReaderOptions {
36  public:
37   enum CompressionType { NONE = 0, ZLIB_COMPRESSION = 1 };
38   CompressionType compression_type = NONE;
39 
40   // If buffer_size is non-zero, then all reads must be sequential, and no
41   // skipping around is permitted. (Note: this is the same behavior as reading
42   // compressed files.) Consider using SequentialRecordReader.
43   int64 buffer_size = 0;
44 
45   static RecordReaderOptions CreateRecordReaderOptions(
46       const string& compression_type);
47 
48 #if !defined(IS_SLIM_BUILD)
49   // Options specific to zlib compression.
50   ZlibCompressionOptions zlib_options;
51 #endif  // IS_SLIM_BUILD
52 };
53 
54 // Low-level interface to read TFRecord files.
55 //
56 // If using compression or buffering, consider using SequentialRecordReader.
57 //
58 // Note: this class is not thread safe; external synchronization required.
59 class RecordReader {
60  public:
61   // Format of a single record:
62   //  uint64    length
63   //  uint32    masked crc of length
64   //  byte      data[length]
65   //  uint32    masked crc of data
66   static const size_t kHeaderSize = sizeof(uint64) + sizeof(uint32);
67   static const size_t kFooterSize = sizeof(uint32);
68 
69   // Statistics (sizes are in units of bytes)
70   struct Stats {
71     int64 file_size = -1;
72     int64 data_size = -1;
73     int64 entries = -1;  // Number of values
74   };
75 
76   // Metadata for the TFRecord file.
77   struct Metadata {
78     Stats stats;
79   };
80 
81   // Create a reader that will return log records from "*file".
82   // "*file" must remain live while this Reader is in use.
83   explicit RecordReader(
84       RandomAccessFile* file,
85       const RecordReaderOptions& options = RecordReaderOptions());
86 
87   virtual ~RecordReader() = default;
88 
89   // Read the record at "*offset" into *record and update *offset to
90   // point to the offset of the next record.  Returns OK on success,
91   // OUT_OF_RANGE for end of file, or something else for an error.
92   Status ReadRecord(uint64* offset, string* record);
93 
94   // Return the metadata of the Record file.
95   //
96   // The current implementation scans the file to completion,
97   // skipping over the data regions, to extract the metadata once
98   // on the first call to GetStats().  An improved implementation
99   // would change RecordWriter to write the metadata into TFRecord
100   // so that GetMetadata() could be a const method.
101   //
102   // 'metadata' must not be nullptr.
103   Status GetMetadata(Metadata* md);
104 
105  private:
106   Status ReadChecksummed(uint64 offset, size_t n, string* result);
107 
108   RecordReaderOptions options_;
109   std::unique_ptr<InputStreamInterface> input_stream_;
110   bool last_read_failed_;
111 
112   std::unique_ptr<Metadata> cached_metadata_;
113 
114   TF_DISALLOW_COPY_AND_ASSIGN(RecordReader);
115 };
116 
117 // High-level interface to read TFRecord files.
118 //
119 // Note: this class is not thread safe; external synchronization required.
120 class SequentialRecordReader {
121  public:
122   // Create a reader that will return log records from "*file".
123   // "*file" must remain live while this Reader is in use.
124   explicit SequentialRecordReader(
125       RandomAccessFile* file,
126       const RecordReaderOptions& options = RecordReaderOptions());
127 
128   virtual ~SequentialRecordReader() = default;
129 
130   // Reads the next record in the file into *record. Returns OK on success,
131   // OUT_OF_RANGE for end of file, or something else for an error.
ReadRecord(string * record)132   Status ReadRecord(string* record) {
133     return underlying_.ReadRecord(&offset_, record);
134   }
135 
136   // Returns the current offset in the file.
TellOffset()137   uint64 TellOffset() { return offset_; }
138 
139   // Seek to this offset within the file and set this offset as the current
140   // offset. Trying to seek backward will throw error.
SeekOffset(uint64 offset)141   Status SeekOffset(uint64 offset) {
142     if (offset < offset_)
143       return errors::InvalidArgument(
144           "Trying to seek offset: ", offset,
145           " which is less than the current offset: ", offset_);
146     offset_ = offset;
147     return Status::OK();
148   }
149 
150  private:
151   RecordReader underlying_;
152   uint64 offset_ = 0;
153 };
154 
155 }  // namespace io
156 }  // namespace tensorflow
157 
158 #endif  // TENSORFLOW_CORE_LIB_IO_RECORD_READER_H_
159