/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_
#define TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_

#include <cstddef>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "tensorflow/core/example/example.pb.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/partial_tensor_shape.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/sparse/sparse_tensor.h"

namespace tensorflow {
namespace example {

// FastParseExampleConfig defines how to parse features in Example.
// Each sub-config is responsible for one feature identified with feature_name.
// FastParseExampleConfig can't have two sub-configs with the same feature_name.
// dtype identifies the type of output vector and the kind of Feature expected
// in Example.
42 struct FastParseExampleConfig { 43 struct Dense { DenseFastParseExampleConfig::Dense44 Dense(StringPiece feature_name, DataType dtype, PartialTensorShape shape, 45 Tensor default_value, bool variable_length, 46 std::size_t elements_per_stride) 47 : feature_name(feature_name), // TODO(mrry): Switch to preallocated 48 // tstring when this is available. 49 dtype(dtype), 50 shape(std::move(shape)), 51 default_value(std::move(default_value)), 52 variable_length(variable_length), 53 elements_per_stride(elements_per_stride) {} 54 Dense() = default; 55 56 tstring feature_name; 57 DataType dtype; 58 // These 2 fields correspond exactly to dense_shapes and dense_defaults in 59 // ParseExample op. 60 // Documentation is available in: tensorflow/core/ops/parsing_ops.cc 61 PartialTensorShape shape; 62 Tensor default_value; 63 bool variable_length; 64 std::size_t elements_per_stride; 65 }; 66 67 struct Sparse { SparseFastParseExampleConfig::Sparse68 Sparse(StringPiece feature_name, DataType dtype) 69 : feature_name(feature_name), // TODO(mrry): Switch to preallocated 70 // tstring when this is available. 71 dtype(dtype) {} 72 Sparse() = default; 73 74 tstring feature_name; 75 DataType dtype; 76 }; 77 78 struct Ragged { RaggedFastParseExampleConfig::Ragged79 Ragged(StringPiece feature_name, DataType dtype, DataType splits_dtype) 80 : feature_name(feature_name), // TODO(mrry): Switch to preallocated 81 // tstring when this is available. 82 dtype(dtype), 83 splits_dtype(splits_dtype) {} 84 Ragged() = default; 85 86 tstring feature_name; 87 DataType dtype; 88 DataType splits_dtype; 89 }; 90 91 std::vector<Dense> dense; 92 std::vector<Sparse> sparse; 93 std::vector<Ragged> ragged; 94 95 // If `true`, `Result::feature_stats` will contain one 96 // `PerExampleFeatureStats` for each serialized example in the input. 97 bool collect_feature_stats = false; 98 }; 99 100 // Statistics about the features in each example passed to 101 // `FastParse[Single]Example()`. 
102 // 103 // TODO(b/111553342): The gathered statistics currently have two limitations: 104 // * Feature names that appear more than once will be counted multiple times. 105 // * The feature values count only represents the counts for features that were 106 // requested in the `FastParseExampleConfig`. 107 // These could be addressed with additional work at runtime. 108 struct PerExampleFeatureStats { 109 // The number of feature names in an example. 110 size_t features_count = 0; 111 112 // The sum of the number of values in each feature that is parsed. 113 size_t feature_values_count = 0; 114 }; 115 116 // This is exactly the output of TF's ParseExample Op. 117 // Documentation is available in: tensorflow/core/ops/parsing_ops.cc 118 struct Result { 119 std::vector<Tensor> sparse_indices; 120 std::vector<Tensor> sparse_values; 121 std::vector<Tensor> sparse_shapes; 122 std::vector<Tensor> dense_values; 123 std::vector<Tensor> ragged_values; 124 std::vector<Tensor> ragged_splits; 125 std::vector<Tensor> ragged_outer_splits; // For SequenceExamples 126 127 // This vector will be populated with one element per example if 128 // `FastParseExampleConfig::collect_feature_stats` is set to `true`. 129 std::vector<PerExampleFeatureStats> feature_stats; 130 }; 131 132 // Parses a batch of serialized Example protos and converts them into result 133 // according to given config. 134 // Given example names have to either be empty or the same size as serialized. 135 // example_names are used only for error messages. 136 Status FastParseExample(const FastParseExampleConfig& config, 137 gtl::ArraySlice<tstring> serialized, 138 gtl::ArraySlice<tstring> example_names, 139 thread::ThreadPool* thread_pool, Result* result); 140 141 // TODO(mrry): Move the hash table construction into the config object. 
142 typedef FastParseExampleConfig FastParseSingleExampleConfig; 143 144 Status FastParseSingleExample(const FastParseSingleExampleConfig& config, 145 StringPiece serialized, Result* result); 146 147 // Parses a batch of serialized SequenceExample protos and converts them into 148 // result according to given config. 149 // Given example names have to either be empty or the same size as serialized. 150 // example_names are used only for error messages. 151 // (If batch=true, then this parses a single SequenceExample.) 152 Status FastParseSequenceExample( 153 const example::FastParseExampleConfig& context_config, 154 const example::FastParseExampleConfig& feature_list_config, 155 gtl::ArraySlice<tstring> serialized, gtl::ArraySlice<tstring> example_names, 156 thread::ThreadPool* thread_pool, example::Result* context_result, 157 example::Result* feature_list_result, 158 std::vector<Tensor>* dense_feature_lengths, bool is_batch = true); 159 160 // This function parses serialized Example and populates given example. 161 // It uses the same specialized parser as FastParseExample which is efficient. 162 // But then constructs Example which is relatively slow. 163 // It is exported here as a convenient API to test parser part separately. 164 bool TestFastParse(const string& serialized, Example* example); 165 166 } // namespace example 167 } // namespace tensorflow 168 169 #endif // TENSORFLOW_CORE_UTIL_EXAMPLE_PROTO_FAST_PARSING_H_ 170