1# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15 16"""Parsing Ops.""" 17from __future__ import absolute_import 18from __future__ import division 19from __future__ import print_function 20 21import collections 22import re 23 24from tensorflow.python.framework import constant_op 25from tensorflow.python.framework import dtypes 26from tensorflow.python.framework import ops 27from tensorflow.python.framework import sparse_tensor 28from tensorflow.python.framework import tensor_shape 29from tensorflow.python.ops import array_ops 30from tensorflow.python.ops import control_flow_ops 31from tensorflow.python.ops import gen_parsing_ops 32from tensorflow.python.ops import math_ops 33from tensorflow.python.ops import sparse_ops 34# go/tf-wildcard-import 35# pylint: disable=wildcard-import,undefined-variable 36from tensorflow.python.ops.gen_parsing_ops import * 37# pylint: enable=wildcard-import,undefined-variable 38from tensorflow.python.platform import tf_logging 39from tensorflow.python.util import deprecation 40from tensorflow.python.util.tf_export import tf_export 41 42 43ops.NotDifferentiable("DecodeRaw") 44ops.NotDifferentiable("ParseTensor") 45ops.NotDifferentiable("SerializeTensor") 46ops.NotDifferentiable("StringToNumber") 47 48 49@tf_export("io.VarLenFeature", v1=["VarLenFeature", "io.VarLenFeature"]) 50class 
VarLenFeature(collections.namedtuple("VarLenFeature", ["dtype"])): 51 """Configuration for parsing a variable-length input feature. 52 53 Fields: 54 dtype: Data type of input. 55 """ 56 pass 57 58 59@tf_export("io.SparseFeature", v1=["io.SparseFeature", "SparseFeature"]) 60class SparseFeature( 61 collections.namedtuple( 62 "SparseFeature", 63 ["index_key", "value_key", "dtype", "size", "already_sorted"])): 64 """Configuration for parsing a sparse input feature from an `Example`. 65 66 Note, preferably use `VarLenFeature` (possibly in combination with a 67 `SequenceExample`) in order to parse out `SparseTensor`s instead of 68 `SparseFeature` due to its simplicity. 69 70 Closely mimicking the `SparseTensor` that will be obtained by parsing an 71 `Example` with a `SparseFeature` config, a `SparseFeature` contains a 72 73 * `value_key`: The name of key for a `Feature` in the `Example` whose parsed 74 `Tensor` will be the resulting `SparseTensor.values`. 75 76 * `index_key`: A list of names - one for each dimension in the resulting 77 `SparseTensor` whose `indices[i][dim]` indicating the position of 78 the `i`-th value in the `dim` dimension will be equal to the `i`-th value in 79 the Feature with key named `index_key[dim]` in the `Example`. 80 81 * `size`: A list of ints for the resulting `SparseTensor.dense_shape`. 
82 83 For example, we can represent the following 2D `SparseTensor` 84 85 ```python 86 SparseTensor(indices=[[3, 1], [20, 0]], 87 values=[0.5, -1.0] 88 dense_shape=[100, 3]) 89 ``` 90 91 with an `Example` input proto 92 93 ```python 94 features { 95 feature { key: "val" value { float_list { value: [ 0.5, -1.0 ] } } } 96 feature { key: "ix0" value { int64_list { value: [ 3, 20 ] } } } 97 feature { key: "ix1" value { int64_list { value: [ 1, 0 ] } } } 98 } 99 ``` 100 101 and `SparseFeature` config with 2 `index_key`s 102 103 ```python 104 SparseFeature(index_key=["ix0", "ix1"], 105 value_key="val", 106 dtype=tf.float32, 107 size=[100, 3]) 108 ``` 109 110 Fields: 111 index_key: A single string name or a list of string names of index features. 112 For each key the underlying feature's type must be `int64` and its length 113 must always match that of the `value_key` feature. 114 To represent `SparseTensor`s with a `dense_shape` of `rank` higher than 1 115 a list of length `rank` should be used. 116 value_key: Name of value feature. The underlying feature's type must 117 be `dtype` and its length must always match that of all the `index_key`s' 118 features. 119 dtype: Data type of the `value_key` feature. 120 size: A Python int or list thereof specifying the dense shape. Should be a 121 list if and only if `index_key` is a list. In that case the list must be 122 equal to the length of `index_key`. Each for each entry `i` all values in 123 the `index_key`[i] feature must be in `[0, size[i])`. 124 already_sorted: A Python boolean to specify whether the values in 125 `value_key` are already sorted by their index position. If so skip 126 sorting. False by default (optional). 
127 """ 128 129 def __new__(cls, index_key, value_key, dtype, size, already_sorted=False): 130 return super(SparseFeature, cls).__new__( 131 cls, index_key, value_key, dtype, size, already_sorted) 132 133 134@tf_export("io.FixedLenFeature", v1=["io.FixedLenFeature", "FixedLenFeature"]) 135class FixedLenFeature(collections.namedtuple( 136 "FixedLenFeature", ["shape", "dtype", "default_value"])): 137 """Configuration for parsing a fixed-length input feature. 138 139 To treat sparse input as dense, provide a `default_value`; otherwise, 140 the parse functions will fail on any examples missing this feature. 141 142 Fields: 143 shape: Shape of input data. 144 dtype: Data type of input. 145 default_value: Value to be used if an example is missing this feature. It 146 must be compatible with `dtype` and of the specified `shape`. 147 """ 148 149 def __new__(cls, shape, dtype, default_value=None): 150 return super(FixedLenFeature, cls).__new__( 151 cls, shape, dtype, default_value) 152 153 154@tf_export("io.FixedLenSequenceFeature", 155 v1=["io.FixedLenSequenceFeature", "FixedLenSequenceFeature"]) 156class FixedLenSequenceFeature(collections.namedtuple( 157 "FixedLenSequenceFeature", 158 ["shape", "dtype", "allow_missing", "default_value"])): 159 """Configuration for parsing a variable-length input feature into a `Tensor`. 160 161 The resulting `Tensor` of parsing a single `SequenceExample` or `Example` has 162 a static `shape` of `[None] + shape` and the specified `dtype`. 163 The resulting `Tensor` of parsing a `batch_size` many `Example`s has 164 a static `shape` of `[batch_size, None] + shape` and the specified `dtype`. 165 The entries in the `batch` from different `Examples` will be padded with 166 `default_value` to the maximum length present in the `batch`. 167 168 To treat a sparse input as dense, provide `allow_missing=True`; otherwise, 169 the parse functions will fail on any examples missing this feature. 
170 171 Fields: 172 shape: Shape of input data for dimension 2 and higher. First dimension is 173 of variable length `None`. 174 dtype: Data type of input. 175 allow_missing: Whether to allow this feature to be missing from a feature 176 list item. Is available only for parsing `SequenceExample` not for 177 parsing `Examples`. 178 default_value: Scalar value to be used to pad multiple `Example`s to their 179 maximum length. Irrelevant for parsing a single `Example` or 180 `SequenceExample`. Defaults to "" for dtype string and 0 otherwise 181 (optional). 182 """ 183 184 def __new__(cls, shape, dtype, allow_missing=False, default_value=None): 185 return super(FixedLenSequenceFeature, cls).__new__( 186 cls, shape, dtype, allow_missing, default_value) 187 188 189def _features_to_raw_params(features, types): 190 """Split feature tuples into raw params used by `gen_parsing_ops`. 191 192 Args: 193 features: A `dict` mapping feature keys to objects of a type in `types`. 194 types: Type of features to allow, among `FixedLenFeature`, `VarLenFeature`, 195 `SparseFeature`, and `FixedLenSequenceFeature`. 196 197 Returns: 198 Tuple of `sparse_keys`, `sparse_types`, `dense_keys`, `dense_types`, 199 `dense_defaults`, `dense_shapes`. 200 201 Raises: 202 ValueError: if `features` contains an item not in `types`, or an invalid 203 feature. 204 """ 205 sparse_keys = [] 206 sparse_types = [] 207 dense_keys = [] 208 dense_types = [] 209 # When the graph is built twice, multiple dense_defaults in a normal dict 210 # could come out in different orders. This will fail the _e2e_test which 211 # expects exactly the same graph. 212 # OrderedDict which preserves the order can solve the problem. 213 dense_defaults = collections.OrderedDict() 214 dense_shapes = [] 215 if features: 216 # NOTE: We iterate over sorted keys to keep things deterministic. 
217 for key in sorted(features.keys()): 218 feature = features[key] 219 if isinstance(feature, VarLenFeature): 220 if VarLenFeature not in types: 221 raise ValueError("Unsupported VarLenFeature %s." % (feature,)) 222 if not feature.dtype: 223 raise ValueError("Missing type for feature %s." % key) 224 sparse_keys.append(key) 225 sparse_types.append(feature.dtype) 226 elif isinstance(feature, SparseFeature): 227 if SparseFeature not in types: 228 raise ValueError("Unsupported SparseFeature %s." % (feature,)) 229 230 if not feature.index_key: 231 raise ValueError( 232 "Missing index_key for SparseFeature %s." % (feature,)) 233 if not feature.value_key: 234 raise ValueError( 235 "Missing value_key for SparseFeature %s." % (feature,)) 236 if not feature.dtype: 237 raise ValueError("Missing type for feature %s." % key) 238 index_keys = feature.index_key 239 if isinstance(index_keys, str): 240 index_keys = [index_keys] 241 elif len(index_keys) > 1: 242 tf_logging.warning("SparseFeature is a complicated feature config " 243 "and should only be used after careful " 244 "consideration of VarLenFeature.") 245 for index_key in sorted(index_keys): 246 if index_key in sparse_keys: 247 dtype = sparse_types[sparse_keys.index(index_key)] 248 if dtype != dtypes.int64: 249 raise ValueError("Conflicting type %s vs int64 for feature %s." % 250 (dtype, index_key)) 251 else: 252 sparse_keys.append(index_key) 253 sparse_types.append(dtypes.int64) 254 if feature.value_key in sparse_keys: 255 dtype = sparse_types[sparse_keys.index(feature.value_key)] 256 if dtype != feature.dtype: 257 raise ValueError("Conflicting type %s vs %s for feature %s." % ( 258 dtype, feature.dtype, feature.value_key)) 259 else: 260 sparse_keys.append(feature.value_key) 261 sparse_types.append(feature.dtype) 262 elif isinstance(feature, FixedLenFeature): 263 if FixedLenFeature not in types: 264 raise ValueError("Unsupported FixedLenFeature %s." 
% (feature,)) 265 if not feature.dtype: 266 raise ValueError("Missing type for feature %s." % key) 267 if feature.shape is None: 268 raise ValueError("Missing shape for feature %s." % key) 269 feature_tensor_shape = tensor_shape.as_shape(feature.shape) 270 if (feature.shape and feature_tensor_shape.ndims and 271 feature_tensor_shape.dims[0].value is None): 272 raise ValueError("First dimension of shape for feature %s unknown. " 273 "Consider using FixedLenSequenceFeature." % key) 274 if (feature.shape is not None and 275 not feature_tensor_shape.is_fully_defined()): 276 raise ValueError("All dimensions of shape for feature %s need to be " 277 "known but received %s." % (key, str(feature.shape))) 278 dense_keys.append(key) 279 dense_shapes.append(feature.shape) 280 dense_types.append(feature.dtype) 281 if feature.default_value is not None: 282 dense_defaults[key] = feature.default_value 283 elif isinstance(feature, FixedLenSequenceFeature): 284 if FixedLenSequenceFeature not in types: 285 raise ValueError("Unsupported FixedLenSequenceFeature %s." % ( 286 feature,)) 287 if not feature.dtype: 288 raise ValueError("Missing type for feature %s." % key) 289 if feature.shape is None: 290 raise ValueError("Missing shape for feature %s." % key) 291 dense_keys.append(key) 292 dense_shapes.append(feature.shape) 293 dense_types.append(feature.dtype) 294 if feature.allow_missing: 295 dense_defaults[key] = None 296 if feature.default_value is not None: 297 dense_defaults[key] = feature.default_value 298 else: 299 raise ValueError("Invalid feature %s:%s." % (key, feature)) 300 return ( 301 sparse_keys, sparse_types, dense_keys, dense_types, dense_defaults, 302 dense_shapes) 303 304 305def _construct_sparse_tensors_for_sparse_features(features, tensor_dict): 306 """Merges SparseTensors of indices and values of SparseFeatures. 307 308 Constructs new dict based on `tensor_dict`. 
For `SparseFeatures` in the values 309 of `features` expects their `index_key`s and `index_value`s to be present in 310 `tensor_dict` mapping to `SparseTensor`s. Constructs a single `SparseTensor` 311 from them, and adds it to the result with the key from `features`. 312 Copies other keys and values from `tensor_dict` with keys present in 313 `features`. 314 315 Args: 316 features: A `dict` mapping feature keys to `SparseFeature` values. 317 Values of other types will be ignored. 318 tensor_dict: A `dict` mapping feature keys to `Tensor` and `SparseTensor` 319 values. Expected to contain keys of the `SparseFeature`s' `index_key`s and 320 `value_key`s and mapping them to `SparseTensor`s. 321 Returns: 322 A `dict` mapping feature keys to `Tensor` and `SparseTensor` values. Similar 323 to `tensor_dict` except each `SparseFeature`s in `features` results in a 324 single `SparseTensor`. 325 """ 326 tensor_dict = dict(tensor_dict) # Do not modify argument passed in. 327 # Construct SparseTensors for SparseFeatures. 328 for key in sorted(features.keys()): 329 feature = features[key] 330 if isinstance(feature, SparseFeature): 331 if isinstance(feature.index_key, str): 332 sp_ids = tensor_dict[feature.index_key] 333 else: 334 sp_ids = [tensor_dict[index_key] for index_key in feature.index_key] 335 sp_values = tensor_dict[feature.value_key] 336 tensor_dict[key] = sparse_ops.sparse_merge( 337 sp_ids, 338 sp_values, 339 vocab_size=feature.size, 340 already_sorted=feature.already_sorted) 341 # Remove tensors from dictionary that were only used to construct 342 # SparseTensors for SparseFeature. 
343 for key in set(tensor_dict) - set(features): 344 del tensor_dict[key] 345 return tensor_dict 346 347 348def _prepend_none_dimension(features): 349 if features: 350 modified_features = dict(features) # Create a copy to modify 351 for key, feature in features.items(): 352 if isinstance(feature, FixedLenSequenceFeature): 353 if not feature.allow_missing: 354 raise ValueError("Unsupported: FixedLenSequenceFeature requires " 355 "allow_missing to be True.") 356 modified_features[key] = FixedLenSequenceFeature( 357 [None] + list(feature.shape), 358 feature.dtype, 359 feature.allow_missing, 360 feature.default_value) 361 return modified_features 362 else: 363 return features 364 365 366@tf_export(v1=["io.parse_example", "parse_example"]) 367def parse_example(serialized, features, name=None, example_names=None): 368 # pylint: disable=line-too-long 369 """Parses `Example` protos into a `dict` of tensors. 370 371 Parses a number of serialized [`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto) 372 protos given in `serialized`. We refer to `serialized` as a batch with 373 `batch_size` many entries of individual `Example` protos. 374 375 `example_names` may contain descriptive names for the corresponding serialized 376 protos. These may be useful for debugging purposes, but they have no effect on 377 the output. If not `None`, `example_names` must be the same length as 378 `serialized`. 379 380 This op parses serialized examples into a dictionary mapping keys to `Tensor` 381 and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`, 382 `SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature` 383 and `SparseFeature` is mapped to a `SparseTensor`, and each 384 `FixedLenFeature` is mapped to a `Tensor`. 385 386 Each `VarLenFeature` maps to a `SparseTensor` of the specified type 387 representing a ragged matrix. 
Its indices are `[batch, index]` where `batch` 388 identifies the example in `serialized`, and `index` is the value's index in 389 the list of values associated with that feature and example. 390 391 Each `SparseFeature` maps to a `SparseTensor` of the specified type 392 representing a Tensor of `dense_shape` `[batch_size] + SparseFeature.size`. 393 Its `values` come from the feature in the examples with key `value_key`. 394 A `values[i]` comes from a position `k` in the feature of an example at batch 395 entry `batch`. This positional information is recorded in `indices[i]` as 396 `[batch, index_0, index_1, ...]` where `index_j` is the `k-th` value of 397 the feature in the example at with key `SparseFeature.index_key[j]`. 398 In other words, we split the indices (except the first index indicating the 399 batch entry) of a `SparseTensor` by dimension into different features of the 400 `Example`. Due to its complexity a `VarLenFeature` should be preferred over a 401 `SparseFeature` whenever possible. 402 403 Each `FixedLenFeature` `df` maps to a `Tensor` of the specified type (or 404 `tf.float32` if not specified) and shape `(serialized.size(),) + df.shape`. 405 406 `FixedLenFeature` entries with a `default_value` are optional. With no default 407 value, we will fail if that `Feature` is missing from any example in 408 `serialized`. 409 410 Each `FixedLenSequenceFeature` `df` maps to a `Tensor` of the specified type 411 (or `tf.float32` if not specified) and shape 412 `(serialized.size(), None) + df.shape`. 413 All examples in `serialized` will be padded with `default_value` along the 414 second dimension. 
415 416 Examples: 417 418 For example, if one expects a `tf.float32` `VarLenFeature` `ft` and three 419 serialized `Example`s are provided: 420 421 ``` 422 serialized = [ 423 features 424 { feature { key: "ft" value { float_list { value: [1.0, 2.0] } } } }, 425 features 426 { feature []}, 427 features 428 { feature { key: "ft" value { float_list { value: [3.0] } } } 429 ] 430 ``` 431 432 then the output will look like: 433 434 ```python 435 {"ft": SparseTensor(indices=[[0, 0], [0, 1], [2, 0]], 436 values=[1.0, 2.0, 3.0], 437 dense_shape=(3, 2)) } 438 ``` 439 440 If instead a `FixedLenSequenceFeature` with `default_value = -1.0` and 441 `shape=[]` is used then the output will look like: 442 443 ```python 444 {"ft": [[1.0, 2.0], [3.0, -1.0]]} 445 ``` 446 447 Given two `Example` input protos in `serialized`: 448 449 ``` 450 [ 451 features { 452 feature { key: "kw" value { bytes_list { value: [ "knit", "big" ] } } } 453 feature { key: "gps" value { float_list { value: [] } } } 454 }, 455 features { 456 feature { key: "kw" value { bytes_list { value: [ "emmy" ] } } } 457 feature { key: "dank" value { int64_list { value: [ 42 ] } } } 458 feature { key: "gps" value { } } 459 } 460 ] 461 ``` 462 463 And arguments 464 465 ``` 466 example_names: ["input0", "input1"], 467 features: { 468 "kw": VarLenFeature(tf.string), 469 "dank": VarLenFeature(tf.int64), 470 "gps": VarLenFeature(tf.float32), 471 } 472 ``` 473 474 Then the output is a dictionary: 475 476 ```python 477 { 478 "kw": SparseTensor( 479 indices=[[0, 0], [0, 1], [1, 0]], 480 values=["knit", "big", "emmy"] 481 dense_shape=[2, 2]), 482 "dank": SparseTensor( 483 indices=[[1, 0]], 484 values=[42], 485 dense_shape=[2, 1]), 486 "gps": SparseTensor( 487 indices=[], 488 values=[], 489 dense_shape=[2, 0]), 490 } 491 ``` 492 493 For dense results in two serialized `Example`s: 494 495 ``` 496 [ 497 features { 498 feature { key: "age" value { int64_list { value: [ 0 ] } } } 499 feature { key: "gender" value { bytes_list { 
value: [ "f" ] } } } 500 }, 501 features { 502 feature { key: "age" value { int64_list { value: [] } } } 503 feature { key: "gender" value { bytes_list { value: [ "f" ] } } } 504 } 505 ] 506 ``` 507 508 We can use arguments: 509 510 ``` 511 example_names: ["input0", "input1"], 512 features: { 513 "age": FixedLenFeature([], dtype=tf.int64, default_value=-1), 514 "gender": FixedLenFeature([], dtype=tf.string), 515 } 516 ``` 517 518 And the expected output is: 519 520 ```python 521 { 522 "age": [[0], [-1]], 523 "gender": [["f"], ["f"]], 524 } 525 ``` 526 527 An alternative to `VarLenFeature` to obtain a `SparseTensor` is 528 `SparseFeature`. For example, given two `Example` input protos in 529 `serialized`: 530 531 ``` 532 [ 533 features { 534 feature { key: "val" value { float_list { value: [ 0.5, -1.0 ] } } } 535 feature { key: "ix" value { int64_list { value: [ 3, 20 ] } } } 536 }, 537 features { 538 feature { key: "val" value { float_list { value: [ 0.0 ] } } } 539 feature { key: "ix" value { int64_list { value: [ 42 ] } } } 540 } 541 ] 542 ``` 543 544 And arguments 545 546 ``` 547 example_names: ["input0", "input1"], 548 features: { 549 "sparse": SparseFeature( 550 index_key="ix", value_key="val", dtype=tf.float32, size=100), 551 } 552 ``` 553 554 Then the output is a dictionary: 555 556 ```python 557 { 558 "sparse": SparseTensor( 559 indices=[[0, 3], [0, 20], [1, 42]], 560 values=[0.5, -1.0, 0.0] 561 dense_shape=[2, 100]), 562 } 563 ``` 564 565 Args: 566 serialized: A vector (1-D Tensor) of strings, a batch of binary 567 serialized `Example` protos. 568 features: A `dict` mapping feature keys to `FixedLenFeature`, 569 `VarLenFeature`, and `SparseFeature` values. 570 name: A name for this operation (optional). 571 example_names: A vector (1-D Tensor) of strings (optional), the names of 572 the serialized protos in the batch. 573 574 Returns: 575 A `dict` mapping feature keys to `Tensor` and `SparseTensor` values. 
576 577 Raises: 578 ValueError: if any feature is invalid. 579 """ 580 return parse_example_v2(serialized, features, example_names, name) 581 582 583@tf_export("io.parse_example", v1=[]) 584def parse_example_v2(serialized, features, example_names=None, name=None): 585 # pylint: disable=line-too-long 586 """Parses `Example` protos into a `dict` of tensors. 587 588 Parses a number of serialized [`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto) 589 protos given in `serialized`. We refer to `serialized` as a batch with 590 `batch_size` many entries of individual `Example` protos. 591 592 `example_names` may contain descriptive names for the corresponding serialized 593 protos. These may be useful for debugging purposes, but they have no effect on 594 the output. If not `None`, `example_names` must be the same length as 595 `serialized`. 596 597 This op parses serialized examples into a dictionary mapping keys to `Tensor` 598 and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`, 599 `SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature` 600 and `SparseFeature` is mapped to a `SparseTensor`, and each 601 `FixedLenFeature` is mapped to a `Tensor`. 602 603 Each `VarLenFeature` maps to a `SparseTensor` of the specified type 604 representing a ragged matrix. Its indices are `[batch, index]` where `batch` 605 identifies the example in `serialized`, and `index` is the value's index in 606 the list of values associated with that feature and example. 607 608 Each `SparseFeature` maps to a `SparseTensor` of the specified type 609 representing a Tensor of `dense_shape` `[batch_size] + SparseFeature.size`. 610 Its `values` come from the feature in the examples with key `value_key`. 611 A `values[i]` comes from a position `k` in the feature of an example at batch 612 entry `batch`. 
This positional information is recorded in `indices[i]` as 613 `[batch, index_0, index_1, ...]` where `index_j` is the `k-th` value of 614 the feature in the example at with key `SparseFeature.index_key[j]`. 615 In other words, we split the indices (except the first index indicating the 616 batch entry) of a `SparseTensor` by dimension into different features of the 617 `Example`. Due to its complexity a `VarLenFeature` should be preferred over a 618 `SparseFeature` whenever possible. 619 620 Each `FixedLenFeature` `df` maps to a `Tensor` of the specified type (or 621 `tf.float32` if not specified) and shape `(serialized.size(),) + df.shape`. 622 623 `FixedLenFeature` entries with a `default_value` are optional. With no default 624 value, we will fail if that `Feature` is missing from any example in 625 `serialized`. 626 627 Each `FixedLenSequenceFeature` `df` maps to a `Tensor` of the specified type 628 (or `tf.float32` if not specified) and shape 629 `(serialized.size(), None) + df.shape`. 630 All examples in `serialized` will be padded with `default_value` along the 631 second dimension. 
632 633 Examples: 634 635 For example, if one expects a `tf.float32` `VarLenFeature` `ft` and three 636 serialized `Example`s are provided: 637 638 ``` 639 serialized = [ 640 features 641 { feature { key: "ft" value { float_list { value: [1.0, 2.0] } } } }, 642 features 643 { feature []}, 644 features 645 { feature { key: "ft" value { float_list { value: [3.0] } } } 646 ] 647 ``` 648 649 then the output will look like: 650 651 ```python 652 {"ft": SparseTensor(indices=[[0, 0], [0, 1], [2, 0]], 653 values=[1.0, 2.0, 3.0], 654 dense_shape=(3, 2)) } 655 ``` 656 657 If instead a `FixedLenSequenceFeature` with `default_value = -1.0` and 658 `shape=[]` is used then the output will look like: 659 660 ```python 661 {"ft": [[1.0, 2.0], [3.0, -1.0]]} 662 ``` 663 664 Given two `Example` input protos in `serialized`: 665 666 ``` 667 [ 668 features { 669 feature { key: "kw" value { bytes_list { value: [ "knit", "big" ] } } } 670 feature { key: "gps" value { float_list { value: [] } } } 671 }, 672 features { 673 feature { key: "kw" value { bytes_list { value: [ "emmy" ] } } } 674 feature { key: "dank" value { int64_list { value: [ 42 ] } } } 675 feature { key: "gps" value { } } 676 } 677 ] 678 ``` 679 680 And arguments 681 682 ``` 683 example_names: ["input0", "input1"], 684 features: { 685 "kw": VarLenFeature(tf.string), 686 "dank": VarLenFeature(tf.int64), 687 "gps": VarLenFeature(tf.float32), 688 } 689 ``` 690 691 Then the output is a dictionary: 692 693 ```python 694 { 695 "kw": SparseTensor( 696 indices=[[0, 0], [0, 1], [1, 0]], 697 values=["knit", "big", "emmy"] 698 dense_shape=[2, 2]), 699 "dank": SparseTensor( 700 indices=[[1, 0]], 701 values=[42], 702 dense_shape=[2, 1]), 703 "gps": SparseTensor( 704 indices=[], 705 values=[], 706 dense_shape=[2, 0]), 707 } 708 ``` 709 710 For dense results in two serialized `Example`s: 711 712 ``` 713 [ 714 features { 715 feature { key: "age" value { int64_list { value: [ 0 ] } } } 716 feature { key: "gender" value { bytes_list { 
value: [ "f" ] } } } 717 }, 718 features { 719 feature { key: "age" value { int64_list { value: [] } } } 720 feature { key: "gender" value { bytes_list { value: [ "f" ] } } } 721 } 722 ] 723 ``` 724 725 We can use arguments: 726 727 ``` 728 example_names: ["input0", "input1"], 729 features: { 730 "age": FixedLenFeature([], dtype=tf.int64, default_value=-1), 731 "gender": FixedLenFeature([], dtype=tf.string), 732 } 733 ``` 734 735 And the expected output is: 736 737 ```python 738 { 739 "age": [[0], [-1]], 740 "gender": [["f"], ["f"]], 741 } 742 ``` 743 744 An alternative to `VarLenFeature` to obtain a `SparseTensor` is 745 `SparseFeature`. For example, given two `Example` input protos in 746 `serialized`: 747 748 ``` 749 [ 750 features { 751 feature { key: "val" value { float_list { value: [ 0.5, -1.0 ] } } } 752 feature { key: "ix" value { int64_list { value: [ 3, 20 ] } } } 753 }, 754 features { 755 feature { key: "val" value { float_list { value: [ 0.0 ] } } } 756 feature { key: "ix" value { int64_list { value: [ 42 ] } } } 757 } 758 ] 759 ``` 760 761 And arguments 762 763 ``` 764 example_names: ["input0", "input1"], 765 features: { 766 "sparse": SparseFeature( 767 index_key="ix", value_key="val", dtype=tf.float32, size=100), 768 } 769 ``` 770 771 Then the output is a dictionary: 772 773 ```python 774 { 775 "sparse": SparseTensor( 776 indices=[[0, 3], [0, 20], [1, 42]], 777 values=[0.5, -1.0, 0.0] 778 dense_shape=[2, 100]), 779 } 780 ``` 781 782 Args: 783 serialized: A vector (1-D Tensor) of strings, a batch of binary 784 serialized `Example` protos. 785 features: A `dict` mapping feature keys to `FixedLenFeature`, 786 `VarLenFeature`, and `SparseFeature` values. 787 example_names: A vector (1-D Tensor) of strings (optional), the names of 788 the serialized protos in the batch. 789 name: A name for this operation (optional). 790 791 Returns: 792 A `dict` mapping feature keys to `Tensor` and `SparseTensor` values. 
793 794 Raises: 795 ValueError: if any feature is invalid. 796 """ 797 if not features: 798 raise ValueError("Missing: features was %s." % features) 799 features = _prepend_none_dimension(features) 800 (sparse_keys, sparse_types, dense_keys, dense_types, dense_defaults, 801 dense_shapes) = _features_to_raw_params( 802 features, 803 [VarLenFeature, SparseFeature, FixedLenFeature, FixedLenSequenceFeature]) 804 outputs = _parse_example_raw( 805 serialized, example_names, sparse_keys, sparse_types, dense_keys, 806 dense_types, dense_defaults, dense_shapes, name) 807 return _construct_sparse_tensors_for_sparse_features(features, outputs) 808 809 810def _parse_example_raw(serialized, 811 names=None, 812 sparse_keys=None, 813 sparse_types=None, 814 dense_keys=None, 815 dense_types=None, 816 dense_defaults=None, 817 dense_shapes=None, 818 name=None): 819 """Parses `Example` protos. 820 821 Args: 822 serialized: A vector (1-D Tensor) of strings, a batch of binary 823 serialized `Example` protos. 824 names: A vector (1-D Tensor) of strings (optional), the names of 825 the serialized protos. 826 sparse_keys: A list of string keys in the examples' features. 827 The results for these keys will be returned as `SparseTensor` objects. 828 sparse_types: A list of `DTypes` of the same length as `sparse_keys`. 829 Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), 830 and `tf.string` (`BytesList`) are supported. 831 dense_keys: A list of string keys in the examples' features. 832 The results for these keys will be returned as `Tensor`s 833 dense_types: A list of DTypes of the same length as `dense_keys`. 834 Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), 835 and `tf.string` (`BytesList`) are supported. 836 dense_defaults: A dict mapping string keys to `Tensor`s. 837 The keys of the dict must match the dense_keys of the feature. 838 dense_shapes: A list of tuples with the same length as `dense_keys`. 
839 The shape of the data for each dense feature referenced by `dense_keys`. 840 Required for any input tensors identified by `dense_keys`. Must be 841 either fully defined, or may contain an unknown first dimension. 842 An unknown first dimension means the feature is treated as having 843 a variable number of blocks, and the output shape along this dimension 844 is considered unknown at graph build time. Padding is applied for 845 minibatch elements smaller than the maximum number of blocks for the 846 given feature along this dimension. 847 name: A name for this operation (optional). 848 849 Returns: 850 A `dict` mapping keys to `Tensor`s and `SparseTensor`s. 851 852 """ 853 with ops.name_scope(name, "ParseExample", [serialized, names]): 854 (names, dense_defaults_vec, sparse_keys, sparse_types, 855 dense_keys, dense_shapes, _) = _process_raw_parameters( 856 names, dense_defaults, sparse_keys, sparse_types, dense_keys, 857 dense_types, dense_shapes) 858 859 outputs = gen_parsing_ops.parse_example( 860 serialized=serialized, 861 names=names, 862 dense_defaults=dense_defaults_vec, 863 sparse_keys=sparse_keys, 864 sparse_types=sparse_types, 865 dense_keys=dense_keys, 866 dense_shapes=dense_shapes, 867 name=name) 868 869 (sparse_indices, sparse_values, sparse_shapes, dense_values) = outputs 870 871 sparse_tensors = [ 872 sparse_tensor.SparseTensor(ix, val, shape) for (ix, val, shape) 873 in zip(sparse_indices, sparse_values, sparse_shapes)] 874 875 return dict(zip(sparse_keys + dense_keys, sparse_tensors + dense_values)) 876 877 878def _process_raw_parameters(names, dense_defaults, sparse_keys, sparse_types, 879 dense_keys, dense_types, dense_shapes): 880 """Process raw parameters to params used by `gen_parsing_ops`. 881 882 Args: 883 names: A vector (1-D Tensor) of strings (optional), the names of 884 the serialized protos. 885 dense_defaults: A dict mapping string keys to `Tensor`s. 886 The keys of the dict must match the dense_keys of the feature. 
887 sparse_keys: A list of string keys in the examples' features. 888 The results for these keys will be returned as `SparseTensor` objects. 889 sparse_types: A list of `DTypes` of the same length as `sparse_keys`. 890 Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), 891 and `tf.string` (`BytesList`) are supported. 892 dense_keys: A list of string keys in the examples' features. 893 The results for these keys will be returned as `Tensor`s 894 dense_types: A list of DTypes of the same length as `dense_keys`. 895 Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), 896 and `tf.string` (`BytesList`) are supported. 897 dense_shapes: A list of tuples with the same length as `dense_keys`. 898 The shape of the data for each dense feature referenced by `dense_keys`. 899 Required for any input tensors identified by `dense_keys`. Must be 900 either fully defined, or may contain an unknown first dimension. 901 An unknown first dimension means the feature is treated as having 902 a variable number of blocks, and the output shape along this dimension 903 is considered unknown at graph build time. Padding is applied for 904 minibatch elements smaller than the maximum number of blocks for the 905 given feature along this dimension. 906 907 Returns: 908 Tuple of `names`, `dense_defaults_vec`, `sparse_keys`, `sparse_types`, 909 `dense_keys`, `dense_shapes`. 910 911 Raises: 912 ValueError: If sparse and dense key sets intersect, or input lengths do not 913 match up. 
914 """ 915 names = [] if names is None else names 916 dense_defaults = collections.OrderedDict( 917 ) if dense_defaults is None else dense_defaults 918 sparse_keys = [] if sparse_keys is None else sparse_keys 919 sparse_types = [] if sparse_types is None else sparse_types 920 dense_keys = [] if dense_keys is None else dense_keys 921 dense_types = [] if dense_types is None else dense_types 922 dense_shapes = ([[]] * len(dense_keys) 923 if dense_shapes is None else dense_shapes) 924 925 num_dense = len(dense_keys) 926 num_sparse = len(sparse_keys) 927 928 if len(dense_shapes) != num_dense: 929 raise ValueError("len(dense_shapes) != len(dense_keys): %d vs. %d" % 930 (len(dense_shapes), num_dense)) 931 if len(dense_types) != num_dense: 932 raise ValueError("len(dense_types) != len(num_dense): %d vs. %d" % 933 (len(dense_types), num_dense)) 934 if len(sparse_types) != num_sparse: 935 raise ValueError("len(sparse_types) != len(sparse_keys): %d vs. %d" % 936 (len(sparse_types), num_sparse)) 937 if num_dense + num_sparse == 0: 938 raise ValueError("Must provide at least one sparse key or dense key") 939 if not set(dense_keys).isdisjoint(set(sparse_keys)): 940 raise ValueError( 941 "Dense and sparse keys must not intersect; intersection: %s" % 942 set(dense_keys).intersection(set(sparse_keys))) 943 944 # Convert dense_shapes to TensorShape object. 
945 dense_shapes = [tensor_shape.as_shape(shape) for shape in dense_shapes] 946 947 dense_defaults_vec = [] 948 for i, key in enumerate(dense_keys): 949 default_value = dense_defaults.get(key) 950 dense_shape = dense_shapes[i] 951 if (dense_shape.ndims is not None and dense_shape.ndims > 0 and 952 dense_shape.dims[0].value is None): 953 # Variable stride dense shape, the default value should be a 954 # scalar padding value 955 if default_value is None: 956 default_value = ops.convert_to_tensor( 957 "" if dense_types[i] == dtypes.string else 0, dtype=dense_types[i]) 958 else: 959 # Reshape to a scalar to ensure user gets an error if they 960 # provide a tensor that's not intended to be a padding value 961 # (0 or 2+ elements). 962 key_name = "padding_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key) 963 default_value = ops.convert_to_tensor( 964 default_value, dtype=dense_types[i], name=key_name) 965 default_value = array_ops.reshape(default_value, []) 966 else: 967 if default_value is None: 968 default_value = constant_op.constant([], dtype=dense_types[i]) 969 elif not isinstance(default_value, ops.Tensor): 970 key_name = "key_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key) 971 default_value = ops.convert_to_tensor( 972 default_value, dtype=dense_types[i], name=key_name) 973 default_value = array_ops.reshape(default_value, dense_shape) 974 975 dense_defaults_vec.append(default_value) 976 977 # Finally, convert dense_shapes to TensorShapeProto 978 dense_shapes_as_proto = [shape.as_proto() for shape in dense_shapes] 979 980 return (names, dense_defaults_vec, sparse_keys, sparse_types, dense_keys, 981 dense_shapes_as_proto, dense_shapes) 982 983 984@tf_export(v1=["io.parse_single_example", "parse_single_example"]) 985def parse_single_example(serialized, features, name=None, example_names=None): 986 """Parses a single `Example` proto. 
@tf_export(v1=["io.parse_single_example", "parse_single_example"])
def parse_single_example(serialized, features, name=None, example_names=None):
  """Parses a single `Example` proto.

  This is the v1 entry point; it forwards its arguments unchanged to
  `parse_single_example_v2_unoptimized`, which takes `example_names` and
  `name` in the opposite order.

  Similar to `parse_example`, except:

  For dense tensors, the returned `Tensor` is identical to the output of
  `parse_example`, except there is no batch dimension, the output shape is the
  same as the shape given in `dense_shape`.

  For `SparseTensor`s, the first (batch) column of the indices matrix is removed
  (the indices matrix is a column vector), the values vector is unchanged, and
  the first (`batch_size`) entry of the shape vector is removed (it is now a
  single element vector).

  One might see performance advantages by batching `Example` protos with
  `parse_example` instead of using this function directly.

  Args:
    serialized: A scalar string Tensor, a single serialized Example.
      See `_parse_single_example_raw` documentation for more details.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values.
    name: A name for this operation (optional).
    example_names: (Optional) A scalar string Tensor, the associated name.
      See `_parse_single_example_raw` documentation for more details.

  Returns:
    A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.

  Raises:
    ValueError: if any feature is invalid.
  """
  return parse_single_example_v2_unoptimized(
      serialized, features, example_names=example_names, name=name)
# TODO(b/70890287): Combine the implementation of this op and
# `parse_single_example_v2()` after 1/10/2018.
@tf_export("io.parse_single_example", v1=[])
def parse_single_example_v2_unoptimized(
    serialized, features, example_names=None, name=None
    ):
  """Parses a single `Example` proto.

  When `example_names` is `None` the work is handed to
  `parse_single_example_v2`; otherwise the example is parsed through
  `_parse_single_example_raw`.

  Similar to `parse_example`, except:

  For dense tensors, the returned `Tensor` is identical to the output of
  `parse_example`, except there is no batch dimension, the output shape is the
  same as the shape given in `dense_shape`.

  For `SparseTensor`s, the first (batch) column of the indices matrix is removed
  (the indices matrix is a column vector), the values vector is unchanged, and
  the first (`batch_size`) entry of the shape vector is removed (it is now a
  single element vector).

  One might see performance advantages by batching `Example` protos with
  `parse_example` instead of using this function directly.

  Args:
    serialized: A scalar string Tensor, a single serialized Example.
      See `_parse_single_example_raw` documentation for more details.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values.
    example_names: (Optional) A scalar string Tensor, the associated name.
      See `_parse_single_example_raw` documentation for more details.
    name: A name for this operation (optional).

  Returns:
    A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.

  Raises:
    ValueError: if any feature is invalid.
  """
  if not features:
    raise ValueError("Missing features.")
  if example_names is None:
    return parse_single_example_v2(serialized, features, name)

  features = _prepend_none_dimension(features)
  raw_params = _features_to_raw_params(
      features,
      [VarLenFeature, FixedLenFeature, FixedLenSequenceFeature, SparseFeature])
  (sparse_keys, sparse_types, dense_keys, dense_types, dense_defaults,
   dense_shapes) = raw_params

  parsed = _parse_single_example_raw(
      serialized,
      names=example_names,
      sparse_keys=sparse_keys,
      sparse_types=sparse_types,
      dense_keys=dense_keys,
      dense_types=dense_types,
      dense_defaults=dense_defaults,
      dense_shapes=dense_shapes,
      name=name)
  return _construct_sparse_tensors_for_sparse_features(features, parsed)
def _parse_single_example_raw(serialized,
                              names=None,
                              sparse_keys=None,
                              sparse_types=None,
                              dense_keys=None,
                              dense_types=None,
                              dense_defaults=None,
                              dense_shapes=None,
                              name=None):
  """Parses a single `Example` proto.

  The scalar inputs are expanded to a batch of one, parsed with
  `_parse_example_raw`, and the batch dimension is then stripped from every
  output again.

  Args:
    serialized: A scalar string Tensor, a single serialized Example.
      See `_parse_example_raw` documentation for more details.
    names: (Optional) A scalar string Tensor, the associated name.
      See `_parse_example_raw` documentation for more details.
    sparse_keys: See `_parse_example_raw` documentation for more details.
    sparse_types: See `_parse_example_raw` documentation for more details.
    dense_keys: See `_parse_example_raw` documentation for more details.
    dense_types: See `_parse_example_raw` documentation for more details.
    dense_defaults: See `_parse_example_raw` documentation for more details.
    dense_shapes: See `_parse_example_raw` documentation for more details.
    name: A name for this operation (optional).

  Returns:
    A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.

  Raises:
    ValueError: if any feature is invalid.
  """

  def _scalar_to_batch_of_one(value, error_message, assert_name, deps_name):
    """Verifies `value` has rank 0 and returns it with a leading dim of 1."""
    tensor = ops.convert_to_tensor(value)
    static_rank = tensor.get_shape().ndims
    if static_rank is None:
      # Rank unknown at graph-construction time: attach an assertion that is
      # evaluated when the tensor is.
      rank_check = control_flow_ops.Assert(
          math_ops.equal(array_ops.rank(tensor), 0),
          [error_message],
          name=assert_name)
      tensor = control_flow_ops.with_dependencies(
          [rank_check], tensor, name=deps_name)
    elif static_rank != 0:
      raise ValueError(error_message)
    return array_ops.expand_dims(tensor, 0)

  with ops.name_scope(name, "ParseSingleExample", [serialized, names]):
    serialized = _scalar_to_batch_of_one(
        serialized,
        "Input serialized must be a scalar",
        "SerializedIsScalar",
        "SerializedDependencies")
    if names is not None:
      names = _scalar_to_batch_of_one(
          names,
          "Input names must be a scalar",
          "NamesIsScalar",
          "NamesDependencies")

    outputs = _parse_example_raw(
        serialized,
        names=names,
        sparse_keys=sparse_keys,
        sparse_types=sparse_types,
        dense_keys=dense_keys,
        dense_types=dense_types,
        dense_defaults=dense_defaults,
        dense_shapes=dense_shapes,
        name=name)

    if dense_keys is not None:
      for dense_key in dense_keys:
        # Feature keys may contain characters that are not legal in op names.
        op_suffix = re.sub("[^A-Za-z0-9_.\\-/]", "_", dense_key)
        outputs[dense_key] = array_ops.squeeze(
            outputs[dense_key], [0], name="Squeeze_%s" % op_suffix)

    if sparse_keys is not None:
      for sparse_key in sparse_keys:
        op_suffix = re.sub("[^A-Za-z0-9_.\\-/]", "_", sparse_key)
        batched = outputs[sparse_key]
        # Drop the batch column of the indices and the batch entry of the
        # dense shape; the values are kept as they are.
        indices = array_ops.slice(
            batched.indices, [0, 1], [-1, -1],
            name="Slice_Indices_%s" % op_suffix)
        values = batched.values
        dense_shape = array_ops.slice(
            batched.dense_shape, [1], [-1],
            name="Squeeze_Shape_%s" % op_suffix)
        outputs[sparse_key] = sparse_tensor.SparseTensor(
            indices, values, dense_shape)
    return outputs
@tf_export("io.parse_sequence_example")
def parse_sequence_example(serialized,
                           context_features=None,
                           sequence_features=None,
                           example_names=None,
                           name=None):
  # pylint: disable=line-too-long
  """Parses a batch of `SequenceExample` protos.

  Parses a vector of serialized
  [`SequenceExample`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
  protos given in `serialized`.

  This op parses serialized sequence examples into a tuple of dictionaries
  mapping keys to `Tensor` and `SparseTensor` objects respectively.
  The first dictionary contains mappings for keys appearing in
  `context_features`, and the second dictionary contains mappings for keys
  appearing in `sequence_features`.

  At least one of `context_features` and `sequence_features` must be provided
  and non-empty.

  The `context_features` keys are associated with a `SequenceExample` as a
  whole, independent of time / frame.  In contrast, the `sequence_features` keys
  provide a way to access variable-length data within the `FeatureList` section
  of the `SequenceExample` proto.  While the shapes of `context_features` values
  are fixed with respect to frame, the frame dimension (the first dimension)
  of `sequence_features` values may vary between `SequenceExample` protos,
  and even between `feature_list` keys within the same `SequenceExample`.

  `context_features` contains `VarLenFeature` and `FixedLenFeature` objects.
  Each `VarLenFeature` is mapped to a `SparseTensor`, and each `FixedLenFeature`
  is mapped to a `Tensor`, of the specified type, shape, and default value.

  `sequence_features` contains `VarLenFeature` and `FixedLenSequenceFeature`
  objects. Each `VarLenFeature` is mapped to a `SparseTensor`, and each
  `FixedLenSequenceFeature` is mapped to a `Tensor`, each of the specified type.
  The shape will be `(B,T,) + df.dense_shape` for `FixedLenSequenceFeature`
  `df`, where `B` is the batch size, and `T` is the length of the associated
  `FeatureList` in the `SequenceExample`. For instance,
  `FixedLenSequenceFeature([])` yields a scalar 2-D `Tensor` of static shape
  `[None, None]` and dynamic shape `[B, T]`, while
  `FixedLenSequenceFeature([k])` (for `int k >= 1`) yields a 3-D matrix `Tensor`
  of static shape `[None, None, k]` and dynamic shape `[B, T, k]`.

  Like the input, the resulting output tensors have a batch dimension. This
  means that the original per-example shapes of `VarLenFeature`s and
  `FixedLenSequenceFeature`s can be lost. To handle that situation, this op also
  provides dicts of shape tensors as part of the output. There is one dict for
  the context features, and one for the feature_list features. Context features
  of type `FixedLenFeature`s will not be present, since their shapes are already
  known by the caller. In situations where the input `FixedLenFeature`s are of
  different lengths across examples, the shorter examples will be padded with
  default datatype values: 0 for numeric types, and the empty string for string
  types.

  Each `SparseTensor` corresponding to `sequence_features` represents a ragged
  vector.  Its indices are `[time, index]`, where `time` is the `FeatureList`
  entry and `index` is the value's index in the list of values associated with
  that time.

  `FixedLenFeature` entries with a `default_value` and `FixedLenSequenceFeature`
  entries with `allow_missing=True` are optional; otherwise, we will fail if
  that `Feature` or `FeatureList` is missing from any example in `serialized`.

  `example_names` may contain descriptive names for the corresponding
  serialized protos.  This may be useful for debugging purposes, but it has no
  effect on the output.

  Args:
    serialized: A vector (1-D Tensor) of type string containing binary
      serialized `SequenceExample` protos.
    context_features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values. These features are associated with a
      `SequenceExample` as a whole.
    sequence_features: A `dict` mapping feature keys to
      `FixedLenSequenceFeature` or `VarLenFeature` values. These features are
      associated with data within the `FeatureList` section of the
      `SequenceExample` proto.
    example_names: A vector (1-D Tensor) of strings (optional), the name of the
      serialized protos.
    name: A name for this operation (optional).

  Returns:
    A tuple of three `dict`s, each mapping keys to `Tensor`s and
    `SparseTensor`s. The first dict contains the context key/values,
    the second dict contains the feature_list key/values, and the final dict
    contains the lengths of any dense feature_list features.

  Raises:
    ValueError: if any feature is invalid.
  """
  if not context_features and not sequence_features:
    raise ValueError("Missing features.")

  context_params = _features_to_raw_params(
      context_features, [VarLenFeature, FixedLenFeature])
  (ctx_sparse_keys, ctx_sparse_types, ctx_dense_keys, ctx_dense_types,
   ctx_dense_defaults, ctx_dense_shapes) = context_params

  sequence_params = _features_to_raw_params(
      sequence_features, [VarLenFeature, FixedLenSequenceFeature])
  (seq_sparse_keys, seq_sparse_types, seq_dense_keys, seq_dense_types,
   seq_dense_defaults, seq_dense_shapes) = sequence_params

  return _parse_sequence_example_raw(
      serialized,
      debug_name=example_names,
      context_sparse_keys=ctx_sparse_keys,
      context_sparse_types=ctx_sparse_types,
      context_dense_keys=ctx_dense_keys,
      context_dense_types=ctx_dense_types,
      context_dense_defaults=ctx_dense_defaults,
      context_dense_shapes=ctx_dense_shapes,
      feature_list_sparse_keys=seq_sparse_keys,
      feature_list_sparse_types=seq_sparse_types,
      feature_list_dense_keys=seq_dense_keys,
      feature_list_dense_types=seq_dense_types,
      feature_list_dense_shapes=seq_dense_shapes,
      feature_list_dense_defaults=seq_dense_defaults,
      name=name)
def _parse_sequence_example_raw(serialized,
                                debug_name=None,
                                context_sparse_keys=None,
                                context_sparse_types=None,
                                context_dense_keys=None,
                                context_dense_types=None,
                                context_dense_defaults=None,
                                context_dense_shapes=None,
                                feature_list_sparse_keys=None,
                                feature_list_sparse_types=None,
                                feature_list_dense_keys=None,
                                feature_list_dense_types=None,
                                feature_list_dense_shapes=None,
                                feature_list_dense_defaults=None,
                                name=None):
  """Parses a vector of `SequenceExample` protos.

  Args:
    serialized: A vector (1-D Tensor) of type string, containing binary
      serialized `SequenceExample` protos.
    debug_name: A vector (1-D Tensor) of strings (optional), the names of the
      serialized protos.
    context_sparse_keys: A list of string keys in the `SequenceExample`'s
      features.  The results for these keys will be returned as `SparseTensor`
      objects.
    context_sparse_types: A list of `DTypes`, the same length as
      `context_sparse_keys`. Only `tf.float32` (`FloatList`), `tf.int64`
      (`Int64List`), and `tf.string` (`BytesList`) are supported.
    context_dense_keys: A list of string keys in the examples' features. The
      results for these keys will be returned as `Tensor`s
    context_dense_types: A list of DTypes, same length as `context_dense_keys`.
      Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), and `tf.string`
      (`BytesList`) are supported.
    context_dense_defaults: A dict mapping string keys to `Tensor`s. The keys of
      the dict must match the context_dense_keys of the feature.
    context_dense_shapes: A list of tuples, same length as `context_dense_keys`.
      The shape of the data for each context_dense feature referenced by
      `context_dense_keys`.  Required for any input tensors identified by
      `context_dense_keys` whose shapes are anything other than `[]` or `[1]`.
    feature_list_sparse_keys: A list of string keys in the `SequenceExample`'s
      feature_lists.  The results for these keys will be returned as
      `SparseTensor` objects.
    feature_list_sparse_types: A list of `DTypes`, same length as
      `feature_list_sparse_keys`. Only `tf.float32` (`FloatList`), `tf.int64`
      (`Int64List`), and `tf.string` (`BytesList`) are supported.
    feature_list_dense_keys: A list of string keys in the `SequenceExample`'s
      features_lists. The results for these keys will be returned as `Tensor`s.
    feature_list_dense_types: A list of `DTypes`, same length as
      `feature_list_dense_keys`.  Only `tf.float32` (`FloatList`), `tf.int64`
      (`Int64List`), and `tf.string` (`BytesList`) are supported.
    feature_list_dense_shapes: A list of tuples, same length as
      `feature_list_dense_keys`.  The shape of the data for each `FeatureList`
      feature referenced by `feature_list_dense_keys`.
    feature_list_dense_defaults: A dict mapping key strings to values.  The only
      currently allowed value is `None`.  Any key appearing in this dict with
      value `None` is allowed to be missing from the `SequenceExample`.  If
      missing, the key is treated as zero-length.
    name: A name for this operation (optional).

  Returns:
    A tuple of three `dict`s, each mapping keys to `Tensor`s and
    `SparseTensor`s. The first dict contains the context key/values,
    the second dict contains the feature_list key/values, and the final dict
    contains the lengths of any dense feature_list features.

  Raises:
    ValueError: If context_sparse and context_dense key sets intersect,
      if feature_list_sparse and feature_list_dense key sets intersect,
      if input lengths do not match up, if no key of any kind is given, or if
      a value in feature_list_dense_defaults is not None.
    TypeError: if feature_list_dense_defaults is not either None or a dict.
  """
  with ops.name_scope(name, "ParseSequenceExample", [serialized]):
    # Replace every omitted argument by an empty container so the checks
    # below can treat all inputs uniformly.
    context_dense_defaults = ({} if context_dense_defaults is None else
                              context_dense_defaults)
    context_sparse_keys = ([] if context_sparse_keys is None else
                           context_sparse_keys)
    context_sparse_types = ([] if context_sparse_types is None else
                            context_sparse_types)
    context_dense_keys = ([]
                          if context_dense_keys is None else context_dense_keys)
    context_dense_types = ([] if context_dense_types is None else
                           context_dense_types)
    context_dense_shapes = ([[]] * len(context_dense_keys)
                            if context_dense_shapes is None else
                            context_dense_shapes)
    feature_list_sparse_keys = ([] if feature_list_sparse_keys is None else
                                feature_list_sparse_keys)
    feature_list_sparse_types = ([] if feature_list_sparse_types is None else
                                 feature_list_sparse_types)
    feature_list_dense_keys = ([] if feature_list_dense_keys is None else
                               feature_list_dense_keys)
    feature_list_dense_types = ([] if feature_list_dense_types is None else
                                feature_list_dense_types)
    feature_list_dense_shapes = ([[]] * len(feature_list_dense_keys)
                                 if feature_list_dense_shapes is None else
                                 feature_list_dense_shapes)
    feature_list_dense_defaults = (
        dict()
        if feature_list_dense_defaults is None else feature_list_dense_defaults)
    debug_name = [] if debug_name is None else debug_name

    # Internal
    feature_list_dense_missing_assumed_empty = []

    num_context_dense = len(context_dense_keys)
    num_feature_list_dense = len(feature_list_dense_keys)
    num_context_sparse = len(context_sparse_keys)
    num_feature_list_sparse = len(feature_list_sparse_keys)

    if len(context_dense_shapes) != num_context_dense:
      raise ValueError(
          "len(context_dense_shapes) != len(context_dense_keys): %d vs. %d" %
          (len(context_dense_shapes), num_context_dense))
    if len(context_dense_types) != num_context_dense:
      raise ValueError(
          "len(context_dense_types) != len(context_dense_keys): %d vs. %d" %
          (len(context_dense_types), num_context_dense))
    if len(feature_list_dense_shapes) != num_feature_list_dense:
      raise ValueError(
          "len(feature_list_dense_shapes) != len(feature_list_dense_keys): "
          "%d vs. %d" % (len(feature_list_dense_shapes),
                         num_feature_list_dense))
    if len(feature_list_dense_types) != num_feature_list_dense:
      raise ValueError(
          "len(feature_list_dense_types) != len(feature_list_dense_keys): "
          "%d vs. %d" % (len(feature_list_dense_types), num_feature_list_dense))
    if len(context_sparse_types) != num_context_sparse:
      raise ValueError(
          "len(context_sparse_types) != len(context_sparse_keys): %d vs. %d" %
          (len(context_sparse_types), num_context_sparse))
    if len(feature_list_sparse_types) != num_feature_list_sparse:
      raise ValueError(
          "len(feature_list_sparse_types) != len(feature_list_sparse_keys): "
          "%d vs. %d" % (len(feature_list_sparse_types),
                         num_feature_list_sparse))
    if (num_context_dense + num_context_sparse + num_feature_list_dense +
        num_feature_list_sparse) == 0:
      raise ValueError(
          "Must provide at least one context_sparse key, context_dense key, "
          "feature_list_sparse key, or feature_list_dense key")
    if not set(context_dense_keys).isdisjoint(set(context_sparse_keys)):
      raise ValueError(
          "context_dense and context_sparse keys must not intersect; "
          "intersection: %s" % set(context_dense_keys).intersection(
              set(context_sparse_keys)))
    if not set(feature_list_dense_keys).isdisjoint(
        set(feature_list_sparse_keys)):
      raise ValueError(
          "feature_list_dense and feature_list_sparse keys must not intersect; "
          "intersection: %s" % set(feature_list_dense_keys).intersection(
              set(feature_list_sparse_keys)))
    if not isinstance(feature_list_dense_defaults, dict):
      raise TypeError("feature_list_dense_defaults must be a dict")
    # Every key listed in the defaults dict (necessarily with value None) is
    # passed to the op as "may be missing, treat as empty".
    for k, v in feature_list_dense_defaults.items():
      if v is not None:
        raise ValueError(
            "Value feature_list_dense_defaults[%s] must be None" % k)
      feature_list_dense_missing_assumed_empty.append(k)

    context_dense_defaults_vec = []
    for i, key in enumerate(context_dense_keys):
      default_value = context_dense_defaults.get(key)
      if default_value is None:
        default_value = constant_op.constant([], dtype=context_dense_types[i])
      elif not isinstance(default_value, ops.Tensor):
        key_name = "key_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key)
        default_value = ops.convert_to_tensor(
            default_value, dtype=context_dense_types[i], name=key_name)

      context_dense_defaults_vec.append(default_value)

    # The op takes shapes in proto form.
    context_dense_shapes = [
        tensor_shape.as_shape(shape).as_proto()
        for shape in context_dense_shapes
    ]
    feature_list_dense_shapes = [
        tensor_shape.as_shape(shape).as_proto()
        for shape in feature_list_dense_shapes
    ]

    # pylint: disable=protected-access
    outputs = gen_parsing_ops.parse_sequence_example(
        serialized=serialized,
        debug_name=debug_name,
        Ncontext_sparse=num_context_sparse,
        Ncontext_dense=num_context_dense,
        Nfeature_list_sparse=num_feature_list_sparse,
        Nfeature_list_dense=num_feature_list_dense,
        context_dense_defaults=context_dense_defaults_vec,
        context_sparse_keys=context_sparse_keys,
        context_sparse_types=context_sparse_types,
        context_dense_keys=context_dense_keys,
        context_dense_shapes=context_dense_shapes,
        feature_list_sparse_keys=feature_list_sparse_keys,
        feature_list_sparse_types=feature_list_sparse_types,
        feature_list_dense_keys=feature_list_dense_keys,
        feature_list_dense_types=feature_list_dense_types,
        feature_list_dense_shapes=feature_list_dense_shapes,
        feature_list_dense_missing_assumed_empty=(
            feature_list_dense_missing_assumed_empty),
        name=name)
    # pylint: enable=protected-access

    (context_sparse_indices, context_sparse_values, context_sparse_shapes,
     context_dense_values, feature_list_sparse_indices,
     feature_list_sparse_values, feature_list_sparse_shapes,
     feature_list_dense_values, feature_list_dense_lengths) = outputs

    context_sparse_tensors = [
        sparse_tensor.SparseTensor(ix, val, shape)
        for (ix, val,
             shape) in zip(context_sparse_indices, context_sparse_values,
                           context_sparse_shapes)
    ]

    feature_list_sparse_tensors = [
        sparse_tensor.SparseTensor(ix, val, shape)
        for (ix, val, shape
            ) in zip(feature_list_sparse_indices, feature_list_sparse_values,
                     feature_list_sparse_shapes)
    ]

    context_output = dict(
        zip(context_sparse_keys + context_dense_keys,
            context_sparse_tensors + context_dense_values))
    feature_list_output = dict(
        zip(feature_list_sparse_keys + feature_list_dense_keys,
            feature_list_sparse_tensors + feature_list_dense_values))
    feature_list_lengths = dict(
        zip(feature_list_dense_keys, feature_list_dense_lengths))

    return (context_output, feature_list_output, feature_list_lengths)
# TODO(sundberg): rewrite this method to call the batch version, which is more
# efficient especially for large inputs.
@tf_export("io.parse_single_sequence_example",
           v1=["io.parse_single_sequence_example",
               "parse_single_sequence_example"])
def parse_single_sequence_example(
    serialized, context_features=None, sequence_features=None,
    example_name=None, name=None):
  # pylint: disable=line-too-long
  """Parses a single `SequenceExample` proto.

  Parses a single serialized [`SequenceExample`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
  proto given in `serialized`.

  This op parses a serialized sequence example into a tuple of dictionaries
  mapping keys to `Tensor` and `SparseTensor` objects respectively.
  The first dictionary contains mappings for keys appearing in
  `context_features`, and the second dictionary contains mappings for keys
  appearing in `sequence_features`.

  At least one of `context_features` and `sequence_features` must be provided
  and non-empty.

  The `context_features` keys are associated with a `SequenceExample` as a
  whole, independent of time / frame.  In contrast, the `sequence_features` keys
  provide a way to access variable-length data within the `FeatureList` section
  of the `SequenceExample` proto.  While the shapes of `context_features` values
  are fixed with respect to frame, the frame dimension (the first dimension)
  of `sequence_features` values may vary between `SequenceExample` protos,
  and even between `feature_list` keys within the same `SequenceExample`.

  `context_features` contains `VarLenFeature` and `FixedLenFeature` objects.
  Each `VarLenFeature` is mapped to a `SparseTensor`, and each `FixedLenFeature`
  is mapped to a `Tensor`, of the specified type, shape, and default value.

  `sequence_features` contains `VarLenFeature` and `FixedLenSequenceFeature`
  objects. Each `VarLenFeature` is mapped to a `SparseTensor`, and each
  `FixedLenSequenceFeature` is mapped to a `Tensor`, each of the specified type.
  The shape will be `(T,) + df.dense_shape` for `FixedLenSequenceFeature` `df`, where
  `T` is the length of the associated `FeatureList` in the `SequenceExample`.
  For instance, `FixedLenSequenceFeature([])` yields a scalar 1-D `Tensor` of
  static shape `[None]` and dynamic shape `[T]`, while
  `FixedLenSequenceFeature([k])` (for `int k >= 1`) yields a 2-D matrix `Tensor`
  of static shape `[None, k]` and dynamic shape `[T, k]`.

  Each `SparseTensor` corresponding to `sequence_features` represents a ragged
  vector.  Its indices are `[time, index]`, where `time` is the `FeatureList`
  entry and `index` is the value's index in the list of values associated with
  that time.

  `FixedLenFeature` entries with a `default_value` and `FixedLenSequenceFeature`
  entries with `allow_missing=True` are optional; otherwise, we will fail if
  that `Feature` or `FeatureList` is missing from any example in `serialized`.

  `example_name` may contain a descriptive name for the corresponding serialized
  proto.  This may be useful for debugging purposes, but it has no effect on the
  output.  If not `None`, `example_name` must be a scalar.

  Args:
    serialized: A scalar (0-D Tensor) of type string, a single binary
      serialized `SequenceExample` proto.
    context_features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values. These features are associated with a
      `SequenceExample` as a whole.
    sequence_features: A `dict` mapping feature keys to
      `FixedLenSequenceFeature` or `VarLenFeature` values. These features are
      associated with data within the `FeatureList` section of the
      `SequenceExample` proto.
    example_name: A scalar (0-D Tensor) of strings (optional), the name of
      the serialized proto.
    name: A name for this operation (optional).

  Returns:
    A tuple of two `dict`s, each mapping keys to `Tensor`s and `SparseTensor`s.
    The first dict contains the context key/values.
    The second dict contains the feature_list key/values.

  Raises:
    ValueError: if any feature is invalid.
  """
  # pylint: enable=line-too-long
  if not context_features and not sequence_features:
    raise ValueError("Missing features.")

  context_params = _features_to_raw_params(
      context_features, [VarLenFeature, FixedLenFeature])
  (ctx_sparse_keys, ctx_sparse_types, ctx_dense_keys, ctx_dense_types,
   ctx_dense_defaults, ctx_dense_shapes) = context_params

  sequence_params = _features_to_raw_params(
      sequence_features, [VarLenFeature, FixedLenSequenceFeature])
  (seq_sparse_keys, seq_sparse_types, seq_dense_keys, seq_dense_types,
   seq_dense_defaults, seq_dense_shapes) = sequence_params

  return _parse_single_sequence_example_raw(
      serialized,
      context_sparse_keys=ctx_sparse_keys,
      context_sparse_types=ctx_sparse_types,
      context_dense_keys=ctx_dense_keys,
      context_dense_types=ctx_dense_types,
      context_dense_defaults=ctx_dense_defaults,
      context_dense_shapes=ctx_dense_shapes,
      feature_list_sparse_keys=seq_sparse_keys,
      feature_list_sparse_types=seq_sparse_types,
      feature_list_dense_keys=seq_dense_keys,
      feature_list_dense_types=seq_dense_types,
      feature_list_dense_shapes=seq_dense_shapes,
      feature_list_dense_defaults=seq_dense_defaults,
      debug_name=example_name,
      name=name)
feature_list_sparse_types=None, 1617 feature_list_dense_keys=None, 1618 feature_list_dense_types=None, 1619 feature_list_dense_shapes=None, 1620 feature_list_dense_defaults=None, 1621 debug_name=None, 1622 name=None): 1623 """Parses a single `SequenceExample` proto. 1624 1625 Args: 1626 serialized: A scalar (0-D Tensor) of type string, a single binary 1627 serialized `SequenceExample` proto. 1628 context_sparse_keys: A list of string keys in the `SequenceExample`'s 1629 features. The results for these keys will be returned as 1630 `SparseTensor` objects. 1631 context_sparse_types: A list of `DTypes`, the same length as `sparse_keys`. 1632 Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), 1633 and `tf.string` (`BytesList`) are supported. 1634 context_dense_keys: A list of string keys in the examples' features. 1635 The results for these keys will be returned as `Tensor`s 1636 context_dense_types: A list of DTypes, same length as `context_dense_keys`. 1637 Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), 1638 and `tf.string` (`BytesList`) are supported. 1639 context_dense_defaults: A dict mapping string keys to `Tensor`s. 1640 The keys of the dict must match the context_dense_keys of the feature. 1641 context_dense_shapes: A list of tuples, same length as `context_dense_keys`. 1642 The shape of the data for each context_dense feature referenced by 1643 `context_dense_keys`. Required for any input tensors identified by 1644 `context_dense_keys` whose shapes are anything other than `[]` or `[1]`. 1645 feature_list_sparse_keys: A list of string keys in the `SequenceExample`'s 1646 feature_lists. The results for these keys will be returned as 1647 `SparseTensor` objects. 1648 feature_list_sparse_types: A list of `DTypes`, same length as `sparse_keys`. 1649 Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`), 1650 and `tf.string` (`BytesList`) are supported. 
1651 feature_list_dense_keys: A list of string keys in the `SequenceExample`'s 1652 features_lists. The results for these keys will be returned as `Tensor`s. 1653 feature_list_dense_types: A list of `DTypes`, same length as 1654 `feature_list_dense_keys`. Only `tf.float32` (`FloatList`), 1655 `tf.int64` (`Int64List`), and `tf.string` (`BytesList`) are supported. 1656 feature_list_dense_shapes: A list of tuples, same length as 1657 `feature_list_dense_keys`. The shape of the data for each 1658 `FeatureList` feature referenced by `feature_list_dense_keys`. 1659 feature_list_dense_defaults: A dict mapping key strings to values. 1660 The only currently allowed value is `None`. Any key appearing 1661 in this dict with value `None` is allowed to be missing from the 1662 `SequenceExample`. If missing, the key is treated as zero-length. 1663 debug_name: A scalar (0-D Tensor) of strings (optional), the name of 1664 the serialized proto. 1665 name: A name for this operation (optional). 1666 1667 Returns: 1668 A tuple of two `dict`s, each mapping keys to `Tensor`s and `SparseTensor`s. 1669 The first dict contains the context key/values. 1670 The second dict contains the feature_list key/values. 1671 1672 Raises: 1673 ValueError: If context_sparse and context_dense key sets intersect, 1674 if input lengths do not match up, or if a value in 1675 feature_list_dense_defaults is not None. 1676 TypeError: if feature_list_dense_defaults is not either None or a dict. 
1677 """ 1678 with ops.name_scope(name, "ParseSingleSequenceExample", [serialized]): 1679 context_dense_defaults = ( 1680 {} if context_dense_defaults is None else context_dense_defaults) 1681 context_sparse_keys = ( 1682 [] if context_sparse_keys is None else context_sparse_keys) 1683 context_sparse_types = ( 1684 [] if context_sparse_types is None else context_sparse_types) 1685 context_dense_keys = ( 1686 [] if context_dense_keys is None else context_dense_keys) 1687 context_dense_types = ( 1688 [] if context_dense_types is None else context_dense_types) 1689 context_dense_shapes = ( 1690 [[]] * len(context_dense_keys) 1691 if context_dense_shapes is None else context_dense_shapes) 1692 feature_list_sparse_keys = ( 1693 [] if feature_list_sparse_keys is None else feature_list_sparse_keys) 1694 feature_list_sparse_types = ( 1695 [] if feature_list_sparse_types is None else feature_list_sparse_types) 1696 feature_list_dense_keys = ( 1697 [] if feature_list_dense_keys is None else feature_list_dense_keys) 1698 feature_list_dense_types = ( 1699 [] if feature_list_dense_types is None else feature_list_dense_types) 1700 feature_list_dense_shapes = ( 1701 [[]] * len(feature_list_dense_keys) 1702 if feature_list_dense_shapes is None else feature_list_dense_shapes) 1703 feature_list_dense_defaults = ( 1704 dict() if feature_list_dense_defaults is None 1705 else feature_list_dense_defaults) 1706 debug_name = "" if debug_name is None else debug_name 1707 1708 # Internal 1709 feature_list_dense_missing_assumed_empty = [] 1710 1711 num_context_dense = len(context_dense_keys) 1712 num_feature_list_dense = len(feature_list_dense_keys) 1713 num_context_sparse = len(context_sparse_keys) 1714 num_feature_list_sparse = len(feature_list_sparse_keys) 1715 1716 if len(context_dense_shapes) != num_context_dense: 1717 raise ValueError( 1718 "len(context_dense_shapes) != len(context_dense_keys): %d vs. 
%d" 1719 % (len(context_dense_shapes), num_context_dense)) 1720 if len(context_dense_types) != num_context_dense: 1721 raise ValueError( 1722 "len(context_dense_types) != len(num_context_dense): %d vs. %d" 1723 % (len(context_dense_types), num_context_dense)) 1724 if len(feature_list_dense_shapes) != num_feature_list_dense: 1725 raise ValueError( 1726 "len(feature_list_dense_shapes) != len(feature_list_dense_keys): " 1727 "%d vs. %d" % (len(feature_list_dense_shapes), 1728 num_feature_list_dense)) 1729 if len(feature_list_dense_types) != num_feature_list_dense: 1730 raise ValueError( 1731 "len(feature_list_dense_types) != len(num_feature_list_dense):" 1732 "%d vs. %d" % (len(feature_list_dense_types), num_feature_list_dense)) 1733 if len(context_sparse_types) != num_context_sparse: 1734 raise ValueError( 1735 "len(context_sparse_types) != len(context_sparse_keys): %d vs. %d" 1736 % (len(context_sparse_types), num_context_sparse)) 1737 if len(feature_list_sparse_types) != num_feature_list_sparse: 1738 raise ValueError( 1739 "len(feature_list_sparse_types) != len(feature_list_sparse_keys): " 1740 "%d vs. 
%d" 1741 % (len(feature_list_sparse_types), num_feature_list_sparse)) 1742 if (num_context_dense + num_context_sparse 1743 + num_feature_list_dense + num_feature_list_sparse) == 0: 1744 raise ValueError( 1745 "Must provide at least one context_sparse key, context_dense key, " 1746 ", feature_list_sparse key, or feature_list_dense key") 1747 if not set(context_dense_keys).isdisjoint(set(context_sparse_keys)): 1748 raise ValueError( 1749 "context_dense and context_sparse keys must not intersect; " 1750 "intersection: %s" % 1751 set(context_dense_keys).intersection(set(context_sparse_keys))) 1752 if not set(feature_list_dense_keys).isdisjoint( 1753 set(feature_list_sparse_keys)): 1754 raise ValueError( 1755 "feature_list_dense and feature_list_sparse keys must not intersect; " 1756 "intersection: %s" % 1757 set(feature_list_dense_keys).intersection( 1758 set(feature_list_sparse_keys))) 1759 if not isinstance(feature_list_dense_defaults, dict): 1760 raise TypeError("feature_list_dense_defaults must be a dict") 1761 for k, v in feature_list_dense_defaults.items(): 1762 if v is not None: 1763 raise ValueError("Value feature_list_dense_defaults[%s] must be None" 1764 % k) 1765 feature_list_dense_missing_assumed_empty.append(k) 1766 1767 context_dense_defaults_vec = [] 1768 for i, key in enumerate(context_dense_keys): 1769 default_value = context_dense_defaults.get(key) 1770 if default_value is None: 1771 default_value = constant_op.constant([], dtype=context_dense_types[i]) 1772 elif not isinstance(default_value, ops.Tensor): 1773 key_name = "key_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key) 1774 default_value = ops.convert_to_tensor( 1775 default_value, dtype=context_dense_types[i], name=key_name) 1776 default_value = array_ops.reshape( 1777 default_value, context_dense_shapes[i]) 1778 1779 context_dense_defaults_vec.append(default_value) 1780 1781 context_dense_shapes = [tensor_shape.as_shape(shape).as_proto() 1782 for shape in context_dense_shapes] 1783 
feature_list_dense_shapes = [tensor_shape.as_shape(shape).as_proto() 1784 for shape in feature_list_dense_shapes] 1785 1786 outputs = gen_parsing_ops.parse_single_sequence_example( 1787 serialized=serialized, 1788 debug_name=debug_name, 1789 context_dense_defaults=context_dense_defaults_vec, 1790 context_sparse_keys=context_sparse_keys, 1791 context_sparse_types=context_sparse_types, 1792 context_dense_keys=context_dense_keys, 1793 context_dense_shapes=context_dense_shapes, 1794 feature_list_sparse_keys=feature_list_sparse_keys, 1795 feature_list_sparse_types=feature_list_sparse_types, 1796 feature_list_dense_keys=feature_list_dense_keys, 1797 feature_list_dense_types=feature_list_dense_types, 1798 feature_list_dense_shapes=feature_list_dense_shapes, 1799 feature_list_dense_missing_assumed_empty=( 1800 feature_list_dense_missing_assumed_empty), 1801 name=name) 1802 1803 (context_sparse_indices, context_sparse_values, 1804 context_sparse_shapes, context_dense_values, 1805 feature_list_sparse_indices, feature_list_sparse_values, 1806 feature_list_sparse_shapes, feature_list_dense_values) = outputs 1807 1808 context_sparse_tensors = [ 1809 sparse_tensor.SparseTensor(ix, val, shape) for (ix, val, shape) 1810 in zip(context_sparse_indices, 1811 context_sparse_values, 1812 context_sparse_shapes)] 1813 1814 feature_list_sparse_tensors = [ 1815 sparse_tensor.SparseTensor(ix, val, shape) for (ix, val, shape) 1816 in zip(feature_list_sparse_indices, 1817 feature_list_sparse_values, 1818 feature_list_sparse_shapes)] 1819 1820 context_output = dict( 1821 zip(context_sparse_keys + context_dense_keys, 1822 context_sparse_tensors + context_dense_values)) 1823 feature_list_output = dict( 1824 zip(feature_list_sparse_keys + feature_list_dense_keys, 1825 feature_list_sparse_tensors + feature_list_dense_values)) 1826 1827 return (context_output, feature_list_output) 1828 1829 1830# Swap `name` and `na_value` for backward compatibility. 
@tf_export(v1=["io.decode_csv", "decode_csv"])
@deprecation.deprecated_endpoints("decode_csv")
def decode_csv(records,
               record_defaults,
               field_delim=",",
               use_quote_delim=True,
               name=None,
               na_value="",
               select_cols=None):
  """Convert CSV records to tensors. Each column maps to one tensor.

  RFC 4180 format is expected for the CSV records.
  (https://tools.ietf.org/html/rfc4180)
  Note that we allow leading and trailing spaces with int or float field.

  This is the v1 endpoint: it takes `name` before `na_value` and otherwise
  delegates to `decode_csv_v2`.

  Args:
    records: A `Tensor` of type `string`.
      Each string is a record/row in the csv and all records should have
      the same format.
    record_defaults: A list of `Tensor` objects with specific types.
      Acceptable types are `float32`, `float64`, `int32`, `int64`, `string`.
      One tensor per column of the input record, with either a
      scalar default value for that column or an empty vector if the column is
      required.
    field_delim: An optional `string`. Defaults to `","`.
      char delimiter to separate fields in a record.
    use_quote_delim: An optional `bool`. Defaults to `True`.
      If false, treats double quotation marks as regular
      characters inside of the string fields (ignoring RFC 4180, Section 2,
      Bullet 5).
    name: A name for the operation (optional).
    na_value: Additional string to recognize as NA/NaN.
    select_cols: Optional sorted list of column indices to select. If specified,
      only this subset of columns will be parsed and returned.

  Returns:
    A list of `Tensor` objects. Has the same type as `record_defaults`.
    Each tensor will have the same shape as records.

  Raises:
    ValueError: If any of the arguments is malformed.
  """
  # Pass by keyword so the differing parameter order of the two endpoints
  # cannot silently mis-route `name` / `na_value`.
  return decode_csv_v2(
      records, record_defaults,
      field_delim=field_delim,
      use_quote_delim=use_quote_delim,
      na_value=na_value,
      select_cols=select_cols,
      name=name)


@tf_export("io.decode_csv", v1=[])
def decode_csv_v2(records,
                  record_defaults,
                  field_delim=",",
                  use_quote_delim=True,
                  na_value="",
                  select_cols=None,
                  name=None):
  """Convert CSV records to tensors. Each column maps to one tensor.

  RFC 4180 format is expected for the CSV records.
  (https://tools.ietf.org/html/rfc4180)
  Note that we allow leading and trailing spaces with int or float field.

  Args:
    records: A `Tensor` of type `string`.
      Each string is a record/row in the csv and all records should have
      the same format.
    record_defaults: A list of `Tensor` objects with specific types.
      Acceptable types are `float32`, `float64`, `int32`, `int64`, `string`.
      One tensor per column of the input record, with either a
      scalar default value for that column or an empty vector if the column is
      required.
    field_delim: An optional `string`. Defaults to `","`.
      char delimiter to separate fields in a record.
    use_quote_delim: An optional `bool`. Defaults to `True`.
      If false, treats double quotation marks as regular
      characters inside of the string fields (ignoring RFC 4180, Section 2,
      Bullet 5).
    na_value: Additional string to recognize as NA/NaN.
    select_cols: Optional sorted list of column indices to select. If specified,
      only this subset of columns will be parsed and returned.
    name: A name for the operation (optional).

  Returns:
    A list of `Tensor` objects. Has the same type as `record_defaults`.
    Each tensor will have the same shape as records.

  Raises:
    ValueError: If `select_cols` is not strictly increasing, contains a
      negative value, or differs in length from `record_defaults`.
  """
  if select_cols is not None:
    if any(select_cols[i] >= select_cols[i + 1]
           for i in range(len(select_cols) - 1)):
      raise ValueError("select_cols is not strictly increasing.")
    # The list is strictly increasing at this point, so checking the first
    # element is enough.  Guard against an empty list so that it is reported
    # through the length check below (ValueError) instead of an IndexError.
    if len(select_cols) > 0 and select_cols[0] < 0:
      raise ValueError("select_cols contains negative values.")
    if len(select_cols) != len(record_defaults):
      raise ValueError(
          "Length of select_cols and record_defaults do not match.")
  return gen_parsing_ops.decode_csv(
      records=records,
      record_defaults=record_defaults,
      field_delim=field_delim,
      use_quote_delim=use_quote_delim,
      na_value=na_value,
      name=name,
      select_cols=select_cols,
  )


# TODO(b/70890287): Combine the implementation of this op and
# `parse_single_example()` after 1/10/2018.
def parse_single_example_v2(serialized, features, name=None):
  # pylint: disable=line-too-long
  """Parses an `Example` proto into a `dict` of tensors.

  Parses a serialized
  [`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
  proto given in `serialized`.

  This op parses serialized examples into a dictionary mapping keys to `Tensor`
  and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`,
  `SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature`
  and `SparseFeature` is mapped to a `SparseTensor`, and each
  `FixedLenFeature` is mapped to a `Tensor`.

  Each `VarLenFeature` maps to a `SparseTensor` of the specified type
  representing a ragged matrix. Its indices are `[index]` where
  `index` is the value's index in the list of values associated with
  that feature and example.

  Each `SparseFeature` maps to a `SparseTensor` of the specified type
  representing a Tensor of `dense_shape` `SparseFeature.size`.
  Its `values` come from the feature in the examples with key `value_key`.
  A `values[i]` comes from a position `k` in the feature of an example at batch
  entry `batch`. This positional information is recorded in `indices[i]` as
  `[batch, index_0, index_1, ...]` where `index_j` is the `k-th` value of
  the feature in the example with key `SparseFeature.index_key[j]`.
  In other words, we split the indices (except the first index indicating the
  batch entry) of a `SparseTensor` by dimension into different features of the
  `Example`. Due to its complexity a `VarLenFeature` should be preferred over a
  `SparseFeature` whenever possible.

  Each `FixedLenFeature` `df` maps to a `Tensor` of the specified type (or
  `tf.float32` if not specified) and shape `df.shape`.

  `FixedLenFeature` entries with a `default_value` are optional. With no default
  value, we will fail if that `Feature` is missing from any example in
  `serialized`.

  Each `FixedLenSequenceFeature` `df` maps to a `Tensor` of the specified type
  (or `tf.float32` if not specified) and shape `(None,) + df.shape`.

  Args:
    serialized: A scalar (0-D Tensor) string, a serialized `Example` proto.
    features: A `dict` mapping feature keys to `FixedLenFeature`,
      `FixedLenSequenceFeature`, `VarLenFeature`, and `SparseFeature` values.
    name: A name for this operation (optional).

  Returns:
    A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.

  Raises:
    ValueError: if any feature is invalid.
  """
  # pylint: enable=line-too-long
  if not features:
    raise ValueError("Missing: features was %s." % features)
  features = _prepend_none_dimension(features)
  # Flatten the feature configs into the parallel lists the raw parser takes.
  (sparse_keys, sparse_types, dense_keys, dense_types,
   dense_defaults, dense_shapes) = _features_to_raw_params(
       features,
       [VarLenFeature, SparseFeature, FixedLenFeature, FixedLenSequenceFeature])
  outputs = _parse_single_example_v2_raw(serialized, sparse_keys, sparse_types,
                                         dense_keys, dense_types,
                                         dense_defaults, dense_shapes, name)
  return _construct_sparse_tensors_for_sparse_features(features, outputs)


def _parse_single_example_v2_raw(serialized, sparse_keys, sparse_types,
                                 dense_keys, dense_types, dense_defaults,
                                 dense_shapes, name):
  """Parses a single `Example` proto.

  Args:
    serialized: A scalar (0-D Tensor) string, containing a binary
      serialized `Example` proto.
    sparse_keys: A list of string keys in the examples' features.
      The results for these keys will be returned as `SparseTensor` objects.
    sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
      Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
      and `tf.string` (`BytesList`) are supported.
    dense_keys: A list of string keys in the examples' features.
      The results for these keys will be returned as `Tensor`s.
    dense_types: A list of DTypes of the same length as `dense_keys`.
      Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
      and `tf.string` (`BytesList`) are supported.
    dense_defaults: A dict mapping string keys to `Tensor`s.
      The keys of the dict must match the dense_keys of the feature.
    dense_shapes: A list of tuples with the same length as `dense_keys`.
      The shape of the data for each dense feature referenced by `dense_keys`.
      Required for any input tensors identified by `dense_keys`.  Must be
      either fully defined, or may contain an unknown first dimension.
      An unknown first dimension means the feature is treated as having
      a variable number of blocks, and the output shape along this dimension
      is considered unknown at graph build time.  Padding is applied for
      minibatch elements smaller than the maximum number of blocks for the
      given feature along this dimension.
    name: A name for this operation (optional).

  Returns:
    A `dict` mapping keys to `Tensor`s and `SparseTensor`s.

  Raises:
    ValueError: If sparse and dense key sets intersect, if no keys at all are
      given, or if input lengths do not match up.
  """
  with ops.name_scope(name, "ParseSingleExample", [serialized]):
    serialized = ops.convert_to_tensor(serialized, name="serialized")
    # Normalize every omitted argument to its empty equivalent.
    dense_defaults = collections.OrderedDict(
    ) if dense_defaults is None else dense_defaults
    sparse_keys = [] if sparse_keys is None else sparse_keys
    sparse_types = [] if sparse_types is None else sparse_types
    dense_keys = [] if dense_keys is None else dense_keys
    dense_types = [] if dense_types is None else dense_types
    # Missing shapes default to scalar (`[]`) for every dense key.
    dense_shapes = ([[]] * len(dense_keys)
                    if dense_shapes is None else dense_shapes)

    num_dense = len(dense_keys)
    num_sparse = len(sparse_keys)

    if len(dense_shapes) != num_dense:
      raise ValueError("len(dense_shapes) != len(dense_keys): %d vs. %d" %
                       (len(dense_shapes), num_dense))
    if len(dense_types) != num_dense:
      raise ValueError("len(dense_types) != len(dense_keys): %d vs. %d" %
                       (len(dense_types), num_dense))
    if len(sparse_types) != num_sparse:
      raise ValueError("len(sparse_types) != len(sparse_keys): %d vs. %d" %
                       (len(sparse_types), num_sparse))
    if num_dense + num_sparse == 0:
      raise ValueError("Must provide at least one sparse key or dense key")
    # A key may be parsed as dense or sparse, never both.
    if not set(dense_keys).isdisjoint(set(sparse_keys)):
      raise ValueError(
          "Dense and sparse keys must not intersect; intersection: %s" %
          set(dense_keys).intersection(set(sparse_keys)))

    # Convert dense_shapes to TensorShape object.
    dense_shapes = [tensor_shape.as_shape(shape) for shape in dense_shapes]

    dense_defaults_vec = []
    for i, key in enumerate(dense_keys):
      default_value = dense_defaults.get(key)
      dense_shape = dense_shapes[i]
      if (dense_shape.ndims is not None and dense_shape.ndims > 0 and
          dense_shape.dims[0].value is None):
        # Variable stride dense shape, the default value should be a
        # scalar padding value
        if default_value is None:
          default_value = ops.convert_to_tensor(
              "" if dense_types[i] == dtypes.string else 0,
              dtype=dense_types[i])
        else:
          # Reshape to a scalar to ensure user gets an error if they
          # provide a tensor that's not intended to be a padding value
          # (0 or 2+ elements).
          key_name = "padding_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key)
          default_value = ops.convert_to_tensor(
              default_value, dtype=dense_types[i], name=key_name)
          default_value = array_ops.reshape(default_value, [])
      else:
        if default_value is None:
          # No default: pass an empty tensor of the right dtype.
          default_value = constant_op.constant([], dtype=dense_types[i])
        elif not isinstance(default_value, ops.Tensor):
          key_name = "key_" + re.sub("[^A-Za-z0-9_.\\-/]", "_", key)
          default_value = ops.convert_to_tensor(
              default_value, dtype=dense_types[i], name=key_name)
          # Only freshly converted defaults are reshaped; a default that is
          # already a Tensor is passed through unchanged.
          default_value = array_ops.reshape(default_value, dense_shape)

      dense_defaults_vec.append(default_value)

    # Finally, convert dense_shapes to TensorShapeProto
    dense_shapes = [shape.as_proto() for shape in dense_shapes]

    outputs = gen_parsing_ops.parse_single_example(
        serialized=serialized,
        dense_defaults=dense_defaults_vec,
        num_sparse=num_sparse,
        sparse_keys=sparse_keys,
        sparse_types=sparse_types,
        dense_keys=dense_keys,
        dense_shapes=dense_shapes,
        name=name)

    (sparse_indices, sparse_values, sparse_shapes, dense_values) = outputs

    # Reassemble each (indices, values, shape) triple into a SparseTensor.
    sparse_tensors = [
        sparse_tensor.SparseTensor(ix, val, shape)
        for (ix, val,
             shape) in zip(sparse_indices, sparse_values, sparse_shapes)
    ]

    # Sparse results come first, matching the key concatenation order.
    return dict(zip(sparse_keys + dense_keys, sparse_tensors + dense_values))