1# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15 16"""Parsing Ops.""" 17from __future__ import absolute_import 18from __future__ import division 19from __future__ import print_function 20 21from tensorflow.python.framework import ops 22from tensorflow.python.framework import sparse_tensor 23from tensorflow.python.ops import array_ops 24from tensorflow.python.ops import control_flow_ops 25from tensorflow.python.ops import gen_parsing_ops 26from tensorflow.python.ops import math_ops 27from tensorflow.python.ops import parsing_config 28# go/tf-wildcard-import 29# pylint: disable=wildcard-import,undefined-variable 30from tensorflow.python.ops.gen_parsing_ops import * 31# pylint: enable=wildcard-import,undefined-variable 32from tensorflow.python.util import deprecation 33from tensorflow.python.util.tf_export import tf_export 34 35 36ops.NotDifferentiable("DecodeRaw") 37ops.NotDifferentiable("DecodePaddedRaw") 38ops.NotDifferentiable("ParseTensor") 39ops.NotDifferentiable("SerializeTensor") 40ops.NotDifferentiable("StringToNumber") 41 42 43VarLenFeature = parsing_config.VarLenFeature 44RaggedFeature = parsing_config.RaggedFeature 45SparseFeature = parsing_config.SparseFeature 46FixedLenFeature = parsing_config.FixedLenFeature 47FixedLenSequenceFeature = parsing_config.FixedLenSequenceFeature 48# pylint: 
disable=protected-access 49_ParseOpParams = parsing_config._ParseOpParams 50_construct_tensors_for_composite_features = ( 51 parsing_config._construct_tensors_for_composite_features) 52# pylint: enable=protected-access 53 54 55# TODO(b/122887740) Switch files that use this private symbol to use new name. 56_construct_sparse_tensors_for_sparse_features = \ 57 _construct_tensors_for_composite_features 58 59 60def _prepend_none_dimension(features): 61 """Returns a copy of features with adjusted FixedLenSequenceFeature shapes.""" 62 if features: 63 modified_features = dict(features) # Create a copy to modify 64 for key, feature in features.items(): 65 if isinstance(feature, FixedLenSequenceFeature): 66 if not feature.allow_missing: 67 raise ValueError("Unsupported: FixedLenSequenceFeature requires " 68 "allow_missing to be True.") 69 modified_features[key] = FixedLenSequenceFeature( 70 [None] + list(feature.shape), 71 feature.dtype, 72 feature.allow_missing, 73 feature.default_value) 74 return modified_features 75 else: 76 return features 77 78 79@tf_export("io.parse_example", v1=[]) 80def parse_example_v2(serialized, features, example_names=None, name=None): 81 # pylint: disable=line-too-long 82 """Parses `Example` protos into a `dict` of tensors. 83 84 Parses a number of serialized [`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto) 85 protos given in `serialized`. We refer to `serialized` as a batch with 86 `batch_size` many entries of individual `Example` protos. 87 88 `example_names` may contain descriptive names for the corresponding serialized 89 protos. These may be useful for debugging purposes, but they have no effect on 90 the output. If not `None`, `example_names` must be the same length as 91 `serialized`. 92 93 This op parses serialized examples into a dictionary mapping keys to `Tensor` 94 `SparseTensor`, and `RaggedTensor` objects. 
`features` is a dict from keys to 95 `VarLenFeature`, `SparseFeature`, `RaggedFeature`, and `FixedLenFeature` 96 objects. Each `VarLenFeature` and `SparseFeature` is mapped to a 97 `SparseTensor`; each `FixedLenFeature` is mapped to a `Tensor`; and each 98 `RaggedFeature` is mapped to a `RaggedTensor`. 99 100 Each `VarLenFeature` maps to a `SparseTensor` of the specified type 101 representing a ragged matrix. Its indices are `[batch, index]` where `batch` 102 identifies the example in `serialized`, and `index` is the value's index in 103 the list of values associated with that feature and example. 104 105 Each `SparseFeature` maps to a `SparseTensor` of the specified type 106 representing a Tensor of `dense_shape` `[batch_size] + SparseFeature.size`. 107 Its `values` come from the feature in the examples with key `value_key`. 108 A `values[i]` comes from a position `k` in the feature of an example at batch 109 entry `batch`. This positional information is recorded in `indices[i]` as 110 `[batch, index_0, index_1, ...]` where `index_j` is the `k-th` value of 111 the feature in the example at with key `SparseFeature.index_key[j]`. 112 In other words, we split the indices (except the first index indicating the 113 batch entry) of a `SparseTensor` by dimension into different features of the 114 `Example`. Due to its complexity a `VarLenFeature` should be preferred over a 115 `SparseFeature` whenever possible. 116 117 Each `FixedLenFeature` `df` maps to a `Tensor` of the specified type (or 118 `tf.float32` if not specified) and shape `(serialized.size(),) + df.shape`. 119 120 `FixedLenFeature` entries with a `default_value` are optional. With no default 121 value, we will fail if that `Feature` is missing from any example in 122 `serialized`. 123 124 Each `FixedLenSequenceFeature` `df` maps to a `Tensor` of the specified type 125 (or `tf.float32` if not specified) and shape 126 `(serialized.size(), None) + df.shape`. 
127 All examples in `serialized` will be padded with `default_value` along the 128 second dimension. 129 130 Each `RaggedFeature` maps to a `RaggedTensor` of the specified type. It 131 is formed by stacking the `RaggedTensor` for each example, where the 132 `RaggedTensor` for each individual example is constructed using the tensors 133 specified by `RaggedTensor.values_key` and `RaggedTensor.partition`. See 134 the `tf.io.RaggedFeature` documentation for details and examples. 135 136 Examples: 137 138 For example, if one expects a `tf.float32` `VarLenFeature` `ft` and three 139 serialized `Example`s are provided: 140 141 ``` 142 serialized = [ 143 features 144 { feature { key: "ft" value { float_list { value: [1.0, 2.0] } } } }, 145 features 146 { feature []}, 147 features 148 { feature { key: "ft" value { float_list { value: [3.0] } } } 149 ] 150 ``` 151 152 then the output will look like: 153 154 ```python 155 {"ft": SparseTensor(indices=[[0, 0], [0, 1], [2, 0]], 156 values=[1.0, 2.0, 3.0], 157 dense_shape=(3, 2)) } 158 ``` 159 160 If instead a `FixedLenSequenceFeature` with `default_value = -1.0` and 161 `shape=[]` is used then the output will look like: 162 163 ```python 164 {"ft": [[1.0, 2.0], [3.0, -1.0]]} 165 ``` 166 167 Given two `Example` input protos in `serialized`: 168 169 ``` 170 [ 171 features { 172 feature { key: "kw" value { bytes_list { value: [ "knit", "big" ] } } } 173 feature { key: "gps" value { float_list { value: [] } } } 174 }, 175 features { 176 feature { key: "kw" value { bytes_list { value: [ "emmy" ] } } } 177 feature { key: "dank" value { int64_list { value: [ 42 ] } } } 178 feature { key: "gps" value { } } 179 } 180 ] 181 ``` 182 183 And arguments 184 185 ``` 186 example_names: ["input0", "input1"], 187 features: { 188 "kw": VarLenFeature(tf.string), 189 "dank": VarLenFeature(tf.int64), 190 "gps": VarLenFeature(tf.float32), 191 } 192 ``` 193 194 Then the output is a dictionary: 195 196 ```python 197 { 198 "kw": SparseTensor( 199 
indices=[[0, 0], [0, 1], [1, 0]], 200 values=["knit", "big", "emmy"] 201 dense_shape=[2, 2]), 202 "dank": SparseTensor( 203 indices=[[1, 0]], 204 values=[42], 205 dense_shape=[2, 1]), 206 "gps": SparseTensor( 207 indices=[], 208 values=[], 209 dense_shape=[2, 0]), 210 } 211 ``` 212 213 For dense results in two serialized `Example`s: 214 215 ``` 216 [ 217 features { 218 feature { key: "age" value { int64_list { value: [ 0 ] } } } 219 feature { key: "gender" value { bytes_list { value: [ "f" ] } } } 220 }, 221 features { 222 feature { key: "age" value { int64_list { value: [] } } } 223 feature { key: "gender" value { bytes_list { value: [ "f" ] } } } 224 } 225 ] 226 ``` 227 228 We can use arguments: 229 230 ``` 231 example_names: ["input0", "input1"], 232 features: { 233 "age": FixedLenFeature([], dtype=tf.int64, default_value=-1), 234 "gender": FixedLenFeature([], dtype=tf.string), 235 } 236 ``` 237 238 And the expected output is: 239 240 ```python 241 { 242 "age": [[0], [-1]], 243 "gender": [["f"], ["f"]], 244 } 245 ``` 246 247 An alternative to `VarLenFeature` to obtain a `SparseTensor` is 248 `SparseFeature`. 
For example, given two `Example` input protos in 249 `serialized`: 250 251 ``` 252 [ 253 features { 254 feature { key: "val" value { float_list { value: [ 0.5, -1.0 ] } } } 255 feature { key: "ix" value { int64_list { value: [ 3, 20 ] } } } 256 }, 257 features { 258 feature { key: "val" value { float_list { value: [ 0.0 ] } } } 259 feature { key: "ix" value { int64_list { value: [ 42 ] } } } 260 } 261 ] 262 ``` 263 264 And arguments 265 266 ``` 267 example_names: ["input0", "input1"], 268 features: { 269 "sparse": SparseFeature( 270 index_key="ix", value_key="val", dtype=tf.float32, size=100), 271 } 272 ``` 273 274 Then the output is a dictionary: 275 276 ```python 277 { 278 "sparse": SparseTensor( 279 indices=[[0, 3], [0, 20], [1, 42]], 280 values=[0.5, -1.0, 0.0] 281 dense_shape=[2, 100]), 282 } 283 ``` 284 285 See the `tf.io.RaggedFeature` documentation for examples showing how 286 `RaggedFeature` can be used to obtain `RaggedTensor`s. 287 288 Args: 289 serialized: A vector (1-D Tensor) of strings, a batch of binary 290 serialized `Example` protos. 291 features: A `dict` mapping feature keys to `FixedLenFeature`, 292 `VarLenFeature`, `SparseFeature`, and `RaggedFeature` values. 293 example_names: A vector (1-D Tensor) of strings (optional), the names of 294 the serialized protos in the batch. 295 name: A name for this operation (optional). 296 297 Returns: 298 A `dict` mapping feature keys to `Tensor`, `SparseTensor`, and 299 `RaggedTensor` values. 300 301 Raises: 302 ValueError: if any feature is invalid. 303 """ 304 if not features: 305 raise ValueError("Missing: features was %s." 
% features) 306 features = _prepend_none_dimension(features) 307 params = _ParseOpParams.from_features(features, [ 308 VarLenFeature, SparseFeature, FixedLenFeature, FixedLenSequenceFeature, 309 RaggedFeature 310 ]) 311 312 outputs = _parse_example_raw(serialized, example_names, params, name=name) 313 return _construct_tensors_for_composite_features(features, outputs) 314 315 316@tf_export(v1=["io.parse_example", "parse_example"]) 317def parse_example(serialized, features, name=None, example_names=None): 318 return parse_example_v2(serialized, features, example_names, name) 319 320 321parse_example.__doc__ = parse_example_v2.__doc__ 322 323 324def _parse_example_raw(serialized, names, params, name): 325 """Parses `Example` protos. 326 327 Args: 328 serialized: A vector (1-D Tensor) of strings, a batch of binary 329 serialized `Example` protos. 330 names: A vector (1-D Tensor) of strings (optional), the names of 331 the serialized protos. 332 params: A `ParseOpParams` containing the parameters for the parse op. 333 name: A name for this operation (optional). 334 335 Returns: 336 A `dict` mapping keys to `Tensor`s and `SparseTensor`s and `RaggedTensor`s. 
337 338 """ 339 if params.num_features == 0: 340 raise ValueError("Must provide at least one feature key") 341 with ops.name_scope(name, "ParseExample", [serialized, names]): 342 names = [] if names is None else names 343 serialized = ops.convert_to_tensor(serialized, name="serialized") 344 if params.ragged_keys and serialized.shape.ndims is None: 345 raise ValueError("serialized must have statically-known rank to " 346 "parse ragged features.") 347 outputs = gen_parsing_ops.parse_example_v2( 348 serialized=serialized, 349 names=names, 350 sparse_keys=params.sparse_keys, 351 dense_keys=params.dense_keys, 352 ragged_keys=params.ragged_keys, 353 dense_defaults=params.dense_defaults_vec, 354 num_sparse=len(params.sparse_keys), 355 sparse_types=params.sparse_types, 356 ragged_value_types=params.ragged_value_types, 357 ragged_split_types=params.ragged_split_types, 358 dense_shapes=params.dense_shapes_as_proto, 359 name=name) 360 (sparse_indices, sparse_values, sparse_shapes, dense_values, 361 ragged_values, ragged_row_splits) = outputs 362 # pylint: disable=protected-access 363 ragged_tensors = parsing_config._build_ragged_tensors( 364 serialized.shape, ragged_values, ragged_row_splits) 365 366 sparse_tensors = [ 367 sparse_tensor.SparseTensor(ix, val, shape) for (ix, val, shape) 368 in zip(sparse_indices, sparse_values, sparse_shapes)] 369 370 return dict( 371 zip(params.sparse_keys + params.dense_keys + params.ragged_keys, 372 sparse_tensors + dense_values + ragged_tensors)) 373 374 375@tf_export(v1=["io.parse_single_example", "parse_single_example"]) 376def parse_single_example(serialized, features, name=None, example_names=None): 377 """Parses a single `Example` proto. 378 379 Similar to `parse_example`, except: 380 381 For dense tensors, the returned `Tensor` is identical to the output of 382 `parse_example`, except there is no batch dimension, the output shape is the 383 same as the shape given in `dense_shape`. 
384 385 For `SparseTensor`s, the first (batch) column of the indices matrix is removed 386 (the indices matrix is a column vector), the values vector is unchanged, and 387 the first (`batch_size`) entry of the shape vector is removed (it is now a 388 single element vector). 389 390 One might see performance advantages by batching `Example` protos with 391 `parse_example` instead of using this function directly. 392 393 Args: 394 serialized: A scalar string Tensor, a single serialized Example. 395 features: A `dict` mapping feature keys to `FixedLenFeature` or 396 `VarLenFeature` values. 397 name: A name for this operation (optional). 398 example_names: (Optional) A scalar string Tensor, the associated name. 399 400 Returns: 401 A `dict` mapping feature keys to `Tensor` and `SparseTensor` values. 402 403 Raises: 404 ValueError: if any feature is invalid. 405 """ 406 return parse_single_example_v2(serialized, features, example_names, name) 407 408 409@tf_export("io.parse_single_example", v1=[]) 410def parse_single_example_v2( 411 serialized, features, example_names=None, name=None 412 ): 413 """Parses a single `Example` proto. 414 415 Similar to `parse_example`, except: 416 417 For dense tensors, the returned `Tensor` is identical to the output of 418 `parse_example`, except there is no batch dimension, the output shape is the 419 same as the shape given in `dense_shape`. 420 421 For `SparseTensor`s, the first (batch) column of the indices matrix is removed 422 (the indices matrix is a column vector), the values vector is unchanged, and 423 the first (`batch_size`) entry of the shape vector is removed (it is now a 424 single element vector). 425 426 One might see performance advantages by batching `Example` protos with 427 `parse_example` instead of using this function directly. 428 429 Args: 430 serialized: A scalar string Tensor, a single serialized Example. 431 features: A `dict` mapping feature keys to `FixedLenFeature` or 432 `VarLenFeature` values. 
433 example_names: (Optional) A scalar string Tensor, the associated name. 434 name: A name for this operation (optional). 435 436 Returns: 437 A `dict` mapping feature keys to `Tensor` and `SparseTensor` values. 438 439 Raises: 440 ValueError: if any feature is invalid. 441 """ 442 if not features: 443 raise ValueError("Missing features.") 444 with ops.name_scope(name, "ParseSingleExample", [serialized, example_names]): 445 serialized = ops.convert_to_tensor(serialized, name="serialized") 446 serialized = _assert_scalar(serialized, "serialized") 447 return parse_example_v2(serialized, features, example_names, name) 448 449 450@tf_export("io.parse_sequence_example") 451def parse_sequence_example(serialized, 452 context_features=None, 453 sequence_features=None, 454 example_names=None, 455 name=None): 456 # pylint: disable=line-too-long 457 """Parses a batch of `SequenceExample` protos. 458 459 Parses a vector of serialized 460 [`SequenceExample`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto) 461 protos given in `serialized`. 462 463 This op parses serialized sequence examples into a tuple of dictionaries, 464 each mapping keys to `Tensor` and `SparseTensor` objects. 465 The first dictionary contains mappings for keys appearing in 466 `context_features`, and the second dictionary contains mappings for keys 467 appearing in `sequence_features`. 468 469 At least one of `context_features` and `sequence_features` must be provided 470 and non-empty. 471 472 The `context_features` keys are associated with a `SequenceExample` as a 473 whole, independent of time / frame. In contrast, the `sequence_features` keys 474 provide a way to access variable-length data within the `FeatureList` section 475 of the `SequenceExample` proto. 
While the shapes of `context_features` values 476 are fixed with respect to frame, the frame dimension (the first dimension) 477 of `sequence_features` values may vary between `SequenceExample` protos, 478 and even between `feature_list` keys within the same `SequenceExample`. 479 480 `context_features` contains `VarLenFeature`, `RaggedFeature`, and 481 `FixedLenFeature` objects. Each `VarLenFeature` is mapped to a 482 `SparseTensor`; each `RaggedFeature` is mapped to a `RaggedTensor`; and each 483 `FixedLenFeature` is mapped to a `Tensor`, of the specified type, shape, and 484 default value. 485 486 `sequence_features` contains `VarLenFeature`, `RaggedFeature`, and 487 `FixedLenSequenceFeature` objects. Each `VarLenFeature` is mapped to a 488 `SparseTensor`; each `RaggedFeature` is mapped to a `RaggedTensor; and 489 each `FixedLenSequenceFeature` is mapped to a `Tensor`, each of the specified 490 type. The shape will be `(B,T,) + df.dense_shape` for 491 `FixedLenSequenceFeature` `df`, where `B` is the batch size, and `T` is the 492 length of the associated `FeatureList` in the `SequenceExample`. For instance, 493 `FixedLenSequenceFeature([])` yields a scalar 2-D `Tensor` of static shape 494 `[None, None]` and dynamic shape `[B, T]`, while 495 `FixedLenSequenceFeature([k])` (for `int k >= 1`) yields a 3-D matrix `Tensor` 496 of static shape `[None, None, k]` and dynamic shape `[B, T, k]`. 497 498 Like the input, the resulting output tensors have a batch dimension. This 499 means that the original per-example shapes of `VarLenFeature`s and 500 `FixedLenSequenceFeature`s can be lost. To handle that situation, this op also 501 provides dicts of shape tensors as part of the output. There is one dict for 502 the context features, and one for the feature_list features. Context features 503 of type `FixedLenFeature`s will not be present, since their shapes are already 504 known by the caller. 
In situations where the input 'FixedLenFeature`s are of 505 different lengths across examples, the shorter examples will be padded with 506 default datatype values: 0 for numeric types, and the empty string for string 507 types. 508 509 Each `SparseTensor` corresponding to `sequence_features` represents a ragged 510 vector. Its indices are `[time, index]`, where `time` is the `FeatureList` 511 entry and `index` is the value's index in the list of values associated with 512 that time. 513 514 `FixedLenFeature` entries with a `default_value` and `FixedLenSequenceFeature` 515 entries with `allow_missing=True` are optional; otherwise, we will fail if 516 that `Feature` or `FeatureList` is missing from any example in `serialized`. 517 518 `example_name` may contain a descriptive name for the corresponding serialized 519 proto. This may be useful for debugging purposes, but it has no effect on the 520 output. If not `None`, `example_name` must be a scalar. 521 522 Args: 523 serialized: A vector (1-D Tensor) of type string containing binary 524 serialized `SequenceExample` protos. 525 context_features: A `dict` mapping feature keys to `FixedLenFeature` or 526 `VarLenFeature` or `RaggedFeature` values. These features are associated 527 with a `SequenceExample` as a whole. 528 sequence_features: A `dict` mapping feature keys to 529 `FixedLenSequenceFeature` or `VarLenFeature` or `RaggedFeature` values. 530 These features are associated with data within the `FeatureList` section 531 of the `SequenceExample` proto. 532 example_names: A vector (1-D Tensor) of strings (optional), the name of the 533 serialized protos. 534 name: A name for this operation (optional). 535 536 Returns: 537 A tuple of three `dict`s, each mapping keys to `Tensor`s, 538 `SparseTensor`s, and `RaggedTensor`. The first dict contains the context 539 key/values, the second dict contains the feature_list key/values, and the 540 final dict contains the lengths of any dense feature_list features. 
541 542 Raises: 543 ValueError: if any feature is invalid. 544 """ 545 if not (context_features or sequence_features): 546 raise ValueError("Missing features.") 547 context_params = _ParseOpParams.from_features( 548 context_features, [VarLenFeature, FixedLenFeature, RaggedFeature]) 549 feature_list_params = _ParseOpParams.from_features( 550 sequence_features, 551 [VarLenFeature, FixedLenSequenceFeature, RaggedFeature]) 552 553 with ops.name_scope(name, "ParseSequenceExample", 554 [serialized, example_names]): 555 outputs = _parse_sequence_example_raw(serialized, example_names, 556 context_params, feature_list_params, 557 name) 558 context_output, feature_list_output, feature_list_lengths = outputs 559 560 if context_params.ragged_keys: 561 context_output = _construct_tensors_for_composite_features( 562 context_features, context_output) 563 if feature_list_params.ragged_keys: 564 feature_list_output = _construct_tensors_for_composite_features( 565 sequence_features, feature_list_output) 566 567 return context_output, feature_list_output, feature_list_lengths 568 569 570def _parse_sequence_example_raw(serialized, 571 debug_name, 572 context, 573 feature_list, 574 name=None): 575 """Parses a vector of `SequenceExample` protos. 576 577 Args: 578 serialized: A vector (1-D Tensor) of type string, containing binary 579 serialized `SequenceExample` protos. 580 debug_name: A vector (1-D Tensor) of strings (optional), the names of the 581 serialized protos. 582 context: A `ParseOpParams` containing the parameters for the parse 583 op for the context features. 584 feature_list: A `ParseOpParams` containing the parameters for the 585 parse op for the feature_list features. 586 name: A name for this operation (optional). 587 588 Returns: 589 A tuple of three `dict`s, each mapping keys to `Tensor`s, `SparseTensor`s, 590 and `RaggedTensor`s. 
The first dict contains the context key/values, the 591 second dict contains the feature_list key/values, and the final dict 592 contains the lengths of any dense feature_list features. 593 594 Raises: 595 TypeError: if feature_list.dense_defaults is not either None or a dict. 596 """ 597 if context.num_features + feature_list.num_features == 0: 598 raise ValueError("Must provide at least one feature key") 599 with ops.name_scope(name, "ParseSequenceExample", [serialized]): 600 debug_name = [] if debug_name is None else debug_name 601 602 # Internal 603 feature_list_dense_missing_assumed_empty = [] 604 for k, v in feature_list.dense_defaults.items(): 605 if v is not None: 606 raise ValueError("Value feature_list.dense_defaults[%s] must be None" % 607 k) 608 feature_list_dense_missing_assumed_empty.append(k) 609 610 has_ragged = context.ragged_keys or feature_list.ragged_keys 611 serialized = ops.convert_to_tensor(serialized, name="serialized") 612 if has_ragged and serialized.shape.ndims is None: 613 raise ValueError("serialized must have statically-known rank to " 614 "parse ragged features.") 615 feature_list_dense_missing_assumed_empty_vector = [ 616 key in feature_list_dense_missing_assumed_empty 617 for key in feature_list.dense_keys 618 ] 619 outputs = gen_parsing_ops.parse_sequence_example_v2( 620 # Inputs 621 serialized=serialized, 622 debug_name=debug_name, 623 context_sparse_keys=context.sparse_keys, 624 context_dense_keys=context.dense_keys, 625 context_ragged_keys=context.ragged_keys, 626 feature_list_sparse_keys=feature_list.sparse_keys, 627 feature_list_dense_keys=feature_list.dense_keys, 628 feature_list_ragged_keys=feature_list.ragged_keys, 629 feature_list_dense_missing_assumed_empty=( 630 feature_list_dense_missing_assumed_empty_vector), 631 context_dense_defaults=context.dense_defaults_vec, 632 # Attrs 633 Ncontext_sparse=len(context.sparse_keys), 634 Nfeature_list_sparse=len(feature_list.sparse_keys), 635 
Nfeature_list_dense=len(feature_list.dense_keys), 636 context_sparse_types=context.sparse_types, 637 context_ragged_value_types=context.ragged_value_types, 638 context_ragged_split_types=context.ragged_split_types, 639 feature_list_dense_types=feature_list.dense_types, 640 feature_list_sparse_types=feature_list.sparse_types, 641 feature_list_ragged_value_types=feature_list.ragged_value_types, 642 feature_list_ragged_split_types=feature_list.ragged_split_types, 643 context_dense_shapes=context.dense_shapes_as_proto, 644 feature_list_dense_shapes=feature_list.dense_shapes, 645 name=name) 646 (context_sparse_indices, context_sparse_values, context_sparse_shapes, 647 context_dense_values, context_ragged_values, context_ragged_row_splits, 648 feature_list_sparse_indices, feature_list_sparse_values, 649 feature_list_sparse_shapes, feature_list_dense_values, 650 feature_list_dense_lengths, feature_list_ragged_values, 651 feature_list_ragged_outer_splits, 652 feature_list_ragged_inner_splits) = outputs 653 # pylint: disable=protected-access 654 context_ragged_tensors = parsing_config._build_ragged_tensors( 655 serialized.shape, context_ragged_values, context_ragged_row_splits) 656 feature_list_ragged_tensors = parsing_config._build_ragged_tensors( 657 serialized.shape, feature_list_ragged_values, 658 feature_list_ragged_outer_splits, feature_list_ragged_inner_splits) 659 660 # pylint: disable=g-complex-comprehension 661 context_sparse_tensors = [ 662 sparse_tensor.SparseTensor(ix, val, shape) 663 for (ix, val, 664 shape) in zip(context_sparse_indices, context_sparse_values, 665 context_sparse_shapes) 666 ] 667 668 feature_list_sparse_tensors = [ 669 sparse_tensor.SparseTensor(ix, val, shape) 670 for (ix, val, shape 671 ) in zip(feature_list_sparse_indices, feature_list_sparse_values, 672 feature_list_sparse_shapes) 673 ] 674 # pylint: enable=g-complex-comprehension 675 676 context_output = dict( 677 zip( 678 context.sparse_keys + context.dense_keys + context.ragged_keys, 
679 context_sparse_tensors + context_dense_values + 680 context_ragged_tensors)) 681 feature_list_output = dict( 682 zip( 683 feature_list.sparse_keys + feature_list.dense_keys + 684 feature_list.ragged_keys, feature_list_sparse_tensors + 685 feature_list_dense_values + feature_list_ragged_tensors)) 686 feature_list_lengths = dict( 687 zip(feature_list.dense_keys, feature_list_dense_lengths)) 688 689 return (context_output, feature_list_output, feature_list_lengths) 690 691 692@tf_export("io.parse_single_sequence_example", 693 v1=["io.parse_single_sequence_example", 694 "parse_single_sequence_example"]) 695def parse_single_sequence_example( 696 serialized, context_features=None, sequence_features=None, 697 example_name=None, name=None): 698 # pylint: disable=line-too-long 699 """Parses a single `SequenceExample` proto. 700 701 Parses a single serialized [`SequenceExample`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto) 702 proto given in `serialized`. 703 704 This op parses a serialized sequence example into a tuple of dictionaries, 705 each mapping keys to `Tensor` and `SparseTensor` objects. 706 The first dictionary contains mappings for keys appearing in 707 `context_features`, and the second dictionary contains mappings for keys 708 appearing in `sequence_features`. 709 710 At least one of `context_features` and `sequence_features` must be provided 711 and non-empty. 712 713 The `context_features` keys are associated with a `SequenceExample` as a 714 whole, independent of time / frame. In contrast, the `sequence_features` keys 715 provide a way to access variable-length data within the `FeatureList` section 716 of the `SequenceExample` proto. While the shapes of `context_features` values 717 are fixed with respect to frame, the frame dimension (the first dimension) 718 of `sequence_features` values may vary between `SequenceExample` protos, 719 and even between `feature_list` keys within the same `SequenceExample`. 
720 721 `context_features` contains `VarLenFeature`, `RaggedFeature`, and 722 `FixedLenFeature` objects. Each `VarLenFeature` is mapped to a `SparseTensor`; 723 each `RaggedFeature` is mapped to a `RaggedTensor`; and each `FixedLenFeature` 724 is mapped to a `Tensor`, of the specified type, shape, and default value. 725 726 `sequence_features` contains `VarLenFeature`, `RaggedFeature`, and 727 `FixedLenSequenceFeature` objects. Each `VarLenFeature` is mapped to a 728 `SparseTensor`; each `RaggedFeature` is mapped to a `RaggedTensor`; and each 729 `FixedLenSequenceFeature` is mapped to a `Tensor`, each of the specified type. 730 The shape will be `(T,) + df.dense_shape` for `FixedLenSequenceFeature` `df`, 731 where `T` is the length of the associated `FeatureList` in the 732 `SequenceExample`. For instance, `FixedLenSequenceFeature([])` yields a scalar 733 1-D `Tensor` of static shape `[None]` and dynamic shape `[T]`, while 734 `FixedLenSequenceFeature([k])` (for `int k >= 1`) yields a 2-D matrix `Tensor` 735 of static shape `[None, k]` and dynamic shape `[T, k]`. 736 737 Each `SparseTensor` corresponding to `sequence_features` represents a ragged 738 vector. Its indices are `[time, index]`, where `time` is the `FeatureList` 739 entry and `index` is the value's index in the list of values associated with 740 that time. 741 742 `FixedLenFeature` entries with a `default_value` and `FixedLenSequenceFeature` 743 entries with `allow_missing=True` are optional; otherwise, we will fail if 744 that `Feature` or `FeatureList` is missing from any example in `serialized`. 745 746 `example_name` may contain a descriptive name for the corresponding serialized 747 proto. This may be useful for debugging purposes, but it has no effect on the 748 output. If not `None`, `example_name` must be a scalar. 749 750 Note that the batch version of this function, `tf.parse_sequence_example`, 751 is written for better memory efficiency and will be faster on large 752 `SequenceExample`s. 
753 754 Args: 755 serialized: A scalar (0-D Tensor) of type string, a single binary 756 serialized `SequenceExample` proto. 757 context_features: A `dict` mapping feature keys to `FixedLenFeature` or 758 `VarLenFeature` or `RaggedFeature` values. These features are associated 759 with a `SequenceExample` as a whole. 760 sequence_features: A `dict` mapping feature keys to 761 `FixedLenSequenceFeature` or `VarLenFeature` or `RaggedFeature` values. 762 These features are associated with data within the `FeatureList` section 763 of the `SequenceExample` proto. 764 example_name: A scalar (0-D Tensor) of strings (optional), the name of 765 the serialized proto. 766 name: A name for this operation (optional). 767 768 Returns: 769 A tuple of two `dict`s, each mapping keys to `Tensor`s and `SparseTensor`s 770 and `RaggedTensor`s. 771 772 * The first dict contains the context key/values. 773 * The second dict contains the feature_list key/values. 774 775 Raises: 776 ValueError: if any feature is invalid. 
  """
  # pylint: enable=line-too-long
  if not (context_features or sequence_features):
    raise ValueError("Missing features.")
  context_params = _ParseOpParams.from_features(
      context_features, [VarLenFeature, FixedLenFeature, RaggedFeature])
  feature_list_params = _ParseOpParams.from_features(
      sequence_features,
      [VarLenFeature, FixedLenSequenceFeature, RaggedFeature])

  # NOTE(review): the nesting of the statements under this name scope was
  # reconstructed from a copy with collapsed whitespace -- confirm against the
  # canonical file that everything through the `return` belongs inside it.
  with ops.name_scope(name, "ParseSingleSequenceExample",
                      [serialized, example_name]):
    context_output, feature_list_output = (
        _parse_single_sequence_example_raw(serialized, context_params,
                                           feature_list_params, example_name,
                                           name))

    if context_params.ragged_keys:
      context_output = _construct_tensors_for_composite_features(
          context_features, context_output)
    if feature_list_params.ragged_keys:
      feature_list_output = _construct_tensors_for_composite_features(
          sequence_features, feature_list_output)

    return context_output, feature_list_output


def _parse_single_sequence_example_raw(serialized,
                                       context,
                                       feature_list,
                                       debug_name,
                                       name=None):
  """Parses a single `SequenceExample` proto.

  Converts `serialized` to a tensor, checks that it is a scalar, and then
  delegates to `_parse_sequence_example_raw`, keeping only the first two
  elements of that function's result.

  Args:
    serialized: A scalar (0-D Tensor) of type string, a single binary serialized
      `SequenceExample` proto.
    context: A `_ParseOpParams` containing the parameters for the parse op for
      the context features.
    feature_list: A `_ParseOpParams` containing the parameters for the parse op
      for the feature_list features.
    debug_name: A scalar (0-D Tensor) of strings (optional), the name of the
      serialized proto.
    name: A name for this operation (optional).

  Returns:
    A tuple of two `dict`s, each mapping keys to `Tensor`s and `SparseTensor`s.
    The first dict contains the context key/values.
    The second dict contains the feature_list key/values.

  Raises:
    TypeError: if feature_list.dense_defaults is not either None or a dict.
    ValueError: if `serialized` has a statically known rank other than 0.
  """
  # NOTE(review): whitespace was collapsed in the reviewed copy; the `return`
  # is assumed to sit outside this name scope -- confirm against the canonical
  # file, since the nesting affects the names of the ops that get created.
  with ops.name_scope(name, "ParseSingleExample", [serialized, debug_name]):
    serialized = ops.convert_to_tensor(serialized, name="serialized")
    serialized = _assert_scalar(serialized, "serialized")
  # Drop everything after the (context, feature_list) pair of the result.
  return _parse_sequence_example_raw(serialized, debug_name, context,
                                     feature_list, name)[:2]


@tf_export("io.decode_raw", v1=[])
def decode_raw(input_bytes,
               out_type,
               little_endian=True,
               fixed_length=None,
               name=None):
  """Convert raw byte strings into tensors.

  When `fixed_length` is given, decoding is performed by the
  `decode_padded_raw` op; otherwise the `decode_raw` op is used.

  Args:
    input_bytes:
      Each element of the input Tensor is converted to an array of bytes.
    out_type:
      `DType` of the output. Acceptable types are `half`, `float`, `double`,
      `int32`, `uint16`, `uint8`, `int16`, `int8`, `int64`.
    little_endian:
      Whether the `input_bytes` data is in little-endian format. Data will be
      converted into host byte order if necessary.
    fixed_length:
      If set, the first `fixed_length` bytes of each element will be converted.
      Data will be zero-padded or truncated to the specified length.

      `fixed_length` must be a multiple of the size of `out_type`.
      `fixed_length` must be specified if the elements of `input_bytes` are of
      variable length.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` object storing the decoded bytes.
  """
  if fixed_length is not None:
    return gen_parsing_ops.decode_padded_raw(
        input_bytes,
        fixed_length=fixed_length,
        out_type=out_type,
        little_endian=little_endian,
        name=name)
  else:
    return gen_parsing_ops.decode_raw(
        input_bytes, out_type, little_endian=little_endian, name=name)


@tf_export(v1=["decode_raw", "io.decode_raw"])
@deprecation.deprecated_args(None,
                             "bytes is deprecated, use input_bytes instead",
                             "bytes")
def decode_raw_v1(
    input_bytes=None,
    out_type=None,
    little_endian=True,
    name=None,
    bytes=None  # pylint: disable=redefined-builtin
):
  """Convert raw byte strings into tensors.

  Args:
    input_bytes:
      Each element of the input Tensor is converted to an array of bytes.
    out_type:
      `DType` of the output. Acceptable types are `half`, `float`, `double`,
      `int32`, `uint16`, `uint8`, `int16`, `int8`, `int64`. Although it has a
      default of `None` in the signature, it is required.
    little_endian:
      Whether the `input_bytes` data is in little-endian format. Data will be
      converted into host byte order if necessary.
    name: A name for the operation (optional).
    bytes: Deprecated parameter. Use `input_bytes` instead.

  Returns:
    A `Tensor` object storing the decoded bytes.

  Raises:
    ValueError: If `out_type` is not provided.
  """
  # Resolve the value from whichever of the new (`input_bytes`) or deprecated
  # (`bytes`) argument names was supplied.
  input_bytes = deprecation.deprecated_argument_lookup("input_bytes",
                                                       input_bytes, "bytes",
                                                       bytes)

  # out_type is a required positional argument in the original API, and had to
  # be changed to a keyword argument in order to facilitate the transition from
  # the reserved named `bytes` to `input_bytes`. Ensure it's still set.
  if out_type is None:
    raise ValueError(
        "decode_raw_v1() missing 1 positional argument: 'out_type'")

  return gen_parsing_ops.decode_raw(
      input_bytes, out_type, little_endian=little_endian, name=name)


# Swap `name` and `na_value` for backward compatibility.
@tf_export(v1=["io.decode_csv", "decode_csv"])
@deprecation.deprecated_endpoints("decode_csv")
def decode_csv(records,
               record_defaults,
               field_delim=",",
               use_quote_delim=True,
               name=None,
               na_value="",
               select_cols=None):
  """Convert CSV records to tensors. Each column maps to one tensor.

  RFC 4180 format is expected for the CSV records.
  (https://tools.ietf.org/html/rfc4180)
  Note that we allow leading and trailing spaces with int or float field.

  This is the v1 endpoint; it forwards to `decode_csv_v2` and differs from it
  only in the position of `name` in the signature.

  Args:
    records: A `Tensor` of type `string`.
      Each string is a record/row in the csv and all records should have
      the same format.
    record_defaults: A list of `Tensor` objects with specific types.
      Acceptable types are `float32`, `float64`, `int32`, `int64`, `string`.
      One tensor per column of the input record, with either a
      scalar default value for that column or an empty vector if the column is
      required.
    field_delim: An optional `string`. Defaults to `","`.
      char delimiter to separate fields in a record.
    use_quote_delim: An optional `bool`. Defaults to `True`.
      If false, treats double quotation marks as regular
      characters inside of the string fields (ignoring RFC 4180, Section 2,
      Bullet 5).
    name: A name for the operation (optional).
    na_value: Additional string to recognize as NA/NaN.
    select_cols: Optional sorted list of column indices to select. If specified,
      only this subset of columns will be parsed and returned. Must be strictly
      increasing, non-negative, and the same length as `record_defaults`.

  Returns:
    A list of `Tensor` objects. Has the same type as `record_defaults`.
    Each tensor will have the same shape as records.

  Raises:
    ValueError: If any of the arguments is malformed.
  """
  # Forward by keyword: `name` sits at a different position in the two
  # signatures, so positional forwarding is easy to get wrong.
  return decode_csv_v2(
      records,
      record_defaults,
      field_delim=field_delim,
      use_quote_delim=use_quote_delim,
      na_value=na_value,
      select_cols=select_cols,
      name=name)


@tf_export("io.decode_csv", v1=[])
def decode_csv_v2(records,
                  record_defaults,
                  field_delim=",",
                  use_quote_delim=True,
                  na_value="",
                  select_cols=None,
                  name=None):
  """Convert CSV records to tensors. Each column maps to one tensor.

  RFC 4180 format is expected for the CSV records.
  (https://tools.ietf.org/html/rfc4180)
  Note that we allow leading and trailing spaces with int or float field.

  Args:
    records: A `Tensor` of type `string`.
      Each string is a record/row in the csv and all records should have
      the same format.
    record_defaults: A list of `Tensor` objects with specific types.
      Acceptable types are `float32`, `float64`, `int32`, `int64`, `string`.
      One tensor per column of the input record, with either a
      scalar default value for that column or an empty vector if the column is
      required.
    field_delim: An optional `string`. Defaults to `","`.
      char delimiter to separate fields in a record.
    use_quote_delim: An optional `bool`. Defaults to `True`.
      If false, treats double quotation marks as regular
      characters inside of the string fields (ignoring RFC 4180, Section 2,
      Bullet 5).
    na_value: Additional string to recognize as NA/NaN.
    select_cols: Optional sorted list of column indices to select. If specified,
      only this subset of columns will be parsed and returned. Must be strictly
      increasing, non-negative, and the same length as `record_defaults`.
    name: A name for the operation (optional).

  Returns:
    A list of `Tensor` objects. Has the same type as `record_defaults`.
    Each tensor will have the same shape as records.

  Raises:
    ValueError: If `select_cols` is not strictly increasing, contains a
      negative index, or differs in length from `record_defaults`.
  """
  if select_cols is not None:
    # Compare each entry with its successor; trivially passes for sequences of
    # length 0 or 1.
    if any(current >= following
           for current, following in zip(select_cols, select_cols[1:])):
      raise ValueError("select_cols is not strictly increasing.")
    # Given strict ordering only the first entry can be the smallest. The
    # length guard keeps an empty `select_cols` from raising IndexError so
    # that it is reported through the ValueError checks instead.
    if len(select_cols) > 0 and select_cols[0] < 0:  # pylint: disable=g-explicit-length-test
      raise ValueError("select_cols contains negative values.")
    if len(select_cols) != len(record_defaults):
      raise ValueError(
          "Length of select_cols and record_defaults do not match.")
  return gen_parsing_ops.decode_csv(
      records=records,
      record_defaults=record_defaults,
      field_delim=field_delim,
      use_quote_delim=use_quote_delim,
      na_value=na_value,
      name=name,
      select_cols=select_cols,
  )


def _assert_scalar(value, name):
  """Asserts that `value` is scalar, and returns `value`.

  Args:
    value: An object exposing `shape.rank` (the static rank, or `None` when it
      is not known).
    name: Label for `value`, used in error messages and in the names of the ops
      created for the runtime check.

  Returns:
    `value` itself when its static rank is 0. When the static rank is unknown,
    a tensor that depends on a runtime rank assertion and whose static shape
    has been set to `[]`.

  Raises:
    ValueError: If the static rank of `value` is known and is not 0.
  """
  static_rank = value.shape.rank
  if static_rank == 0:
    return value
  if static_rank is not None:
    raise ValueError("Input %s must be a scalar" % name)

  # Rank is unknown statically, so defer the check to execution time and make
  # the returned tensor depend on it.
  check = control_flow_ops.Assert(
      math_ops.equal(array_ops.rank(value), 0),
      ["Input %s must be a scalar" % name],
      name="%sIsScalar" % name.capitalize())
  result = control_flow_ops.with_dependencies([check],
                                              value,
                                              name="%sDependencies" % name)
  # The assertion guarantees rank 0, so record that in the static shape.
  result.set_shape([])
  return result