# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities related to FeatureColumn."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools

from tensorflow.contrib.framework.python.framework import experimental
from tensorflow.contrib.framework.python.ops import variables as contrib_variables
from tensorflow.contrib.layers.python.layers import embedding_ops
from tensorflow.contrib.layers.python.layers import feature_column as fc
from tensorflow.contrib.layers.python.layers import layers
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_py
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import nest


def _maybe_reshape_input_tensor(tensor, column_name, output_rank):
  """Reshapes the input tensor according to the following rules.

  1. If `output_rank > input_rank + 1`, raise a `ValueError`.
  2. If `output_rank == input_rank + 1`, expand the tensor by one dimension.
  3. If `output_rank == input_rank`, do nothing.
  4. If `output_rank < input_rank`, flatten the inner dimensions of the tensor.

  Args:
    tensor: A `Tensor` or `SparseTensor` to be reshaped.
    column_name: A string name of the feature column for the tensor.
    output_rank: The desired rank of the tensor.

  Returns:
    A reshaped `Tensor` or `SparseTensor`.

  Raises:
    ValueError: if `output_rank > input_rank + 1` for the input tensor.
  """
  input_rank = tensor.get_shape().ndims

  if input_rank is None and isinstance(tensor, sparse_tensor_py.SparseTensor):
    # Try to get the rank of a sparse tensor by the shape of its dense_shape.
    input_rank = tensor.dense_shape.get_shape().as_list()[0]

  if input_rank is None:
    raise ValueError('Error while processing column {}. Rank of input Tensor '
                     'can not be None.'.format(column_name))

  if output_rank > input_rank + 1:
    raise ValueError('Error while processing column {}. Rank of input Tensor '
                     '({}) should be at least output_rank ({}) - 1. For '
                     'example, sequence data should typically be 3 '
                     'dimensional (rank 3) while non-sequence data is '
                     'typically 2 dimensional (rank 2).'.format(
                         column_name, input_rank, output_rank))
  elif output_rank == input_rank + 1:
    # Expand the tensor's shape by 1 dimension.
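    # A SparseTensor cannot use expand_dims, so append a size-1 dimension to
    # its dense_shape and sparse_reshape; dense Tensors use expand_dims.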
    if isinstance(tensor, sparse_tensor_py.SparseTensor):
      output_shape = array_ops.concat([tensor.dense_shape, [1]], 0)
      return sparse_ops.sparse_reshape(tensor, output_shape)
    else:
      reshaped = array_ops.expand_dims(tensor, -1)
      # Try to calculate the new shape.
      static_shape = tensor.get_shape()
      if static_shape is not None and static_shape.dims is not None:
        reshaped.set_shape(static_shape.as_list() + [1])
      return reshaped
  elif output_rank < input_rank:
    return layers._inner_flatten(tensor, output_rank)  # pylint: disable=protected-access
  else:
    return tensor


def _input_from_feature_columns(columns_to_tensors,
                                feature_columns,
                                weight_collections,
                                trainable,
                                scope,
                                output_rank,
                                default_name,
                                cols_to_outs=None):
  """Implementation of `input_from(_sequence)_feature_columns`."""
  columns_to_tensors = columns_to_tensors.copy()
  check_feature_columns(feature_columns)
  if cols_to_outs is not None and not isinstance(cols_to_outs, dict):
    raise ValueError('cols_to_outs must be a dict unless None')
  with variable_scope.variable_scope(scope,
                                     default_name=default_name,
                                     values=columns_to_tensors.values()):
    output_tensors = []
    transformer = _Transformer(columns_to_tensors)
    if weight_collections:
      weight_collections = list(set(list(weight_collections) +
                                    [ops.GraphKeys.GLOBAL_VARIABLES]))

    for column in sorted(set(feature_columns), key=lambda x: x.key):
      with variable_scope.variable_scope(None,
                                         default_name=column.name,
                                         values=columns_to_tensors.values()):
        transformed_tensor = transformer.transform(column)
        if output_rank == 3:
          transformed_tensor = nest.map_structure(
              functools.partial(
                  _maybe_reshape_input_tensor,
                  column_name=column.name,
                  output_rank=output_rank), transformed_tensor)
        try:
          # pylint: disable=protected-access
          arguments = column._deep_embedding_lookup_arguments(
              transformed_tensor)
          output_tensors.append(
              fc._embeddings_from_arguments(  # pylint: disable=protected-access
                  column,
                  arguments,
                  weight_collections,
                  trainable,
                  output_rank=output_rank))

        except NotImplementedError as ee:
          try:
            # pylint: disable=protected-access
            output_tensors.append(column._to_dnn_input_layer(
                transformed_tensor,
                weight_collections,
                trainable,
                output_rank=output_rank))
          except ValueError as e:
            raise ValueError('Error creating input layer for column: {}.\n'
                             '{}, {}'.format(column.name, e, ee))
      if cols_to_outs is not None:
        cols_to_outs[column] = output_tensors[-1]
    return array_ops.concat(output_tensors, output_rank - 1)


def input_from_feature_columns(columns_to_tensors,
                               feature_columns,
                               weight_collections=None,
                               trainable=True,
                               scope=None,
                               cols_to_outs=None):
  """A tf.contrib.layers style input layer builder based on FeatureColumns.

  Generally a single example in training data is described with feature
  columns. At the first layer of the model, this column-oriented data should
  be converted to a single tensor. Each feature column requires a different
  kind of operation during this conversion. For example, sparse features
  require different handling than continuous features.

  Example:

  ```python
  # Building model for training
  columns_to_tensor = tf.parse_example(...)
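  # `columns_to_tensor` maps feature names (and, optionally, FeatureColumns)
  # to the `Tensor`s/`SparseTensor`s produced by the input pipeline.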
  first_layer = input_from_feature_columns(
      columns_to_tensors=columns_to_tensor,
      feature_columns=feature_columns)
  second_layer = fully_connected(inputs=first_layer, ...)
  ...
  ```

  where feature_columns can be defined as follows:

  ```python
  sparse_feature = sparse_column_with_hash_bucket(
      column_name="sparse_col", ...)
  sparse_feature_emb = embedding_column(sparse_id_column=sparse_feature, ...)
  real_valued_feature = real_valued_column(...)
  real_valued_buckets = bucketized_column(
      source_column=real_valued_feature, ...)

  feature_columns = [sparse_feature_emb, real_valued_buckets]
  ```

  Args:
    columns_to_tensors: A mapping from feature column to tensors. A string
      key means a base (untransformed) feature. It can also have a
      FeatureColumn as a key, meaning that the column has already been
      transformed by the input pipeline.
    feature_columns: A set containing all the feature columns. All items in
      the set should be instances of classes derived from FeatureColumn.
    weight_collections: List of graph collections to which weights are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for variable_scope.
    cols_to_outs: Optional dict. If not None, it is populated with a mapping
      from each feature column to its output tensor, i.e. the per-column
      tensor that is concatenated into the returned tensor.

  Returns:
    A Tensor which can be consumed by hidden layers in the neural network.

  Raises:
    ValueError: if FeatureColumn cannot be consumed by a neural network.
  """
  return _input_from_feature_columns(columns_to_tensors,
                                     feature_columns,
                                     weight_collections,
                                     trainable,
                                     scope,
                                     output_rank=2,
                                     default_name='input_from_feature_columns',
                                     cols_to_outs=cols_to_outs)


@experimental
def sequence_input_from_feature_columns(columns_to_tensors,
                                        feature_columns,
                                        weight_collections=None,
                                        trainable=True,
                                        scope=None):
  """Builds inputs for sequence models from `FeatureColumn`s.

  See documentation for `input_from_feature_columns`. The following types of
  `FeatureColumn` are permitted in `feature_columns`: `_OneHotColumn`,
  `_EmbeddingColumn`, `_RealValuedColumn`, `_RealValuedVarLenColumn`. In
  addition, columns in `feature_columns` may not be constructed using any of
  the following: `_ScatteredEmbeddingColumn`, `_BucketizedColumn`,
  `_CrossedColumn`.

  Args:
    columns_to_tensors: A mapping from feature column to tensors. A string
      key means a base (untransformed) feature. It can also have a
      FeatureColumn as a key, meaning that the column has already been
      transformed by the input pipeline.
    feature_columns: A set containing all the feature columns. All items in
      the set should be instances of classes derived from FeatureColumn.
    weight_collections: List of graph collections to which weights are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for variable_scope.

  Returns:
    A Tensor which can be consumed by hidden layers in the neural network.

  Raises:
    ValueError: if FeatureColumn cannot be consumed by a neural network.
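
  Example (a minimal sketch; the column name and `columns_to_tensors` are
  illustrative, assuming the latter holds sequence-valued features such as
  those parsed from `tf.SequenceExample` protos):

  ```python
  embedded_tokens = embedding_column(
      sparse_column_with_hash_bucket("tokens", hash_bucket_size=1000),
      dimension=8)
  # Rank-3 output of shape [batch_size, sequence_length, 8], e.g. RNN input.
  rnn_inputs = sequence_input_from_feature_columns(
      columns_to_tensors=columns_to_tensors,
      feature_columns=[embedded_tokens])
  ```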
256 """ 257 _check_supported_sequence_columns(feature_columns) 258 _check_forbidden_sequence_columns(feature_columns) 259 260 return _input_from_feature_columns( 261 columns_to_tensors, 262 feature_columns, 263 weight_collections, 264 trainable, 265 scope, 266 output_rank=3, 267 default_name='sequence_input_from_feature_columns') 268 269 270def _create_embedding_lookup(column, 271 columns_to_tensors, 272 embedding_lookup_arguments, 273 num_outputs, 274 trainable, 275 weight_collections): 276 """Creates variables and returns predictions for linear weights in a model. 277 278 Args: 279 column: the column we're working on. 280 columns_to_tensors: a map from column name to tensors. 281 embedding_lookup_arguments: arguments for embedding lookup. 282 num_outputs: how many outputs. 283 trainable: whether the variable we create is trainable. 284 weight_collections: weights will be placed here. 285 286 Returns: 287 variables: the created embeddings. 288 predictions: the computed predictions. 289 """ 290 with variable_scope.variable_scope( 291 None, default_name=column.name, values=columns_to_tensors.values()): 292 variable = contrib_variables.model_variable( 293 name='weights', 294 shape=[embedding_lookup_arguments.vocab_size, num_outputs], 295 dtype=dtypes.float32, 296 initializer=embedding_lookup_arguments.initializer, 297 trainable=trainable, 298 collections=weight_collections) 299 if fc._is_variable(variable): # pylint: disable=protected-access 300 variable = [variable] 301 else: 302 variable = variable._get_variable_list() # pylint: disable=protected-access 303 predictions = embedding_ops.safe_embedding_lookup_sparse( 304 variable, 305 embedding_lookup_arguments.input_tensor, 306 sparse_weights=embedding_lookup_arguments.weight_tensor, 307 combiner=embedding_lookup_arguments.combiner, 308 name=column.name + '_weights') 309 return variable, predictions 310 311 312def _create_joint_embedding_lookup(columns_to_tensors, 313 embedding_lookup_arguments, 314 num_outputs, 315 trainable, 316 weight_collections): 317 """Creates an embedding lookup for all columns sharing a single weight.""" 318 for arg in embedding_lookup_arguments: 319 assert arg.weight_tensor is None, ( 320 'Joint sums for weighted sparse columns are not supported. ' 321 'Please use weighted_sum_from_feature_columns instead.') 322 assert arg.combiner == 'sum', ( 323 'Combiners other than sum are not supported for joint sums. 
        'Please use weighted_sum_from_feature_columns instead.')
  assert len(embedding_lookup_arguments) >= 1, (
      'At least one column must be in the model.')
  prev_size = 0
  sparse_tensors = []
  for a in embedding_lookup_arguments:
    t = a.input_tensor
    values = t.values + prev_size
    prev_size += a.vocab_size
    sparse_tensors.append(
        sparse_tensor_py.SparseTensor(t.indices,
                                      values,
                                      t.dense_shape))
  sparse_tensor = sparse_ops.sparse_concat(1, sparse_tensors)
  with variable_scope.variable_scope(
      None, default_name='linear_weights', values=columns_to_tensors.values()):
    variable = contrib_variables.model_variable(
        name='weights',
        shape=[prev_size, num_outputs],
        dtype=dtypes.float32,
        initializer=init_ops.zeros_initializer(),
        trainable=trainable,
        collections=weight_collections)
    if fc._is_variable(variable):  # pylint: disable=protected-access
      variable = [variable]
    else:
      variable = variable._get_variable_list()  # pylint: disable=protected-access
    predictions = embedding_ops.safe_embedding_lookup_sparse(
        variable,
        sparse_tensor,
        sparse_weights=None,
        combiner='sum',
        name='_weights')
    return variable, predictions


def joint_weighted_sum_from_feature_columns(columns_to_tensors,
                                            feature_columns,
                                            num_outputs,
                                            weight_collections=None,
                                            trainable=True,
                                            scope=None):
  """A restricted linear prediction builder based on FeatureColumns.

  As long as all feature columns are unweighted sparse columns, this computes
  the prediction of a linear model which stores all weights in a single
  variable.

  Args:
    columns_to_tensors: A mapping from feature column to tensors. A string
      key means a base (untransformed) feature. It can also have a
      FeatureColumn as a key, meaning that the column has already been
      transformed by the input pipeline. For example, `inflow` may have
      handled transformations.
    feature_columns: A set containing all the feature columns. All items in
      the set should be instances of classes derived from FeatureColumn.
    num_outputs: An integer specifying the number of outputs.
    weight_collections: List of graph collections to which weights are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for variable_scope.

  Returns:
    A tuple containing:

    * A Tensor which represents predictions of a linear model.
    * A list of Variables storing the weights.
    * A Variable which is used for bias.

  Raises:
    ValueError: if FeatureColumn cannot be used for linear predictions.
  """
  columns_to_tensors = columns_to_tensors.copy()
  check_feature_columns(feature_columns)
  with variable_scope.variable_scope(
      scope,
      default_name='joint_weighted_sum_from_feature_columns',
      values=columns_to_tensors.values()):
    transformer = _Transformer(columns_to_tensors)
    embedding_lookup_arguments = []
    for column in sorted(set(feature_columns), key=lambda x: x.key):
      transformed_tensor = transformer.transform(column)
      try:
        embedding_lookup_arguments.append(
            column._wide_embedding_lookup_arguments(transformed_tensor))  # pylint: disable=protected-access
      except NotImplementedError:
        raise NotImplementedError('Real-valued columns are not supported. '
                                  'Use weighted_sum_from_feature_columns '
                                  'instead, or bucketize these columns.')

    variable, predictions_no_bias = _create_joint_embedding_lookup(
        columns_to_tensors,
        embedding_lookup_arguments,
        num_outputs,
        trainable,
        weight_collections)
    bias = contrib_variables.model_variable(
        'bias_weight',
        shape=[num_outputs],
        initializer=init_ops.zeros_initializer(),
        trainable=trainable,
        collections=_add_variable_collection(weight_collections))
    _log_variable(bias)
    predictions = nn_ops.bias_add(predictions_no_bias, bias)

    return predictions, variable, bias


def weighted_sum_from_feature_columns(columns_to_tensors,
                                      feature_columns,
                                      num_outputs,
                                      weight_collections=None,
                                      trainable=True,
                                      scope=None):
  """A tf.contrib.layers style linear prediction builder based on FeatureColumn.

  Generally a single example in training data is described with feature
  columns. This function generates a weighted sum for each of the
  `num_outputs` outputs. The weighted sum refers to logits in classification
  problems and to the prediction itself in linear regression problems.

  Example:

  ```python
  # Building model for training
  feature_columns = (
      real_valued_column("my_feature1"),
      ...
  )
  columns_to_tensor = tf.parse_example(...)
  logits, _, _ = weighted_sum_from_feature_columns(
      columns_to_tensors=columns_to_tensor,
      feature_columns=feature_columns,
      num_outputs=1)
  loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                 logits=logits)
  ```

  Args:
    columns_to_tensors: A mapping from feature column to tensors. A string
      key means a base (untransformed) feature. It can also have a
      FeatureColumn as a key, meaning that the column has already been
      transformed by the input pipeline. For example, `inflow` may have
      handled transformations.
    feature_columns: A set containing all the feature columns. All items in
      the set should be instances of classes derived from FeatureColumn.
    num_outputs: An integer specifying the number of outputs.
    weight_collections: List of graph collections to which weights are added.
    trainable: If `True` also add variables to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
    scope: Optional scope for variable_scope.

  Returns:
    A tuple containing:

    * A Tensor which represents predictions of a linear model.
    * A dictionary which maps feature_column to corresponding Variable.
    * A Variable which is used for bias.

  Raises:
    ValueError: if FeatureColumn cannot be used for linear predictions.
483 """ 484 columns_to_tensors = columns_to_tensors.copy() 485 check_feature_columns(feature_columns) 486 with variable_scope.variable_scope( 487 scope, 488 default_name='weighted_sum_from_feature_columns', 489 values=columns_to_tensors.values()): 490 output_tensors = [] 491 column_to_variable = dict() 492 transformer = _Transformer(columns_to_tensors) 493 # pylint: disable=protected-access 494 for column in sorted(set(feature_columns), key=lambda x: x.key): 495 transformed_tensor = transformer.transform(column) 496 try: 497 embedding_lookup_arguments = column._wide_embedding_lookup_arguments( 498 transformed_tensor) 499 variable, predictions = _create_embedding_lookup( 500 column, 501 columns_to_tensors, 502 embedding_lookup_arguments, 503 num_outputs, 504 trainable, 505 weight_collections) 506 except NotImplementedError: 507 with variable_scope.variable_scope( 508 None, 509 default_name=column.name, 510 values=columns_to_tensors.values()): 511 tensor = column._to_dense_tensor(transformed_tensor) 512 tensor = _maybe_reshape_input_tensor( 513 tensor, column.name, output_rank=2) 514 variable = [ 515 contrib_variables.model_variable( 516 name='weight', 517 shape=[tensor.get_shape()[1], num_outputs], 518 initializer=init_ops.zeros_initializer(), 519 trainable=trainable, 520 collections=weight_collections) 521 ] 522 predictions = math_ops.matmul(tensor, variable[0], name='matmul') 523 except ValueError as ee: 524 raise ValueError('Error creating weighted sum for column: {}.\n' 525 '{}'.format(column.name, ee)) 526 output_tensors.append(array_ops.reshape( 527 predictions, shape=(-1, num_outputs))) 528 column_to_variable[column] = variable 529 _log_variable(variable) 530 fc._maybe_restore_from_checkpoint(column._checkpoint_path(), variable) # pylint: disable=protected-access 531 # pylint: enable=protected-access 532 predictions_no_bias = math_ops.add_n(output_tensors) 533 bias = contrib_variables.model_variable( 534 'bias_weight', 535 shape=[num_outputs], 536 initializer=init_ops.zeros_initializer(), 537 trainable=trainable, 538 collections=_add_variable_collection(weight_collections)) 539 _log_variable(bias) 540 predictions = nn_ops.bias_add(predictions_no_bias, bias) 541 542 return predictions, column_to_variable, bias 543 544 545def parse_feature_columns_from_examples(serialized, 546 feature_columns, 547 name=None, 548 example_names=None): 549 """Parses tf.Examples to extract tensors for given feature_columns. 550 551 This is a wrapper of 'tf.parse_example'. 

  Example:

  ```python
  columns_to_tensor = parse_feature_columns_from_examples(
      serialized=my_data,
      feature_columns=my_features)

  # Where my_features are:
  # Define features and transformations
  sparse_feature_a = sparse_column_with_keys(
      column_name="sparse_feature_a", keys=["AB", "CD", ...])

  embedding_feature_a = embedding_column(
      sparse_id_column=sparse_feature_a, dimension=3, combiner="sum")

  sparse_feature_b = sparse_column_with_hash_bucket(
      column_name="sparse_feature_b", hash_bucket_size=1000)

  embedding_feature_b = embedding_column(
      sparse_id_column=sparse_feature_b, dimension=16, combiner="sum")

  crossed_feature_a_x_b = crossed_column(
      columns=[sparse_feature_a, sparse_feature_b], hash_bucket_size=10000)

  real_feature = real_valued_column("real_feature")
  real_feature_buckets = bucketized_column(
      source_column=real_feature, boundaries=[...])

  my_features = [embedding_feature_b,
                 real_feature_buckets,
                 embedding_feature_a]
  ```

  Args:
    serialized: A vector (1-D Tensor) of strings, a batch of binary
      serialized `Example` protos.
    feature_columns: An iterable containing all the feature columns. All items
      should be instances of classes derived from _FeatureColumn.
    name: A name for this operation (optional).
    example_names: A vector (1-D Tensor) of strings (optional), the names of
      the serialized protos in the batch.

  Returns:
    A `dict` mapping FeatureColumn to `Tensor` and `SparseTensor` values.
  """
  check_feature_columns(feature_columns)
  columns_to_tensors = parsing_ops.parse_example(
      serialized=serialized,
      features=fc.create_feature_spec_for_parsing(feature_columns),
      name=name,
      example_names=example_names)

  transformer = _Transformer(columns_to_tensors)
  for column in sorted(set(feature_columns), key=lambda x: x.key):
    transformer.transform(column)
  return columns_to_tensors


def transform_features(features, feature_columns):
  """Returns transformed features based on the feature columns passed in.

  Example:

  ```python
  columns_to_tensor = transform_features(features=features,
                                         feature_columns=feature_columns)

  # Where feature_columns are:
  # Define features and transformations
  sparse_feature_a = sparse_column_with_keys(
      column_name="sparse_feature_a", keys=["AB", "CD", ...])

  embedding_feature_a = embedding_column(
      sparse_id_column=sparse_feature_a, dimension=3, combiner="sum")

  sparse_feature_b = sparse_column_with_hash_bucket(
      column_name="sparse_feature_b", hash_bucket_size=1000)

  embedding_feature_b = embedding_column(
      sparse_id_column=sparse_feature_b, dimension=16, combiner="sum")

  crossed_feature_a_x_b = crossed_column(
      columns=[sparse_feature_a, sparse_feature_b], hash_bucket_size=10000)

  real_feature = real_valued_column("real_feature")
  real_feature_buckets = bucketized_column(
      source_column=real_feature, boundaries=[...])

  feature_columns = [embedding_feature_b,
                     real_feature_buckets,
                     embedding_feature_a]
  ```

  Args:
    features: A dictionary of features.
    feature_columns: An iterable containing all the feature columns. All items
      should be instances of classes derived from _FeatureColumn.

  Returns:
    A `dict` mapping FeatureColumn to `Tensor` and `SparseTensor` values.
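
  The result can be fed back to the layer builders as pre-transformed input.
  A minimal sketch (variable names are illustrative):

  ```python
  cached = transform_features(features=features,
                              feature_columns=feature_columns)
  # `cached` maps each FeatureColumn to its transformed tensor; passing it to
  # input_from_feature_columns re-uses these tensors instead of re-applying
  # the transformations.
  net = input_from_feature_columns(columns_to_tensors=cached,
                                   feature_columns=feature_columns)
  ```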
651 """ 652 columns_to_tensor = features.copy() 653 check_feature_columns(feature_columns) 654 transformer = _Transformer(columns_to_tensor) 655 for column in sorted(set(feature_columns), key=lambda x: x.key): 656 transformer.transform(column) 657 keys = list(columns_to_tensor.keys()) 658 for k in keys: 659 if k not in feature_columns: 660 columns_to_tensor.pop(k) 661 return columns_to_tensor 662 663 664def parse_feature_columns_from_sequence_examples( 665 serialized, 666 context_feature_columns, 667 sequence_feature_columns, 668 name=None, 669 example_name=None): 670 """Parses tf.SequenceExamples to extract tensors for given `FeatureColumn`s. 671 672 Args: 673 serialized: A scalar (0-D Tensor) of type string, a single serialized 674 `SequenceExample` proto. 675 context_feature_columns: An iterable containing the feature columns for 676 context features. All items should be instances of classes derived from 677 `_FeatureColumn`. Can be `None`. 678 sequence_feature_columns: An iterable containing the feature columns for 679 sequence features. All items should be instances of classes derived from 680 `_FeatureColumn`. Can be `None`. 681 name: A name for this operation (optional). 682 example_name: A scalar (0-D Tensor) of type string (optional), the names of 683 the serialized proto. 684 685 Returns: 686 A tuple consisting of (context_features, sequence_features) 687 688 * context_features: a dict mapping `FeatureColumns` from 689 `context_feature_columns` to their parsed `Tensors`/`SparseTensor`s. 690 * sequence_features: a dict mapping `FeatureColumns` from 691 `sequence_feature_columns` to their parsed `Tensors`/`SparseTensor`s. 692 """ 693 # Sequence example parsing requires a single (scalar) example. 694 try: 695 serialized = array_ops.reshape(serialized, []) 696 except ValueError as e: 697 raise ValueError( 698 'serialized must contain as single sequence example. Batching must be ' 699 'done after parsing for sequence examples. Error: {}'.format(e)) 700 701 if context_feature_columns is None: 702 context_feature_columns = [] 703 if sequence_feature_columns is None: 704 sequence_feature_columns = [] 705 706 check_feature_columns(context_feature_columns) 707 context_feature_spec = fc.create_feature_spec_for_parsing( 708 context_feature_columns) 709 710 check_feature_columns(sequence_feature_columns) 711 sequence_feature_spec = fc._create_sequence_feature_spec_for_parsing( # pylint: disable=protected-access 712 sequence_feature_columns, allow_missing_by_default=False) 713 714 return parsing_ops.parse_single_sequence_example(serialized, 715 context_feature_spec, 716 sequence_feature_spec, 717 example_name, 718 name) 719 720 721def _log_variable(variable): 722 if isinstance(variable, list): 723 for var in variable: 724 if fc._is_variable(variable): # pylint: disable=protected-access 725 logging.info('Created variable %s, with device=%s', var.name, 726 var.device) 727 elif fc._is_variable(variable): # pylint: disable=protected-access 728 logging.info('Created variable %s, with device=%s', variable.name, 729 variable.device) 730 731 732def _infer_real_valued_column_for_tensor(name, tensor): 733 """Creates a real_valued_column for given tensor and name.""" 734 if isinstance(tensor, sparse_tensor_py.SparseTensor): 735 raise ValueError( 736 'SparseTensor is not supported for auto detection. 
        'a corresponding FeatureColumn for tensor {} {}.'.format(
            name, tensor))

  if not (tensor.dtype.is_integer or tensor.dtype.is_floating):
    raise ValueError(
        'Non-integer or non-floating types are not supported for auto '
        'detection. Please define a corresponding FeatureColumn for tensor '
        '{} {}.'.format(name, tensor))

  shape = tensor.get_shape().as_list()
  dimension = 1
  for i in range(1, len(shape)):
    dimension *= shape[i]
  return fc.real_valued_column(name, dimension=dimension, dtype=tensor.dtype)


def infer_real_valued_columns(features):
  """Creates `real_valued_column`s for all tensors in `features`."""
  if not isinstance(features, dict):
    return [_infer_real_valued_column_for_tensor('', features)]

  feature_columns = []
  for key, value in features.items():
    feature_columns.append(_infer_real_valued_column_for_tensor(key, value))

  return feature_columns


def check_feature_columns(feature_columns):
  """Checks the validity of the set of FeatureColumns.

  Args:
    feature_columns: An iterable of instances or subclasses of FeatureColumn.

  Raises:
    ValueError: If `feature_columns` is a dict.
    ValueError: If there are duplicate feature column keys.
  """
  if isinstance(feature_columns, dict):
    raise ValueError('Expected feature_columns to be iterable, found dict.')
  seen_keys = set()
  for f in feature_columns:
    key = f.key
    if key in seen_keys:
      raise ValueError('Duplicate feature column key found for column: {}. '
                       'This usually means that the column is almost '
                       'identical to another column, and one must be '
                       'discarded.'.format(f.name))
    seen_keys.add(key)


class _Transformer(object):
  """Handles all the transformations defined by FeatureColumn if needed.

  FeatureColumn specifies how to digest an input column to the network. Some
  feature columns require data transformations. This class handles those
  transformations if they are not handled already.

  Some features may be used in more than one place. For example, one can use
  a bucketized feature both by itself and in a cross with it. In that case
  Transformer should create only one bucketization op instead of multiple
  ops for each feature column. To handle re-use of transformed columns,
  Transformer keeps all previously transformed columns.

  Example:

  ```python
  sparse_feature = sparse_column_with_hash_bucket(...)
  real_valued_feature = real_valued_column(...)
  real_valued_buckets = bucketized_column(source_column=real_valued_feature,
                                          ...)
  sparse_x_real = crossed_column(
      columns=[sparse_feature, real_valued_buckets], hash_bucket_size=10000)

  columns_to_tensor = tf.parse_example(...)
  transformer = _Transformer(columns_to_tensor)

  sparse_x_real_tensor = transformer.transform(sparse_x_real)
  sparse_tensor = transformer.transform(sparse_feature)
  real_buckets_tensor = transformer.transform(real_valued_buckets)
  ```
  """

  def __init__(self, columns_to_tensors):
    """Initializes transformer.

    Args:
      columns_to_tensors: A mapping from feature columns to tensors. A string
        key means a base (untransformed) feature. It can also have a
        FeatureColumn as a key, meaning that the column has already been
        transformed by the input pipeline. For example, `inflow` may have
        handled transformations. Transformed features are inserted in
        columns_to_tensors.
827 """ 828 self._columns_to_tensors = columns_to_tensors 829 830 def transform(self, feature_column): 831 """Returns a Tensor which represents given feature_column. 832 833 Args: 834 feature_column: An instance of FeatureColumn. 835 836 Returns: 837 A Tensor which represents given feature_column. It may create a new Tensor 838 or re-use an existing one. 839 840 Raises: 841 ValueError: if FeatureColumn cannot be handled by this Transformer. 842 """ 843 logging.debug('Transforming feature_column %s', feature_column) 844 if feature_column in self._columns_to_tensors: 845 # Feature_column is already transformed. 846 return self._columns_to_tensors[feature_column] 847 848 feature_column.insert_transformed_feature(self._columns_to_tensors) 849 850 if feature_column not in self._columns_to_tensors: 851 raise ValueError('Column {} is not supported.'.format( 852 feature_column.name)) 853 854 return self._columns_to_tensors[feature_column] 855 856 857def _add_variable_collection(weight_collections): 858 if weight_collections: 859 weight_collections = list( 860 set(list(weight_collections) + [ops.GraphKeys.GLOBAL_VARIABLES])) 861 return weight_collections 862 863 864# TODO(jamieas): remove the following logic once all FeatureColumn types are 865# supported for sequences. 866# pylint: disable=protected-access 867_SUPPORTED_SEQUENCE_COLUMNS = (fc._OneHotColumn, 868 fc._EmbeddingColumn, 869 fc._RealValuedColumn, 870 fc._RealValuedVarLenColumn) 871 872_FORBIDDEN_SEQUENCE_COLUMNS = (fc._ScatteredEmbeddingColumn, 873 fc._BucketizedColumn, 874 fc._CrossedColumn) 875 876 877def _check_supported_sequence_columns(feature_columns): 878 """Asserts `feature_columns` are in `_SUPPORTED_SEQUENCE_COLUMNS`.""" 879 for col in feature_columns: 880 if not isinstance(col, _SUPPORTED_SEQUENCE_COLUMNS): 881 raise ValueError( 882 'FeatureColumn type {} is not currently supported for sequence data.'. 883 format(type(col).__name__)) 884 885 886def _get_parent_columns(feature_column): 887 """Returns the tuple of `FeatureColumn`s that `feature_column` depends on.""" 888 if isinstance(feature_column, (fc._WeightedSparseColumn, 889 fc._OneHotColumn, 890 fc._EmbeddingColumn,)): 891 return (feature_column.sparse_id_column,) 892 if isinstance(feature_column, (fc._BucketizedColumn,)): 893 return (feature_column.source_column,) 894 if isinstance(feature_column, (fc._CrossedColumn)): 895 return tuple(feature_column.columns) 896 return tuple() 897 898 899def _gather_feature_columns(feature_columns): 900 """Returns a list of all ancestor `FeatureColumns` of `feature_columns`.""" 901 gathered = list(feature_columns) 902 i = 0 903 while i < len(gathered): 904 for column in _get_parent_columns(gathered[i]): 905 if column not in gathered: 906 gathered.append(column) 907 i += 1 908 return gathered 909 910 911def _check_forbidden_sequence_columns(feature_columns): 912 """Recursively checks `feature_columns` for `_FORBIDDEN_SEQUENCE_COLUMNS`.""" 913 all_feature_columns = _gather_feature_columns(feature_columns) 914 for feature_column in all_feature_columns: 915 if isinstance(feature_column, _FORBIDDEN_SEQUENCE_COLUMNS): 916 raise ValueError( 917 'Column {} is of type {}, which is not currently supported for ' 918 'sequences.'.format(feature_column.name, 919 type(feature_column).__name__)) 920