# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This API defines FeatureColumn abstraction.

FeatureColumns provide a high level abstraction for ingesting and representing
features. FeatureColumns are also the primary way of encoding features for
canned `tf.estimator.Estimator`s.

When using FeatureColumns with `Estimators`, the type of feature column you
should choose depends on (1) the feature type and (2) the model type.

1. Feature type:

  * Continuous features can be represented by `numeric_column`.
  * Categorical features can be represented by any `categorical_column_with_*`
    column:
    - `categorical_column_with_vocabulary_list`
    - `categorical_column_with_vocabulary_file`
    - `categorical_column_with_hash_bucket`
    - `categorical_column_with_identity`
    - `weighted_categorical_column`

2. Model type:

  * Deep neural network models (`DNNClassifier`, `DNNRegressor`).

    Continuous features can be directly fed into deep neural network models.

      age_column = numeric_column("age")

    To feed sparse features into DNN models, wrap the column with
    `embedding_column` or `indicator_column`. `indicator_column` is recommended
    for features with only a few possible values. For features with many
    possible values, to reduce the size of your model, `embedding_column` is
    recommended.

      embedded_dept_column = embedding_column(
          categorical_column_with_vocabulary_list(
              "department", ["math", "philosophy", ...]), dimension=10)

  * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).

    Sparse features can be fed directly into linear models. They behave like an
    indicator column but with an efficient implementation.

      dept_column = categorical_column_with_vocabulary_list("department",
          ["math", "philosophy", "english"])

    It is recommended that continuous features be bucketized before being
    fed into linear models.

      bucketized_age_column = bucketized_column(
          source_column=age_column,
          boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    Sparse features can be crossed (also known as conjuncted or combined) in
    order to form non-linearities, and then fed into linear models.

      cross_dept_age_column = crossed_column(
          columns=["department", bucketized_age_column],
          hash_bucket_size=1000)

Example of building canned `Estimator`s using FeatureColumns:

  ```python
  # Define features and transformations
  deep_feature_columns = [age_column, embedded_dept_column]
  wide_feature_columns = [dept_column, bucketized_age_column,
                          cross_dept_age_column]

  # Build deep model
  estimator = DNNClassifier(
      feature_columns=deep_feature_columns,
      hidden_units=[500, 250, 50])
  estimator.train(...)

  # Or build a wide model
  estimator = LinearClassifier(
      feature_columns=wide_feature_columns)
  estimator.train(...)

  # Or build a wide and deep model!
  estimator = DNNLinearCombinedClassifier(
      linear_feature_columns=wide_feature_columns,
      dnn_feature_columns=deep_feature_columns,
      dnn_hidden_units=[500, 250, 50])
  estimator.train(...)
  ```


FeatureColumns can also be transformed into a generic input layer for
custom models using `input_layer`.

Example of building a model using FeatureColumns; this can be used in a
`model_fn` which is given to the `tf.estimator.Estimator`:

  ```python
  # Building model via layers

  deep_feature_columns = [age_column, embedded_dept_column]
  columns_to_tensor = parse_feature_columns_from_examples(
      serialized=my_data,
      feature_columns=deep_feature_columns)
  first_layer = input_layer(
      features=columns_to_tensor,
      feature_columns=deep_feature_columns)
  second_layer = fully_connected(first_layer, ...)
  ```

NOTE: Functions prefixed with "_" indicate experimental or private parts of
the API subject to change, and should not be relied upon!
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import collections
import math

import numpy as np
import six


from tensorflow.python.eager import context
from tensorflow.python.feature_column import feature_column as fc_old
from tensorflow.python.feature_column import utils as fc_utils
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.framework import tensor_shape
# TODO(b/118385027): Dependency on keras can be problematic if Keras moves out
# of the main repo.
from tensorflow.python.keras import utils
from tensorflow.python.keras.engine import training
from tensorflow.python.keras.engine.base_layer import Layer
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import gfile
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import checkpoint_utils
from tensorflow.python.training.tracking import tracking
from tensorflow.python.util import deprecation
from tensorflow.python.util import nest
from tensorflow.python.util.tf_export import keras_export
from tensorflow.python.util.tf_export import tf_export


_FEATURE_COLUMN_DEPRECATION_DATE = None
_FEATURE_COLUMN_DEPRECATION = ('The old _FeatureColumn APIs are being '
                               'deprecated. Please use the new FeatureColumn '
                               'APIs instead.')


class StateManager(object):
  """Manages the state associated with FeatureColumns.

  Some `FeatureColumn`s create variables or resources to assist their
  computation. The `StateManager` is responsible for creating and storing these
  objects since `FeatureColumn`s are supposed to be stateless configuration
  only.
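
  As an illustration (a hedged sketch; `MyColumn` is hypothetical and not part
  of this module), a column would typically ask the manager for its state in
  `create_state` and fetch it back later by name:

  ```python
  class MyColumn(FeatureColumn):

    def create_state(self, state_manager):
      # The manager creates and owns the variable; the column itself stays a
      # stateless configuration object.
      state_manager.create_variable(
          self, name='weights', shape=[100, 1], dtype=dtypes.float32,
          initializer=init_ops.zeros_initializer())

    def get_dense_tensor(self, transformation_cache, state_manager):
      weights = state_manager.get_variable(self, name='weights')
      ...
  ```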
  """

  def create_variable(self,
                      feature_column,
                      name,
                      shape,
                      dtype=None,
                      trainable=True,
                      use_resource=True,
                      initializer=None):
    """Creates a new variable.

    Args:
      feature_column: A `FeatureColumn` object this variable corresponds to.
      name: variable name.
      shape: variable shape.
      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
      trainable: Whether this variable is trainable or not.
      use_resource: If true, we use resource variables. Otherwise we use
        RefVariable.
      initializer: initializer instance (callable).

    Returns:
      The created variable.
    """
    del feature_column, name, shape, dtype, trainable, use_resource, initializer
    raise NotImplementedError('StateManager.create_variable')

  def add_variable(self, feature_column, var):
    """Adds an existing variable to the state.

    Args:
      feature_column: A `FeatureColumn` object to associate this variable with.
      var: The variable.
    """
    del feature_column, var
    raise NotImplementedError('StateManager.add_variable')

  def get_variable(self, feature_column, name):
    """Returns an existing variable.

    Args:
      feature_column: A `FeatureColumn` object this variable corresponds to.
      name: variable name.
    """
    del feature_column, name
    raise NotImplementedError('StateManager.get_variable')

  def add_resource(self, feature_column, name, resource):
    """Adds a new resource.

    Resources can be things such as tables, etc.

    Args:
      feature_column: A `FeatureColumn` object this resource corresponds to.
      name: Name of the resource.
      resource: The resource.

    Returns:
      The added resource.
    """
    del feature_column, name, resource
    raise NotImplementedError('StateManager.add_resource')

  def get_resource(self, feature_column, name):
    """Returns an already created resource.

    Resources can be things such as tables, etc.

    Args:
      feature_column: A `FeatureColumn` object this resource corresponds to.
      name: Name of the resource.
    """
    del feature_column, name
    raise NotImplementedError('StateManager.get_resource')


class _StateManagerImpl(StateManager):
  """Manages the state of DenseFeatures and LinearLayer."""

  def __init__(self, layer, trainable):
    """Creates a _StateManagerImpl object.

    Args:
      layer: The input layer this state manager is associated with.
      trainable: Whether by default, variables created are trainable or not.
    """
    self._trainable = trainable
    self._layer = layer
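    # Maps each feature column to a dict of its variables, keyed by variable
    # name, so get_variable() can return exactly what create_variable() made.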
    self._cols_to_vars_map = collections.defaultdict(lambda: {})

  def create_variable(self,
                      feature_column,
                      name,
                      shape,
                      dtype=None,
                      trainable=True,
                      use_resource=True,
                      initializer=None):
    if name in self._cols_to_vars_map[feature_column]:
      raise ValueError('Variable already exists.')

    var = self._layer.add_variable(
        name=name,
        shape=shape,
        dtype=dtype,
        initializer=initializer,
        trainable=self._trainable and trainable,
        use_resource=use_resource,
        # TODO(rohanj): Get rid of this hack once we have a mechanism for
        # specifying a default partitioner for an entire layer. In that case,
        # the default getter for Layers should work.
        getter=variable_scope.get_variable)
    self._cols_to_vars_map[feature_column][name] = var
    return var

  def get_variable(self, feature_column, name):
    if name in self._cols_to_vars_map[feature_column]:
      return self._cols_to_vars_map[feature_column][name]
    raise ValueError('Variable does not exist.')


class _BaseFeaturesLayer(Layer):
  """Base class for DenseFeatures and SequenceFeatures.

  Defines common methods and helpers.

  Args:
    feature_columns: An iterable containing the FeatureColumns to use as
      inputs to your model.
    expected_column_type: Expected class for provided feature columns.
    trainable: Boolean, whether the layer's variables will be updated via
      gradient descent during training.
    name: Name to give to the DenseFeatures.
    **kwargs: Keyword arguments to construct a layer.

  Raises:
    ValueError: if an item in `feature_columns` doesn't match
      `expected_column_type`.
  """

  def __init__(self, feature_columns, expected_column_type, trainable, name,
               **kwargs):
    super(_BaseFeaturesLayer, self).__init__(
        name=name, trainable=trainable, **kwargs)
    self._feature_columns = _normalize_feature_columns(feature_columns)
    self._state_manager = _StateManagerImpl(self, self.trainable)
    for column in self._feature_columns:
      if not isinstance(column, expected_column_type):
        raise ValueError(
            'Items of feature_columns must be a {}. '
            'You can wrap a categorical column with an '
            'embedding_column or indicator_column. Given: {}'.format(
                expected_column_type, column))

  def build(self, _):
    for column in self._feature_columns:
      with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
          column.create_state(self._state_manager)
    super(_BaseFeaturesLayer, self).build(None)

  def _target_shape(self, input_shape, num_elements):
    """Computes expected output shape of the layer or a column's dense tensor.

    Args:
      input_shape: Tensor or array with batch shape.
      num_elements: Size of the last dimension of the output.

    Returns:
      Tuple with output shape.
    """
    raise NotImplementedError('Calling an abstract method.')

  def compute_output_shape(self, input_shape):
    total_elements = 0
    for column in self._feature_columns:
      total_elements += column.variable_shape.num_elements()
    return self._target_shape(input_shape, total_elements)

  def _process_dense_tensor(self, column, tensor):
    """Reshapes the dense tensor output of a column based on expected shape.

    Args:
      column: A DenseColumn or SequenceDenseColumn object.
      tensor: A dense tensor obtained from the same column.

    Returns:
      Reshaped dense tensor.
    """
    num_elements = column.variable_shape.num_elements()
    target_shape = self._target_shape(array_ops.shape(tensor), num_elements)
    return array_ops.reshape(tensor, shape=target_shape)

  def _verify_and_concat_tensors(self, output_tensors):
    """Verifies and concatenates the dense output of several columns."""
    _verify_static_batch_size_equality(output_tensors, self._feature_columns)
    return array_ops.concat(output_tensors, -1)


@keras_export('keras.layers.DenseFeatures')
class DenseFeatures(_BaseFeaturesLayer):
  """A layer that produces a dense `Tensor` based on given `feature_columns`.

  Generally a single example in training data is described with
  FeatureColumns. At the first layer of the model, this column-oriented data
  should be converted to a single `Tensor`.

  This layer can be called multiple times with different features.

  Example:

  ```python
  price = numeric_column('price')
  keywords_embedded = embedding_column(
      categorical_column_with_hash_bucket("keywords", 10K), dimension=16)
  columns = [price, keywords_embedded, ...]
  feature_layer = DenseFeatures(columns)

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = feature_layer(features)
  for units in [128, 64, 32]:
    dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu)
  prediction = tf.layers.dense(dense_tensor, 1)
  ```
  """

  def __init__(self,
               feature_columns,
               trainable=True,
               name=None,
               **kwargs):
    """Constructs a DenseFeatures layer.

    Args:
      feature_columns: An iterable containing the FeatureColumns to use as
        inputs to your model. All items should be instances of classes derived
        from `DenseColumn` such as `numeric_column`, `embedding_column`,
        `bucketized_column`, `indicator_column`. If you have categorical
        features, you can wrap them with an `embedding_column` or
        `indicator_column`.
      trainable: Boolean, whether the layer's variables will be updated via
        gradient descent during training.
      name: Name to give to the DenseFeatures.
      **kwargs: Keyword arguments to construct a layer.

    Raises:
      ValueError: if an item in `feature_columns` is not a `DenseColumn`.
    """
    super(DenseFeatures, self).__init__(
        feature_columns=feature_columns,
        trainable=trainable,
        name=name,
        expected_column_type=DenseColumn,
        **kwargs)

  @property
  def _is_feature_layer(self):
    return True

  def _target_shape(self, input_shape, total_elements):
    return (input_shape[0], total_elements)

  def call(self, features, cols_to_output_tensors=None):
    """Returns a dense tensor corresponding to the `feature_columns`.

    Args:
      features: A mapping from key to tensors. `FeatureColumn`s look up via
        these keys. For example `numeric_column('price')` will look at 'price'
        key in this dict. Values can be a `SparseTensor` or a `Tensor`
        depending on the corresponding `FeatureColumn`.
      cols_to_output_tensors: If not `None`, this will be filled with a dict
        mapping feature columns to output tensors created.

    Returns:
      A `Tensor` which represents the input layer of a model. Its shape
      is (batch_size, first_layer_dimension) and its dtype is `float32`.
      first_layer_dimension is determined based on given `feature_columns`.

    Raises:
      ValueError: If `features` is not a dictionary.
    """
    if not isinstance(features, dict):
      raise ValueError('We expected a dictionary here. Instead we got: ',
                       features)
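    # The cache memoizes each column's transformation so that work shared
    # between columns (e.g. a source column feeding several others) is
    # computed only once per call.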
    transformation_cache = FeatureTransformationCache(features)
    output_tensors = []
    for column in self._feature_columns:
      with ops.name_scope(column.name):
        tensor = column.get_dense_tensor(transformation_cache,
                                         self._state_manager)
        processed_tensors = self._process_dense_tensor(column, tensor)
        if cols_to_output_tensors is not None:
          cols_to_output_tensors[column] = processed_tensors
        output_tensors.append(processed_tensors)
    return self._verify_and_concat_tensors(output_tensors)


class _LinearModelLayer(Layer):
  """Layer that contains logic for `LinearModel`."""

  def __init__(self,
               feature_columns,
               units=1,
               sparse_combiner='sum',
               trainable=True,
               name=None,
               **kwargs):
    super(_LinearModelLayer, self).__init__(
        name=name, trainable=trainable, **kwargs)

    self._feature_columns = _normalize_feature_columns(feature_columns)
    for column in self._feature_columns:
      if not isinstance(column, (DenseColumn, CategoricalColumn)):
        raise ValueError(
            'Items of feature_columns must be either a '
            'DenseColumn or CategoricalColumn. Given: {}'.format(column))

    self._units = units
    self._sparse_combiner = sparse_combiner

    self._state_manager = _StateManagerImpl(self, self.trainable)
    self.bias = None

  def build(self, _):
    # We need variable scopes for now because we want the variable partitioning
    # information to percolate down. We also use _pure_variable_scope's here
    # since we want to open up a name_scope in the `call` method while creating
    # the ops.
    with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
      for column in self._feature_columns:
        with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
          # Create the state for each feature column
          column.create_state(self._state_manager)

          # Create a weight variable for each column.
          if isinstance(column, CategoricalColumn):
            first_dim = column.num_buckets
          else:
            first_dim = column.variable_shape.num_elements()
          self._state_manager.create_variable(
              column,
              name='weights',
              dtype=dtypes.float32,
              shape=(first_dim, self._units),
              initializer=init_ops.zeros_initializer(),
              trainable=self.trainable)

      # Create a bias variable.
      self.bias = self.add_variable(
          name='bias_weights',
          dtype=dtypes.float32,
          shape=[self._units],
          initializer=init_ops.zeros_initializer(),
          trainable=self.trainable,
          use_resource=True,
          # TODO(rohanj): Get rid of this hack once we have a mechanism for
          # specifying a default partitioner for an entire layer. In that case,
          # the default getter for Layers should work.
          getter=variable_scope.get_variable)

    super(_LinearModelLayer, self).build(None)

  def call(self, features):
    if not isinstance(features, dict):
      raise ValueError('We expected a dictionary here. Instead we got: {}'
                       .format(features))
    with ops.name_scope(self.name):
      transformation_cache = FeatureTransformationCache(features)
      weighted_sums = []
      for column in self._feature_columns:
        with ops.name_scope(column.name):
          # All the weights used in the linear model are owned by the state
          # manager associated with this Linear Model.
          weight_var = self._state_manager.get_variable(column, 'weights')

          weighted_sum = _create_weighted_sum(
              column=column,
              transformation_cache=transformation_cache,
              state_manager=self._state_manager,
              sparse_combiner=self._sparse_combiner,
              weight_var=weight_var)
          weighted_sums.append(weighted_sum)

      _verify_static_batch_size_equality(weighted_sums, self._feature_columns)
      predictions_no_bias = math_ops.add_n(
          weighted_sums, name='weighted_sum_no_bias')
      predictions = nn_ops.bias_add(
          predictions_no_bias, self.bias, name='weighted_sum')
      return predictions


@keras_export('keras.layers.LinearModel', v1=[])
class LinearModel(training.Model):
  """Produces a linear prediction `Tensor` based on given `feature_columns`.

  This layer generates a weighted sum based on output dimension `units`.
  Weighted sum refers to logits in classification problems. It refers to the
  prediction itself for linear regression problems.

  Note on supported columns: `LinearModel` treats categorical columns as
  `indicator_column`s. To be specific, assume the input as `SparseTensor` looks
  like:

  ```python
  shape = [2, 2]
  {
      [0, 0]: "a"
      [1, 0]: "b"
      [1, 1]: "c"
  }
  ```
  `linear_model` assigns weights for the presence of "a", "b", "c" implicitly,
  just like `indicator_column`, while `input_layer` explicitly requires
  wrapping each categorical column with an `embedding_column` or an
  `indicator_column`.

  Example of usage:

  ```python
  price = numeric_column('price')
  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
  keywords = categorical_column_with_hash_bucket("keywords", 10K)
  keywords_price = crossed_column(['keywords', price_buckets], ...)
  columns = [price_buckets, keywords, keywords_price, ...]
  linear_model = LinearModel(columns)

  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  prediction = linear_model(features)
  ```
  """

  def __init__(self,
               feature_columns,
               units=1,
               sparse_combiner='sum',
               trainable=True,
               name=None,
               **kwargs):
    """Constructs a LinearModel.

    Args:
      feature_columns: An iterable containing the FeatureColumns to use as
        inputs to your model. All items should be instances of classes derived
        from `_FeatureColumn`s.
      units: An integer, dimensionality of the output space. Default value is
        1.
      sparse_combiner: A string specifying how to reduce if a categorical
        column is multivalent. Except `numeric_column`, almost all columns
        passed to `linear_model` are considered as categorical columns. It
        combines each categorical column independently. Currently "mean",
        "sqrtn" and "sum" are supported, with "sum" the default for linear
        models. "sqrtn" often achieves good accuracy, in particular with
        bag-of-words columns.
636 * "sum": do not normalize features in the column 637 * "mean": do l1 normalization on features in the column 638 * "sqrtn": do l2 normalization on features in the column 639 For example, for two features represented as the categorical columns: 640 641 ```python 642 # Feature 1 643 644 shape = [2, 2] 645 { 646 [0, 0]: "a" 647 [0, 1]: "b" 648 [1, 0]: "c" 649 } 650 651 # Feature 2 652 653 shape = [2, 3] 654 { 655 [0, 0]: "d" 656 [1, 0]: "e" 657 [1, 1]: "f" 658 [1, 2]: "g" 659 } 660 ``` 661 662 with `sparse_combiner` as "mean", the linear model outputs conceptly are 663 ``` 664 y_0 = 1.0 / 2.0 * ( w_a + w_ b) + w_c + b_0 665 y_1 = w_d + 1.0 / 3.0 * ( w_e + w_ f + w_g) + b_1 666 ``` 667 where `y_i` is the output, `b_i` is the bias, and `w_x` is the weight 668 assigned to the presence of `x` in the input features. 669 trainable: If `True` also add the variable to the graph collection 670 `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`). 671 name: Name to give to the Linear Model. All variables and ops created will 672 be scoped by this name. 673 **kwargs: Keyword arguments to construct a layer. 674 675 Raises: 676 ValueError: if an item in `feature_columns` is neither a `DenseColumn` 677 nor `CategoricalColumn`. 678 """ 679 680 super(LinearModel, self).__init__(name=name, **kwargs) 681 self.layer = _LinearModelLayer( 682 feature_columns, 683 units, 684 sparse_combiner, 685 trainable, 686 name=self.name, 687 **kwargs) 688 689 def call(self, features): 690 """Returns a `Tensor` the represents the predictions of a linear model. 691 692 Args: 693 features: A mapping from key to tensors. `_FeatureColumn`s look up via 694 these keys. For example `numeric_column('price')` will look at 'price' 695 key in this dict. Values are `Tensor` or `SparseTensor` depending on 696 corresponding `_FeatureColumn`. 697 698 Returns: 699 A `Tensor` which represents predictions/logits of a linear model. Its 700 shape is (batch_size, units) and its dtype is `float32`. 701 702 Raises: 703 ValueError: If features are not a dictionary. 704 """ 705 return self.layer(features) 706 707 @property 708 def bias(self): 709 return self.layer.bias 710 711 712def _transform_features_v2(features, feature_columns, state_manager): 713 """Returns transformed features based on features columns passed in. 714 715 Please note that most probably you would not need to use this function. Please 716 check `input_layer` and `linear_model` to see whether they will 717 satisfy your use case or not. 718 719 Example: 720 721 ```python 722 # Define features and transformations 723 crosses_a_x_b = crossed_column( 724 columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000) 725 price_buckets = bucketized_column( 726 source_column=numeric_column("price"), boundaries=[...]) 727 728 columns = [crosses_a_x_b, price_buckets] 729 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 730 transformed = transform_features(features=features, feature_columns=columns) 731 732 assertCountEqual(columns, transformed.keys()) 733 ``` 734 735 Args: 736 features: A mapping from key to tensors. `FeatureColumn`s look up via these 737 keys. For example `numeric_column('price')` will look at 'price' key in 738 this dict. Values can be a `SparseTensor` or a `Tensor` depends on 739 corresponding `FeatureColumn`. 740 feature_columns: An iterable containing all the `FeatureColumn`s. 741 state_manager: A StateManager object that holds the FeatureColumn state. 

  Returns:
    A `dict` mapping `FeatureColumn` to `Tensor` and `SparseTensor` values.
  """
  feature_columns = _normalize_feature_columns(feature_columns)
  outputs = {}
  with ops.name_scope(
      None, default_name='transform_features', values=features.values()):
    transformation_cache = FeatureTransformationCache(features)
    for column in feature_columns:
      with ops.name_scope(None, default_name=column.name):
        outputs[column] = transformation_cache.get(column, state_manager)
  return outputs


@tf_export('feature_column.make_parse_example_spec', v1=[])
def make_parse_example_spec_v2(feature_columns):
  """Creates parsing spec dictionary from input feature_columns.

  The returned dictionary can be used as the `features` argument of
  `tf.parse_example`.

  Typical usage example:

  ```python
  # Define features and transformations
  feature_a = categorical_column_with_vocabulary_file(...)
  feature_b = numeric_column(...)
  feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...)
  feature_a_x_feature_c = crossed_column(
      columns=["feature_a", feature_c_bucketized], ...)

  feature_columns = set(
      [feature_b, feature_c_bucketized, feature_a_x_feature_c])
  features = tf.parse_example(
      serialized=serialized_examples,
      features=make_parse_example_spec(feature_columns))
  ```

  For the above example, make_parse_example_spec would return the dict:

  ```python
  {
      "feature_a": parsing_ops.VarLenFeature(tf.string),
      "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
      "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
  }
  ```

  Args:
    feature_columns: An iterable containing all feature columns. All items
      should be instances of classes derived from `FeatureColumn`.

  Returns:
    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
    value.

  Raises:
    ValueError: If any of the given `feature_columns` is not a `FeatureColumn`
      instance.
  """
  result = {}
  for column in feature_columns:
    if not isinstance(column, FeatureColumn):
      raise ValueError('All feature_columns must be FeatureColumn instances. '
                       'Given: {}'.format(column))
    config = column.parse_example_spec
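    # Several columns may read the same input key (e.g. a numeric column and
    # a bucketized version of it); their parse specs must agree.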
    for key, value in six.iteritems(config):
      if key in result and value != result[key]:
        raise ValueError(
            'feature_columns contain different parse_spec for key '
            '{}. Given {} and {}'.format(key, value, result[key]))
    result.update(config)
  return result


@tf_export('feature_column.embedding_column')
def embedding_column(categorical_column,
                     dimension,
                     combiner='mean',
                     initializer=None,
                     ckpt_to_load_from=None,
                     tensor_name_in_ckpt=None,
                     max_norm=None,
                     trainable=True):
  """`DenseColumn` that converts from sparse, categorical input.

  Use this when your inputs are sparse, but you want to convert them to a dense
  representation (e.g., to feed to a DNN).

  Inputs must be a `CategoricalColumn` created by any of the
  `categorical_column_*` functions. Here is an example of using
  `embedding_column` with `DNNClassifier`:

  ```python
  video_id = categorical_column_with_identity(
      key='video_id', num_buckets=1000000, default_value=0)
  columns = [embedding_column(video_id, 9),...]

  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)

  label_column = ...
  def input_fn():
    features = tf.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example using `embedding_column` with model_fn:

  ```python
  def model_fn(features, ...):
    video_id = categorical_column_with_identity(
        key='video_id', num_buckets=1000000, default_value=0)
    columns = [embedding_column(video_id, 9),...]
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```

  Args:
    categorical_column: A `CategoricalColumn` created by a
      `categorical_column_with_*` function. This column produces the sparse
      IDs that are inputs to the embedding lookup.
    dimension: An integer specifying dimension of the embedding, must be > 0.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column. For more information, see
      `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
      `1/sqrt(dimension)`.
    ckpt_to_load_from: String representing checkpoint name/pattern from which
      to restore column weights. Required if `tensor_name_in_ckpt` is not
      `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
      which to restore the column weights. Required if `ckpt_to_load_from` is
      not `None`.
    max_norm: If not `None`, embedding values are l2-normalized to this value.
    trainable: Whether or not the embedding is trainable. Default is True.

  Returns:
    `DenseColumn` that converts from sparse input.

  Raises:
    ValueError: if `dimension` not > 0.
    ValueError: if exactly one of `ckpt_to_load_from` and
      `tensor_name_in_ckpt` is specified.
    ValueError: if `initializer` is specified and is not callable.
    RuntimeError: If eager execution is enabled.
  """
  if (dimension is None) or (dimension < 1):
    raise ValueError('Invalid dimension {}.'.format(dimension))
  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
    raise ValueError('Must specify both `ckpt_to_load_from` and '
                     '`tensor_name_in_ckpt` or none of them.')

  if (initializer is not None) and (not callable(initializer)):
    raise ValueError('initializer must be callable if specified. '
                     'Embedding of column_name: {}'.format(
                         categorical_column.name))
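  # Default embedding initializer: truncated normal with stddev
  # 1/sqrt(dimension), a common scaling that keeps the expected norm of each
  # embedding vector roughly independent of the chosen dimension.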
  if initializer is None:
    initializer = init_ops.truncated_normal_initializer(
        mean=0.0, stddev=1 / math.sqrt(dimension))

  return EmbeddingColumn(
      categorical_column=categorical_column,
      dimension=dimension,
      combiner=combiner,
      initializer=initializer,
      ckpt_to_load_from=ckpt_to_load_from,
      tensor_name_in_ckpt=tensor_name_in_ckpt,
      max_norm=max_norm,
      trainable=trainable)


@tf_export(v1=['feature_column.shared_embedding_columns'])
def shared_embedding_columns(categorical_columns,
                             dimension,
                             combiner='mean',
                             initializer=None,
                             shared_embedding_collection_name=None,
                             ckpt_to_load_from=None,
                             tensor_name_in_ckpt=None,
                             max_norm=None,
                             trainable=True):
  """List of dense columns that convert from sparse, categorical input.

  This is similar to `embedding_column`, except that it produces a list of
  embedding columns that share the same embedding weights.

  Use this when your inputs are sparse and of the same type (e.g. watched and
  impression video IDs that share the same vocabulary), and you want to convert
  them to a dense representation (e.g., to feed to a DNN).

  Inputs must be a list of categorical columns created by any of the
  `categorical_column_*` functions. They must all be of the same type and have
  the same arguments except `key`. E.g. they can be
  categorical_column_with_vocabulary_file with the same vocabulary_file. Some
  or all columns could also be weighted_categorical_column.

  Here is an example embedding of two features for a DNNClassifier model:

  ```python
  watched_video_id = categorical_column_with_vocabulary_file(
      'watched_video_id', video_vocabulary_file, video_vocabulary_size)
  impression_video_id = categorical_column_with_vocabulary_file(
      'impression_video_id', video_vocabulary_file, video_vocabulary_size)
  columns = shared_embedding_columns(
      [watched_video_id, impression_video_id], dimension=10)

  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)

  label_column = ...
  def input_fn():
    features = tf.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example using `shared_embedding_columns` with model_fn:

  ```python
  def model_fn(features, ...):
    watched_video_id = categorical_column_with_vocabulary_file(
        'watched_video_id', video_vocabulary_file, video_vocabulary_size)
    impression_video_id = categorical_column_with_vocabulary_file(
        'impression_video_id', video_vocabulary_file, video_vocabulary_size)
    columns = shared_embedding_columns(
        [watched_video_id, impression_video_id], dimension=10)
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```

  Args:
    categorical_columns: List of categorical columns created by a
      `categorical_column_with_*` function. These columns produce the sparse
      IDs that are inputs to the embedding lookup. All columns must be of the
      same type and have the same arguments except `key`. E.g. they can be
      categorical_column_with_vocabulary_file with the same vocabulary_file.
      Some or all columns could also be weighted_categorical_column.
    dimension: An integer specifying dimension of the embedding, must be > 0.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column. For more information, see
      `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
      `1/sqrt(dimension)`.
    shared_embedding_collection_name: Optional name of the collection where
      shared embedding weights are added. If not given, a reasonable name will
      be chosen based on the names of `categorical_columns`. This is also used
      in `variable_scope` when creating shared embedding weights.
    ckpt_to_load_from: String representing checkpoint name/pattern from which
      to restore column weights. Required if `tensor_name_in_ckpt` is not
      `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
      which to restore the column weights. Required if `ckpt_to_load_from` is
      not `None`.
    max_norm: If not `None`, each embedding is clipped if its l2-norm is
      larger than this value, before combining.
    trainable: Whether or not the embedding is trainable. Default is True.

  Returns:
    A list of dense columns that converts from sparse input. The order of
    results follows the ordering of `categorical_columns`.

  Raises:
    ValueError: if `dimension` not > 0.
    ValueError: if any of the given `categorical_columns` is of different type
      or has different arguments than the others.
    ValueError: if exactly one of `ckpt_to_load_from` and
      `tensor_name_in_ckpt` is specified.
    ValueError: if `initializer` is specified and is not callable.
    RuntimeError: if eager execution is enabled.
  """
  if context.executing_eagerly():
    raise RuntimeError('shared_embedding_columns are not supported when eager '
                       'execution is enabled.')

  if (dimension is None) or (dimension < 1):
    raise ValueError('Invalid dimension {}.'.format(dimension))
  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
    raise ValueError('Must specify both `ckpt_to_load_from` and '
                     '`tensor_name_in_ckpt` or none of them.')

  if (initializer is not None) and (not callable(initializer)):
    raise ValueError('initializer must be callable if specified.')
  if initializer is None:
    initializer = init_ops.truncated_normal_initializer(
        mean=0.0, stddev=1. / math.sqrt(dimension))

  # Sort the columns so the default collection name is deterministic even if
  # the user passes columns from an unsorted collection, such as dict.values().
  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
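  # All columns must share the same type and bucket count; a
  # weighted_categorical_column is checked via the column it wraps.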
  c0 = sorted_columns[0]
  num_buckets = c0._num_buckets  # pylint: disable=protected-access
  if not isinstance(c0, fc_old._CategoricalColumn):  # pylint: disable=protected-access
    raise ValueError(
        'All categorical_columns must be subclasses of _CategoricalColumn. '
        'Given: {}, of type: {}'.format(c0, type(c0)))
  if isinstance(c0,
                (fc_old._WeightedCategoricalColumn, WeightedCategoricalColumn)):  # pylint: disable=protected-access
    c0 = c0.categorical_column
  for c in sorted_columns[1:]:
    if isinstance(
        c, (fc_old._WeightedCategoricalColumn, WeightedCategoricalColumn)):  # pylint: disable=protected-access
      c = c.categorical_column
    if not isinstance(c, type(c0)):
      raise ValueError(
          'To use shared_embedding_column, all categorical_columns must have '
          'the same type, or be weighted_categorical_column of the same type. '
          'Given column: {} of type: {} does not match given column: {} of '
          'type: {}'.format(c0, type(c0), c, type(c)))
    if num_buckets != c._num_buckets:  # pylint: disable=protected-access
      raise ValueError(
          'To use shared_embedding_column, all categorical_columns must have '
          'the same number of buckets. Given column: {} with buckets: {} does '
          'not match column: {} with buckets: {}'.format(
              c0, num_buckets, c, c._num_buckets))  # pylint: disable=protected-access

  if not shared_embedding_collection_name:
    shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
    shared_embedding_collection_name += '_shared_embedding'

  result = []
  for column in categorical_columns:
    result.append(
        fc_old._SharedEmbeddingColumn(  # pylint: disable=protected-access
            categorical_column=column,
            initializer=initializer,
            dimension=dimension,
            combiner=combiner,
            shared_embedding_collection_name=shared_embedding_collection_name,
            ckpt_to_load_from=ckpt_to_load_from,
            tensor_name_in_ckpt=tensor_name_in_ckpt,
            max_norm=max_norm,
            trainable=trainable))

  return result


@tf_export('feature_column.shared_embeddings', v1=[])
def shared_embedding_columns_v2(categorical_columns,
                                dimension,
                                combiner='mean',
                                initializer=None,
                                shared_embedding_collection_name=None,
                                ckpt_to_load_from=None,
                                tensor_name_in_ckpt=None,
                                max_norm=None,
                                trainable=True):
  """List of dense columns that convert from sparse, categorical input.

  This is similar to `embedding_column`, except that it produces a list of
  embedding columns that share the same embedding weights.

  Use this when your inputs are sparse and of the same type (e.g. watched and
  impression video IDs that share the same vocabulary), and you want to convert
  them to a dense representation (e.g., to feed to a DNN).

  Inputs must be a list of categorical columns created by any of the
  `categorical_column_*` functions. They must all be of the same type and have
  the same arguments except `key`. E.g. they can be
  categorical_column_with_vocabulary_file with the same vocabulary_file. Some
  or all columns could also be weighted_categorical_column.

  Here is an example embedding of two features for a DNNClassifier model:

  ```python
  watched_video_id = categorical_column_with_vocabulary_file(
      'watched_video_id', video_vocabulary_file, video_vocabulary_size)
  impression_video_id = categorical_column_with_vocabulary_file(
      'impression_video_id', video_vocabulary_file, video_vocabulary_size)
  columns = shared_embedding_columns(
      [watched_video_id, impression_video_id], dimension=10)

  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)

  label_column = ...
  def input_fn():
    features = tf.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example using `shared_embedding_columns` with model_fn:

  ```python
  def model_fn(features, ...):
    watched_video_id = categorical_column_with_vocabulary_file(
        'watched_video_id', video_vocabulary_file, video_vocabulary_size)
    impression_video_id = categorical_column_with_vocabulary_file(
        'impression_video_id', video_vocabulary_file, video_vocabulary_size)
    columns = shared_embedding_columns(
        [watched_video_id, impression_video_id], dimension=10)
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```

  Args:
    categorical_columns: List of categorical columns created by a
      `categorical_column_with_*` function. These columns produce the sparse
      IDs that are inputs to the embedding lookup. All columns must be of the
      same type and have the same arguments except `key`. E.g. they can be
      categorical_column_with_vocabulary_file with the same vocabulary_file.
      Some or all columns could also be weighted_categorical_column.
    dimension: An integer specifying dimension of the embedding, must be > 0.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column. For more information, see
      `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
      `1/sqrt(dimension)`.
    shared_embedding_collection_name: Optional collective name of these
      columns. If not given, a reasonable name will be chosen based on the
      names of `categorical_columns`.
    ckpt_to_load_from: String representing checkpoint name/pattern from which
      to restore column weights. Required if `tensor_name_in_ckpt` is not
      `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
      which to restore the column weights. Required if `ckpt_to_load_from` is
      not `None`.
    max_norm: If not `None`, each embedding is clipped if its l2-norm is
      larger than this value, before combining.
    trainable: Whether or not the embedding is trainable. Default is True.

  Returns:
    A list of dense columns that converts from sparse input. The order of
    results follows the ordering of `categorical_columns`.

  Raises:
    ValueError: if `dimension` not > 0.
    ValueError: if any of the given `categorical_columns` is of different type
      or has different arguments than the others.
    ValueError: if exactly one of `ckpt_to_load_from` and
      `tensor_name_in_ckpt` is specified.
    ValueError: if `initializer` is specified and is not callable.
    RuntimeError: if eager execution is enabled.
1199 """ 1200 if context.executing_eagerly(): 1201 raise RuntimeError('shared_embedding_columns are not supported when eager ' 1202 'execution is enabled.') 1203 1204 if (dimension is None) or (dimension < 1): 1205 raise ValueError('Invalid dimension {}.'.format(dimension)) 1206 if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None): 1207 raise ValueError('Must specify both `ckpt_to_load_from` and ' 1208 '`tensor_name_in_ckpt` or none of them.') 1209 1210 if (initializer is not None) and (not callable(initializer)): 1211 raise ValueError('initializer must be callable if specified.') 1212 if initializer is None: 1213 initializer = init_ops.truncated_normal_initializer( 1214 mean=0.0, stddev=1. / math.sqrt(dimension)) 1215 1216 # Sort the columns so the default collection name is deterministic even if the 1217 # user passes columns from an unsorted collection, such as dict.values(). 1218 sorted_columns = sorted(categorical_columns, key=lambda x: x.name) 1219 1220 c0 = sorted_columns[0] 1221 num_buckets = c0.num_buckets 1222 if not isinstance(c0, CategoricalColumn): 1223 raise ValueError( 1224 'All categorical_columns must be subclasses of CategoricalColumn. ' 1225 'Given: {}, of type: {}'.format(c0, type(c0))) 1226 if isinstance(c0, WeightedCategoricalColumn): 1227 c0 = c0.categorical_column 1228 for c in sorted_columns[1:]: 1229 if isinstance(c, WeightedCategoricalColumn): 1230 c = c.categorical_column 1231 if not isinstance(c, type(c0)): 1232 raise ValueError( 1233 'To use shared_embedding_column, all categorical_columns must have ' 1234 'the same type, or be weighted_categorical_column of the same type. ' 1235 'Given column: {} of type: {} does not match given column: {} of ' 1236 'type: {}'.format(c0, type(c0), c, type(c))) 1237 if num_buckets != c.num_buckets: 1238 raise ValueError( 1239 'To use shared_embedding_column, all categorical_columns must have ' 1240 'the same number of buckets. Given column: {} with buckets: {} does ' 1241 'not match column: {} with buckets: {}'.format( 1242 c0, num_buckets, c, c.num_buckets)) 1243 1244 if not shared_embedding_collection_name: 1245 shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns) 1246 shared_embedding_collection_name += '_shared_embedding' 1247 1248 column_creator = SharedEmbeddingColumnCreator( 1249 dimension, initializer, ckpt_to_load_from, tensor_name_in_ckpt, 1250 num_buckets, trainable, shared_embedding_collection_name) 1251 1252 result = [] 1253 for column in categorical_columns: 1254 result.append( 1255 column_creator( 1256 categorical_column=column, combiner=combiner, max_norm=max_norm)) 1257 1258 return result 1259 1260 1261@tf_export('feature_column.numeric_column') 1262def numeric_column(key, 1263 shape=(1,), 1264 default_value=None, 1265 dtype=dtypes.float32, 1266 normalizer_fn=None): 1267 """Represents real valued or numerical features. 1268 1269 Example: 1270 1271 ```python 1272 price = numeric_column('price') 1273 columns = [price, ...] 1274 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1275 dense_tensor = input_layer(features, columns) 1276 1277 # or 1278 bucketized_price = bucketized_column(price, boundaries=[...]) 1279 columns = [bucketized_price, ...] 1280 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1281 linear_prediction = linear_model(features, columns) 1282 ``` 1283 1284 Args: 1285 key: A unique string identifying the input feature. 
  column_creator = SharedEmbeddingColumnCreator(
      dimension, initializer, ckpt_to_load_from, tensor_name_in_ckpt,
      num_buckets, trainable, shared_embedding_collection_name)

  result = []
  for column in categorical_columns:
    result.append(
        column_creator(
            categorical_column=column, combiner=combiner, max_norm=max_norm))

  return result


@tf_export('feature_column.numeric_column')
def numeric_column(key,
                   shape=(1,),
                   default_value=None,
                   dtype=dtypes.float32,
                   normalizer_fn=None):
  """Represents real valued or numerical features.

  Example:

  ```python
  price = numeric_column('price')
  columns = [price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)

  # or
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    shape: An iterable of integers specifying the shape of the `Tensor`. An
      integer can be given, which means a single-dimension `Tensor` with the
      given width. The `Tensor` representing the column will have the shape of
      [batch_size] + `shape`.
    default_value: A single value compatible with `dtype` or an iterable of
      values compatible with `dtype` which the column takes on during
      `tf.Example` parsing if data is missing. A default value of `None` will
      cause `tf.parse_example` to fail if an example does not contain this
      column. If a single value is provided, the same value will be applied as
      the default value for every item. If an iterable of values is provided,
      the shape of the `default_value` should be equal to the given `shape`.
    dtype: defines the type of values. Default value is `tf.float32`. Must be
      a non-quantized, real integer or floating point type.
    normalizer_fn: If not `None`, a function that can be used to normalize the
      value of the tensor after `default_value` is applied for parsing. The
      normalizer function takes the input `Tensor` as its argument, and
      returns the output `Tensor` (e.g. lambda x: (x - 3.0) / 4.2). Please
      note that even though the most common use case of this function is
      normalization, it can be used for any kind of TensorFlow transformation.

  Returns:
    A `NumericColumn`.

  Raises:
    TypeError: if any dimension in shape is not an int
    ValueError: if any dimension in shape is not a positive integer
    TypeError: if `default_value` is an iterable but not compatible with
      `shape`
    TypeError: if `default_value` is not compatible with `dtype`.
    ValueError: if `dtype` is not convertible to `tf.float32`.
  """
  shape = _check_shape(shape, key)
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError('dtype must be convertible to float. '
                     'dtype: {}, key: {}'.format(dtype, key))
  default_value = fc_utils.check_default_value(
      shape, default_value, dtype, key)

  if normalizer_fn is not None and not callable(normalizer_fn):
    raise TypeError(
        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))

  fc_utils.assert_key_is_string(key)
  return NumericColumn(
      key,
      shape=shape,
      default_value=default_value,
      dtype=dtype,
      normalizer_fn=normalizer_fn)


@tf_export('feature_column.bucketized_column')
def bucketized_column(source_column, boundaries):
  """Represents discretized dense input.

  Buckets include the left boundary, and exclude the right boundary. Namely,
  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
  `[1., 2.)`, and `[2., +inf)`.

  For example, if the inputs are

  ```python
  boundaries = [0, 10, 100]
  input tensor = [[-5, 10000]
                  [150, 10]
                  [5, 100]]
  ```

  then the output will be

  ```python
  output = [[0, 3]
            [3, 2]
            [1, 3]]
  ```

  Example:

  ```python
  price = numeric_column('price')
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  columns = [bucketized_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  `bucketized_column` can also be crossed with another categorical column
  using `crossed_column`:

  ```python
  price = numeric_column('price')
  # bucketized_column converts numerical feature to a categorical one.
  bucketized_price = bucketized_column(price, boundaries=[...])
  # 'keywords' is a string feature.
  price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50K)
  columns = [price_x_keywords, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Args:
    source_column: A one-dimensional dense column which is generated with
      `numeric_column`.
    boundaries: A sorted list or tuple of floats specifying the boundaries.

  Returns:
    A `BucketizedColumn`.

  Raises:
    ValueError: If `source_column` is not a numeric column, or if it is not
      one-dimensional.
    ValueError: If `boundaries` is not a sorted list or tuple.
  """
  if not isinstance(source_column, (NumericColumn, fc_old._NumericColumn)):  # pylint: disable=protected-access
    raise ValueError(
        'source_column must be a column generated with numeric_column(). '
        'Given: {}'.format(source_column))
  if len(source_column.shape) > 1:
    raise ValueError(
        'source_column must be one-dimensional column. '
        'Given: {}'.format(source_column))
  if not boundaries:
    raise ValueError('boundaries must not be empty.')
  if not (isinstance(boundaries, list) or isinstance(boundaries, tuple)):
    raise ValueError('boundaries must be a sorted list.')
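  # Boundaries must be strictly increasing; a repeated boundary would create
  # an empty bucket.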
  for i in range(len(boundaries) - 1):
    if boundaries[i] >= boundaries[i + 1]:
      raise ValueError('boundaries must be a sorted list.')
  return BucketizedColumn(source_column, tuple(boundaries))


@tf_export('feature_column.categorical_column_with_hash_bucket')
def categorical_column_with_hash_bucket(key,
                                        hash_bucket_size,
                                        dtype=dtypes.string):
  """Represents a sparse feature where ids are set by hashing.

  Use this when your sparse features are in string or integer format, and you
  want to distribute your inputs into a finite number of buckets by hashing:
  output_id = Hash(input_feature_string) % bucket_size for string type input.
  For int type input, the value is converted to its string representation
  first and then hashed by the same formula.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example:

  ```python
  keywords = categorical_column_with_hash_bucket("keywords", 10K)
  columns = [keywords, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  keywords_embedded = embedding_column(keywords, 16)
  columns = [keywords_embedded, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    hash_bucket_size: An int >= 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `HashedCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not at least 1.
    ValueError: `dtype` is neither string nor integer.
  """
  if hash_bucket_size is None:
    raise ValueError('hash_bucket_size must be set. key: {}'.format(key))

  if hash_bucket_size < 1:
    raise ValueError('hash_bucket_size must be at least 1. '
                     'hash_bucket_size: {}, key: {}'.format(
                         hash_bucket_size, key))

  fc_utils.assert_key_is_string(key)
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))

  return HashedCategoricalColumn(key, hash_bucket_size, dtype)
1450 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1451 dense_tensor = input_layer(features, columns) 1452 ``` 1453 1454 Args: 1455 key: A unique string identifying the input feature. It is used as the 1456 column name and the dictionary key for feature parsing configs, feature 1457 `Tensor` objects, and feature columns. 1458 hash_bucket_size: An int > 1. The number of buckets. 1459 dtype: The type of features. Only string and integer types are supported. 1460 1461 Returns: 1462 A `HashedCategoricalColumn`. 1463 1464 Raises: 1465 ValueError: `hash_bucket_size` is not greater than 1. 1466 ValueError: `dtype` is neither string nor integer. 1467 """ 1468 if hash_bucket_size is None: 1469 raise ValueError('hash_bucket_size must be set. ' 'key: {}'.format(key)) 1470 1471 if hash_bucket_size < 1: 1472 raise ValueError('hash_bucket_size must be at least 1. ' 1473 'hash_bucket_size: {}, key: {}'.format( 1474 hash_bucket_size, key)) 1475 1476 fc_utils.assert_key_is_string(key) 1477 fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key)) 1478 1479 return HashedCategoricalColumn(key, hash_bucket_size, dtype) 1480 1481 1482@tf_export(v1=['feature_column.categorical_column_with_vocabulary_file']) 1483def categorical_column_with_vocabulary_file(key, 1484 vocabulary_file, 1485 vocabulary_size=None, 1486 num_oov_buckets=0, 1487 default_value=None, 1488 dtype=dtypes.string): 1489 """A `CategoricalColumn` with a vocabulary file. 1490 1491 Use this when your inputs are in string or integer format, and you have a 1492 vocabulary file that maps each value to an integer ID. By default, 1493 out-of-vocabulary values are ignored. Use either (but not both) of 1494 `num_oov_buckets` and `default_value` to specify how to include 1495 out-of-vocabulary values. 1496 1497 For input dictionary `features`, `features[key]` is either `Tensor` or 1498 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int 1499 and `''` for string, which will be dropped by this feature column. 1500 1501 Example with `num_oov_buckets`: 1502 File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state 1503 abbreviation. All inputs with values in that file are assigned an ID 0-49, 1504 corresponding to its line number. All other values are hashed and assigned an 1505 ID 50-54. 1506 1507 ```python 1508 states = categorical_column_with_vocabulary_file( 1509 key='states', vocabulary_file='/us/states.txt', vocabulary_size=50, 1510 num_oov_buckets=5) 1511 columns = [states, ...] 1512 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1513 linear_prediction = linear_model(features, columns) 1514 ``` 1515 1516 Example with `default_value`: 1517 File '/us/states.txt' contains 51 lines - the first line is 'XX', and the 1518 other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX' 1519 in input, and other values missing from the file, will be assigned ID 0. All 1520 others are assigned the corresponding line number 1-50. 1521 1522 ```python 1523 states = categorical_column_with_vocabulary_file( 1524 key='states', vocabulary_file='/us/states.txt', vocabulary_size=51, 1525 default_value=0) 1526 columns = [states, ...] 1527 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1528 linear_prediction, _, _ = linear_model(features, columns) 1529 ``` 1530 1531 And to make an embedding with either: 1532 1533 ```python 1534 columns = [embedding_column(states, 3),...] 
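# Every category ID (including any OOV-bucket or default_value ID) maps to
# a trainable 3-dimensional row of the embedding table.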
1535 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1536 dense_tensor = input_layer(features, columns) 1537 ``` 1538 1539 Args: 1540 key: A unique string identifying the input feature. It is used as the 1541 column name and the dictionary key for feature parsing configs, feature 1542 `Tensor` objects, and feature columns. 1543 vocabulary_file: The vocabulary file name. 1544 vocabulary_size: Number of the elements in the vocabulary. This must be no 1545 greater than length of `vocabulary_file`, if less than length, later 1546 values are ignored. If None, it is set to the length of `vocabulary_file`. 1547 num_oov_buckets: Non-negative integer, the number of out-of-vocabulary 1548 buckets. All out-of-vocabulary inputs will be assigned IDs in the range 1549 `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of 1550 the input value. A positive `num_oov_buckets` can not be specified with 1551 `default_value`. 1552 default_value: The integer ID value to return for out-of-vocabulary feature 1553 values, defaults to `-1`. This can not be specified with a positive 1554 `num_oov_buckets`. 1555 dtype: The type of features. Only string and integer types are supported. 1556 1557 Returns: 1558 A `CategoricalColumn` with a vocabulary file. 1559 1560 Raises: 1561 ValueError: `vocabulary_file` is missing or cannot be opened. 1562 ValueError: `vocabulary_size` is missing or < 1. 1563 ValueError: `num_oov_buckets` is a negative integer. 1564 ValueError: `num_oov_buckets` and `default_value` are both specified. 1565 ValueError: `dtype` is neither string nor integer. 1566 """ 1567 return categorical_column_with_vocabulary_file_v2( 1568 key, vocabulary_file, vocabulary_size, 1569 dtype, default_value, 1570 num_oov_buckets) 1571 1572 1573@tf_export('feature_column.categorical_column_with_vocabulary_file', v1=[]) 1574def categorical_column_with_vocabulary_file_v2(key, 1575 vocabulary_file, 1576 vocabulary_size=None, 1577 dtype=dtypes.string, 1578 default_value=None, 1579 num_oov_buckets=0): 1580 """A `CategoricalColumn` with a vocabulary file. 1581 1582 Use this when your inputs are in string or integer format, and you have a 1583 vocabulary file that maps each value to an integer ID. By default, 1584 out-of-vocabulary values are ignored. Use either (but not both) of 1585 `num_oov_buckets` and `default_value` to specify how to include 1586 out-of-vocabulary values. 1587 1588 For input dictionary `features`, `features[key]` is either `Tensor` or 1589 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int 1590 and `''` for string, which will be dropped by this feature column. 1591 1592 Example with `num_oov_buckets`: 1593 File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state 1594 abbreviation. All inputs with values in that file are assigned an ID 0-49, 1595 corresponding to its line number. All other values are hashed and assigned an 1596 ID 50-54. 1597 1598 ```python 1599 states = categorical_column_with_vocabulary_file( 1600 key='states', vocabulary_file='/us/states.txt', vocabulary_size=50, 1601 num_oov_buckets=5) 1602 columns = [states, ...] 1603 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1604 linear_prediction = linear_model(features, columns) 1605 ``` 1606 1607 Example with `default_value`: 1608 File '/us/states.txt' contains 51 lines - the first line is 'XX', and the 1609 other 50 each have a 2-character U.S. state abbreviation. 
Both a literal 'XX' 1610 in input, and other values missing from the file, will be assigned ID 0. All 1611 others are assigned the corresponding line number 1-50. 1612 1613 ```python 1614 states = categorical_column_with_vocabulary_file( 1615 key='states', vocabulary_file='/us/states.txt', vocabulary_size=51, 1616 default_value=0) 1617 columns = [states, ...] 1618 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1619 linear_prediction, _, _ = linear_model(features, columns) 1620 ``` 1621 1622 And to make an embedding with either: 1623 1624 ```python 1625 columns = [embedding_column(states, 3),...] 1626 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1627 dense_tensor = input_layer(features, columns) 1628 ``` 1629 1630 Args: 1631 key: A unique string identifying the input feature. It is used as the 1632 column name and the dictionary key for feature parsing configs, feature 1633 `Tensor` objects, and feature columns. 1634 vocabulary_file: The vocabulary file name. 1635 vocabulary_size: Number of the elements in the vocabulary. This must be no 1636 greater than length of `vocabulary_file`, if less than length, later 1637 values are ignored. If None, it is set to the length of `vocabulary_file`. 1638 dtype: The type of features. Only string and integer types are supported. 1639 default_value: The integer ID value to return for out-of-vocabulary feature 1640 values, defaults to `-1`. This can not be specified with a positive 1641 `num_oov_buckets`. 1642 num_oov_buckets: Non-negative integer, the number of out-of-vocabulary 1643 buckets. All out-of-vocabulary inputs will be assigned IDs in the range 1644 `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of 1645 the input value. A positive `num_oov_buckets` can not be specified with 1646 `default_value`. 1647 1648 Returns: 1649 A `CategoricalColumn` with a vocabulary file. 1650 1651 Raises: 1652 ValueError: `vocabulary_file` is missing or cannot be opened. 1653 ValueError: `vocabulary_size` is missing or < 1. 1654 ValueError: `num_oov_buckets` is a negative integer. 1655 ValueError: `num_oov_buckets` and `default_value` are both specified. 1656 ValueError: `dtype` is neither string nor integer. 1657 """ 1658 if not vocabulary_file: 1659 raise ValueError('Missing vocabulary_file in {}.'.format(key)) 1660 1661 if vocabulary_size is None: 1662 if not gfile.Exists(vocabulary_file): 1663 raise ValueError('vocabulary_file in {} does not exist.'.format(key)) 1664 1665 with gfile.GFile(vocabulary_file) as f: 1666 vocabulary_size = sum(1 for _ in f) 1667 logging.info( 1668 'vocabulary_size = %d in %s is inferred from the number of elements ' 1669 'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file) 1670 1671 # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`. 
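# Validate the (possibly inferred) vocabulary_size, and reject the mutually
# exclusive combination of num_oov_buckets and default_value.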
1672 if vocabulary_size < 1: 1673 raise ValueError('Invalid vocabulary_size in {}.'.format(key)) 1674 if num_oov_buckets: 1675 if default_value is not None: 1676 raise ValueError( 1677 'Can\'t specify both num_oov_buckets and default_value in {}.'.format( 1678 key)) 1679 if num_oov_buckets < 0: 1680 raise ValueError('Invalid num_oov_buckets {} in {}.'.format( 1681 num_oov_buckets, key)) 1682 fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key)) 1683 fc_utils.assert_key_is_string(key) 1684 return VocabularyFileCategoricalColumn( 1685 key=key, 1686 vocabulary_file=vocabulary_file, 1687 vocabulary_size=vocabulary_size, 1688 num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets, 1689 default_value=-1 if default_value is None else default_value, 1690 dtype=dtype) 1691 1692 1693@tf_export('feature_column.categorical_column_with_vocabulary_list') 1694def categorical_column_with_vocabulary_list(key, 1695 vocabulary_list, 1696 dtype=None, 1697 default_value=-1, 1698 num_oov_buckets=0): 1699 """A `CategoricalColumn` with in-memory vocabulary. 1700 1701 Use this when your inputs are in string or integer format, and you have an 1702 in-memory vocabulary mapping each value to an integer ID. By default, 1703 out-of-vocabulary values are ignored. Use either (but not both) of 1704 `num_oov_buckets` and `default_value` to specify how to include 1705 out-of-vocabulary values. 1706 1707 For input dictionary `features`, `features[key]` is either `Tensor` or 1708 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int 1709 and `''` for string, which will be dropped by this feature column. 1710 1711 Example with `num_oov_buckets`: 1712 In the following example, each input in `vocabulary_list` is assigned an ID 1713 0-3 corresponding to its index (e.g., input 'B' produces output 2). All other 1714 inputs are hashed and assigned an ID 4-5. 1715 1716 ```python 1717 colors = categorical_column_with_vocabulary_list( 1718 key='colors', vocabulary_list=('R', 'G', 'B', 'Y'), 1719 num_oov_buckets=2) 1720 columns = [colors, ...] 1721 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1722 linear_prediction, _, _ = linear_model(features, columns) 1723 ``` 1724 1725 Example with `default_value`: 1726 In the following example, each input in `vocabulary_list` is assigned an ID 1727 0-4 corresponding to its index (e.g., input 'B' produces output 3). All other 1728 inputs are assigned `default_value` 0. 1729 1730 1731 ```python 1732 colors = categorical_column_with_vocabulary_list( 1733 key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0) 1734 columns = [colors, ...] 1735 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1736 linear_prediction, _, _ = linear_model(features, columns) 1737 ``` 1738 1739 And to make an embedding with either: 1740 1741 ```python 1742 columns = [embedding_column(colors, 3),...] 1743 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1744 dense_tensor = input_layer(features, columns) 1745 ``` 1746 1747 Args: 1748 key: A unique string identifying the input feature. It is used as the column 1749 name and the dictionary key for feature parsing configs, feature `Tensor` 1750 objects, and feature columns. 1751 vocabulary_list: An ordered iterable defining the vocabulary. Each feature 1752 is mapped to the index of its value (if present) in `vocabulary_list`. 1753 Must be castable to `dtype`. 1754 dtype: The type of features. 
Only string and integer types are supported. If 1755 `None`, it will be inferred from `vocabulary_list`. 1756 default_value: The integer ID value to return for out-of-vocabulary feature 1757 values, defaults to `-1`. This can not be specified with a positive 1758 `num_oov_buckets`. 1759 num_oov_buckets: Non-negative integer, the number of out-of-vocabulary 1760 buckets. All out-of-vocabulary inputs will be assigned IDs in the range 1761 `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a 1762 hash of the input value. A positive `num_oov_buckets` can not be specified 1763 with `default_value`. 1764 1765 Returns: 1766 A `CategoricalColumn` with in-memory vocabulary. 1767 1768 Raises: 1769 ValueError: if `vocabulary_list` is empty, or contains duplicate keys. 1770 ValueError: `num_oov_buckets` is a negative integer. 1771 ValueError: `num_oov_buckets` and `default_value` are both specified. 1772 ValueError: if `dtype` is not integer or string. 1773 """ 1774 if (vocabulary_list is None) or (len(vocabulary_list) < 1): 1775 raise ValueError( 1776 'vocabulary_list {} must be non-empty, column_name: {}'.format( 1777 vocabulary_list, key)) 1778 if len(set(vocabulary_list)) != len(vocabulary_list): 1779 raise ValueError( 1780 'Duplicate keys in vocabulary_list {}, column_name: {}'.format( 1781 vocabulary_list, key)) 1782 vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype) 1783 if num_oov_buckets: 1784 if default_value != -1: 1785 raise ValueError( 1786 'Can\'t specify both num_oov_buckets and default_value in {}.'.format( 1787 key)) 1788 if num_oov_buckets < 0: 1789 raise ValueError('Invalid num_oov_buckets {} in {}.'.format( 1790 num_oov_buckets, key)) 1791 fc_utils.assert_string_or_int( 1792 vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key)) 1793 if dtype is None: 1794 dtype = vocabulary_dtype 1795 elif dtype.is_integer != vocabulary_dtype.is_integer: 1796 raise ValueError( 1797 'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format( 1798 dtype, vocabulary_dtype, key)) 1799 fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key)) 1800 fc_utils.assert_key_is_string(key) 1801 1802 return VocabularyListCategoricalColumn( 1803 key=key, 1804 vocabulary_list=tuple(vocabulary_list), 1805 dtype=dtype, 1806 default_value=default_value, 1807 num_oov_buckets=num_oov_buckets) 1808 1809 1810@tf_export('feature_column.categorical_column_with_identity') 1811def categorical_column_with_identity(key, num_buckets, default_value=None): 1812 """A `CategoricalColumn` that returns identity values. 1813 1814 Use this when your inputs are integers in the range `[0, num_buckets)`, and 1815 you want to use the input value itself as the categorical ID. Values outside 1816 this range will result in `default_value` if specified, otherwise it will 1817 fail. 1818 1819 Typically, this is used for contiguous ranges of integer indexes, but 1820 it doesn't have to be. This might be inefficient, however, if many of IDs 1821 are unused. Consider `categorical_column_with_hash_bucket` in that case. 1822 1823 For input dictionary `features`, `features[key]` is either `Tensor` or 1824 `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int 1825 and `''` for string, which will be dropped by this feature column. 1826 1827 In the following examples, each input in the range `[0, 1000000)` is assigned 1828 the same value. All other inputs are assigned `default_value` 0. 
Note that a
1829 literal 0 in inputs will result in the same default ID.
1830
1831 Linear model:
1832
1833 ```python
1834 video_id = categorical_column_with_identity(
1835 key='video_id', num_buckets=1000000, default_value=0)
1836 columns = [video_id, ...]
1837 features = tf.parse_example(..., features=make_parse_example_spec(columns))
1838 linear_prediction, _, _ = linear_model(features, columns)
1839 ```
1840
1841 Embedding for a DNN model:
1842
1843 ```python
1844 columns = [embedding_column(video_id, 9),...]
1845 features = tf.parse_example(..., features=make_parse_example_spec(columns))
1846 dense_tensor = input_layer(features, columns)
1847 ```
1848
1849 Args:
1850 key: A unique string identifying the input feature. It is used as the
1851 column name and the dictionary key for feature parsing configs, feature
1852 `Tensor` objects, and feature columns.
1853 num_buckets: Range of inputs and outputs is `[0, num_buckets)`.
1854 default_value: If `None`, this column's graph operations will fail for
1855 out-of-range inputs. Otherwise, this value must be in the range
1856 `[0, num_buckets)`, and out-of-range inputs will be replaced with it.
1857
1858 Returns:
1859 A `CategoricalColumn` that returns identity values.
1860
1861 Raises:
1862 ValueError: if `num_buckets` is less than one.
1863 ValueError: if `default_value` is not in range `[0, num_buckets)`.
1864 """
1865 if num_buckets < 1:
1866 raise ValueError(
1867 'num_buckets {} < 1, column_name {}'.format(num_buckets, key))
1868 if (default_value is not None) and (
1869 (default_value < 0) or (default_value >= num_buckets)):
1870 raise ValueError(
1871 'default_value {} not in range [0, {}), column_name {}'.format(
1872 default_value, num_buckets, key))
1873 fc_utils.assert_key_is_string(key)
1874 return IdentityCategoricalColumn(
1875 key=key, number_buckets=num_buckets, default_value=default_value)
1876
1877
1878@tf_export('feature_column.indicator_column')
1879def indicator_column(categorical_column):
1880 """Represents the multi-hot representation of a given categorical column.
1881
1882 - For DNN models, `indicator_column` can be used to wrap any
1883 `categorical_column_*` (e.g., to feed to a DNN). Consider using
1884 `embedding_column` if the number of buckets/unique values is large.
1885
1886 - For wide (aka linear) models, `indicator_column` is the internal
1887 representation used when a categorical column is passed directly
1888 (as an element of feature_columns) to `linear_model`. See
1889 `linear_model` for details.
1890
1891 ```python
1892 name = indicator_column(categorical_column_with_vocabulary_list(
1893 'name', ['bob', 'george', 'wanda']))
1894 columns = [name, ...]
1895 features = tf.parse_example(..., features=make_parse_example_spec(columns))
1896 dense_tensor = input_layer(features, columns)
1897
1898 dense_tensor == [[1, 0, 0]] # If "name" bytes_list is ["bob"]
1899 dense_tensor == [[1, 0, 1]] # If "name" bytes_list is ["bob", "wanda"]
1900 dense_tensor == [[2, 0, 0]] # If "name" bytes_list is ["bob", "bob"]
1901 ```
1902
1903 Args:
1904 categorical_column: A `CategoricalColumn` which is created by
1905 `categorical_column_with_*` or `crossed_column` functions.
1906
1907 Returns:
1908 An `IndicatorColumn`.
1909 """
1910 return IndicatorColumn(categorical_column)
1911
1912
1913@tf_export('feature_column.weighted_categorical_column')
1914def weighted_categorical_column(categorical_column,
1915 weight_feature_key,
1916 dtype=dtypes.float32):
1917 """Applies weight values to a `CategoricalColumn`.
1918 1919 Use this when each of your sparse inputs has both an ID and a value. For 1920 example, if you're representing text documents as a collection of word 1921 frequencies, you can provide 2 parallel sparse input features ('terms' and 1922 'frequencies' below). 1923 1924 Example: 1925 1926 Input `tf.Example` objects: 1927 1928 ```proto 1929 [ 1930 features { 1931 feature { 1932 key: "terms" 1933 value {bytes_list {value: "very" value: "model"}} 1934 } 1935 feature { 1936 key: "frequencies" 1937 value {float_list {value: 0.3 value: 0.1}} 1938 } 1939 }, 1940 features { 1941 feature { 1942 key: "terms" 1943 value {bytes_list {value: "when" value: "course" value: "human"}} 1944 } 1945 feature { 1946 key: "frequencies" 1947 value {float_list {value: 0.4 value: 0.1 value: 0.2}} 1948 } 1949 } 1950 ] 1951 ``` 1952 1953 ```python 1954 categorical_column = categorical_column_with_hash_bucket( 1955 column_name='terms', hash_bucket_size=1000) 1956 weighted_column = weighted_categorical_column( 1957 categorical_column=categorical_column, weight_feature_key='frequencies') 1958 columns = [weighted_column, ...] 1959 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 1960 linear_prediction, _, _ = linear_model(features, columns) 1961 ``` 1962 1963 This assumes the input dictionary contains a `SparseTensor` for key 1964 'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must have 1965 the same indices and dense shape. 1966 1967 Args: 1968 categorical_column: A `CategoricalColumn` created by 1969 `categorical_column_with_*` functions. 1970 weight_feature_key: String key for weight values. 1971 dtype: Type of weights, such as `tf.float32`. Only float and integer weights 1972 are supported. 1973 1974 Returns: 1975 A `CategoricalColumn` composed of two sparse features: one represents id, 1976 the other represents weight (value) of the id feature in that example. 1977 1978 Raises: 1979 ValueError: if `dtype` is not convertible to float. 1980 """ 1981 if (dtype is None) or not (dtype.is_integer or dtype.is_floating): 1982 raise ValueError('dtype {} is not convertible to float.'.format(dtype)) 1983 return WeightedCategoricalColumn( 1984 categorical_column=categorical_column, 1985 weight_feature_key=weight_feature_key, 1986 dtype=dtype) 1987 1988 1989@tf_export('feature_column.crossed_column') 1990def crossed_column(keys, hash_bucket_size, hash_key=None): 1991 """Returns a column for performing crosses of categorical features. 1992 1993 Crossed features will be hashed according to `hash_bucket_size`. Conceptually, 1994 the transformation can be thought of as: 1995 Hash(cartesian product of features) % `hash_bucket_size` 1996 1997 For example, if the input features are: 1998 1999 * SparseTensor referred by first key: 2000 2001 ```python 2002 shape = [2, 2] 2003 { 2004 [0, 0]: "a" 2005 [1, 0]: "b" 2006 [1, 1]: "c" 2007 } 2008 ``` 2009 2010 * SparseTensor referred by second key: 2011 2012 ```python 2013 shape = [2, 1] 2014 { 2015 [0, 0]: "d" 2016 [1, 0]: "e" 2017 } 2018 ``` 2019 2020 then crossed feature will look like: 2021 2022 ```python 2023 shape = [2, 2] 2024 { 2025 [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size 2026 [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size 2027 [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size 2028 } 2029 ``` 2030 2031 Here is an example to create a linear model with crosses of string features: 2032 2033 ```python 2034 keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50K) 2035 columns = [keywords_x_doc_terms, ...] 
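# Each crossed (keyword, doc_term) pair is hashed into one of 50K buckets;
# the linear model then learns one weight per bucket.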
2036 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 2037 linear_prediction = linear_model(features, columns) 2038 ``` 2039 2040 You could also use vocabulary lookup before crossing: 2041 2042 ```python 2043 keywords = categorical_column_with_vocabulary_file( 2044 'keywords', '/path/to/vocabulary/file', vocabulary_size=1K) 2045 keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50K) 2046 columns = [keywords_x_doc_terms, ...] 2047 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 2048 linear_prediction = linear_model(features, columns) 2049 ``` 2050 2051 If an input feature is of numeric type, you can use 2052 `categorical_column_with_identity`, or `bucketized_column`, as in the example: 2053 2054 ```python 2055 # vertical_id is an integer categorical feature. 2056 vertical_id = categorical_column_with_identity('vertical_id', 10K) 2057 price = numeric_column('price') 2058 # bucketized_column converts numerical feature to a categorical one. 2059 bucketized_price = bucketized_column(price, boundaries=[...]) 2060 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50K) 2061 columns = [vertical_id_x_price, ...] 2062 features = tf.parse_example(..., features=make_parse_example_spec(columns)) 2063 linear_prediction = linear_model(features, columns) 2064 ``` 2065 2066 To use crossed column in DNN model, you need to add it in an embedding column 2067 as in this example: 2068 2069 ```python 2070 vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50K) 2071 vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10) 2072 dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...]) 2073 ``` 2074 2075 Args: 2076 keys: An iterable identifying the features to be crossed. Each element can 2077 be either: 2078 * string: Will use the corresponding feature which must be of string type. 2079 * `CategoricalColumn`: Will use the transformed tensor produced by this 2080 column. Does not support hashed categorical column. 2081 hash_bucket_size: An int > 1. The number of buckets. 2082 hash_key: Specify the hash_key that will be used by the `FingerprintCat64` 2083 function to combine the crosses fingerprints on SparseCrossOp (optional). 2084 2085 Returns: 2086 A `CrossedColumn`. 2087 2088 Raises: 2089 ValueError: If `len(keys) < 2`. 2090 ValueError: If any of the keys is neither a string nor `CategoricalColumn`. 2091 ValueError: If any of the keys is `HashedCategoricalColumn`. 2092 ValueError: If `hash_bucket_size < 1`. 2093 """ 2094 if not hash_bucket_size or hash_bucket_size < 1: 2095 raise ValueError('hash_bucket_size must be > 1. ' 2096 'hash_bucket_size: {}'.format(hash_bucket_size)) 2097 if not keys or len(keys) < 2: 2098 raise ValueError( 2099 'keys must be a list with length > 1. Given: {}'.format(keys)) 2100 for key in keys: 2101 if (not isinstance(key, six.string_types) and 2102 not isinstance(key, (CategoricalColumn, fc_old._CategoricalColumn))): # pylint: disable=protected-access 2103 raise ValueError( 2104 'Unsupported key type. All keys must be either string, or ' 2105 'categorical column except HashedCategoricalColumn. ' 2106 'Given: {}'.format(key)) 2107 if isinstance(key, 2108 (HashedCategoricalColumn, fc_old._HashedCategoricalColumn)): # pylint: disable=protected-access 2109 raise ValueError( 2110 'categorical_column_with_hash_bucket is not supported for crossing. ' 2111 'Hashing before crossing will increase probability of collision. 
' 2112 'Instead, use the feature name as a string. Given: {}'.format(key)) 2113 return CrossedColumn( 2114 keys=tuple(keys), hash_bucket_size=hash_bucket_size, hash_key=hash_key) 2115 2116 2117@six.add_metaclass(abc.ABCMeta) 2118class FeatureColumn(object): 2119 """Represents a feature column abstraction. 2120 2121 WARNING: Do not subclass this layer unless you know what you are doing: 2122 the API is subject to future changes. 2123 2124 To distinguish between the concept of a feature family and a specific binary 2125 feature within a family, we refer to a feature family like "country" as a 2126 feature column. For example, we can have a feature in a `tf.Example` format: 2127 {key: "country", value: [ "US" ]} 2128 In this example the value of feature is "US" and "country" refers to the 2129 column of the feature. 2130 2131 This class is an abstract class. Users should not create instances of this. 2132 """ 2133 2134 @abc.abstractproperty 2135 def name(self): 2136 """Returns string. Used for naming.""" 2137 pass 2138 2139 @abc.abstractmethod 2140 def transform_feature(self, transformation_cache, state_manager): 2141 """Returns intermediate representation (usually a `Tensor`). 2142 2143 Uses `transformation_cache` to create an intermediate representation 2144 (usually a `Tensor`) that other feature columns can use. 2145 2146 Example usage of `transformation_cache`: 2147 Let's say a Feature column depends on raw feature ('raw') and another 2148 `FeatureColumn` (input_fc). To access corresponding `Tensor`s, 2149 transformation_cache will be used as follows: 2150 2151 ```python 2152 raw_tensor = transformation_cache.get('raw', state_manager) 2153 fc_tensor = transformation_cache.get(input_fc, state_manager) 2154 ``` 2155 2156 Args: 2157 transformation_cache: A `FeatureTransformationCache` object to access 2158 features. 2159 state_manager: A `StateManager` to create / access resources such as 2160 lookup tables. 2161 2162 Returns: 2163 Transformed feature `Tensor`. 2164 """ 2165 pass 2166 2167 @abc.abstractproperty 2168 def parse_example_spec(self): 2169 """Returns a `tf.Example` parsing spec as dict. 2170 2171 It is used for get_parsing_spec for `tf.parse_example`. Returned spec is a 2172 dict from keys ('string') to `VarLenFeature`, `FixedLenFeature`, and other 2173 supported objects. Please check documentation of `tf.parse_example` for all 2174 supported spec objects. 2175 2176 Let's say a Feature column depends on raw feature ('raw') and another 2177 `FeatureColumn` (input_fc). One possible implementation of 2178 parse_example_spec is as follows: 2179 2180 ```python 2181 spec = {'raw': tf.FixedLenFeature(...)} 2182 spec.update(input_fc.parse_example_spec) 2183 return spec 2184 ``` 2185 """ 2186 pass 2187 2188 def create_state(self, state_manager): 2189 """Uses the `state_manager` to create state for the FeatureColumn. 2190 2191 Args: 2192 state_manager: A `StateManager` to create / access resources such as 2193 lookup tables and variables. 2194 """ 2195 pass 2196 2197 @abc.abstractproperty 2198 def _is_v2_column(self): 2199 """Returns whether this FeatureColumn is fully conformant to the new API. 2200 2201 This is needed for composition type cases where an EmbeddingColumn etc. 2202 might take in old categorical columns as input and then we want to use the 2203 old API. 2204 """ 2205 pass 2206 2207 @abc.abstractproperty 2208 def parents(self): 2209 """Returns a list of immediate raw feature and FeatureColumn dependencies. 
2210 2211 For example: 2212 # For the following feature columns 2213 a = numeric_column('f1') 2214 c = crossed_column(a, 'f2') 2215 # The expected parents are: 2216 a.parents = ['f1'] 2217 c.parents = [a, 'f2'] 2218 """ 2219 pass 2220 2221 @abc.abstractmethod 2222 def _get_config(self): 2223 """Returns the config of the feature column. 2224 2225 A FeatureColumn config is a Python dictionary (serializable) containing the 2226 configuration of a FeatureColumn. The same FeatureColumn can be 2227 reinstantiated later from this configuration. 2228 2229 The config of a feature column does not include information about feature 2230 columns depending on it nor the FeatureColumn class name. 2231 2232 Example with (de)serialization practices followed in this file: 2233 ```python 2234 class SerializationExampleFeatureColumn( 2235 FeatureColumn, collections.namedtuple( 2236 'SerializationExampleFeatureColumn', 2237 ('dimension', 'parent', 'dtype', 'normalizer_fn'))): 2238 2239 def _get_config(self): 2240 # Create a dict from the namedtuple. 2241 # Python attribute literals can be directly copied from / to the config. 2242 # For example 'dimension', assuming it is an integer literal. 2243 config = dict(zip(self._fields, self)) 2244 2245 # (De)serialization of parent FeatureColumns should use the provided 2246 # (de)serialize_feature_column() methods that take care of de-duping. 2247 config['parent'] = serialize_feature_column(self.parent) 2248 2249 # Many objects provide custom (de)serialization e.g: for tf.DType 2250 # tf.DType.name, tf.as_dtype() can be used. 2251 config['dtype'] = self.dtype.name 2252 2253 # Non-trivial dependencies should be Keras-(de)serializable. 2254 config['normalizer_fn'] = utils.serialize_keras_object( 2255 self.normalizer_fn) 2256 2257 return config 2258 2259 @classmethod 2260 def _from_config(cls, config, custom_objects=None, columns_by_name=None): 2261 # This should do the inverse transform from `_get_config` and construct 2262 # the namedtuple. 2263 kwargs = config.copy() 2264 kwargs['parent'] = deserialize_feature_column( 2265 config['parent'], custom_objects, columns_by_name) 2266 kwargs['dtype'] = dtypes.as_dtype(config['dtype']) 2267 kwargs['normalizer_fn'] = utils.deserialize_keras_object( 2268 config['normalizer_fn'], custom_objects=custom_objects) 2269 return cls(**kwargs) 2270 2271 ``` 2272 Returns: 2273 A serializable Dict that can be used to deserialize the object with 2274 from_config. 2275 """ 2276 pass 2277 2278 @classmethod 2279 def _from_config(cls, config, custom_objects=None, columns_by_name=None): 2280 """Creates a FeatureColumn from its config. 2281 2282 This method should be the reverse of `_get_config`, capable of instantiating 2283 the same FeatureColumn from the config dictionary. See `_get_config` for an 2284 example of common (de)serialization practices followed in this file. 2285 2286 TODO(b/118939620): This is a private method until consensus is reached on 2287 supporting object deserialization deduping within Keras. 2288 2289 Args: 2290 config: A Dict config acquired with `_get_config`. 2291 custom_objects: Optional dictionary mapping names (strings) to custom 2292 classes or functions to be considered during deserialization. 2293 columns_by_name: A Dict[String, FeatureColumn] of existing columns in 2294 order to avoid duplication. Should be passed to any calls to 2295 deserialize_feature_column(). 2296 2297 Returns: 2298 A FeatureColumn for the input config. 
2299 """ 2300 pass 2301 2302 2303class DenseColumn(FeatureColumn): 2304 """Represents a column which can be represented as `Tensor`. 2305 2306 Some examples of this type are: numeric_column, embedding_column, 2307 indicator_column. 2308 """ 2309 2310 @abc.abstractproperty 2311 def variable_shape(self): 2312 """`TensorShape` of `get_dense_tensor`, without batch dimension.""" 2313 pass 2314 2315 @abc.abstractmethod 2316 def get_dense_tensor(self, transformation_cache, state_manager): 2317 """Returns a `Tensor`. 2318 2319 The output of this function will be used by model-builder-functions. For 2320 example the pseudo code of `input_layer` will be like: 2321 2322 ```python 2323 def input_layer(features, feature_columns, ...): 2324 outputs = [fc.get_dense_tensor(...) for fc in feature_columns] 2325 return tf.concat(outputs) 2326 ``` 2327 2328 Args: 2329 transformation_cache: A `FeatureTransformationCache` object to access 2330 features. 2331 state_manager: A `StateManager` to create / access resources such as 2332 lookup tables. 2333 2334 Returns: 2335 `Tensor` of shape [batch_size] + `variable_shape`. 2336 """ 2337 pass 2338 2339 2340def is_feature_column_v2(feature_columns): 2341 """Returns True if all feature columns are V2.""" 2342 for feature_column in feature_columns: 2343 if not isinstance(feature_column, FeatureColumn): 2344 return False 2345 if not feature_column._is_v2_column: # pylint: disable=protected-access 2346 return False 2347 return True 2348 2349 2350def _create_weighted_sum(column, transformation_cache, state_manager, 2351 sparse_combiner, weight_var): 2352 """Creates a weighted sum for a dense/categorical column for linear_model.""" 2353 if isinstance(column, CategoricalColumn): 2354 return _create_categorical_column_weighted_sum( 2355 column=column, 2356 transformation_cache=transformation_cache, 2357 state_manager=state_manager, 2358 sparse_combiner=sparse_combiner, 2359 weight_var=weight_var) 2360 else: 2361 return _create_dense_column_weighted_sum( 2362 column=column, 2363 transformation_cache=transformation_cache, 2364 state_manager=state_manager, 2365 weight_var=weight_var) 2366 2367 2368def _create_dense_column_weighted_sum(column, transformation_cache, 2369 state_manager, weight_var): 2370 """Create a weighted sum of a dense column for linear_model.""" 2371 tensor = column.get_dense_tensor(transformation_cache, state_manager) 2372 num_elements = column.variable_shape.num_elements() 2373 batch_size = array_ops.shape(tensor)[0] 2374 tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements)) 2375 return math_ops.matmul(tensor, weight_var, name='weighted_sum') 2376 2377 2378class CategoricalColumn(FeatureColumn): 2379 """Represents a categorical feature. 2380 2381 A categorical feature typically handled with a `tf.SparseTensor` of IDs. 2382 """ 2383 2384 IdWeightPair = collections.namedtuple( # pylint: disable=invalid-name 2385 'IdWeightPair', ('id_tensor', 'weight_tensor')) 2386 2387 @abc.abstractproperty 2388 def num_buckets(self): 2389 """Returns number of buckets in this sparse feature.""" 2390 pass 2391 2392 @abc.abstractmethod 2393 def get_sparse_tensors(self, transformation_cache, state_manager): 2394 """Returns an IdWeightPair. 2395 2396 `IdWeightPair` is a pair of `SparseTensor`s which represents ids and 2397 weights. 2398 2399 `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets` 2400 `SparseTensor` of `int64`. 
`IdWeightPair.weight_tensor` is either a
2401 `SparseTensor` of `float` or `None` to indicate all weights should be
2402 taken to be 1. If specified, `weight_tensor` must have exactly the same
2403 shape and indices as `id_tensor`. The expected `SparseTensor` is the same
2404 as the parsing output of a `VarLenFeature`, which is a ragged matrix.
2405
2406 Args:
2407 transformation_cache: A `FeatureTransformationCache` object to access
2408 features.
2409 state_manager: A `StateManager` to create / access resources such as
2410 lookup tables.
2411 """
2412 pass
2413
2414
2415def _create_categorical_column_weighted_sum(
2416 column, transformation_cache, state_manager, sparse_combiner, weight_var):
2417 # pylint: disable=g-doc-return-or-yield,g-doc-args
2418 """Create a weighted sum of a categorical column for linear_model.
2419
2420 Note to maintainer: as an implementation detail, the weighted sum is
2421 implemented via embedding_lookup_sparse for efficiency. Mathematically,
2422 the two are the same.
2423
2424 Conceptually, a categorical column can be treated as a multi-hot
2425 vector. Say:
2426
2427 ```python
2428 x = [0 0 1] # categorical column input
2429 w = [a b c] # weights
2430 ```
2431 The weighted sum is `c` in this case, which is the same as `w[2]`.
2432
2433 Another example is
2434
2435 ```python
2436 x = [0 1 1] # categorical column input
2437 w = [a b c] # weights
2438 ```
2439 The weighted sum is `b + c` in this case, which is the same as `w[1] + w[2]`.
2440
2441 In both cases, the weighted sum can be implemented via embedding_lookup with
2442 sparse_combiner = "sum".
2443 """
2444
2445 sparse_tensors = column.get_sparse_tensors(transformation_cache,
2446 state_manager)
2447 id_tensor = sparse_ops.sparse_reshape(sparse_tensors.id_tensor, [
2448 array_ops.shape(sparse_tensors.id_tensor)[0], -1
2449 ])
2450 weight_tensor = sparse_tensors.weight_tensor
2451 if weight_tensor is not None:
2452 weight_tensor = sparse_ops.sparse_reshape(
2453 weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
2454
2455 return embedding_ops.safe_embedding_lookup_sparse(
2456 weight_var,
2457 id_tensor,
2458 sparse_weights=weight_tensor,
2459 combiner=sparse_combiner,
2460 name='weighted_sum')
2461
2462
2463class SequenceDenseColumn(FeatureColumn):
2464 """Represents dense sequence data."""
2465
2466 TensorSequenceLengthPair = collections.namedtuple( # pylint: disable=invalid-name
2467 'TensorSequenceLengthPair', ('dense_tensor', 'sequence_length'))
2468
2469 @abc.abstractmethod
2470 def get_sequence_dense_tensor(self, transformation_cache, state_manager):
2471 """Returns a `TensorSequenceLengthPair`.
2472
2473 Args:
2474 transformation_cache: A `FeatureTransformationCache` object to access
2475 features.
2476 state_manager: A `StateManager` to create / access resources such as
2477 lookup tables.
2478 """
2479 pass
2480
2481
2482class FeatureTransformationCache(object):
2483 """Handles caching of transformations while building the model.
2484
2485 `FeatureColumn` specifies how to digest an input column to the network. Some
2486 feature columns require data transformations. This class caches those
2487 transformations.
2488
2489 Some features may be used in more than one place. For example, one can use a
2490 bucketized feature both by itself and in a cross with it. In that case we
2491 should create only one bucketization op instead of creating ops for each
2492 feature column separately. To handle re-use of transformed columns,
2493 `FeatureTransformationCache` caches all previously transformed columns.
2494
2495 Example:
2496 We're trying to use the following `FeatureColumn`s:
2497
2498 ```python
2499 bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
2500 keywords = fc.categorical_column_with_hash_bucket("keywords", ...)
2501 age_X_keywords = fc.crossed_column([bucketized_age, "keywords"], ...)
2502 ... = linear_model(features,
2503 [bucketized_age, keywords, age_X_keywords])
2504 ```
2505
2506 If we transform each column independently, then we'll get duplicate
2507 bucketization ops (one for the cross, one for the bucketized column itself).
2508 The `FeatureTransformationCache` eliminates this duplication.
2509 """
2510
2511 def __init__(self, features):
2512 """Creates a `FeatureTransformationCache`.
2513
2514 Args:
2515 features: A mapping from feature column to objects that are `Tensor` or
2516 `SparseTensor`, or can be converted to one via
2517 `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
2518 signifies a base feature (not transformed). A `FeatureColumn` key
2519 means that this `Tensor` is the output of an existing `FeatureColumn`
2520 which can be reused.
2521 """
2522 self._features = features.copy()
2523 self._feature_tensors = {}
2524
2525 def get(self, key, state_manager):
2526 """Returns a `Tensor` for the given key.
2527
2528 A `str` key is used to access a base feature (not transformed). When a
2529 `FeatureColumn` is passed, the transformed feature is returned if it
2530 already exists, otherwise the given `FeatureColumn` is asked to provide its
2531 transformed output, which is then cached.
2532
2533 Args:
2534 key: a `str` or a `FeatureColumn`.
2535 state_manager: A StateManager object that holds the FeatureColumn state.
2536
2537 Returns:
2538 The transformed `Tensor` corresponding to the `key`.
2539
2540 Raises:
2541 ValueError: if key is not found or a transformed `Tensor` cannot be
2542 computed.
2543 """
2544 if key in self._feature_tensors:
2545 # FeatureColumn is already transformed or converted.
2546 return self._feature_tensors[key]
2547
2548 if key in self._features:
2549 feature_tensor = self._get_raw_feature_as_tensor(key)
2550 self._feature_tensors[key] = feature_tensor
2551 return feature_tensor
2552
2553 if isinstance(key, six.string_types):
2554 raise ValueError('Feature {} is not in features dictionary.'.format(key))
2555
2556 if not isinstance(key, FeatureColumn):
2557 raise TypeError('"key" must be either a "str" or "FeatureColumn". '
2558 'Provided: {}'.format(key))
2559
2560 column = key
2561 logging.debug('Transforming feature_column %s.', column)
2562 transformed = column.transform_feature(self, state_manager)
2563 if transformed is None:
2564 raise ValueError('Column {} is not supported.'.format(column.name))
2565 self._feature_tensors[column] = transformed
2566 return transformed
2567
2568 def _get_raw_feature_as_tensor(self, key):
2569 """Gets the raw feature (keyed by `key`) as a tensor.
2570
2571 The raw feature is converted to a (sparse) tensor, and its rank may be
2572 expanded.
2573 For both `Tensor` and `SparseTensor`, the rank will be expanded (to 2) if
2574 the rank is 1. Dynamic rank is supported as well. A rank-0 raw feature is
2575 not supported and will raise an error.
2576
2577 Args:
2578 key: A `str` key to access the raw feature.
2579
2580 Returns:
2581 A `Tensor` or `SparseTensor`.
2582
2583 Raises:
2584 ValueError: if the raw feature has rank 0.
2585 """ 2586 raw_feature = self._features[key] 2587 feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor( 2588 raw_feature) 2589 2590 def expand_dims(input_tensor): 2591 # Input_tensor must have rank 1. 2592 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 2593 return sparse_ops.sparse_reshape( 2594 input_tensor, [array_ops.shape(input_tensor)[0], 1]) 2595 else: 2596 return array_ops.expand_dims(input_tensor, -1) 2597 2598 rank = feature_tensor.get_shape().ndims 2599 if rank is not None: 2600 if rank == 0: 2601 raise ValueError( 2602 'Feature (key: {}) cannot have rank 0. Give: {}'.format( 2603 key, feature_tensor)) 2604 return feature_tensor if rank != 1 else expand_dims(feature_tensor) 2605 2606 # Handle dynamic rank. 2607 with ops.control_dependencies([ 2608 check_ops.assert_positive( 2609 array_ops.rank(feature_tensor), 2610 message='Feature (key: {}) cannot have rank 0. Given: {}'.format( 2611 key, feature_tensor))]): 2612 return control_flow_ops.cond( 2613 math_ops.equal(1, array_ops.rank(feature_tensor)), 2614 lambda: expand_dims(feature_tensor), 2615 lambda: feature_tensor) 2616 2617 2618# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py 2619def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None): 2620 """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells. 2621 2622 If `input_tensor` is already a `SparseTensor`, just return it. 2623 2624 Args: 2625 input_tensor: A string or integer `Tensor`. 2626 ignore_value: Entries in `dense_tensor` equal to this value will be 2627 absent from the resulting `SparseTensor`. If `None`, default value of 2628 `dense_tensor`'s dtype will be used ('' for `str`, -1 for `int`). 2629 2630 Returns: 2631 A `SparseTensor` with the same shape as `input_tensor`. 2632 2633 Raises: 2634 ValueError: when `input_tensor`'s rank is `None`. 2635 """ 2636 input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor( 2637 input_tensor) 2638 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 2639 return input_tensor 2640 with ops.name_scope(None, 'to_sparse_input', (input_tensor, ignore_value,)): 2641 if ignore_value is None: 2642 if input_tensor.dtype == dtypes.string: 2643 # Exception due to TF strings are converted to numpy objects by default. 2644 ignore_value = '' 2645 elif input_tensor.dtype.is_integer: 2646 ignore_value = -1 # -1 has a special meaning of missing feature 2647 else: 2648 # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is 2649 # constructing a new numpy object of the given type, which yields the 2650 # default value for that type. 2651 ignore_value = input_tensor.dtype.as_numpy_dtype() 2652 ignore_value = math_ops.cast( 2653 ignore_value, input_tensor.dtype, name='ignore_value') 2654 indices = array_ops.where( 2655 math_ops.not_equal(input_tensor, ignore_value), name='indices') 2656 return sparse_tensor_lib.SparseTensor( 2657 indices=indices, 2658 values=array_ops.gather_nd(input_tensor, indices, name='values'), 2659 dense_shape=array_ops.shape( 2660 input_tensor, out_type=dtypes.int64, name='dense_shape')) 2661 2662 2663def _normalize_feature_columns(feature_columns): 2664 """Normalizes the `feature_columns` input. 2665 2666 This method converts the `feature_columns` to list type as best as it can. In 2667 addition, verifies the type and other parts of feature_columns, required by 2668 downstream library. 2669 2670 Args: 2671 feature_columns: The raw feature columns, usually passed by users. 
2672 2673 Returns: 2674 The normalized feature column list. 2675 2676 Raises: 2677 ValueError: for any invalid inputs, such as empty, duplicated names, etc. 2678 """ 2679 if isinstance(feature_columns, FeatureColumn): 2680 feature_columns = [feature_columns] 2681 2682 if isinstance(feature_columns, collections.Iterator): 2683 feature_columns = list(feature_columns) 2684 2685 if isinstance(feature_columns, dict): 2686 raise ValueError('Expected feature_columns to be iterable, found dict.') 2687 2688 for column in feature_columns: 2689 if not isinstance(column, FeatureColumn): 2690 raise ValueError('Items of feature_columns must be a FeatureColumn. ' 2691 'Given (type {}): {}.'.format(type(column), column)) 2692 if not feature_columns: 2693 raise ValueError('feature_columns must not be empty.') 2694 name_to_column = dict() 2695 for column in feature_columns: 2696 if column.name in name_to_column: 2697 raise ValueError('Duplicate feature column name found for columns: {} ' 2698 'and {}. This usually means that these columns refer to ' 2699 'same base feature. Either one must be discarded or a ' 2700 'duplicated but renamed item must be inserted in ' 2701 'features dict.'.format(column, 2702 name_to_column[column.name])) 2703 name_to_column[column.name] = column 2704 2705 return sorted(feature_columns, key=lambda x: x.name) 2706 2707 2708class NumericColumn( 2709 DenseColumn, 2710 fc_old._DenseColumn, # pylint: disable=protected-access 2711 collections.namedtuple( 2712 'NumericColumn', 2713 ('key', 'shape', 'default_value', 'dtype', 'normalizer_fn'))): 2714 """see `numeric_column`.""" 2715 2716 @property 2717 def _is_v2_column(self): 2718 return True 2719 2720 @property 2721 def name(self): 2722 """See `FeatureColumn` base class.""" 2723 return self.key 2724 2725 @property 2726 def parse_example_spec(self): 2727 """See `FeatureColumn` base class.""" 2728 return { 2729 self.key: 2730 parsing_ops.FixedLenFeature(self.shape, self.dtype, 2731 self.default_value) 2732 } 2733 2734 @property 2735 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 2736 _FEATURE_COLUMN_DEPRECATION) 2737 def _parse_example_spec(self): 2738 return self.parse_example_spec 2739 2740 def _transform_input_tensor(self, input_tensor): 2741 if isinstance(input_tensor, sparse_tensor_lib.SparseTensor): 2742 raise ValueError( 2743 'The corresponding Tensor of numerical column must be a Tensor. ' 2744 'SparseTensor is not supported. key: {}'.format(self.key)) 2745 if self.normalizer_fn is not None: 2746 input_tensor = self.normalizer_fn(input_tensor) 2747 return math_ops.cast(input_tensor, dtypes.float32) 2748 2749 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 2750 _FEATURE_COLUMN_DEPRECATION) 2751 def _transform_feature(self, inputs): 2752 input_tensor = inputs.get(self.key) 2753 return self._transform_input_tensor(input_tensor) 2754 2755 def transform_feature(self, transformation_cache, state_manager): 2756 """See `FeatureColumn` base class. 2757 2758 In this case, we apply the `normalizer_fn` to the input tensor. 2759 2760 Args: 2761 transformation_cache: A `FeatureTransformationCache` object to access 2762 features. 2763 state_manager: A `StateManager` to create / access resources such as 2764 lookup tables. 2765 2766 Returns: 2767 Normalized input tensor. 2768 Raises: 2769 ValueError: If a SparseTensor is passed in. 
2770 """ 2771 input_tensor = transformation_cache.get(self.key, state_manager) 2772 return self._transform_input_tensor(input_tensor) 2773 2774 @property 2775 def variable_shape(self): 2776 """See `DenseColumn` base class.""" 2777 return tensor_shape.TensorShape(self.shape) 2778 2779 @property 2780 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 2781 _FEATURE_COLUMN_DEPRECATION) 2782 def _variable_shape(self): 2783 return self.variable_shape 2784 2785 def get_dense_tensor(self, transformation_cache, state_manager): 2786 """Returns dense `Tensor` representing numeric feature. 2787 2788 Args: 2789 transformation_cache: A `FeatureTransformationCache` object to access 2790 features. 2791 state_manager: A `StateManager` to create / access resources such as 2792 lookup tables. 2793 2794 Returns: 2795 Dense `Tensor` created within `transform_feature`. 2796 """ 2797 # Feature has been already transformed. Return the intermediate 2798 # representation created by _transform_feature. 2799 return transformation_cache.get(self, state_manager) 2800 2801 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 2802 _FEATURE_COLUMN_DEPRECATION) 2803 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 2804 del weight_collections 2805 del trainable 2806 return inputs.get(self) 2807 2808 @property 2809 def parents(self): 2810 """See 'FeatureColumn` base class.""" 2811 return [self.key] 2812 2813 def _get_config(self): 2814 """See 'FeatureColumn` base class.""" 2815 config = dict(zip(self._fields, self)) 2816 config['normalizer_fn'] = utils.serialize_keras_object(self.normalizer_fn) 2817 config['dtype'] = self.dtype.name 2818 return config 2819 2820 @classmethod 2821 def _from_config(cls, config, custom_objects=None, columns_by_name=None): 2822 """See 'FeatureColumn` base class.""" 2823 _check_config_keys(config, cls._fields) 2824 kwargs = config.copy() 2825 kwargs['normalizer_fn'] = utils.deserialize_keras_object( 2826 config['normalizer_fn'], custom_objects=custom_objects) 2827 kwargs['dtype'] = dtypes.as_dtype(config['dtype']) 2828 return cls(**kwargs) 2829 2830 2831class BucketizedColumn( 2832 DenseColumn, 2833 CategoricalColumn, 2834 fc_old._DenseColumn, # pylint: disable=protected-access 2835 fc_old._CategoricalColumn, # pylint: disable=protected-access 2836 collections.namedtuple('BucketizedColumn', 2837 ('source_column', 'boundaries'))): 2838 """See `bucketized_column`.""" 2839 2840 @property 2841 def _is_v2_column(self): 2842 return (isinstance(self.source_column, FeatureColumn) and 2843 self.source_column._is_v2_column) # pylint: disable=protected-access 2844 2845 @property 2846 def name(self): 2847 """See `FeatureColumn` base class.""" 2848 return '{}_bucketized'.format(self.source_column.name) 2849 2850 @property 2851 def parse_example_spec(self): 2852 """See `FeatureColumn` base class.""" 2853 return self.source_column.parse_example_spec 2854 2855 @property 2856 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 2857 _FEATURE_COLUMN_DEPRECATION) 2858 def _parse_example_spec(self): 2859 return self.source_column._parse_example_spec # pylint: disable=protected-access 2860 2861 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 2862 _FEATURE_COLUMN_DEPRECATION) 2863 def _transform_feature(self, inputs): 2864 """Returns bucketized categorical `source_column` tensor.""" 2865 source_tensor = inputs.get(self.source_column) 2866 return math_ops._bucketize( # pylint: disable=protected-access 2867 source_tensor, 2868 boundaries=self.boundaries) 2869 2870 
def transform_feature(self, transformation_cache, state_manager): 2871 """Returns bucketized categorical `source_column` tensor.""" 2872 source_tensor = transformation_cache.get(self.source_column, state_manager) 2873 return math_ops._bucketize( # pylint: disable=protected-access 2874 source_tensor, 2875 boundaries=self.boundaries) 2876 2877 @property 2878 def variable_shape(self): 2879 """See `DenseColumn` base class.""" 2880 return tensor_shape.TensorShape( 2881 tuple(self.source_column.shape) + (len(self.boundaries) + 1,)) 2882 2883 @property 2884 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 2885 _FEATURE_COLUMN_DEPRECATION) 2886 def _variable_shape(self): 2887 return self.variable_shape 2888 2889 def _get_dense_tensor_for_input_tensor(self, input_tensor): 2890 return array_ops.one_hot( 2891 indices=math_ops.cast(input_tensor, dtypes.int64), 2892 depth=len(self.boundaries) + 1, 2893 on_value=1., 2894 off_value=0.) 2895 2896 def get_dense_tensor(self, transformation_cache, state_manager): 2897 """Returns one hot encoded dense `Tensor`.""" 2898 input_tensor = transformation_cache.get(self, state_manager) 2899 return self._get_dense_tensor_for_input_tensor(input_tensor) 2900 2901 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 2902 _FEATURE_COLUMN_DEPRECATION) 2903 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 2904 del weight_collections 2905 del trainable 2906 input_tensor = inputs.get(self) 2907 return self._get_dense_tensor_for_input_tensor(input_tensor) 2908 2909 @property 2910 def num_buckets(self): 2911 """See `CategoricalColumn` base class.""" 2912 # By construction, source_column is always one-dimensional. 2913 return (len(self.boundaries) + 1) * self.source_column.shape[0] 2914 2915 @property 2916 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 2917 _FEATURE_COLUMN_DEPRECATION) 2918 def _num_buckets(self): 2919 return self.num_buckets 2920 2921 def _get_sparse_tensors_for_input_tensor(self, input_tensor): 2922 batch_size = array_ops.shape(input_tensor)[0] 2923 # By construction, source_column is always one-dimensional. 2924 source_dimension = self.source_column.shape[0] 2925 2926 i1 = array_ops.reshape( 2927 array_ops.tile( 2928 array_ops.expand_dims(math_ops.range(0, batch_size), 1), 2929 [1, source_dimension]), 2930 (-1,)) 2931 i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size]) 2932 # Flatten the bucket indices and unique them across dimensions 2933 # E.g. 
2nd dimension indices will range from k to 2*k-1 with k buckets 2934 bucket_indices = ( 2935 array_ops.reshape(input_tensor, (-1,)) + 2936 (len(self.boundaries) + 1) * i2) 2937 2938 indices = math_ops.cast( 2939 array_ops.transpose(array_ops.stack((i1, i2))), dtypes.int64) 2940 dense_shape = math_ops.cast( 2941 array_ops.stack([batch_size, source_dimension]), dtypes.int64) 2942 sparse_tensor = sparse_tensor_lib.SparseTensor( 2943 indices=indices, 2944 values=bucket_indices, 2945 dense_shape=dense_shape) 2946 return CategoricalColumn.IdWeightPair(sparse_tensor, None) 2947 2948 def get_sparse_tensors(self, transformation_cache, state_manager): 2949 """Converts dense inputs to SparseTensor so downstream code can use it.""" 2950 input_tensor = transformation_cache.get(self, state_manager) 2951 return self._get_sparse_tensors_for_input_tensor(input_tensor) 2952 2953 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 2954 _FEATURE_COLUMN_DEPRECATION) 2955 def _get_sparse_tensors(self, inputs, weight_collections=None, 2956 trainable=None): 2957 """Converts dense inputs to SparseTensor so downstream code can use it.""" 2958 del weight_collections 2959 del trainable 2960 input_tensor = inputs.get(self) 2961 return self._get_sparse_tensors_for_input_tensor(input_tensor) 2962 2963 @property 2964 def parents(self): 2965 """See 'FeatureColumn` base class.""" 2966 return [self.source_column] 2967 2968 def _get_config(self): 2969 """See 'FeatureColumn` base class.""" 2970 config = dict(zip(self._fields, self)) 2971 config['source_column'] = serialize_feature_column(self.source_column) 2972 return config 2973 2974 @classmethod 2975 def _from_config(cls, config, custom_objects=None, columns_by_name=None): 2976 """See 'FeatureColumn` base class.""" 2977 _check_config_keys(config, cls._fields) 2978 kwargs = config.copy() 2979 kwargs['source_column'] = deserialize_feature_column( 2980 config['source_column'], custom_objects, columns_by_name) 2981 return cls(**kwargs) 2982 2983 2984class EmbeddingColumn( 2985 DenseColumn, 2986 SequenceDenseColumn, 2987 fc_old._DenseColumn, # pylint: disable=protected-access 2988 fc_old._SequenceDenseColumn, # pylint: disable=protected-access 2989 collections.namedtuple( 2990 'EmbeddingColumn', 2991 ('categorical_column', 'dimension', 'combiner', 'initializer', 2992 'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))): 2993 """See `embedding_column`.""" 2994 2995 @property 2996 def _is_v2_column(self): 2997 return (isinstance(self.categorical_column, FeatureColumn) and 2998 self.categorical_column._is_v2_column) # pylint: disable=protected-access 2999 3000 @property 3001 def name(self): 3002 """See `FeatureColumn` base class.""" 3003 return '{}_embedding'.format(self.categorical_column.name) 3004 3005 @property 3006 def parse_example_spec(self): 3007 """See `FeatureColumn` base class.""" 3008 return self.categorical_column.parse_example_spec 3009 3010 @property 3011 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3012 _FEATURE_COLUMN_DEPRECATION) 3013 def _parse_example_spec(self): 3014 return self.categorical_column._parse_example_spec # pylint: disable=protected-access 3015 3016 def transform_feature(self, transformation_cache, state_manager): 3017 """Transforms underlying `categorical_column`.""" 3018 return transformation_cache.get(self.categorical_column, state_manager) 3019 3020 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3021 _FEATURE_COLUMN_DEPRECATION) 3022 def _transform_feature(self, inputs): 3023 return 
inputs.get(self.categorical_column) 3024 3025 @property 3026 def variable_shape(self): 3027 """See `DenseColumn` base class.""" 3028 return tensor_shape.vector(self.dimension) 3029 3030 @property 3031 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3032 _FEATURE_COLUMN_DEPRECATION) 3033 def _variable_shape(self): 3034 return self.variable_shape 3035 3036 def create_state(self, state_manager): 3037 """Creates the embedding lookup variable.""" 3038 embedding_shape = (self.categorical_column._num_buckets, self.dimension) # pylint: disable=protected-access 3039 state_manager.create_variable( 3040 self, 3041 name='embedding_weights', 3042 shape=embedding_shape, 3043 dtype=dtypes.float32, 3044 trainable=self.trainable, 3045 # TODO(rohanj): Make this True when b/118500434 is fixed. 3046 use_resource=False, 3047 initializer=self.initializer) 3048 3049 def _get_dense_tensor_internal_helper(self, sparse_tensors, 3050 embedding_weights): 3051 sparse_ids = sparse_tensors.id_tensor 3052 sparse_weights = sparse_tensors.weight_tensor 3053 3054 if self.ckpt_to_load_from is not None: 3055 to_restore = embedding_weights 3056 if isinstance(to_restore, variables.PartitionedVariable): 3057 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access 3058 checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, { 3059 self.tensor_name_in_ckpt: to_restore 3060 }) 3061 3062 # Return embedding lookup result. 3063 return embedding_ops.safe_embedding_lookup_sparse( 3064 embedding_weights=embedding_weights, 3065 sparse_ids=sparse_ids, 3066 sparse_weights=sparse_weights, 3067 combiner=self.combiner, 3068 name='%s_weights' % self.name, 3069 max_norm=self.max_norm) 3070 3071 def _get_dense_tensor_internal(self, sparse_tensors, state_manager): 3072 """Private method that follows the signature of get_dense_tensor.""" 3073 embedding_weights = state_manager.get_variable( 3074 self, name='embedding_weights') 3075 return self._get_dense_tensor_internal_helper(sparse_tensors, 3076 embedding_weights) 3077 3078 def _old_get_dense_tensor_internal(self, sparse_tensors, weight_collections, 3079 trainable): 3080 """Private method that follows the signature of _get_dense_tensor.""" 3081 embedding_shape = (self.categorical_column._num_buckets, self.dimension) # pylint: disable=protected-access 3082 if (weight_collections and 3083 ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections): 3084 weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES) 3085 embedding_weights = variable_scope.get_variable( 3086 name='embedding_weights', 3087 shape=embedding_shape, 3088 dtype=dtypes.float32, 3089 initializer=self.initializer, 3090 trainable=self.trainable and trainable, 3091 collections=weight_collections) 3092 return self._get_dense_tensor_internal_helper(sparse_tensors, 3093 embedding_weights) 3094 3095 def get_dense_tensor(self, transformation_cache, state_manager): 3096 """Returns tensor after doing the embedding lookup. 3097 3098 Args: 3099 transformation_cache: A `FeatureTransformationCache` object to access 3100 features. 3101 state_manager: A `StateManager` to create / access resources such as 3102 lookup tables. 3103 3104 Returns: 3105 Embedding lookup tensor. 3106 3107 Raises: 3108 ValueError: `categorical_column` is SequenceCategoricalColumn. 3109 """ 3110 if isinstance(self.categorical_column, SequenceCategoricalColumn): 3111 raise ValueError( 3112 'In embedding_column: {}. ' 3113 'categorical_column must not be of type SequenceCategoricalColumn. 
' 3114 'Suggested fix A: If you wish to use DenseFeatures, use a ' 3115 'non-sequence categorical_column_with_*. ' 3116 'Suggested fix B: If you wish to create sequence input, use ' 3117 'SequenceFeatures instead of DenseFeatures. ' 3118 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 3119 self.categorical_column)) 3120 # Get sparse IDs and weights. 3121 sparse_tensors = self.categorical_column.get_sparse_tensors( 3122 transformation_cache, state_manager) 3123 return self._get_dense_tensor_internal(sparse_tensors, state_manager) 3124 3125 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3126 _FEATURE_COLUMN_DEPRECATION) 3127 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 3128 if isinstance( 3129 self.categorical_column, 3130 (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)): # pylint: disable=protected-access 3131 raise ValueError( 3132 'In embedding_column: {}. ' 3133 'categorical_column must not be of type _SequenceCategoricalColumn. ' 3134 'Suggested fix A: If you wish to use DenseFeatures, use a ' 3135 'non-sequence categorical_column_with_*. ' 3136 'Suggested fix B: If you wish to create sequence input, use ' 3137 'SequenceFeatures instead of DenseFeatures. ' 3138 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 3139 self.categorical_column)) 3140 sparse_tensors = self.categorical_column._get_sparse_tensors( # pylint: disable=protected-access 3141 inputs, weight_collections, trainable) 3142 return self._old_get_dense_tensor_internal(sparse_tensors, 3143 weight_collections, trainable) 3144 3145 def get_sequence_dense_tensor(self, transformation_cache, state_manager): 3146 """See `SequenceDenseColumn` base class.""" 3147 if not isinstance(self.categorical_column, SequenceCategoricalColumn): 3148 raise ValueError( 3149 'In embedding_column: {}. ' 3150 'categorical_column must be of type SequenceCategoricalColumn ' 3151 'to use SequenceFeatures. ' 3152 'Suggested fix: Use one of sequence_categorical_column_with_*. ' 3153 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 3154 self.categorical_column)) 3155 sparse_tensors = self.categorical_column.get_sparse_tensors( 3156 transformation_cache, state_manager) 3157 dense_tensor = self._get_dense_tensor_internal(sparse_tensors, 3158 state_manager) 3159 sequence_length = fc_utils.sequence_length_from_sparse_tensor( 3160 sparse_tensors.id_tensor) 3161 return SequenceDenseColumn.TensorSequenceLengthPair( 3162 dense_tensor=dense_tensor, sequence_length=sequence_length) 3163 3164 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3165 _FEATURE_COLUMN_DEPRECATION) 3166 def _get_sequence_dense_tensor(self, 3167 inputs, 3168 weight_collections=None, 3169 trainable=None): 3170 if not isinstance( 3171 self.categorical_column, 3172 (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)): # pylint: disable=protected-access 3173 raise ValueError( 3174 'In embedding_column: {}. ' 3175 'categorical_column must be of type SequenceCategoricalColumn ' 3176 'to use SequenceFeatures. ' 3177 'Suggested fix: Use one of sequence_categorical_column_with_*. 
' 3178 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 3179 self.categorical_column)) 3180 sparse_tensors = self.categorical_column._get_sparse_tensors(inputs) # pylint: disable=protected-access 3181 dense_tensor = self._old_get_dense_tensor_internal( 3182 sparse_tensors, 3183 weight_collections=weight_collections, 3184 trainable=trainable) 3185 sequence_length = fc_utils.sequence_length_from_sparse_tensor( 3186 sparse_tensors.id_tensor) 3187 return SequenceDenseColumn.TensorSequenceLengthPair( 3188 dense_tensor=dense_tensor, sequence_length=sequence_length) 3189 3190 @property 3191 def parents(self): 3192 """See 'FeatureColumn` base class.""" 3193 return [self.categorical_column] 3194 3195 def _get_config(self): 3196 """See 'FeatureColumn` base class.""" 3197 config = dict(zip(self._fields, self)) 3198 config['categorical_column'] = serialize_feature_column( 3199 self.categorical_column) 3200 config['initializer'] = utils.serialize_keras_object(self.initializer) 3201 return config 3202 3203 @classmethod 3204 def _from_config(cls, config, custom_objects=None, columns_by_name=None): 3205 """See 'FeatureColumn` base class.""" 3206 _check_config_keys(config, cls._fields) 3207 kwargs = config.copy() 3208 kwargs['categorical_column'] = deserialize_feature_column( 3209 config['categorical_column'], custom_objects, columns_by_name) 3210 kwargs['initializer'] = utils.deserialize_keras_object( 3211 config['initializer'], custom_objects=custom_objects) 3212 return cls(**kwargs) 3213 3214 3215def _raise_shared_embedding_column_error(): 3216 raise ValueError('SharedEmbeddingColumns are not supported in ' 3217 '`linear_model` or `input_layer`. Please use ' 3218 '`DenseFeatures` or `LinearModel` instead.') 3219 3220 3221class SharedEmbeddingColumnCreator(tracking.AutoTrackable): 3222 3223 def __init__(self, 3224 dimension, 3225 initializer, 3226 ckpt_to_load_from, 3227 tensor_name_in_ckpt, 3228 num_buckets, 3229 trainable, 3230 name='shared_embedding_column_creator'): 3231 self._dimension = dimension 3232 self._initializer = initializer 3233 self._ckpt_to_load_from = ckpt_to_load_from 3234 self._tensor_name_in_ckpt = tensor_name_in_ckpt 3235 self._num_buckets = num_buckets 3236 self._trainable = trainable 3237 self._name = name 3238 # Map from graph keys to embedding_weight variables. 
3239 self._embedding_weights = {} 3240 3241 def __call__(self, categorical_column, combiner, max_norm): 3242 return SharedEmbeddingColumn(categorical_column, self, combiner, max_norm) 3243 3244 @property 3245 def embedding_weights(self): 3246 key = ops.get_default_graph()._graph_key # pylint: disable=protected-access 3247 if key not in self._embedding_weights: 3248 embedding_shape = (self._num_buckets, self._dimension) 3249 var = variable_scope.get_variable( 3250 name=self._name, 3251 shape=embedding_shape, 3252 dtype=dtypes.float32, 3253 initializer=self._initializer, 3254 trainable=self._trainable) 3255 3256 if self._ckpt_to_load_from is not None: 3257 to_restore = var 3258 if isinstance(to_restore, variables.PartitionedVariable): 3259 to_restore = to_restore._get_variable_list() # pylint: disable=protected-access 3260 checkpoint_utils.init_from_checkpoint( 3261 self._ckpt_to_load_from, {self._tensor_name_in_ckpt: to_restore}) 3262 self._embedding_weights[key] = var 3263 return self._embedding_weights[key] 3264 3265 @property 3266 def dimension(self): 3267 return self._dimension 3268 3269 3270class SharedEmbeddingColumn( 3271 DenseColumn, 3272 SequenceDenseColumn, 3273 fc_old._DenseColumn, # pylint: disable=protected-access 3274 fc_old._SequenceDenseColumn, # pylint: disable=protected-access 3275 collections.namedtuple( 3276 'SharedEmbeddingColumn', 3277 ('categorical_column', 'shared_embedding_column_creator', 'combiner', 3278 'max_norm'))): 3279 """See `embedding_column`.""" 3280 3281 @property 3282 def _is_v2_column(self): 3283 return True 3284 3285 @property 3286 def name(self): 3287 """See `FeatureColumn` base class.""" 3288 return '{}_shared_embedding'.format(self.categorical_column.name) 3289 3290 @property 3291 def parse_example_spec(self): 3292 """See `FeatureColumn` base class.""" 3293 return self.categorical_column.parse_example_spec 3294 3295 @property 3296 def _parse_example_spec(self): 3297 return _raise_shared_embedding_column_error() 3298 3299 def transform_feature(self, transformation_cache, state_manager): 3300 """See `FeatureColumn` base class.""" 3301 return transformation_cache.get(self.categorical_column, state_manager) 3302 3303 def _transform_feature(self, inputs): 3304 return _raise_shared_embedding_column_error() 3305 3306 @property 3307 def variable_shape(self): 3308 """See `DenseColumn` base class.""" 3309 return tensor_shape.vector(self.shared_embedding_column_creator.dimension) 3310 3311 @property 3312 def _variable_shape(self): 3313 return _raise_shared_embedding_column_error() 3314 3315 def _get_dense_tensor_internal(self, transformation_cache, state_manager): 3316 """Private method that follows the signature of _get_dense_tensor.""" 3317 # This method is called from a variable_scope with name _var_scope_name, 3318 # which is shared among all shared embeddings. Open a name_scope here, so 3319 # that the ops for different columns have distinct names. 3320 with ops.name_scope(None, default_name=self.name): 3321 # Get sparse IDs and weights. 3322 sparse_tensors = self.categorical_column.get_sparse_tensors( 3323 transformation_cache, state_manager) 3324 sparse_ids = sparse_tensors.id_tensor 3325 sparse_weights = sparse_tensors.weight_tensor 3326 3327 embedding_weights = self.shared_embedding_column_creator.embedding_weights 3328 3329 # Return embedding lookup result. 
3330 return embedding_ops.safe_embedding_lookup_sparse( 3331 embedding_weights=embedding_weights, 3332 sparse_ids=sparse_ids, 3333 sparse_weights=sparse_weights, 3334 combiner=self.combiner, 3335 name='%s_weights' % self.name, 3336 max_norm=self.max_norm) 3337 3338 def get_dense_tensor(self, transformation_cache, state_manager): 3339 """Returns the embedding lookup result.""" 3340 if isinstance(self.categorical_column, SequenceCategoricalColumn): 3341 raise ValueError( 3342 'In embedding_column: {}. ' 3343 'categorical_column must not be of type SequenceCategoricalColumn. ' 3344 'Suggested fix A: If you wish to use DenseFeatures, use a ' 3345 'non-sequence categorical_column_with_*. ' 3346 'Suggested fix B: If you wish to create sequence input, use ' 3347 'SequenceFeatures instead of DenseFeatures. ' 3348 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 3349 self.categorical_column)) 3350 return self._get_dense_tensor_internal(transformation_cache, state_manager) 3351 3352 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 3353 return _raise_shared_embedding_column_error() 3354 3355 def get_sequence_dense_tensor(self, transformation_cache, state_manager): 3356 """See `SequenceDenseColumn` base class.""" 3357 if not isinstance(self.categorical_column, SequenceCategoricalColumn): 3358 raise ValueError( 3359 'In embedding_column: {}. ' 3360 'categorical_column must be of type SequenceCategoricalColumn ' 3361 'to use SequenceFeatures. ' 3362 'Suggested fix: Use one of sequence_categorical_column_with_*. ' 3363 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 3364 self.categorical_column)) 3365 dense_tensor = self._get_dense_tensor_internal(transformation_cache, 3366 state_manager) 3367 sparse_tensors = self.categorical_column.get_sparse_tensors( 3368 transformation_cache, state_manager) 3369 sequence_length = fc_utils.sequence_length_from_sparse_tensor( 3370 sparse_tensors.id_tensor) 3371 return SequenceDenseColumn.TensorSequenceLengthPair( 3372 dense_tensor=dense_tensor, sequence_length=sequence_length) 3373 3374 def _get_sequence_dense_tensor(self, 3375 inputs, 3376 weight_collections=None, 3377 trainable=None): 3378 return _raise_shared_embedding_column_error() 3379 3380 @property 3381 def parents(self): 3382 """See 'FeatureColumn` base class.""" 3383 return [self.categorical_column] 3384 3385 def _get_config(self): 3386 """See 'FeatureColumn` base class.""" 3387 raise NotImplementedError() 3388 3389 @classmethod 3390 def _from_config(cls, config, custom_objects=None, columns_by_name=None): 3391 """See 'FeatureColumn` base class.""" 3392 raise NotImplementedError() 3393 3394 3395def _check_shape(shape, key): 3396 """Returns shape if it's valid, raises error otherwise.""" 3397 assert shape is not None 3398 if not nest.is_sequence(shape): 3399 shape = [shape] 3400 shape = tuple(shape) 3401 for dimension in shape: 3402 if not isinstance(dimension, int): 3403 raise TypeError('shape dimensions must be integer. ' 3404 'shape: {}, key: {}'.format(shape, key)) 3405 if dimension < 1: 3406 raise ValueError('shape dimensions must be greater than 0. 
' 'shape: {}, key: {}'.format(shape, key))
  return shape


class HashedCategoricalColumn(
    CategoricalColumn,
    fc_old._CategoricalColumn,  # pylint: disable=protected-access
    collections.namedtuple('HashedCategoricalColumn',
                           ('key', 'hash_bucket_size', 'dtype'))):
  """See `categorical_column_with_hash_bucket`."""

  @property
  def _is_v2_column(self):
    return True

  @property
  def name(self):
    """See `FeatureColumn` base class."""
    return self.key

  @property
  def parse_example_spec(self):
    """See `FeatureColumn` base class."""
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  @property
  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _parse_example_spec(self):
    return self.parse_example_spec

  def _transform_input_tensor(self, input_tensor):
    """Hashes the values in the feature_column."""
    if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
      raise ValueError('SparseColumn input must be a SparseTensor.')

    fc_utils.assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensors dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    if self.dtype == dtypes.string:
      sparse_values = input_tensor.values
    else:
      sparse_values = string_ops.as_string(input_tensor.values)

    sparse_id_values = string_ops.string_to_hash_bucket_fast(
        sparse_values, self.hash_bucket_size, name='lookup')
    return sparse_tensor_lib.SparseTensor(
        input_tensor.indices, sparse_id_values, input_tensor.dense_shape)

  def transform_feature(self, transformation_cache, state_manager):
    """Hashes the values in the feature_column."""
    input_tensor = _to_sparse_input_and_drop_ignore_values(
        transformation_cache.get(self.key, state_manager))
    return self._transform_input_tensor(input_tensor)

  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
    return self._transform_input_tensor(input_tensor)

  @property
  def num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.hash_bucket_size

  @property
  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _num_buckets(self):
    return self.num_buckets

  def get_sparse_tensors(self, transformation_cache, state_manager):
    """See `CategoricalColumn` base class."""
    return CategoricalColumn.IdWeightPair(
        transformation_cache.get(self, state_manager), None)

  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    del weight_collections
    del trainable
    return CategoricalColumn.IdWeightPair(inputs.get(self), None)

  @property
  def parents(self):
    """See `FeatureColumn` base class."""
    return [self.key]

  def _get_config(self):
    """See `FeatureColumn` base class."""
    config =
dict(zip(self._fields, self)) 3507 config['dtype'] = self.dtype.name 3508 return config 3509 3510 @classmethod 3511 def _from_config(cls, config, custom_objects=None, columns_by_name=None): 3512 """See 'FeatureColumn` base class.""" 3513 _check_config_keys(config, cls._fields) 3514 kwargs = config.copy() 3515 kwargs['dtype'] = dtypes.as_dtype(config['dtype']) 3516 return cls(**kwargs) 3517 3518 3519class VocabularyFileCategoricalColumn( 3520 CategoricalColumn, 3521 fc_old._CategoricalColumn, # pylint: disable=protected-access 3522 collections.namedtuple('VocabularyFileCategoricalColumn', 3523 ('key', 'vocabulary_file', 'vocabulary_size', 3524 'num_oov_buckets', 'dtype', 'default_value'))): 3525 """See `categorical_column_with_vocabulary_file`.""" 3526 3527 @property 3528 def _is_v2_column(self): 3529 return True 3530 3531 @property 3532 def name(self): 3533 """See `FeatureColumn` base class.""" 3534 return self.key 3535 3536 @property 3537 def parse_example_spec(self): 3538 """See `FeatureColumn` base class.""" 3539 return {self.key: parsing_ops.VarLenFeature(self.dtype)} 3540 3541 @property 3542 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3543 _FEATURE_COLUMN_DEPRECATION) 3544 def _parse_example_spec(self): 3545 return self.parse_example_spec 3546 3547 def _transform_input_tensor(self, input_tensor): 3548 """Creates a lookup table for the vocabulary.""" 3549 if self.dtype.is_integer != input_tensor.dtype.is_integer: 3550 raise ValueError( 3551 'Column dtype and SparseTensors dtype must be compatible. ' 3552 'key: {}, column dtype: {}, tensor dtype: {}'.format( 3553 self.key, self.dtype, input_tensor.dtype)) 3554 3555 fc_utils.assert_string_or_int( 3556 input_tensor.dtype, 3557 prefix='column_name: {} input_tensor'.format(self.key)) 3558 3559 key_dtype = self.dtype 3560 if input_tensor.dtype.is_integer: 3561 # `index_table_from_file` requires 64-bit integer keys. 3562 key_dtype = dtypes.int64 3563 input_tensor = math_ops.cast(input_tensor, dtypes.int64) 3564 3565 # TODO(rohanj): Use state manager to manage the index table creation. 
3566 return lookup_ops.index_table_from_file( 3567 vocabulary_file=self.vocabulary_file, 3568 num_oov_buckets=self.num_oov_buckets, 3569 vocab_size=self.vocabulary_size, 3570 default_value=self.default_value, 3571 key_dtype=key_dtype, 3572 name='{}_lookup'.format(self.key)).lookup(input_tensor) 3573 3574 def transform_feature(self, transformation_cache, state_manager): 3575 """Creates a lookup table for the vocabulary.""" 3576 input_tensor = _to_sparse_input_and_drop_ignore_values( 3577 transformation_cache.get(self.key, state_manager)) 3578 return self._transform_input_tensor(input_tensor) 3579 3580 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3581 _FEATURE_COLUMN_DEPRECATION) 3582 def _transform_feature(self, inputs): 3583 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key)) 3584 return self._transform_input_tensor(input_tensor) 3585 3586 @property 3587 def num_buckets(self): 3588 """Returns number of buckets in this sparse feature.""" 3589 return self.vocabulary_size + self.num_oov_buckets 3590 3591 @property 3592 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3593 _FEATURE_COLUMN_DEPRECATION) 3594 def _num_buckets(self): 3595 return self.num_buckets 3596 3597 def get_sparse_tensors(self, transformation_cache, state_manager): 3598 """See `CategoricalColumn` base class.""" 3599 return CategoricalColumn.IdWeightPair( 3600 transformation_cache.get(self, state_manager), None) 3601 3602 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3603 _FEATURE_COLUMN_DEPRECATION) 3604 def _get_sparse_tensors(self, inputs, weight_collections=None, 3605 trainable=None): 3606 del weight_collections 3607 del trainable 3608 return CategoricalColumn.IdWeightPair(inputs.get(self), None) 3609 3610 @property 3611 def parents(self): 3612 """See 'FeatureColumn` base class.""" 3613 return [self.key] 3614 3615 def _get_config(self): 3616 """See 'FeatureColumn` base class.""" 3617 config = dict(zip(self._fields, self)) 3618 config['dtype'] = self.dtype.name 3619 return config 3620 3621 @classmethod 3622 def _from_config(cls, config, custom_objects=None, columns_by_name=None): 3623 """See 'FeatureColumn` base class.""" 3624 _check_config_keys(config, cls._fields) 3625 kwargs = config.copy() 3626 kwargs['dtype'] = dtypes.as_dtype(config['dtype']) 3627 return cls(**kwargs) 3628 3629 3630class VocabularyListCategoricalColumn( 3631 CategoricalColumn, 3632 fc_old._CategoricalColumn, # pylint: disable=protected-access 3633 collections.namedtuple( 3634 'VocabularyListCategoricalColumn', 3635 ('key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets')) 3636): 3637 """See `categorical_column_with_vocabulary_list`.""" 3638 3639 @property 3640 def _is_v2_column(self): 3641 return True 3642 3643 @property 3644 def name(self): 3645 """See `FeatureColumn` base class.""" 3646 return self.key 3647 3648 @property 3649 def parse_example_spec(self): 3650 """See `FeatureColumn` base class.""" 3651 return {self.key: parsing_ops.VarLenFeature(self.dtype)} 3652 3653 @property 3654 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3655 _FEATURE_COLUMN_DEPRECATION) 3656 def _parse_example_spec(self): 3657 return self.parse_example_spec 3658 3659 def _transform_input_tensor(self, input_tensor): 3660 """Creates a lookup table for the vocabulary list.""" 3661 if self.dtype.is_integer != input_tensor.dtype.is_integer: 3662 raise ValueError( 3663 'Column dtype and SparseTensors dtype must be compatible. 
' 3664 'key: {}, column dtype: {}, tensor dtype: {}'.format( 3665 self.key, self.dtype, input_tensor.dtype)) 3666 3667 fc_utils.assert_string_or_int( 3668 input_tensor.dtype, 3669 prefix='column_name: {} input_tensor'.format(self.key)) 3670 3671 key_dtype = self.dtype 3672 if input_tensor.dtype.is_integer: 3673 # `index_table_from_tensor` requires 64-bit integer keys. 3674 key_dtype = dtypes.int64 3675 input_tensor = math_ops.cast(input_tensor, dtypes.int64) 3676 3677 # TODO(rohanj): Use state manager to manage the index table creation. 3678 return lookup_ops.index_table_from_tensor( 3679 vocabulary_list=tuple(self.vocabulary_list), 3680 default_value=self.default_value, 3681 num_oov_buckets=self.num_oov_buckets, 3682 dtype=key_dtype, 3683 name='{}_lookup'.format(self.key)).lookup(input_tensor) 3684 3685 def transform_feature(self, transformation_cache, state_manager): 3686 """Creates a lookup table for the vocabulary list.""" 3687 input_tensor = _to_sparse_input_and_drop_ignore_values( 3688 transformation_cache.get(self.key, state_manager)) 3689 return self._transform_input_tensor(input_tensor) 3690 3691 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3692 _FEATURE_COLUMN_DEPRECATION) 3693 def _transform_feature(self, inputs): 3694 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key)) 3695 return self._transform_input_tensor(input_tensor) 3696 3697 @property 3698 def num_buckets(self): 3699 """Returns number of buckets in this sparse feature.""" 3700 return len(self.vocabulary_list) + self.num_oov_buckets 3701 3702 @property 3703 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3704 _FEATURE_COLUMN_DEPRECATION) 3705 def _num_buckets(self): 3706 return self.num_buckets 3707 3708 def get_sparse_tensors(self, transformation_cache, state_manager): 3709 """See `CategoricalColumn` base class.""" 3710 return CategoricalColumn.IdWeightPair( 3711 transformation_cache.get(self, state_manager), None) 3712 3713 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3714 _FEATURE_COLUMN_DEPRECATION) 3715 def _get_sparse_tensors(self, inputs, weight_collections=None, 3716 trainable=None): 3717 del weight_collections 3718 del trainable 3719 return CategoricalColumn.IdWeightPair(inputs.get(self), None) 3720 3721 @property 3722 def parents(self): 3723 """See 'FeatureColumn` base class.""" 3724 return [self.key] 3725 3726 def _get_config(self): 3727 """See 'FeatureColumn` base class.""" 3728 config = dict(zip(self._fields, self)) 3729 config['dtype'] = self.dtype.name 3730 return config 3731 3732 @classmethod 3733 def _from_config(cls, config, custom_objects=None, columns_by_name=None): 3734 """See 'FeatureColumn` base class.""" 3735 _check_config_keys(config, cls._fields) 3736 kwargs = config.copy() 3737 kwargs['dtype'] = dtypes.as_dtype(config['dtype']) 3738 return cls(**kwargs) 3739 3740 3741class IdentityCategoricalColumn( 3742 CategoricalColumn, 3743 fc_old._CategoricalColumn, # pylint: disable=protected-access 3744 collections.namedtuple('IdentityCategoricalColumn', 3745 ('key', 'number_buckets', 'default_value'))): 3746 3747 """See `categorical_column_with_identity`.""" 3748 3749 @property 3750 def _is_v2_column(self): 3751 return True 3752 3753 @property 3754 def name(self): 3755 """See `FeatureColumn` base class.""" 3756 return self.key 3757 3758 @property 3759 def parse_example_spec(self): 3760 """See `FeatureColumn` base class.""" 3761 return {self.key: parsing_ops.VarLenFeature(dtypes.int64)} 3762 3763 @property 3764 
@deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3765 _FEATURE_COLUMN_DEPRECATION) 3766 def _parse_example_spec(self): 3767 return self.parse_example_spec 3768 3769 def _transform_input_tensor(self, input_tensor): 3770 """Returns a SparseTensor with identity values.""" 3771 if not input_tensor.dtype.is_integer: 3772 raise ValueError( 3773 'Invalid input, not integer. key: {} dtype: {}'.format( 3774 self.key, input_tensor.dtype)) 3775 3776 values = math_ops.cast(input_tensor.values, dtypes.int64, name='values') 3777 num_buckets = math_ops.cast( 3778 self.num_buckets, dtypes.int64, name='num_buckets') 3779 zero = math_ops.cast(0, dtypes.int64, name='zero') 3780 if self.default_value is None: 3781 # Fail if values are out-of-range. 3782 assert_less = check_ops.assert_less( 3783 values, num_buckets, data=(values, num_buckets), 3784 name='assert_less_than_num_buckets') 3785 assert_greater = check_ops.assert_greater_equal( 3786 values, zero, data=(values,), 3787 name='assert_greater_or_equal_0') 3788 with ops.control_dependencies((assert_less, assert_greater)): 3789 values = array_ops.identity(values) 3790 else: 3791 # Assign default for out-of-range values. 3792 values = array_ops.where( 3793 math_ops.logical_or( 3794 values < zero, values >= num_buckets, name='out_of_range'), 3795 array_ops.fill( 3796 dims=array_ops.shape(values), 3797 value=math_ops.cast(self.default_value, dtypes.int64), 3798 name='default_values'), values) 3799 3800 return sparse_tensor_lib.SparseTensor( 3801 indices=input_tensor.indices, 3802 values=values, 3803 dense_shape=input_tensor.dense_shape) 3804 3805 def transform_feature(self, transformation_cache, state_manager): 3806 """Returns a SparseTensor with identity values.""" 3807 input_tensor = _to_sparse_input_and_drop_ignore_values( 3808 transformation_cache.get(self.key, state_manager)) 3809 return self._transform_input_tensor(input_tensor) 3810 3811 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3812 _FEATURE_COLUMN_DEPRECATION) 3813 def _transform_feature(self, inputs): 3814 input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key)) 3815 return self._transform_input_tensor(input_tensor) 3816 3817 @property 3818 def num_buckets(self): 3819 """Returns number of buckets in this sparse feature.""" 3820 return self.number_buckets 3821 3822 @property 3823 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3824 _FEATURE_COLUMN_DEPRECATION) 3825 def _num_buckets(self): 3826 return self.num_buckets 3827 3828 def get_sparse_tensors(self, transformation_cache, state_manager): 3829 """See `CategoricalColumn` base class.""" 3830 return CategoricalColumn.IdWeightPair( 3831 transformation_cache.get(self, state_manager), None) 3832 3833 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 3834 _FEATURE_COLUMN_DEPRECATION) 3835 def _get_sparse_tensors(self, inputs, weight_collections=None, 3836 trainable=None): 3837 del weight_collections 3838 del trainable 3839 return CategoricalColumn.IdWeightPair(inputs.get(self), None) 3840 3841 @property 3842 def parents(self): 3843 """See 'FeatureColumn` base class.""" 3844 return [self.key] 3845 3846 def _get_config(self): 3847 """See 'FeatureColumn` base class.""" 3848 return dict(zip(self._fields, self)) 3849 3850 @classmethod 3851 def _from_config(cls, config, custom_objects=None, columns_by_name=None): 3852 """See 'FeatureColumn` base class.""" 3853 _check_config_keys(config, cls._fields) 3854 return cls(**config) 3855 3856 3857class WeightedCategoricalColumn( 3858 CategoricalColumn, 
    fc_old._CategoricalColumn,  # pylint: disable=protected-access
    collections.namedtuple(
        'WeightedCategoricalColumn',
        ('categorical_column', 'weight_feature_key', 'dtype'))):
  """See `weighted_categorical_column`."""

  @property
  def _is_v2_column(self):
    return (isinstance(self.categorical_column, FeatureColumn) and
            self.categorical_column._is_v2_column)  # pylint: disable=protected-access

  @property
  def name(self):
    """See `FeatureColumn` base class."""
    return '{}_weighted_by_{}'.format(
        self.categorical_column.name, self.weight_feature_key)

  @property
  def parse_example_spec(self):
    """See `FeatureColumn` base class."""
    config = self.categorical_column.parse_example_spec
    if self.weight_feature_key in config:
      raise ValueError('Parse config {} already exists for {}.'.format(
          config[self.weight_feature_key], self.weight_feature_key))
    config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
    return config

  @property
  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _parse_example_spec(self):
    config = self.categorical_column._parse_example_spec  # pylint: disable=protected-access
    if self.weight_feature_key in config:
      raise ValueError('Parse config {} already exists for {}.'.format(
          config[self.weight_feature_key], self.weight_feature_key))
    config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
    return config

  @property
  def num_buckets(self):
    """See `CategoricalColumn` base class."""
    return self.categorical_column.num_buckets

  @property
  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _num_buckets(self):
    return self.categorical_column._num_buckets  # pylint: disable=protected-access

  def _transform_weight_tensor(self, weight_tensor):
    if weight_tensor is None:
      raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
    weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
        weight_tensor)
    if self.dtype != weight_tensor.dtype.base_dtype:
      raise ValueError('Bad dtype, expected {}, but got {}.'.format(
          self.dtype, weight_tensor.dtype))
    if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
      # The weight tensor can be a regular Tensor. In this case, sparsify it.
      weight_tensor = _to_sparse_input_and_drop_ignore_values(
          weight_tensor, ignore_value=0.0)
    if not weight_tensor.dtype.is_floating:
      weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
    return weight_tensor

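  # A minimal usage sketch, assuming the public `weighted_categorical_column`
  # wrapper constructs this class (the feature names here are hypothetical):
  #
  #   terms = categorical_column_with_vocabulary_list('terms', ['cat', 'dog'])
  #   weighted_terms = weighted_categorical_column(
  #       categorical_column=terms, weight_feature_key='frequencies')
  #
  # For features {'terms': [['cat', 'dog']], 'frequencies': [[0.9, 0.1]]},
  # the resulting `IdWeightPair` pairs ids (0, 1) with weights (0.9, 0.1).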
  def transform_feature(self, transformation_cache, state_manager):
    """Applies weights to tensor generated from `categorical_column`."""
    weight_tensor = transformation_cache.get(self.weight_feature_key,
                                             state_manager)
    weight_tensor = self._transform_weight_tensor(weight_tensor)
    return (transformation_cache.get(self.categorical_column, state_manager),
            weight_tensor)

  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _transform_feature(self, inputs):
    """Applies weights to tensor generated from `categorical_column`."""
    weight_tensor = inputs.get(self.weight_feature_key)
    weight_tensor = self._transform_weight_tensor(weight_tensor)
    return (inputs.get(self.categorical_column), weight_tensor)

  def get_sparse_tensors(self, transformation_cache, state_manager):
    """See `CategoricalColumn` base class."""
    tensors = transformation_cache.get(self, state_manager)
    return CategoricalColumn.IdWeightPair(tensors[0], tensors[1])

  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    del weight_collections
    del trainable
    tensors = inputs.get(self)
    return CategoricalColumn.IdWeightPair(tensors[0], tensors[1])

  @property
  def parents(self):
    """See `FeatureColumn` base class."""
    return [self.categorical_column, self.weight_feature_key]

  def _get_config(self):
    """See `FeatureColumn` base class."""
    config = dict(zip(self._fields, self))
    config['categorical_column'] = serialize_feature_column(
        self.categorical_column)
    config['dtype'] = self.dtype.name
    return config

  @classmethod
  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
    """See `FeatureColumn` base class."""
    _check_config_keys(config, cls._fields)
    kwargs = config.copy()
    kwargs['categorical_column'] = deserialize_feature_column(
        config['categorical_column'], custom_objects, columns_by_name)
    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
    return cls(**kwargs)


class CrossedColumn(
    CategoricalColumn,
    fc_old._CategoricalColumn,  # pylint: disable=protected-access
    collections.namedtuple('CrossedColumn',
                           ('keys', 'hash_bucket_size', 'hash_key'))):
  """See `crossed_column`."""

  @property
  def _is_v2_column(self):
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, six.string_types):
        continue
      if not isinstance(key, FeatureColumn):
        return False
      if not key._is_v2_column:  # pylint: disable=protected-access
        return False
    return True

  @property
  def name(self):
    """See `FeatureColumn` base class."""
    feature_names = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, (FeatureColumn, fc_old._FeatureColumn)):  # pylint: disable=protected-access
        feature_names.append(key.name)
      else:  # key must be a string
        feature_names.append(key)
    return '_X_'.join(sorted(feature_names))

  @property
  def parse_example_spec(self):
    """See `FeatureColumn` base class."""
    config = {}
    for key in self.keys:
      if isinstance(key, FeatureColumn):
        config.update(key.parse_example_spec)
      elif isinstance(key, fc_old._FeatureColumn):  # pylint: disable=protected-access
        config.update(key._parse_example_spec)  # pylint: disable=protected-access
      else:  # key must be a string
        config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
    return config

  @property
  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _parse_example_spec(self):
    return self.parse_example_spec

  def transform_feature(self, transformation_cache, state_manager):
    """Generates a hashed sparse cross from the input tensors."""
    feature_tensors = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, six.string_types):
        feature_tensors.append(transformation_cache.get(key, state_manager))
      elif isinstance(key, (fc_old._CategoricalColumn, CategoricalColumn)):  # pylint: disable=protected-access
        ids_and_weights = key.get_sparse_tensors(transformation_cache,
                                                 state_manager)
        if ids_and_weights.weight_tensor is not None:
          raise ValueError(
              'crossed_column does not support weight_tensor, but the given '
              'column populates weight_tensor. '
              'Given column: {}'.format(key.name))
        feature_tensors.append(ids_and_weights.id_tensor)
      else:
        raise ValueError('Unsupported column type. Given: {}'.format(key))
    return sparse_ops.sparse_cross_hashed(
        inputs=feature_tensors,
        num_buckets=self.hash_bucket_size,
        hash_key=self.hash_key)

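  # Roughly, `sparse_cross_hashed` fingerprints each combination of input
  # values and maps it into `hash_bucket_size` buckets; the exact fingerprint
  # function is an implementation detail of the op, but the effect is
  # approximately:
  #
  #   crossed_id = hash(combined_leaf_values, hash_key) % hash_bucket_size
  #
  # so every distinct combination of leaf values lands in a stable bucket.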
  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _transform_feature(self, inputs):
    """Generates a hashed sparse cross from the input tensors."""
    feature_tensors = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, six.string_types):
        feature_tensors.append(inputs.get(key))
      elif isinstance(key, (CategoricalColumn, fc_old._CategoricalColumn)):  # pylint: disable=protected-access
        ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
        if ids_and_weights.weight_tensor is not None:
          raise ValueError(
              'crossed_column does not support weight_tensor, but the given '
              'column populates weight_tensor. '
              'Given column: {}'.format(key.name))
        feature_tensors.append(ids_and_weights.id_tensor)
      else:
        raise ValueError('Unsupported column type. Given: {}'.format(key))
    return sparse_ops.sparse_cross_hashed(
        inputs=feature_tensors,
        num_buckets=self.hash_bucket_size,
        hash_key=self.hash_key)

  @property
  def num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.hash_bucket_size

  @property
  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _num_buckets(self):
    return self.num_buckets

  def get_sparse_tensors(self, transformation_cache, state_manager):
    """See `CategoricalColumn` base class."""
    return CategoricalColumn.IdWeightPair(
        transformation_cache.get(self, state_manager), None)

  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    """See `CategoricalColumn` base class."""
    del weight_collections
    del trainable
    return CategoricalColumn.IdWeightPair(inputs.get(self), None)

  @property
  def parents(self):
    """See `FeatureColumn` base class."""
    return list(self.keys)

  def _get_config(self):
    """See `FeatureColumn` base class."""
    config = dict(zip(self._fields, self))
    config['keys'] = tuple([serialize_feature_column(fc) for fc in self.keys])
    return config

  @classmethod
  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
    """See `FeatureColumn` base class."""
    _check_config_keys(config, cls._fields)
    kwargs = config.copy()
    kwargs['keys'] = tuple([
        deserialize_feature_column(c, custom_objects, columns_by_name)
        for c in config['keys']
    ])
    return cls(**kwargs)

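# A hedged illustration of how nested crosses expand via
# `_collect_leaf_level_keys` below (the column names here are hypothetical):
#
#   inner = crossed_column(['a', 'b'], hash_bucket_size=100)
#   outer = crossed_column([inner, 'c'], hash_bucket_size=100)
#   # _collect_leaf_level_keys(outer) -> ['a', 'b', 'c']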
4127 """ 4128 leaf_level_keys = [] 4129 for k in cross.keys: 4130 if isinstance(k, CrossedColumn): 4131 leaf_level_keys.extend(_collect_leaf_level_keys(k)) 4132 else: 4133 leaf_level_keys.append(k) 4134 return leaf_level_keys 4135 4136 4137def _prune_invalid_ids(sparse_ids, sparse_weights): 4138 """Prune invalid IDs (< 0) from the input ids and weights.""" 4139 is_id_valid = math_ops.greater_equal(sparse_ids.values, 0) 4140 if sparse_weights is not None: 4141 is_id_valid = math_ops.logical_and( 4142 is_id_valid, 4143 array_ops.ones_like(sparse_weights.values, dtype=dtypes.bool)) 4144 sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_id_valid) 4145 if sparse_weights is not None: 4146 sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_id_valid) 4147 return sparse_ids, sparse_weights 4148 4149 4150def _prune_invalid_weights(sparse_ids, sparse_weights): 4151 """Prune invalid weights (< 0) from the input ids and weights.""" 4152 if sparse_weights is not None: 4153 is_weights_valid = math_ops.greater(sparse_weights.values, 0) 4154 sparse_ids = sparse_ops.sparse_retain(sparse_ids, is_weights_valid) 4155 sparse_weights = sparse_ops.sparse_retain(sparse_weights, is_weights_valid) 4156 return sparse_ids, sparse_weights 4157 4158 4159class IndicatorColumn( 4160 DenseColumn, 4161 SequenceDenseColumn, 4162 fc_old._DenseColumn, # pylint: disable=protected-access 4163 fc_old._SequenceDenseColumn, # pylint: disable=protected-access 4164 collections.namedtuple('IndicatorColumn', ('categorical_column'))): 4165 """Represents a one-hot column for use in deep networks. 4166 4167 Args: 4168 categorical_column: A `CategoricalColumn` which is created by 4169 `categorical_column_with_*` function. 4170 """ 4171 4172 @property 4173 def _is_v2_column(self): 4174 return (isinstance(self.categorical_column, FeatureColumn) and 4175 self.categorical_column._is_v2_column) # pylint: disable=protected-access 4176 4177 @property 4178 def name(self): 4179 """See `FeatureColumn` base class.""" 4180 return '{}_indicator'.format(self.categorical_column.name) 4181 4182 def _transform_id_weight_pair(self, id_weight_pair): 4183 id_tensor = id_weight_pair.id_tensor 4184 weight_tensor = id_weight_pair.weight_tensor 4185 4186 # If the underlying column is weighted, return the input as a dense tensor. 4187 if weight_tensor is not None: 4188 weighted_column = sparse_ops.sparse_merge( 4189 sp_ids=id_tensor, 4190 sp_values=weight_tensor, 4191 vocab_size=int(self._variable_shape[-1])) 4192 # Remove (?, -1) index. 4193 weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0], 4194 weighted_column.dense_shape) 4195 # Use scatter_nd to merge duplicated indices if existed, 4196 # instead of sparse_tensor_to_dense. 4197 return array_ops.scatter_nd(weighted_column.indices, 4198 weighted_column.values, 4199 weighted_column.dense_shape) 4200 4201 dense_id_tensor = sparse_ops.sparse_tensor_to_dense( 4202 id_tensor, default_value=-1) 4203 4204 # One hot must be float for tf.concat reasons since all other inputs to 4205 # input_layer are float32. 4206 one_hot_id_tensor = array_ops.one_hot( 4207 dense_id_tensor, 4208 depth=self._variable_shape[-1], 4209 on_value=1.0, 4210 off_value=0.0) 4211 4212 # Reduce to get a multi-hot per example. 4213 return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2]) 4214 4215 def transform_feature(self, transformation_cache, state_manager): 4216 """Returns dense `Tensor` representing feature. 4217 4218 Args: 4219 transformation_cache: A `FeatureTransformationCache` object to access 4220 features. 
  def transform_feature(self, transformation_cache, state_manager):
    """Returns dense `Tensor` representing feature.

    Args:
      transformation_cache: A `FeatureTransformationCache` object to access
        features.
      state_manager: A `StateManager` to create / access resources such as
        lookup tables.

    Returns:
      Transformed feature `Tensor`.

    Raises:
      ValueError: if input rank is not known at graph building time.
    """
    id_weight_pair = self.categorical_column.get_sparse_tensors(
        transformation_cache, state_manager)
    return self._transform_id_weight_pair(id_weight_pair)

  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _transform_feature(self, inputs):
    id_weight_pair = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    return self._transform_id_weight_pair(id_weight_pair)

  @property
  def parse_example_spec(self):
    """See `FeatureColumn` base class."""
    return self.categorical_column.parse_example_spec

  @property
  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  @property
  def variable_shape(self):
    """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
    if isinstance(self.categorical_column, FeatureColumn):
      return tensor_shape.TensorShape([1, self.categorical_column.num_buckets])
    else:
      return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access

  @property
  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _variable_shape(self):
    return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access

  def get_dense_tensor(self, transformation_cache, state_manager):
    """Returns dense `Tensor` representing feature.

    Args:
      transformation_cache: A `FeatureTransformationCache` object to access
        features.
      state_manager: A `StateManager` to create / access resources such as
        lookup tables.

    Returns:
      Dense `Tensor` created within `transform_feature`.

    Raises:
      ValueError: If `categorical_column` is a `SequenceCategoricalColumn`.
    """
    if isinstance(self.categorical_column, SequenceCategoricalColumn):
      raise ValueError(
          'In indicator_column: {}. '
          'categorical_column must not be of type SequenceCategoricalColumn. '
          'Suggested fix A: If you wish to use DenseFeatures, use a '
          'non-sequence categorical_column_with_*. '
          'Suggested fix B: If you wish to create sequence input, use '
          'SequenceFeatures instead of DenseFeatures. '
          'Given (type {}): {}'.format(self.name,
                                       type(self.categorical_column),
                                       self.categorical_column))
    # Feature has already been transformed. Return the intermediate
    # representation created by transform_feature.
    return transformation_cache.get(self, state_manager)

  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    del weight_collections
    del trainable
    if isinstance(
        self.categorical_column,
        (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)):  # pylint: disable=protected-access
      raise ValueError(
          'In indicator_column: {}. '
          'categorical_column must not be of type _SequenceCategoricalColumn. 
' 4305 'Suggested fix A: If you wish to use DenseFeatures, use a ' 4306 'non-sequence categorical_column_with_*. ' 4307 'Suggested fix B: If you wish to create sequence input, use ' 4308 'SequenceFeatures instead of DenseFeatures. ' 4309 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 4310 self.categorical_column)) 4311 # Feature has been already transformed. Return the intermediate 4312 # representation created by transform_feature. 4313 return inputs.get(self) 4314 4315 def get_sequence_dense_tensor(self, transformation_cache, state_manager): 4316 """See `SequenceDenseColumn` base class.""" 4317 if not isinstance(self.categorical_column, SequenceCategoricalColumn): 4318 raise ValueError( 4319 'In indicator_column: {}. ' 4320 'categorical_column must be of type SequenceCategoricalColumn ' 4321 'to use SequenceFeatures. ' 4322 'Suggested fix: Use one of sequence_categorical_column_with_*. ' 4323 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 4324 self.categorical_column)) 4325 # Feature has been already transformed. Return the intermediate 4326 # representation created by transform_feature. 4327 dense_tensor = transformation_cache.get(self, state_manager) 4328 sparse_tensors = self.categorical_column.get_sparse_tensors( 4329 transformation_cache, state_manager) 4330 sequence_length = fc_utils.sequence_length_from_sparse_tensor( 4331 sparse_tensors.id_tensor) 4332 return SequenceDenseColumn.TensorSequenceLengthPair( 4333 dense_tensor=dense_tensor, sequence_length=sequence_length) 4334 4335 @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE, 4336 _FEATURE_COLUMN_DEPRECATION) 4337 def _get_sequence_dense_tensor(self, 4338 inputs, 4339 weight_collections=None, 4340 trainable=None): 4341 # Do nothing with weight_collections and trainable since no variables are 4342 # created in this function. 4343 del weight_collections 4344 del trainable 4345 if not isinstance( 4346 self.categorical_column, 4347 (SequenceCategoricalColumn, fc_old._SequenceCategoricalColumn)): # pylint: disable=protected-access 4348 raise ValueError( 4349 'In indicator_column: {}. ' 4350 'categorical_column must be of type _SequenceCategoricalColumn ' 4351 'to use SequenceFeatures. ' 4352 'Suggested fix: Use one of sequence_categorical_column_with_*. ' 4353 'Given (type {}): {}'.format(self.name, type(self.categorical_column), 4354 self.categorical_column)) 4355 # Feature has been already transformed. Return the intermediate 4356 # representation created by _transform_feature. 
    dense_tensor = inputs.get(self)
    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
        sparse_tensors.id_tensor)
    return SequenceDenseColumn.TensorSequenceLengthPair(
        dense_tensor=dense_tensor, sequence_length=sequence_length)

  @property
  def parents(self):
    """See `FeatureColumn` base class."""
    return [self.categorical_column]

  def _get_config(self):
    """See `FeatureColumn` base class."""
    config = dict(zip(self._fields, self))
    config['categorical_column'] = serialize_feature_column(
        self.categorical_column)
    return config

  @classmethod
  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
    """See `FeatureColumn` base class."""
    _check_config_keys(config, cls._fields)
    kwargs = config.copy()
    kwargs['categorical_column'] = deserialize_feature_column(
        config['categorical_column'], custom_objects, columns_by_name)
    return cls(**kwargs)


def _verify_static_batch_size_equality(tensors, columns):
  """Verifies equality between static batch sizes.

  Args:
    tensors: iterable of input tensors.
    columns: Corresponding feature columns.

  Raises:
    ValueError: in case of mismatched batch sizes.
  """
  # batch_size is a tf.Dimension object.
  expected_batch_size = None
  for i in range(0, len(tensors)):
    batch_size = tensor_shape.Dimension(tensor_shape.dimension_value(
        tensors[i].shape[0]))
    if batch_size.value is not None:
      if expected_batch_size is None:
        batch_size_column_index = i
        expected_batch_size = batch_size
      elif not expected_batch_size.is_compatible_with(batch_size):
        raise ValueError(
            'Batch size (first dimension) of each feature must be the same. '
            'Batch size of columns ({}, {}): ({}, {})'.format(
                columns[batch_size_column_index].name, columns[i].name,
                expected_batch_size, batch_size))
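# A hedged illustration of `_verify_static_batch_size_equality` above: two
# inputs with static shapes (32, 1) and (64, 1) raise the ValueError, while an
# input whose batch dimension is None at graph construction time is skipped
# by the check rather than treated as a mismatch.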


class SequenceCategoricalColumn(
    CategoricalColumn,
    fc_old._SequenceCategoricalColumn,  # pylint: disable=protected-access
    collections.namedtuple('SequenceCategoricalColumn',
                           ('categorical_column',))):
  """Represents sequences of categorical data."""

  @property
  def _is_v2_column(self):
    return (isinstance(self.categorical_column, FeatureColumn) and
            self.categorical_column._is_v2_column)  # pylint: disable=protected-access

  @property
  def name(self):
    """See `FeatureColumn` base class."""
    return self.categorical_column.name

  @property
  def parse_example_spec(self):
    """See `FeatureColumn` base class."""
    return self.categorical_column.parse_example_spec

  @property
  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  def transform_feature(self, transformation_cache, state_manager):
    """See `FeatureColumn` base class."""
    return self.categorical_column.transform_feature(transformation_cache,
                                                     state_manager)

  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _transform_feature(self, inputs):
    return self.categorical_column._transform_feature(inputs)  # pylint: disable=protected-access

  @property
  def num_buckets(self):
    """Returns the number of buckets in this sparse feature."""
    return self.categorical_column.num_buckets

  @property
  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _num_buckets(self):
    return self.categorical_column._num_buckets  # pylint: disable=protected-access

  def _get_sparse_tensors_helper(self, sparse_tensors):
    id_tensor = sparse_tensors.id_tensor
    weight_tensor = sparse_tensors.weight_tensor
    # Expands the third dimension, if necessary, so that embeddings are not
    # combined during the embedding lookup. If the tensor is already 3D,
    # leave it as-is.
    shape = array_ops.shape(id_tensor)
    # Compute the third dimension explicitly instead of setting it to -1, as
    # that doesn't work for dynamically shaped tensors with 0-length at
    # runtime. This happens for empty sequences.
    target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])]
    id_tensor = sparse_ops.sparse_reshape(id_tensor, target_shape)
    if weight_tensor is not None:
      weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)
    return CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)

  def get_sparse_tensors(self, transformation_cache, state_manager):
    """Returns an `IdWeightPair`.

    `IdWeightPair` is a pair of `SparseTensor`s which represents ids and
    weights.

    `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets`
    `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a
    `SparseTensor` of `float` or `None` to indicate all weights should be
    taken to be 1. If specified, `weight_tensor` must have exactly the same
    shape and indices as the `id_tensor`. The expected `SparseTensor` has the
    same format as the parsing output of a `VarLenFeature`, i.e. a ragged
    matrix.

    Args:
      transformation_cache: A `FeatureTransformationCache` object to access
        features.
      state_manager: A `StateManager` to create / access resources such as
        lookup tables.
    """
    sparse_tensors = self.categorical_column.get_sparse_tensors(
        transformation_cache, state_manager)
    return self._get_sparse_tensors_helper(sparse_tensors)

  @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
                          _FEATURE_COLUMN_DEPRECATION)
  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    return self._get_sparse_tensors_helper(sparse_tensors)

  @property
  def parents(self):
    """See `FeatureColumn` base class."""
    return [self.categorical_column]

  def _get_config(self):
    """See `FeatureColumn` base class."""
    config = dict(zip(self._fields, self))
    config['categorical_column'] = serialize_feature_column(
        self.categorical_column)
    return config

  @classmethod
  def _from_config(cls, config, custom_objects=None, columns_by_name=None):
    """See `FeatureColumn` base class."""
    _check_config_keys(config, cls._fields)
    kwargs = config.copy()
    kwargs['categorical_column'] = deserialize_feature_column(
        config['categorical_column'], custom_objects, columns_by_name)
    return cls(**kwargs)
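

# Illustrative sketch, not part of the library: SequenceCategoricalColumn is a
# thin wrapper that delegates name, parsing spec and bucket count to the
# wrapped column, and only reshapes the sparse ids to rank 3 so per-step
# embeddings are not combined. It is normally constructed via the public
# sequence_categorical_column_with_* functions rather than directly.
def _example_sequence_categorical_column():  # pragma: no cover
  inner = categorical_column_with_identity('tokens', num_buckets=10)
  seq_column = SequenceCategoricalColumn(inner)
  assert seq_column.name == inner.name              # delegated to the inner column
  assert seq_column.num_buckets == inner.num_buckets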


# FeatureColumn serialization, deserialization logic.


def _check_config_keys(config, expected_keys):
  """Checks that a config has all expected_keys."""
  if set(config.keys()) != set(expected_keys):
    raise ValueError('Invalid config: {}, expected keys: {}'.format(
        config, expected_keys))


def serialize_feature_column(fc):
  """Serializes a FeatureColumn or a raw string key.

  This method should only be used to serialize parent FeatureColumns when
  implementing FeatureColumn._get_config(); otherwise
  serialize_feature_columns() is preferable.

  This serialization also keeps the information of the FeatureColumn class,
  so deserialization is possible without knowing the class type. For example:

  a = numeric_column('price')
  a._get_config() gives:
  {
      'key': 'price',
      'shape': (1,),
      'default_value': None,
      'dtype': 'float32',
      'normalizer_fn': None
  }
  While serialize_feature_column(a) gives:
  {
      'class_name': 'NumericColumn',
      'config': {
          'key': 'price',
          'shape': (1,),
          'default_value': None,
          'dtype': 'float32',
          'normalizer_fn': None
      }
  }

  Args:
    fc: A FeatureColumn or raw feature key string.

  Returns:
    Keras serialization for FeatureColumns; leaves string keys unaffected.

  Raises:
    ValueError: if called with input that is not a string or FeatureColumn.
  """
  if isinstance(fc, six.string_types):
    return fc
  elif isinstance(fc, FeatureColumn):
    return utils.serialize_keras_class_and_config(fc.__class__.__name__,
                                                  fc._get_config())
  else:
    raise ValueError('Instance: {} is not a FeatureColumn'.format(fc))
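

# Illustrative sketch, not part of the library: the Keras-style wrapper keeps
# the class name next to the config, and raw string keys pass through
# untouched.
def _example_serialize_feature_column():  # pragma: no cover
  serialized = serialize_feature_column(numeric_column('price'))
  assert serialized['class_name'] == 'NumericColumn'
  assert serialized['config']['key'] == 'price'
  assert serialize_feature_column('raw_key') == 'raw_key'  # strings unaffected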


def deserialize_feature_column(config,
                               custom_objects=None,
                               columns_by_name=None):
  """Deserializes a `config` generated with `serialize_feature_column`.

  This method should only be used to deserialize parent FeatureColumns when
  implementing FeatureColumn._from_config(); otherwise
  deserialize_feature_columns() is preferable. Returns a FeatureColumn for
  this config.
  TODO(b/118939620): Simplify code if Keras utils support object deduping.

  Args:
    config: A Dict with the serialization of feature columns acquired by
      `serialize_feature_column`, or a string representing a raw column.
    custom_objects: A Dict from custom_object name to the associated keras
      serializable objects (FeatureColumns, classes or functions).
    columns_by_name: A Dict[String, FeatureColumn] of existing columns in
      order to avoid duplication.

  Raises:
    ValueError: if `config` has an invalid format (e.g. expected keys are
      missing, or it refers to unknown classes).

  Returns:
    A FeatureColumn corresponding to the input `config`.
  """
  if isinstance(config, six.string_types):
    return config
  # A dict from class_name to class for all FeatureColumns in this module.
  # FeatureColumns not part of the module can be passed as custom_objects.
  module_feature_column_classes = {
      cls.__name__: cls for cls in [
          BucketizedColumn, EmbeddingColumn, HashedCategoricalColumn,
          IdentityCategoricalColumn, IndicatorColumn, NumericColumn,
          SequenceCategoricalColumn, SequenceDenseColumn, SharedEmbeddingColumn,
          VocabularyFileCategoricalColumn, VocabularyListCategoricalColumn,
          WeightedCategoricalColumn, init_ops.TruncatedNormal
      ]
  }
  if columns_by_name is None:
    columns_by_name = {}

  (cls, cls_config) = utils.class_and_config_for_serialized_keras_object(
      config,
      module_objects=module_feature_column_classes,
      custom_objects=custom_objects,
      printable_module_name='feature_column_v2')

  if not issubclass(cls, FeatureColumn):
    raise ValueError(
        'Expected FeatureColumn class, instead found: {}'.format(cls))

  # Always deserialize the FeatureColumn, in order to get the name.
  new_instance = cls._from_config(  # pylint: disable=protected-access
      cls_config,
      custom_objects=custom_objects,
      columns_by_name=columns_by_name)

  # If the name already exists, re-use the column from columns_by_name
  # (new_instance remains unused).
  return columns_by_name.setdefault(new_instance.name, new_instance)
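

# Illustrative sketch, not part of the library: a round trip through
# serialize_feature_column / deserialize_feature_column. Sharing one
# columns_by_name dict across calls dedupes columns with the same name via
# the setdefault above.
def _example_deserialize_feature_column():  # pragma: no cover
  price = numeric_column('price')
  restored = deserialize_feature_column(serialize_feature_column(price))
  assert restored.name == price.name
  seen = {}
  first = deserialize_feature_column(
      serialize_feature_column(price), columns_by_name=seen)
  second = deserialize_feature_column(
      serialize_feature_column(price), columns_by_name=seen)
  assert first is second  # The second call reuses the registered column.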


def serialize_feature_columns(feature_columns):
  """Serializes a list of FeatureColumns.

  Returns a list of Keras-style config dicts that represent the input
  FeatureColumns and can be used with `deserialize_feature_columns` to
  reconstruct the original columns.

  Args:
    feature_columns: A list of FeatureColumns.

  Returns:
    Keras serialization for the list of FeatureColumns.

  Raises:
    ValueError: if called with input that is not a list of FeatureColumns.
  """
  return [serialize_feature_column(fc) for fc in feature_columns]


def deserialize_feature_columns(configs, custom_objects=None):
  """Deserializes a list of FeatureColumn configs.

  Returns a list of FeatureColumns given a list of config dicts acquired by
  `serialize_feature_columns`.

  Args:
    configs: A list of Dicts with the serialization of feature columns
      acquired by `serialize_feature_columns`.
    custom_objects: A Dict from custom_object name to the associated keras
      serializable objects (FeatureColumns, classes or functions).

  Returns:
    FeatureColumn objects corresponding to the input configs.

  Raises:
    ValueError: if called with input that is not a list of FeatureColumn
      configs.
  """
  columns_by_name = {}
  return [
      deserialize_feature_column(c, custom_objects, columns_by_name)
      for c in configs
  ]
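

# Illustrative sketch, not part of the library: the list-level helpers map the
# single-column functions over their inputs; deserialize_feature_columns
# shares one columns_by_name dict so duplicated parent columns resolve to the
# same object.
def _example_feature_columns_round_trip():  # pragma: no cover
  age = numeric_column('age')
  columns = [age, bucketized_column(age, boundaries=[18, 25, 65])]
  restored = deserialize_feature_columns(serialize_feature_columns(columns))
  assert [c.name for c in restored] == [c.name for c in columns]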