# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This API defines the FeatureColumn abstraction.

FeatureColumns provide a high level abstraction for ingesting and representing
features. FeatureColumns are also the primary way of encoding features for
canned `tf.estimator.Estimator`s.

When using FeatureColumns with `Estimators`, the type of feature column you
should choose depends on (1) the feature type and (2) the model type.

1. Feature type:

  * Continuous features can be represented by `numeric_column`.
  * Categorical features can be represented by any `categorical_column_with_*`
    column:
    - `categorical_column_with_vocabulary_list`
    - `categorical_column_with_vocabulary_file`
    - `categorical_column_with_hash_bucket`
    - `categorical_column_with_identity`
    - `weighted_categorical_column`

2. Model type:

  * Deep neural network models (`DNNClassifier`, `DNNRegressor`).

    Continuous features can be directly fed into deep neural network models.

      age_column = numeric_column("age")

    To feed sparse features into DNN models, wrap the column with
    `embedding_column` or `indicator_column`. `indicator_column` is
    recommended for features with only a few possible values. For features
    with many possible values, `embedding_column` is recommended to reduce
    the size of your model.

      embedded_dept_column = embedding_column(
          categorical_column_with_vocabulary_list(
              "department", ["math", "philosophy", ...]), dimension=10)

  * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).

    Sparse features can be fed directly into linear models. They behave like
    an indicator column but with an efficient implementation.

      dept_column = categorical_column_with_vocabulary_list("department",
          ["math", "philosophy", "english"])

    It is recommended that continuous features be bucketized before being
    fed into linear models.

      bucketized_age_column = bucketized_column(
          source_column=age_column,
          boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    Sparse features can be crossed (also known as conjoined or combined) in
    order to form non-linearities, and then fed into linear models.

      cross_dept_age_column = crossed_column(
          columns=["department", bucketized_age_column],
          hash_bucket_size=1000)

Example of building canned `Estimator`s using FeatureColumns:

  ```python
  # Define features and transformations
  deep_feature_columns = [age_column, embedded_dept_column]
  wide_feature_columns = [dept_column, bucketized_age_column,
                          cross_dept_age_column]

  # Build deep model
  estimator = DNNClassifier(
      feature_columns=deep_feature_columns,
      hidden_units=[500, 250, 50])
  estimator.train(...)

  # Or build a wide model
  estimator = LinearClassifier(
      feature_columns=wide_feature_columns)
  estimator.train(...)

  # Or build a wide and deep model!
  estimator = DNNLinearCombinedClassifier(
      linear_feature_columns=wide_feature_columns,
      dnn_feature_columns=deep_feature_columns,
      dnn_hidden_units=[500, 250, 50])
  estimator.train(...)
  ```


FeatureColumns can also be transformed into a generic input layer for
custom models using `input_layer`.

Example of building a model using FeatureColumns; this can be used in a
`model_fn` which is given to the `tf.estimator.Estimator`:

  ```python
  # Building model via layers

  deep_feature_columns = [age_column, embedded_dept_column]
  columns_to_tensor = parse_feature_columns_from_examples(
      serialized=my_data,
      feature_columns=deep_feature_columns)
  first_layer = input_layer(
      features=columns_to_tensor,
      feature_columns=deep_feature_columns)
  second_layer = fully_connected(first_layer, ...)
  ```

NOTE: Functions prefixed with "_" indicate experimental or private parts of
the API subject to change, and should not be relied upon!

NOTE: The new feature columns are being developed in feature_column_v2.py and
somewhat duplicate the code here. Please make sure to update logic in both
places.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import collections
import math

import numpy as np
import six

from tensorflow.python.eager import context
from tensorflow.python.feature_column import utils as fc_utils
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.framework import tensor_shape
from tensorflow.python.keras.engine import training
from tensorflow.python.layers import base
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.ops import template
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import gfile
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import checkpoint_utils
from tensorflow.python.util import nest
from tensorflow.python.util.tf_export import tf_export


def _internal_input_layer(features,
                          feature_columns,
                          weight_collections=None,
                          trainable=True,
                          cols_to_vars=None,
                          scope=None,
                          cols_to_output_tensors=None,
                          from_template=False):
  """See input_layer. `scope` is a name or variable scope to use."""

  feature_columns = _normalize_feature_columns(feature_columns)
  for column in feature_columns:
    if not isinstance(column, _DenseColumn):
      raise ValueError(
          'Items of feature_columns must be a _DenseColumn. '
          'You can wrap a categorical column with an '
          'embedding_column or indicator_column. Given: {}'.format(column))
  weight_collections = list(weight_collections or [])
  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)

  def _get_logits():  # pylint: disable=missing-docstring
    builder = _LazyBuilder(features)
    output_tensors = []
    ordered_columns = []
    for column in sorted(feature_columns, key=lambda x: x.name):
      ordered_columns.append(column)
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
            builder,
            weight_collections=weight_collections,
            trainable=trainable)
        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
        batch_size = array_ops.shape(tensor)[0]
        output_tensor = array_ops.reshape(
            tensor, shape=(batch_size, num_elements))
        output_tensors.append(output_tensor)
        if cols_to_vars is not None:
          # Retrieve any variables created (some _DenseColumn's don't create
          # variables, in which case an empty list is returned).
          cols_to_vars[column] = ops.get_collection(
              ops.GraphKeys.GLOBAL_VARIABLES,
              scope=variable_scope.get_variable_scope().name)
        if cols_to_output_tensors is not None:
          cols_to_output_tensors[column] = output_tensor
    _verify_static_batch_size_equality(output_tensors, ordered_columns)
    return array_ops.concat(output_tensors, 1)

  # If we're constructing from the `make_template`, that by default adds a
  # variable scope with the name of the layer. In that case, we don't want to
  # add another `variable_scope` as that would break checkpoints.
  if from_template:
    return _get_logits()
  else:
    with variable_scope.variable_scope(
        scope, default_name='input_layer', values=features.values()):
      return _get_logits()


@tf_export(v1=['feature_column.input_layer'])
def input_layer(features,
                feature_columns,
                weight_collections=None,
                trainable=True,
                cols_to_vars=None,
                cols_to_output_tensors=None):
  """Returns a dense `Tensor` as input layer based on given `feature_columns`.

  Generally a single example in training data is described with
  FeatureColumns. At the first layer of the model, this column-oriented data
  should be converted to a single `Tensor`.

  Example:

  ```python
  price = numeric_column('price')
  keywords_embedded = embedding_column(
      categorical_column_with_hash_bucket("keywords", 10K), dimension=16)
  columns = [price, keywords_embedded, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  for units in [128, 64, 32]:
    dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu)
  prediction = tf.layers.dense(dense_tensor, 1)
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at 'price'
      key in this dict. Values can be a `SparseTensor` or a `Tensor` depending
      on the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as
      inputs to your model.
      All items should be instances of classes derived from
      `_DenseColumn` such as `numeric_column`, `embedding_column`,
      `bucketized_column`, `indicator_column`. If you have categorical
      features, you can wrap them with an `embedding_column` or
      `indicator_column`.
    weight_collections: A list of collection names to which the Variable will
      be added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `tf.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled
      with a mapping from `_FeatureColumn` to list of `Variable`s. For
      example, after the call, we might have cols_to_vars =
      {_EmbeddingColumn(
          categorical_column=_HashedCategoricalColumn(
              key='sparse_feature', hash_bucket_size=5, dtype=tf.string),
          dimension=10): [<tf.Variable 'some_variable:0' shape=(5, 10)>,
                          <tf.Variable 'some_variable:1' shape=(5, 10)>]}
      If a column creates no variables, its value will be an empty list.
    cols_to_output_tensors: If not `None`, must be a dictionary that will be
      filled with a mapping from `_FeatureColumn` to the associated output
      `Tensor`s.

  Returns:
    A `Tensor` which represents input layer of a model. Its shape
    is (batch_size, first_layer_dimension) and its dtype is `float32`.
    first_layer_dimension is determined based on given `feature_columns`.

  Raises:
    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
  """
  return _internal_input_layer(
      features,
      feature_columns,
      weight_collections=weight_collections,
      trainable=trainable,
      cols_to_vars=cols_to_vars,
      cols_to_output_tensors=cols_to_output_tensors)


# TODO(akshayka): InputLayer should be a subclass of Layer, and it
# should implement the logic in input_layer using Layer's build-and-call
# paradigm; input_layer should create an instance of InputLayer and
# return the result of invoking its apply method, just as functional layers do.
class InputLayer(object):
  """An object-oriented version of `input_layer` that reuses variables.
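
  Variables are created on the first call and reused on subsequent calls,
  e.g. to share one input layer between training and evaluation graphs. A
  minimal sketch, assuming `columns`, `train_features` and `eval_features`
  are defined as in `input_layer`:

  ```python
  feature_layer = InputLayer(feature_columns=columns)
  train_dense = feature_layer(train_features)  # Creates the variables.
  eval_dense = feature_layer(eval_features)    # Reuses the same variables.
  ```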
  """

  def __init__(self,
               feature_columns,
               weight_collections=None,
               trainable=True,
               cols_to_vars=None,
               name='feature_column_input_layer',
               create_scope_now=True):
    """See `input_layer`."""

    self._feature_columns = feature_columns
    self._weight_collections = weight_collections
    self._trainable = trainable
    self._cols_to_vars = cols_to_vars
    self._name = name
    self._input_layer_template = template.make_template(
        self._name, _internal_input_layer, create_scope_now_=create_scope_now)
    self._scope = self._input_layer_template.variable_scope

  def __call__(self, features):
    return self._input_layer_template(
        features=features,
        feature_columns=self._feature_columns,
        weight_collections=self._weight_collections,
        trainable=self._trainable,
        cols_to_vars=None,
        from_template=True)

  @property
  def name(self):
    return self._name

  @property
  def non_trainable_variables(self):
    return self._input_layer_template.non_trainable_variables

  @property
  def non_trainable_weights(self):
    return self._input_layer_template.non_trainable_weights

  @property
  def trainable_variables(self):
    return self._input_layer_template.trainable_variables

  @property
  def trainable_weights(self):
    return self._input_layer_template.trainable_weights

  @property
  def variables(self):
    return self._input_layer_template.variables

  @property
  def weights(self):
    return self._input_layer_template.weights


@tf_export(v1=['feature_column.linear_model'])
def linear_model(features,
                 feature_columns,
                 units=1,
                 sparse_combiner='sum',
                 weight_collections=None,
                 trainable=True,
                 cols_to_vars=None):
  """Returns a linear prediction `Tensor` based on given `feature_columns`.

  This function generates a weighted sum based on output dimension `units`.
  Weighted sum refers to logits in classification problems. It refers to the
  prediction itself for linear regression problems.

  Note on supported columns: `linear_model` treats categorical columns as
  `indicator_column`s. To be specific, assume the input `SparseTensor` looks
  like:

  ```python
  shape = [2, 2]
  {
      [0, 0]: "a"
      [1, 0]: "b"
      [1, 1]: "c"
  }
  ```
  `linear_model` assigns weights for the presence of "a", "b", "c" implicitly,
  just like `indicator_column`, while `input_layer` explicitly requires
  wrapping each categorical column with an `embedding_column` or an
  `indicator_column`.

  Example of usage:

  ```python
  price = numeric_column('price')
  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
  keywords = categorical_column_with_hash_bucket("keywords", 10K)
  keywords_price = crossed_column('keywords', price_buckets, ...)
  columns = [price_buckets, keywords, keywords_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  prediction = linear_model(features, columns)
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at 'price'
      key in this dict. Values are `Tensor` or `SparseTensor` depending on
      the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as
      inputs to your model. All items should be instances of classes derived
      from `_FeatureColumn`.
    units: An integer, dimensionality of the output space. Default value is 1.
    sparse_combiner: A string specifying how to reduce if a categorical
      column is multivalent. Aside from `numeric_column`, almost all columns
      passed to `linear_model` are treated as categorical columns. It
      combines each categorical column independently. Currently "mean",
      "sqrtn" and "sum" are supported, with "sum" the default for linear
      model. "sqrtn" often achieves good accuracy, in particular with
      bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For example, for two features represented as the categorical columns:

      ```python
      # Feature 1

      shape = [2, 2]
      {
          [0, 0]: "a"
          [0, 1]: "b"
          [1, 0]: "c"
      }

      # Feature 2

      shape = [2, 3]
      {
          [0, 0]: "d"
          [1, 0]: "e"
          [1, 1]: "f"
          [1, 2]: "f"
      }
      ```

      with `sparse_combiner` as "mean", the linear model outputs are
      consequently:

      ```
      y_0 = 1.0 / 2.0 * ( w_a + w_b ) + w_d + b
      y_1 = w_c + 1.0 / 3.0 * ( w_e + 2.0 * w_f ) + b
      ```

      where `y_i` is the output, `b` is the bias, and `w_x` is the weight
      assigned to the presence of `x` in the input features.
    weight_collections: A list of collection names to which the Variable
      will be added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `tf.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled
      with a mapping from `_FeatureColumn` to associated list of `Variable`s.
      For example, after the call, we might have cols_to_vars = {
        _NumericColumn(
            key='numeric_feature1', shape=(1,)):
                [<tf.Variable 'linear_model/price2/weights:0' shape=(1, 1)>],
        'bias': [<tf.Variable 'linear_model/bias_weights:0' shape=(1,)>],
        _NumericColumn(
            key='numeric_feature2', shape=(2,)):
                [<tf.Variable 'linear_model/price1/weights:0' shape=(2, 1)>]}
      If a column creates no variables, its value will be an empty list. Note
      that cols_to_vars will also contain a string key 'bias' that maps to a
      list of Variables.

  Returns:
    A `Tensor` which represents predictions/logits of a linear model. Its
    shape is (batch_size, units) and its dtype is `float32`.

  Raises:
    ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
      nor `_CategoricalColumn`.
  """
  with variable_scope.variable_scope(None, 'linear_model') as vs:
    model_name = _strip_leading_slashes(vs.name)
  linear_model_layer = _LinearModel(
      feature_columns=feature_columns,
      units=units,
      sparse_combiner=sparse_combiner,
      weight_collections=weight_collections,
      trainable=trainable,
      name=model_name)
  retval = linear_model_layer(features)  # pylint: disable=not-callable
  if cols_to_vars is not None:
    cols_to_vars.update(linear_model_layer.cols_to_vars())
  return retval


def _add_to_collections(var, weight_collections):
  """Adds a var to the list of weight_collections provided.

  Handles the case for partitioned and non-partitioned variables.

  Args:
    var: A variable or Partitioned Variable.
    weight_collections: List of collections to add variable to.
  """
  for weight_collection in weight_collections:
    # The layer self.add_variable call already adds it to GLOBAL_VARIABLES.
    if weight_collection == ops.GraphKeys.GLOBAL_VARIABLES:
      continue
    # TODO(rohanj): Explore adding a _get_variable_list method on `Variable`
    # so that we don't have to do this check.
    if isinstance(var, variables.PartitionedVariable):
      for constituent_var in list(var):
        ops.add_to_collection(weight_collection, constituent_var)
    else:
      ops.add_to_collection(weight_collection, var)


class _FCLinearWrapper(base.Layer):
  """Wraps a _FeatureColumn in a layer for use in a linear model.

  See `linear_model` above.
  """

  def __init__(self,
               feature_column,
               units=1,
               sparse_combiner='sum',
               weight_collections=None,
               trainable=True,
               name=None,
               **kwargs):
    super(_FCLinearWrapper, self).__init__(
        trainable=trainable, name=name, **kwargs)
    self._feature_column = feature_column
    self._units = units
    self._sparse_combiner = sparse_combiner
    self._weight_collections = weight_collections

  def build(self, _):
    if isinstance(self._feature_column, _CategoricalColumn):
      weight = self.add_variable(
          name='weights',
          shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
          initializer=init_ops.zeros_initializer(),
          trainable=self.trainable)
    else:
      num_elements = self._feature_column._variable_shape.num_elements()  # pylint: disable=protected-access
      weight = self.add_variable(
          name='weights',
          shape=[num_elements, self._units],
          initializer=init_ops.zeros_initializer(),
          trainable=self.trainable)
    _add_to_collections(weight, self._weight_collections)
    self._weight_var = weight
    self.built = True

  def call(self, builder):
    weighted_sum = _create_weighted_sum(
        column=self._feature_column,
        builder=builder,
        units=self._units,
        sparse_combiner=self._sparse_combiner,
        weight_collections=self._weight_collections,
        trainable=self.trainable,
        weight_var=self._weight_var)
    return weighted_sum


class _BiasLayer(base.Layer):
  """A layer for the bias term."""

  def __init__(self,
               units=1,
               trainable=True,
               weight_collections=None,
               name=None,
               **kwargs):
    super(_BiasLayer, self).__init__(trainable=trainable, name=name, **kwargs)
    self._units = units
    self._weight_collections = weight_collections

  def build(self, _):
    self._bias_variable = self.add_variable(
        'bias_weights',
        shape=[self._units],
        initializer=init_ops.zeros_initializer(),
        trainable=self.trainable)
    _add_to_collections(self._bias_variable, self._weight_collections)
    self.built = True

  def call(self, _):
    return self._bias_variable


def _get_expanded_variable_list(variable):
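  """Returns the given variable as a flat list of `Variable`s.

  A `PartitionedVariable` is expanded into its constituent variables; a
  plain or resource variable is returned as a one-element list.
  """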
  if (isinstance(variable, variables.Variable) or
      resource_variable_ops.is_resource_variable(variable)):
    return [variable]  # Single variable case.
  else:  # Must be a PartitionedVariable, so convert into a list.
    return list(variable)


def _strip_leading_slashes(name):
  return name.rsplit('/', 1)[-1]


class _LinearModel(training.Model):
  """Creates a linear model using feature columns.

  See `linear_model` for details.
  """

  def __init__(self,
               feature_columns,
               units=1,
               sparse_combiner='sum',
               weight_collections=None,
               trainable=True,
               name=None,
               **kwargs):
    super(_LinearModel, self).__init__(name=name, **kwargs)
    self._feature_columns = _normalize_feature_columns(feature_columns)
    self._weight_collections = list(weight_collections or [])
    if ops.GraphKeys.GLOBAL_VARIABLES not in self._weight_collections:
      self._weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
    if ops.GraphKeys.MODEL_VARIABLES not in self._weight_collections:
      self._weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)

    column_layers = {}
    for column in sorted(self._feature_columns, key=lambda x: x.name):
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name) as vs:  # pylint: disable=protected-access
        # Having the fully expressed variable scope name ends up doubly
        # expressing the outer scope (scope with which this method was called)
        # in the name of the variable that would get created.
        column_name = _strip_leading_slashes(vs.name)
      column_layer = _FCLinearWrapper(column, units, sparse_combiner,
                                      self._weight_collections, trainable,
                                      column_name, **kwargs)
      column_layers[column_name] = column_layer
    self._column_layers = self._add_layers(column_layers)
    self._bias_layer = _BiasLayer(
        units=units,
        trainable=trainable,
        weight_collections=self._weight_collections,
        name='bias_layer',
        **kwargs)
    self._cols_to_vars = {}

  def cols_to_vars(self):
    """Returns a dict mapping _FeatureColumns to variables.

    See `linear_model` for more information.
    This is not populated until `call` is invoked, i.e. until the layer is
    built.
    """
    return self._cols_to_vars

  def call(self, features):
    with variable_scope.variable_scope(self.name):
      for column in self._feature_columns:
        if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
          raise ValueError(
              'Items of feature_columns must be either a '
              '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
      weighted_sums = []
      ordered_columns = []
      builder = _LazyBuilder(features)
      for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
        column = layer._feature_column  # pylint: disable=protected-access
        ordered_columns.append(column)
        weighted_sum = layer(builder)
        weighted_sums.append(weighted_sum)
        self._cols_to_vars[column] = ops.get_collection(
            ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)

      _verify_static_batch_size_equality(weighted_sums, ordered_columns)
      predictions_no_bias = math_ops.add_n(
          weighted_sums, name='weighted_sum_no_bias')
      predictions = nn_ops.bias_add(
          predictions_no_bias,
          self._bias_layer(  # pylint: disable=not-callable
              builder,
              scope=variable_scope.get_variable_scope()),
          name='weighted_sum')
      bias = self._bias_layer.variables[0]
      self._cols_to_vars['bias'] = _get_expanded_variable_list(bias)
    return predictions

  def _add_layers(self, layers):
    # "Magic" required for keras.Model classes to track all the variables in
    # a list of layers.Layer objects.
    # TODO(ashankar): Figure out API so user code doesn't have to do this.
    for name, layer in layers.items():
      setattr(self, 'layer-%s' % name, layer)
    return layers


def _transform_features(features, feature_columns):
  """Returns transformed features based on feature columns passed in.

  Note that you will most likely not need to use this function. Check
  `input_layer` and `linear_model` to see whether they satisfy your use case.

  Example:

  ```python
  # Define features and transformations
  crosses_a_x_b = crossed_column(
      columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000)
  price_buckets = bucketized_column(
      source_column=numeric_column("price"), boundaries=[...])

  columns = [crosses_a_x_b, price_buckets]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  transformed = transform_features(features=features, feature_columns=columns)

  assertCountEqual(columns, transformed.keys())
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via
      these keys. For example `numeric_column('price')` will look at 'price'
      key in this dict. Values can be a `SparseTensor` or a `Tensor` depending
      on the corresponding `_FeatureColumn`.
    feature_columns: An iterable containing all the `_FeatureColumn`s.

  Returns:
    A `dict` mapping `_FeatureColumn` to `Tensor` and `SparseTensor` values.
  """
  feature_columns = _normalize_feature_columns(feature_columns)
  outputs = {}
  with ops.name_scope(
      None, default_name='transform_features', values=features.values()):
    builder = _LazyBuilder(features)
    for column in sorted(feature_columns, key=lambda x: x.name):
      with ops.name_scope(None, default_name=column.name):
        outputs[column] = builder.get(column)
  return outputs


@tf_export(v1=['feature_column.make_parse_example_spec'])
def make_parse_example_spec(feature_columns):
  """Creates parsing spec dictionary from input feature_columns.

  The returned dictionary can be used as arg 'features' in `tf.parse_example`.

  Typical usage example:

  ```python
  # Define features and transformations
  feature_a = categorical_column_with_vocabulary_file(...)
  feature_b = numeric_column(...)
  feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...)
  feature_a_x_feature_c = crossed_column(
      columns=["feature_a", feature_c_bucketized], ...)

  feature_columns = set(
      [feature_b, feature_c_bucketized, feature_a_x_feature_c])
  features = tf.parse_example(
      serialized=serialized_examples,
      features=make_parse_example_spec(feature_columns))
  ```

  For the above example, make_parse_example_spec would return the dict:

  ```python
  {
      "feature_a": parsing_ops.VarLenFeature(tf.string),
      "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
      "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
  }
  ```
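
  Note that the returned spec covers only the given feature columns; for
  training you typically also add entries for labels (and possibly example
  weights) yourself. A minimal sketch, assuming a hypothetical float label
  keyed 'my_label':

  ```python
  parse_spec = make_parse_example_spec(feature_columns)
  parse_spec['my_label'] = tf.FixedLenFeature([1], dtype=tf.float32)
  features = tf.parse_example(serialized=serialized_examples,
                              features=parse_spec)
  ```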

  Args:
    feature_columns: An iterable containing all feature columns. All items
      should be instances of classes derived from `_FeatureColumn`.

  Returns:
    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
    value.

  Raises:
    ValueError: If any of the given `feature_columns` is not a
      `_FeatureColumn` instance.
  """
  result = {}
  for column in feature_columns:
    if not isinstance(column, _FeatureColumn):
      raise ValueError(
          'All feature_columns must be _FeatureColumn instances. '
          'Given: {}'.format(column))
    config = column._parse_example_spec  # pylint: disable=protected-access
    for key, value in six.iteritems(config):
      if key in result and value != result[key]:
        raise ValueError(
            'feature_columns contain different parse_spec for key '
            '{}. Given {} and {}'.format(key, value, result[key]))
    result.update(config)
  return result


def _embedding_column(categorical_column,
                      dimension,
                      combiner='mean',
                      initializer=None,
                      ckpt_to_load_from=None,
                      tensor_name_in_ckpt=None,
                      max_norm=None,
                      trainable=True):
  """`_DenseColumn` that converts from sparse, categorical input.

  Use this when your inputs are sparse, but you want to convert them to a
  dense representation (e.g., to feed to a DNN).

  Inputs must be a `_CategoricalColumn` created by any of the
  `categorical_column_*` functions. Here is an example of using
  `embedding_column` with `DNNClassifier`:

  ```python
  video_id = categorical_column_with_identity(
      key='video_id', num_buckets=1000000, default_value=0)
  columns = [embedding_column(video_id, 9), ...]

  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)

  label_column = ...
  def input_fn():
    features = tf.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example using `embedding_column` with model_fn:

  ```python
  def model_fn(features, ...):
    video_id = categorical_column_with_identity(
        key='video_id', num_buckets=1000000, default_value=0)
    columns = [embedding_column(video_id, 9), ...]
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```
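
  By default the embedding weights are initialized with a
  `truncated_normal_initializer` whose standard deviation is
  `1/sqrt(dimension)` (see the implementation below). A minimal sketch of
  overriding this via `initializer`, reusing the `video_id` column from the
  examples above:

  ```python
  columns = [embedding_column(
      video_id, 9, initializer=tf.truncated_normal_initializer(stddev=0.02))]
  ```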

  Args:
    categorical_column: A `_CategoricalColumn` created by a
      `categorical_column_with_*` function. This column produces the sparse
      IDs that are inputs to the embedding lookup.
    dimension: An integer specifying dimension of the embedding, must be > 0.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported,
      with 'mean' the default. 'sqrtn' often achieves good accuracy, in
      particular with bag-of-words columns. Each of these can be thought of
      as an example-level normalization on the column. For more information,
      see `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean `0.0` and standard
      deviation `1/sqrt(dimension)`.
    ckpt_to_load_from: String representing checkpoint name/pattern from
      which to restore column weights. Required if `tensor_name_in_ckpt` is
      not `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
      which to restore the column weights. Required if `ckpt_to_load_from`
      is not `None`.
    max_norm: If not `None`, embedding values are l2-normalized to this
      value.
    trainable: Whether or not the embedding is trainable. Default is True.

  Returns:
    `_DenseColumn` that converts from sparse input.

  Raises:
    ValueError: if `dimension` not > 0.
    ValueError: if exactly one of `ckpt_to_load_from` and
      `tensor_name_in_ckpt` is specified.
    ValueError: if `initializer` is specified and is not callable.
    RuntimeError: If eager execution is enabled.
  """
  if (dimension is None) or (dimension < 1):
    raise ValueError('Invalid dimension {}.'.format(dimension))
  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
    raise ValueError('Must specify both `ckpt_to_load_from` and '
                     '`tensor_name_in_ckpt` or none of them.')

  if (initializer is not None) and (not callable(initializer)):
    raise ValueError('initializer must be callable if specified. '
                     'Embedding of column_name: {}'.format(
                         categorical_column.name))
  if initializer is None:
    initializer = init_ops.truncated_normal_initializer(
        mean=0.0, stddev=1 / math.sqrt(dimension))

  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access

  def _creator(weight_collections, scope):
    embedding_column_layer = _EmbeddingColumnLayer(
        embedding_shape=embedding_shape,
        initializer=initializer,
        weight_collections=weight_collections,
        trainable=trainable,
        name='embedding_column_layer')
    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable

  return _EmbeddingColumn(
      categorical_column=categorical_column,
      dimension=dimension,
      combiner=combiner,
      layer_creator=_creator,
      ckpt_to_load_from=ckpt_to_load_from,
      tensor_name_in_ckpt=tensor_name_in_ckpt,
      max_norm=max_norm,
      trainable=trainable)


def _numeric_column(key,
                    shape=(1,),
                    default_value=None,
                    dtype=dtypes.float32,
                    normalizer_fn=None):
  """Represents real valued or numerical features.

  Example:

  ```python
  price = numeric_column('price')
  columns = [price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)

  # or
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```
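
  Since `normalizer_fn` is applied after parsing, it can perform any
  element-wise transformation, not just normalization. A minimal sketch (the
  shift and scale constants are illustrative only):

  ```python
  def scale_price(x):
    # Shift and rescale the raw values.
    return (x - 3.0) / 4.2

  price = numeric_column('price', normalizer_fn=scale_price)
  ```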

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs,
      feature `Tensor` objects, and feature columns.
    shape: An iterable of integers specifying the shape of the `Tensor`. An
      integer can be given, which means a single dimension `Tensor` with the
      given width. The `Tensor` representing the column will have the shape
      of [batch_size] + `shape`.
    default_value: A single value compatible with `dtype` or an iterable of
      values compatible with `dtype` which the column takes on during
      `tf.Example` parsing if data is missing. A default value of `None`
      will cause `tf.parse_example` to fail if an example does not contain
      this column. If a single value is provided, the same value will be
      applied as the default value for every item. If an iterable of values
      is provided, the shape of the `default_value` should be equal to the
      given `shape`.
    dtype: defines the type of values. Default value is `tf.float32`. Must
      be a non-quantized, real integer or floating point type.
    normalizer_fn: If not `None`, a function that can be used to normalize
      the value of the tensor after `default_value` is applied for parsing.
      The normalizer function takes the input `Tensor` as its argument, and
      returns the output `Tensor` (e.g. lambda x: (x - 3.0) / 4.2). Note
      that even though the most common use case of this function is
      normalization, it can be used for any kind of TensorFlow
      transformation.

  Returns:
    A `_NumericColumn`.

  Raises:
    TypeError: if any dimension in shape is not an int
    ValueError: if any dimension in shape is not a positive integer
    TypeError: if `default_value` is an iterable but not compatible with
      `shape`
    TypeError: if `default_value` is not compatible with `dtype`.
    ValueError: if `dtype` is not convertible to `tf.float32`.
  """
  shape = _check_shape(shape, key)
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError('dtype must be convertible to float. '
                     'dtype: {}, key: {}'.format(dtype, key))
  default_value = fc_utils.check_default_value(
      shape, default_value, dtype, key)

  if normalizer_fn is not None and not callable(normalizer_fn):
    raise TypeError(
        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))

  fc_utils.assert_key_is_string(key)
  return _NumericColumn(
      key,
      shape=shape,
      default_value=default_value,
      dtype=dtype,
      normalizer_fn=normalizer_fn)


def _bucketized_column(source_column, boundaries):
  """Represents discretized dense input.

  Buckets include the left boundary, and exclude the right boundary. Namely,
  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
  `[1., 2.)`, and `[2., +inf)`.

  For example, if the inputs are

  ```python
  boundaries = [0, 10, 100]
  input tensor = [[-5, 10000]
                  [150, 10]
                  [5, 100]]
  ```

  then the output will be

  ```python
  output = [[0, 3]
            [3, 2]
            [1, 3]]
  ```

  Example:

  ```python
  price = numeric_column('price')
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  columns = [bucketized_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  `bucketized_column` can also be crossed with another categorical column
  using `crossed_column`:

  ```python
  price = numeric_column('price')
  # bucketized_column converts numerical feature to a categorical one.
  bucketized_price = bucketized_column(price, boundaries=[...])
  # 'keywords' is a string feature.
  price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50K)
  columns = [price_x_keywords, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Args:
    source_column: A one-dimensional dense column which is generated with
      `numeric_column`.
    boundaries: A sorted list or tuple of floats specifying the boundaries.

  Returns:
    A `_BucketizedColumn`.

  Raises:
    ValueError: If `source_column` is not a numeric column, or if it is not
      one-dimensional.
    ValueError: If `boundaries` is not a sorted list or tuple.
  """
  if not isinstance(source_column, _NumericColumn):
    raise ValueError(
        'source_column must be a column generated with numeric_column(). '
        'Given: {}'.format(source_column))
  if len(source_column.shape) > 1:
    raise ValueError(
        'source_column must be one-dimensional column. '
        'Given: {}'.format(source_column))
  if (not boundaries or
      not (isinstance(boundaries, list) or isinstance(boundaries, tuple))):
    raise ValueError('boundaries must be a sorted list.')
  for i in range(len(boundaries) - 1):
    if boundaries[i] >= boundaries[i + 1]:
      raise ValueError('boundaries must be a sorted list.')
  return _BucketizedColumn(source_column, tuple(boundaries))


def _categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
                                         dtype=dtypes.string):
  """Represents sparse feature where ids are set by hashing.

  Use this when your sparse features are in string or integer format, and
  you want to distribute your inputs into a finite number of buckets by
  hashing. output_id = Hash(input_feature_string) % bucket_size for string
  type input. For int type input, the value is converted to its string
  representation first and then hashed by the same formula.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example:

  ```python
  keywords = categorical_column_with_hash_bucket("keywords", 10K)
  columns = [keywords, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  keywords_embedded = embedding_column(keywords, 16)
  columns = [keywords_embedded, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs,
      feature `Tensor` objects, and feature columns.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_HashedCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  if hash_bucket_size is None:
    raise ValueError('hash_bucket_size must be set. ' 'key: {}'.format(key))

  if hash_bucket_size < 1:
    raise ValueError('hash_bucket_size must be at least 1. '
                     'hash_bucket_size: {}, key: {}'.format(
                         hash_bucket_size, key))

  fc_utils.assert_key_is_string(key)
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))

  return _HashedCategoricalColumn(key, hash_bucket_size, dtype)


def _categorical_column_with_vocabulary_file(key,
                                             vocabulary_file,
                                             vocabulary_size=None,
                                             num_oov_buckets=0,
                                             default_value=None,
                                             dtype=dtypes.string):
  """A `_CategoricalColumn` with a vocabulary file.

  Use this when your inputs are in string or integer format, and you have a
  vocabulary file that maps each value to an integer ID.
  By default, out-of-vocabulary values are ignored. Use either (but not both)
  of `num_oov_buckets` and `default_value` to specify how to include
  out-of-vocabulary values.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example with `num_oov_buckets`:
  File '/us/states.txt' contains 50 lines, each with a 2-character U.S.
  state abbreviation. All inputs with values in that file are assigned an ID
  0-49, corresponding to their line numbers. All other values are hashed and
  assigned an ID 50-54.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
      num_oov_buckets=5)
  columns = [states, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Example with `default_value`:
  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
  other 50 each have a 2-character U.S. state abbreviation. Both a literal
  'XX' in input, and other values missing from the file, will be assigned ID
  0. All others are assigned the corresponding line number 1-50.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
      default_value=0)
  columns = [states, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  And to make an embedding with either:

  ```python
  columns = [embedding_column(states, 3), ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```
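
  If `vocabulary_size` is omitted, it is inferred by counting the lines in
  `vocabulary_file` (see the implementation below). A minimal sketch:

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt')
  ```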

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs,
      feature `Tensor` objects, and feature columns.
    vocabulary_file: The vocabulary file name.
    vocabulary_size: Number of elements in the vocabulary. This must be no
      greater than the length of `vocabulary_file`; if it is less, later
      values are ignored. If None, it is set to the length of
      `vocabulary_file`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the
      range `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a
      hash of the input value. A positive `num_oov_buckets` can not be
      specified with `default_value`.
    default_value: The integer ID value to return for out-of-vocabulary
      feature values, defaults to `-1`. This can not be specified with a
      positive `num_oov_buckets`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_CategoricalColumn` with a vocabulary file.

  Raises:
    ValueError: `vocabulary_file` is missing or cannot be opened.
    ValueError: `vocabulary_size` is missing or < 1.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: `dtype` is neither string nor integer.
  """
  if not vocabulary_file:
    raise ValueError('Missing vocabulary_file in {}.'.format(key))

  if vocabulary_size is None:
    if not gfile.Exists(vocabulary_file):
      raise ValueError('vocabulary_file in {} does not exist.'.format(key))

    with gfile.GFile(vocabulary_file) as f:
      vocabulary_size = sum(1 for _ in f)
    logging.info(
        'vocabulary_size = %d in %s is inferred from the number of elements '
        'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)

  # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
  if vocabulary_size < 1:
    raise ValueError('Invalid vocabulary_size in {}.'.format(key))
  if num_oov_buckets:
    if default_value is not None:
      raise ValueError(
          'Can\'t specify both num_oov_buckets and default_value in {}.'
          .format(key))
    if num_oov_buckets < 0:
      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
          num_oov_buckets, key))
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
  fc_utils.assert_key_is_string(key)
  return _VocabularyFileCategoricalColumn(
      key=key,
      vocabulary_file=vocabulary_file,
      vocabulary_size=vocabulary_size,
      num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
      default_value=-1 if default_value is None else default_value,
      dtype=dtype)


def _categorical_column_with_vocabulary_list(key,
                                             vocabulary_list,
                                             dtype=None,
                                             default_value=-1,
                                             num_oov_buckets=0):
  """A `_CategoricalColumn` with in-memory vocabulary.

  Use this when your inputs are in string or integer format, and you have an
  in-memory vocabulary mapping each value to an integer ID. By default,
  out-of-vocabulary values are ignored. Use either (but not both) of
  `num_oov_buckets` and `default_value` to specify how to include
  out-of-vocabulary values.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  Example with `num_oov_buckets`:
  In the following example, each input in `vocabulary_list` is assigned an
  ID 0-3 corresponding to its index (e.g., input 'B' produces output 2). All
  other inputs are hashed and assigned an ID 4-5.

  ```python
  colors = categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
      num_oov_buckets=2)
  columns = [colors, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Example with `default_value`:
  In the following example, each input in `vocabulary_list` is assigned an
  ID 0-4 corresponding to its index (e.g., input 'B' produces output 3). All
  other inputs are assigned `default_value` 0.

  ```python
  colors = categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'),
      default_value=0)
  columns = [colors, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  And to make an embedding with either:

  ```python
  columns = [embedding_column(colors, 3), ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs,
      feature `Tensor` objects, and feature columns.
    vocabulary_list: An ordered iterable defining the vocabulary. Each
      feature is mapped to the index of its value (if present) in
      `vocabulary_list`. Must be castable to `dtype`.
    dtype: The type of features. Only string and integer types are
      supported. If `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary
      feature values, defaults to `-1`. This can not be specified with a
      positive `num_oov_buckets`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the
      range `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)`
      based on a hash of the input value. A positive `num_oov_buckets` can
      not be specified with `default_value`.

  Returns:
    A `_CategoricalColumn` with in-memory vocabulary.

  Raises:
    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: if `dtype` is not integer or string.
  """
  if (vocabulary_list is None) or (len(vocabulary_list) < 1):
    raise ValueError(
        'vocabulary_list {} must be non-empty, column_name: {}'.format(
            vocabulary_list, key))
  if len(set(vocabulary_list)) != len(vocabulary_list):
    raise ValueError(
        'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
            vocabulary_list, key))
  vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
  if num_oov_buckets:
    if default_value != -1:
      raise ValueError(
          'Can\'t specify both num_oov_buckets and default_value in {}.'
          .format(key))
    if num_oov_buckets < 0:
      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
          num_oov_buckets, key))
  fc_utils.assert_string_or_int(
      vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
  if dtype is None:
    dtype = vocabulary_dtype
  elif dtype.is_integer != vocabulary_dtype.is_integer:
    raise ValueError(
        'dtype {} and vocabulary dtype {} do not match, column_name: '
        '{}'.format(dtype, vocabulary_dtype, key))
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
  fc_utils.assert_key_is_string(key)

  return _VocabularyListCategoricalColumn(
      key=key, vocabulary_list=tuple(vocabulary_list), dtype=dtype,
      default_value=default_value, num_oov_buckets=num_oov_buckets)


def _categorical_column_with_identity(key, num_buckets, default_value=None):
  """A `_CategoricalColumn` that returns identity values.

  Use this when your inputs are integers in the range `[0, num_buckets)`,
  and you want to use the input value itself as the categorical ID. Values
  outside this range will result in `default_value` if specified, otherwise
  it will fail.

  Typically, this is used for contiguous ranges of integer indexes, but it
  doesn't have to be. This might be inefficient, however, if many IDs are
  unused.
  Consider `categorical_column_with_hash_bucket` in that case.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for
  int and `''` for string, which will be dropped by this feature column.

  In the following examples, each input in the range `[0, 1000000)` is
  assigned the same value. All other inputs are assigned `default_value` 0.
  Note that a literal 0 in inputs will result in the same default ID.

  Linear model:

  ```python
  video_id = categorical_column_with_identity(
      key='video_id', num_buckets=1000000, default_value=0)
  columns = [video_id, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Embedding for a DNN model:

  ```python
  columns = [embedding_column(video_id, 9), ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs,
      feature `Tensor` objects, and feature columns.
    num_buckets: Range of inputs and outputs is `[0, num_buckets)`.
    default_value: If `None`, this column's graph operations will fail for
      out-of-range inputs. Otherwise, this value must be in the range
      `[0, num_buckets)`, and will replace out-of-range inputs.

  Returns:
    A `_CategoricalColumn` that returns identity values.

  Raises:
    ValueError: if `num_buckets` is less than one.
    ValueError: if `default_value` is not in range `[0, num_buckets)`.
  """
  if num_buckets < 1:
    raise ValueError(
        'num_buckets {} < 1, column_name {}'.format(num_buckets, key))
  if (default_value is not None) and (
      (default_value < 0) or (default_value >= num_buckets)):
    raise ValueError(
        'default_value {} not in range [0, {}), column_name {}'.format(
            default_value, num_buckets, key))
  fc_utils.assert_key_is_string(key)
  return _IdentityCategoricalColumn(
      key=key, num_buckets=num_buckets, default_value=default_value)


def _indicator_column(categorical_column):
  """Represents multi-hot representation of given categorical column.

  - For DNN model, `indicator_column` can be used to wrap any
    `categorical_column_*` (e.g., to feed to DNN). Consider using
    `embedding_column` if the number of buckets/unique values is large.

  - For Wide (aka linear) model, `indicator_column` is the internal
    representation for categorical column when passing categorical column
    directly (as any element in feature_columns) to `linear_model`. See
    `linear_model` for details.

  ```python
  name = indicator_column(categorical_column_with_vocabulary_list(
      'name', ['bob', 'george', 'wanda']))
  columns = [name, ...]

  Args:
    categorical_column: A `_CategoricalColumn` which is created by
      `categorical_column_with_*` or `crossed_column` functions.

  Returns:
    An `_IndicatorColumn`.
  """
  return _IndicatorColumn(categorical_column)


def _weighted_categorical_column(categorical_column,
                                 weight_feature_key,
                                 dtype=dtypes.float32):
  """Applies weight values to a `_CategoricalColumn`.

  Use this when each of your sparse inputs has both an ID and a value. For
  example, if you're representing text documents as a collection of word
  frequencies, you can provide 2 parallel sparse input features ('terms' and
  'frequencies' below).

  Example:

  Input `tf.Example` objects:

  ```proto
  [
    features {
      feature {
        key: "terms"
        value {bytes_list {value: "very" value: "model"}}
      }
      feature {
        key: "frequencies"
        value {float_list {value: 0.3 value: 0.1}}
      }
    },
    features {
      feature {
        key: "terms"
        value {bytes_list {value: "when" value: "course" value: "human"}}
      }
      feature {
        key: "frequencies"
        value {float_list {value: 0.4 value: 0.1 value: 0.2}}
      }
    }
  ]
  ```

  ```python
  categorical_column = categorical_column_with_hash_bucket(
      column_name='terms', hash_bucket_size=1000)
  weighted_column = weighted_categorical_column(
      categorical_column=categorical_column, weight_feature_key='frequencies')
  columns = [weighted_column, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction, _, _ = linear_model(features, columns)
  ```

  This assumes the input dictionary contains a `SparseTensor` for key
  'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must
  have the same indices and dense shape.
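
  For instance, the two `tf.Example`s above would parse to two parallel
  `SparseTensor`s with identical indices (a sketch of the parsed values):

  ```python
  features['terms'].values        == ['very', 'model', 'when', 'course', 'human']
  features['frequencies'].values  == [0.3, 0.1, 0.4, 0.1, 0.2]
  ```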

  Args:
    categorical_column: A `_CategoricalColumn` created by
      `categorical_column_with_*` functions.
    weight_feature_key: String key for weight values.
    dtype: Type of weights, such as `tf.float32`. Only float and integer
      weights are supported.

  Returns:
    A `_CategoricalColumn` composed of two sparse features: one represents id,
    the other represents weight (value) of the id feature in that example.

  Raises:
    ValueError: if `dtype` is not convertible to float.
  """
  if (dtype is None) or not (dtype.is_integer or dtype.is_floating):
    raise ValueError('dtype {} is not convertible to float.'.format(dtype))
  return _WeightedCategoricalColumn(
      categorical_column=categorical_column,
      weight_feature_key=weight_feature_key,
      dtype=dtype)


def _crossed_column(keys, hash_bucket_size, hash_key=None):
  """Returns a column for performing crosses of categorical features.

  Crossed features will be hashed according to `hash_bucket_size`.
  Conceptually, the transformation can be thought of as:
    Hash(cartesian product of features) % `hash_bucket_size`

  For example, if the input features are:

  * SparseTensor referred by first key:

  ```python
  shape = [2, 2]
  {
      [0, 0]: "a"
      [1, 0]: "b"
      [1, 1]: "c"
  }
  ```

  * SparseTensor referred by second key:

  ```python
  shape = [2, 1]
  {
      [0, 0]: "d"
      [1, 0]: "e"
  }
  ```

  then the crossed feature will look like:

  ```python
  shape = [2, 2]
  {
      [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
      [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
      [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
  }
  ```

  Here is an example to create a linear model with crosses of string features:

  ```python
  keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50000)
  columns = [keywords_x_doc_terms, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  You could also use vocabulary lookup before crossing:

  ```python
  keywords = categorical_column_with_vocabulary_file(
      'keywords', '/path/to/vocabulary/file', vocabulary_size=1000)
  keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50000)
  columns = [keywords_x_doc_terms, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  If an input feature is of numeric type, you can use
  `categorical_column_with_identity`, or `bucketized_column`, as in the
  example:

  ```python
  # vertical_id is an integer categorical feature.
  vertical_id = categorical_column_with_identity('vertical_id', 10000)
  price = numeric_column('price')
  # bucketized_column converts a numerical feature to a categorical one.
  bucketized_price = bucketized_column(price, boundaries=[...])
  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
  columns = [vertical_id_x_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  To use a crossed column in a DNN model, you need to wrap it in an embedding
  column, as in this example:

  ```python
  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
  vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10)
  dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...])
  ```

  Args:
    keys: An iterable identifying the features to be crossed. Each element can
      be either:
      * string: Will use the corresponding feature, which must be of string
        type.
      * `_CategoricalColumn`: Will use the transformed tensor produced by this
        column. Does not support hashed categorical columns.
    hash_bucket_size: An int >= 1. The number of buckets.
    hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
      function to combine the fingerprints of the crossed features in
      `SparseCrossOp` (optional).

  Returns:
    A `_CrossedColumn`.

  Raises:
    ValueError: If `len(keys) < 2`.
    ValueError: If any of the keys is neither a string nor a
      `_CategoricalColumn`.
    ValueError: If any of the keys is a `_HashedCategoricalColumn`.
    ValueError: If `hash_bucket_size < 1`.
  """
  if not hash_bucket_size or hash_bucket_size < 1:
    raise ValueError('hash_bucket_size must be at least 1. '
                     'hash_bucket_size: {}'.format(hash_bucket_size))
  if not keys or len(keys) < 2:
    raise ValueError(
        'keys must be a list with length > 1. Given: {}'.format(keys))
  for key in keys:
    if (not isinstance(key, six.string_types) and
        not isinstance(key, _CategoricalColumn)):
      raise ValueError(
          'Unsupported key type. All keys must be either a string or a '
          'categorical column, excluding _HashedCategoricalColumn. '
          'Given: {}'.format(key))
    if isinstance(key, _HashedCategoricalColumn):
      raise ValueError(
          'categorical_column_with_hash_bucket is not supported for crossing. '
          'Hashing before crossing will increase the probability of '
          'collision. Instead, use the feature name as a string. '
          'Given: {}'.format(key))
  return _CrossedColumn(
      keys=tuple(keys), hash_bucket_size=hash_bucket_size,
      hash_key=hash_key)


# TODO(rohanj): Clearly define semantics of this layer.
class _EmbeddingColumnLayer(base.Layer):
  """A layer that stores all the state required for an embedding column.
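
  A minimal usage sketch (the shape values and the initializer here are
  illustrative assumptions, not requirements):

  ```python
  layer = _EmbeddingColumnLayer(
      embedding_shape=(vocab_size, dimension),
      initializer=init_ops.truncated_normal_initializer())
  embedding_weights = layer(None)  # Builds, then returns the variable.
  ```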
  """

  def __init__(self,
               embedding_shape,
               initializer,
               weight_collections=None,
               trainable=True,
               name=None,
               **kwargs):
    """Constructor.

    Args:
      embedding_shape: Shape of the embedding variable used for lookup.
      initializer: A variable initializer function to be used in embedding
        variable initialization.
      weight_collections: A list of collection names to which the Variable
        will be added. Note that variables will also be added to the
        collections `tf.GraphKeys.GLOBAL_VARIABLES` and
        `ops.GraphKeys.MODEL_VARIABLES`.
      trainable: If `True`, also add the variable to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
      name: Name of the layer.
      **kwargs: keyword named properties.
    """
    super(_EmbeddingColumnLayer, self).__init__(
        trainable=trainable, name=name, **kwargs)
    self._embedding_shape = embedding_shape
    self._initializer = initializer
    self._weight_collections = weight_collections

  def set_weight_collections(self, weight_collections):
    """Sets the weight collections for the layer.

    Args:
      weight_collections: A list of collection names to which the Variable
        will be added.
    """
    self._weight_collections = weight_collections

  def build(self, _):
    self._embedding_weight_var = self.add_variable(
        name='embedding_weights',
        shape=self._embedding_shape,
        dtype=dtypes.float32,
        initializer=self._initializer,
        trainable=self.trainable)
    if self._weight_collections and not context.executing_eagerly():
      _add_to_collections(self._embedding_weight_var, self._weight_collections)
    self.built = True

  def call(self, _):
    return self._embedding_weight_var


@six.add_metaclass(abc.ABCMeta)
class _FeatureColumn(object):
  """Represents a feature column abstraction.

  WARNING: Do not subclass this class unless you know what you are doing:
  the API is subject to future changes.

  To distinguish between the concept of a feature family and a specific binary
  feature within a family, we refer to a feature family like "country" as a
  feature column. The following is an example feature in a `tf.Example`
  format:
    {key: "country",  value: [ "US" ]}
  In this example the value of the feature is "US" and "country" refers to
  the column of the feature.

  This class is an abstract class. Users should not create instances of it.
  """

  @abc.abstractproperty
  def name(self):
    """Returns string. Used for naming and for name_scope."""
    pass

  @property
  def _var_scope_name(self):
    """Returns string. Used for variable_scope. Defaults to self.name."""
    return self.name

  @abc.abstractmethod
  def _transform_feature(self, inputs):
    """Returns intermediate representation (usually a `Tensor`).

    Uses `inputs` to create an intermediate representation (usually a
    `Tensor`) that other feature columns can use.

    Example usage of `inputs`:
    Let's say a feature column depends on a raw feature ('raw') and another
    `_FeatureColumn` (input_fc). To access the corresponding `Tensor`s,
    `inputs` will be used as follows:

    ```python
    raw_tensor = inputs.get('raw')
    fc_tensor = inputs.get(input_fc)
    ```

    Args:
      inputs: A `_LazyBuilder` object to access inputs.

    Returns:
      Transformed feature `Tensor`.
    """
    pass

  @abc.abstractproperty
  def _parse_example_spec(self):
    """Returns a `tf.Example` parsing spec as dict.

    It is used for getting the parsing spec for `tf.parse_example`. The
    returned spec is a dict from keys ('string') to `VarLenFeature`,
    `FixedLenFeature`, and other supported objects. Please check the
    documentation of `tf.parse_example` for all supported spec objects.

    Let's say a feature column depends on a raw feature ('raw') and another
    `_FeatureColumn` (input_fc). One possible implementation of
    _parse_example_spec is as follows:

    ```python
    spec = {'raw': tf.FixedLenFeature(...)}
    spec.update(input_fc._parse_example_spec)
    return spec
    ```
    """
    pass

  def _reset_config(self):
    """Resets the configuration in the column.

    Some feature columns, e.g. embedding or shared embedding columns, might
    have some state that needs to be reset sometimes. Use this method in that
    scenario.
    """


class _DenseColumn(_FeatureColumn):
  """Represents a column which can be represented as `Tensor`.

  WARNING: Do not subclass this class unless you know what you are doing:
  the API is subject to future changes.

  Some examples of this type are: numeric_column, embedding_column,
  indicator_column.
  """

  @abc.abstractproperty
  def _variable_shape(self):
    """`TensorShape` of `_get_dense_tensor`, without batch dimension."""
    pass

  @abc.abstractmethod
  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Returns a `Tensor`.

    The output of this function will be used by model-builder functions. For
    example, the pseudo code of `input_layer` will be like:

    ```python
    def input_layer(features, feature_columns, ...):
      outputs = [fc._get_dense_tensor(...) for fc in feature_columns]
      return tf.concat(outputs)
    ```

    Args:
      inputs: A `_LazyBuilder` object to access inputs.
      weight_collections: List of graph collections to which Variables (if
        any are created) are added.
      trainable: If `True`, also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).

    Returns:
      `Tensor` of shape [batch_size] + `_variable_shape`.
    """
    pass


def _create_weighted_sum(column,
                         builder,
                         units,
                         sparse_combiner,
                         weight_collections,
                         trainable,
                         weight_var=None):
  """Creates a weighted sum for a dense/categorical column for linear_model."""
  if isinstance(column, _CategoricalColumn):
    return _create_categorical_column_weighted_sum(
        column=column,
        builder=builder,
        units=units,
        sparse_combiner=sparse_combiner,
        weight_collections=weight_collections,
        trainable=trainable,
        weight_var=weight_var)
  else:
    return _create_dense_column_weighted_sum(
        column=column,
        builder=builder,
        units=units,
        weight_collections=weight_collections,
        trainable=trainable,
        weight_var=weight_var)


def _create_dense_column_weighted_sum(column,
                                      builder,
                                      units,
                                      weight_collections,
                                      trainable,
                                      weight_var=None):
  """Creates a weighted sum of a dense column for linear_model."""
  tensor = column._get_dense_tensor(  # pylint: disable=protected-access
      builder,
      weight_collections=weight_collections,
      trainable=trainable)
  num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
  batch_size = array_ops.shape(tensor)[0]
  tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
  if weight_var is not None:
    weight = weight_var
  else:
    weight = variable_scope.get_variable(
        name='weights',
        shape=[num_elements, units],
        initializer=init_ops.zeros_initializer(),
        trainable=trainable,
        collections=weight_collections)
  return math_ops.matmul(tensor, weight, name='weighted_sum')


class _CategoricalColumn(_FeatureColumn):
  """Represents a categorical feature.

  WARNING: Do not subclass this class unless you know what you are doing:
  the API is subject to future changes.

  A categorical feature is typically handled with a `tf.SparseTensor` of IDs.
  """

  IdWeightPair = collections.namedtuple(  # pylint: disable=invalid-name
      'IdWeightPair', ['id_tensor', 'weight_tensor'])

  @abc.abstractproperty
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    pass

  @abc.abstractmethod
  def _get_sparse_tensors(self,
                          inputs,
                          weight_collections=None,
                          trainable=None):
    """Returns an IdWeightPair.

    `IdWeightPair` is a pair of `SparseTensor`s which represents ids and
    weights.

    `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets`
    `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a
    `SparseTensor` of `float` or `None` to indicate all weights should be
    taken to be 1. If specified, `weight_tensor` must have exactly the same
    shape and indices as `id_tensor`. The expected `SparseTensor` is the same
    as the parsing output of a `VarLenFeature`, which is a ragged matrix.
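
    For example (a sketch, assuming a vocabulary column over `['a', 'b', 'c']`
    and a batch of two variable-length inputs `["a"]` and `["b", "c"]`):

    ```python
    IdWeightPair(
        id_tensor=SparseTensor(
            indices=[[0, 0], [1, 0], [1, 1]],
            values=[0, 1, 2],
            dense_shape=[2, 2]),
        weight_tensor=None)
    ```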

    Args:
      inputs: A `LazyBuilder` as a cache to get input tensors required to
        create `IdWeightPair`.
      weight_collections: List of graph collections to which variables (if
        any are created) are added.
      trainable: If `True`, also add variables to the graph collection
        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.get_variable`).
    """
    pass


def _create_categorical_column_weighted_sum(column,
                                            builder,
                                            units,
                                            sparse_combiner,
                                            weight_collections,
                                            trainable,
                                            weight_var=None):
  # pylint: disable=g-doc-return-or-yield,g-doc-args
  """Creates a weighted sum of a categorical column for linear_model.

  Note to maintainer: As an implementation detail, the weighted sum is
  implemented via embedding_lookup_sparse for efficiency. Mathematically,
  they are the same.

  To be specific, conceptually a categorical column can be treated as a
  multi-hot vector. Say:

  ```python
  x = [0 0 1]  # categorical column input
  w = [a b c]  # weights
  ```
  The weighted sum is `c` in this case, which is the same as `w[2]`.

  Another example is

  ```python
  x = [0 1 1]  # categorical column input
  w = [a b c]  # weights
  ```
  The weighted sum is `b + c` in this case, which is the same as
  `w[1] + w[2]`.

  For both cases, we can implement the weighted sum via embedding_lookup with
  sparse_combiner = "sum".
  """

  sparse_tensors = column._get_sparse_tensors(  # pylint: disable=protected-access
      builder,
      weight_collections=weight_collections,
      trainable=trainable)
  id_tensor = sparse_ops.sparse_reshape(sparse_tensors.id_tensor, [
      array_ops.shape(sparse_tensors.id_tensor)[0], -1
  ])
  weight_tensor = sparse_tensors.weight_tensor
  if weight_tensor is not None:
    weight_tensor = sparse_ops.sparse_reshape(
        weight_tensor, [array_ops.shape(weight_tensor)[0], -1])

  if weight_var is not None:
    weight = weight_var
  else:
    weight = variable_scope.get_variable(
        name='weights',
        shape=(column._num_buckets, units),  # pylint: disable=protected-access
        initializer=init_ops.zeros_initializer(),
        trainable=trainable,
        collections=weight_collections)
  return embedding_ops.safe_embedding_lookup_sparse(
      weight,
      id_tensor,
      sparse_weights=weight_tensor,
      combiner=sparse_combiner,
      name='weighted_sum')


class _SequenceDenseColumn(_FeatureColumn):
  """Represents dense sequence data."""

  TensorSequenceLengthPair = collections.namedtuple(  # pylint: disable=invalid-name
      'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])

  @abc.abstractmethod
  def _get_sequence_dense_tensor(
      self, inputs, weight_collections=None, trainable=None):
    """Returns a `TensorSequenceLengthPair`."""
    pass


class _LazyBuilder(object):
  """Handles caching of transformations while building the model.

  `_FeatureColumn` specifies how to digest an input column to the network.
  Some feature columns require data transformations. This class caches those
  transformations.

  Some features may be used in more than one place. For example, one can use
  a bucketized feature by itself and in a cross with it. In that case we
  should create only one bucketization op instead of creating ops for each
  feature column separately. To handle re-use of transformed columns,
  `_LazyBuilder` caches all previously transformed columns.

  Example:
  We're trying to use the following `_FeatureColumn`s:

  ```python
  bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
  keywords = fc.categorical_column_with_hash_bucket("keywords", ...)
  age_X_keywords = fc.crossed_column([bucketized_age, "keywords"], ...)
  ... = linear_model(features,
                     [bucketized_age, keywords, age_X_keywords])
  ```

  If we transform each column independently, then we'll get duplication of
  bucketization (one for the cross, one for bucketization itself). The
  `_LazyBuilder` eliminates this duplication.
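
  A sketch of the caching contract (names illustrative):

  ```python
  builder = _LazyBuilder(features)
  t1 = builder.get(bucketized_age)  # Transforms the raw feature and caches it.
  t2 = builder.get(bucketized_age)  # Returns the cached Tensor; t2 is t1.
  ```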
  """

  def __init__(self, features):
    """Creates a `_LazyBuilder`.

    Args:
      features: A mapping from feature column to objects that are `Tensor` or
        `SparseTensor`, or can be converted to same via
        `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
        signifies a base feature (not-transformed). A `_FeatureColumn` key
        means that this `Tensor` is the output of an existing `_FeatureColumn`
        which can be reused.
    """
    self._features = features.copy()
    self._feature_tensors = {}

  def get(self, key):
    """Returns a `Tensor` for the given key.

    A `str` key is used to access a base feature (not-transformed). When a
    `_FeatureColumn` is passed, the transformed feature is returned if it
    already exists, otherwise the given `_FeatureColumn` is asked to provide
    its transformed output, which is then cached.

    Args:
      key: a `str` or a `_FeatureColumn`.

    Returns:
      The transformed `Tensor` corresponding to the `key`.

    Raises:
      ValueError: if key is not found or a transformed `Tensor` cannot be
        computed.
    """
    if key in self._feature_tensors:
      # FeatureColumn is already transformed or converted.
      return self._feature_tensors[key]

    if key in self._features:
      feature_tensor = self._get_raw_feature_as_tensor(key)
      self._feature_tensors[key] = feature_tensor
      return feature_tensor

    if isinstance(key, six.string_types):
      raise ValueError('Feature {} is not in features dictionary.'.format(key))

    if not isinstance(key, _FeatureColumn):
      raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
                      'Provided: {}'.format(key))

    column = key
    logging.debug('Transforming feature_column %s.', column)
    transformed = column._transform_feature(self)  # pylint: disable=protected-access
    if transformed is None:
      raise ValueError('Column {} is not supported.'.format(column.name))
    self._feature_tensors[column] = transformed
    return transformed

  def _get_raw_feature_as_tensor(self, key):
    """Gets the raw feature (keyed by `key`) as a tensor.

    The raw feature is converted to a (sparse) tensor and, if necessary, its
    rank is expanded.

    For both `Tensor` and `SparseTensor`, the rank will be expanded (to 2) if
    the rank is 1. This supports dynamic rank as well. A rank 0 raw feature
    will raise an error, as it is not supported.

    Args:
      key: A `str` key to access the raw feature.

    Returns:
      A `Tensor` or `SparseTensor`.

    Raises:
      ValueError: if the raw feature has rank 0.
    """
    raw_feature = self._features[key]
    feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
        raw_feature)

    def expand_dims(input_tensor):
      # Input_tensor must have rank 1.
      if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
        return sparse_ops.sparse_reshape(
            input_tensor, [array_ops.shape(input_tensor)[0], 1])
      else:
        return array_ops.expand_dims(input_tensor, -1)

    rank = feature_tensor.get_shape().ndims
    if rank is not None:
      if rank == 0:
        raise ValueError(
            'Feature (key: {}) cannot have rank 0. Given: {}'.format(
                key, feature_tensor))
      return feature_tensor if rank != 1 else expand_dims(feature_tensor)

    # Handle dynamic rank.
    with ops.control_dependencies([
        check_ops.assert_positive(
            array_ops.rank(feature_tensor),
            message='Feature (key: {}) cannot have rank 0. Given: {}'.format(
                key, feature_tensor))]):
      return control_flow_ops.cond(
          math_ops.equal(1, array_ops.rank(feature_tensor)),
          lambda: expand_dims(feature_tensor),
          lambda: feature_tensor)


# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
def _shape_offsets(shape):
  """Returns moving offsets for each dimension given a shape.
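
  For example, given `shape=[3, 4, 5]` the offsets are `[60, 20, 5]`: each
  offset is the product of all dimensions from that axis onward.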
  """
  offsets = []
  for dim in reversed(shape):
    if offsets:
      offsets.append(dim * offsets[-1])
    else:
      offsets.append(dim)
  offsets.reverse()
  return offsets


# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None):
  """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells.

  If `input_tensor` is already a `SparseTensor`, it is just returned.

  Args:
    input_tensor: A string or integer `Tensor`.
    ignore_value: Entries in `input_tensor` equal to this value will be
      absent from the resulting `SparseTensor`. If `None`, the default value
      of `input_tensor`'s dtype will be used ('' for `str`, -1 for `int`).

  Returns:
    A `SparseTensor` with the same shape as `input_tensor`.

  Raises:
    ValueError: when `input_tensor`'s rank is `None`.
  """
  input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
      input_tensor)
  if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
    return input_tensor
  with ops.name_scope(None, 'to_sparse_input', (input_tensor, ignore_value,)):
    if ignore_value is None:
      if input_tensor.dtype == dtypes.string:
        # Exception because TF strings are converted to numpy objects by
        # default.
        ignore_value = ''
      elif input_tensor.dtype.is_integer:
        ignore_value = -1  # -1 has a special meaning of missing feature
      else:
        # NOTE: `as_numpy_dtype` is a property, so calling the returned type
        # with parentheses constructs a new numpy scalar of that type, which
        # yields the default value for the type.
        ignore_value = input_tensor.dtype.as_numpy_dtype()
    ignore_value = math_ops.cast(
        ignore_value, input_tensor.dtype, name='ignore_value')
    indices = array_ops.where(
        math_ops.not_equal(input_tensor, ignore_value), name='indices')
    return sparse_tensor_lib.SparseTensor(
        indices=indices,
        values=array_ops.gather_nd(input_tensor, indices, name='values'),
        dense_shape=array_ops.shape(
            input_tensor, out_type=dtypes.int64, name='dense_shape'))


def _normalize_feature_columns(feature_columns):
  """Normalizes the `feature_columns` input.

  This method converts the `feature_columns` to a list as best as it can. In
  addition, it verifies the type and other parts of feature_columns required
  by the downstream library.

  Args:
    feature_columns: The raw feature columns, usually passed by users.

  Returns:
    The normalized feature column list.

  Raises:
    ValueError: for any invalid inputs, such as empty, duplicated names, etc.
  """
  if isinstance(feature_columns, _FeatureColumn):
    feature_columns = [feature_columns]

  if isinstance(feature_columns, collections.Iterator):
    feature_columns = list(feature_columns)

  if isinstance(feature_columns, dict):
    raise ValueError('Expected feature_columns to be iterable, found dict.')

  for column in feature_columns:
    if not isinstance(column, _FeatureColumn):
      raise ValueError('Items of feature_columns must be a _FeatureColumn. '
                       'Given (type {}): {}.'.format(type(column), column))
  if not feature_columns:
    raise ValueError('feature_columns must not be empty.')
  name_to_column = dict()
  for column in feature_columns:
    if column.name in name_to_column:
      raise ValueError('Duplicate feature column name found for columns: {} '
                       'and {}. This usually means that these columns refer '
                       'to the same base feature. Either one must be '
                       'discarded or a duplicated but renamed item must be '
                       'inserted in the features dict.'.format(
                           column, name_to_column[column.name]))
    name_to_column[column.name] = column

  return feature_columns


class _NumericColumn(_DenseColumn,
                     collections.namedtuple('_NumericColumn', [
                         'key', 'shape', 'default_value', 'dtype',
                         'normalizer_fn'
                     ])):
  """See `numeric_column`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {
        self.key:
            parsing_ops.FixedLenFeature(self.shape, self.dtype,
                                        self.default_value)
    }

  def _transform_feature(self, inputs):
    input_tensor = inputs.get(self.key)
    if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
      raise ValueError(
          'The corresponding Tensor of a numeric column must be a Tensor. '
          'SparseTensor is not supported. key: {}'.format(self.key))
    if self.normalizer_fn is not None:
      input_tensor = self.normalizer_fn(input_tensor)
    return math_ops.cast(input_tensor, dtypes.float32)

  @property
  def _variable_shape(self):
    return tensor_shape.TensorShape(self.shape)

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Returns dense `Tensor` representing numeric feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.
      weight_collections: Unused `weight_collections` since no variables are
        created in this function.
      trainable: Unused `trainable` bool since no variables are created in
        this function.

    Returns:
      Dense `Tensor` created within `_transform_feature`.
    """
    # Do nothing with weight_collections and trainable since no variables are
    # created in this function.
    del weight_collections
    del trainable
    # The feature has already been transformed. Return the intermediate
    # representation created by _transform_feature.
    return inputs.get(self)


class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
                        collections.namedtuple('_BucketizedColumn', [
                            'source_column', 'boundaries'])):
  """See `bucketized_column`."""

  @property
  def name(self):
    return '{}_bucketized'.format(self.source_column.name)

  @property
  def _parse_example_spec(self):
    return self.source_column._parse_example_spec  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    source_tensor = inputs.get(self.source_column)
    return math_ops._bucketize(  # pylint: disable=protected-access
        source_tensor,
        boundaries=self.boundaries)

  @property
  def _variable_shape(self):
    return tensor_shape.TensorShape(
        tuple(self.source_column.shape) + (len(self.boundaries) + 1,))

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    del weight_collections
    del trainable
    input_tensor = inputs.get(self)
    return array_ops.one_hot(
        indices=math_ops.cast(input_tensor, dtypes.int64),
        depth=len(self.boundaries) + 1,
        on_value=1.,
        off_value=0.)

  @property
  def _num_buckets(self):
    # By construction, source_column is always one-dimensional.
    return (len(self.boundaries) + 1) * self.source_column.shape[0]

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    """Converts dense inputs to SparseTensor so downstream code can use it."""
    input_tensor = inputs.get(self)
    batch_size = array_ops.shape(input_tensor)[0]
    # By construction, source_column is always one-dimensional.
    source_dimension = self.source_column.shape[0]

    i1 = array_ops.reshape(
        array_ops.tile(
            array_ops.expand_dims(math_ops.range(0, batch_size), 1),
            [1, source_dimension]),
        (-1,))
    i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size])
    # Flatten the bucket indices and unique them across dimensions, e.g.
    # 2nd dimension indices will range from k to 2*k-1 with k buckets.
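    # For example (illustrative numbers): with k = 2 buckets and
    # source_dimension = 2, a bucket index of 1 in the 2nd source dimension
    # (i2 = 1) is offset to 1 + 2 * 1 = 3.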
    bucket_indices = (
        array_ops.reshape(input_tensor, (-1,)) +
        (len(self.boundaries) + 1) * i2)

    indices = math_ops.cast(
        array_ops.transpose(array_ops.stack((i1, i2))), dtypes.int64)
    dense_shape = math_ops.cast(
        array_ops.stack([batch_size, source_dimension]), dtypes.int64)
    sparse_tensor = sparse_tensor_lib.SparseTensor(
        indices=indices,
        values=bucket_indices,
        dense_shape=dense_shape)
    return _CategoricalColumn.IdWeightPair(sparse_tensor, None)


class _EmbeddingColumn(
    _DenseColumn, _SequenceDenseColumn,
    collections.namedtuple(
        '_EmbeddingColumn',
        ('categorical_column', 'dimension', 'combiner', 'layer_creator',
         'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
  """See `embedding_column`."""

  @property
  def name(self):
    if not hasattr(self, '_name'):
      self._name = '{}_embedding'.format(self.categorical_column.name)
    return self._name

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    return inputs.get(self.categorical_column)

  @property
  def _variable_shape(self):
    if not hasattr(self, '_shape'):
      self._shape = tensor_shape.vector(self.dimension)
    return self._shape

  def _get_dense_tensor_internal(self,
                                 inputs,
                                 weight_collections=None,
                                 trainable=None):
    """Private method that follows the signature of _get_dense_tensor."""
    # Get sparse IDs and weights.
    sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
        inputs, weight_collections=weight_collections, trainable=trainable)
    sparse_ids = sparse_tensors.id_tensor
    sparse_weights = sparse_tensors.weight_tensor

    embedding_weights = self.layer_creator(
        weight_collections=weight_collections,
        scope=variable_scope.get_variable_scope())

    if self.ckpt_to_load_from is not None:
      to_restore = embedding_weights
      if isinstance(to_restore, variables.PartitionedVariable):
        to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
      checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
          self.tensor_name_in_ckpt: to_restore
      })

    # Return embedding lookup result.
    return embedding_ops.safe_embedding_lookup_sparse(
        embedding_weights=embedding_weights,
        sparse_ids=sparse_ids,
        sparse_weights=sparse_weights,
        combiner=self.combiner,
        name='%s_weights' % self.name,
        max_norm=self.max_norm)

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
      raise ValueError(
          'In embedding_column: {}. '
          'categorical_column must not be of type _SequenceCategoricalColumn. '
          'Suggested fix A: If you wish to use input_layer, use a '
          'non-sequence categorical_column_with_*. '
          'Suggested fix B: If you wish to create sequence input, use '
          'sequence_input_layer instead of input_layer. '
          'Given (type {}): {}'.format(
              self.name, type(self.categorical_column),
              self.categorical_column))
    return self._get_dense_tensor_internal(
        inputs=inputs,
        weight_collections=weight_collections,
        trainable=trainable)

  def _get_sequence_dense_tensor(
      self, inputs, weight_collections=None, trainable=None):
    if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
      raise ValueError(
          'In embedding_column: {}. '
          'categorical_column must be of type _SequenceCategoricalColumn '
          'to use sequence_input_layer. '
          'Suggested fix: Use one of sequence_categorical_column_with_*. '
          'Given (type {}): {}'.format(
              self.name, type(self.categorical_column),
              self.categorical_column))
    dense_tensor = self._get_dense_tensor_internal(  # pylint: disable=protected-access
        inputs=inputs,
        weight_collections=weight_collections,
        trainable=trainable)

    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
        sparse_tensors.id_tensor)
    return _SequenceDenseColumn.TensorSequenceLengthPair(
        dense_tensor=dense_tensor, sequence_length=sequence_length)


def _get_graph_for_variable(var):
  if isinstance(var, variables.PartitionedVariable):
    return list(var)[0].graph
  else:
    return var.graph


class _SharedEmbeddingColumn(
    _DenseColumn, _SequenceDenseColumn,
    collections.namedtuple(
        '_SharedEmbeddingColumn',
        ('categorical_column', 'dimension', 'combiner', 'initializer',
         'shared_embedding_collection_name', 'ckpt_to_load_from',
         'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
  """See `embedding_column`."""

  @property
  def name(self):
    if not hasattr(self, '_name'):
      self._name = '{}_shared_embedding'.format(self.categorical_column.name)
    return self._name

  @property
  def _var_scope_name(self):
    return self.shared_embedding_collection_name

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    return inputs.get(self.categorical_column)

  @property
  def _variable_shape(self):
    if not hasattr(self, '_shape'):
      self._shape = tensor_shape.vector(self.dimension)
    return self._shape

  def _get_dense_tensor_internal(self,
                                 inputs,
                                 weight_collections=None,
                                 trainable=None):
    """Private method that follows the signature of _get_dense_tensor."""
    # This method is called from a variable_scope with name _var_scope_name,
    # which is shared among all shared embeddings. Open a name_scope here, so
    # that the ops for different columns have distinct names.
    with ops.name_scope(None, default_name=self.name):
      # Get sparse IDs and weights.
      sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
          inputs, weight_collections=weight_collections, trainable=trainable)
      sparse_ids = sparse_tensors.id_tensor
      sparse_weights = sparse_tensors.weight_tensor

      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
      shared_embedding_collection = ops.get_collection(
          self.shared_embedding_collection_name)
      if shared_embedding_collection:
        if len(shared_embedding_collection) > 1:
          raise ValueError(
              'Collection {} can only contain one variable. '
              'Suggested fix A: Choose a unique name for this collection. '
              'Suggested fix B: Do not add any variables to this collection. '
              'The feature_column library already adds a variable under the '
              'hood.'.format(shared_embedding_collection))
        embedding_weights = shared_embedding_collection[0]
        if embedding_weights.get_shape() != embedding_shape:
          raise ValueError(
              'Shared embedding collection {} contains variable {} of '
              'unexpected shape {}. Expected shape is {}. '
              'Suggested fix A: Choose a unique name for this collection. '
              'Suggested fix B: Do not add any variables to this collection. '
              'The feature_column library already adds a variable under the '
              'hood.'.format(self.shared_embedding_collection_name,
                             embedding_weights.name,
                             embedding_weights.get_shape(), embedding_shape))
      else:
        embedding_weights = variable_scope.get_variable(
            name='embedding_weights',
            shape=embedding_shape,
            dtype=dtypes.float32,
            initializer=self.initializer,
            trainable=self.trainable and trainable,
            collections=weight_collections)
        ops.add_to_collection(self.shared_embedding_collection_name,
                              embedding_weights)
      if self.ckpt_to_load_from is not None:
        to_restore = embedding_weights
        if isinstance(to_restore, variables.PartitionedVariable):
          to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
        checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
            self.tensor_name_in_ckpt: to_restore
        })

      # Return embedding lookup result.
      return embedding_ops.safe_embedding_lookup_sparse(
          embedding_weights=embedding_weights,
          sparse_ids=sparse_ids,
          sparse_weights=sparse_weights,
          combiner=self.combiner,
          name='%s_weights' % self.name,
          max_norm=self.max_norm)

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
      raise ValueError(
          'In embedding_column: {}. '
          'categorical_column must not be of type _SequenceCategoricalColumn. '
          'Suggested fix A: If you wish to use input_layer, use a '
          'non-sequence categorical_column_with_*. '
          'Suggested fix B: If you wish to create sequence input, use '
          'sequence_input_layer instead of input_layer. '
          'Given (type {}): {}'.format(self.name,
                                       type(self.categorical_column),
                                       self.categorical_column))
    return self._get_dense_tensor_internal(
        inputs=inputs,
        weight_collections=weight_collections,
        trainable=trainable)

  def _get_sequence_dense_tensor(self,
                                 inputs,
                                 weight_collections=None,
                                 trainable=None):
    if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
      raise ValueError(
          'In embedding_column: {}. '
          'categorical_column must be of type _SequenceCategoricalColumn '
          'to use sequence_input_layer. '
          'Suggested fix: Use one of sequence_categorical_column_with_*. '
          'Given (type {}): {}'.format(self.name,
                                       type(self.categorical_column),
                                       self.categorical_column))
    dense_tensor = self._get_dense_tensor_internal(  # pylint: disable=protected-access
        inputs=inputs,
        weight_collections=weight_collections,
        trainable=trainable)
    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
        sparse_tensors.id_tensor)
    return _SequenceDenseColumn.TensorSequenceLengthPair(
        dense_tensor=dense_tensor, sequence_length=sequence_length)


def _check_shape(shape, key):
  """Returns shape if it's valid, raises an error otherwise."""
  assert shape is not None
  if not nest.is_sequence(shape):
    shape = [shape]
  shape = tuple(shape)
  for dimension in shape:
    if not isinstance(dimension, six.integer_types):
      raise TypeError('shape dimensions must be integer. '
                      'shape: {}, key: {}'.format(shape, key))
    if dimension < 1:
      raise ValueError('shape dimensions must be greater than 0. '
                       'shape: {}, key: {}'.format(shape, key))
  return shape


class _HashedCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_HashedCategoricalColumn',
                           ['key', 'hash_bucket_size', 'dtype'])):
  """See `categorical_column_with_hash_bucket`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
    if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
      raise ValueError('SparseColumn input must be a SparseTensor.')

    fc_utils.assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensors dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    if self.dtype == dtypes.string:
      sparse_values = input_tensor.values
    else:
      sparse_values = string_ops.as_string(input_tensor.values)

    sparse_id_values = string_ops.string_to_hash_bucket_fast(
        sparse_values, self.hash_bucket_size, name='lookup')
    return sparse_tensor_lib.SparseTensor(
        input_tensor.indices, sparse_id_values, input_tensor.dense_shape)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.hash_bucket_size

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)


class _VocabularyFileCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_VocabularyFileCategoricalColumn', (
        'key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets',
        'dtype', 'default_value'
    ))):
  """See `categorical_column_with_vocabulary_file`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensors dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    fc_utils.assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    key_dtype = self.dtype
    if input_tensor.dtype.is_integer:
      # `index_table_from_file` requires 64-bit integer keys.
      key_dtype = dtypes.int64
      input_tensor = math_ops.cast(input_tensor, dtypes.int64)

    return lookup_ops.index_table_from_file(
        vocabulary_file=self.vocabulary_file,
        num_oov_buckets=self.num_oov_buckets,
        vocab_size=self.vocabulary_size,
        default_value=self.default_value,
        key_dtype=key_dtype,
        name='{}_lookup'.format(self.key)).lookup(input_tensor)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.vocabulary_size + self.num_oov_buckets

  def _get_sparse_tensors(
      self, inputs, weight_collections=None, trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)


class _VocabularyListCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_VocabularyListCategoricalColumn', (
        'key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'
    ))):
  """See `categorical_column_with_vocabulary_list`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensors dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    fc_utils.assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    key_dtype = self.dtype
    if input_tensor.dtype.is_integer:
      # `index_table_from_tensor` requires 64-bit integer keys.
      key_dtype = dtypes.int64
      input_tensor = math_ops.cast(input_tensor, dtypes.int64)

    return lookup_ops.index_table_from_tensor(
        vocabulary_list=tuple(self.vocabulary_list),
        default_value=self.default_value,
        num_oov_buckets=self.num_oov_buckets,
        dtype=key_dtype,
        name='{}_lookup'.format(self.key)).lookup(input_tensor)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return len(self.vocabulary_list) + self.num_oov_buckets

  def _get_sparse_tensors(
      self, inputs, weight_collections=None, trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)


class _IdentityCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_IdentityCategoricalColumn', (
        'key', 'num_buckets', 'default_value'
    ))):
  """See `categorical_column_with_identity`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(dtypes.int64)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))

    if not input_tensor.dtype.is_integer:
      raise ValueError(
          'Invalid input, not integer. key: {} dtype: {}'.format(
              self.key, input_tensor.dtype))

    values = math_ops.cast(input_tensor.values, dtypes.int64, name='values')
    num_buckets = math_ops.cast(
        self.num_buckets, dtypes.int64, name='num_buckets')
    zero = math_ops.cast(0, dtypes.int64, name='zero')
    if self.default_value is None:
      # Fail if values are out-of-range.
      assert_less = check_ops.assert_less(
          values, num_buckets, data=(values, num_buckets),
          name='assert_less_than_num_buckets')
      assert_greater = check_ops.assert_greater_equal(
          values, zero, data=(values,),
          name='assert_greater_or_equal_0')
      with ops.control_dependencies((assert_less, assert_greater)):
        values = array_ops.identity(values)
    else:
      # Assign default for out-of-range values.
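      # E.g. (illustrative): with num_buckets = 3 and default_value = 0,
      # values [1, 5, -2] become [1, 0, 0].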
      values = array_ops.where(
          math_ops.logical_or(
              values < zero, values >= num_buckets, name='out_of_range'),
          array_ops.fill(
              dims=array_ops.shape(values),
              value=math_ops.cast(self.default_value, dtypes.int64),
              name='default_values'), values)

    return sparse_tensor_lib.SparseTensor(
        indices=input_tensor.indices,
        values=values,
        dense_shape=input_tensor.dense_shape)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.num_buckets

  def _get_sparse_tensors(
      self, inputs, weight_collections=None, trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)


class _WeightedCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_WeightedCategoricalColumn', (
        'categorical_column', 'weight_feature_key', 'dtype'
    ))):
  """See `weighted_categorical_column`."""

  @property
  def name(self):
    return '{}_weighted_by_{}'.format(
        self.categorical_column.name, self.weight_feature_key)

  @property
  def _parse_example_spec(self):
    config = self.categorical_column._parse_example_spec  # pylint: disable=protected-access
    if self.weight_feature_key in config:
      raise ValueError('Parse config {} already exists for {}.'.format(
          config[self.weight_feature_key], self.weight_feature_key))
    config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
    return config

  @property
  def _num_buckets(self):
    return self.categorical_column._num_buckets  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    weight_tensor = inputs.get(self.weight_feature_key)
    if weight_tensor is None:
      raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
    weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
        weight_tensor)
    if self.dtype != weight_tensor.dtype.base_dtype:
      raise ValueError('Bad dtype, expected {}, but got {}.'.format(
          self.dtype, weight_tensor.dtype))
    if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
      # The weight tensor can be a regular Tensor. In this case, sparsify it.
      weight_tensor = _to_sparse_input_and_drop_ignore_values(
          weight_tensor, ignore_value=0.0)
    if not weight_tensor.dtype.is_floating:
      weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
    return (inputs.get(self.categorical_column), weight_tensor)

  def _get_sparse_tensors(
      self, inputs, weight_collections=None, trainable=None):
    del weight_collections
    del trainable
    tensors = inputs.get(self)
    return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1])


class _CrossedColumn(
    _CategoricalColumn,
    collections.namedtuple('_CrossedColumn',
                           ['keys', 'hash_bucket_size', 'hash_key'])):
  """See `crossed_column`."""

  @property
  def name(self):
    feature_names = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, _FeatureColumn):
        feature_names.append(key.name)
      else:  # key must be a string
        feature_names.append(key)
    return '_X_'.join(sorted(feature_names))

  @property
  def _parse_example_spec(self):
    config = {}
    for key in self.keys:
      if isinstance(key, _FeatureColumn):
        config.update(key._parse_example_spec)  # pylint: disable=protected-access
      else:  # key must be a string
        config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
    return config

  def _transform_feature(self, inputs):
    feature_tensors = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, six.string_types):
        feature_tensors.append(inputs.get(key))
      elif isinstance(key, _CategoricalColumn):
        ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
        if ids_and_weights.weight_tensor is not None:
          raise ValueError(
              'crossed_column does not support weight_tensor, but the given '
              'column populates weight_tensor. '
              'Given column: {}'.format(key.name))
        feature_tensors.append(ids_and_weights.id_tensor)
      else:
        raise ValueError('Unsupported column type. Given: {}'.format(key))
    return sparse_ops.sparse_cross_hashed(
        inputs=feature_tensors,
        num_buckets=self.hash_bucket_size,
        hash_key=self.hash_key)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.hash_bucket_size

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)


def _collect_leaf_level_keys(cross):
  """Collects base keys by expanding all nested crosses.

  Args:
    cross: A `_CrossedColumn`.

  Returns:
    A list of strings or `_CategoricalColumn` instances.
  """
  leaf_level_keys = []
  for k in cross.keys:
    if isinstance(k, _CrossedColumn):
      leaf_level_keys.extend(_collect_leaf_level_keys(k))
    else:
      leaf_level_keys.append(k)
  return leaf_level_keys


class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
                       collections.namedtuple('_IndicatorColumn',
                                              ['categorical_column'])):
  """Represents a one-hot column for use in deep networks.

  Args:
    categorical_column: A `_CategoricalColumn` which is created by
      `categorical_column_with_*` function.
  """

  @property
  def name(self):
    return '{}_indicator'.format(self.categorical_column.name)

  def _transform_feature(self, inputs):
    """Returns dense `Tensor` representing feature.
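
    If the wrapped `categorical_column` carries weights (e.g. it is a
    `weighted_categorical_column`), the returned dense tensor contains those
    weights at the id positions instead of multi-hot counts.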

    Args:
      inputs: A `_LazyBuilder` object to access inputs.

    Returns:
      Transformed feature `Tensor`.

    Raises:
      ValueError: if input rank is not known at graph building time.
    """
    id_weight_pair = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    id_tensor = id_weight_pair.id_tensor
    weight_tensor = id_weight_pair.weight_tensor

    # If the underlying column is weighted, return the input as a dense tensor.
    if weight_tensor is not None:
      weighted_column = sparse_ops.sparse_merge(
          sp_ids=id_tensor,
          sp_values=weight_tensor,
          vocab_size=int(self._variable_shape[-1]))
      # Remove (?, -1) index.
      weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
                                                weighted_column.dense_shape)
      # Use scatter_nd to merge duplicated indices if they exist, instead of
      # sparse_tensor_to_dense.
      return array_ops.scatter_nd(weighted_column.indices,
                                  weighted_column.values,
                                  weighted_column.dense_shape)

    dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
        id_tensor, default_value=-1)

    # One hot must be float for tf.concat reasons since all other inputs to
    # input_layer are float32.
    one_hot_id_tensor = array_ops.one_hot(
        dense_id_tensor,
        depth=self._variable_shape[-1],
        on_value=1.0,
        off_value=0.0)

    # Reduce to get a multi-hot per example.
    return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  @property
  def _variable_shape(self):
    """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
    return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Returns dense `Tensor` representing feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.
      weight_collections: Unused `weight_collections` since no variables are
        created in this function.
      trainable: Unused `trainable` bool since no variables are created in
        this function.

    Returns:
      Dense `Tensor` created within `_transform_feature`.

    Raises:
      ValueError: If `categorical_column` is a `_SequenceCategoricalColumn`.
    """
    # Do nothing with weight_collections and trainable since no variables are
    # created in this function.
    del weight_collections
    del trainable
    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
      raise ValueError(
          'In indicator_column: {}. '
          'categorical_column must not be of type _SequenceCategoricalColumn. '
          'Suggested fix A: If you wish to use input_layer, use a '
          'non-sequence categorical_column_with_*. '
          'Suggested fix B: If you wish to create sequence input, use '
          'sequence_input_layer instead of input_layer. '
          'Given (type {}): {}'.format(
              self.name, type(self.categorical_column),
              self.categorical_column))
    # The feature has already been transformed. Return the intermediate
    # representation created by _transform_feature.
def _verify_static_batch_size_equality(tensors, columns):
  """Validates that the first dim (batch size) of all tensors is equal or None.

  Args:
    tensors: list of tensors to check.
    columns: list of feature columns matching tensors. Will be used for error
      messaging.

  Raises:
    ValueError: if the tensors have different static batch sizes.
  """
  # expected_batch_size is a tf.Dimension object.
  expected_batch_size = None
  for i in range(len(tensors)):
    if tensors[i].shape.dims[0].value is not None:
      if expected_batch_size is None:
        batch_size_column_index = i
        expected_batch_size = tensors[i].shape.dims[0]
      elif not expected_batch_size.is_compatible_with(tensors[i].shape.dims[0]):
        raise ValueError(
            'Batch size (first dimension) of each feature must be the same. '
            'Batch size of columns ({}, {}): ({}, {})'.format(
                columns[batch_size_column_index].name, columns[i].name,
                expected_batch_size, tensors[i].shape.dims[0]))
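
# Example (illustrative sketch; col_a, col_b, and col_c are hypothetical
# columns whose `.name` would only be used for error messages): only static
# batch sizes are compared, so an unknown (None) batch dimension never
# triggers the error.
#
#   a = array_ops.placeholder(dtypes.float32, shape=[32, 4])
#   b = array_ops.placeholder(dtypes.float32, shape=[None, 7])
#   c = array_ops.placeholder(dtypes.float32, shape=[16, 1])
#   _verify_static_batch_size_equality([a, b], [col_a, col_b])  # passes
#   _verify_static_batch_size_equality([a, c], [col_a, col_c])  # ValueError
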
class _SequenceCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple(
        '_SequenceCategoricalColumn', ['categorical_column'])):
  """Represents sequences of categorical data."""

  @property
  def name(self):
    return self.categorical_column.name

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    return self.categorical_column._transform_feature(inputs)  # pylint: disable=protected-access

  @property
  def _num_buckets(self):
    return self.categorical_column._num_buckets  # pylint: disable=protected-access

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    id_tensor = sparse_tensors.id_tensor
    weight_tensor = sparse_tensors.weight_tensor

    # Expand the third dimension, if necessary, so that embeddings are not
    # combined during the embedding lookup. If the tensor is already 3-D,
    # leave it as-is.
    shape = array_ops.shape(id_tensor)
    # Compute the third dimension explicitly instead of setting it to -1, as
    # that does not work for dynamically shaped tensors with 0 elements at
    # runtime, which happens for empty sequences.
    target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])]
    id_tensor = sparse_ops.sparse_reshape(id_tensor, target_shape)
    if weight_tensor is not None:
      weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)

    return _CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)
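
# Example (illustrative; hypothetical shapes): the reshape above guarantees a
# rank-3 [batch, seq_len, num_elements] layout for the embedding lookup.
#
#   # For a 2-D id tensor with shape == [8, 5], the tail slice shape[2:] is
#   # empty and reduce_prod over an empty tensor is 1, so
#   #   target_shape == [8, 5, 1]
#   # For a 4-D id tensor with shape == [8, 5, 2, 3]:
#   #   target_shape == [8, 5, 6]
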