# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This API defines the FeatureColumn abstraction.

FeatureColumns provide a high-level abstraction for ingesting and representing
features in `Estimator` models.

FeatureColumns are the primary way of encoding features for pre-canned
`Estimator` models.

When using FeatureColumns with `Estimator` models, the type of feature column
you should choose depends on (1) the feature type and (2) the model type.

(1) Feature type:

 * Continuous features can be represented by `real_valued_column`.
 * Categorical features can be represented by any `sparse_column_with_*`
   column (`sparse_column_with_keys`, `sparse_column_with_vocabulary_file`,
   `sparse_column_with_hash_bucket`, `sparse_column_with_integerized_feature`).

(2) Model type:

 * Deep neural network models (`DNNClassifier`, `DNNRegressor`).

   Continuous features can be directly fed into deep neural network models.

     age_column = real_valued_column("age")

   To feed sparse features into DNN models, wrap the column with
   `embedding_column` or `one_hot_column`. `one_hot_column` will create a dense
   boolean tensor with an entry for each possible value, so its computation
   cost is linear in the number of possible values rather than in the number
   of values that actually occur in the sparse tensor. Thus `one_hot_column`
   is only recommended for features with few possible values. For features
   with many possible values or for very sparse features, `embedding_column`
   is recommended.

     embedded_dept_column = embedding_column(
         sparse_column_with_keys("department", ["math", "philosophy", ...]),
         dimension=10)
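
   Similarly, a sparse feature with only a few possible values can be one-hot
   encoded (the "gender" feature here is hypothetical):

     gender_column = one_hot_column(
         sparse_column_with_keys("gender", ["female", "male"]))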

 * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).

   Sparse features can be fed directly into linear models. When doing so,
   an embedding lookup is used to efficiently perform the sparse matrix
   multiplication.

     dept_column = sparse_column_with_keys("department",
                                           ["math", "philosophy", "english"])

   It is recommended that continuous features be bucketized before being
   fed into linear models.

     bucketized_age_column = bucketized_column(
         source_column=age_column,
         boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

   Sparse features can be crossed (also known as conjuncted or combined) in
   order to form non-linearities, and then fed into linear models.

     cross_dept_age_column = crossed_column(
         columns=[department_column, bucketized_age_column],
         hash_bucket_size=1000)

Example of building an `Estimator` model using FeatureColumns:

  # Define features and transformations
  deep_feature_columns = [age_column, embedded_dept_column]
  wide_feature_columns = [dept_column, bucketized_age_column,
                          cross_dept_age_column]

  # Build deep model
  estimator = DNNClassifier(
      feature_columns=deep_feature_columns,
      hidden_units=[500, 250, 50])
  estimator.train(...)

  # Or build a wide model
  estimator = LinearClassifier(
      feature_columns=wide_feature_columns)
  estimator.train(...)

  # Or build a wide and deep model!
  estimator = DNNLinearCombinedClassifier(
      linear_feature_columns=wide_feature_columns,
      dnn_feature_columns=deep_feature_columns,
      dnn_hidden_units=[500, 250, 50])
  estimator.train(...)


FeatureColumns can also be transformed into a generic input layer for
custom models using `input_from_feature_columns` within
`feature_column_ops.py`.

Example of building a non-`Estimator` model using FeatureColumns:

  # Building model via layers

  deep_feature_columns = [age_column, embedded_dept_column]
  columns_to_tensor = parse_feature_columns_from_examples(
      serialized=my_data,
      feature_columns=deep_feature_columns)
  first_layer = input_from_feature_columns(
      columns_to_tensors=columns_to_tensor,
      feature_columns=deep_feature_columns)
  second_layer = fully_connected(first_layer, ...)

See feature_column_ops_test for more examples.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import collections
import math

import six

from tensorflow.contrib import lookup
from tensorflow.contrib.framework.python.framework import checkpoint_utils
from tensorflow.contrib.framework.python.framework import experimental
from tensorflow.contrib.framework.python.ops import variables as contrib_variables
from tensorflow.contrib.layers.python.layers import embedding_ops
from tensorflow.contrib.layers.python.layers import layers
from tensorflow.contrib.layers.python.ops import bucketization_op
from tensorflow.contrib.layers.python.ops import sparse_feature_cross_op
from tensorflow.contrib.layers.python.ops import sparse_ops as contrib_sparse_ops
from tensorflow.python.feature_column import feature_column as fc_core
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_py
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.ops import variables
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.util import deprecation
from tensorflow.python.util import nest


# Imports the core `InputLayer` symbol in contrib during development.
InputLayer = fc_core.InputLayer  # pylint: disable=invalid-name


class _LinearEmbeddingLookupArguments(
    collections.namedtuple("_LinearEmbeddingLookupArguments",
                           ["input_tensor",
                            "weight_tensor",
                            "vocab_size",
                            "initializer",
                            "combiner"])):
  """Represents the information needed from a column for embedding lookup.

  Used to compute DNN inputs and weighted sum.
  """
  pass


class _DeepEmbeddingLookupArguments(
    collections.namedtuple("_DeepEmbeddingLookupArguments",
                           ["input_tensor",
                            "weight_tensor",
                            "vocab_size",
                            "initializer",
                            "combiner",
                            "dimension",
                            "shared_embedding_name",
                            "hash_key",
                            "max_norm",
                            "trainable"])):
  """Represents the information needed from a column for embedding lookup.

  Used to compute DNN inputs and weighted sum.
  """
  pass


@six.add_metaclass(abc.ABCMeta)
class _FeatureColumn(object):
  """Represents a feature column abstraction.

  To distinguish between the concept of a feature family and a specific binary
  feature within a family, we refer to a feature family like "country" as a
  feature column. For example, "country:US" is a feature which is in the
  "country" feature column and has the feature value "US".
  This class is abstract; users should not create instances of it. The
  following classes (_SparseColumn, _RealValuedColumn, ...) are concrete
  implementations.
  """

  @abc.abstractproperty
  @deprecation.deprecated(
      "2016-09-25",
      "Should be private.")
  def name(self):
    """Returns the name of column or transformed column."""
    pass

  @abc.abstractproperty
  @deprecation.deprecated(
      "2016-09-25",
      "Should be private.")
  def config(self):
    """Returns configuration of the base feature for `tf.parse_example`."""
    pass

  @abc.abstractproperty
  @deprecation.deprecated(
      "2016-09-25",
      "Should be private.")
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    pass

  @abc.abstractmethod
  @deprecation.deprecated(
      "2016-09-25",
      "Should be private.")
  def insert_transformed_feature(self, columns_to_tensors):
    """Applies transformation and inserts it into columns_to_tensors.

    Args:
      columns_to_tensors: A mapping from feature columns to tensors. A string
        key means a base (untransformed) feature. A _FeatureColumn key means
        that column has already been transformed.
    """
    raise NotImplementedError("Transform is not implemented for {}.".format(
        self))

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collection=None,
                          trainable=True,
                          output_rank=2):
    """Returns a Tensor as an input to the first layer of neural network."""
    raise ValueError("Calling an abstract method.")

  def _deep_embedding_lookup_arguments(self, input_tensor):
    """Returns arguments to embedding lookup to build an input layer."""
    raise NotImplementedError(
        "No deep embedding lookup arguments for column {}.".format(self))

  # It is expected that classes implement either
  # _wide_embedding_lookup_arguments or _to_dense_tensor to be used in linear
  # models.
  # pylint: disable=unused-argument
  def _wide_embedding_lookup_arguments(self, input_tensor):
    """Returns arguments to look up embeddings for this column."""
    raise NotImplementedError(
        "No wide embedding lookup arguments for column {}.".format(self))

  # pylint: disable=unused-argument
  def _to_dense_tensor(self, input_tensor):
    """Returns a dense tensor representing this column's values."""
    raise NotImplementedError(
        "No dense tensor representation for column {}.".format(self))

  def _checkpoint_path(self):
    """Returns None, or a (path, tensor_name) to load a checkpoint from."""
    return None

  def _key_without_properties(self, properties):
    """Helper method for self.key() that omits particular properties."""
    fields_values = []
    # pylint: disable=protected-access
    for i, k in enumerate(self._fields):
      if k in properties:
        # Excludes a property from the key.
        # For instance, exclude `initializer` from the key of EmbeddingColumn
        # since we don't support users specifying different initializers for
        # the same embedding column. Ditto for `normalizer` and
        # RealValuedColumn.
        # Special treatment is needed since the default str form of a
        # function contains its address, which could introduce non-determinism
        # in sorting.
        continue
      fields_values.append("{}={}".format(k, self[i]))
    # pylint: enable=protected-access

    # This is effectively the same format as str(self), except with our special
    # treatment.
    return "{}({})".format(type(self).__name__, ", ".join(fields_values))


# TODO(b/30410315): Support warm starting in all feature columns.
class _SparseColumn(
    _FeatureColumn,
    fc_core._CategoricalColumn,  # pylint: disable=protected-access
    collections.namedtuple("_SparseColumn", [
        "column_name", "is_integerized", "bucket_size", "lookup_config",
        "combiner", "dtype"
    ])):
  """Represents a sparse feature column, also known as a categorical feature.

  Instances of this class are immutable. A sparse column means features are
  sparse and the dictionary returned by InputBuilder contains a
  ("column_name", SparseTensor) pair.
  One and only one of bucket_size or lookup_config should be set. If
  is_integerized is True then bucket_size should be set.

  Attributes:
    column_name: A string defining sparse column name.
    is_integerized: A bool. If True, the type of the feature is an integer and
      the feature value itself can be used as the id.
    bucket_size: An int that is > 0. The number of buckets.
    lookup_config: A _SparseIdLookupConfig defining feature-to-id lookup
      configuration.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: Type of features, either `tf.string` or `tf.int64`.

  Raises:
    TypeError: if lookup_config is not a _SparseIdLookupConfig.
    ValueError: if the above expectations about the input fail.
  """
340 """ 341 342 def __new__(cls, 343 column_name, 344 is_integerized=False, 345 bucket_size=None, 346 lookup_config=None, 347 combiner="sum", 348 dtype=dtypes.string): 349 if is_integerized and bucket_size is None: 350 raise ValueError("bucket_size must be set if is_integerized is True. " 351 "column_name: {}".format(column_name)) 352 353 if is_integerized and not dtype.is_integer: 354 raise ValueError("dtype must be an integer if is_integerized is True. " 355 "dtype: {}, column_name: {}.".format(dtype, column_name)) 356 if dtype != dtypes.string and not dtype.is_integer: 357 raise ValueError("dtype must be string or integer. " 358 "dtype: {}, column_name: {}".format(dtype, column_name)) 359 360 if bucket_size is None and lookup_config is None: 361 raise ValueError("one of bucket_size or lookup_config must be set. " 362 "column_name: {}".format(column_name)) 363 364 if bucket_size is not None and lookup_config: 365 raise ValueError("one and only one of bucket_size or lookup_config " 366 "must be set. column_name: {}".format(column_name)) 367 368 if bucket_size is not None and bucket_size < 1: 369 raise ValueError("bucket_size must be at least 1. " 370 "bucket_size: {}, column_name: {}".format(bucket_size, 371 column_name)) 372 373 if ((lookup_config) and 374 (not isinstance(lookup_config, _SparseIdLookupConfig))): 375 raise TypeError( 376 "lookup_config must be an instance of _SparseIdLookupConfig. " 377 "Given one is in type {} for column_name {}".format( 378 type(lookup_config), column_name)) 379 380 if (lookup_config and lookup_config.vocabulary_file and 381 lookup_config.vocab_size is None): 382 raise ValueError("vocab_size must be defined. " 383 "column_name: {}".format(column_name)) 384 385 return super(_SparseColumn, cls).__new__( 386 cls, 387 column_name, 388 is_integerized=is_integerized, 389 bucket_size=bucket_size, 390 lookup_config=lookup_config, 391 combiner=combiner, 392 dtype=dtype) 393 394 @property 395 def name(self): 396 return self.column_name 397 398 @property 399 def length(self): 400 """Returns vocabulary or hash_bucket size.""" 401 if self.bucket_size is not None: 402 return self.bucket_size 403 return self.lookup_config.vocab_size + self.lookup_config.num_oov_buckets 404 405 @property 406 def config(self): 407 return {self.column_name: parsing_ops.VarLenFeature(self.dtype)} 408 409 @property 410 def key(self): 411 """Returns a string which will be used as a key when we do sorting.""" 412 return "{}".format(self) 413 414 def id_tensor(self, input_tensor): 415 """Returns the id tensor from the given transformed input_tensor.""" 416 return input_tensor 417 418 # pylint: disable=unused-argument 419 def weight_tensor(self, input_tensor): 420 """Returns the weight tensor from the given transformed input_tensor.""" 421 return None 422 423 # pylint: disable=unused-argument 424 def _to_dnn_input_layer(self, 425 input_tensor, 426 weight_collections=None, 427 trainable=True, 428 output_rank=2): 429 raise ValueError( 430 "SparseColumn is not supported in DNN. " 431 "Please use embedding_column or one_hot_column. 
column: {}".format( 432 self)) 433 434 def _wide_embedding_lookup_arguments(self, input_tensor): 435 return _LinearEmbeddingLookupArguments( 436 input_tensor=self.id_tensor(input_tensor), 437 weight_tensor=self.weight_tensor(input_tensor), 438 vocab_size=self.length, 439 initializer=init_ops.zeros_initializer(), 440 combiner=self.combiner) 441 442 def _get_input_sparse_tensor(self, input_tensor): 443 """sparsify input_tensor if dense.""" 444 if not isinstance(input_tensor, sparse_tensor_py.SparseTensor): 445 # To avoid making any assumptions about which values are to be ignored, 446 # we set ignore_value to -1 for numeric tensors to avoid excluding valid 447 # indices. 448 if input_tensor.dtype == dtypes.string: 449 ignore_value = "" 450 else: 451 ignore_value = -1 452 input_tensor = _reshape_real_valued_tensor(input_tensor, 2, self.name) 453 input_tensor = contrib_sparse_ops.dense_to_sparse_tensor( 454 input_tensor, ignore_value=ignore_value) 455 456 return input_tensor 457 458 def is_compatible(self, other_column): 459 """Check compatibility of two sparse columns.""" 460 if self.lookup_config and other_column.lookup_config: 461 return self.lookup_config == other_column.lookup_config 462 compatible = (self.length == other_column.length and 463 (self.dtype == other_column.dtype or 464 (self.dtype.is_integer and other_column.dtype.is_integer))) 465 if compatible: 466 logging.warn("Column {} and {} may not have the same vocabulary.". 467 format(self.name, other_column.name)) 468 return compatible 469 470 @abc.abstractmethod 471 def _do_transform(self, input_tensor): 472 pass 473 474 def insert_transformed_feature(self, columns_to_tensors): 475 """Handles sparse column to id conversion.""" 476 input_tensor = self._get_input_sparse_tensor(columns_to_tensors[self.name]) 477 columns_to_tensors[self] = self._do_transform(input_tensor) 478 479 def _transform_feature(self, inputs): 480 input_tensor = self._get_input_sparse_tensor(inputs.get(self.name)) 481 return self._do_transform(input_tensor) 482 483 @property 484 def _parse_example_spec(self): 485 return self.config 486 487 @property 488 def _num_buckets(self): 489 return self.length 490 491 def _get_sparse_tensors(self, inputs, weight_collections=None, 492 trainable=None): 493 del weight_collections 494 del trainable 495 input_tensor = inputs.get(self) 496 return fc_core._CategoricalColumn.IdWeightPair( # pylint: disable=protected-access 497 self.id_tensor(input_tensor), self.weight_tensor(input_tensor)) 498 499 500class _SparseColumnIntegerized(_SparseColumn): 501 """See `sparse_column_with_integerized_feature`.""" 502 503 def _do_transform(self, input_tensor): 504 sparse_id_values = math_ops.mod(input_tensor.values, self.bucket_size, 505 name="mod") 506 return sparse_tensor_py.SparseTensor(input_tensor.indices, sparse_id_values, 507 input_tensor.dense_shape) 508 509 510def sparse_column_with_integerized_feature(column_name, 511 bucket_size, 512 combiner="sum", 513 dtype=dtypes.int64): 514 """Creates an integerized _SparseColumn. 515 516 Use this when your features are already pre-integerized into int64 IDs, that 517 is, when the set of values to output is already coming in as what's desired in 518 the output. Integerized means we can use the feature value itself as id. 519 520 Typically this is used for reading contiguous ranges of integers indexes, but 521 it doesn't have to be. The output value is simply copied from the 522 input_feature, whatever it is. 

  Args:
    column_name: A string defining sparse column name.
    hash_bucket_size: An int that is > 1. The number of buckets.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: The type of features. Only string and integer types are supported.
    hash_keys: The hash keys to use. It is a list of lists of two uint64s. If
      None, a simple and fast hashing algorithm is used. Otherwise, multiple
      strong hash ids are produced, one for each pair of uint64s in this
      argument.

  Returns:
    A _SparseColumn with hashed bucket configuration.

  Raises:
    ValueError: hash_bucket_size is not greater than 1.
    ValueError: dtype is neither string nor integer.
  """
  return _SparseColumnHashed(
      column_name,
      bucket_size=hash_bucket_size,
      combiner=combiner,
      dtype=dtype,
      hash_keys=hash_keys)


class _SparseColumnKeys(_SparseColumn):
  """See `sparse_column_with_keys`."""

  def _do_transform(self, input_tensor):
    table = lookup.index_table_from_tensor(
        mapping=tuple(self.lookup_config.keys),
        default_value=self.lookup_config.default_value,
        dtype=self.dtype,
        name="lookup")
    return table.lookup(input_tensor)


def sparse_column_with_keys(
    column_name, keys, default_value=-1, combiner="sum", dtype=dtypes.string):
  """Creates a _SparseColumn with keys.

  Lookup logic is as follows:

    lookup_id = index_of_feature_in_keys if feature in keys else default_value
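
  Example (the same "department" column as in the module docstring):

    dept_column = sparse_column_with_keys(
        "department", keys=["math", "philosophy", "english"])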

  Args:
    column_name: A string defining sparse column name.
    keys: A list or tuple defining vocabulary. Must be castable to `dtype`.
    default_value: The value to use for out-of-vocabulary feature values.
      Default is -1.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: Type of features. Only integer and string are supported.

  Returns:
    A _SparseColumnKeys with keys configuration.
  """
  keys = tuple(keys)
  return _SparseColumnKeys(
      column_name,
      lookup_config=_SparseIdLookupConfig(
          keys=keys, vocab_size=len(keys), default_value=default_value),
      combiner=combiner,
      dtype=dtype)


class _SparseColumnVocabulary(_SparseColumn):
  """See `sparse_column_with_vocabulary_file`."""

  def _do_transform(self, st):
    if self.dtype.is_integer:
      sparse_string_values = string_ops.as_string(st.values)
      sparse_string_tensor = sparse_tensor_py.SparseTensor(
          st.indices, sparse_string_values, st.dense_shape)
    else:
      sparse_string_tensor = st

    table = lookup.index_table_from_file(
        vocabulary_file=self.lookup_config.vocabulary_file,
        num_oov_buckets=self.lookup_config.num_oov_buckets,
        vocab_size=self.lookup_config.vocab_size,
        default_value=self.lookup_config.default_value,
        name=self.name + "_lookup")
    return table.lookup(sparse_string_tensor)


def sparse_column_with_vocabulary_file(column_name,
                                       vocabulary_file,
                                       num_oov_buckets=0,
                                       vocab_size=None,
                                       default_value=-1,
                                       combiner="sum",
                                       dtype=dtypes.string):
  """Creates a _SparseColumn with vocabulary file configuration.

  Use this when your sparse features are in string or integer format, and you
  have a vocab file that maps each value to an integer ID:

    output_id = LookupIdFromVocab(input_feature_string)
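
  Example (assuming a hypothetical vocabulary file "/path/to/words.txt" with
  10000 entries, one per line):

    words_column = sparse_column_with_vocabulary_file(
        "words", vocabulary_file="/path/to/words.txt", vocab_size=10000)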
689 """ 690 keys = tuple(keys) 691 return _SparseColumnKeys( 692 column_name, 693 lookup_config=_SparseIdLookupConfig( 694 keys=keys, vocab_size=len(keys), default_value=default_value), 695 combiner=combiner, 696 dtype=dtype) 697 698 699class _SparseColumnVocabulary(_SparseColumn): 700 """See `sparse_column_with_vocabulary_file`.""" 701 702 def _do_transform(self, st): 703 if self.dtype.is_integer: 704 sparse_string_values = string_ops.as_string(st.values) 705 sparse_string_tensor = sparse_tensor_py.SparseTensor(st.indices, 706 sparse_string_values, 707 st.dense_shape) 708 else: 709 sparse_string_tensor = st 710 711 table = lookup.index_table_from_file( 712 vocabulary_file=self.lookup_config.vocabulary_file, 713 num_oov_buckets=self.lookup_config.num_oov_buckets, 714 vocab_size=self.lookup_config.vocab_size, 715 default_value=self.lookup_config.default_value, 716 name=self.name + "_lookup") 717 return table.lookup(sparse_string_tensor) 718 719 720def sparse_column_with_vocabulary_file(column_name, 721 vocabulary_file, 722 num_oov_buckets=0, 723 vocab_size=None, 724 default_value=-1, 725 combiner="sum", 726 dtype=dtypes.string): 727 """Creates a _SparseColumn with vocabulary file configuration. 728 729 Use this when your sparse features are in string or integer format, and you 730 have a vocab file that maps each value to an integer ID. 731 output_id = LookupIdFromVocab(input_feature_string) 732 733 Args: 734 column_name: A string defining sparse column name. 735 vocabulary_file: The vocabulary filename. 736 num_oov_buckets: The number of out-of-vocabulary buckets. If zero all out of 737 vocabulary features will be ignored. 738 vocab_size: Number of the elements in the vocabulary. 739 default_value: The value to use for out-of-vocabulary feature values. 740 Defaults to -1. 741 combiner: A string specifying how to reduce if the sparse column is 742 multivalent. Currently "mean", "sqrtn" and "sum" are supported, with "sum" 743 the default. "sqrtn" often achieves good accuracy, in particular with 744 bag-of-words columns. 745 * "sum": do not normalize features in the column 746 * "mean": do l1 normalization on features in the column 747 * "sqrtn": do l2 normalization on features in the column 748 For more information: `tf.embedding_lookup_sparse`. 749 dtype: The type of features. Only string and integer types are supported. 750 751 Returns: 752 A _SparseColumn with vocabulary file configuration. 753 754 Raises: 755 ValueError: vocab_size is not defined. 756 ValueError: dtype is neither string nor integer. 757 """ 758 if vocab_size is None: 759 raise ValueError("vocab_size should be defined. 
" 760 "column_name: {}".format(column_name)) 761 762 return _SparseColumnVocabulary( 763 column_name, 764 lookup_config=_SparseIdLookupConfig( 765 vocabulary_file=vocabulary_file, 766 num_oov_buckets=num_oov_buckets, 767 vocab_size=vocab_size, 768 default_value=default_value), 769 combiner=combiner, 770 dtype=dtype) 771 772 773class _WeightedSparseColumn( 774 _FeatureColumn, 775 fc_core._CategoricalColumn, # pylint: disable=protected-access 776 collections.namedtuple("_WeightedSparseColumn", 777 ["sparse_id_column", "weight_column_name", 778 "dtype"])): 779 """See `weighted_sparse_column`.""" 780 781 def __new__(cls, sparse_id_column, weight_column_name, dtype): 782 return super(_WeightedSparseColumn, cls).__new__(cls, sparse_id_column, 783 weight_column_name, dtype) 784 785 @property 786 def name(self): 787 return "{}_weighted_by_{}".format(self.sparse_id_column.name, 788 self.weight_column_name) 789 790 @property 791 def length(self): 792 """Returns id size.""" 793 return self.sparse_id_column.length 794 795 @property 796 def config(self): 797 config = _get_feature_config(self.sparse_id_column) 798 config.update( 799 {self.weight_column_name: parsing_ops.VarLenFeature(self.dtype)}) 800 return config 801 802 @property 803 def lookup_config(self): 804 return self.sparse_id_column.lookup_config 805 806 @property 807 def key(self): 808 """Returns a string which will be used as a key when we do sorting.""" 809 return "{}".format(self) 810 811 def id_tensor(self, input_tensor): 812 """Returns the id tensor from the given transformed input_tensor.""" 813 return input_tensor[0] 814 815 def weight_tensor(self, input_tensor): 816 """Returns the weight tensor from the given transformed input_tensor.""" 817 return input_tensor[1] 818 819 # pylint: disable=unused-argument 820 def _to_dnn_input_layer(self, 821 input_tensor, 822 weight_collections=None, 823 trainable=True, 824 output_rank=2): 825 raise ValueError( 826 "WeightedSparseColumn is not supported in DNN. " 827 "Please use embedding_column or one_hot_column. column: {}".format( 828 self)) 829 830 def _wide_embedding_lookup_arguments(self, input_tensor): 831 return _LinearEmbeddingLookupArguments( 832 input_tensor=self.id_tensor(input_tensor), 833 weight_tensor=self.weight_tensor(input_tensor), 834 vocab_size=self.length, 835 initializer=init_ops.zeros_initializer(), 836 combiner=self.sparse_id_column.combiner) 837 838 def _do_transform(self, id_tensor, weight_tensor): 839 if not isinstance(weight_tensor, sparse_tensor_py.SparseTensor): 840 # The weight tensor can be a regular Tensor. In such case, sparsify it. 


class _OneHotColumn(
    _FeatureColumn,
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple("_OneHotColumn", ["sparse_id_column"])):
  """Represents a one-hot column for use in deep networks.

  Args:
    sparse_id_column: A _SparseColumn which is created by
      `sparse_column_with_*` functions.
  """

  @property
  def name(self):
    return "{}_one_hot".format(self.sparse_id_column.name)

  @property
  def length(self):
    """Returns vocabulary or hash_bucket size."""
    return self.sparse_id_column.length

  @property
  def config(self):
    """Returns the parsing config of the source column."""
    return _get_feature_config(self.sparse_id_column)

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return "{}".format(self)

  def insert_transformed_feature(self, columns_to_tensors):
    """Used by the Transformer to prevent double transformations."""
    if self.sparse_id_column not in columns_to_tensors:
      self.sparse_id_column.insert_transformed_feature(columns_to_tensors)
    columns_to_tensors[self] = columns_to_tensors[self.sparse_id_column]

  def _to_dnn_input_layer(self,
                          transformed_input_tensor,
                          unused_weight_collections=None,
                          unused_trainable=False,
                          output_rank=2):
    """Returns a Tensor as an input to the first layer of neural network.

    Args:
      transformed_input_tensor: A tensor that has undergone the transformations
        in `insert_transformed_feature`. Rank should be >= `output_rank`.
      unused_weight_collections: Unused. One hot encodings are not variable.
      unused_trainable: Unused. One hot encodings are not trainable.
      output_rank: the desired rank of the output `Tensor`.

    Returns:
      A multi-hot Tensor to be fed into the first layer of neural network.

    Raises:
      ValueError: When using one_hot_column with weighted_sparse_column.
        This is not yet supported.
    """

    # Reshape ID column to `output_rank`.
    sparse_id_column = self.sparse_id_column.id_tensor(transformed_input_tensor)
    # pylint: disable=protected-access
    sparse_id_column = layers._inner_flatten(sparse_id_column, output_rank)

    weight_tensor = self.sparse_id_column.weight_tensor(
        transformed_input_tensor)
    if weight_tensor is not None:
      weighted_column = sparse_ops.sparse_merge(sp_ids=sparse_id_column,
                                                sp_values=weight_tensor,
                                                vocab_size=self.length)
      # Remove the (?, -1) index.
      weighted_column = sparse_ops.sparse_slice(
          weighted_column,
          array_ops.zeros_like(weighted_column.dense_shape),
          weighted_column.dense_shape)
      dense_tensor = sparse_ops.sparse_tensor_to_dense(weighted_column)
      batch_shape = array_ops.shape(dense_tensor)[:-1]
      dense_tensor_shape = array_ops.concat(
          [batch_shape, [self.length]], axis=0)
      dense_tensor = array_ops.reshape(dense_tensor, dense_tensor_shape)
      return dense_tensor

    dense_id_tensor = sparse_ops.sparse_tensor_to_dense(sparse_id_column,
                                                        default_value=-1)

    # One hot must be float for tf.concat reasons since all other inputs to
    # input_layer are float32.
    one_hot_id_tensor = array_ops.one_hot(
        dense_id_tensor, depth=self.length, on_value=1.0, off_value=0.0)

    # Reduce to get a multi-hot per example.
    return math_ops.reduce_sum(one_hot_id_tensor, axis=[output_rank - 1])

  @property
  def _variable_shape(self):
    return tensor_shape.TensorShape([self.length])

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    del weight_collections
    del trainable
    return inputs.get(self)

  def _transform_feature(self, inputs):
    return self._to_dnn_input_layer(inputs.get(self.sparse_id_column))

  @property
  def _parse_example_spec(self):
    return self.config


class _EmbeddingColumn(
    _FeatureColumn,
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple("_EmbeddingColumn", [
        "sparse_id_column", "dimension", "combiner", "initializer",
        "ckpt_to_load_from", "tensor_name_in_ckpt", "shared_embedding_name",
        "shared_vocab_size", "max_norm", "trainable"
    ])):
  """Represents an embedding column.

  Args:
    sparse_id_column: A `_SparseColumn` which is created by
      `sparse_column_with_*` or `weighted_sparse_column` functions.
    dimension: An integer specifying dimension of the embedding.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
      "mean" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an example
      level normalization on the column:
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean 0.0 and standard deviation
      1/sqrt(sparse_id_column.length).
    ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
      to restore the column weights. Required if `tensor_name_in_ckpt` is not
      None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which to restore the column weights. Required if
      `ckpt_to_load_from` is not None.
    shared_embedding_name: (Optional). The common name for shared embedding.
    shared_vocab_size: (Optional). The common vocab_size used for shared
      embedding space.
    max_norm: (Optional). If not None, embedding values are l2-normalized to
      the value of max_norm.
    trainable: (Optional). Should the embedding be trainable. Default is True.

  Raises:
    ValueError: if `initializer` is specified and is not callable. Also,
      if only one of `ckpt_to_load_from` and `tensor_name_in_ckpt` is
      specified.
  """

  def __new__(cls,
              sparse_id_column,
              dimension,
              combiner="mean",
              initializer=None,
              ckpt_to_load_from=None,
              tensor_name_in_ckpt=None,
              shared_embedding_name=None,
              shared_vocab_size=None,
              max_norm=None,
              trainable=True):
    if initializer is not None and not callable(initializer):
      raise ValueError("initializer must be callable if specified. "
                       "Embedding of column_name: {}".format(
                           sparse_id_column.name))

    if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
      raise ValueError("Must specify both `ckpt_to_load_from` and "
                       "`tensor_name_in_ckpt` or none of them.")
    if initializer is None:
      logging.warn("The default stddev value of initializer was changed from "
                   "\"1/sqrt(vocab_size)\" to \"1/sqrt(dimension)\" in core "
                   "implementation (tf.feature_column.embedding_column).")
      stddev = 1 / math.sqrt(sparse_id_column.length)
      initializer = init_ops.truncated_normal_initializer(
          mean=0.0, stddev=stddev)
    return super(_EmbeddingColumn, cls).__new__(cls, sparse_id_column,
                                                dimension, combiner,
                                                initializer, ckpt_to_load_from,
                                                tensor_name_in_ckpt,
                                                shared_embedding_name,
                                                shared_vocab_size,
                                                max_norm,
                                                trainable)

  @property
  def name(self):
    if self.shared_embedding_name is None:
      return "{}_embedding".format(self.sparse_id_column.name)
    else:
      return "{}_shared_embedding".format(self.sparse_id_column.name)

  @property
  def length(self):
    """Returns id size."""
    if self.shared_vocab_size is None:
      return self.sparse_id_column.length
    else:
      return self.shared_vocab_size

  @property
  def config(self):
    return _get_feature_config(self.sparse_id_column)

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return self._key_without_properties(["initializer"])

  def insert_transformed_feature(self, columns_to_tensors):
    if self.sparse_id_column not in columns_to_tensors:
      self.sparse_id_column.insert_transformed_feature(columns_to_tensors)
    columns_to_tensors[self] = columns_to_tensors[self.sparse_id_column]

  def _deep_embedding_lookup_arguments(self, input_tensor):
    return _DeepEmbeddingLookupArguments(
        input_tensor=self.sparse_id_column.id_tensor(input_tensor),
        weight_tensor=self.sparse_id_column.weight_tensor(input_tensor),
        vocab_size=self.length,
        dimension=self.dimension,
        initializer=self.initializer,
        combiner=self.combiner,
        shared_embedding_name=self.shared_embedding_name,
        hash_key=None,
        max_norm=self.max_norm,
        trainable=self.trainable)

  def _checkpoint_path(self):
    if self.ckpt_to_load_from is not None:
      return self.ckpt_to_load_from, self.tensor_name_in_ckpt
    return None

  # pylint: disable=unused-argument
  def _wide_embedding_lookup_arguments(self, input_tensor):
    raise ValueError("Column {} is not supported in linear models. "
                     "Please use sparse_column.".format(self))

  @property
  def _variable_shape(self):
    return tensor_shape.TensorShape([self.dimension])

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    return _embeddings_from_arguments(
        self,
        self._deep_embedding_lookup_arguments(inputs.get(self)),
        weight_collections, trainable)

  def _transform_feature(self, inputs):
    return inputs.get(self.sparse_id_column)

  @property
  def _parse_example_spec(self):
    return self.config


def _is_variable(v):
  """Returns true if `v` is a variable."""
  return isinstance(v, (variables.Variable,
                        resource_variable_ops.ResourceVariable))


def _embeddings_from_arguments(column,
                               args,
                               weight_collections,
                               trainable,
                               output_rank=2):
  """Returns embeddings for a column based on the computed arguments.

  Args:
    column: the feature column for which embeddings are computed.
    args: the _DeepEmbeddingLookupArguments for this column.
    weight_collections: collections to store weights in.
    trainable: whether these embeddings should be trainable.
    output_rank: the desired rank of the returned `Tensor`. Inner dimensions
      will be combined to produce the desired rank.

  Returns:
    the embeddings.

  Raises:
    ValueError: if not possible to create.
  """
  # pylint: disable=protected-access
  input_tensor = layers._inner_flatten(args.input_tensor, output_rank)
  weight_tensor = None
  if args.weight_tensor is not None:
    weight_tensor = layers._inner_flatten(args.weight_tensor, output_rank)
  # pylint: enable=protected-access

  # This option is only enabled for scattered_embedding_column.
  if args.hash_key:
    embeddings = contrib_variables.model_variable(
        name="weights",
        shape=[args.vocab_size],
        dtype=dtypes.float32,
        initializer=args.initializer,
        trainable=(trainable and args.trainable),
        collections=weight_collections)

    return embedding_ops.scattered_embedding_lookup_sparse(
        embeddings,
        input_tensor,
        args.dimension,
        hash_key=args.hash_key,
        combiner=args.combiner,
        name="lookup")

  if args.shared_embedding_name is not None:
    shared_embedding_collection_name = (
        "SHARED_EMBEDDING_COLLECTION_" + args.shared_embedding_name.upper())
    graph = ops.get_default_graph()
    shared_embedding_collection = (
        graph.get_collection_ref(shared_embedding_collection_name))
    shape = [args.vocab_size, args.dimension]
    if shared_embedding_collection:
      if len(shared_embedding_collection) > 1:
        raise ValueError(
            "Collection %s can only contain one "
            "(partitioned) variable." % shared_embedding_collection_name)
      else:
        embeddings = shared_embedding_collection[0]
        if embeddings.get_shape() != shape:
          raise ValueError(
              "The embedding variable with name {} already "
              "exists, but its shape does not match required "
              "embedding shape here. Please make sure to use "
              "different shared_embedding_name for different "
              "shared embeddings.".format(args.shared_embedding_name))
    else:
      embeddings = contrib_variables.model_variable(
          name=args.shared_embedding_name,
          shape=shape,
          dtype=dtypes.float32,
          initializer=args.initializer,
          trainable=(trainable and args.trainable),
          collections=weight_collections)
      graph.add_to_collection(shared_embedding_collection_name, embeddings)
  else:
    embeddings = contrib_variables.model_variable(
        name="weights",
        shape=[args.vocab_size, args.dimension],
        dtype=dtypes.float32,
        initializer=args.initializer,
        trainable=(trainable and args.trainable),
        collections=weight_collections)

  if _is_variable(embeddings):
    embeddings = [embeddings]
  else:
    embeddings = embeddings._get_variable_list()  # pylint: disable=protected-access
  # pylint: disable=protected-access
  _maybe_restore_from_checkpoint(column._checkpoint_path(), embeddings)
  return embedding_ops.safe_embedding_lookup_sparse(
      embeddings,
      input_tensor,
      sparse_weights=weight_tensor,
      combiner=args.combiner,
      name=column.name + "weights",
      max_norm=args.max_norm)


def _maybe_restore_from_checkpoint(checkpoint_path, variable):
  if checkpoint_path is not None:
    path, tensor_name = checkpoint_path
    weights_to_restore = variable
    if len(variable) == 1:
      weights_to_restore = variable[0]
    checkpoint_utils.init_from_checkpoint(path,
                                          {tensor_name: weights_to_restore})
" 1095 "Embedding of column_name: {}".format( 1096 sparse_id_column.name)) 1097 1098 if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None): 1099 raise ValueError("Must specify both `ckpt_to_load_from` and " 1100 "`tensor_name_in_ckpt` or none of them.") 1101 if initializer is None: 1102 logging.warn("The default stddev value of initializer was changed from " 1103 "\"1/sqrt(vocab_size)\" to \"1/sqrt(dimension)\" in core " 1104 "implementation (tf.feature_column.embedding_column).") 1105 stddev = 1 / math.sqrt(sparse_id_column.length) 1106 initializer = init_ops.truncated_normal_initializer( 1107 mean=0.0, stddev=stddev) 1108 return super(_EmbeddingColumn, cls).__new__(cls, sparse_id_column, 1109 dimension, combiner, 1110 initializer, ckpt_to_load_from, 1111 tensor_name_in_ckpt, 1112 shared_embedding_name, 1113 shared_vocab_size, 1114 max_norm, 1115 trainable) 1116 1117 @property 1118 def name(self): 1119 if self.shared_embedding_name is None: 1120 return "{}_embedding".format(self.sparse_id_column.name) 1121 else: 1122 return "{}_shared_embedding".format(self.sparse_id_column.name) 1123 1124 @property 1125 def length(self): 1126 """Returns id size.""" 1127 if self.shared_vocab_size is None: 1128 return self.sparse_id_column.length 1129 else: 1130 return self.shared_vocab_size 1131 1132 @property 1133 def config(self): 1134 return _get_feature_config(self.sparse_id_column) 1135 1136 @property 1137 def key(self): 1138 """Returns a string which will be used as a key when we do sorting.""" 1139 return self._key_without_properties(["initializer"]) 1140 1141 def insert_transformed_feature(self, columns_to_tensors): 1142 if self.sparse_id_column not in columns_to_tensors: 1143 self.sparse_id_column.insert_transformed_feature(columns_to_tensors) 1144 columns_to_tensors[self] = columns_to_tensors[self.sparse_id_column] 1145 1146 def _deep_embedding_lookup_arguments(self, input_tensor): 1147 return _DeepEmbeddingLookupArguments( 1148 input_tensor=self.sparse_id_column.id_tensor(input_tensor), 1149 weight_tensor=self.sparse_id_column.weight_tensor(input_tensor), 1150 vocab_size=self.length, 1151 dimension=self.dimension, 1152 initializer=self.initializer, 1153 combiner=self.combiner, 1154 shared_embedding_name=self.shared_embedding_name, 1155 hash_key=None, 1156 max_norm=self.max_norm, 1157 trainable=self.trainable) 1158 1159 def _checkpoint_path(self): 1160 if self.ckpt_to_load_from is not None: 1161 return self.ckpt_to_load_from, self.tensor_name_in_ckpt 1162 return None 1163 1164 # pylint: disable=unused-argument 1165 def _wide_embedding_lookup_arguments(self, input_tensor): 1166 raise ValueError("Column {} is not supported in linear models. 
" 1167 "Please use sparse_column.".format(self)) 1168 1169 @property 1170 def _variable_shape(self): 1171 return tensor_shape.TensorShape([self.dimension]) 1172 1173 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 1174 return _embeddings_from_arguments( 1175 self, 1176 self._deep_embedding_lookup_arguments(inputs.get(self)), 1177 weight_collections, trainable) 1178 1179 def _transform_feature(self, inputs): 1180 return inputs.get(self.sparse_id_column) 1181 1182 @property 1183 def _parse_example_spec(self): 1184 return self.config 1185 1186 1187def _is_variable(v): 1188 """Returns true if `v` is a variable.""" 1189 return isinstance(v, (variables.Variable, 1190 resource_variable_ops.ResourceVariable)) 1191 1192 1193def _embeddings_from_arguments(column, 1194 args, 1195 weight_collections, 1196 trainable, 1197 output_rank=2): 1198 """Returns embeddings for a column based on the computed arguments. 1199 1200 Args: 1201 column: the column name. 1202 args: the _DeepEmbeddingLookupArguments for this column. 1203 weight_collections: collections to store weights in. 1204 trainable: whether these embeddings should be trainable. 1205 output_rank: the desired rank of the returned `Tensor`. Inner dimensions will 1206 be combined to produce the desired rank. 1207 1208 Returns: 1209 the embeddings. 1210 1211 Raises: 1212 ValueError: if not possible to create. 1213 """ 1214 # pylint: disable=protected-access 1215 input_tensor = layers._inner_flatten(args.input_tensor, output_rank) 1216 weight_tensor = None 1217 if args.weight_tensor is not None: 1218 weight_tensor = layers._inner_flatten(args.weight_tensor, output_rank) 1219 # pylint: enable=protected-access 1220 1221 # This option is only enabled for scattered_embedding_column. 1222 if args.hash_key: 1223 embeddings = contrib_variables.model_variable( 1224 name="weights", 1225 shape=[args.vocab_size], 1226 dtype=dtypes.float32, 1227 initializer=args.initializer, 1228 trainable=(trainable and args.trainable), 1229 collections=weight_collections) 1230 1231 return embedding_ops.scattered_embedding_lookup_sparse( 1232 embeddings, 1233 input_tensor, 1234 args.dimension, 1235 hash_key=args.hash_key, 1236 combiner=args.combiner, 1237 name="lookup") 1238 1239 if args.shared_embedding_name is not None: 1240 shared_embedding_collection_name = ( 1241 "SHARED_EMBEDDING_COLLECTION_" + args.shared_embedding_name.upper()) 1242 graph = ops.get_default_graph() 1243 shared_embedding_collection = ( 1244 graph.get_collection_ref(shared_embedding_collection_name)) 1245 shape = [args.vocab_size, args.dimension] 1246 if shared_embedding_collection: 1247 if len(shared_embedding_collection) > 1: 1248 raise ValueError( 1249 "Collection %s can only contain one " 1250 "(partitioned) variable." % shared_embedding_collection_name) 1251 else: 1252 embeddings = shared_embedding_collection[0] 1253 if embeddings.get_shape() != shape: 1254 raise ValueError( 1255 "The embedding variable with name {} already " 1256 "exists, but its shape does not match required " 1257 "embedding shape here. 

  Args:
    sparse_id_column: A `_SparseColumn` which is created by, for example,
      `sparse_column_with_*` or crossed_column functions. Note that `combiner`
      defined in `sparse_id_column` is ignored.
    dimension: An integer specifying dimension of the embedding.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
      "mean" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an example
      level normalization on the column:
        * "sum": do not normalize
        * "mean": do l1 normalization
        * "sqrtn": do l2 normalization
      For more information: `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean 0.0 and standard deviation
      1/sqrt(sparse_id_column.length).
    ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
      to restore the column weights. Required if `tensor_name_in_ckpt` is not
      None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which to restore the column weights. Required if
      `ckpt_to_load_from` is not None.
    max_norm: (Optional). If not None, embedding values are l2-normalized to
      the value of max_norm.
    trainable: (Optional). Should the embedding be trainable. Default is True.

  Returns:
    An `_EmbeddingColumn`.
  """
  return _EmbeddingColumn(sparse_id_column, dimension, combiner, initializer,
                          ckpt_to_load_from, tensor_name_in_ckpt,
                          max_norm=max_norm, trainable=trainable)


def shared_embedding_columns(sparse_id_columns,
                             dimension,
                             combiner="mean",
                             shared_embedding_name=None,
                             initializer=None,
                             ckpt_to_load_from=None,
                             tensor_name_in_ckpt=None,
                             max_norm=None,
                             trainable=True):
  """Creates a list of `_EmbeddingColumn` sharing the same embedding.
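
  Example (the "query" and "doc_title" features below are hypothetical; the
  columns must be compatible, e.g. have the same bucket size and dtype):

    query_column = sparse_column_with_hash_bucket("query", 10000)
    doc_column = sparse_column_with_hash_bucket("doc_title", 10000)
    query_embedding, doc_embedding = shared_embedding_columns(
        [query_column, doc_column], dimension=16)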

  Args:
    sparse_id_columns: An iterable of `_SparseColumn`, such as those created by
      `sparse_column_with_*` or crossed_column functions. Note that `combiner`
      defined in each sparse_id_column is ignored.
    dimension: An integer specifying dimension of the embedding.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
      "mean" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an example
      level normalization on the column:
        * "sum": do not normalize
        * "mean": do l1 normalization
        * "sqrtn": do l2 normalization
      For more information: `tf.embedding_lookup_sparse`.
    shared_embedding_name: (Optional). A string specifying the name of shared
      embedding weights. This will be needed if you want to reference the
      shared embedding separately from the generated `_EmbeddingColumn`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean 0.0 and standard deviation
      1/sqrt(sparse_id_columns[0].length).
    ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
      to restore the column weights. Required if `tensor_name_in_ckpt` is not
      None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which to restore the column weights. Required if
      `ckpt_to_load_from` is not None.
    max_norm: (Optional). If not None, embedding values are l2-normalized to
      the value of max_norm.
    trainable: (Optional). Should the embedding be trainable. Default is True.

  Returns:
    A tuple of `_EmbeddingColumn` with shared embedding space.

  Raises:
    ValueError: if sparse_id_columns is empty, or its elements are not
      compatible with each other.
    TypeError: if `sparse_id_columns` is not a sequence or is a string, or if
      at least one element of `sparse_id_columns` is not a `SparseColumn` or a
      `WeightedSparseColumn`.
  """
1415 """ 1416 if (not isinstance(sparse_id_columns, collections.Sequence) or 1417 isinstance(sparse_id_columns, six.string_types)): 1418 raise TypeError( 1419 "sparse_id_columns must be a non-string sequence (ex: list or tuple) " 1420 "instead of type {}.".format(type(sparse_id_columns))) 1421 if len(sparse_id_columns) < 1: 1422 raise ValueError("The input sparse_id_columns should have at least one " 1423 "element.") 1424 for sparse_id_column in sparse_id_columns: 1425 if not (isinstance(sparse_id_column, _SparseColumn) or 1426 isinstance(sparse_id_column, _WeightedSparseColumn)): 1427 raise TypeError("Elements of sparse_id_columns must be _SparseColumn or " 1428 "_WeightedSparseColumn, but {} is not." 1429 .format(sparse_id_column)) 1430 1431 if len(sparse_id_columns) == 1: 1432 return [ 1433 _EmbeddingColumn(sparse_id_columns[0], dimension, combiner, initializer, 1434 ckpt_to_load_from, tensor_name_in_ckpt, 1435 shared_embedding_name, max_norm=max_norm, 1436 trainable=trainable)] 1437 else: 1438 # Check compatibility of sparse_id_columns 1439 compatible = True 1440 for column in sparse_id_columns[1:]: 1441 if isinstance(sparse_id_columns[0], _WeightedSparseColumn): 1442 compatible = compatible and sparse_id_columns[0].is_compatible(column) 1443 else: 1444 compatible = compatible and column.is_compatible(sparse_id_columns[0]) 1445 if not compatible: 1446 raise ValueError("The input sparse id columns are not compatible.") 1447 # Construct the shared name and size for shared embedding space. 1448 if not shared_embedding_name: 1449 # Sort the columns so that shared_embedding_name will be deterministic 1450 # even if users pass in unsorted columns from a dict or something. 1451 # Since they are different classes, ordering is SparseColumns first, 1452 # then WeightedSparseColumns. 1453 sparse_columns = [] 1454 weighted_sparse_columns = [] 1455 for column in sparse_id_columns: 1456 if isinstance(column, _SparseColumn): 1457 sparse_columns.append(column) 1458 else: 1459 weighted_sparse_columns.append(column) 1460 sorted_columns = sorted(sparse_columns) + sorted( 1461 weighted_sparse_columns, key=lambda x: x.name) 1462 if len(sorted_columns) <= 3: 1463 shared_embedding_name = "_".join([column.name 1464 for column in sorted_columns]) 1465 else: 1466 shared_embedding_name = "_".join([column.name 1467 for column in sorted_columns[0:3]]) 1468 shared_embedding_name += ( 1469 "_plus_{}_others".format(len(sorted_columns) - 3)) 1470 shared_embedding_name += "_shared_embedding" 1471 shared_vocab_size = sparse_id_columns[0].length 1472 1473 embedded_columns = [] 1474 for column in sparse_id_columns: 1475 embedded_columns.append( 1476 _EmbeddingColumn(column, dimension, combiner, initializer, 1477 ckpt_to_load_from, tensor_name_in_ckpt, 1478 shared_embedding_name, shared_vocab_size, 1479 max_norm=max_norm, trainable=trainable)) 1480 return tuple(embedded_columns) 1481 1482 1483class _ScatteredEmbeddingColumn( 1484 _FeatureColumn, 1485 fc_core._DenseColumn, # pylint: disable=protected-access 1486 collections.namedtuple("_ScatteredEmbeddingColumn", [ 1487 "column_name", "size", "dimension", "hash_key", "combiner", 1488 "initializer" 1489 ])): 1490 """See `scattered_embedding_column`.""" 1491 1492 def __new__(cls, 1493 column_name, 1494 size, 1495 dimension, 1496 hash_key, 1497 combiner="sqrtn", 1498 initializer=None): 1499 if initializer is not None and not callable(initializer): 1500 raise ValueError("initializer must be callable if specified. 
" 1501 "column_name: {}".format(column_name)) 1502 if initializer is None: 1503 stddev = 0.1 1504 initializer = init_ops.truncated_normal_initializer( 1505 mean=0.0, stddev=stddev) 1506 return super(_ScatteredEmbeddingColumn, cls).__new__(cls, column_name, size, 1507 dimension, hash_key, 1508 combiner, 1509 initializer) 1510 1511 @property 1512 def name(self): 1513 return "{}_scattered_embedding".format(self.column_name) 1514 1515 @property 1516 def config(self): 1517 return {self.column_name: parsing_ops.VarLenFeature(dtypes.string)} 1518 1519 @property 1520 def key(self): 1521 """Returns a string which will be used as a key when we do sorting.""" 1522 return self._key_without_properties(["initializer"]) 1523 1524 def insert_transformed_feature(self, columns_to_tensors): 1525 columns_to_tensors[self] = columns_to_tensors[self.column_name] 1526 1527 def _deep_embedding_lookup_arguments(self, input_tensor): 1528 return _DeepEmbeddingLookupArguments( 1529 input_tensor=input_tensor, 1530 weight_tensor=None, 1531 vocab_size=self.size, 1532 initializer=self.initializer, 1533 combiner=self.combiner, 1534 dimension=self.dimension, 1535 shared_embedding_name=None, 1536 hash_key=self.hash_key, 1537 max_norm=None, 1538 trainable=True) 1539 1540 @property 1541 def _variable_shape(self): 1542 return tensor_shape.TensorShape([self.dimension]) 1543 1544 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 1545 return _embeddings_from_arguments( 1546 self, 1547 self._deep_embedding_lookup_arguments(inputs.get(self)), 1548 weight_collections, trainable) 1549 1550 def _transform_feature(self, inputs): 1551 return inputs.get(self.column_name) 1552 1553 @property 1554 def _parse_example_spec(self): 1555 return self.config 1556 1557 1558def scattered_embedding_column(column_name, 1559 size, 1560 dimension, 1561 hash_key, 1562 combiner="mean", 1563 initializer=None): 1564 """Creates an embedding column of a sparse feature using parameter hashing. 1565 1566 This is a useful shorthand when you have a sparse feature you want to use an 1567 embedding for, but also want to hash the embedding's values in each dimension 1568 to a variable based on a different hash. 1569 1570 Specifically, the i-th embedding component of a value v is found by retrieving 1571 an embedding weight whose index is a fingerprint of the pair (v,i). 1572 1573 An embedding column with sparse_column_with_hash_bucket such as 1574 1575 embedding_column( 1576 sparse_column_with_hash_bucket(column_name, bucket_size), 1577 dimension) 1578 1579 could be replaced by 1580 1581 scattered_embedding_column( 1582 column_name, 1583 size=bucket_size * dimension, 1584 dimension=dimension, 1585 hash_key=tf.contrib.layers.SPARSE_FEATURE_CROSS_DEFAULT_HASH_KEY) 1586 1587 for the same number of embedding parameters. This should hopefully reduce the 1588 impact of collisions, but adds the cost of slowing down training. 1589 1590 Args: 1591 column_name: A string defining sparse column name. 1592 size: An integer specifying the number of parameters in the embedding layer. 1593 dimension: An integer specifying dimension of the embedding. 1594 hash_key: Specify the hash_key that will be used by the `FingerprintCat64` 1595 function to combine the crosses fingerprints on SparseFeatureCrossOp. 1596 combiner: A string specifying how to reduce if there are multiple entries 1597 in a single row. Currently "mean", "sqrtn" and "sum" are supported, with 1598 "mean" the default. 
"sqrtn" often achieves good accuracy, in particular 1599 with bag-of-words columns. Each of this can be thought as example level 1600 normalizations on the column: 1601 * "sum": do not normalize features in the column 1602 * "mean": do l1 normalization on features in the column 1603 * "sqrtn": do l2 normalization on features in the column 1604 For more information: `tf.embedding_lookup_sparse`. 1605 initializer: A variable initializer function to be used in embedding 1606 variable initialization. If not specified, defaults to 1607 `tf.truncated_normal_initializer` with mean 0 and standard deviation 0.1. 1608 1609 Returns: 1610 A _ScatteredEmbeddingColumn. 1611 1612 Raises: 1613 ValueError: if dimension or size is not a positive integer; or if combiner 1614 is not supported. 1615 1616 """ 1617 if (dimension < 1) or (size < 1): 1618 raise ValueError("Dimension and size must be greater than 0. " 1619 "dimension: {}, size: {}, column_name: {}".format( 1620 dimension, size, column_name)) 1621 1622 if combiner not in ("mean", "sqrtn", "sum"): 1623 raise ValueError("Combiner must be one of 'mean', 'sqrtn' or 'sum'. " 1624 "combiner: {}, column_name: {}".format(combiner, 1625 column_name)) 1626 1627 return _ScatteredEmbeddingColumn(column_name, size, dimension, hash_key, 1628 combiner, initializer) 1629 1630 1631def _reshape_real_valued_tensor(input_tensor, output_rank, column_name=None): 1632 """Reshaping logic for dense, numeric `Tensors`. 1633 1634 Follows the following rules: 1635 1. If `output_rank > input_rank + 1` raise a `ValueError`. 1636 2. If `output_rank == input_rank + 1`, expand `input_tensor` by one 1637 dimension and return 1638 3. If `output_rank == input_rank`, return `input_tensor`. 1639 4. If `output_rank < input_rank`, flatten the inner dimensions of 1640 `input_tensor` and return a `Tensor` with `output_rank` 1641 1642 Args: 1643 input_tensor: a dense `Tensor` to be reshaped. 1644 output_rank: the desired rank of the reshaped `Tensor`. 1645 column_name: (optional) the name of the associated column. Used for error 1646 messages. 1647 Returns: 1648 A `Tensor` with the same entries as `input_tensor` and rank `output_rank`. 1649 Raises: 1650 ValueError: if `output_rank > input_rank + 1`. 1651 """ 1652 input_rank = input_tensor.get_shape().ndims 1653 if input_rank is not None: 1654 if output_rank > input_rank + 1: 1655 error_string = ("Rank of input Tensor ({}) should be the same as " 1656 "output_rank ({}). For example, sequence data should " 1657 "typically be 3 dimensional (rank 3) while non-sequence " 1658 "data is typically 2 dimensional (rank 2).".format( 1659 input_rank, output_rank)) 1660 if column_name is not None: 1661 error_string = ("Error while processing column {}.".format(column_name) 1662 + error_string) 1663 raise ValueError(error_string) 1664 if output_rank == input_rank + 1: 1665 logging.warning( 1666 "Rank of input Tensor ({}) should be the same as output_rank ({}) " 1667 "for column. Will attempt to expand dims. It is highly recommended " 1668 "that you resize your input, as this behavior may change.".format( 1669 input_rank, output_rank)) 1670 return array_ops.expand_dims(input_tensor, -1, name="expand_dims") 1671 if output_rank == input_rank: 1672 return input_tensor 1673 # Here, either `input_rank` is unknown or it is greater than `output_rank`. 


class _RealValuedVarLenColumn(_FeatureColumn, collections.namedtuple(
    "_RealValuedVarLenColumn",
    ["column_name", "default_value", "dtype", "normalizer", "is_sparse"])):
  """Represents a real valued feature column for variable length Features.

  Instances of this class are immutable.
  If is_sparse=False, the dictionary returned by InputBuilder contains a
  ("column_name", Tensor) pair with a padded Tensor of shape
  (batch_size, padded_length).
  If is_sparse=True, the dictionary contains a ("column_name", SparseTensor)
  pair instead, with shape inferred after parsing.
  """

  @property
  def name(self):
    return self.column_name

  @property
  def config(self):
    if self.is_sparse:
      return {self.column_name: parsing_ops.VarLenFeature(self.dtype)}
    else:
      return {self.column_name: parsing_ops.FixedLenSequenceFeature(
          [], self.dtype, allow_missing=True,
          default_value=self.default_value)}

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return self._key_without_properties(["normalizer"])

  @property
  def normalizer_fn(self):
    """Returns the function used to normalize the column."""
    return self.normalizer

  def _normalized_input_tensor(self, input_tensor):
    """Returns the input tensor after custom normalization is applied."""
    if self.normalizer is None:
      return input_tensor
    if self.is_sparse:
      return sparse_tensor_py.SparseTensor(
          input_tensor.indices,
          self.normalizer(input_tensor.values),
          input_tensor.dense_shape)
    else:
      return self.normalizer(input_tensor)

  def insert_transformed_feature(self, columns_to_tensors):
    """Applies the normalizer and inserts the result into columns_to_tensors.

    Args:
      columns_to_tensors: A mapping from feature columns to tensors. A 'string'
        key means a base (untransformed) feature. It can also have a
        _FeatureColumn as a key, meaning that _FeatureColumn has already been
        transformed.
    """
    # Transform the input tensor according to the normalizer function.
    input_tensor = self._normalized_input_tensor(columns_to_tensors[self.name])
    columns_to_tensors[self] = math_ops.cast(input_tensor, dtypes.float32)

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    return _reshape_real_valued_tensor(
        self._to_dense_tensor(input_tensor), output_rank, self.name)

  def _to_dense_tensor(self, input_tensor):
    if not self.is_sparse:
      return input_tensor
    raise ValueError("Set is_sparse to False if you want a dense Tensor for "
                     "column_name: {}".format(self.name))


@experimental
def _real_valued_var_len_column(column_name,
                                default_value=None,
                                dtype=dtypes.float32,
                                normalizer=None,
                                is_sparse=False):
  """Creates a `_RealValuedVarLenColumn` for variable-length numeric data.

  Note: this is not integrated with any of the DNN estimators, except the RNN
  ones, `DynamicRNNEstimator` and `StateSavingRNNEstimator`.

  It can either create a parsing config for a SparseTensor (with
  is_sparse=True) or a padded Tensor.
  The (dense_)shape of the result will be [batch_size, None], which can be used
  with is_sparse=False as input into an RNN (see DynamicRNNEstimator or
  StateSavingRNNEstimator) or with is_sparse=True as input into a tree (see
  gtflow).

  Use real_valued_column if the feature has a fixed length. Use a SparseColumn
  for columns that are to be embedded or one-hot-encoded.

  Args:
    column_name: A string defining real valued column name.
    default_value: A scalar value compatible with dtype. Needs to be specified
      if is_sparse=False.
    dtype: Defines the type of values. Default value is tf.float32. Needs to be
      convertible to tf.float32.
    normalizer: If not None, a function that can be used to normalize the value
      of the real valued column after default_value is applied for parsing.
      Normalizer function takes the input tensor as its argument, and returns
      the output tensor. (e.g. lambda x: (x - 3.0) / 4.2). Note that for
      is_sparse=True, the normalizer will be run on the values of the
      `SparseTensor`.
    is_sparse: A boolean defining whether to create a SparseTensor or a Tensor.
  Returns:
    A `_RealValuedVarLenColumn`.
  Raises:
    TypeError: if default_value is not a scalar value compatible with dtype.
    TypeError: if dtype is not convertible to tf.float32.
    ValueError: if default_value is None and is_sparse is False.
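
  Example:
    An illustrative sketch (the column name is hypothetical):

      # Variable-length numeric feature, parsed as a padded dense Tensor of
      # shape [batch_size, None].
      measurements = _real_valued_var_len_column(
          "measurements", default_value=0.0, is_sparse=False)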
  """
  if not (dtype.is_integer or dtype.is_floating):
    raise TypeError("dtype must be convertible to float. "
                    "dtype: {}, column_name: {}".format(dtype, column_name))

  if default_value is None and not is_sparse:
    raise ValueError("default_value must be provided when is_sparse=False to "
                     "parse a padded Tensor. "
                     "column_name: {}".format(column_name))
  if isinstance(default_value, list):
    raise ValueError(
        "Only scalar default value is supported. "
        "default_value: {}, column_name: {}".format(
            default_value, column_name))
  if default_value is not None:
    if dtype.is_integer:
      default_value = int(default_value)
    elif dtype.is_floating:
      default_value = float(default_value)

  return _RealValuedVarLenColumn(column_name, default_value, dtype, normalizer,
                                 is_sparse)


class _RealValuedColumn(
    _FeatureColumn,
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple(
        "_RealValuedColumn",
        ["column_name", "dimension", "default_value", "dtype", "normalizer"])):
  """Represents a real valued (continuous) feature column.

  Instances of this class are immutable. The dictionary returned by
  InputBuilder contains a ("column_name", Tensor) pair with a Tensor shape of
  (batch_size, dimension).
1826 """ 1827 1828 def __new__(cls, column_name, dimension, default_value, 1829 dtype, normalizer): 1830 if default_value is not None: 1831 default_value = tuple(default_value) 1832 return super(_RealValuedColumn, cls).__new__(cls, column_name, dimension, 1833 default_value, dtype, 1834 normalizer) 1835 1836 @property 1837 def name(self): 1838 return self.column_name 1839 1840 @property 1841 def config(self): 1842 default_value = self.default_value 1843 if default_value is not None: 1844 default_value = list(default_value) 1845 return {self.column_name: parsing_ops.FixedLenFeature([self.dimension], 1846 self.dtype, 1847 default_value)} 1848 1849 @property 1850 def key(self): 1851 """Returns a string which will be used as a key when we do sorting.""" 1852 return self._key_without_properties(["normalizer"]) 1853 1854 @property 1855 def normalizer_fn(self): 1856 """Returns the function used to normalize the column.""" 1857 return self.normalizer 1858 1859 def _normalized_input_tensor(self, input_tensor): 1860 """Returns the input tensor after custom normalization is applied.""" 1861 return (self.normalizer(input_tensor) if self.normalizer is not None else 1862 input_tensor) 1863 1864 def insert_transformed_feature(self, columns_to_tensors): 1865 """Apply transformation and inserts it into columns_to_tensors. 1866 1867 Args: 1868 columns_to_tensors: A mapping from feature columns to tensors. 'string' 1869 key means a base feature (not-transformed). It can have _FeatureColumn 1870 as a key too. That means that _FeatureColumn is already transformed. 1871 """ 1872 # Transform the input tensor according to the normalizer function. 1873 input_tensor = self._normalized_input_tensor(columns_to_tensors[self.name]) 1874 columns_to_tensors[self] = math_ops.cast(input_tensor, dtypes.float32) 1875 1876 # pylint: disable=unused-argument 1877 def _to_dnn_input_layer(self, 1878 input_tensor, 1879 weight_collections=None, 1880 trainable=True, 1881 output_rank=2): 1882 input_tensor = self._to_dense_tensor(input_tensor) 1883 if input_tensor.dtype != dtypes.float32: 1884 input_tensor = math_ops.cast(input_tensor, dtypes.float32) 1885 return _reshape_real_valued_tensor(input_tensor, output_rank, self.name) 1886 1887 def _to_dense_tensor(self, input_tensor): 1888 return input_tensor 1889 1890 @property 1891 def _variable_shape(self): 1892 return tensor_shape.TensorShape([self.dimension]) 1893 1894 def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None): 1895 del weight_collections 1896 del trainable 1897 return inputs.get(self) 1898 1899 def _transform_feature(self, inputs): 1900 return math_ops.cast( 1901 self._normalized_input_tensor(inputs.get(self.name)), dtypes.float32) 1902 1903 @property 1904 def _parse_example_spec(self): 1905 return self.config 1906 1907 1908def real_valued_column(column_name, 1909 dimension=1, 1910 default_value=None, 1911 dtype=dtypes.float32, 1912 normalizer=None): 1913 """Creates a `_RealValuedColumn` for dense numeric data. 1914 1915 Args: 1916 column_name: A string defining real valued column name. 1917 dimension: An integer specifying dimension of the real valued column. 1918 The default is 1. 1919 default_value: A single value compatible with dtype or a list of values 1920 compatible with dtype which the column takes on during tf.Example parsing 1921 if data is missing. When dimension is not None, a default value of None 1922 will cause tf.parse_example to fail if an example does not contain this 1923 column. 
  """

  if dimension is None:
    raise TypeError("dimension must be an integer. Use "
                    "_real_valued_var_len_column for variable length features. "
                    "dimension: {}, column_name: {}".format(dimension,
                                                            column_name))
  if not isinstance(dimension, int):
    raise TypeError("dimension must be an integer. "
                    "dimension: {}, column_name: {}".format(dimension,
                                                            column_name))
  if dimension < 1:
    raise ValueError("dimension must be greater than 0. "
                     "dimension: {}, column_name: {}".format(dimension,
                                                             column_name))

  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError("dtype must be convertible to float. "
                     "dtype: {}, column_name: {}".format(dtype, column_name))

  if default_value is None:
    return _RealValuedColumn(column_name, dimension, default_value, dtype,
                             normalizer)

  if isinstance(default_value, int):
    if dtype.is_integer:
      default_value = ([default_value for _ in range(dimension)] if dimension
                       else [default_value])
      return _RealValuedColumn(column_name, dimension, default_value, dtype,
                               normalizer)
    if dtype.is_floating:
      default_value = float(default_value)
      default_value = ([default_value for _ in range(dimension)] if dimension
                       else [default_value])
      return _RealValuedColumn(column_name, dimension, default_value, dtype,
                               normalizer)

  if isinstance(default_value, float):
    if dtype.is_floating and (not dtype.is_integer):
      default_value = ([default_value for _ in range(dimension)] if dimension
                       else [default_value])
      return _RealValuedColumn(column_name, dimension, default_value, dtype,
                               normalizer)

  if isinstance(default_value, list):
    if len(default_value) != dimension:
      raise ValueError(
          "The length of default_value must be equal to dimension. "
          "default_value: {}, dimension: {}, column_name: {}".format(
              default_value, dimension, column_name))
    # Check if the values in the list are all integers or are convertible to
    # floats.
    is_list_all_int = True
    is_list_all_float = True
    for v in default_value:
      if not isinstance(v, int):
        is_list_all_int = False
      if not (isinstance(v, float) or isinstance(v, int)):
        is_list_all_float = False
    if is_list_all_int:
      if dtype.is_integer:
        return _RealValuedColumn(column_name, dimension, default_value, dtype,
                                 normalizer)
      elif dtype.is_floating:
        default_value = [float(v) for v in default_value]
        return _RealValuedColumn(column_name, dimension, default_value, dtype,
                                 normalizer)
    if is_list_all_float:
      if dtype.is_floating and (not dtype.is_integer):
        default_value = [float(v) for v in default_value]
        return _RealValuedColumn(column_name, dimension, default_value, dtype,
                                 normalizer)

  raise TypeError("default_value must be compatible with dtype. "
                  "default_value: {}, dtype: {}, column_name: {}".format(
                      default_value, dtype, column_name))


class _BucketizedColumn(
    _FeatureColumn,
    fc_core._CategoricalColumn,  # pylint: disable=protected-access
    fc_core._DenseColumn,  # pylint: disable=protected-access
    collections.namedtuple("_BucketizedColumn", ["source_column",
                                                 "boundaries"])):
  """Represents a bucketization transformation also known as binning.

  Instances of this class are immutable. Values in `source_column` will be
  bucketized based on `boundaries`.
  For example, if the inputs are:
      boundaries = [0, 10, 100]
      source_column = [[-5], [150], [10], [0], [4], [19]]

  then the bucketized feature will be:
      output = [[0], [3], [2], [1], [1], [2]]

  Attributes:
    source_column: A _RealValuedColumn defining dense column.
    boundaries: A list or tuple of floats specifying the boundaries. It has to
      be sorted. [a, b, c] defines the following buckets: (-inf., a), [a, b),
      [b, c), [c, inf.)
  Raises:
    ValueError: if 'boundaries' is empty or not sorted.
  """

  def __new__(cls, source_column, boundaries):
    if not isinstance(source_column, _RealValuedColumn):
      raise TypeError("source_column must be an instance of _RealValuedColumn. "
                      "source_column: {}".format(source_column))

    if source_column.dimension is None:
      raise ValueError("source_column must have a defined dimension. "
                       "source_column: {}".format(source_column))

    if (not isinstance(boundaries, list) and
        not isinstance(boundaries, tuple)) or not boundaries:
      raise ValueError("boundaries must be a non-empty list or tuple. "
                       "boundaries: {}".format(boundaries))

    # We allow bucket boundaries to be monotonically increasing
    # (ie a[i+1] >= a[i]). When two bucket boundaries are the same, we
    # de-duplicate.
    sanitized_boundaries = []
    for i in range(len(boundaries) - 1):
      if boundaries[i] == boundaries[i + 1]:
        continue
      elif boundaries[i] < boundaries[i + 1]:
        sanitized_boundaries.append(boundaries[i])
      else:
        raise ValueError("boundaries must be a sorted list. "
                         "boundaries: {}".format(boundaries))
" 2073 "boundaries: {}".format(boundaries)) 2074 sanitized_boundaries.append(boundaries[len(boundaries) - 1]) 2075 2076 return super(_BucketizedColumn, cls).__new__(cls, source_column, 2077 tuple(sanitized_boundaries)) 2078 2079 @property 2080 def name(self): 2081 return "{}_bucketized".format(self.source_column.name) 2082 2083 @property 2084 def length(self): 2085 """Returns total number of buckets.""" 2086 return len(self.boundaries) + 1 2087 2088 @property 2089 def config(self): 2090 return self.source_column.config 2091 2092 @property 2093 def key(self): 2094 """Returns a string which will be used as a key when we do sorting.""" 2095 return "{}".format(self) 2096 2097 # pylint: disable=unused-argument 2098 def _to_dnn_input_layer(self, 2099 input_tensor, 2100 weight_collections=None, 2101 trainable=True, 2102 output_rank=2): 2103 if output_rank != 2: 2104 raise ValueError("BucketizedColumn currently only supports output_rank=2") 2105 return array_ops.reshape( 2106 array_ops.one_hot( 2107 math_ops.cast(input_tensor, dtypes.int64), 2108 self.length, 2109 1., 2110 0., 2111 name="one_hot"), [-1, self.length * self.source_column.dimension], 2112 name="reshape") 2113 2114 def to_sparse_tensor(self, input_tensor): 2115 """Creates a SparseTensor from the bucketized Tensor.""" 2116 dimension = self.source_column.dimension 2117 batch_size = array_ops.shape(input_tensor, name="shape")[0] 2118 2119 if dimension > 1: 2120 i1 = array_ops.reshape( 2121 array_ops.tile( 2122 array_ops.expand_dims( 2123 math_ops.range(0, batch_size), 1, name="expand_dims"), 2124 [1, dimension], 2125 name="tile"), [-1], 2126 name="reshape") 2127 i2 = array_ops.tile( 2128 math_ops.range(0, dimension), [batch_size], name="tile") 2129 # Flatten the bucket indices and unique them across dimensions 2130 # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets 2131 bucket_indices = array_ops.reshape( 2132 input_tensor, [-1], name="reshape") + self.length * i2 2133 else: 2134 # Simpler indices when dimension=1 2135 i1 = math_ops.range(0, batch_size) 2136 i2 = array_ops.zeros([batch_size], dtype=dtypes.int32, name="zeros") 2137 bucket_indices = array_ops.reshape(input_tensor, [-1], name="reshape") 2138 2139 indices = math_ops.cast(array_ops.transpose(array_ops.stack((i1, i2))), 2140 dtypes.int64) 2141 shape = math_ops.cast(array_ops.stack([batch_size, dimension]), 2142 dtypes.int64) 2143 sparse_id_values = sparse_tensor_py.SparseTensor( 2144 indices, bucket_indices, shape) 2145 2146 return sparse_id_values 2147 2148 def _wide_embedding_lookup_arguments(self, input_tensor): 2149 return _LinearEmbeddingLookupArguments( 2150 input_tensor=self.to_sparse_tensor(input_tensor), 2151 weight_tensor=None, 2152 vocab_size=self.length * self.source_column.dimension, 2153 initializer=init_ops.zeros_initializer(), 2154 combiner="sum") 2155 2156 def _transform_feature(self, inputs): 2157 """Handles cross transformation.""" 2158 # Bucketize the source column. 


class _CrossedColumn(
    _FeatureColumn,
    fc_core._CategoricalColumn,  # pylint: disable=protected-access
    collections.namedtuple("_CrossedColumn", [
        "columns", "hash_bucket_size", "hash_key", "combiner",
        "ckpt_to_load_from", "tensor_name_in_ckpt"
    ])):
  """Represents a cross transformation also known as conjunction or combination.

  Instances of this class are immutable. It crosses given `columns`. Crossed
  column output will be hashed to hash_bucket_size.
  Conceptually, the transformation can be thought of as:
    Hash(cartesian product of features in columns) % `hash_bucket_size`

  For example, if the columns are

      SparseTensor referred by first column: shape = [2, 2]
      [0, 0]: "a"
      [1, 0]: "b"
      [1, 1]: "c"

      SparseTensor referred by second column: shape = [2, 1]
      [0, 0]: "d"
      [1, 0]: "e"

  then the crossed feature will look like:

      shape = [2, 2]
      [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
      [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
      [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size

  Attributes:
    columns: An iterable of _FeatureColumn. Items can be an instance of
      _SparseColumn, _CrossedColumn, or _BucketizedColumn.
    hash_bucket_size: An int that is > 1. The number of buckets.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default. "sqrtn" often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column:
        * "sum": do not normalize
        * "mean": do l1 normalization
        * "sqrtn": do l2 normalization
      For more information: `tf.embedding_lookup_sparse`.
    ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
      to restore the column weights. Required if `tensor_name_in_ckpt` is not
      None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which to restore the column weights. Required if
      `ckpt_to_load_from` is not None.

  Raises:
    TypeError: if any item in columns is not an instance of _SparseColumn,
      _CrossedColumn, or _BucketizedColumn.
    ValueError: if hash_bucket_size is not > 1 or len(columns) is not > 1. Also,
      if only one of `ckpt_to_load_from` and `tensor_name_in_ckpt` is specified.
  """

  @staticmethod
  def _assert_is_crossable(column):
    if isinstance(column, (_SparseColumn, _CrossedColumn, _BucketizedColumn)):
      return
    raise TypeError("columns must be a set of _SparseColumn, "
                    "_CrossedColumn, or _BucketizedColumn instances. "
                    "(column {} is a {})".format(column,
                                                 column.__class__.__name__))

  def __new__(cls,
              columns,
              hash_bucket_size,
              hash_key,
              combiner="sum",
              ckpt_to_load_from=None,
              tensor_name_in_ckpt=None):
    for column in columns:
      _CrossedColumn._assert_is_crossable(column)

    if len(columns) < 2:
      raise ValueError("columns must contain at least 2 elements. "
                       "columns: {}".format(columns))

    if hash_bucket_size < 2:
      raise ValueError("hash_bucket_size must be at least 2. "
                       "hash_bucket_size: {}".format(hash_bucket_size))

    if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
      raise ValueError("Must specify both `ckpt_to_load_from` and "
                       "`tensor_name_in_ckpt` or none of them.")

    sorted_columns = sorted(
        [column for column in columns], key=lambda column: column.name)
    return super(_CrossedColumn, cls).__new__(cls, tuple(sorted_columns),
                                              hash_bucket_size, hash_key,
                                              combiner,
                                              ckpt_to_load_from,
                                              tensor_name_in_ckpt)

  @property
  def name(self):
    sorted_names = sorted([column.name for column in self.columns])
    return "_X_".join(sorted_names)

  @property
  def config(self):
    config = {}
    for column in self.columns:
      config.update(_get_feature_config(column))
    return config

  @property
  def length(self):
    """Returns total number of buckets."""
    return self.hash_bucket_size

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return "{}".format(self)

  def id_tensor(self, input_tensor):
    """Returns the id tensor from the given transformed input_tensor."""
    return input_tensor

  def weight_tensor(self, input_tensor):
    """Returns the weight tensor from the given transformed input_tensor."""
    del input_tensor
    return None

  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    del input_tensor
    del weight_collections
    del trainable
    del output_rank
    raise ValueError("CrossedColumn is not supported in DNN. "
                     "Please use embedding_column. column: {}".format(self))
column: {}".format(self)) 2351 2352 def _checkpoint_path(self): 2353 if self.ckpt_to_load_from is not None: 2354 return self.ckpt_to_load_from, self.tensor_name_in_ckpt 2355 return None 2356 2357 def _wide_embedding_lookup_arguments(self, input_tensor): 2358 return _LinearEmbeddingLookupArguments( 2359 input_tensor=input_tensor, 2360 weight_tensor=None, 2361 vocab_size=self.length, 2362 initializer=init_ops.zeros_initializer(), 2363 combiner=self.combiner) 2364 2365 def _transform_feature(self, inputs): 2366 """Handles cross transformation.""" 2367 2368 def _collect_leaf_level_columns(cross): 2369 """Collects base columns contained in the cross.""" 2370 leaf_level_columns = [] 2371 for c in cross.columns: 2372 if isinstance(c, _CrossedColumn): 2373 leaf_level_columns.extend(_collect_leaf_level_columns(c)) 2374 else: 2375 leaf_level_columns.append(c) 2376 return leaf_level_columns 2377 2378 feature_tensors = [] 2379 for c in _collect_leaf_level_columns(self): 2380 if isinstance(c, _SparseColumn): 2381 feature_tensors.append(inputs.get(c.name)) 2382 else: 2383 if isinstance(c, _BucketizedColumn): 2384 feature_tensors.append(c.to_sparse_tensor(inputs.get(c))) 2385 else: 2386 feature_tensors.append(inputs.get(c)) 2387 return sparse_feature_cross_op.sparse_feature_cross( 2388 feature_tensors, 2389 hashed_output=True, 2390 num_buckets=self.hash_bucket_size, 2391 hash_key=self.hash_key, 2392 name="cross") 2393 2394 def insert_transformed_feature(self, columns_to_tensors): 2395 """Handles sparse column to id conversion.""" 2396 columns_to_tensors[self] = self._transform_feature( 2397 _LazyBuilderByColumnsToTensor(columns_to_tensors)) 2398 2399 @property 2400 def _parse_example_spec(self): 2401 return self.config 2402 2403 @property 2404 def _num_buckets(self): 2405 return self.length 2406 2407 def _get_sparse_tensors(self, inputs, weight_collections=None, 2408 trainable=None): 2409 del weight_collections 2410 del trainable 2411 return fc_core._CategoricalColumn.IdWeightPair(inputs.get(self), None) # pylint: disable=protected-access 2412 2413 2414class _LazyBuilderByColumnsToTensor(object): 2415 2416 def __init__(self, columns_to_tensors): 2417 self._columns_to_tensors = columns_to_tensors 2418 2419 def get(self, key): 2420 """Gets the transformed feature column.""" 2421 if key in self._columns_to_tensors: 2422 return self._columns_to_tensors[key] 2423 if isinstance(key, str): 2424 raise ValueError( 2425 "features dictionary doesn't contain key ({})".format(key)) 2426 if not isinstance(key, _FeatureColumn): 2427 raise TypeError('"key" must be either a "str" or "_FeatureColumn". ' 2428 "Provided: {}".format(key)) 2429 2430 key.insert_transformed_feature(self._columns_to_tensors) 2431 return self._columns_to_tensors[key] 2432 2433 2434def crossed_column(columns, hash_bucket_size, combiner="sum", 2435 ckpt_to_load_from=None, 2436 tensor_name_in_ckpt=None, 2437 hash_key=None): 2438 """Creates a _CrossedColumn for performing feature crosses. 2439 2440 Args: 2441 columns: An iterable of _FeatureColumn. Items can be an instance of 2442 _SparseColumn, _CrossedColumn, or _BucketizedColumn. 2443 hash_bucket_size: An int that is > 1. The number of buckets. 2444 combiner: A string specifying how to reduce if there are multiple entries 2445 in a single row. Currently "mean", "sqrtn" and "sum" are supported, with 2446 "sum" the default. "sqrtn" often achieves good accuracy, in particular 2447 with bag-of-words columns. 
      with bag-of-words columns. Each of these can be thought of as an
      example-level normalization on the column:
        * "sum": do not normalize
        * "mean": do l1 normalization
        * "sqrtn": do l2 normalization
      For more information: `tf.embedding_lookup_sparse`.
    ckpt_to_load_from: (Optional). String representing checkpoint name/pattern
      to restore the column weights. Required if `tensor_name_in_ckpt` is not
      None.
    tensor_name_in_ckpt: (Optional). Name of the `Tensor` in the provided
      checkpoint from which to restore the column weights. Required if
      `ckpt_to_load_from` is not None.
    hash_key: (Optional). The hash_key that will be used by the
      `FingerprintCat64` function to combine fingerprints in the
      `SparseFeatureCrossOp`.

  Returns:
    A _CrossedColumn.

  Raises:
    TypeError: if any item in columns is not an instance of _SparseColumn,
      _CrossedColumn, or _BucketizedColumn, or if hash_bucket_size is not an
      int.
    ValueError: if hash_bucket_size is not > 1 or len(columns) is not > 1.
  """
  return _CrossedColumn(
      columns,
      hash_bucket_size,
      hash_key,
      combiner=combiner,
      ckpt_to_load_from=ckpt_to_load_from,
      tensor_name_in_ckpt=tensor_name_in_ckpt)
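

# An illustrative sketch (columns are hypothetical): cross a sparse country
# column with bucketized income so a linear model can learn a weight per
# (country, income bucket) pair.
#
#   country = sparse_column_with_hash_bucket("country", hash_bucket_size=100)
#   income_buckets = bucketized_column(
#       real_valued_column("income"), boundaries=[20000, 50000, 100000])
#   country_x_income = crossed_column(
#       [country, income_buckets], hash_bucket_size=10000)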


class DataFrameColumn(_FeatureColumn,
                      collections.namedtuple("DataFrameColumn",
                                             ["column_name", "series"])):
  """Represents a feature column produced from a `DataFrame`.

  Instances of this class are immutable. A `DataFrame` column may be dense or
  sparse, and may have any shape, with the constraint that dimension 0 is
  batch_size.

  Args:
    column_name: a name for this column
    series: a `Series` to be wrapped, which has already had its base features
      substituted with `PredefinedSeries`.
  """

  def __new__(cls, column_name, series):
    return super(DataFrameColumn, cls).__new__(cls, column_name, series)

  @property
  def name(self):
    return self.column_name

  @property
  def config(self):
    return self.series.required_base_features()

  @property
  def key(self):
    """Returns a string which will be used as a key when we do sorting."""
    return self.name

  def insert_transformed_feature(self, columns_to_tensors):
    # The cache must already contain mappings from the expected base feature
    # names to Tensors.

    # Passing columns_to_tensors as the cache here means that multiple outputs
    # of the transform will be cached, keyed by the repr of their associated
    # TransformedSeries.
    # The specific requested output ends up in columns_to_tensors twice: once
    # keyed by the TransformedSeries repr, and once keyed by this
    # DataFrameColumn instance.
    columns_to_tensors[self] = self.series.build(columns_to_tensors)

  # pylint: disable=unused-argument
  def _to_dnn_input_layer(self,
                          input_tensor,
                          weight_collections=None,
                          trainable=True,
                          output_rank=2):
    if input_tensor.dtype != dtypes.float32:
      input_tensor = math_ops.cast(input_tensor, dtypes.float32)
    return _reshape_real_valued_tensor(input_tensor, output_rank, self.name)

  def _to_dense_tensor(self, input_tensor):
    return self._to_dnn_input_layer(input_tensor)

  def __eq__(self, other):
    if isinstance(other, self.__class__):
      return self.__dict__ == other.__dict__
    else:
      return False

  def __ne__(self, other):
    return not self.__eq__(other)


def _get_feature_config(feature_column):
  """Returns configuration for the base feature defined in feature_column."""
  if not isinstance(feature_column, _FeatureColumn):
    raise TypeError(
        "feature_columns should only contain instances of _FeatureColumn. "
        "Given column is {}".format(feature_column))
  if isinstance(feature_column, (_SparseColumn, _WeightedSparseColumn,
                                 _EmbeddingColumn, _RealValuedColumn,
                                 _RealValuedVarLenColumn,
                                 _BucketizedColumn, _CrossedColumn,
                                 _OneHotColumn, _ScatteredEmbeddingColumn)):
    return feature_column.config

  raise TypeError("Not supported _FeatureColumn type. "
                  "Given column is {}".format(feature_column))


def create_feature_spec_for_parsing(feature_columns):
  """Helper that prepares features config from input feature_columns.

  The returned feature config can be used as arg 'features' in tf.parse_example.

  Typical usage example:

  ```python
  # Define features and transformations
  feature_a = sparse_column_with_vocabulary_file(...)
  feature_b = real_valued_column(...)
  feature_c_bucketized = bucketized_column(real_valued_column("feature_c"), ...)
  feature_a_x_feature_c = crossed_column(
      columns=[feature_a, feature_c_bucketized], ...)

  feature_columns = set(
      [feature_b, feature_c_bucketized, feature_a_x_feature_c])
  batch_examples = tf.parse_example(
      serialized=serialized_examples,
      features=create_feature_spec_for_parsing(feature_columns))
  ```

  For the above example, create_feature_spec_for_parsing would return the dict:
  {
      "feature_a": parsing_ops.VarLenFeature(tf.string),
      "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
      "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
  }

  Args:
    feature_columns: An iterable containing all the feature columns. All items
      should be instances of classes derived from _FeatureColumn, unless
      feature_columns is a dict -- in which case, this should be true of all
      values in the dict.
  Returns:
    A dict mapping feature keys to FixedLenFeature or VarLenFeature values.
  """
  if isinstance(feature_columns, dict):
    feature_columns = feature_columns.values()

  features_config = {}
  for column in feature_columns:
    features_config.update(_get_feature_config(column))
  return features_config


def _create_sequence_feature_spec_for_parsing(sequence_feature_columns,
                                              allow_missing_by_default=False):
  """Prepares a feature spec for parsing `tf.SequenceExample`s.

  Args:
    sequence_feature_columns: an iterable containing all the feature columns.
      All items should be instances of classes derived from `_FeatureColumn`.
    allow_missing_by_default: whether to set `allow_missing=True` by default for
      `FixedLenSequenceFeature`s.
  Returns:
    A dict mapping feature keys to `FixedLenSequenceFeature` or `VarLenFeature`.
  """
  feature_spec = create_feature_spec_for_parsing(sequence_feature_columns)
  sequence_feature_spec = {}
  for key, feature in feature_spec.items():
    if isinstance(feature, parsing_ops.VarLenFeature):
      sequence_feature = feature
    elif (isinstance(feature, parsing_ops.FixedLenFeature) or
          isinstance(feature, parsing_ops.FixedLenSequenceFeature)):
      default_is_set = feature.default_value is not None
      if default_is_set:
        logging.warning(
            'Found default value {} for feature "{}". Ignoring this value and '
            'setting `allow_missing=True` instead.'.
            format(feature.default_value, key))
      sequence_feature = parsing_ops.FixedLenSequenceFeature(
          shape=feature.shape,
          dtype=feature.dtype,
          allow_missing=(allow_missing_by_default or default_is_set))
    else:
      raise TypeError(
          "Unsupported feature type: {}".format(type(feature).__name__))
    sequence_feature_spec[key] = sequence_feature
  return sequence_feature_spec
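

# An illustrative sketch of the spec conversion above (the column name is
# hypothetical): a fixed-length numeric column with a default value becomes a
# `FixedLenSequenceFeature` with `allow_missing=True`, so every frame of a
# `tf.SequenceExample` is parsed with the same per-step spec.
#
#   spec = _create_sequence_feature_spec_for_parsing(
#       [real_valued_column("velocity", dimension=2, default_value=0.0)])
#   # spec == {"velocity": parsing_ops.FixedLenSequenceFeature(
#   #     shape=[2], dtype=dtypes.float32, allow_missing=True)}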


def make_place_holder_tensors_for_base_features(feature_columns):
  """Returns placeholder tensors for inference.

  Args:
    feature_columns: An iterable containing all the feature columns. All items
      should be instances of classes derived from _FeatureColumn.
  Returns:
    A dict mapping feature keys to SparseTensors (sparse columns) or
    placeholder Tensors (dense columns).
  """
  # Get dict mapping features to FixedLenFeature or VarLenFeature values.
  dict_for_parse_example = create_feature_spec_for_parsing(feature_columns)
  placeholders = {}
  for column_name, column_type in dict_for_parse_example.items():
    if isinstance(column_type, parsing_ops.VarLenFeature):
      # Sparse placeholder for sparse tensors.
      placeholders[column_name] = array_ops.sparse_placeholder(
          column_type.dtype, name="Placeholder_{}".format(column_name))
    else:
      # Simple placeholder for dense tensors.
      placeholders[column_name] = array_ops.placeholder(
          column_type.dtype,
          shape=(None, column_type.shape[0]),
          name="Placeholder_{}".format(column_name))
  return placeholders
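

# An illustrative sketch for serving/inference (columns are hypothetical):
#
#   feature_columns = [real_valued_column("age"),
#                      sparse_column_with_hash_bucket("query", 1000)]
#   placeholders = make_place_holder_tensors_for_base_features(feature_columns)
#   # placeholders["age"] is a dense placeholder of shape (None, 1);
#   # placeholders["query"] is a sparse placeholder of dtype tf.string.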
2693 """ 2694 2695 def __new__(cls, 2696 vocabulary_file=None, 2697 keys=None, 2698 num_oov_buckets=0, 2699 vocab_size=None, 2700 default_value=-1): 2701 2702 return super(_SparseIdLookupConfig, cls).__new__(cls, vocabulary_file, keys, 2703 num_oov_buckets, 2704 vocab_size, default_value) 2705