# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
15"""This API defines FeatureColumn abstraction.
16
17FeatureColumns provide a high level abstraction for ingesting and representing
18features. FeatureColumns are also the primary way of encoding features for
19canned `tf.estimator.Estimator`s.
20
21When using FeatureColumns with `Estimators`, the type of feature column you
22should choose depends on (1) the feature type and (2) the model type.
23
241. Feature type:
25
26  * Continuous features can be represented by `numeric_column`.
27  * Categorical features can be represented by any `categorical_column_with_*`
28  column:
29    - `categorical_column_with_vocabulary_list`
30    - `categorical_column_with_vocabulary_file`
31    - `categorical_column_with_hash_bucket`
32    - `categorical_column_with_identity`
33    - `weighted_categorical_column`
34
352. Model type:
36
37  * Deep neural network models (`DNNClassifier`, `DNNRegressor`).
38
39    Continuous features can be directly fed into deep neural network models.
40
41      age_column = numeric_column("age")
42
43    To feed sparse features into DNN models, wrap the column with
44    `embedding_column` or `indicator_column`. `indicator_column` is recommended
45    for features with only a few possible values. For features with many
46    possible values, to reduce the size of your model, `embedding_column` is
47    recommended.

      embedded_dept_column = embedding_column(
          categorical_column_with_vocabulary_list(
              "department", ["math", "philosophy", ...]), dimension=10)

  * Wide (aka linear) models (`LinearClassifier`, `LinearRegressor`).

    Sparse features can be fed directly into linear models. They behave like an
    indicator column but with an efficient implementation.

      dept_column = categorical_column_with_vocabulary_list("department",
          ["math", "philosophy", "english"])

    It is recommended that continuous features be bucketized before being
    fed into linear models.

      bucketized_age_column = bucketized_column(
          source_column=age_column,
          boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    Sparse features can be crossed (also known as conjoined or combined) in
    order to form non-linearities, and then fed into linear models.

      cross_dept_age_column = crossed_column(
          columns=["department", bucketized_age_column],
          hash_bucket_size=1000)

Example of building canned `Estimator`s using FeatureColumns:

  ```python
  # Define features and transformations
  deep_feature_columns = [age_column, embedded_dept_column]
  wide_feature_columns = [dept_column, bucketized_age_column,
      cross_dept_age_column]

  # Build deep model
  estimator = DNNClassifier(
      feature_columns=deep_feature_columns,
      hidden_units=[500, 250, 50])
  estimator.train(...)

  # Or build a wide model
  estimator = LinearClassifier(
      feature_columns=wide_feature_columns)
  estimator.train(...)

  # Or build a wide and deep model!
  estimator = DNNLinearCombinedClassifier(
      linear_feature_columns=wide_feature_columns,
      dnn_feature_columns=deep_feature_columns,
      dnn_hidden_units=[500, 250, 50])
  estimator.train(...)
  ```


FeatureColumns can also be transformed into a generic input layer for
custom models using `input_layer`.

Example of building a model using FeatureColumns; this can be used in a
`model_fn`, which is given to the `tf.estimator.Estimator`:

  ```python
  # Building model via layers

  deep_feature_columns = [age_column, embedded_dept_column]
  columns_to_tensor = parse_feature_columns_from_examples(
      serialized=my_data,
      feature_columns=deep_feature_columns)
  first_layer = input_layer(
      features=columns_to_tensor,
      feature_columns=deep_feature_columns)
  second_layer = fully_connected(first_layer, ...)
  ```

NOTE: Functions prefixed with "_" indicate experimental or private parts of
the API subject to change, and should not be relied upon!

NOTE: The new feature columns are being developed in feature_column_v2.py and
largely duplicate the code here. Please make sure to update logic in both
places.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import abc
import collections
import math

import numpy as np
import six

from tensorflow.python.eager import context
from tensorflow.python.feature_column import utils as fc_utils
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
from tensorflow.python.framework import tensor_shape
from tensorflow.python.keras.engine import training
from tensorflow.python.layers import base
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import embedding_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import lookup_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import resource_variable_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.ops import string_ops
from tensorflow.python.ops import template
from tensorflow.python.ops import variable_scope
from tensorflow.python.ops import variables
from tensorflow.python.platform import gfile
from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.training import checkpoint_utils
from tensorflow.python.util import nest
from tensorflow.python.util.tf_export import tf_export


def _internal_input_layer(features,
                          feature_columns,
                          weight_collections=None,
                          trainable=True,
                          cols_to_vars=None,
                          scope=None,
                          cols_to_output_tensors=None,
                          from_template=False):
  """See input_layer. `scope` is a name or variable scope to use."""

  feature_columns = _normalize_feature_columns(feature_columns)
  for column in feature_columns:
    if not isinstance(column, _DenseColumn):
      raise ValueError(
          'Items of feature_columns must be a _DenseColumn. '
          'You can wrap a categorical column with an '
          'embedding_column or indicator_column. Given: {}'.format(column))
  weight_collections = list(weight_collections or [])
  if ops.GraphKeys.GLOBAL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
  if ops.GraphKeys.MODEL_VARIABLES not in weight_collections:
    weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)

  def _get_logits():  # pylint: disable=missing-docstring
    builder = _LazyBuilder(features)
    output_tensors = []
    ordered_columns = []
    for column in sorted(feature_columns, key=lambda x: x.name):
      ordered_columns.append(column)
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name):  # pylint: disable=protected-access
        tensor = column._get_dense_tensor(  # pylint: disable=protected-access
            builder,
            weight_collections=weight_collections,
            trainable=trainable)
        num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
        batch_size = array_ops.shape(tensor)[0]
        output_tensor = array_ops.reshape(
            tensor, shape=(batch_size, num_elements))
        output_tensors.append(output_tensor)
        if cols_to_vars is not None:
          # Retrieve any variables created (some _DenseColumns don't create
          # variables, in which case an empty list is returned).
          cols_to_vars[column] = ops.get_collection(
              ops.GraphKeys.GLOBAL_VARIABLES,
              scope=variable_scope.get_variable_scope().name)
        if cols_to_output_tensors is not None:
          cols_to_output_tensors[column] = output_tensor
    _verify_static_batch_size_equality(output_tensors, ordered_columns)
    return array_ops.concat(output_tensors, 1)

  # If we're constructing from `make_template`, which by default adds a
  # variable scope with the name of the layer, we don't want to add another
  # `variable_scope`, as that would break checkpoints.
  if from_template:
    return _get_logits()
  else:
    with variable_scope.variable_scope(
        scope, default_name='input_layer', values=features.values()):
      return _get_logits()


@tf_export(v1=['feature_column.input_layer'])
def input_layer(features,
                feature_columns,
                weight_collections=None,
                trainable=True,
                cols_to_vars=None,
                cols_to_output_tensors=None):
  """Returns a dense `Tensor` as input layer based on given `feature_columns`.

  Generally a single example in training data is described with FeatureColumns.
  At the first layer of the model, this column-oriented data should be
  converted to a single `Tensor`.

  Example:

  ```python
  price = numeric_column('price')
  keywords_embedded = embedding_column(
      categorical_column_with_hash_bucket("keywords", 10000), dimension=16)
  columns = [price, keywords_embedded, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  for units in [128, 64, 32]:
    dense_tensor = tf.layers.dense(dense_tensor, units, tf.nn.relu)
  prediction = tf.layers.dense(dense_tensor, 1)
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
      keys. For example `numeric_column('price')` will look at 'price' key in
      this dict. Values can be a `SparseTensor` or a `Tensor`, depending on the
      corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as inputs
      to your model. All items should be instances of classes derived from
      `_DenseColumn` such as `numeric_column`, `embedding_column`,
      `bucketized_column`, `indicator_column`. If you have categorical features,
      you can wrap them with an `embedding_column` or `indicator_column`.
    weight_collections: A list of collection names to which the Variable will be
      added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `tf.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled with a
      mapping from `_FeatureColumn` to list of `Variable`s.  For example, after
      the call, we might have cols_to_vars =
      {_EmbeddingColumn(
        categorical_column=_HashedCategoricalColumn(
          key='sparse_feature', hash_bucket_size=5, dtype=tf.string),
        dimension=10): [<tf.Variable 'some_variable:0' shape=(5, 10)>,
                        <tf.Variable 'some_variable:1' shape=(5, 10)>]}
      If a column creates no variables, its value will be an empty list.
    cols_to_output_tensors: If not `None`, must be a dictionary that will be
      filled with a mapping from '_FeatureColumn' to the associated
      output `Tensor`s.

  Returns:
    A `Tensor` which represents the input layer of a model. Its shape
    is (batch_size, first_layer_dimension) and its dtype is `float32`.
    first_layer_dimension is determined based on given `feature_columns`.

  Raises:
    ValueError: if an item in `feature_columns` is not a `_DenseColumn`.
  """
  return _internal_input_layer(
      features,
      feature_columns,
      weight_collections=weight_collections,
      trainable=trainable,
      cols_to_vars=cols_to_vars,
      cols_to_output_tensors=cols_to_output_tensors)


# TODO(akshayka): InputLayer should be a subclass of Layer, and it
# should implement the logic in input_layer using Layer's build-and-call
# paradigm; input_layer should create an instance of InputLayer and
# return the result of invoking its apply method, just as functional layers do.
class InputLayer(object):
  """An object-oriented version of `input_layer` that reuses variables."""

  def __init__(self,
               feature_columns,
               weight_collections=None,
               trainable=True,
               cols_to_vars=None,
               name='feature_column_input_layer',
               create_scope_now=True):
    """See `input_layer`."""

    self._feature_columns = feature_columns
    self._weight_collections = weight_collections
    self._trainable = trainable
    self._cols_to_vars = cols_to_vars
    self._name = name
    self._input_layer_template = template.make_template(
        self._name, _internal_input_layer, create_scope_now_=create_scope_now)
    self._scope = self._input_layer_template.variable_scope

  def __call__(self, features):
    return self._input_layer_template(
        features=features,
        feature_columns=self._feature_columns,
        weight_collections=self._weight_collections,
        trainable=self._trainable,
        cols_to_vars=None,
        from_template=True)

  @property
  def name(self):
    return self._name

  @property
  def non_trainable_variables(self):
    return self._input_layer_template.non_trainable_variables

  @property
  def non_trainable_weights(self):
    return self._input_layer_template.non_trainable_weights

  @property
  def trainable_variables(self):
    return self._input_layer_template.trainable_variables

  @property
  def trainable_weights(self):
    return self._input_layer_template.trainable_weights

  @property
  def variables(self):
    return self._input_layer_template.variables

  @property
  def weights(self):
    return self._input_layer_template.weights


@tf_export(v1=['feature_column.linear_model'])
def linear_model(features,
                 feature_columns,
                 units=1,
                 sparse_combiner='sum',
                 weight_collections=None,
                 trainable=True,
                 cols_to_vars=None):
  """Returns a linear prediction `Tensor` based on given `feature_columns`.

  This function generates a weighted sum based on output dimension `units`.
  Weighted sum refers to logits in classification problems. It refers to the
  prediction itself for linear regression problems.

  Note on supported columns: `linear_model` treats categorical columns as
  `indicator_column`s. To be specific, assume the input `SparseTensor` looks
  like:

  ```python
    shape = [2, 2]
    {
        [0, 0]: "a"
        [1, 0]: "b"
        [1, 1]: "c"
    }
  ```
  `linear_model` assigns weights for the presence of "a", "b", "c" implicitly,
  just like `indicator_column`, while `input_layer` explicitly requires wrapping
  each of the categorical columns with an `embedding_column` or an
  `indicator_column`.

  Example of usage:

  ```python
  price = numeric_column('price')
  price_buckets = bucketized_column(price, boundaries=[0., 10., 100., 1000.])
  keywords = categorical_column_with_hash_bucket("keywords", 10000)
  keywords_price = crossed_column(['keywords', price_buckets], ...)
  columns = [price_buckets, keywords, keywords_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  prediction = linear_model(features, columns)
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
      keys. For example `numeric_column('price')` will look at 'price' key in
      this dict. Values are `Tensor` or `SparseTensor` depending on the
      corresponding `_FeatureColumn`.
    feature_columns: An iterable containing the FeatureColumns to use as inputs
      to your model. All items should be instances of classes derived from
      `_FeatureColumn`.
    units: An integer, dimensionality of the output space. Default value is 1.
    sparse_combiner: A string specifying how to reduce if a categorical column
      is multivalent. Except for `numeric_column`, almost all columns passed to
      `linear_model` are treated as categorical columns. It combines each
      categorical column independently. Currently "mean", "sqrtn" and "sum" are
      supported, with "sum" the default for linear models. "sqrtn" often
      achieves good accuracy, in particular with bag-of-words columns.
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For example, for two features represented as the categorical columns:

      ```python
        # Feature 1

        shape = [2, 2]
        {
            [0, 0]: "a"
            [0, 1]: "b"
            [1, 0]: "c"
        }

        # Feature 2

        shape = [2, 3]
        {
            [0, 0]: "d"
            [1, 0]: "e"
            [1, 1]: "f"
            [1, 2]: "f"
        }
      ```
      with `sparse_combiner` set to "mean", the linear model outputs are:
      ```
        y_0 = 1.0 / 2.0 * ( w_a + w_b ) + w_d + b
        y_1 = w_c + 1.0 / 3.0 * ( w_e + 2.0 * w_f ) + b
      ```
      where `y_i` is the output, `b` is the bias, and `w_x` is the weight
      assigned to the presence of `x` in the input features.
    weight_collections: A list of collection names to which the Variable will be
      added. Note that variables will also be added to collections
      `tf.GraphKeys.GLOBAL_VARIABLES` and `tf.GraphKeys.MODEL_VARIABLES`.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    cols_to_vars: If not `None`, must be a dictionary that will be filled with a
      mapping from `_FeatureColumn` to associated list of `Variable`s.  For
      example, after the call, we might have cols_to_vars = {
        _NumericColumn(
          key='numeric_feature1', shape=(1,)):
        [<tf.Variable 'linear_model/price2/weights:0' shape=(1, 1)>],
        'bias': [<tf.Variable 'linear_model/bias_weights:0' shape=(1,)>],
        _NumericColumn(
          key='numeric_feature2', shape=(2,)):
        [<tf.Variable 'linear_model/price1/weights:0' shape=(2, 1)>]}
      If a column creates no variables, its value will be an empty list. Note
      that cols_to_vars will also contain a string key 'bias' that maps to a
      list of Variables.

  Returns:
    A `Tensor` which represents predictions/logits of a linear model. Its shape
    is (batch_size, units) and its dtype is `float32`.

  Raises:
    ValueError: if an item in `feature_columns` is neither a `_DenseColumn`
      nor `_CategoricalColumn`.
  """
  with variable_scope.variable_scope(None, 'linear_model') as vs:
    model_name = _strip_leading_slashes(vs.name)
  linear_model_layer = _LinearModel(
      feature_columns=feature_columns,
      units=units,
      sparse_combiner=sparse_combiner,
      weight_collections=weight_collections,
      trainable=trainable,
      name=model_name)
  retval = linear_model_layer(features)  # pylint: disable=not-callable
  if cols_to_vars is not None:
    cols_to_vars.update(linear_model_layer.cols_to_vars())
  return retval


def _add_to_collections(var, weight_collections):
  """Adds a var to the list of weight_collections provided.

  Handles the case for partitioned and non-partitioned variables.

  Args:
    var: A `Variable` or `PartitionedVariable`.
    weight_collections: List of collections to add the variable to.
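
  For example, a minimal sketch (`my_var` stands for any hypothetical variable
  created by a feature column):

  ```python
  _add_to_collections(my_var, ['my_collection'])
  # my_var (or each of its partitions, if it is a PartitionedVariable) is
  # now retrievable via ops.get_collection('my_collection').
  ```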
  """
  for weight_collection in weight_collections:
    # The layer self.add_variable call already adds it to GLOBAL_VARIABLES.
    if weight_collection == ops.GraphKeys.GLOBAL_VARIABLES:
      continue
    # TODO(rohanj): Explore adding a _get_variable_list method on `Variable`
    # so that we don't have to do this check.
    if isinstance(var, variables.PartitionedVariable):
      for constituent_var in list(var):
        ops.add_to_collection(weight_collection, constituent_var)
    else:
      ops.add_to_collection(weight_collection, var)


class _FCLinearWrapper(base.Layer):
  """Wraps a _FeatureColumn in a layer for use in a linear model.

  See `linear_model` above.
  """

  def __init__(self,
               feature_column,
               units=1,
               sparse_combiner='sum',
               weight_collections=None,
               trainable=True,
               name=None,
               **kwargs):
    super(_FCLinearWrapper, self).__init__(
        trainable=trainable, name=name, **kwargs)
    self._feature_column = feature_column
    self._units = units
    self._sparse_combiner = sparse_combiner
    self._weight_collections = weight_collections

  def build(self, _):
    if isinstance(self._feature_column, _CategoricalColumn):
      weight = self.add_variable(
          name='weights',
          shape=(self._feature_column._num_buckets, self._units),  # pylint: disable=protected-access
          initializer=init_ops.zeros_initializer(),
          trainable=self.trainable)
    else:
      num_elements = self._feature_column._variable_shape.num_elements()  # pylint: disable=protected-access
      weight = self.add_variable(
          name='weights',
          shape=[num_elements, self._units],
          initializer=init_ops.zeros_initializer(),
          trainable=self.trainable)
    _add_to_collections(weight, self._weight_collections)
    self._weight_var = weight
    self.built = True

  def call(self, builder):
    weighted_sum = _create_weighted_sum(
        column=self._feature_column,
        builder=builder,
        units=self._units,
        sparse_combiner=self._sparse_combiner,
        weight_collections=self._weight_collections,
        trainable=self.trainable,
        weight_var=self._weight_var)
    return weighted_sum


class _BiasLayer(base.Layer):
  """A layer for the bias term."""

  def __init__(self,
               units=1,
               trainable=True,
               weight_collections=None,
               name=None,
               **kwargs):
    super(_BiasLayer, self).__init__(trainable=trainable, name=name, **kwargs)
    self._units = units
    self._weight_collections = weight_collections

  def build(self, _):
    self._bias_variable = self.add_variable(
        'bias_weights',
        shape=[self._units],
        initializer=init_ops.zeros_initializer(),
        trainable=self.trainable)
    _add_to_collections(self._bias_variable, self._weight_collections)
    self.built = True

  def call(self, _):
    return self._bias_variable


def _get_expanded_variable_list(variable):
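  """Returns the list of variables underlying `variable`.

  A single (or resource) variable yields a one-element list; a
  `PartitionedVariable` yields the list of its constituent variables.
  """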
  if (isinstance(variable, variables.Variable) or
      resource_variable_ops.is_resource_variable(variable)):
    return [variable]  # Single variable case.
  else:  # Must be a PartitionedVariable, so convert into a list.
    return list(variable)


def _strip_leading_slashes(name):
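  """Returns the final component of a slash-delimited scope name.

  For example, 'outer_scope/linear_model' -> 'linear_model'.
  """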
  return name.rsplit('/', 1)[-1]


class _LinearModel(training.Model):
  """Creates a linear model using feature columns.

  See `linear_model` for details.
  """

  def __init__(self,
               feature_columns,
               units=1,
               sparse_combiner='sum',
               weight_collections=None,
               trainable=True,
               name=None,
               **kwargs):
    super(_LinearModel, self).__init__(name=name, **kwargs)
    self._feature_columns = _normalize_feature_columns(
        feature_columns)
    self._weight_collections = list(weight_collections or [])
    if ops.GraphKeys.GLOBAL_VARIABLES not in self._weight_collections:
      self._weight_collections.append(ops.GraphKeys.GLOBAL_VARIABLES)
    if ops.GraphKeys.MODEL_VARIABLES not in self._weight_collections:
      self._weight_collections.append(ops.GraphKeys.MODEL_VARIABLES)

    column_layers = {}
    for column in sorted(self._feature_columns, key=lambda x: x.name):
      with variable_scope.variable_scope(
          None, default_name=column._var_scope_name) as vs:  # pylint: disable=protected-access
        # Using the fully qualified variable scope name here would end up
        # doubly expressing the outer scope (the scope with which this method
        # was called) in the name of the variable that would get created.
        column_name = _strip_leading_slashes(vs.name)
      column_layer = _FCLinearWrapper(column, units, sparse_combiner,
                                      self._weight_collections, trainable,
                                      column_name, **kwargs)
      column_layers[column_name] = column_layer
    self._column_layers = self._add_layers(column_layers)
    self._bias_layer = _BiasLayer(
        units=units,
        trainable=trainable,
        weight_collections=self._weight_collections,
        name='bias_layer',
        **kwargs)
    self._cols_to_vars = {}

  def cols_to_vars(self):
    """Returns a dict mapping _FeatureColumns to variables.

    See `linear_model` for more information.
    This is not populated until `call` is invoked, i.e., until the layer is
    built.
    """
    return self._cols_to_vars

  def call(self, features):
    with variable_scope.variable_scope(self.name):
      for column in self._feature_columns:
        if not isinstance(column, (_DenseColumn, _CategoricalColumn)):
          raise ValueError(
              'Items of feature_columns must be either a '
              '_DenseColumn or _CategoricalColumn. Given: {}'.format(column))
      weighted_sums = []
      ordered_columns = []
      builder = _LazyBuilder(features)
      for layer in sorted(self._column_layers.values(), key=lambda x: x.name):
        column = layer._feature_column  # pylint: disable=protected-access
        ordered_columns.append(column)
        weighted_sum = layer(builder)
        weighted_sums.append(weighted_sum)
        self._cols_to_vars[column] = ops.get_collection(
            ops.GraphKeys.GLOBAL_VARIABLES, scope=layer.scope_name)

      _verify_static_batch_size_equality(weighted_sums, ordered_columns)
      predictions_no_bias = math_ops.add_n(
          weighted_sums, name='weighted_sum_no_bias')
      predictions = nn_ops.bias_add(
          predictions_no_bias,
          self._bias_layer(  # pylint: disable=not-callable
              builder,
              scope=variable_scope.get_variable_scope()),  # pylint: disable=not-callable
          name='weighted_sum')
      bias = self._bias_layer.variables[0]
      self._cols_to_vars['bias'] = _get_expanded_variable_list(bias)
    return predictions

  def _add_layers(self, layers):
    # "Magic" required for keras.Model classes to track all the variables in
    # a list of layers.Layer objects.
    # TODO(ashankar): Figure out API so user code doesn't have to do this.
    for name, layer in layers.items():
      setattr(self, 'layer-%s' % name, layer)
    return layers


def _transform_features(features, feature_columns):
  """Returns transformed features based on the feature columns passed in.

  Note that you most likely will not need to use this function. Check
  `input_layer` and `linear_model` to see whether they satisfy your use case.

  Example:

  ```python
  # Define features and transformations
  crosses_a_x_b = crossed_column(
      columns=["sparse_feature_a", "sparse_feature_b"], hash_bucket_size=10000)
  price_buckets = bucketized_column(
      source_column=numeric_column("price"), boundaries=[...])

  columns = [crosses_a_x_b, price_buckets]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  transformed = transform_features(features=features, feature_columns=columns)

  assertCountEqual(columns, transformed.keys())
  ```

  Args:
    features: A mapping from key to tensors. `_FeatureColumn`s look up via these
      keys. For example `numeric_column('price')` will look at 'price' key in
      this dict. Values can be a `SparseTensor` or a `Tensor`, depending on the
      corresponding `_FeatureColumn`.
    feature_columns: An iterable containing all the `_FeatureColumn`s.

  Returns:
    A `dict` mapping `_FeatureColumn` to `Tensor` and `SparseTensor` values.
  """
  feature_columns = _normalize_feature_columns(feature_columns)
  outputs = {}
  with ops.name_scope(
      None, default_name='transform_features', values=features.values()):
    builder = _LazyBuilder(features)
    for column in sorted(feature_columns, key=lambda x: x.name):
      with ops.name_scope(None, default_name=column.name):
        outputs[column] = builder.get(column)
  return outputs


@tf_export(v1=['feature_column.make_parse_example_spec'])
def make_parse_example_spec(feature_columns):
  """Creates parsing spec dictionary from input feature_columns.

  The returned dictionary can be used as the `features` argument to
  `tf.parse_example`.

  Typical usage example:

  ```python
  # Define features and transformations
  feature_a = categorical_column_with_vocabulary_file(...)
  feature_b = numeric_column(...)
  feature_c_bucketized = bucketized_column(numeric_column("feature_c"), ...)
  feature_a_x_feature_c = crossed_column(
      columns=["feature_a", feature_c_bucketized], ...)

  feature_columns = set(
      [feature_b, feature_c_bucketized, feature_a_x_feature_c])
  features = tf.parse_example(
      serialized=serialized_examples,
      features=make_parse_example_spec(feature_columns))
  ```

  For the above example, make_parse_example_spec would return the dict:

  ```python
  {
      "feature_a": parsing_ops.VarLenFeature(tf.string),
      "feature_b": parsing_ops.FixedLenFeature([1], dtype=tf.float32),
      "feature_c": parsing_ops.FixedLenFeature([1], dtype=tf.float32)
  }
  ```

  Args:
    feature_columns: An iterable containing all feature columns. All items
      should be instances of classes derived from `_FeatureColumn`.

  Returns:
    A dict mapping each feature key to a `FixedLenFeature` or `VarLenFeature`
    value.

  Raises:
    ValueError: If any of the given `feature_columns` is not a `_FeatureColumn`
      instance.
  """
  result = {}
  for column in feature_columns:
    if not isinstance(column, _FeatureColumn):
      raise ValueError(
          'All feature_columns must be _FeatureColumn instances. '
          'Given: {}'.format(column))
    config = column._parse_example_spec  # pylint: disable=protected-access
    for key, value in six.iteritems(config):
      if key in result and value != result[key]:
        raise ValueError(
            'feature_columns contain different parse_spec for key '
            '{}. Given {} and {}'.format(key, value, result[key]))
    result.update(config)
  return result


def _embedding_column(categorical_column,
                      dimension,
                      combiner='mean',
                      initializer=None,
                      ckpt_to_load_from=None,
                      tensor_name_in_ckpt=None,
                      max_norm=None,
                      trainable=True):
  """`_DenseColumn` that converts from sparse, categorical input.

  Use this when your inputs are sparse, but you want to convert them to a dense
  representation (e.g., to feed to a DNN).

  Inputs must be a `_CategoricalColumn` created by any of the
  `categorical_column_*` functions. Here is an example of using
  `embedding_column` with `DNNClassifier`:

  ```python
  video_id = categorical_column_with_identity(
      key='video_id', num_buckets=1000000, default_value=0)
  columns = [embedding_column(video_id, 9),...]

  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)

  label_column = ...
  def input_fn():
    features = tf.parse_example(
        ..., features=make_parse_example_spec(columns + [label_column]))
    labels = features.pop(label_column.name)
    return features, labels

  estimator.train(input_fn=input_fn, steps=100)
  ```

  Here is an example using `embedding_column` with model_fn:

  ```python
  def model_fn(features, ...):
    video_id = categorical_column_with_identity(
        key='video_id', num_buckets=1000000, default_value=0)
    columns = [embedding_column(video_id, 9),...]
    dense_tensor = input_layer(features, columns)
    # Form DNN layers, calculate loss, and return EstimatorSpec.
    ...
  ```

  Args:
    categorical_column: A `_CategoricalColumn` created by a
      `categorical_column_with_*` function. This column produces the sparse IDs
      that are inputs to the embedding lookup.
    dimension: An integer specifying dimension of the embedding, must be > 0.
    combiner: A string specifying how to reduce if there are multiple entries
      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
      with bag-of-words columns. Each of these can be thought of as an example
      level normalization on the column. For more information, see
      `tf.embedding_lookup_sparse`.
    initializer: A variable initializer function to be used in embedding
      variable initialization. If not specified, defaults to
      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
      `1/sqrt(dimension)`.
    ckpt_to_load_from: String representing checkpoint name/pattern from which to
      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
      which to restore the column weights. Required if `ckpt_to_load_from` is
      not `None`.
    max_norm: If not `None`, embedding values are l2-normalized to this value.
    trainable: Whether or not the embedding is trainable. Default is True.

  Returns:
    `_DenseColumn` that converts from sparse input.

  Raises:
    ValueError: if `dimension` not > 0.
    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
      is specified.
    ValueError: if `initializer` is specified and is not callable.
    RuntimeError: If eager execution is enabled.
  """
  if (dimension is None) or (dimension < 1):
    raise ValueError('Invalid dimension {}.'.format(dimension))
  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
    raise ValueError('Must specify both `ckpt_to_load_from` and '
                     '`tensor_name_in_ckpt` or none of them.')

  if (initializer is not None) and (not callable(initializer)):
    raise ValueError('initializer must be callable if specified. '
                     'Embedding of column_name: {}'.format(
                         categorical_column.name))
  if initializer is None:
    initializer = init_ops.truncated_normal_initializer(
        mean=0.0, stddev=1 / math.sqrt(dimension))

  embedding_shape = categorical_column._num_buckets, dimension  # pylint: disable=protected-access

  def _creator(weight_collections, scope):
    embedding_column_layer = _EmbeddingColumnLayer(
        embedding_shape=embedding_shape,
        initializer=initializer,
        weight_collections=weight_collections,
        trainable=trainable,
        name='embedding_column_layer')
    return embedding_column_layer(None, scope=scope)  # pylint: disable=not-callable

  return _EmbeddingColumn(
      categorical_column=categorical_column,
      dimension=dimension,
      combiner=combiner,
      layer_creator=_creator,
      ckpt_to_load_from=ckpt_to_load_from,
      tensor_name_in_ckpt=tensor_name_in_ckpt,
      max_norm=max_norm,
      trainable=trainable)


def _numeric_column(key,
                    shape=(1,),
                    default_value=None,
                    dtype=dtypes.float32,
                    normalizer_fn=None):
  """Represents real valued or numerical features.

  Example:

  ```python
  price = numeric_column('price')
  columns = [price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)

  # or
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```
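
  A minimal sketch of `default_value` with a multi-element `shape` (a
  hypothetical 'size' feature whose missing values should parse to
  `[0.0, 0.0]`):

  ```python
  size_column = numeric_column('size', shape=(2,), default_value=[0.0, 0.0])
  ```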

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    shape: An iterable of integers specifying the shape of the `Tensor`. A
      single integer may also be given, meaning a single-dimension `Tensor`
      with the given width. The `Tensor` representing the column will have the
      shape of [batch_size] + `shape`.
    default_value: A single value compatible with `dtype` or an iterable of
      values compatible with `dtype` which the column takes on during
      `tf.Example` parsing if data is missing. A default value of `None` will
      cause `tf.parse_example` to fail if an example does not contain this
      column. If a single value is provided, the same value will be applied as
      the default value for every item. If an iterable of values is provided,
      the shape of the `default_value` should be equal to the given `shape`.
    dtype: defines the type of values. Default value is `tf.float32`. Must be a
      non-quantized, real integer or floating point type.
    normalizer_fn: If not `None`, a function that can be used to normalize the
      value of the tensor after `default_value` is applied for parsing.
      Normalizer function takes the input `Tensor` as its argument, and returns
      the output `Tensor` (e.g. lambda x: (x - 3.0) / 4.2). Please note that
      even though the most common use case of this function is normalization, it
      can be used for any kind of TensorFlow transformation.

  Returns:
    A `_NumericColumn`.

  Raises:
    TypeError: if any dimension in shape is not an int
    ValueError: if any dimension in shape is not a positive integer
    TypeError: if `default_value` is an iterable but not compatible with `shape`
    TypeError: if `default_value` is not compatible with `dtype`.
    ValueError: if `dtype` is not convertible to `tf.float32`.
  """
  shape = _check_shape(shape, key)
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError('dtype must be convertible to float. '
                     'dtype: {}, key: {}'.format(dtype, key))
  default_value = fc_utils.check_default_value(
      shape, default_value, dtype, key)

  if normalizer_fn is not None and not callable(normalizer_fn):
    raise TypeError(
        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))

  fc_utils.assert_key_is_string(key)
  return _NumericColumn(
      key,
      shape=shape,
      default_value=default_value,
      dtype=dtype,
      normalizer_fn=normalizer_fn)


def _bucketized_column(source_column, boundaries):
  """Represents discretized dense input.

  Buckets include the left boundary, and exclude the right boundary. Namely,
  `boundaries=[0., 1., 2.]` generates buckets `(-inf, 0.)`, `[0., 1.)`,
  `[1., 2.)`, and `[2., +inf)`.

  For example, if the inputs are

  ```python
  boundaries = [0, 10, 100]
  input tensor = [[-5, 10000]
                  [150,   10]
                  [5,    100]]
  ```

  then the output will be

  ```python
  output = [[0, 3]
            [3, 2]
            [1, 3]]
  ```

  Example:

  ```python
  price = numeric_column('price')
  bucketized_price = bucketized_column(price, boundaries=[...])
  columns = [bucketized_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  columns = [bucketized_price, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  `bucketized_column` can also be crossed with another categorical column using
  `crossed_column`:

  ```python
  price = numeric_column('price')
  # bucketized_column converts numerical feature to a categorical one.
  bucketized_price = bucketized_column(price, boundaries=[...])
  # 'keywords' is a string feature.
  price_x_keywords = crossed_column([bucketized_price, 'keywords'], 50000)
  columns = [price_x_keywords, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Args:
    source_column: A one-dimensional dense column which is generated with
      `numeric_column`.
    boundaries: A sorted list or tuple of floats specifying the boundaries.

  Returns:
    A `_BucketizedColumn`.

  Raises:
    ValueError: If `source_column` is not a numeric column, or if it is not
      one-dimensional.
    ValueError: If `boundaries` is not a sorted list or tuple.
  """
  if not isinstance(source_column, _NumericColumn):
    raise ValueError(
        'source_column must be a column generated with numeric_column(). '
        'Given: {}'.format(source_column))
  if len(source_column.shape) > 1:
    raise ValueError(
        'source_column must be one-dimensional column. '
        'Given: {}'.format(source_column))
  if (not boundaries or
      not (isinstance(boundaries, list) or isinstance(boundaries, tuple))):
    raise ValueError('boundaries must be a sorted list.')
  for i in range(len(boundaries) - 1):
    if boundaries[i] >= boundaries[i + 1]:
      raise ValueError('boundaries must be a sorted list.')
  return _BucketizedColumn(source_column, tuple(boundaries))


def _categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
                                         dtype=dtypes.string):
  """Represents a sparse feature where ids are set by hashing.

  Use this when your sparse features are in string or integer format, and you
  want to distribute your inputs into a finite number of buckets by hashing:
  `output_id = Hash(input_feature_string) % bucket_size` for string-type input.
  For int-type input, the value is first converted to its string representation
  and then hashed by the same formula.
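
  As a rough sketch, the id computation on a dense string tensor is equivalent
  to the following (an illustration only; `input_feature_strings` is a
  hypothetical dense string `Tensor`, and the column itself also handles
  `SparseTensor` inputs and the int-to-string conversion):

  ```python
  ids = tf.string_to_hash_bucket_fast(input_feature_strings, hash_bucket_size)
  ```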

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
  and `''` for string, which will be dropped by this feature column.

  Example:

  ```python
  keywords = categorical_column_with_hash_bucket("keywords", 10000)
  columns = [keywords, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)

  # or
  keywords_embedded = embedding_column(keywords, 16)
  columns = [keywords_embedded, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    hash_bucket_size: An int > 0. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_HashedCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is `None` or less than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  if hash_bucket_size is None:
    raise ValueError('hash_bucket_size must be set. key: {}'.format(key))

  if hash_bucket_size < 1:
    raise ValueError('hash_bucket_size must be at least 1. '
                     'hash_bucket_size: {}, key: {}'.format(
                         hash_bucket_size, key))

  fc_utils.assert_key_is_string(key)
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))

  return _HashedCategoricalColumn(key, hash_bucket_size, dtype)


def _categorical_column_with_vocabulary_file(key,
                                             vocabulary_file,
                                             vocabulary_size=None,
                                             num_oov_buckets=0,
                                             default_value=None,
                                             dtype=dtypes.string):
  """A `_CategoricalColumn` with a vocabulary file.

  Use this when your inputs are in string or integer format, and you have a
  vocabulary file that maps each value to an integer ID. By default,
  out-of-vocabulary values are ignored. Use either (but not both) of
  `num_oov_buckets` and `default_value` to specify how to include
  out-of-vocabulary values.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
  and `''` for string, which will be dropped by this feature column.

  Example with `num_oov_buckets`:
  File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
  abbreviation. All inputs with values in that file are assigned an ID 0-49,
  corresponding to their line numbers. All other values are hashed and assigned
  an ID 50-54.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
      num_oov_buckets=5)
  columns = [states, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Example with `default_value`:
  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
  other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX'
  in input, and other values missing from the file, will be assigned ID 0. All
  others are assigned the corresponding line number 1-50.

  ```python
  states = categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
      default_value=0)
  columns = [states, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  And to make an embedding with either:

  ```python
  columns = [embedding_column(states, 3),...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  dense_tensor = input_layer(features, columns)
  ```

  Args:
    key: A unique string identifying the input feature. It is used as the
      column name and the dictionary key for feature parsing configs, feature
      `Tensor` objects, and feature columns.
    vocabulary_file: The vocabulary file name.
    vocabulary_size: Number of the elements in the vocabulary. This must be no
      greater than the length of `vocabulary_file`; if it is less, later values
      are ignored. If `None`, it is set to the length of `vocabulary_file`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
      the input value. A positive `num_oov_buckets` can not be specified with
      `default_value`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This can not be specified with a positive
      `num_oov_buckets`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `_CategoricalColumn` with a vocabulary file.

  Raises:
    ValueError: `vocabulary_file` is missing or cannot be opened.
    ValueError: `vocabulary_size` is missing or < 1.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: `dtype` is neither string nor integer.
  """
  if not vocabulary_file:
    raise ValueError('Missing vocabulary_file in {}.'.format(key))

  if vocabulary_size is None:
    if not gfile.Exists(vocabulary_file):
      raise ValueError('vocabulary_file in {} does not exist.'.format(key))

    with gfile.GFile(vocabulary_file) as f:
      vocabulary_size = sum(1 for _ in f)
    logging.info(
        'vocabulary_size = %d in %s is inferred from the number of elements '
        'in the vocabulary_file %s.', vocabulary_size, key, vocabulary_file)

  # `vocabulary_size` isn't required for lookup, but it is for `_num_buckets`.
  if vocabulary_size < 1:
    raise ValueError('Invalid vocabulary_size in {}.'.format(key))
  if num_oov_buckets:
    if default_value is not None:
      raise ValueError(
          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
              key))
    if num_oov_buckets < 0:
      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
          num_oov_buckets, key))
  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
  fc_utils.assert_key_is_string(key)
  return _VocabularyFileCategoricalColumn(
      key=key,
      vocabulary_file=vocabulary_file,
      vocabulary_size=vocabulary_size,
      num_oov_buckets=0 if num_oov_buckets is None else num_oov_buckets,
      default_value=-1 if default_value is None else default_value,
      dtype=dtype)


def _categorical_column_with_vocabulary_list(key,
                                             vocabulary_list,
                                             dtype=None,
                                             default_value=-1,
                                             num_oov_buckets=0):
  """A `_CategoricalColumn` with in-memory vocabulary.

  Use this when your inputs are in string or integer format, and you have an
  in-memory vocabulary mapping each value to an integer ID. By default,
  out-of-vocabulary values are ignored. Use either (but not both) of
  `num_oov_buckets` and `default_value` to specify how to include
  out-of-vocabulary values.

  For input dictionary `features`, `features[key]` is either `Tensor` or
  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
  and `''` for string, which will be dropped by this feature column.

  Example with `num_oov_buckets`:
  In the following example, each input in `vocabulary_list` is assigned an ID
  0-3 corresponding to its index (e.g., input 'B' produces output 2). All other
  inputs are hashed and assigned an ID 4-5.

  ```python
  colors = categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
      num_oov_buckets=2)
  columns = [colors, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
  ```

  Example with `default_value`:
  In the following example, each input in `vocabulary_list` is assigned an ID
  0-4 corresponding to its index (e.g., input 'B' produces output 3). All other
  inputs are assigned `default_value` 0.

  ```python
  colors = categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('X', 'R', 'G', 'B', 'Y'), default_value=0)
  columns = [colors, ...]
  features = tf.parse_example(..., features=make_parse_example_spec(columns))
  linear_prediction = linear_model(features, columns)
1304  ```
1305
1306  And to make an embedding with either:
1307
1308  ```python
1309  columns = [embedding_column(colors, 3),...]
1310  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1311  dense_tensor = input_layer(features, columns)
1312  ```
1313
1314  Args:
1315    key: A unique string identifying the input feature. It is used as the
1316      column name and the dictionary key for feature parsing configs, feature
1317      `Tensor` objects, and feature columns.
1318    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
1319      is mapped to the index of its value (if present) in `vocabulary_list`.
1320      Must be castable to `dtype`.
1321    dtype: The type of features. Only string and integer types are supported.
1322      If `None`, it will be inferred from `vocabulary_list`.
1323    default_value: The integer ID value to return for out-of-vocabulary feature
1324      values, defaults to `-1`. This can not be specified with a positive
1325      `num_oov_buckets`.
1326    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
1327      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
1328      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
1329      hash of the input value. A positive `num_oov_buckets` cannot be specified
1330      with `default_value`.
1331
1332  Returns:
1333    A `_CategoricalColumn` with in-memory vocabulary.
1334
1335  Raises:
1336    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
1337    ValueError: if `num_oov_buckets` is a negative integer.
1338    ValueError: if `num_oov_buckets` and `default_value` are both specified.
1339    ValueError: if `dtype` is not integer or string.
1340  """
1341  if (vocabulary_list is None) or (len(vocabulary_list) < 1):
1342    raise ValueError(
1343        'vocabulary_list {} must be non-empty, column_name: {}'.format(
1344            vocabulary_list, key))
1345  if len(set(vocabulary_list)) != len(vocabulary_list):
1346    raise ValueError(
1347        'Duplicate keys in vocabulary_list {}, column_name: {}'.format(
1348            vocabulary_list, key))
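  # Infer the vocabulary dtype by round-tripping the list through a numpy array.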
1349  vocabulary_dtype = dtypes.as_dtype(np.array(vocabulary_list).dtype)
1350  if num_oov_buckets:
1351    if default_value != -1:
1352      raise ValueError(
1353          'Can\'t specify both num_oov_buckets and default_value in {}.'.format(
1354              key))
1355    if num_oov_buckets < 0:
1356      raise ValueError('Invalid num_oov_buckets {} in {}.'.format(
1357          num_oov_buckets, key))
1358  fc_utils.assert_string_or_int(
1359      vocabulary_dtype, prefix='column_name: {} vocabulary'.format(key))
1360  if dtype is None:
1361    dtype = vocabulary_dtype
1362  elif dtype.is_integer != vocabulary_dtype.is_integer:
1363    raise ValueError(
1364        'dtype {} and vocabulary dtype {} do not match, column_name: {}'.format(
1365            dtype, vocabulary_dtype, key))
1366  fc_utils.assert_string_or_int(dtype, prefix='column_name: {}'.format(key))
1367  fc_utils.assert_key_is_string(key)
1368
1369  return _VocabularyListCategoricalColumn(
1370      key=key, vocabulary_list=tuple(vocabulary_list), dtype=dtype,
1371      default_value=default_value, num_oov_buckets=num_oov_buckets)
1372
1373
1374def _categorical_column_with_identity(key, num_buckets, default_value=None):
1375  """A `_CategoricalColumn` that returns identity values.
1376
1377  Use this when your inputs are integers in the range `[0, num_buckets)`, and
1378  you want to use the input value itself as the categorical ID. Values outside
1379  this range will result in `default_value` if specified, otherwise it will
1380  fail.
1381
1382  Typically, this is used for contiguous ranges of integer indexes, but
1383  it doesn't have to be. This might be inefficient, however, if many IDs
1384  are unused. Consider `categorical_column_with_hash_bucket` in that case.
1385
1386  For input dictionary `features`, `features[key]` is either `Tensor` or
1387  `SparseTensor`. If `Tensor`, missing values can be represented by `-1`,
1388  which will be dropped by this feature column.
1389
1390  In the following examples, each input in the range `[0, 1000000)` is used
1391  as its own ID. All other inputs are assigned `default_value` 0. Note that a
1392  literal 0 in inputs will result in the same default ID.
1393
1394  Linear model:
1395
1396  ```python
1397  video_id = categorical_column_with_identity(
1398      key='video_id', num_buckets=1000000, default_value=0)
1399  columns = [video_id, ...]
1400  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1401  linear_prediction, _, _ = linear_model(features, columns)
1402  ```
1403
1404  Embedding for a DNN model:
1405
1406  ```python
1407  columns = [embedding_column(video_id, 9),...]
1408  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1409  dense_tensor = input_layer(features, columns)
1410  ```
1411
1412  Args:
1413    key: A unique string identifying the input feature. It is used as the
1414      column name and the dictionary key for feature parsing configs, feature
1415      `Tensor` objects, and feature columns.
1416    num_buckets: Range of inputs and outputs is `[0, num_buckets)`.
1417    default_value: If `None`, this column's graph operations will fail for
1418      out-of-range inputs. Otherwise, this value must be in the range
1419      `[0, num_buckets)`, and will replace out-of-range inputs.
1420
1421  Returns:
1422    A `_CategoricalColumn` that returns identity values.
1423
1424  Raises:
1425    ValueError: if `num_buckets` is less than one.
1426    ValueError: if `default_value` is not in range `[0, num_buckets)`.
1427  """
1428  if num_buckets < 1:
1429    raise ValueError(
1430        'num_buckets {} < 1, column_name {}'.format(num_buckets, key))
1431  if (default_value is not None) and (
1432      (default_value < 0) or (default_value >= num_buckets)):
1433    raise ValueError(
1434        'default_value {} not in range [0, {}), column_name {}'.format(
1435            default_value, num_buckets, key))
1436  fc_utils.assert_key_is_string(key)
1437  return _IdentityCategoricalColumn(
1438      key=key, num_buckets=num_buckets, default_value=default_value)
1439
1440
1441def _indicator_column(categorical_column):
1442  """Represents multi-hot representation of given categorical column.
1443
1444  - For a DNN model, `indicator_column` can be used to wrap any
1445    `categorical_column_*` (e.g., to feed to the DNN). Consider using
1446    `embedding_column` if the number of buckets/unique values is large.
1447
1448  - For a wide (aka linear) model, `indicator_column` is the internal
1449    representation for categorical column when passing categorical column
1450    directly (as any element in feature_columns) to `linear_model`. See
1451    `linear_model` for details.
1452
1453  ```python
1454  name = indicator_column(categorical_column_with_vocabulary_list(
1455      'name', ['bob', 'george', 'wanda']))
1456  columns = [name, ...]
1457  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1458  dense_tensor = input_layer(features, columns)
1459
1460  dense_tensor == [[1, 0, 0]]  # If "name" bytes_list is ["bob"]
1461  dense_tensor == [[1, 0, 1]]  # If "name" bytes_list is ["bob", "wanda"]
1462  dense_tensor == [[2, 0, 0]]  # If "name" bytes_list is ["bob", "bob"]
1463  ```
1464
1465  Args:
1466    categorical_column: A `_CategoricalColumn` which is created by
1467      `categorical_column_with_*` or `crossed_column` functions.
1468
1469  Returns:
1470    An `_IndicatorColumn`.
1471  """
1472  return _IndicatorColumn(categorical_column)
1473
1474
1475def _weighted_categorical_column(categorical_column,
1476                                 weight_feature_key,
1477                                 dtype=dtypes.float32):
1478  """Applies weight values to a `_CategoricalColumn`.
1479
1480  Use this when each of your sparse inputs has both an ID and a value. For
1481  example, if you're representing text documents as a collection of word
1482  frequencies, you can provide 2 parallel sparse input features ('terms' and
1483  'frequencies' below).
1484
1485  Example:
1486
1487  Input `tf.Example` objects:
1488
1489  ```proto
1490  [
1491    features {
1492      feature {
1493        key: "terms"
1494        value {bytes_list {value: "very" value: "model"}}
1495      }
1496      feature {
1497        key: "frequencies"
1498        value {float_list {value: 0.3 value: 0.1}}
1499      }
1500    },
1501    features {
1502      feature {
1503        key: "terms"
1504        value {bytes_list {value: "when" value: "course" value: "human"}}
1505      }
1506      feature {
1507        key: "frequencies"
1508        value {float_list {value: 0.4 value: 0.1 value: 0.2}}
1509      }
1510    }
1511  ]
1512  ```
1513
1514  ```python
1515  categorical_column = categorical_column_with_hash_bucket(
1516      column_name='terms', hash_bucket_size=1000)
1517  weighted_column = weighted_categorical_column(
1518      categorical_column=categorical_column, weight_feature_key='frequencies')
1519  columns = [weighted_column, ...]
1520  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1521  linear_prediction, _, _ = linear_model(features, columns)
1522  ```
1523
1524  This assumes the input dictionary contains a `SparseTensor` for key
1525  'terms', and a `SparseTensor` for key 'frequencies'. These 2 tensors must have
1526  the same indices and dense shape.
1527
1528  Args:
1529    categorical_column: A `_CategoricalColumn` created by
1530      `categorical_column_with_*` functions.
1531    weight_feature_key: String key for weight values.
1532    dtype: Type of weights, such as `tf.float32`. Only float and integer weights
1533      are supported.
1534
1535  Returns:
1536    A `_CategoricalColumn` composed of two sparse features: one represents the
1537    ids, the other represents the weight (value) of each id in that example.
1538
1539  Raises:
1540    ValueError: if `dtype` is not convertible to float.
1541  """
1542  if (dtype is None) or not (dtype.is_integer or dtype.is_floating):
1543    raise ValueError('dtype {} is not convertible to float.'.format(dtype))
1544  return _WeightedCategoricalColumn(
1545      categorical_column=categorical_column,
1546      weight_feature_key=weight_feature_key,
1547      dtype=dtype)
1548
1549
1550def _crossed_column(keys, hash_bucket_size, hash_key=None):
1551  """Returns a column for performing crosses of categorical features.
1552
1553  Crossed features will be hashed according to `hash_bucket_size`. Conceptually,
1554  the transformation can be thought of as:
1555    Hash(cartesian product of features) % `hash_bucket_size`
1556
1557  For example, if the input features are:
1558
1559  * SparseTensor referred by first key:
1560
1561    ```python
1562    shape = [2, 2]
1563    {
1564        [0, 0]: "a"
1565        [1, 0]: "b"
1566        [1, 1]: "c"
1567    }
1568    ```
1569
1570  * SparseTensor referred by second key:
1571
1572    ```python
1573    shape = [2, 1]
1574    {
1575        [0, 0]: "d"
1576        [1, 0]: "e"
1577    }
1578    ```
1579
1580  then the crossed feature will look like:
1581
1582  ```python
1583  shape = [2, 2]
1584  {
1585      [0, 0]: Hash64("d", Hash64("a")) % hash_bucket_size
1586      [1, 0]: Hash64("e", Hash64("b")) % hash_bucket_size
1587      [1, 1]: Hash64("e", Hash64("c")) % hash_bucket_size
1588  }
1589  ```
1590
1591  Here is an example to create a linear model with crosses of string features:
1592
1593  ```python
1594  keywords_x_doc_terms = crossed_column(['keywords', 'doc_terms'], 50000)
1595  columns = [keywords_x_doc_terms, ...]
1596  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1597  linear_prediction = linear_model(features, columns)
1598  ```
1599
1600  You could also use vocabulary lookup before crossing:
1601
1602  ```python
1603  keywords = categorical_column_with_vocabulary_file(
1604      'keywords', '/path/to/vocabulary/file', vocabulary_size=1000)
1605  keywords_x_doc_terms = crossed_column([keywords, 'doc_terms'], 50000)
1606  columns = [keywords_x_doc_terms, ...]
1607  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1608  linear_prediction = linear_model(features, columns)
1609  ```
1610
1611  If an input feature is of numeric type, you can use
1612  `categorical_column_with_identity`, or `bucketized_column`, as in the example:
1613
1614  ```python
1615  # vertical_id is an integer categorical feature.
1616  vertical_id = categorical_column_with_identity('vertical_id', 10000)
1617  price = numeric_column('price')
1618  # bucketized_column converts numerical feature to a categorical one.
1619  bucketized_price = bucketized_column(price, boundaries=[...])
1620  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
1621  columns = [vertical_id_x_price, ...]
1622  features = tf.parse_example(..., features=make_parse_example_spec(columns))
1623  linear_prediction = linear_model(features, columns)
1624  ```
1625
1626  To use a crossed column in a DNN model, you need to wrap it in an embedding
1627  column, as in this example:
1628
1629  ```python
1630  vertical_id_x_price = crossed_column([vertical_id, bucketized_price], 50000)
1631  vertical_id_x_price_embedded = embedding_column(vertical_id_x_price, 10)
1632  dense_tensor = input_layer(features, [vertical_id_x_price_embedded, ...])
1633  ```
1634
1635  Args:
1636    keys: An iterable identifying the features to be crossed. Each element can
1637      be either:
1638      * string: Will use the corresponding feature which must be of string type.
1639      * `_CategoricalColumn`: Will use the transformed tensor produced by this
1640        column. Does not support hashed categorical column.
1641    hash_bucket_size: An int > 1. The number of buckets.
1642    hash_key: Specify the hash_key that will be used by the `FingerprintCat64`
1643      function to combine the crossed fingerprints in `SparseCrossOp` (optional).
1644
1645  Returns:
1646    A `_CrossedColumn`.
1647
1648  Raises:
1649    ValueError: If `len(keys) < 2`.
1650    ValueError: If any of the keys is neither a string nor `_CategoricalColumn`.
1651    ValueError: If any of the keys is `_HashedCategoricalColumn`.
1652    ValueError: If `hash_bucket_size < 1`.
1653  """
1654  if not hash_bucket_size or hash_bucket_size < 1:
1655    raise ValueError('hash_bucket_size must be at least 1. '
1656                     'hash_bucket_size: {}'.format(hash_bucket_size))
1657  if not keys or len(keys) < 2:
1658    raise ValueError(
1659        'keys must be a list with length > 1. Given: {}'.format(keys))
1660  for key in keys:
1661    if (not isinstance(key, six.string_types) and
1662        not isinstance(key, _CategoricalColumn)):
1663      raise ValueError(
1664          'Unsupported key type. All keys must be either string, or '
1665          'categorical column except _HashedCategoricalColumn. '
1666          'Given: {}'.format(key))
1667    if isinstance(key, _HashedCategoricalColumn):
1668      raise ValueError(
1669          'categorical_column_with_hash_bucket is not supported for crossing. '
1670          'Hashing before crossing will increase probability of collision. '
1671          'Instead, use the feature name as a string. Given: {}'.format(key))
1672  return _CrossedColumn(
1673      keys=tuple(keys), hash_bucket_size=hash_bucket_size,
1674      hash_key=hash_key)
1675
1676
1677# TODO(rohanj): Clearly define semantics of this layer.
1678class _EmbeddingColumnLayer(base.Layer):
1679  """A layer that stores all the state required for a embedding column."""
1680
1681  def __init__(self,
1682               embedding_shape,
1683               initializer,
1684               weight_collections=None,
1685               trainable=True,
1686               name=None,
1687               **kwargs):
1688    """Constructor.
1689
1690    Args:
1691      embedding_shape: Shape of the embedding variable used for lookup.
1692      initializer: A variable initializer function to be used in embedding
1693        variable initialization.
1694      weight_collections: A list of collection names to which the Variable will
1695        be added. Note that, variables will also be added to collections
1696        `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
1697      trainable: If `True` also add the variable to the graph collection
1698        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
1699      name: Name of the layer.
1700      **kwargs: Additional keyword arguments forwarded to the base `Layer`.
1701    """
1702    super(_EmbeddingColumnLayer, self).__init__(
1703        trainable=trainable, name=name, **kwargs)
1704    self._embedding_shape = embedding_shape
1705    self._initializer = initializer
1706    self._weight_collections = weight_collections
1707
1708  def set_weight_collections(self, weight_collections):
1709    """Sets the weight collections for the layer.
1710
1711    Args:
1712      weight_collections: A list of collection names to which the Variable will
1713        be added.
1714    """
1715    self._weight_collections = weight_collections
1716
1717  def build(self, _):
1718    self._embedding_weight_var = self.add_variable(
1719        name='embedding_weights',
1720        shape=self._embedding_shape,
1721        dtype=dtypes.float32,
1722        initializer=self._initializer,
1723        trainable=self.trainable)
1724    if self._weight_collections and not context.executing_eagerly():
1725      _add_to_collections(self._embedding_weight_var, self._weight_collections)
1726    self.built = True
1727
1728  def call(self, _):
1729    return self._embedding_weight_var
1730
1731
1732@six.add_metaclass(abc.ABCMeta)
1733class _FeatureColumn(object):
1734  """Represents a feature column abstraction.
1735
1736  WARNING: Do not subclass this layer unless you know what you are doing:
1737  the API is subject to future changes.
1738
1739  To distinguish the concept of a feature family and a specific binary feature
1740  within a family, we refer to a feature family like "country" as a feature
1741  column. Following is an example feature in a `tf.Example` format:
1742    {key: "country",  value: [ "US" ]}
1743  In this example the value of the feature is "US" and "country" refers to the
1744  column of the feature.
1745
1746  This class is an abstract class. Users should not create instances of it.
1747  """
1748
1749  @abc.abstractproperty
1750  def name(self):
1751    """Returns string. Used for naming and for name_scope."""
1752    pass
1753
1754  @property
1755  def _var_scope_name(self):
1756    """Returns string. Used for variable_scope. Defaults to self.name."""
1757    return self.name
1758
1759  @abc.abstractmethod
1760  def _transform_feature(self, inputs):
1761    """Returns intermediate representation (usually a `Tensor`).
1762
1763    Uses `inputs` to create an intermediate representation (usually a `Tensor`)
1764    that other feature columns can use.
1765
1766    Example usage of `inputs`:
1767    Let's say a Feature column depends on raw feature ('raw') and another
1768    `_FeatureColumn` (input_fc). To access corresponding `Tensor`s, inputs will
1769    be used as follows:
1770
1771    ```python
1772    raw_tensor = inputs.get('raw')
1773    fc_tensor = inputs.get(input_fc)
1774    ```
1775
1776    Args:
1777      inputs: A `_LazyBuilder` object to access inputs.
1778
1779    Returns:
1780      Transformed feature `Tensor`.
1781    """
1782    pass
1783
1784  @abc.abstractproperty
1785  def _parse_example_spec(self):
1786    """Returns a `tf.Example` parsing spec as dict.
1787
1788    It is used to generate the parsing spec for `tf.parse_example`. The returned
1789    spec is a dict from keys ('string') to `VarLenFeature`, `FixedLenFeature`, and other
1790    supported objects. Please check documentation of `tf.parse_example` for all
1791    supported spec objects.
1792
1793    Let's say a Feature column depends on raw feature ('raw') and another
1794    `_FeatureColumn` (input_fc). One possible implementation of
1795    _parse_example_spec is as follows:
1796
1797    ```python
1798    spec = {'raw': tf.FixedLenFeature(...)}
1799    spec.update(input_fc._parse_example_spec)
1800    return spec
1801    ```
1802    """
1803    pass
1804
1805  def _reset_config(self):
1806    """Resets the configuration in the column.
1807
1808    Some feature columns, e.g. embedding or shared embedding columns, might
1809    have state that needs to be reset occasionally. Use this method
1810    in that scenario.
1811    """
1812
1813
1814class _DenseColumn(_FeatureColumn):
1815  """Represents a column which can be represented as `Tensor`.
1816
1817  WARNING: Do not subclass this layer unless you know what you are doing:
1818  the API is subject to future changes.
1819
1820  Some examples of this type are: numeric_column, embedding_column,
1821  indicator_column.
1822  """
1823
1824  @abc.abstractproperty
1825  def _variable_shape(self):
1826    """`TensorShape` of `_get_dense_tensor`, without batch dimension."""
1827    pass
1828
1829  @abc.abstractmethod
1830  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
1831    """Returns a `Tensor`.
1832
1833    The output of this function will be used by model-builder-functions. For
1834    example the pseudo code of `input_layer` will be like:
1835
1836    ```python
1837    def input_layer(features, feature_columns, ...):
1838      outputs = [fc._get_dense_tensor(...) for fc in feature_columns]
1839      return tf.concat(outputs)
1840    ```
1841
1842    Args:
1843      inputs: A `_LazyBuilder` object to access inputs.
1844      weight_collections: List of graph collections to which Variables (if any
1845        are created) are added.
1846      trainable: If `True` also add variables to the graph collection
1847        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
1848
1849    Returns:
1850      `Tensor` of shape [batch_size] + `_variable_shape`.
1851    """
1852    pass
1853
1854
1855def _create_weighted_sum(column,
1856                         builder,
1857                         units,
1858                         sparse_combiner,
1859                         weight_collections,
1860                         trainable,
1861                         weight_var=None):
1862  """Creates a weighted sum for a dense/categorical column for linear_model."""
1863  if isinstance(column, _CategoricalColumn):
1864    return _create_categorical_column_weighted_sum(
1865        column=column,
1866        builder=builder,
1867        units=units,
1868        sparse_combiner=sparse_combiner,
1869        weight_collections=weight_collections,
1870        trainable=trainable,
1871        weight_var=weight_var)
1872  else:
1873    return _create_dense_column_weighted_sum(
1874        column=column,
1875        builder=builder,
1876        units=units,
1877        weight_collections=weight_collections,
1878        trainable=trainable,
1879        weight_var=weight_var)
1880
1881
1882def _create_dense_column_weighted_sum(column,
1883                                      builder,
1884                                      units,
1885                                      weight_collections,
1886                                      trainable,
1887                                      weight_var=None):
1888  """Create a weighted sum of a dense column for linear_model."""
1889  tensor = column._get_dense_tensor(  # pylint: disable=protected-access
1890      builder,
1891      weight_collections=weight_collections,
1892      trainable=trainable)
1893  num_elements = column._variable_shape.num_elements()  # pylint: disable=protected-access
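  # Flatten the dense tensor to [batch_size, num_elements] so a single matmul
  # against the [num_elements, units] weights computes the weighted sum.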
1894  batch_size = array_ops.shape(tensor)[0]
1895  tensor = array_ops.reshape(tensor, shape=(batch_size, num_elements))
1896  if weight_var is not None:
1897    weight = weight_var
1898  else:
1899    weight = variable_scope.get_variable(
1900        name='weights',
1901        shape=[num_elements, units],
1902        initializer=init_ops.zeros_initializer(),
1903        trainable=trainable,
1904        collections=weight_collections)
1905  return math_ops.matmul(tensor, weight, name='weighted_sum')
1906
1907
1908class _CategoricalColumn(_FeatureColumn):
1909  """Represents a categorical feature.
1910
1911  WARNING: Do not subclass this layer unless you know what you are doing:
1912  the API is subject to future changes.
1913
1914  A categorical feature is typically handled with a `tf.SparseTensor` of IDs.
1915  """
1916
1917  IdWeightPair = collections.namedtuple(  # pylint: disable=invalid-name
1918      'IdWeightPair', ['id_tensor', 'weight_tensor'])
1919
1920  @abc.abstractproperty
1921  def _num_buckets(self):
1922    """Returns number of buckets in this sparse feature."""
1923    pass
1924
1925  @abc.abstractmethod
1926  def _get_sparse_tensors(self,
1927                          inputs,
1928                          weight_collections=None,
1929                          trainable=None):
1930    """Returns an IdWeightPair.
1931
1932    `IdWeightPair` is a pair of `SparseTensor`s which represents ids and
1933    weights.
1934
1935    `IdWeightPair.id_tensor` is typically a `batch_size` x `num_buckets`
1936    `SparseTensor` of `int64`. `IdWeightPair.weight_tensor` is either a
1937    `SparseTensor` of `float` or `None` to indicate all weights should be
1938    taken to be 1. If specified, `weight_tensor` must have exactly the same
1939    shape and indices as `id_tensor`. The expected `SparseTensor` is the same
1940    as the parsing output of a `VarLenFeature`, which is a ragged matrix.
1941
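    As an illustrative sketch (the values below are made up, not produced by
    any particular column):

    ```python
    IdWeightPair(
        id_tensor=SparseTensor(indices=[[0, 0], [1, 0], [1, 1]],
                               values=[3, 1, 4], dense_shape=[2, 2]),
        weight_tensor=None)  # all weights taken to be 1
    ```
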
1942    Args:
1943      inputs: A `LazyBuilder` as a cache to get input tensors required to
1944        create `IdWeightPair`.
1945      weight_collections: List of graph collections to which variables (if any
1946        are created) are added.
1947      trainable: If `True` also add variables to the graph collection
1948        `GraphKeys.TRAINABLE_VARIABLES` (see `tf.get_variable`).
1949    """
1950    pass
1951
1952
1953def _create_categorical_column_weighted_sum(column,
1954                                            builder,
1955                                            units,
1956                                            sparse_combiner,
1957                                            weight_collections,
1958                                            trainable,
1959                                            weight_var=None):
1960  # pylint: disable=g-doc-return-or-yield,g-doc-args
1961  """Create a weighted sum of a categorical column for linear_model.
1962
1963  Note to maintainers: as an implementation detail, the weighted sum is
1964  implemented via embedding_lookup_sparse for efficiency. Mathematically,
1965  the two are the same.
1966
1967  Conceptually, a categorical column can be treated as a multi-hot
1968  vector. Say:
1969
1970  ```python
1971    x = [0 0 1]  # categorical column input
1972    w = [a b c]  # weights
1973  ```
1974  The weighted sum is `c` in this case, which is the same as `w[2]`.
1975
1976  Another example is
1977
1978  ```python
1979    x = [0 1 1]  # categorical column input
1980    w = [a b c]  # weights
1981  ```
1982  The weighted sum is `b + c` in this case, which is the same as `w[1] + w[2]`.
1983
1984  For both cases, we can implement weighted sum via embedding_lookup with
1985  sparse_combiner = "sum".
1986  """
1987
1988  sparse_tensors = column._get_sparse_tensors(  # pylint: disable=protected-access
1989      builder,
1990      weight_collections=weight_collections,
1991      trainable=trainable)
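  # Reshape the ids (and weights) to rank 2, [batch_size, ?], so that each
  # example's ids form one row for the embedding-lookup-based weighted sum.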
1992  id_tensor = sparse_ops.sparse_reshape(sparse_tensors.id_tensor, [
1993      array_ops.shape(sparse_tensors.id_tensor)[0], -1
1994  ])
1995  weight_tensor = sparse_tensors.weight_tensor
1996  if weight_tensor is not None:
1997    weight_tensor = sparse_ops.sparse_reshape(
1998        weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
1999
2000  if weight_var is not None:
2001    weight = weight_var
2002  else:
2003    weight = variable_scope.get_variable(
2004        name='weights',
2005        shape=(column._num_buckets, units),  # pylint: disable=protected-access
2006        initializer=init_ops.zeros_initializer(),
2007        trainable=trainable,
2008        collections=weight_collections)
2009  return embedding_ops.safe_embedding_lookup_sparse(
2010      weight,
2011      id_tensor,
2012      sparse_weights=weight_tensor,
2013      combiner=sparse_combiner,
2014      name='weighted_sum')
2015
2016
2017class _SequenceDenseColumn(_FeatureColumn):
2018  """Represents dense sequence data."""
2019
2020  TensorSequenceLengthPair = collections.namedtuple(  # pylint: disable=invalid-name
2021      'TensorSequenceLengthPair', ['dense_tensor', 'sequence_length'])
2022
2023  @abc.abstractmethod
2024  def _get_sequence_dense_tensor(
2025      self, inputs, weight_collections=None, trainable=None):
2026    """Returns a `TensorSequenceLengthPair`."""
2027    pass
2028
2029
2030class _LazyBuilder(object):
2031  """Handles caching of transformations while building the model.
2032
2033  `_FeatureColumn` specifies how to digest an input column to the network. Some
2034  feature columns require data transformations. This class caches those
2035  transformations.
2036
2037  Some features may be used in more than one place. For example, one can use a
2038  bucketized feature by itself and also in a cross with it. In that case we
2039  should create only one bucketization op instead of creating ops for each
2040  feature column separately. To handle re-use of transformed columns,
2041  `_LazyBuilder` caches all previously transformed columns.
2042
2043  Example:
2044  We're trying to use the following `_FeatureColumn`s:
2045
2046  ```python
2047  bucketized_age = fc.bucketized_column(fc.numeric_column("age"), ...)
2048  keywords = fc.categorical_column_with_hash_bucket("keywords", ...)
2049  age_X_keywords = fc.crossed_column([bucketized_age, "keywords"])
2050  ... = linear_model(features,
2051                          [bucketized_age, keywords, age_X_keywords])
2052  ```
2053
2054  If we transform each column independently, then we'll get duplication of
2055  bucketization (once for the cross, once for the bucketized column itself).
2056  The `_LazyBuilder` eliminates this duplication.
2057  """
2058
2059  def __init__(self, features):
2060    """Creates a `_LazyBuilder`.
2061
2062    Args:
2063      features: A mapping from feature keys to objects that are `Tensor` or
2064        `SparseTensor`, or can be converted to same via
2065        `sparse_tensor.convert_to_tensor_or_sparse_tensor`. A `string` key
2066        signifies a base feature (not-transformed). A `_FeatureColumn` key
2067        means that this `Tensor` is the output of an existing `_FeatureColumn`
2068        which can be reused.
2069    """
2070    self._features = features.copy()
2071    self._feature_tensors = {}
2072
2073  def get(self, key):
2074    """Returns a `Tensor` for the given key.
2075
2076    A `str` key is used to access a base feature (not-transformed). When a
2077    `_FeatureColumn` is passed, the transformed feature is returned if it
2078    already exists, otherwise the given `_FeatureColumn` is asked to provide its
2079    transformed output, which is then cached.
2080
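    A minimal sketch (assuming `bucketized_age` is a `_BucketizedColumn`
    defined elsewhere; the feature values are illustrative):

    ```python
    builder = _LazyBuilder({'age': [[23], [41]]})
    age_tensor = builder.get('age')        # raw feature lookup
    buckets = builder.get(bucketized_age)  # transformed once, then cached
    ```
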
2081    Args:
2082      key: a `str` or a `_FeatureColumn`.
2083
2084    Returns:
2085      The transformed `Tensor` corresponding to the `key`.
2086
2087    Raises:
2088      ValueError: if key is not found or a transformed `Tensor` cannot be
2089        computed.
2090    """
2091    if key in self._feature_tensors:
2092      # FeatureColumn is already transformed or converted.
2093      return self._feature_tensors[key]
2094
2095    if key in self._features:
2096      feature_tensor = self._get_raw_feature_as_tensor(key)
2097      self._feature_tensors[key] = feature_tensor
2098      return feature_tensor
2099
2100    if isinstance(key, six.string_types):
2101      raise ValueError('Feature {} is not in features dictionary.'.format(key))
2102
2103    if not isinstance(key, _FeatureColumn):
2104      raise TypeError('"key" must be either a "str" or "_FeatureColumn". '
2105                      'Provided: {}'.format(key))
2106
2107    column = key
2108    logging.debug('Transforming feature_column %s.', column)
2109    transformed = column._transform_feature(self)  # pylint: disable=protected-access
2110    if transformed is None:
2111      raise ValueError('Column {} is not supported.'.format(column.name))
2112    self._feature_tensors[column] = transformed
2113    return transformed
2114
2115  def _get_raw_feature_as_tensor(self, key):
2116    """Gets the raw_feature (keyed by `key`) as `tensor`.
2117
2118    The raw feature is converted to a (sparse) tensor and may have its rank
2119    expanded.
2119
2120    For both `Tensor` and `SparseTensor`, the rank will be expanded (to 2) if
2121    the rank is 1. Dynamic rank is also supported. A rank-0 raw feature is not
2122    supported and will raise an error.
2123
2124    Args:
2125      key: A `str` key to access the raw feature.
2126
2127    Returns:
2128      A `Tensor` or `SparseTensor`.
2129
2130    Raises:
2131      ValueError: if the raw feature has rank 0.
2132    """
2133    raw_feature = self._features[key]
2134    feature_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
2135        raw_feature)
2136
2137    def expand_dims(input_tensor):
2138      # Input_tensor must have rank 1.
2139      if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2140        return sparse_ops.sparse_reshape(
2141            input_tensor, [array_ops.shape(input_tensor)[0], 1])
2142      else:
2143        return array_ops.expand_dims(input_tensor, -1)
2144
2145    rank = feature_tensor.get_shape().ndims
2146    if rank is not None:
2147      if rank == 0:
2148        raise ValueError(
2149            'Feature (key: {}) cannot have rank 0. Given: {}'.format(
2150                key, feature_tensor))
2151      return feature_tensor if rank != 1 else expand_dims(feature_tensor)
2152
2153    # Handle dynamic rank.
2154    with ops.control_dependencies([
2155        check_ops.assert_positive(
2156            array_ops.rank(feature_tensor),
2157            message='Feature (key: {}) cannot have rank 0. Given: {}'.format(
2158                key, feature_tensor))]):
2159      return control_flow_ops.cond(
2160          math_ops.equal(1, array_ops.rank(feature_tensor)),
2161          lambda: expand_dims(feature_tensor),
2162          lambda: feature_tensor)
2163
2164
2165# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
2166def _shape_offsets(shape):
2167  """Returns moving offset for each dimension given shape."""
2168  offsets = []
2169  for dim in reversed(shape):
2170    if offsets:
2171      offsets.append(dim * offsets[-1])
2172    else:
2173      offsets.append(dim)
2174  offsets.reverse()
2175  return offsets
2176
2177
2178# TODO(ptucker): Move to third_party/tensorflow/python/ops/sparse_ops.py
2179def _to_sparse_input_and_drop_ignore_values(input_tensor, ignore_value=None):
2180  """Converts a `Tensor` to a `SparseTensor`, dropping ignore_value cells.
2181
2182  If `input_tensor` is already a `SparseTensor`, just return it.
2183
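  For example (illustrative), a dense integer input `[[1, -1], [-1, 3]]` with
  the default `ignore_value` of -1 becomes a `SparseTensor` with indices
  `[[0, 0], [1, 1]]`, values `[1, 3]`, and dense_shape `[2, 2]`.
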
2184  Args:
2185    input_tensor: A string or integer `Tensor`.
2186    ignore_value: Entries in `input_tensor` equal to this value will be
2187      absent from the resulting `SparseTensor`. If `None`, the default value of
2188      `input_tensor`'s dtype will be used ('' for `str`, -1 for `int`).
2189
2190  Returns:
2191    A `SparseTensor` with the same shape as `input_tensor`.
2192
2193  Raises:
2194    ValueError: when `input_tensor`'s rank is `None`.
2195  """
2196  input_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
2197      input_tensor)
2198  if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2199    return input_tensor
2200  with ops.name_scope(None, 'to_sparse_input', (input_tensor, ignore_value,)):
2201    if ignore_value is None:
2202      if input_tensor.dtype == dtypes.string:
2203        # Special-case strings: TF strings are converted to numpy objects by default.
2204        ignore_value = ''
2205      elif input_tensor.dtype.is_integer:
2206        ignore_value = -1  # -1 has a special meaning of missing feature
2207      else:
2208        # NOTE: `as_numpy_dtype` is a property, so with the parentheses this is
2209        # constructing a new numpy object of the given type, which yields the
2210        # default value for that type.
2211        ignore_value = input_tensor.dtype.as_numpy_dtype()
2212    ignore_value = math_ops.cast(
2213        ignore_value, input_tensor.dtype, name='ignore_value')
2214    indices = array_ops.where(
2215        math_ops.not_equal(input_tensor, ignore_value), name='indices')
2216    return sparse_tensor_lib.SparseTensor(
2217        indices=indices,
2218        values=array_ops.gather_nd(input_tensor, indices, name='values'),
2219        dense_shape=array_ops.shape(
2220            input_tensor, out_type=dtypes.int64, name='dense_shape'))
2221
2222
2223def _normalize_feature_columns(feature_columns):
2224  """Normalizes the `feature_columns` input.
2225
2226  This method converts `feature_columns` to a list as best it can. In addition,
2227  it verifies the type and other properties of `feature_columns`, as required
2228  by downstream libraries.
2229
2230  Args:
2231    feature_columns: The raw feature columns, usually passed by users.
2232
2233  Returns:
2234    The normalized feature column list.
2235
2236  Raises:
2237    ValueError: for any invalid inputs, such as empty, duplicated names, etc.
2238  """
2239  if isinstance(feature_columns, _FeatureColumn):
2240    feature_columns = [feature_columns]
2241
2242  if isinstance(feature_columns, collections.Iterator):
2243    feature_columns = list(feature_columns)
2244
2245  if isinstance(feature_columns, dict):
2246    raise ValueError('Expected feature_columns to be iterable, found dict.')
2247
2248  for column in feature_columns:
2249    if not isinstance(column, _FeatureColumn):
2250      raise ValueError('Items of feature_columns must be a _FeatureColumn. '
2251                       'Given (type {}): {}.'.format(type(column), column))
2252  if not feature_columns:
2253    raise ValueError('feature_columns must not be empty.')
2254  name_to_column = dict()
2255  for column in feature_columns:
2256    if column.name in name_to_column:
2257      raise ValueError('Duplicate feature column name found for columns: {} '
2258                       'and {}. This usually means that these columns refer to '
2259                       'same base feature. Either one must be discarded or a '
2260                       'duplicated but renamed item must be inserted in '
2261                       'features dict.'.format(column,
2262                                               name_to_column[column.name]))
2263    name_to_column[column.name] = column
2264
2265  return feature_columns
2266
2267
2268class _NumericColumn(_DenseColumn,
2269                     collections.namedtuple('_NumericColumn', [
2270                         'key', 'shape', 'default_value', 'dtype',
2271                         'normalizer_fn'
2272                     ])):
2273  """see `numeric_column`."""
2274
2275  @property
2276  def name(self):
2277    return self.key
2278
2279  @property
2280  def _parse_example_spec(self):
2281    return {
2282        self.key:
2283            parsing_ops.FixedLenFeature(self.shape, self.dtype,
2284                                        self.default_value)
2285    }
2286
2287  def _transform_feature(self, inputs):
2288    input_tensor = inputs.get(self.key)
2289    if isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2290      raise ValueError(
2291          'The corresponding Tensor of a numeric column must be a dense Tensor. '
2292          'SparseTensor is not supported. key: {}'.format(self.key))
2293    if self.normalizer_fn is not None:
2294      input_tensor = self.normalizer_fn(input_tensor)
2295    return math_ops.cast(input_tensor, dtypes.float32)
2296
2297  @property
2298  def _variable_shape(self):
2299    return tensor_shape.TensorShape(self.shape)
2300
2301  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2302    """Returns dense `Tensor` representing numeric feature.
2303
2304    Args:
2305      inputs: A `_LazyBuilder` object to access inputs.
2306      weight_collections: Unused `weight_collections` since no variables are
2307        created in this function.
2308      trainable: Unused `trainable` bool since no variables are created in
2309        this function.
2310
2311    Returns:
2312      Dense `Tensor` created within `_transform_feature`.
2313    """
2314    # Do nothing with weight_collections and trainable since no variables are
2315    # created in this function.
2316    del weight_collections
2317    del trainable
2318    # Feature has been already transformed. Return the intermediate
2319    # representation created by _transform_feature.
2320    return inputs.get(self)
2321
2322
2323class _BucketizedColumn(_DenseColumn, _CategoricalColumn,
2324                        collections.namedtuple('_BucketizedColumn', [
2325                            'source_column', 'boundaries'])):
2326  """See `bucketized_column`."""
2327
2328  @property
2329  def name(self):
2330    return '{}_bucketized'.format(self.source_column.name)
2331
2332  @property
2333  def _parse_example_spec(self):
2334    return self.source_column._parse_example_spec  # pylint: disable=protected-access
2335
2336  def _transform_feature(self, inputs):
2337    source_tensor = inputs.get(self.source_column)
2338    return math_ops._bucketize(  # pylint: disable=protected-access
2339        source_tensor,
2340        boundaries=self.boundaries)
2341
2342  @property
2343  def _variable_shape(self):
2344    return tensor_shape.TensorShape(
2345        tuple(self.source_column.shape) + (len(self.boundaries) + 1,))
2346
2347  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2348    del weight_collections
2349    del trainable
2350    input_tensor = inputs.get(self)
2351    return array_ops.one_hot(
2352        indices=math_ops.cast(input_tensor, dtypes.int64),
2353        depth=len(self.boundaries) + 1,
2354        on_value=1.,
2355        off_value=0.)
2356
2357  @property
2358  def _num_buckets(self):
2359    # By construction, source_column is always one-dimensional.
2360    return (len(self.boundaries) + 1) * self.source_column.shape[0]
2361
2362  def _get_sparse_tensors(self, inputs, weight_collections=None,
2363                          trainable=None):
2364    """Converts dense inputs to SparseTensor so downstream code can use it."""
2365    input_tensor = inputs.get(self)
2366    batch_size = array_ops.shape(input_tensor)[0]
2367    # By construction, source_column is always one-dimensional.
2368    source_dimension = self.source_column.shape[0]
2369
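    # Build row/column coordinates for a dense [batch_size, source_dimension]
    # grid: i1 repeats each batch index source_dimension times, and i2 tiles
    # the column indices batch_size times.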
2370    i1 = array_ops.reshape(
2371        array_ops.tile(
2372            array_ops.expand_dims(math_ops.range(0, batch_size), 1),
2373            [1, source_dimension]),
2374        (-1,))
2375    i2 = array_ops.tile(math_ops.range(0, source_dimension), [batch_size])
2376    # Flatten the bucket indices and unique them across dimensions
2377    # E.g. 2nd dimension indices will range from k to 2*k-1 with k buckets
2378    bucket_indices = (
2379        array_ops.reshape(input_tensor, (-1,)) +
2380        (len(self.boundaries) + 1) * i2)
2381
2382    indices = math_ops.cast(
2383        array_ops.transpose(array_ops.stack((i1, i2))), dtypes.int64)
2384    dense_shape = math_ops.cast(
2385        array_ops.stack([batch_size, source_dimension]), dtypes.int64)
2386    sparse_tensor = sparse_tensor_lib.SparseTensor(
2387        indices=indices,
2388        values=bucket_indices,
2389        dense_shape=dense_shape)
2390    return _CategoricalColumn.IdWeightPair(sparse_tensor, None)
2391
2392
2393class _EmbeddingColumn(
2394    _DenseColumn, _SequenceDenseColumn,
2395    collections.namedtuple(
2396        '_EmbeddingColumn',
2397        ('categorical_column', 'dimension', 'combiner', 'layer_creator',
2398         'ckpt_to_load_from', 'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
2399  """See `embedding_column`."""
2400
2401  @property
2402  def name(self):
2403    if not hasattr(self, '_name'):
2404      self._name = '{}_embedding'.format(self.categorical_column.name)
2405    return self._name
2406
2407  @property
2408  def _parse_example_spec(self):
2409    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
2410
2411  def _transform_feature(self, inputs):
2412    return inputs.get(self.categorical_column)
2413
2414  @property
2415  def _variable_shape(self):
2416    if not hasattr(self, '_shape'):
2417      self._shape = tensor_shape.vector(self.dimension)
2418    return self._shape
2419
2420  def _get_dense_tensor_internal(self,
2421                                 inputs,
2422                                 weight_collections=None,
2423                                 trainable=None):
2424    """Private method that follows the signature of _get_dense_tensor."""
2425    # Get sparse IDs and weights.
2426    sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
2427        inputs, weight_collections=weight_collections, trainable=trainable)
2428    sparse_ids = sparse_tensors.id_tensor
2429    sparse_weights = sparse_tensors.weight_tensor
2430
2431    embedding_weights = self.layer_creator(
2432        weight_collections=weight_collections,
2433        scope=variable_scope.get_variable_scope())
2434
2435    if self.ckpt_to_load_from is not None:
2436      to_restore = embedding_weights
2437      if isinstance(to_restore, variables.PartitionedVariable):
2438        to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
2439      checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
2440          self.tensor_name_in_ckpt: to_restore
2441      })
2442
2443    # Return embedding lookup result.
2444    return embedding_ops.safe_embedding_lookup_sparse(
2445        embedding_weights=embedding_weights,
2446        sparse_ids=sparse_ids,
2447        sparse_weights=sparse_weights,
2448        combiner=self.combiner,
2449        name='%s_weights' % self.name,
2450        max_norm=self.max_norm)
2451
2452  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2453    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
2454      raise ValueError(
2455          'In embedding_column: {}. '
2456          'categorical_column must not be of type _SequenceCategoricalColumn. '
2457          'Suggested fix A: If you wish to use input_layer, use a '
2458          'non-sequence categorical_column_with_*. '
2459          'Suggested fix B: If you wish to create sequence input, use '
2460          'sequence_input_layer instead of input_layer. '
2461          'Given (type {}): {}'.format(
2462              self.name, type(self.categorical_column),
2463              self.categorical_column))
2464    return self._get_dense_tensor_internal(
2465        inputs=inputs,
2466        weight_collections=weight_collections,
2467        trainable=trainable)
2468
2469  def _get_sequence_dense_tensor(
2470      self, inputs, weight_collections=None, trainable=None):
2471    if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
2472      raise ValueError(
2473          'In embedding_column: {}. '
2474          'categorical_column must be of type _SequenceCategoricalColumn '
2475          'to use sequence_input_layer. '
2476          'Suggested fix: Use one of sequence_categorical_column_with_*. '
2477          'Given (type {}): {}'.format(
2478              self.name, type(self.categorical_column),
2479              self.categorical_column))
2480    dense_tensor = self._get_dense_tensor_internal(  # pylint: disable=protected-access
2481        inputs=inputs,
2482        weight_collections=weight_collections,
2483        trainable=trainable)
2484
2485    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
2486    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
2487        sparse_tensors.id_tensor)
2488    return _SequenceDenseColumn.TensorSequenceLengthPair(
2489        dense_tensor=dense_tensor, sequence_length=sequence_length)
2490
2491
2492def _get_graph_for_variable(var):
2493  if isinstance(var, variables.PartitionedVariable):
2494    return list(var)[0].graph
2495  else:
2496    return var.graph
2497
2498
2499class _SharedEmbeddingColumn(
2500    _DenseColumn, _SequenceDenseColumn,
2501    collections.namedtuple(
2502        '_SharedEmbeddingColumn',
2503        ('categorical_column', 'dimension', 'combiner', 'initializer',
2504         'shared_embedding_collection_name', 'ckpt_to_load_from',
2505         'tensor_name_in_ckpt', 'max_norm', 'trainable'))):
2506  """See `embedding_column`."""
2507
2508  @property
2509  def name(self):
2510    if not hasattr(self, '_name'):
2511      self._name = '{}_shared_embedding'.format(self.categorical_column.name)
2512    return self._name
2513
2514  @property
2515  def _var_scope_name(self):
2516    return self.shared_embedding_collection_name
2517
2518  @property
2519  def _parse_example_spec(self):
2520    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access
2521
2522  def _transform_feature(self, inputs):
2523    return inputs.get(self.categorical_column)
2524
2525  @property
2526  def _variable_shape(self):
2527    if not hasattr(self, '_shape'):
2528      self._shape = tensor_shape.vector(self.dimension)
2529    return self._shape
2530
2531  def _get_dense_tensor_internal(self,
2532                                 inputs,
2533                                 weight_collections=None,
2534                                 trainable=None):
2535    """Private method that follows the signature of _get_dense_tensor."""
2536    # This method is called from a variable_scope with name _var_scope_name,
2537    # which is shared among all shared embeddings. Open a name_scope here, so
2538    # that the ops for different columns have distinct names.
2539    with ops.name_scope(None, default_name=self.name):
2540      # Get sparse IDs and weights.
2541      sparse_tensors = self.categorical_column._get_sparse_tensors(  # pylint: disable=protected-access
2542          inputs, weight_collections=weight_collections, trainable=trainable)
2543      sparse_ids = sparse_tensors.id_tensor
2544      sparse_weights = sparse_tensors.weight_tensor
2545
2546      embedding_shape = (self.categorical_column._num_buckets, self.dimension)  # pylint: disable=protected-access
2547      shared_embedding_collection = ops.get_collection(
2548          self.shared_embedding_collection_name)
2549      if shared_embedding_collection:
2550        if len(shared_embedding_collection) > 1:
2551          raise ValueError(
2552              'Collection {} can only contain one variable. '
2553              'Suggested fix A: Choose a unique name for this collection. '
2554              'Suggested fix B: Do not add any variables to this collection. '
2555              'The feature_column library already adds a variable under the '
2556              'hood.'.format(shared_embedding_collection))
2557        embedding_weights = shared_embedding_collection[0]
2558        if embedding_weights.get_shape() != embedding_shape:
2559          raise ValueError(
2560              'Shared embedding collection {} contains variable {} of '
2561              'unexpected shape {}. Expected shape is {}. '
2562              'Suggested fix A: Choose a unique name for this collection. '
2563              'Suggested fix B: Do not add any variables to this collection. '
2564              'The feature_column library already adds a variable under the '
2565              'hood.'.format(self.shared_embedding_collection_name,
2566                             embedding_weights.name,
2567                             embedding_weights.get_shape(), embedding_shape))
2568      else:
2569        embedding_weights = variable_scope.get_variable(
2570            name='embedding_weights',
2571            shape=embedding_shape,
2572            dtype=dtypes.float32,
2573            initializer=self.initializer,
2574            trainable=self.trainable and trainable,
2575            collections=weight_collections)
2576        ops.add_to_collection(self.shared_embedding_collection_name,
2577                              embedding_weights)
2578      if self.ckpt_to_load_from is not None:
2579        to_restore = embedding_weights
2580        if isinstance(to_restore, variables.PartitionedVariable):
2581          to_restore = to_restore._get_variable_list()  # pylint: disable=protected-access
2582        checkpoint_utils.init_from_checkpoint(self.ckpt_to_load_from, {
2583            self.tensor_name_in_ckpt: to_restore
2584        })
2585
2586      # Return embedding lookup result.
2587      return embedding_ops.safe_embedding_lookup_sparse(
2588          embedding_weights=embedding_weights,
2589          sparse_ids=sparse_ids,
2590          sparse_weights=sparse_weights,
2591          combiner=self.combiner,
2592          name='%s_weights' % self.name,
2593          max_norm=self.max_norm)
2594
2595  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
2596    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
2597      raise ValueError(
2598          'In embedding_column: {}. '
2599          'categorical_column must not be of type _SequenceCategoricalColumn. '
2600          'Suggested fix A: If you wish to use input_layer, use a '
2601          'non-sequence categorical_column_with_*. '
2602          'Suggested fix B: If you wish to create sequence input, use '
2603          'sequence_input_layer instead of input_layer. '
2604          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
2605                                       self.categorical_column))
2606    return self._get_dense_tensor_internal(
2607        inputs=inputs,
2608        weight_collections=weight_collections,
2609        trainable=trainable)
2610
2611  def _get_sequence_dense_tensor(self,
2612                                 inputs,
2613                                 weight_collections=None,
2614                                 trainable=None):
2615    if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
2616      raise ValueError(
2617          'In embedding_column: {}. '
2618          'categorical_column must be of type _SequenceCategoricalColumn '
2619          'to use sequence_input_layer. '
2620          'Suggested fix: Use one of sequence_categorical_column_with_*. '
2621          'Given (type {}): {}'.format(self.name, type(self.categorical_column),
2622                                       self.categorical_column))
2623    dense_tensor = self._get_dense_tensor_internal(  # pylint: disable=protected-access
2624        inputs=inputs,
2625        weight_collections=weight_collections,
2626        trainable=trainable)
2627    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
2628    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
2629        sparse_tensors.id_tensor)
2630    return _SequenceDenseColumn.TensorSequenceLengthPair(
2631        dense_tensor=dense_tensor, sequence_length=sequence_length)
2632
2633
2634def _check_shape(shape, key):
2635  """Returns shape if it's valid, raises error otherwise."""
2636  assert shape is not None
2637  if not nest.is_sequence(shape):
2638    shape = [shape]
2639  shape = tuple(shape)
2640  for dimension in shape:
2641    if not isinstance(dimension, six.integer_types):
2642      raise TypeError('shape dimensions must be integer. '
2643                      'shape: {}, key: {}'.format(shape, key))
2644    if dimension < 1:
2645      raise ValueError('shape dimensions must be greater than 0. '
2646                       'shape: {}, key: {}'.format(shape, key))
2647  return shape
2648
2649
2650class _HashedCategoricalColumn(
2651    _CategoricalColumn,
2652    collections.namedtuple('_HashedCategoricalColumn',
2653                           ['key', 'hash_bucket_size', 'dtype'])):
2654  """see `categorical_column_with_hash_bucket`."""
2655
2656  @property
2657  def name(self):
2658    return self.key
2659
2660  @property
2661  def _parse_example_spec(self):
2662    return {self.key: parsing_ops.VarLenFeature(self.dtype)}
2663
2664  def _transform_feature(self, inputs):
2665    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))
2666    if not isinstance(input_tensor, sparse_tensor_lib.SparseTensor):
2667      raise ValueError('SparseColumn input must be a SparseTensor.')
2668
2669    fc_utils.assert_string_or_int(
2670        input_tensor.dtype,
2671        prefix='column_name: {} input_tensor'.format(self.key))
2672
2673    if self.dtype.is_integer != input_tensor.dtype.is_integer:
2674      raise ValueError(
2675          'Column dtype and SparseTensors dtype must be compatible. '
2676          'key: {}, column dtype: {}, tensor dtype: {}'.format(
2677              self.key, self.dtype, input_tensor.dtype))
2678
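    # string_to_hash_bucket_fast only hashes strings, so non-string values are
    # converted with as_string() first.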
2679    if self.dtype == dtypes.string:
2680      sparse_values = input_tensor.values
2681    else:
2682      sparse_values = string_ops.as_string(input_tensor.values)
2683
2684    sparse_id_values = string_ops.string_to_hash_bucket_fast(
2685        sparse_values, self.hash_bucket_size, name='lookup')
2686    return sparse_tensor_lib.SparseTensor(
2687        input_tensor.indices, sparse_id_values, input_tensor.dense_shape)
2688
2689  @property
2690  def _num_buckets(self):
2691    """Returns number of buckets in this sparse feature."""
2692    return self.hash_bucket_size
2693
2694  def _get_sparse_tensors(self, inputs, weight_collections=None,
2695                          trainable=None):
2696    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)
2697
2698
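# Hedged sketch of the hashing transform above: values are formatted as
# strings and fingerprint-hashed into `hash_bucket_size` buckets, so no
# vocabulary is needed. Illustrative helper, not part of this module's API.
def _example_hash_bucket_ids(values, hash_bucket_size=100):
  """Maps a string `Tensor` to hash-bucket ids in [0, hash_bucket_size)."""
  return string_ops.string_to_hash_bucket_fast(values, hash_bucket_size)

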
class _VocabularyFileCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_VocabularyFileCategoricalColumn', (
        'key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets', 'dtype',
        'default_value'
    ))):
  """See `categorical_column_with_vocabulary_file`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensor dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    fc_utils.assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    key_dtype = self.dtype
    if input_tensor.dtype.is_integer:
      # `index_table_from_file` requires 64-bit integer keys.
      key_dtype = dtypes.int64
      input_tensor = math_ops.cast(input_tensor, dtypes.int64)

    return lookup_ops.index_table_from_file(
        vocabulary_file=self.vocabulary_file,
        num_oov_buckets=self.num_oov_buckets,
        vocab_size=self.vocabulary_size,
        default_value=self.default_value,
        key_dtype=key_dtype,
        name='{}_lookup'.format(self.key)).lookup(input_tensor)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.vocabulary_size + self.num_oov_buckets

  def _get_sparse_tensors(
      self, inputs, weight_collections=None, trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)


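# Hedged sketch of the lookup above: each id is the key's row index in the
# vocabulary file, and out-of-vocabulary keys hash into the extra OOV
# buckets (ids vocab_size..vocab_size + num_oov_buckets - 1). The file path
# is hypothetical; the helper is illustrative only.
def _example_vocab_file_ids(keys):
  """Looks up string `keys` in a (hypothetical) vocabulary file."""
  table = lookup_ops.index_table_from_file(
      vocabulary_file='/tmp/department_vocab.txt',  # hypothetical path
      num_oov_buckets=2)
  return table.lookup(keys)

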
class _VocabularyListCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_VocabularyListCategoricalColumn', (
        'key', 'vocabulary_list', 'dtype', 'default_value', 'num_oov_buckets'
    ))):
  """See `categorical_column_with_vocabulary_list`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))

    if self.dtype.is_integer != input_tensor.dtype.is_integer:
      raise ValueError(
          'Column dtype and SparseTensor dtype must be compatible. '
          'key: {}, column dtype: {}, tensor dtype: {}'.format(
              self.key, self.dtype, input_tensor.dtype))

    fc_utils.assert_string_or_int(
        input_tensor.dtype,
        prefix='column_name: {} input_tensor'.format(self.key))

    key_dtype = self.dtype
    if input_tensor.dtype.is_integer:
      # `index_table_from_tensor` requires 64-bit integer keys.
      key_dtype = dtypes.int64
      input_tensor = math_ops.cast(input_tensor, dtypes.int64)

    return lookup_ops.index_table_from_tensor(
        vocabulary_list=tuple(self.vocabulary_list),
        default_value=self.default_value,
        num_oov_buckets=self.num_oov_buckets,
        dtype=key_dtype,
        name='{}_lookup'.format(self.key)).lookup(input_tensor)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return len(self.vocabulary_list) + self.num_oov_buckets

  def _get_sparse_tensors(
      self, inputs, weight_collections=None, trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)


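# Hedged sketch of the in-memory lookup above: in-vocabulary values map to
# their index in the list, and everything else maps to `default_value` when
# no OOV buckets are configured. Illustrative helper only.
def _example_vocab_list_ids():
  """Returns ids [1, -1] for inputs ['philosophy', 'art']."""
  table = lookup_ops.index_table_from_tensor(
      vocabulary_list=('math', 'philosophy', 'english'), default_value=-1)
  return table.lookup(ops.convert_to_tensor(['philosophy', 'art']))

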
class _IdentityCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_IdentityCategoricalColumn', (
        'key', 'num_buckets', 'default_value'
    ))):
  """See `categorical_column_with_identity`."""

  @property
  def name(self):
    return self.key

  @property
  def _parse_example_spec(self):
    return {self.key: parsing_ops.VarLenFeature(dtypes.int64)}

  def _transform_feature(self, inputs):
    input_tensor = _to_sparse_input_and_drop_ignore_values(inputs.get(self.key))

    if not input_tensor.dtype.is_integer:
      raise ValueError(
          'Invalid input, not integer. key: {} dtype: {}'.format(
              self.key, input_tensor.dtype))

    values = math_ops.cast(input_tensor.values, dtypes.int64, name='values')
    num_buckets = math_ops.cast(
        self.num_buckets, dtypes.int64, name='num_buckets')
    zero = math_ops.cast(0, dtypes.int64, name='zero')
    if self.default_value is None:
      # Fail if values are out-of-range.
      assert_less = check_ops.assert_less(
          values, num_buckets, data=(values, num_buckets),
          name='assert_less_than_num_buckets')
      assert_greater = check_ops.assert_greater_equal(
          values, zero, data=(values,),
          name='assert_greater_or_equal_0')
      with ops.control_dependencies((assert_less, assert_greater)):
        values = array_ops.identity(values)
    else:
      # Assign default for out-of-range values.
      values = array_ops.where(
          math_ops.logical_or(
              values < zero, values >= num_buckets, name='out_of_range'),
          array_ops.fill(
              dims=array_ops.shape(values),
              value=math_ops.cast(self.default_value, dtypes.int64),
              name='default_values'), values)

    return sparse_tensor_lib.SparseTensor(
        indices=input_tensor.indices,
        values=values,
        dense_shape=input_tensor.dense_shape)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.num_buckets

  def _get_sparse_tensors(
      self, inputs, weight_collections=None, trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)


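# Hedged sketch of the out-of-range handling above: when `default_value` is
# set, ids outside [0, num_buckets) are replaced rather than rejected.
# Illustrative helper only.
def _example_identity_defaults(values, num_buckets, default_value):
  """Replaces out-of-range int ids with `default_value`."""
  values = math_ops.cast(values, dtypes.int64)
  out_of_range = math_ops.logical_or(values < 0, values >= num_buckets)
  return array_ops.where(
      out_of_range,
      array_ops.fill(
          dims=array_ops.shape(values),
          value=math_ops.cast(default_value, dtypes.int64)),
      values)

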
class _WeightedCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple('_WeightedCategoricalColumn', (
        'categorical_column', 'weight_feature_key', 'dtype'
    ))):
  """See `weighted_categorical_column`."""

  @property
  def name(self):
    return '{}_weighted_by_{}'.format(
        self.categorical_column.name, self.weight_feature_key)

  @property
  def _parse_example_spec(self):
    config = self.categorical_column._parse_example_spec  # pylint: disable=protected-access
    if self.weight_feature_key in config:
      raise ValueError('Parse config {} already exists for {}.'.format(
          config[self.weight_feature_key], self.weight_feature_key))
    config[self.weight_feature_key] = parsing_ops.VarLenFeature(self.dtype)
    return config

  @property
  def _num_buckets(self):
    return self.categorical_column._num_buckets  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    weight_tensor = inputs.get(self.weight_feature_key)
    if weight_tensor is None:
      raise ValueError('Missing weights {}.'.format(self.weight_feature_key))
    weight_tensor = sparse_tensor_lib.convert_to_tensor_or_sparse_tensor(
        weight_tensor)
    if self.dtype != weight_tensor.dtype.base_dtype:
      raise ValueError('Bad dtype, expected {}, but got {}.'.format(
          self.dtype, weight_tensor.dtype))
    if not isinstance(weight_tensor, sparse_tensor_lib.SparseTensor):
      # The weight tensor can be a regular Tensor. In this case, sparsify it.
      weight_tensor = _to_sparse_input_and_drop_ignore_values(
          weight_tensor, ignore_value=0.0)
    if not weight_tensor.dtype.is_floating:
      weight_tensor = math_ops.cast(weight_tensor, dtypes.float32)
    return (inputs.get(self.categorical_column), weight_tensor)

  def _get_sparse_tensors(
      self, inputs, weight_collections=None, trainable=None):
    del weight_collections
    del trainable
    tensors = inputs.get(self)
    return _CategoricalColumn.IdWeightPair(tensors[0], tensors[1])


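# Hedged sketch of what the transform above produces: an (ids, weights) pair
# of index-aligned SparseTensors, e.g. for frequency- or TF-IDF-weighted
# inputs. Illustrative helper only.
def _example_weighted_ids():
  """Builds an aligned (id_tensor, weight_tensor) pair for one example."""
  ids = sparse_tensor_lib.SparseTensor(
      indices=[[0, 0], [0, 1]], values=['math', 'english'],
      dense_shape=[1, 2])
  weights = sparse_tensor_lib.SparseTensor(
      indices=[[0, 0], [0, 1]], values=[0.5, 1.5], dense_shape=[1, 2])
  return ids, weights

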
class _CrossedColumn(
    _CategoricalColumn,
    collections.namedtuple('_CrossedColumn',
                           ['keys', 'hash_bucket_size', 'hash_key'])):
  """See `crossed_column`."""

  @property
  def name(self):
    feature_names = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, _FeatureColumn):
        feature_names.append(key.name)
      else:  # key must be a string
        feature_names.append(key)
    return '_X_'.join(sorted(feature_names))

  @property
  def _parse_example_spec(self):
    config = {}
    for key in self.keys:
      if isinstance(key, _FeatureColumn):
        config.update(key._parse_example_spec)  # pylint: disable=protected-access
      else:  # key must be a string
        config.update({key: parsing_ops.VarLenFeature(dtypes.string)})
    return config

  def _transform_feature(self, inputs):
    feature_tensors = []
    for key in _collect_leaf_level_keys(self):
      if isinstance(key, six.string_types):
        feature_tensors.append(inputs.get(key))
      elif isinstance(key, _CategoricalColumn):
        ids_and_weights = key._get_sparse_tensors(inputs)  # pylint: disable=protected-access
        if ids_and_weights.weight_tensor is not None:
          raise ValueError(
              'crossed_column does not support weight_tensor, but the given '
              'column populates weight_tensor. '
              'Given column: {}'.format(key.name))
        feature_tensors.append(ids_and_weights.id_tensor)
      else:
        raise ValueError('Unsupported column type. Given: {}'.format(key))
    return sparse_ops.sparse_cross_hashed(
        inputs=feature_tensors,
        num_buckets=self.hash_bucket_size,
        hash_key=self.hash_key)

  @property
  def _num_buckets(self):
    """Returns number of buckets in this sparse feature."""
    return self.hash_bucket_size

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    return _CategoricalColumn.IdWeightPair(inputs.get(self), None)


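# Hedged sketch of the cross above: the Cartesian product of the sparse
# inputs is hashed into `num_buckets` buckets in a single op. Illustrative
# helper only; parameter names are assumptions.
def _example_cross_hashed(dept_ids, age_bucket_ids, num_buckets=1000):
  """Crosses two sparse inputs into `num_buckets` hash buckets."""
  return sparse_ops.sparse_cross_hashed(
      inputs=[dept_ids, age_bucket_ids], num_buckets=num_buckets)

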
def _collect_leaf_level_keys(cross):
  """Collects base keys by expanding all nested crosses.

  Args:
    cross: A `_CrossedColumn`.

  Returns:
    A list of strings or `_CategoricalColumn` instances.
  """
  leaf_level_keys = []
  for k in cross.keys:
    if isinstance(k, _CrossedColumn):
      leaf_level_keys.extend(_collect_leaf_level_keys(k))
    else:
      leaf_level_keys.append(k)
  return leaf_level_keys


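# Hedged sketch: nested crosses are flattened to their base keys, so crossing
# an existing cross with a new key behaves like one flat cross. Illustrative
# helper only.
def _example_leaf_keys():
  """Returns ['a', 'b', 'c'] for a cross of (cross('a', 'b'), 'c')."""
  inner = _CrossedColumn(keys=('a', 'b'), hash_bucket_size=10, hash_key=None)
  outer = _CrossedColumn(keys=(inner, 'c'), hash_bucket_size=10, hash_key=None)
  return _collect_leaf_level_keys(outer)

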
class _IndicatorColumn(_DenseColumn, _SequenceDenseColumn,
                       collections.namedtuple('_IndicatorColumn',
                                              ['categorical_column'])):
  """Represents a one-hot column for use in deep networks.

  Args:
    categorical_column: A `_CategoricalColumn` which is created by a
      `categorical_column_with_*` function.
  """

  @property
  def name(self):
    return '{}_indicator'.format(self.categorical_column.name)

  def _transform_feature(self, inputs):
    """Returns a dense `Tensor` representing the feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.

    Returns:
      Transformed feature `Tensor`.

    Raises:
      ValueError: if input rank is not known at graph building time.
    """
    id_weight_pair = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    id_tensor = id_weight_pair.id_tensor
    weight_tensor = id_weight_pair.weight_tensor

    # If the underlying column is weighted, return the input as a dense tensor.
    if weight_tensor is not None:
      weighted_column = sparse_ops.sparse_merge(
          sp_ids=id_tensor,
          sp_values=weight_tensor,
          vocab_size=int(self._variable_shape[-1]))
      # Remove (?, -1) index.
      weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
                                                weighted_column.dense_shape)
      # Use scatter_nd to merge duplicated indices if they exist,
      # instead of sparse_tensor_to_dense.
      return array_ops.scatter_nd(weighted_column.indices,
                                  weighted_column.values,
                                  weighted_column.dense_shape)

    dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
        id_tensor, default_value=-1)

    # The one-hot tensor must be float for tf.concat reasons, since all other
    # inputs to input_layer are float32.
    one_hot_id_tensor = array_ops.one_hot(
        dense_id_tensor,
        depth=self._variable_shape[-1],
        on_value=1.0,
        off_value=0.0)

    # Reduce to get a multi-hot per example.
    return math_ops.reduce_sum(one_hot_id_tensor, axis=[-2])

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  @property
  def _variable_shape(self):
    """Returns a `TensorShape` representing the shape of the dense `Tensor`."""
    return tensor_shape.TensorShape([1, self.categorical_column._num_buckets])  # pylint: disable=protected-access

  def _get_dense_tensor(self, inputs, weight_collections=None, trainable=None):
    """Returns a dense `Tensor` representing the feature.

    Args:
      inputs: A `_LazyBuilder` object to access inputs.
      weight_collections: Unused `weight_collections` since no variables are
        created in this function.
      trainable: Unused `trainable` bool since no variables are created in
        this function.

    Returns:
      Dense `Tensor` created within `_transform_feature`.

    Raises:
      ValueError: If `categorical_column` is a `_SequenceCategoricalColumn`.
    """
    # Do nothing with weight_collections and trainable since no variables are
    # created in this function.
    del weight_collections
    del trainable
    if isinstance(self.categorical_column, _SequenceCategoricalColumn):
      raise ValueError(
          'In indicator_column: {}. '
          'categorical_column must not be of type _SequenceCategoricalColumn. '
          'Suggested fix A: If you wish to use input_layer, use a '
          'non-sequence categorical_column_with_*. '
          'Suggested fix B: If you wish to create sequence input, use '
          'sequence_input_layer instead of input_layer. '
          'Given (type {}): {}'.format(
              self.name, type(self.categorical_column),
              self.categorical_column))
    # The feature has already been transformed. Return the intermediate
    # representation created by _transform_feature.
    return inputs.get(self)

  def _get_sequence_dense_tensor(
      self, inputs, weight_collections=None, trainable=None):
    # Do nothing with weight_collections and trainable since no variables are
    # created in this function.
    del weight_collections
    del trainable
    if not isinstance(self.categorical_column, _SequenceCategoricalColumn):
      raise ValueError(
          'In indicator_column: {}. '
          'categorical_column must be of type _SequenceCategoricalColumn '
          'to use sequence_input_layer. '
          'Suggested fix: Use one of sequence_categorical_column_with_*. '
          'Given (type {}): {}'.format(
              self.name, type(self.categorical_column),
              self.categorical_column))
    # The feature has already been transformed. Return the intermediate
    # representation created by _transform_feature.
    dense_tensor = inputs.get(self)
    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    sequence_length = fc_utils.sequence_length_from_sparse_tensor(
        sparse_tensors.id_tensor)
    return _SequenceDenseColumn.TensorSequenceLengthPair(
        dense_tensor=dense_tensor, sequence_length=sequence_length)


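# Hedged sketch of the unweighted indicator transform above: one-hot each id
# (missing ids of -1 become all-zero rows) and sum over the sequence axis to
# obtain a multi-hot vector per example. Illustrative helper only.
def _example_multi_hot(dense_ids, depth):
  """Converts [batch, seq] int ids into [batch, depth] multi-hot floats."""
  one_hot_ids = array_ops.one_hot(
      dense_ids, depth=depth, on_value=1.0, off_value=0.0)
  return math_ops.reduce_sum(one_hot_ids, axis=[-2])

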
def _verify_static_batch_size_equality(tensors, columns):
  """Validates that the first dim (batch size) of all tensors is the same or None.

  Args:
    tensors: list of tensors to check.
    columns: list of feature columns matching tensors. Will be used for error
      messaging.

  Raises:
    ValueError: if the tensors do not all have the same static batch size.
  """
  # expected_batch_size is a tf.Dimension object.
  expected_batch_size = None
  for i in range(len(tensors)):
    if tensors[i].shape.dims[0].value is not None:
      if expected_batch_size is None:
        batch_size_column_index = i
        expected_batch_size = tensors[i].shape.dims[0]
      elif not expected_batch_size.is_compatible_with(tensors[i].shape.dims[0]):
        raise ValueError(
            'Batch size (first dimension) of each feature must be the same. '
            'Batch sizes of columns ({}, {}): ({}, {})'.format(
                columns[batch_size_column_index].name, columns[i].name,
                expected_batch_size, tensors[i].shape.dims[0]))


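# Hedged sketch of the check above: a known batch size of 32 together with an
# unknown (None) batch dimension passes, because None dims are skipped; two
# different known sizes would raise ValueError. The column objects are only
# used for error messages, so the names here are illustrative assumptions.
def _example_static_batch_size_check():
  """Runs the batch-size check on one static and one dynamic input."""
  a = array_ops.placeholder(dtypes.float32, shape=[32, 4])
  b = array_ops.placeholder(dtypes.float32, shape=[None, 4])
  _verify_static_batch_size_equality(
      [a, b], [numeric_column('a'), numeric_column('b')])

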
class _SequenceCategoricalColumn(
    _CategoricalColumn,
    collections.namedtuple(
        '_SequenceCategoricalColumn', ['categorical_column'])):
  """Represents sequences of categorical data."""

  @property
  def name(self):
    return self.categorical_column.name

  @property
  def _parse_example_spec(self):
    return self.categorical_column._parse_example_spec  # pylint: disable=protected-access

  def _transform_feature(self, inputs):
    return self.categorical_column._transform_feature(inputs)  # pylint: disable=protected-access

  @property
  def _num_buckets(self):
    return self.categorical_column._num_buckets  # pylint: disable=protected-access

  def _get_sparse_tensors(self, inputs, weight_collections=None,
                          trainable=None):
    sparse_tensors = self.categorical_column._get_sparse_tensors(inputs)  # pylint: disable=protected-access
    id_tensor = sparse_tensors.id_tensor
    weight_tensor = sparse_tensors.weight_tensor

    # Expands the third dimension, if necessary, so that embeddings are not
    # combined during the embedding lookup. If the tensor is already 3-D, it
    # is left as-is.
    shape = array_ops.shape(id_tensor)
    # Compute the third dimension explicitly instead of setting it to -1, as
    # -1 doesn't work for dynamically shaped tensors that are 0-length at
    # runtime (e.g. empty sequences).
    target_shape = [shape[0], shape[1], math_ops.reduce_prod(shape[2:])]
    id_tensor = sparse_ops.sparse_reshape(id_tensor, target_shape)
    if weight_tensor is not None:
      weight_tensor = sparse_ops.sparse_reshape(weight_tensor, target_shape)

    return _CategoricalColumn.IdWeightPair(id_tensor, weight_tensor)

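
# Hedged sketch of the reshape above: a 2-D [batch, max_len] id tensor is
# expanded to 3-D [batch, max_len, 1] so each sequence step keeps its own id
# during embedding lookup. Illustrative helper only, not part of this
# module's API.
def _example_expand_sequence_ids():
  """Reshapes a [2, 2] sparse id tensor to [2, 2, 1]."""
  ids = sparse_tensor_lib.SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]], values=[3, 1, 5], dense_shape=[2, 2])
  return sparse_ops.sparse_reshape(ids, [2, 2, 1])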