# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""This API defines FeatureColumn for sequential input.

NOTE: This API is a work in progress and will likely be changing frequently.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function


import collections


from tensorflow.python.feature_column import feature_column_v2 as fc
from tensorflow.python.feature_column import utils as fc_utils
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.framework import tensor_shape
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import check_ops
from tensorflow.python.ops import parsing_ops
from tensorflow.python.ops import sparse_ops
from tensorflow.python.util.tf_export import tf_export


# pylint: disable=protected-access
def concatenate_context_input(context_input, sequence_input):
  """Replicates `context_input` across all timesteps of `sequence_input`.

  Expands dimension 1 of `context_input` then tiles it `sequence_length` times.
  This value is appended to `sequence_input` on dimension 2 and the result is
  returned.

  Args:
    context_input: A `Tensor` of dtype `float32` and shape `[batch_size, d1]`.
    sequence_input: A `Tensor` of dtype `float32` and shape `[batch_size,
      padded_length, d0]`.

  Returns:
    A `Tensor` of dtype `float32` and shape `[batch_size, padded_length,
    d0 + d1]`.

  Raises:
    ValueError: If `sequence_input` does not have rank 3 or `context_input` does
      not have rank 2.
    TypeError: If `sequence_input` or `context_input` does not have dtype
      `float32`.
  """
  seq_rank_check = check_ops.assert_rank(
      sequence_input,
      3,
      message='sequence_input must have rank 3',
      data=[array_ops.shape(sequence_input)])
  seq_type_check = check_ops.assert_type(
      sequence_input,
      dtypes.float32,
      message='sequence_input must have dtype float32; got {}.'.format(
          sequence_input.dtype))
  ctx_rank_check = check_ops.assert_rank(
      context_input,
      2,
      message='context_input must have rank 2',
      data=[array_ops.shape(context_input)])
  ctx_type_check = check_ops.assert_type(
      context_input,
      dtypes.float32,
      message='context_input must have dtype float32; got {}.'.format(
          context_input.dtype))
  # `padded_length` is created under the assertions, so every op that consumes
  # it (the tile and, transitively, the final concat) runs after the checks.
  with ops.control_dependencies(
      [seq_rank_check, seq_type_check, ctx_rank_check, ctx_type_check]):
    padded_length = array_ops.shape(sequence_input)[1]
  # [batch_size, d1] -> [batch_size, 1, d1] -> [batch_size, padded_length, d1].
  tiled_context_input = array_ops.tile(
      array_ops.expand_dims(context_input, 1),
      array_ops.concat([[1], [padded_length], [1]], 0))
  return array_ops.concat([sequence_input, tiled_context_input], 2)


@tf_export('feature_column.sequence_categorical_column_with_identity')
def sequence_categorical_column_with_identity(
    key, num_buckets, default_value=None):
  """Returns a feature column that represents sequences of integers.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such as
  RNN.

  Example:

  ```python
  watches = sequence_categorical_column_with_identity(
      'watches', num_buckets=1000)
  watches_embedding = embedding_column(watches, dimension=10)
  columns = [watches_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    num_buckets: Range of inputs. Namely, inputs are expected to be in the
      range `[0, num_buckets)`.
    default_value: If `None`, this column's graph operations will fail for
      out-of-range inputs. Otherwise, this value must be in the range
      `[0, num_buckets)`, and will replace out-of-range inputs.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: if `num_buckets` is less than one.
    ValueError: if `default_value` is not in range `[0, num_buckets)`.
  """
  return fc.SequenceCategoricalColumn(
      fc.categorical_column_with_identity(
          key=key,
          num_buckets=num_buckets,
          default_value=default_value))


@tf_export('feature_column.sequence_categorical_column_with_hash_bucket')
def sequence_categorical_column_with_hash_bucket(
    key, hash_bucket_size, dtype=dtypes.string):
  """A sequence of categorical terms where ids are set by hashing.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such as
  RNN.

  Example:

  ```python
  tokens = sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=1000)
  tokens_embedding = embedding_column(tokens, dimension=10)
  columns = [tokens_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    hash_bucket_size: An int > 1. The number of buckets.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: `hash_bucket_size` is not greater than 1.
    ValueError: `dtype` is neither string nor integer.
  """
  return fc.SequenceCategoricalColumn(
      fc.categorical_column_with_hash_bucket(
          key=key,
          hash_bucket_size=hash_bucket_size,
          dtype=dtype))


@tf_export('feature_column.sequence_categorical_column_with_vocabulary_file')
def sequence_categorical_column_with_vocabulary_file(
    key, vocabulary_file, vocabulary_size=None, num_oov_buckets=0,
    default_value=None, dtype=dtypes.string):
  """A sequence of categorical terms where ids use a vocabulary file.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such as
  RNN.

  Example:

  ```python
  states = sequence_categorical_column_with_vocabulary_file(
      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
      num_oov_buckets=5)
  states_embedding = embedding_column(states, dimension=10)
  columns = [states_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    vocabulary_file: The vocabulary file name.
    vocabulary_size: Number of the elements in the vocabulary. This must be no
      greater than length of `vocabulary_file`, if less than length, later
      values are ignored. If None, it is set to the length of `vocabulary_file`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
      the input value. A positive `num_oov_buckets` can not be specified with
      `default_value`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This can not be specified with a positive
      `num_oov_buckets`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: `vocabulary_file` is missing or cannot be opened.
    ValueError: `vocabulary_size` is missing or < 1.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: `dtype` is neither string nor integer.
  """
  return fc.SequenceCategoricalColumn(
      fc.categorical_column_with_vocabulary_file(
          key=key,
          vocabulary_file=vocabulary_file,
          vocabulary_size=vocabulary_size,
          num_oov_buckets=num_oov_buckets,
          default_value=default_value,
          dtype=dtype))


@tf_export('feature_column.sequence_categorical_column_with_vocabulary_list')
def sequence_categorical_column_with_vocabulary_list(
    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
  """A sequence of categorical terms where ids use an in-memory list.

  Pass this to `embedding_column` or `indicator_column` to convert sequence
  categorical data into dense representation for input to sequence NN, such as
  RNN.

  Example:

  ```python
  colors = sequence_categorical_column_with_vocabulary_list(
      key='colors', vocabulary_list=('R', 'G', 'B', 'Y'),
      num_oov_buckets=2)
  colors_embedding = embedding_column(colors, dimension=3)
  columns = [colors_embedding]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input feature.
    vocabulary_list: An ordered iterable defining the vocabulary. Each feature
      is mapped to the index of its value (if present) in `vocabulary_list`.
      Must be castable to `dtype`.
    dtype: The type of features. Only string and integer types are supported.
      If `None`, it will be inferred from `vocabulary_list`.
    default_value: The integer ID value to return for out-of-vocabulary feature
      values, defaults to `-1`. This can not be specified with a positive
      `num_oov_buckets`.
    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
      `[len(vocabulary_list), len(vocabulary_list)+num_oov_buckets)` based on a
      hash of the input value. A positive `num_oov_buckets` can not be specified
      with `default_value`.

  Returns:
    A `SequenceCategoricalColumn`.

  Raises:
    ValueError: if `vocabulary_list` is empty, or contains duplicate keys.
    ValueError: `num_oov_buckets` is a negative integer.
    ValueError: `num_oov_buckets` and `default_value` are both specified.
    ValueError: if `dtype` is not integer or string.
  """
  return fc.SequenceCategoricalColumn(
      fc.categorical_column_with_vocabulary_list(
          key=key,
          vocabulary_list=vocabulary_list,
          dtype=dtype,
          default_value=default_value,
          num_oov_buckets=num_oov_buckets))


@tf_export('feature_column.sequence_numeric_column')
def sequence_numeric_column(
    key,
    shape=(1,),
    default_value=0.,
    dtype=dtypes.float32,
    normalizer_fn=None):
  """Returns a feature column that represents sequences of numeric data.

  Example:

  ```python
  temperature = sequence_numeric_column('temperature')
  columns = [temperature]

  features = tf.io.parse_example(..., features=make_parse_example_spec(columns))
  sequence_feature_layer = SequenceFeatures(columns)
  sequence_input, sequence_length = sequence_feature_layer(features)
  sequence_length_mask = tf.sequence_mask(sequence_length)

  rnn_cell = tf.keras.layers.SimpleRNNCell(hidden_size)
  rnn_layer = tf.keras.layers.RNN(rnn_cell)
  outputs, state = rnn_layer(sequence_input, mask=sequence_length_mask)
  ```

  Args:
    key: A unique string identifying the input features.
    shape: The shape of the input data per sequence id. E.g. if `shape=(2,)`,
      each example must contain `2 * sequence_length` values.
    default_value: A single value compatible with `dtype` that is used for
      padding the sparse data into a dense `Tensor`.
    dtype: The type of values.
    normalizer_fn: If not `None`, a function that can be used to normalize the
      value of the tensor after `default_value` is applied for parsing.
      Normalizer function takes the input `Tensor` as its argument, and returns
      the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that
      even though the most common use case of this function is normalization, it
      can be used for any kind of Tensorflow transformations.

  Returns:
    A `SequenceNumericColumn`.

  Raises:
    TypeError: if any dimension in shape is not an int.
    TypeError: if `normalizer_fn` is given but is not callable.
    ValueError: if any dimension in shape is not a positive integer.
    ValueError: if `dtype` is not convertible to `tf.float32`.
  """
  shape = fc._check_shape(shape=shape, key=key)
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError('dtype must be convertible to float. '
                     'dtype: {}, key: {}'.format(dtype, key))
  if normalizer_fn is not None and not callable(normalizer_fn):
    raise TypeError(
        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))

  return SequenceNumericColumn(
      key,
      shape=shape,
      default_value=default_value,
      dtype=dtype,
      normalizer_fn=normalizer_fn)


def _assert_all_equal_and_return(tensors, name=None):
  """Asserts that all tensors are equal and returns the first one.

  Args:
    tensors: A non-empty sequence of tensors to compare against `tensors[0]`.
    name: Optional name scope for the created ops; defaults to
      'assert_all_equal'.

  Returns:
    `tensors[0]` itself when only one tensor is given; otherwise an identity of
    `tensors[0]` that depends on the equality assertions.
  """
  with ops.name_scope(name, 'assert_all_equal', values=tensors):
    if len(tensors) == 1:
      return tensors[0]
    assert_equal_ops = [
        check_ops.assert_equal(tensors[0], t) for t in tensors[1:]
    ]
    with ops.control_dependencies(assert_equal_ops):
      return array_ops.identity(tensors[0])


class SequenceNumericColumn(
    fc.SequenceDenseColumn,
    collections.namedtuple(
        'SequenceNumericColumn',
        ('key', 'shape', 'default_value', 'dtype', 'normalizer_fn'))):
  """Represents sequences of numeric data."""

  @property
  def _is_v2_column(self):
    return True

  @property
  def name(self):
    """See `FeatureColumn` base class."""
    return self.key

  @property
  def parse_example_spec(self):
    """See `FeatureColumn` base class."""
    return {self.key: parsing_ops.VarLenFeature(self.dtype)}

  def transform_feature(self, transformation_cache, state_manager):
    """See `FeatureColumn` base class.

    In this case, we apply the `normalizer_fn` to the input tensor.

    Args:
      transformation_cache: A `FeatureTransformationCache` object to access
        features.
      state_manager: A `StateManager` to create / access resources such as
        lookup tables.

    Returns:
      Normalized input tensor.
    """
    input_tensor = transformation_cache.get(self.key, state_manager)
    if self.normalizer_fn is not None:
      input_tensor = self.normalizer_fn(input_tensor)
    return input_tensor

  @property
  def variable_shape(self):
    """Returns a `TensorShape` representing the shape of sequence input."""
    return tensor_shape.TensorShape(self.shape)

  def get_sequence_dense_tensor(self, transformation_cache, state_manager):
    """Returns a `TensorSequenceLengthPair`.

    Args:
      transformation_cache: A `FeatureTransformationCache` object to access
        features.
      state_manager: A `StateManager` to create / access resources such as
        lookup tables.

    Returns:
      A `TensorSequenceLengthPair` whose `dense_tensor` is the transformed
      sparse input densified with `default_value` and reshaped to
      `[batch_size, T, variable_shape]`, and whose `sequence_length` is derived
      from the sparse input.
    """
    sp_tensor = transformation_cache.get(self, state_manager)
    dense_tensor = sparse_ops.sparse_tensor_to_dense(
        sp_tensor, default_value=self.default_value)
    # Reshape into [batch_size, T, variable_shape].
    dense_shape = array_ops.concat(
        [array_ops.shape(dense_tensor)[:1], [-1], self.variable_shape],
        axis=0)
    dense_tensor = array_ops.reshape(dense_tensor, shape=dense_shape)

    # Get the number of timesteps per example
    # For the 2D case, the raw values are grouped according to num_elements;
    # for the 3D case, the grouping happens in the third dimension, and
    # sequence length is not affected.
    if sp_tensor.shape.ndims == 2:
      num_elements = self.variable_shape.num_elements()
    else:
      num_elements = 1
    seq_length = fc_utils.sequence_length_from_sparse_tensor(
        sp_tensor, num_elements=num_elements)

    return fc.SequenceDenseColumn.TensorSequenceLengthPair(
        dense_tensor=dense_tensor, sequence_length=seq_length)

  @property
  def parents(self):
    """See `FeatureColumn` base class."""
    return [self.key]

  def get_config(self):
    """See `FeatureColumn` base class."""
    config = dict(zip(self._fields, self))
    # NOTE(review): `normalizer_fn` is copied into the config as the callable
    # object itself; only `dtype` is converted to a plain value. Confirm
    # whether callers need the config to be serializable when a normalizer is
    # set.
    config['dtype'] = self.dtype.name
    return config

  @classmethod
  def from_config(cls, config, custom_objects=None, columns_by_name=None):
    """See `FeatureColumn` base class."""
    fc._check_config_keys(config, cls._fields)
    kwargs = fc._standardize_and_copy_config(config)
    # `get_config` stores the dtype by name; turn it back into a `DType`.
    kwargs['dtype'] = dtypes.as_dtype(config['dtype'])
    return cls(**kwargs)


# pylint: enable=protected-access