• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Utilities for preprocessing sequence data."""
16# pylint: disable=invalid-name
17
18from keras_preprocessing import sequence
19
20from tensorflow.python.keras.utils import data_utils
21from tensorflow.python.util.tf_export import keras_export
22
# Module-level aliases for helpers implemented in the imported
# `sequence` module; no wrapping is applied, the same objects are reused.
skipgrams = sequence.skipgrams
make_sampling_table = sequence.make_sampling_table
# TODO(fchollet): consider making `_remove_long_seq` public.
_remove_long_seq = sequence._remove_long_seq  # pylint: disable=protected-access
27
28
@keras_export('keras.preprocessing.sequence.TimeseriesGenerator')
class TimeseriesGenerator(sequence.TimeseriesGenerator, data_utils.Sequence):
  """Utility class for generating batches of temporal data.

  This class takes in a sequence of data-points gathered at
  equal intervals, along with time series parameters such as
  stride, length of history, etc., to produce batches for
  training/validation.

  No behavior is defined here: the class only combines
  `sequence.TimeseriesGenerator` with `data_utils.Sequence` and exports the
  result under the Keras API name.

  Args:
      data: Indexable generator (such as list or Numpy array)
          containing consecutive data points (timesteps).
          The data should be 2D, and axis 0 is expected
          to be the time dimension.
      targets: Targets corresponding to timesteps in `data`.
          It should have the same length as `data`.
      length: Length of the output sequences (in number of timesteps).
      sampling_rate: Period between successive individual timesteps
          within sequences. For rate `r`, timesteps
          `data[i]`, `data[i-r]`, ... `data[i - length]`
          are used to create a sample sequence.
      stride: Period between successive output sequences.
          For stride `s`, consecutive output samples would
          be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
      start_index: Data points earlier than `start_index` will not be used
          in the output sequences. This is useful to reserve part of the
          data for test or validation.
      end_index: Data points later than `end_index` will not be used
          in the output sequences. This is useful to reserve part of the
          data for test or validation.
      shuffle: Whether to shuffle output samples,
          or instead draw them in chronological order.
      reverse: Boolean: if `True`, timesteps in each output sample will be
          in reverse chronological order.
      batch_size: Number of timeseries samples in each batch
          (except maybe the last one).

  Returns:
      A [Sequence](
      https://www.tensorflow.org/api_docs/python/tf/keras/utils/Sequence)
      instance.

  Examples:

  ```python
  from keras.preprocessing.sequence import TimeseriesGenerator
  import numpy as np
  data = np.array([[i] for i in range(50)])
  targets = np.array([[i] for i in range(50)])
  data_gen = TimeseriesGenerator(data, targets,
                                 length=10, sampling_rate=2,
                                 batch_size=2)
  assert len(data_gen) == 20
  batch_0 = data_gen[0]
  x, y = batch_0
  assert np.array_equal(x,
                        np.array([[[0], [2], [4], [6], [8]],
                                  [[1], [3], [5], [7], [9]]]))
  assert np.array_equal(y,
                        np.array([[10], [11]]))
  ```
  """
86
87
@keras_export('keras.preprocessing.sequence.pad_sequences')
def pad_sequences(sequences, maxlen=None, dtype='int32',
                  padding='pre', truncating='pre', value=0.):
  """Pads or truncates every sequence to one common length.

  Converts a list of `num_samples` sequences (lists of integers) into a
  2D Numpy array of shape `(num_samples, num_timesteps)`. `num_timesteps`
  equals `maxlen` when that argument is given; otherwise it is the length
  of the longest sequence in the list.

  A sequence shorter than `num_timesteps` is filled with `value` until it
  is `num_timesteps` long; a sequence longer than `num_timesteps` is cut
  down to fit.

  `padding` selects the side on which fill values are added and
  `truncating` selects the side from which values are dropped. Both default
  to `'pre'`, i.e. the beginning of the sequence.

  >>> sequence = [[1], [2, 3], [4, 5, 6]]
  >>> tf.keras.preprocessing.sequence.pad_sequences(sequence)
  array([[0, 0, 1],
         [0, 2, 3],
         [4, 5, 6]], dtype=int32)

  >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, value=-1)
  array([[-1, -1,  1],
         [-1,  2,  3],
         [ 4,  5,  6]], dtype=int32)

  >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, padding='post')
  array([[1, 0, 0],
         [2, 3, 0],
         [4, 5, 6]], dtype=int32)

  >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=2)
  array([[0, 1],
         [2, 3],
         [5, 6]], dtype=int32)

  Args:
      sequences: List of sequences (each sequence is a list of integers).
      maxlen: Optional Int, maximum length of all sequences. When omitted,
          sequences are padded to the length of the longest individual
          sequence.
      dtype: (Optional, defaults to int32). Type of the output sequences.
          Use `object` to pad sequences of variable length strings.
      padding: String, 'pre' or 'post' (optional, defaults to 'pre'):
          whether to pad before or after each sequence.
      truncating: String, 'pre' or 'post' (optional, defaults to 'pre'):
          whether values are removed from the beginning or from the end of
          sequences larger than `maxlen`.
      value: Float or String, padding value. (Optional, defaults to 0.)

  Returns:
      Numpy array with shape `(len(sequences), maxlen)`

  Raises:
      ValueError: In case of invalid values for `truncating` or `padding`,
          or in case of invalid shape for a `sequences` entry.
  """
  # All work is delegated to the implementation in the imported `sequence`
  # module; every argument is forwarded unchanged.
  padded = sequence.pad_sequences(
      sequences,
      maxlen=maxlen,
      dtype=dtype,
      padding=padding,
      truncating=truncating,
      value=value)
  return padded
155
# `make_sampling_table` and `skipgrams` are plain aliases of functions from the
# imported `sequence` module (assigned near the top of this file), so there is
# no local `def` to decorate. The object returned by `keras_export(name)` is
# therefore called on each function directly.
# NOTE(review): presumably this registers each function under the given public
# API name, as the `@keras_export(...)` decorators above do -- confirm against
# `tf_export`.
keras_export(
    'keras.preprocessing.sequence.make_sampling_table')(make_sampling_table)
keras_export('keras.preprocessing.sequence.skipgrams')(skipgrams)
159