# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for preprocessing sequence data.

Everything in this module is a thin re-export of, or wrapper around, the
implementations in `keras_preprocessing.sequence`.
"""
# pylint: disable=invalid-name
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from keras_preprocessing import sequence

from tensorflow.python.keras.utils import data_utils
from tensorflow.python.util.tf_export import keras_export

# Plain aliases: these names refer to the very same function objects that
# `keras_preprocessing.sequence` defines (no wrapping is done here).
make_sampling_table = sequence.make_sampling_table
skipgrams = sequence.skipgrams
# TODO(fchollet): consider making `_remove_long_seq` public.
_remove_long_seq = sequence._remove_long_seq  # pylint: disable=protected-access


@keras_export('keras.preprocessing.sequence.TimeseriesGenerator')
class TimeseriesGenerator(sequence.TimeseriesGenerator, data_utils.Sequence):
  """Utility class for generating batches of temporal data.

  This class takes in a sequence of data-points gathered at
  equal intervals, along with time series parameters such as
  stride, length of history, etc., to produce batches for
  training/validation.

  This class adds no behavior of its own: it combines
  `keras_preprocessing.sequence.TimeseriesGenerator` with
  `data_utils.Sequence` as base classes.

  Args:
      data: Indexable generator (such as list or Numpy array)
          containing consecutive data points (timesteps).
          The data should be 2D, and axis 0 is expected
          to be the time dimension.
      targets: Targets corresponding to timesteps in `data`.
          It should have same length as `data`.
      length: Length of the output sequences (in number of timesteps).
      sampling_rate: Period between successive individual timesteps
          within sequences. For rate `r`, timesteps
          `data[i]`, `data[i-r]`, ... `data[i - length]`
          are used to create a sample sequence.
      stride: Period between successive output sequences.
          For stride `s`, consecutive output samples would
          be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
      start_index: Data points earlier than `start_index` will not be used
          in the output sequences. This is useful to reserve part of the
          data for test or validation.
      end_index: Data points later than `end_index` will not be used
          in the output sequences. This is useful to reserve part of the
          data for test or validation.
      shuffle: Whether to shuffle output samples,
          or instead draw them in chronological order.
      reverse: Boolean: if `True`, timesteps in each output sample will be
          in reverse chronological order.
      batch_size: Number of timeseries samples in each batch
          (except maybe the last one).

  Returns:
      A [Sequence](/utils/#sequence) instance.

  Example:

  ```python
  from keras.preprocessing.sequence import TimeseriesGenerator
  import numpy as np
  data = np.array([[i] for i in range(50)])
  targets = np.array([[i] for i in range(50)])
  data_gen = TimeseriesGenerator(data, targets,
                                 length=10, sampling_rate=2,
                                 batch_size=2)
  assert len(data_gen) == 20
  batch_0 = data_gen[0]
  x, y = batch_0
  assert np.array_equal(x,
                        np.array([[[0], [2], [4], [6], [8]],
                                  [[1], [3], [5], [7], [9]]]))
  assert np.array_equal(y,
                        np.array([[10], [11]]))
  ```
  """
  # Intentionally empty: nothing is overridden, so every method is inherited
  # from the two base classes.
  pass


@keras_export('keras.preprocessing.sequence.pad_sequences')
def pad_sequences(sequences, maxlen=None, dtype='int32',
                  padding='pre', truncating='pre', value=0.):
  """Pads sequences to the same length.

  This function transforms a list (of length `num_samples`)
  of sequences (lists of integers)
  into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
  `num_timesteps` is either the `maxlen` argument if provided,
  or the length of the longest sequence in the list.

  Sequences that are shorter than `num_timesteps`
  are padded with `value` until they are `num_timesteps` long.

  Sequences longer than `num_timesteps` are truncated
  so that they fit the desired length.

  The position where padding or truncation happens is determined by
  the arguments `padding` and `truncating`, respectively.
  Pre-padding or removing values from the beginning of the sequence is the
  default.

  All arguments are forwarded unchanged to
  `keras_preprocessing.sequence.pad_sequences`, which does the actual work.

  >>> sequence = [[1], [2, 3], [4, 5, 6]]
  >>> tf.keras.preprocessing.sequence.pad_sequences(sequence)
  array([[0, 0, 1],
         [0, 2, 3],
         [4, 5, 6]], dtype=int32)

  >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, value=-1)
  array([[-1, -1,  1],
         [-1,  2,  3],
         [ 4,  5,  6]], dtype=int32)

  >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, padding='post')
  array([[1, 0, 0],
         [2, 3, 0],
         [4, 5, 6]], dtype=int32)

  >>> tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=2)
  array([[0, 1],
         [2, 3],
         [5, 6]], dtype=int32)

  Args:
      sequences: List of sequences (each sequence is a list of integers).
      maxlen: Optional Int, maximum length of all sequences. If not provided,
          sequences will be padded to the length of the longest individual
          sequence.
      dtype: (Optional, defaults to int32). Type of the output sequences.
          To pad sequences with variable length strings, you can use `object`.
      padding: String, 'pre' or 'post' (optional, defaults to 'pre'):
          pad either before or after each sequence.
      truncating: String, 'pre' or 'post' (optional, defaults to 'pre'):
          remove values from sequences larger than
          `maxlen`, either at the beginning or at the end of the sequences.
      value: Float or String, padding value. (Optional, defaults to 0.)

  Returns:
      Numpy array with shape `(len(sequences), maxlen)`

  Raises:
      ValueError: In case of invalid values for `truncating` or `padding`,
          or in case of invalid shape for a `sequences` entry.
  """
  # Pure delegation: every parameter is passed through by keyword.
  return sequence.pad_sequences(
      sequences, maxlen=maxlen, dtype=dtype,
      padding=padding, truncating=truncating, value=value)

# These two functions are defined in `keras_preprocessing`, so `@keras_export`
# cannot be written above their `def`; the decorator is applied by calling it
# directly on the aliases instead.
keras_export(
    'keras.preprocessing.sequence.make_sampling_table')(make_sampling_table)
keras_export('keras.preprocessing.sequence.skipgrams')(skipgrams)