# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Operations for clipping (gradient, weight) tensors to min/max values."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import six

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_array_ops
from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.util import deprecation
from tensorflow.python.util import dispatch
from tensorflow.python.util.compat import collections_abc
from tensorflow.python.util.tf_export import tf_export


@tf_export("clip_by_value")
@dispatch.add_dispatch_support
def clip_by_value(t, clip_value_min, clip_value_max,
                  name=None):
  """Clips tensor values to a specified min and max.

  Returns a tensor with the same type and shape as `t` in which every value
  below `clip_value_min` has been replaced by `clip_value_min` and every value
  above `clip_value_max` has been replaced by `clip_value_max`.

  Note: `clip_value_min` needs to be smaller or equal to `clip_value_max` for
  correct results.

  For example:

  Basic usage passes a scalar as the min and max value.

  >>> t = tf.constant([[-10., -1., 0.], [0., 2., 10.]])
  >>> t2 = tf.clip_by_value(t, clip_value_min=-1, clip_value_max=1)
  >>> t2.numpy()
  array([[-1., -1.,  0.],
         [ 0.,  1.,  1.]], dtype=float32)

  The min and max can be the same size as `t`, or broadcastable to that size.

  >>> t = tf.constant([[-1, 0., 10.], [-1, 0, 10]])
  >>> clip_min = [[2],[1]]
  >>> t3 = tf.clip_by_value(t, clip_value_min=clip_min, clip_value_max=100)
  >>> t3.numpy()
  array([[ 2.,  2., 10.],
         [ 1.,  1., 10.]], dtype=float32)

  Broadcasting fails, intentionally, if you would expand the dimensions of `t`

  >>> t = tf.constant([[-1, 0., 10.], [-1, 0, 10]])
  >>> clip_min = [[[2, 1]]] # Has a third axis
  >>> t4 = tf.clip_by_value(t, clip_value_min=clip_min, clip_value_max=100)
  Traceback (most recent call last):
  ...
  InvalidArgumentError: Incompatible shapes: [2,3] vs. [1,1,2]

  It throws a `TypeError` if you try to clip an `int` to a `float` value
  (`tf.cast` the input to `float` first).

  >>> t = tf.constant([[1, 2], [3, 4]], dtype=tf.int32)
  >>> t5 = tf.clip_by_value(t, clip_value_min=-3.1, clip_value_max=3.1)
  Traceback (most recent call last):
  ...
  TypeError: Cannot convert ...


  Args:
    t: A `Tensor` or `IndexedSlices`.
    clip_value_min: The minimum value to clip to. A scalar `Tensor` or one that
      is broadcastable to the shape of `t`.
    clip_value_max: The maximum value to clip to. A scalar `Tensor` or one that
      is broadcastable to the shape of `t`.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.

  Raises:
    `tf.errors.InvalidArgumentError`: If the clip tensors would trigger array
      broadcasting that would make the returned tensor larger than the input.
    TypeError: If dtype of the input is `int32` and dtype of
      the `clip_value_min` or `clip_value_max` is `float32`
  """
  # For IndexedSlices only the `values` component is clipped; indices and
  # dense_shape are carried over unchanged when re-wrapping below.
  is_indexed_slices = isinstance(t, ops.IndexedSlices)

  with ops.name_scope(name, "clip_by_value",
                      [t, clip_value_min, clip_value_max]) as name:
    dense_input = t.values if is_indexed_slices else t
    values = ops.convert_to_tensor(dense_input, name="t")

    # Apply the upper bound first, then the lower bound. After each step the
    # result shape is checked against the input shape so that a bound with
    # extra dimensions cannot silently broadcast the output.
    upper_bounded = math_ops.minimum(values, clip_value_max)
    values.shape.assert_is_compatible_with(upper_bounded.shape)

    clipped = math_ops.maximum(upper_bounded, clip_value_min, name=name)
    values.shape.assert_is_compatible_with(clipped.shape)

    if not is_indexed_slices:
      return clipped
    return ops.IndexedSlices(clipped, t.indices, t.dense_shape)
  # TODO(scottzhu): switch to use new implementation in 2 weeks.
  # return gen_math_ops.clip_by_value(
  #     t, clip_value_min, clip_value_max, name=name)


# TODO(scottzhu): switch to use new implementation in 2 weeks.
# @ops.RegisterGradient("ClipByValue")
def _clip_by_value_grad(op, grad):
  """Returns grad of clip_by_value.

  The incoming gradient is routed to exactly one of the three op inputs per
  element: to the lower bound where the input was below it, to the upper bound
  where the input was above it, and to the input itself everywhere else.

  Args:
    op: The op being differentiated. Its inputs are read positionally as
      `(x, y, z)`; `y` is used as the lower bound and `z` as the upper bound
      (presumably `clip_value_min` / `clip_value_max` - confirm against the
      `ClipByValue` op definition).
    grad: Gradient with respect to the op output.

  Returns:
    A tuple `(gx, gy, gz)` of gradients, each reshaped to the shape of the
    corresponding input.
  """
  x = op.inputs[0]
  y = op.inputs[1]
  z = op.inputs[2]
  gdtype = grad.dtype
  sx = array_ops.shape(x)
  sy = array_ops.shape(y)
  sz = array_ops.shape(z)
  gradshape = array_ops.shape(grad)
  zeros = array_ops.zeros(gradshape, gdtype)
  xymask = math_ops.less(x, y)  # Elements clipped to the lower bound.
  xzmask = math_ops.greater(x, z)  # Elements clipped to the upper bound.
  # Reduction axes for x come from the second call only; the x-side result of
  # the first call was previously bound to `rx` and then overwritten, so it is
  # discarded explicitly here.
  _, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
  rx, rz = gen_array_ops.broadcast_gradient_args(sx, sz)
  xgrad = array_ops.where(math_ops.logical_or(xymask, xzmask), zeros, grad)
  ygrad = array_ops.where(xymask, grad, zeros)
  zgrad = array_ops.where(xzmask, grad, zeros)
  gx = array_ops.reshape(math_ops.reduce_sum(xgrad, rx), sx)
  gy = array_ops.reshape(math_ops.reduce_sum(ygrad, ry), sy)
  gz = array_ops.reshape(math_ops.reduce_sum(zgrad, rz), sz)
  return (gx, gy, gz)


@tf_export("clip_by_norm")
@dispatch.add_dispatch_support
def clip_by_norm(t, clip_norm, axes=None, name=None):
  """Clips tensor values to a maximum L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its L2-norm is less than or equal to `clip_norm`,
  along the dimensions given in `axes`. Specifically, in the default case
  where all dimensions are used for calculation, if the L2-norm of `t` is
  already less than or equal to `clip_norm`, then `t` is not modified. If
  the L2-norm is greater than `clip_norm`, then this operation returns a
  tensor of the same type and shape as `t` with its values set to:

  `t * clip_norm / l2norm(t)`

  In this case, the L2-norm of the output tensor is `clip_norm`.

  As another example, if `t` is a matrix and `axes == [1]`, then each row
  of the output will have L2-norm less than or equal to `clip_norm`. If
  `axes == [0]` instead, each column of the output will be clipped.

  Code example:

  >>> some_nums = tf.constant([[1, 2, 3, 4, 5]], dtype=tf.float32)
  >>> tf.clip_by_norm(some_nums, 2.0).numpy()
  array([[0.26967996, 0.5393599 , 0.80903983, 1.0787199 , 1.3483998 ]],
        dtype=float32)

  This operation is typically used to clip gradients before applying them with
  an optimizer.  Most gradient data is a collection of different shaped tensors
  for different parts of the model.  Thus, this is a common usage:

  ```
  # Get your gradients after training
  loss_value, grads = grad(model, features, labels)

  # Apply some clipping
  grads = [tf.clip_by_norm(g, norm)
               for g in grads]

  # Continue on with training
  optimizer.apply_gradients(grads)
  ```

  Args:
    t: A `Tensor` or `IndexedSlices`.  This must be a floating point type.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value, also
      floating point
    axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions
      to use for computing the L2-norm. If `None` (the default), uses all
      dimensions.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.

  Raises:
    ValueError: If the clip_norm tensor is not a 0-D scalar tensor.
    TypeError: If dtype of the input is not a floating point or
      complex type.
  """
  # For IndexedSlices only the `values` component is clipped.
  is_indexed_slices = isinstance(t, ops.IndexedSlices)

  with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as name:
    values = ops.convert_to_tensor(
        t.values if is_indexed_slices else t, name="t")

    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm
    squared_sum = math_ops.reduce_sum(values * values, axes, keepdims=True)
    is_positive = squared_sum > 0
    # Two-tap tf.where trick to bypass NaN gradients: sqrt only ever sees a
    # strictly positive argument, and the zero case is passed through as-is.
    safe_squared_sum = array_ops.where(
        is_positive, squared_sum, array_ops.ones_like(squared_sum))
    l2norm = array_ops.where(
        is_positive, math_ops.sqrt(safe_squared_sum), squared_sum)
    scaled = values * clip_norm
    # Assert that the shape is compatible with the initial shape,
    # to prevent unintentional broadcasting.
    values.shape.assert_is_compatible_with(scaled.shape)
    # Dividing by max(l2norm, clip_norm) leaves the values unchanged when the
    # norm is already within the limit.
    values_clip = array_ops.identity(
        scaled / math_ops.maximum(l2norm, clip_norm), name=name)

    if is_indexed_slices:
      return ops.IndexedSlices(values_clip, t.indices, t.dense_shape)

  return values_clip


@tf_export("linalg.global_norm", v1=["linalg.global_norm", "global_norm"])
@dispatch.add_dispatch_support
@deprecation.deprecated_endpoints("global_norm")
def global_norm(t_list, name=None):
  """Computes the global norm of multiple tensors.

  Given a tuple or list of tensors `t_list`, this operation returns the
  global norm of the elements in all tensors in `t_list`. The global norm is
  computed as:

  `global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))`

  Any entries in `t_list` that are of type None are ignored.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    name: A name for the operation (optional).

  Returns:
    A 0-D (scalar) `Tensor` of type `float`.

  Raises:
    TypeError: If `t_list` is not a sequence.
  """
  # Strings are sequences too, but never a valid list of tensors.
  is_sequence = isinstance(t_list, collections_abc.Sequence)
  if not is_sequence or isinstance(t_list, six.string_types):
    raise TypeError("t_list should be a sequence")
  t_list = list(t_list)

  with ops.name_scope(name, "global_norm", t_list) as name:
    # Convert every non-None entry up front; None placeholders are kept so
    # the "t_<i>" names keep matching the positions in `t_list`.
    values = []
    for i, t in enumerate(t_list):
      if t is None:
        values.append(None)
        continue
      dense = t.values if isinstance(t, ops.IndexedSlices) else t
      values.append(ops.convert_to_tensor(dense, name="t_%d" % i))

    # One l2_loss per tensor, colocated with that tensor. Each result is
    # treated as half of the squared L2 norm, hence the factor of 2 below.
    half_squared_norms = []
    for v in values:
      if v is None:
        continue
      with ops.colocate_with(v):
        half_squared_norms.append(gen_nn_ops.l2_loss(v))

    half_squared_norm = math_ops.reduce_sum(
        array_ops.stack(half_squared_norms))

    norm = math_ops.sqrt(
        half_squared_norm *
        constant_op.constant(2.0, dtype=half_squared_norm.dtype),
        name="global_norm")

  return norm


@tf_export("clip_by_global_norm")
@dispatch.add_dispatch_support
def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
  """Clips values of multiple tensors by the ratio of the sum of their norms.

  Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`,
  this operation returns a list of clipped tensors `list_clipped`
  and the global norm (`global_norm`) of all tensors in `t_list`. Optionally,
  if you've already computed the global norm for `t_list`, you can specify
  the global norm with `use_norm`.

  To perform the clipping, the values `t_list[i]` are set to:

      t_list[i] * clip_norm / max(global_norm, clip_norm)

  where:

      global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))

  If `clip_norm > global_norm` then the entries in `t_list` remain as they are,
  otherwise they're all shrunk by the global ratio.

  If `global_norm == infinity` then the entries in `t_list` are all set to `NaN`
  to signal that an error occurred.

  Any of the entries of `t_list` that are of type `None` are ignored.

  This is the correct way to perform gradient clipping (Pascanu et al., 2012).

  However, it is slower than `clip_by_norm()` because all the parameters must be
  ready before the clipping operation can be performed.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio.
    use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global
      norm to use. If not provided, `global_norm()` is used to compute the norm.
    name: A name for the operation (optional).

  Returns:
    list_clipped: A list of `Tensors` of the same type as `t_list`.
    global_norm: A 0-D (scalar) `Tensor` representing the global norm.

  Raises:
    TypeError: If `t_list` is not a sequence.

  References:
    On the difficulty of training Recurrent Neural Networks:
      [Pascanu et al., 2012](http://proceedings.mlr.press/v28/pascanu13.html)
      ([pdf](http://proceedings.mlr.press/v28/pascanu13.pdf))
  """
  # Strings are sequences too, but never a valid list of tensors.
  is_sequence = isinstance(t_list, collections_abc.Sequence)
  if not is_sequence or isinstance(t_list, six.string_types):
    raise TypeError("t_list should be a sequence")
  t_list = list(t_list)
  if use_norm is None:
    use_norm = global_norm(t_list, name)

  with ops.name_scope(name, "clip_by_global_norm",
                      t_list + [clip_norm]) as name:
    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm.
    # clip_norm * min(1 / use_norm, 1 / clip_norm) equals
    # clip_norm / max(use_norm, clip_norm) for positive finite norms.
    inverse_norm = 1.0 / use_norm
    inverse_clip = (
        constant_op.constant(1.0, dtype=use_norm.dtype) / clip_norm)
    scale_for_finite = clip_norm * math_ops.minimum(inverse_norm, inverse_clip)
    # If use_norm is any finite number, this is a no-op. For inf/-inf/NaN,
    # this will make scale NaN.
    scale = scale_for_finite + (use_norm - use_norm)

    # Convert every non-None entry up front; None placeholders are kept so
    # outputs stay aligned with `t_list`.
    values = []
    for i, t in enumerate(t_list):
      if t is None:
        values.append(None)
        continue
      dense = t.values if isinstance(t, ops.IndexedSlices) else t
      values.append(ops.convert_to_tensor(dense, name="t_%d" % i))

    values_clipped = []
    for i, v in enumerate(values):
      if v is None:
        values_clipped.append(None)
        continue
      with ops.colocate_with(v):
        values_clipped.append(
            array_ops.identity(v * scale, name="%s_%d" % (name, i)))

    # Re-wrap entries that came in as IndexedSlices, reusing their original
    # indices and dense_shape.
    list_clipped = []
    for clipped, original in zip(values_clipped, t_list):
      if isinstance(original, ops.IndexedSlices):
        clipped = ops.IndexedSlices(
            clipped, original.indices, original.dense_shape)
      list_clipped.append(clipped)

  return list_clipped, use_norm


@deprecation.deprecated(
    date=None,
    instructions="clip_by_average_norm is deprecated in TensorFlow 2.0. Please "
    "use clip_by_norm(t, clip_norm * tf.cast(tf.size(t), tf.float32), name) "
    "instead.")
@tf_export(v1=["clip_by_average_norm"])
@dispatch.add_dispatch_support
def clip_by_average_norm(t, clip_norm, name=None):
  """Clips tensor values to a maximum average L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its average L2-norm is less than or equal to
  `clip_norm`. Specifically, if the average L2-norm is already less than or
  equal to `clip_norm`, then `t` is not modified. If the average L2-norm is
  greater than `clip_norm`, then this operation returns a tensor of the same
  type and shape as `t` with its values set to:

  `t * clip_norm / l2norm_avg(t)`

  In this case, the average L2-norm of the output tensor is `clip_norm`.

  This operation is typically used to clip gradients before applying them with
  an optimizer.

  Args:
    t: A `Tensor`.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor`.
  """
  with ops.name_scope(name, "clip_by_average_norm", [t, clip_norm]) as name:
    t = ops.convert_to_tensor(t, name="t")

    # Calculate L2-norm per element, clip elements by ratio of clip_norm to
    # L2-norm per element.
    # The element count and the constant 1.0 are created in the dtype of `t`
    # (previously hard-coded to float32) so that they can be combined with the
    # norm of a non-float32 input without a dtype mismatch.
    n_element = math_ops.cast(array_ops.size(t), t.dtype)
    l2norm_inv = math_ops.rsqrt(
        math_ops.reduce_sum(t * t, math_ops.range(array_ops.rank(t))))
    tclip = array_ops.identity(
        t * clip_norm * math_ops.minimum(
            l2norm_inv * n_element,
            constant_op.constant(1.0, dtype=t.dtype) / clip_norm),
        name=name)

  return tclip