# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""Operations for clipping (gradient, weight) tensors to min/max values."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections

import six

from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import ops
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import gen_array_ops
from tensorflow.python.ops import gen_nn_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import numerics
from tensorflow.python.util import deprecation
from tensorflow.python.util import dispatch
from tensorflow.python.util.tf_export import tf_export


@tf_export("clip_by_value")
@dispatch.add_dispatch_support
def clip_by_value(t, clip_value_min, clip_value_max,
                  name=None):
  """Clips tensor values to a specified min and max.

  Given a tensor `t`, this operation returns a tensor of the same type and
  shape as `t` with its values clipped to `clip_value_min` and
  `clip_value_max`. Any values less than `clip_value_min` are set to
  `clip_value_min`. Any values greater than `clip_value_max` are set to
  `clip_value_max`.

  Note: `clip_value_min` must be less than or equal to `clip_value_max` for
  correct results.

  Args:
    t: A `Tensor`.
    clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
      as `t`. The minimum value to clip by.
    clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape
      as `t`. The maximum value to clip by.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor`.

  Raises:
    ValueError: If the clip tensors would trigger array broadcasting that
      would make the returned tensor larger than the input.
  """
  with ops.name_scope(name, "clip_by_value",
                      [t, clip_value_min, clip_value_max]) as name:
    t = ops.convert_to_tensor(t, name="t")

    # Go through list of tensors, for each value in each tensor clip.
    t_min = math_ops.minimum(t, clip_value_max)
    # Assert that the shape is compatible with the initial shape,
    # to prevent unintentional broadcasting.
    _ = t.shape.merge_with(t_min.shape)

    t_max = math_ops.maximum(t_min, clip_value_min, name=name)
    _ = t.shape.merge_with(t_max.shape)

  return t_max
  # TODO(scottzhu): switch to use new implementation in 2 weeks.
  # return gen_math_ops.clip_by_value(
  #     t, clip_value_min, clip_value_max, name=name)
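
# A minimal usage sketch for `clip_by_value` through its exported
# `tf.clip_by_value` endpoint (illustrative only, not part of the original
# source; assumes `tf` is the imported tensorflow package and that values
# are materialized eagerly or via a session, depending on the TF version):
#
#   t = tf.constant([[-10., 0., 10.], [2., 3., 4.]])
#   tf.clip_by_value(t, clip_value_min=-1., clip_value_max=1.)
#   # ==> [[-1., 0., 1.], [1., 1., 1.]]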

# TODO(scottzhu): switch to use new implementation in 2 weeks.
# @ops.RegisterGradient("ClipByValue")
def _clip_by_value_grad(op, grad):
  """Returns grad of clip_by_value."""
  # op.inputs are (t, clip_value_min, clip_value_max).
  x = op.inputs[0]
  y = op.inputs[1]
  z = op.inputs[2]
  gdtype = grad.dtype
  sx = array_ops.shape(x)
  sy = array_ops.shape(y)
  sz = array_ops.shape(z)
  gradshape = array_ops.shape(grad)
  zeros = array_ops.zeros(gradshape, gdtype)
  xymask = math_ops.less(x, y)
  xzmask = math_ops.greater(x, z)
  rx, ry = gen_array_ops.broadcast_gradient_args(sx, sy)
  rx, rz = gen_array_ops.broadcast_gradient_args(sx, sz)
  # Gradient flows to `t` only where it was not clipped, and to the clip
  # bounds where it was.
  xgrad = array_ops.where(math_ops.logical_or(xymask, xzmask), zeros, grad)
  ygrad = array_ops.where(xymask, grad, zeros)
  zgrad = array_ops.where(xzmask, grad, zeros)
  gx = array_ops.reshape(math_ops.reduce_sum(xgrad, rx), sx)
  gy = array_ops.reshape(math_ops.reduce_sum(ygrad, ry), sy)
  gz = array_ops.reshape(math_ops.reduce_sum(zgrad, rz), sz)
  return (gx, gy, gz)


@tf_export("clip_by_norm")
def clip_by_norm(t, clip_norm, axes=None, name=None):
  """Clips tensor values to a maximum L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its L2-norm is less than or equal to `clip_norm`,
  along the dimensions given in `axes`. Specifically, in the default case
  where all dimensions are used for calculation, if the L2-norm of `t` is
  already less than or equal to `clip_norm`, then `t` is not modified. If
  the L2-norm is greater than `clip_norm`, then this operation returns a
  tensor of the same type and shape as `t` with its values set to:

  `t * clip_norm / l2norm(t)`

  In this case, the L2-norm of the output tensor is `clip_norm`.

  As another example, if `t` is a matrix and `axes == [1]`, then each row
  of the output will have L2-norm less than or equal to `clip_norm`. If
  `axes == [0]` instead, each column of the output will be clipped.

  This operation is typically used to clip gradients before applying them with
  an optimizer.

  Args:
    t: A `Tensor` or `IndexedSlices`.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
    axes: A 1-D (vector) `Tensor` of type int32 containing the dimensions
      to use for computing the L2-norm. If `None` (the default), uses all
      dimensions.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor` or `IndexedSlices`.
  """
  with ops.name_scope(name, "clip_by_norm", [t, clip_norm]) as name:
    values = ops.convert_to_tensor(
        t.values if isinstance(t, ops.IndexedSlices) else t, name="t")

    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm.
    l2sum = math_ops.reduce_sum(values * values, axes, keepdims=True)
    pred = l2sum > 0
    # Two-tap tf.where trick to bypass NaN gradients.
    l2sum_safe = array_ops.where(pred, l2sum, array_ops.ones_like(l2sum))
    l2norm = array_ops.where(pred, math_ops.sqrt(l2sum_safe), l2sum)
    intermediate = values * clip_norm
    # Assert that the shape is compatible with the initial shape,
    # to prevent unintentional broadcasting.
    _ = values.shape.merge_with(intermediate.shape)
    values_clip = array_ops.identity(
        intermediate / math_ops.maximum(l2norm, clip_norm), name=name)

    if isinstance(t, ops.IndexedSlices):
      return ops.IndexedSlices(values_clip, t.indices, t.dense_shape)

  return values_clip
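
# A minimal usage sketch for `clip_by_norm` through its exported
# `tf.clip_by_norm` endpoint (illustrative only, not part of the original
# source; assumes `tf` is the imported tensorflow package):
#
#   t = tf.constant([[3., 4.]])        # L2-norm is 5.
#   tf.clip_by_norm(t, clip_norm=5.)   # unchanged; the norm is already <= 5.
#   tf.clip_by_norm(t, clip_norm=1.)   # ==> [[0.6, 0.8]], rescaled to norm 1.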


@tf_export("linalg.global_norm", v1=["linalg.global_norm", "global_norm"])
@deprecation.deprecated_endpoints("global_norm")
def global_norm(t_list, name=None):
  """Computes the global norm of multiple tensors.

  Given a tuple or list of tensors `t_list`, this operation returns the
  global norm of the elements in all tensors in `t_list`. The global norm is
  computed as:

  `global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))`

  Any entries in `t_list` that are `None` are ignored.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    name: A name for the operation (optional).

  Returns:
    A 0-D (scalar) `Tensor` of type `float`.

  Raises:
    TypeError: If `t_list` is not a sequence.
  """
  if (not isinstance(t_list, collections.Sequence)
      or isinstance(t_list, six.string_types)):
    raise TypeError("t_list should be a sequence")
  t_list = list(t_list)
  with ops.name_scope(name, "global_norm", t_list) as name:
    values = [
        ops.convert_to_tensor(
            t.values if isinstance(t, ops.IndexedSlices) else t,
            name="t_%d" % i)
        if t is not None else t
        for i, t in enumerate(t_list)]
    half_squared_norms = []
    for v in values:
      if v is not None:
        with ops.colocate_with(v):
          half_squared_norms.append(gen_nn_ops.l2_loss(v))

    half_squared_norm = math_ops.reduce_sum(array_ops.stack(half_squared_norms))

    norm = math_ops.sqrt(
        half_squared_norm *
        constant_op.constant(2.0, dtype=half_squared_norm.dtype),
        name="global_norm")

  return norm
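
# A minimal usage sketch for `global_norm` through its exported
# `tf.linalg.global_norm` endpoint (illustrative only, not part of the
# original source; assumes `tf` is the imported tensorflow package):
#
#   ts = [tf.constant([3., 4.]), tf.constant([12.])]
#   tf.linalg.global_norm(ts)   # ==> 13.0, i.e. sqrt(5.**2 + 12.**2)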


@tf_export("clip_by_global_norm")
def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
  """Clips values of multiple tensors by the ratio of the sum of their norms.

  Given a tuple or list of tensors `t_list`, and a clipping ratio `clip_norm`,
  this operation returns a list of clipped tensors `list_clipped`
  and the global norm (`global_norm`) of all tensors in `t_list`. Optionally,
  if you've already computed the global norm for `t_list`, you can specify
  the global norm with `use_norm`.

  To perform the clipping, the values `t_list[i]` are set to:

      t_list[i] * clip_norm / max(global_norm, clip_norm)

  where:

      global_norm = sqrt(sum([l2norm(t)**2 for t in t_list]))

  If `clip_norm > global_norm` then the entries in `t_list` remain as they are,
  otherwise they're all shrunk by the global ratio.

  Any of the entries of `t_list` that are `None` are ignored.

  This is the correct way to perform gradient clipping (for example, see
  [Pascanu et al., 2012](http://arxiv.org/abs/1211.5063)
  ([pdf](http://arxiv.org/pdf/1211.5063.pdf))).

  However, it is slower than `clip_by_norm()` because all the parameters must
  be ready before the clipping operation can be performed.

  Args:
    t_list: A tuple or list of mixed `Tensors`, `IndexedSlices`, or None.
    clip_norm: A 0-D (scalar) `Tensor` > 0. The clipping ratio.
    use_norm: A 0-D (scalar) `Tensor` of type `float` (optional). The global
      norm to use. If not provided, `global_norm()` is used to compute the
      norm.
    name: A name for the operation (optional).

  Returns:
    list_clipped: A list of `Tensors` of the same type as `t_list`.
    global_norm: A 0-D (scalar) `Tensor` representing the global norm.

  Raises:
    TypeError: If `t_list` is not a sequence.
    InvalidArgumentError: If global norm is not finite.
  """
  if (not isinstance(t_list, collections.Sequence)
      or isinstance(t_list, six.string_types)):
    raise TypeError("t_list should be a sequence")
  t_list = list(t_list)
  if use_norm is None:
    use_norm = global_norm(t_list, name)
  use_norm = numerics.verify_tensor_all_finite(use_norm,
                                               "Found Inf or NaN global norm.")

  with ops.name_scope(name, "clip_by_global_norm",
                      t_list + [clip_norm]) as name:
    # Calculate L2-norm, clip elements by ratio of clip_norm to L2-norm.
    scale = clip_norm * math_ops.minimum(
        1.0 / use_norm,
        constant_op.constant(1.0, dtype=use_norm.dtype) / clip_norm)

    values = [
        ops.convert_to_tensor(
            t.values if isinstance(t, ops.IndexedSlices) else t,
            name="t_%d" % i)
        if t is not None else t
        for i, t in enumerate(t_list)]

    values_clipped = []
    for i, v in enumerate(values):
      if v is None:
        values_clipped.append(None)
      else:
        with ops.colocate_with(v):
          values_clipped.append(
              array_ops.identity(v * scale, name="%s_%d" % (name, i)))

    list_clipped = [
        ops.IndexedSlices(c_v, t.indices, t.dense_shape)
        if isinstance(t, ops.IndexedSlices)
        else c_v
        for (c_v, t) in zip(values_clipped, t_list)]

  return list_clipped, use_norm


@deprecation.deprecated(
    date=None,
    instructions="clip_by_average_norm is deprecated in TensorFlow 2.0. Please "
    "use clip_by_norm(t, clip_norm * tf.cast(tf.size(t), tf.float32), name) "
    "instead.")
@tf_export(v1=["clip_by_average_norm"])
def clip_by_average_norm(t, clip_norm, name=None):
  """Clips tensor values to a maximum average L2-norm.

  Given a tensor `t`, and a maximum clip value `clip_norm`, this operation
  normalizes `t` so that its average L2-norm is less than or equal to
  `clip_norm`. Specifically, if the average L2-norm is already less than or
  equal to `clip_norm`, then `t` is not modified. If the average L2-norm is
  greater than `clip_norm`, then this operation returns a tensor of the same
  type and shape as `t` with its values set to:

  `t * clip_norm / l2norm_avg(t)`

  In this case, the average L2-norm of the output tensor is `clip_norm`.

  This operation is typically used to clip gradients before applying them with
  an optimizer.

  Args:
    t: A `Tensor`.
    clip_norm: A 0-D (scalar) `Tensor` > 0. A maximum clipping value.
    name: A name for the operation (optional).

  Returns:
    A clipped `Tensor`.
  """
  with ops.name_scope(name, "clip_by_average_norm", [t, clip_norm]) as name:
    t = ops.convert_to_tensor(t, name="t")

    # Calculate L2-norm per element, clip elements by ratio of clip_norm to
    # L2-norm per element.
    n_element = math_ops.cast(array_ops.size(t), dtypes.float32)
    l2norm_inv = math_ops.rsqrt(
        math_ops.reduce_sum(t * t, math_ops.range(array_ops.rank(t))))
    tclip = array_ops.identity(
        t * clip_norm * math_ops.minimum(
            l2norm_inv * n_element, constant_op.constant(1.0) / clip_norm),
        name=name)

  return tclip
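
# A minimal usage sketch for `clip_by_global_norm` (defined above) through its
# exported `tf.clip_by_global_norm` endpoint (illustrative only, not part of
# the original source; assumes `tf` is the imported tensorflow package and
# `grads` stands in for a list of gradient tensors):
#
#   grads = [tf.constant([3., 4.]), tf.constant([12.])]   # global norm is 13.
#   clipped, gnorm = tf.clip_by_global_norm(grads, clip_norm=6.5)
#   # Every tensor is scaled by 6.5 / 13 = 0.5:
#   # clipped ==> [[1.5, 2.], [6.]]; gnorm ==> 13.0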