• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15"""Code for backpropagation using the tape utilities."""
16
17# TODO(b/159343581): Properly support CompositeTensor in all functions in this
18# file.
19
20from __future__ import absolute_import
21from __future__ import division
22from __future__ import print_function
23
24import functools
25import operator
26import sys
27
28import six
29
30from tensorflow.python import pywrap_tfe
31from tensorflow.python.eager import backprop_util
32from tensorflow.python.eager import context
33from tensorflow.python.eager import execute
34from tensorflow.python.eager import imperative_grad
35from tensorflow.python.eager import tape
36from tensorflow.python.framework import constant_op
37from tensorflow.python.framework import dtypes
38from tensorflow.python.framework import ops
39from tensorflow.python.framework import tensor_shape
40from tensorflow.python.framework import tensor_util
41from tensorflow.python.ops import array_ops
42from tensorflow.python.ops import check_ops
43from tensorflow.python.ops import control_flow_util
44from tensorflow.python.ops import default_gradient
45from tensorflow.python.ops import gen_array_ops
46from tensorflow.python.ops import gen_math_ops
47from tensorflow.python.ops import math_ops
48from tensorflow.python.ops import resource_variable_ops
49from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
50from tensorflow.python.platform import tf_logging as logging
51from tensorflow.python.util import _pywrap_utils
52from tensorflow.python.util import nest
53from tensorflow.python.util import tf_contextlib
54from tensorflow.python.util import tf_inspect
55from tensorflow.python.util.lazy_loader import LazyLoader
56from tensorflow.python.util.tf_export import tf_export
57
58
# `pfor_ops` and `function` are bound through `LazyLoader` instead of plain
# imports, because importing these two modules eagerly here would create
# circular dependencies.
# TODO(b/119775953): fix the circular dependencies.
pfor_ops = LazyLoader(
    "pfor_ops",
    globals(),
    "tensorflow.python.ops.parallel_for.control_flow_ops")

function = LazyLoader(
    "function",
    globals(),
    "tensorflow.python.eager.function")
68
# Memoizes attr types keyed by (op_type, attr_name); filled by `op_attr_type`.
_op_attr_type_cache = {}


def op_attr_type(op_type, attr_name):
  """Returns the type of attr `attr_name` on op `op_type`, with memoization.

  The first lookup of a given `(op_type, attr_name)` pair asks the runtime via
  `pywrap_tfe.TFE_OpNameGetAttrType` and stores the answer in
  `_op_attr_type_cache`; later lookups are answered from that cache.

  Args:
    op_type: the op type name.
    attr_name: the name of the attr on that op.

  Returns:
    Whatever `pywrap_tfe.TFE_OpNameGetAttrType` reported for the attr.
  """
  cache_key = (op_type, attr_name)
  try:
    cached_type = _op_attr_type_cache[cache_key]
  except KeyError:
    # Cache miss: the context must be initialized before its handle is used.
    context.ensure_initialized()
    ctx_handle = context.context()._handle  # pylint: disable=protected-access
    cached_type = pywrap_tfe.TFE_OpNameGetAttrType(
        ctx_handle, op_type, attr_name)
    _op_attr_type_cache[cache_key] = cached_type
  return cached_type
81
82
def make_attr(attr_type, value):
  """Converts a raw attr `value` into the form matching `attr_type`.

  Args:
    attr_type: the attr type code, or a one-element list holding the code
      when the attr is a list of that type.
    value: the raw attr value.

  Returns:
    A `DType` (or list of them) for type attrs, a shape proto (or list of
    them) for shape attrs, the encoded bytes for a `str` value, and `value`
    unchanged otherwise.
  """
  # pybind11 enums do not return the raw value like SWIG enums do. They are
  # useful when comparing amongst each other but not direct integers as we are
  # doing in most tests.
  # https://pybind11.readthedocs.io/en/stable/classes.html#enumerations-and-internal-types
  # TODO(amitpatankar): After all SWIG transitions, convert the enum comparisons
  # from integer value to class.
  type_code = int(pywrap_tfe.TF_ATTR_TYPE)
  shape_code = int(pywrap_tfe.TF_ATTR_SHAPE)

  if attr_type == type_code:
    return dtypes.as_dtype(value)
  elif attr_type == [type_code]:
    return [dtypes.as_dtype(item) for item in value]
  elif attr_type == shape_code:
    return tensor_shape.as_shape(value).as_proto()
  elif attr_type == [shape_code]:
    return [tensor_shape.as_shape(item).as_proto() for item in value]

  # Neither a type nor a shape attr: only text needs converting.
  return value.encode() if isinstance(value, str) else value
101
102
class _MockOp(object):
  """Lightweight stand-in for a tf.Operation, handed to gradient functions."""

  def __init__(self, attrs, inputs, outputs, typ, skip_input_indices):
    # `attrs` is a flat sequence of alternating names and values
    # (name0, value0, name1, value1, ...); `get_attr` walks it two at a time.
    self.type = typ
    self.attrs = attrs
    self.inputs = inputs
    self.outputs = outputs
    self.skip_input_indices = skip_input_indices

  def get_attr(self, attr):
    """Returns the converted value of attr `attr`; KeyError if it is absent."""
    attr_type = op_attr_type(self.type, attr)
    position = 0
    total = len(self.attrs)
    while position < total:
      if self.attrs[position] == attr:
        return make_attr(attr_type, self.attrs[position + 1])
      position += 2
    raise KeyError(attr)

  def _get_control_flow_context(self):
    """Always raises: graph control flow is not supported through this mock."""
    message = (
        "tf.GradientTape.gradients() does not support graph control flow "
        "operations like tf.cond or tf.while at this time. Use tf.gradients() "
        "instead. If you need this feature, please file a feature request at "
        "https://github.com/tensorflow/tensorflow/issues/new")
    raise NotImplementedError(message)
127
128
def _gradient_function(op_name, attr_tuple, num_inputs, inputs, outputs,
                       out_grads, skip_input_indices, forward_pass_name_scope):
  """Looks up and invokes the registered gradient function for an op.

  Args:
    op_name: the name of the op to be differentiated.
    attr_tuple: the attrs, as a tuple.
    num_inputs: the number of inputs to the op.
    inputs: inputs to the original operation.
    outputs: outputs to the original operation.
    out_grads: gradients of the operation wrt its outputs.
    skip_input_indices: a tuple that is passed to the gradient function,
      indicating which inputs to skip calculating the gradient for
    forward_pass_name_scope: the namescope of the op in the forward pass.

  Returns:
    The gradients with respect to the inputs of the function, as a list. When
    the registry lookup yields None, a list of `num_inputs` Nones.
  """
  mock_op = _MockOp(attr_tuple, inputs, outputs, op_name, skip_input_indices)
  registered_grad = ops._gradient_registry.lookup(op_name)  # pylint: disable=protected-access
  if registered_grad is None:
    return [None] * num_inputs

  # This does not work with v1 TensorArrays.
  wrap_in_name_scope = (
      ops.executing_eagerly_outside_functions() or
      control_flow_util.EnableControlFlowV2(ops.get_default_graph()))
  if not wrap_in_name_scope:
    return registered_grad(mock_op, *out_grads)

  # Nest the gradient ops under "gradient_tape/", followed by the forward-pass
  # scope when one was recorded.
  if forward_pass_name_scope:
    gradient_name_scope = "gradient_tape/" + forward_pass_name_scope + "/"
  else:
    gradient_name_scope = "gradient_tape/"
  with ops.name_scope(gradient_name_scope):
    return registered_grad(mock_op, *out_grads)


# Hand the Python gradient entry point to the pywrap layer.
pywrap_tfe.TFE_Py_RegisterGradientFunction(_gradient_function)
165
166
def _must_record_gradient():
  """Returns True unless `TFE_Py_TapeSetIsEmpty` reports an empty tape set."""
  tape_set_is_empty = pywrap_tfe.TFE_Py_TapeSetIsEmpty()
  return not tape_set_is_empty
169
170
def _record_gradient(op_name, inputs, attrs, results):
  """Forwards an executed op, plus the current name scope, to the recorder.

  Args:
    op_name: name of the op that was executed.
    inputs: the op's inputs.
    attrs: the op's attrs.
    results: the op's outputs.

  Returns:
    Whatever `pywrap_tfe.TFE_Py_RecordGradient` returns.
  """
  current_scope = ops.get_name_scope()
  return pywrap_tfe.TFE_Py_RecordGradient(
      op_name, inputs, attrs, results, current_scope)


# Install the two helpers above as the `execute` module's recording hooks.
execute.must_record_gradient = _must_record_gradient
execute.record_gradient = _record_gradient
178
179
def implicit_val_and_grad(f):
  """Wraps `f` so that calling it yields its value and variable gradients.

  The returned function forwards its arguments to `f` while a fresh tape is
  active, then differentiates the result with respect to every variable the
  tape reports as watched. This is useful when the exact set of variables to
  differentiate with is not known ahead of time.

  Example:

  ```python
  dense_layer = tf.compat.v1.layers.Dense(1)
  def loss(x, y):
    return tf.reduce_sum(tf.square(dense_layer(x) - y))

  # Obtain the gradient function.
  val_grad_fn = tfe.implicit_value_and_gradients(loss)

  # Invoke the gradient function with concrete values of x and y.
  x = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
  y = tf.constant([[10.0], [20.0]])
  value, grads_and_vars = val_grad_fn(x, y)
  print('Value of loss: %s' % value)

  # Apply the gradients to Variables.
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.1)
  optimizer.apply_gradients(grads_and_vars)
  ```

  Args:
    f: function to be differentiated. If `f` returns a scalar, this scalar will
      be differentiated. If `f` returns a tensor or list of tensors, by default
      a scalar will be computed by adding all their values to produce a single
      scalar.

  Returns:
    A function which, when called, returns a `(value, grads_and_vars)` pair:
    `value` is what `f` returned and `grads_and_vars` is a list of
    (gradient, variable) pairs.

  Raises:
    ValueError: raised by the returned function if `f` returns None, if the
      tape watched no variables while `f` ran, or if the handle of a watched
      variable is a packed EagerTensor.
  """
  # TODO(cais): Remove calls to tf.constant() once the gradients functions
  # accept lists and np.ndarrays.

  def grad_fn(*args, **kwds):
    """Computes the gradient of the wrapped function."""
    recording_tape = tape.push_new_tape()
    try:
      outputs = f(*args, **kwds)
      if outputs is None:
        raise ValueError("Cannot differentiate a function that returns None; "
                         "did you forget to return a value from {}?".format(
                             f.__name__))
    finally:
      # The tape is popped whether or not `f` succeeded.
      tape.pop_tape(recording_tape)

    # Note: variables are returned in construction order. This ensures unique
    # order across executions.
    watched = recording_tape.watched_variables()
    if not watched:
      raise ValueError("No trainable variables were accessed while the "
                       "function was being computed.")

    handles = [variable.handle for variable in watched]
    if any(getattr(handle, "is_packed", False) for handle in handles):
      raise ValueError(
          "GradientTape.gradient is not supported on packed EagerTensors yet."
      )
    gradients = imperative_grad.imperative_grad(
        recording_tape, nest.flatten(outputs), handles)
    return outputs, list(zip(gradients, watched))

  return grad_fn
257
258
def implicit_grad(f):
  """Wraps `f` so that calling it yields only its variable gradients.

  The returned function delegates to `implicit_val_and_grad(f)` and discards
  the value of `f`, keeping the (gradient, variable) pairs. The gradient is
  with respect to all trainable TFE variables accessed by `f`.

  This function is useful when the exact set of variables to differentiate with
  is not known ahead of time.

  Example:

  ```python
  dense_layer = tf.compat.v1.layers.Dense(1)
  def loss(x, y):
    return tf.reduce_sum(tf.square(dense_layer(x) - y))

  # Obtain the gradient function.
  grad_fn = tfe.implicit_gradients(loss)

  # Invoke the gradient function with concrete values of x and y.
  x = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
  y = tf.constant([[10.0], [20.0]])
  grads_and_vars = grad_fn(x, y)

  # Apply the gradients to Variables.
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(0.1)
  optimizer.apply_gradients(grads_and_vars)
  ```

  Args:
    f: function to be differentiated. If `f` returns a scalar, this scalar will
      be differentiated. If `f` returns a tensor or list of tensors, by default
      a scalar will be computed by adding all their values to produce a single
      scalar.

  Returns:
    A function which, when called, returns a list of (gradient, variable) pairs.
  """
  # TODO(cais): Remove calls to tf.constant() once the gradients functions
  # accept lists and np.ndarrays.

  def grad_fn(*args, **kwds):
    """Computes the gradient of the wrapped function."""
    _, grads_and_vars = implicit_val_and_grad(f)(*args, **kwds)
    return grads_and_vars

  return grad_fn
306
307
def _get_arg_spec(f, params, param_args):
  """The positions of the parameters of f to be differentiated in param_args.

  Args:
    f: the callable whose argument names are inspected.
    params: None, or a sequence made up entirely of argument names (strings)
      or entirely of integer positions.
    param_args: the positional arguments `f` is about to receive. Only their
      count is used, and only when `f` cannot be inspected or declares no
      named arguments.

  Returns:
    The integer positions to differentiate with respect to: a `range` when
    `params` is None, `params` itself when it holds integers, or a list of
    looked-up positions when it holds names.

  Raises:
    ValueError: if `params` mixes strings and integers, if a name in `params`
      is not an argument of `f`, or if `f` cannot be inspected and `params`
      is neither None nor all integers.
  """
  try:
    args = tf_inspect.getfullargspec(f).args
  except TypeError as e:
    # TypeError can happen when f is a callable object.
    if params is None:
      return range(len(param_args))
    elif all(isinstance(x, int) for x in params):
      return params
    raise ValueError("Either callable provided is not a function or could not "
                     "inspect its arguments by name: %s. Original error: %s"
                     % (f, e))
  if params is None:
    if not args:
      return range(len(param_args))
    if args[0] == "self":
      # A leading "self" is not counted, so the range is one shorter.
      return range(len(args) - 1)
    else:
      return range(len(args))
  elif all(isinstance(x, six.string_types) for x in params):
    return [args.index(n) for n in params]
  elif all(isinstance(x, int) for x in params):
    return params
  else:
    # Wrap `params` in a 1-tuple: with a bare tuple on the right-hand side,
    # `%` would try to spread its elements over the single `%s` and raise
    # TypeError instead of the intended ValueError.
    raise ValueError(
        "params must be all strings or all integers; got %s." % (params,))
335
336
def gradients_function(f, params=None):
  """Returns a function which differentiates f with respect to params.

  The returned function delegates to `val_and_grad_function(f, params=params)`
  and keeps only the gradient part of its result.

  Example:
  ```python
  # f(x, y) = (x ^ 3) * y - x * (y ^ 2)
  # Therefore, the 1st order derivatives are:
  #   df / dx = 3 * (x ^ 2) * y - y ^ 2
  #   df / dy = x ^ 3 - 2 * x * y
  # The 2nd order derivatives with respect to x is:
  #   d^2 f / (dx)^2 = 6 * x * y
  def f(x, y):
    return x * x * x * y - x * y * y

  # Obtain a function that returns 1st order gradients.
  grad_fn = tfe.gradients_function(f)

  x = 2.0
  y = 3.0

  # Invoke the 1st order gradient function.
  x_grad, y_grad = grad_fn(x, y)
  assert x_grad.numpy() == 3 * (2 ** 2) * 3 - 3 ** 2
  assert y_grad.numpy() == (2 ** 3) - 2 * 2 * 3

  # Obtain a function that returns the 2nd order gradient with respect to x.
  gradgrad_fn = tfe.gradients_function(lambda x, y: grad_fn(x, y)[0])

  # Invoke the 2nd order gradient function.
  x_gradgrad = gradgrad_fn(x, y)[0]
  assert x_gradgrad.numpy() == 6 * 2 * 3

  # To obtain a callable that returns the gradient(s) of `f` with respect to a
  # subset of its inputs, use the `params` keyword argument with
  # `gradients_function()`.
  ygrad_fn = tfe.gradients_function(f, params=[1])

  (y_grad,) = ygrad_fn(x, y)
  assert y_grad.numpy() == (2 ** 3) - 2 * 2 * 3
  ```

  Note that only tensors with real or complex dtypes are differentiable.

  Args:
    f: function to be differentiated. If `f` returns a scalar, this scalar will
      be differentiated. If `f` returns a tensor or list of tensors, by default
      a scalar will be computed by adding all their values to produce a single
      scalar. If desired, the tensors can be elementwise multiplied by the
      tensors passed as the `dy` keyword argument to the returned gradient
      function.
    params: list of parameter names of f or list of integers indexing the
      parameters with respect to which we'll differentiate. Passing None
      differentiates with respect to all parameters.

  Returns:
    function which, when called, returns the gradient of `f` with respect to
    all of `params` (the value of `f` is computed but not returned). The
    function takes an extra optional keyword argument `dy`. Setting it allows
    computation of vector jacobian products for vectors other than the vector
    of ones.

  Raises:
    ValueError: if the params are not all strings or all integers.
  """

  def decorated(*args, **kwds):
    """Computes the gradient of the decorated function."""
    value_and_grad = val_and_grad_function(f, params=params)
    return value_and_grad(*args, **kwds)[1]

  return decorated
408
409
def _ensure_unique_tensor_objects(parameter_positions, args):
  """Gives every differentiated position in `args` its own tensor object.

  When the same tensor object shows up at more than one of the
  `parameter_positions`, each repeat is replaced by an identity of it so the
  positions are treated independently. For example:

  def f(x, y): return x * y
  g = gradients_function(f)
  one = tf.constant(1.)

  g(one, one) should return [1., 1.]
  (even though the two arguments are the same Tensor object).

  Args:
    parameter_positions: List of indices into args defining the arguments to
      differentiate against.
    args: A list of arguments to the function to be differentiated.

  Returns:
    args, possibly edited in-place.
  """
  seen_ids = set()
  for index, candidate in enumerate(args):
    if index not in parameter_positions:
      continue
    tensor_key = ops.tensor_id(candidate)
    if tensor_key not in seen_ids:
      seen_ids.add(tensor_key)
      continue
    # Already seen at an earlier position: swap in a distinct object.
    args[index] = gen_array_ops.identity(candidate)
  return args
440
441
def val_and_grad_function(f, params=None):
  """Returns a function that computes f and its derivative w.r.t. params.

  The returned function builds a vjp for `f` through `make_vjp` and evaluates
  it with the optional `dy` keyword argument.

  Example:
  ```python
  # f(x, y) = (x ^ 3) * y - x * (y ^ 2)
  # Therefore, the 1st order derivatives are:
  #   df / dx = 3 * (x ^ 2) * y - y ^ 2
  #   df / dy = x ^ 3 - 2 * x * y
  def f(x, y):
    return x * x * x * y - x * y * y

  # Obtain a function that returns the function value and the 1st order
  # gradients.
  val_grads_fn = tfe.value_and_gradients_function(f)

  x = 2.0
  y = 3.0

  # Invoke the value-and-gradients function.
  f_val, (x_grad, y_grad) = val_grads_fn(x, y)
  assert f_val.numpy() == (2 ** 3) * 3 - 2 * (3 ** 2)
  assert x_grad.numpy() == 3 * (2 ** 2) * 3 - 3 ** 2
  assert y_grad.numpy() == (2 ** 3) - 2 * 2 * 3

  # To obtain a callable that returns the value of `f` and the gradient(s) of
  # `f` with respect to a subset of its inputs, use the `params` keyword
  # argument with `value_and_gradients_function()`.
  val_ygrad_fn = tfe.value_and_gradients_function(f, params=[1])

  f_val, (y_grad,) = val_ygrad_fn(x, y)
  assert f_val.numpy() == (2 ** 3) * 3 - 2 * (3 ** 2)
  assert y_grad.numpy() == (2 ** 3) - 2 * 2 * 3
  ```

  Args:
    f: function to be differentiated. If `f` returns a scalar, this scalar will
      be differentiated. If `f` returns a tensor or list of tensors, by default
      a scalar will be computed by adding all their values to produce a single
      scalar. If desired, the tensors can be elementwise multiplied by the
      tensors passed as the `dy` keyword argument to the returned gradient
      function.
    params: list of parameter names of f or list of integers indexing the
      parameters with respect to which we'll differentiate. Passing `None`
      differentiates with respect to all parameters.

  Returns:
    function which, when called, returns the value of f and the gradient
    of f with respect to all of `params`. The function takes an extra optional
    keyword argument "dy". Setting it allows computation of vector jacobian
    products for vectors other than the vector of ones.

  Raises:
    ValueError: if the params are not all strings or all integers, or if the
      returned function is called with a keyword argument other than `dy`.
  """

  def decorated(*args, **kwds):
    """Computes the value and gradient of the decorated function."""
    # `dy` is the only keyword this wrapper accepts; anything left over after
    # removing it is rejected.
    output_gradients = kwds.pop("dy", None)
    if kwds:
      raise ValueError("Functions to be differentiated cannot "
                       "receive keyword arguments.")
    value, vjp_fn = make_vjp(f, params)(*args)
    gradients = vjp_fn(dy=output_gradients)
    return value, gradients

  return decorated
508
509
def make_vjp(f, params=None, persistent=True):
  """Returns a function that computes f and its vjp w.r.t. params.

  The term "vjp" here is an abbreviation for vector-jacobian product.

  For example,
  ```python
  def f(x):
    return x * x

  wrapped_fn = tfe.make_vjp(f)
  result, vjp = wrapped_fn(tf.constant(3.0))
  # result is 9.0
  vjp()  # the vjp function returns 6.0
  ```

  Args:
    f: the function to be differentiated.
    params: the parameters (numbers or names) to differentiate with respect to.
      A value of None will differentiate with respect to all parameters.
    persistent: Boolean controlling whether the VJP function can be re-used.
      Must be True or False.

  Returns:
    A function, which when called, returns a tuple (value, vjp), where:
    - value is the result of calling f.
    - vjp is a function, which takes a vector as an argument and
      returns the product of that vector with the Jacobian of f.
      Providing no argument to vjp is equivalent to providing a
      vector of ones.

  Raises:
    ValueError: raised by the returned function if `f` returns None or if one
      of the arguments being differentiated is a packed EagerTensor.
    AssertionError: raised by the returned function if it is called with
      keyword arguments.
  """

  def decorated(*args, **kwds):
    """Computes the value and gradient of the decorated function."""
    parameter_positions = _get_arg_spec(f, params, args)
    # Kept as an assert so the exception type seen by callers is unchanged.
    assert not kwds, "The gradient function can't take keyword arguments."
    this_tape = tape.push_new_tape(persistent=persistent)
    try:
      sources = []
      # Only the arguments being differentiated are converted to tensors.
      args = [
          ops.convert_to_tensor(arg) if i in parameter_positions else arg
          for i, arg in enumerate(args)
      ]
      args = _ensure_unique_tensor_objects(parameter_positions, args)
      for i in parameter_positions:
        if getattr(args[i], "is_packed", False):
          # The trailing space matters: the two literals are concatenated.
          raise ValueError(
              "GradientTape.gradient is not supported on packed EagerTensors "
              "yet.")
        sources.append(args[i])
        tape.watch(this_tape, args[i])
      result = f(*args)
      if result is None:
        raise ValueError("Cannot differentiate a function that returns None; "
                         "did you forget to return a value from {}?".format(
                             f.__name__))
      # Pass every output through identity while the tape is still active,
      # then restore the original nesting.
      flat_result = nest.flatten(result)
      flat_result = [gen_array_ops.identity(x) for x in flat_result]
      result = nest.pack_sequence_as(result, flat_result)
    finally:
      tape.pop_tape(this_tape)

    def vjp(dy=None):
      """Differentiates `result` w.r.t. `sources`, weighted by `dy` if given."""
      if dy is not None:
        dy = [ops.convert_to_tensor(x) for x in nest.flatten(dy)]
      return imperative_grad.imperative_grad(
          this_tape, nest.flatten(result), sources, output_gradients=dy)

    return result, vjp

  return decorated
584
585
def flatten_nested_indexed_slices(grad):
  """Collapses an `IndexedSlices` whose `values` is itself `IndexedSlices`.

  Args:
    grad: an `IndexedSlices` whose `values` is either a `Tensor` or another
      (possibly nested) `IndexedSlices`.

  Returns:
    `grad` itself when `grad.values` is already a `Tensor`; otherwise a new
    `IndexedSlices` with `Tensor` values, indices expressed in terms of the
    outermost tensor, and the outermost `dense_shape`.
  """
  assert isinstance(grad, ops.IndexedSlices)
  if isinstance(grad.values, ops.Tensor):
    return grad
  else:
    assert isinstance(grad.values, ops.IndexedSlices)
    g = flatten_nested_indexed_slices(grad.values)
    # `g.indices` pick rows of `grad.values`, and row k of `grad.values`
    # belongs at row `grad.indices[k]` of the outer tensor. The gathered
    # indices therefore address the outer tensor, so the result carries the
    # outer `grad.dense_shape` rather than the inner `g.dense_shape`.
    return ops.IndexedSlices(g.values,
                             array_ops.gather(grad.indices, g.indices),
                             grad.dense_shape)
596
597
def aggregate_indexed_slices_gradients(grads):
  """Aggregates gradients containing `IndexedSlices`s.

  Args:
    grads: a list of gradients; entries may be `Tensor`s, `IndexedSlices`, or
      None.

  Returns:
    None when `grads` is empty or holds nothing but None entries; the single
    entry when `grads` has exactly one; the `add_n` sum when any non-None
    entry is a `Tensor`; otherwise an `IndexedSlices` formed by concatenating
    the values and indices of all non-None entries.
  """
  if len(grads) < 1:
    return None
  if len(grads) == 1:
    return grads[0]
  grads = [g for g in grads if g is not None]
  # Every entry was None: nothing is left to concatenate, and `grads[0]`
  # below would fail, so report "no gradient" like the empty-input case.
  if not grads:
    return None
  # If any gradient is a `Tensor`, sum them up and return a dense tensor
  # object.
  if any(isinstance(g, ops.Tensor) for g in grads):
    return math_ops.add_n(grads)

  # The following `_as_indexed_slices_list` casts ids of IndexedSlices into
  # int64. It is to make sure the inputs of `concat` all have same the data
  # type.
  grads = math_ops._as_indexed_slices_list(grads)  # pylint: disable=protected-access

  grads = [flatten_nested_indexed_slices(x) for x in grads]
  # Form IndexedSlices out of the concatenated values and indices.
  concat_grad = ops.IndexedSlices(
      array_ops.concat([x.values for x in grads], axis=0),
      array_ops.concat([x.indices for x in grads], axis=0),
      grads[0].dense_shape)

  return concat_grad
623
624
def _aggregate_grads(gradients):
  """Combines several gradients for the same value into one.

  Args:
    gradients: A list of 'Tensor' or 'IndexedSlices' gradients.

  Returns:
    The only entry when there is just one. If 'gradients' only has 'Tensor',
    returns an aggregated 'Tensor'. Otherwise returns the result of
    `aggregate_indexed_slices_gradients`.
  """
  assert gradients, "No gradients to aggregate"

  if len(gradients) == 1:
    return gradients[0]

  has_non_tensor = any(
      not isinstance(grad, ops.Tensor) for grad in gradients)
  if not has_non_tensor:
    return gen_math_ops.add_n(gradients)

  assert all(isinstance(grad, (ops.Tensor, ops.IndexedSlices))
             for grad in gradients)
  return aggregate_indexed_slices_gradients(gradients)
645
646
def _num_elements(grad):
  """Returns the element count of `grad`, or 0 when its shape is not known.

  Args:
    grad: a `Tensor`, or an `IndexedSlices` (whose `values` shape is used).

  Returns:
    The product of the shape's dimensions; 0 if the shape tuple is None or
    contains a None dimension.

  Raises:
    ValueError: if `grad` is neither a `Tensor` nor an `IndexedSlices`.
  """
  if isinstance(grad, ops.Tensor):
    dims = grad._shape_tuple()  # pylint: disable=protected-access
  elif isinstance(grad, ops.IndexedSlices):
    dims = grad.values._shape_tuple()  # pylint: disable=protected-access
  else:
    raise ValueError("`grad` not a Tensor or IndexedSlices.")

  if dims is None or None in dims:
    return 0

  count = 1
  for dim in dims:
    count = count * dim
  return count
658
659
def _fast_fill(value, shape, dtype):
  """Returns `array_ops.fill` of `value`, typed `dtype`, over `shape`."""
  shape_tensor = constant_op.constant(shape, dtype=dtypes.int32)
  value_tensor = constant_op.constant(value, dtype=dtype)
  return array_ops.fill(shape_tensor, value_tensor)
664
665
def _zeros(shape, dtype):
  """Helper to return (possibly cached) zero tensors in eager mode.

  Args:
    shape: the shape of the zeros to produce.
    dtype: the dtype of the zeros to produce.

  Returns:
    None for string and resource dtypes. Outside eager execution, the result
    of `array_ops.zeros`. In eager execution, a tensor taken from (or newly
    added to) the context's zeros cache.
  """
  # Note: variants will use _zeros_like
  if dtype == dtypes.string or dtype == dtypes.resource:
    return None

  ctx = context.context()
  if not ctx.executing_eagerly():
    return array_ops.zeros(shape, dtype)

  device = ctx.device_name

  # The cache key uses `shape.ref()` in place of a tensor-typed shape.
  shape_key = shape.ref() if tensor_util.is_tf_type(shape) else shape
  cache_key = (shape_key, dtype, device)

  hit = ctx.zeros_cache().get(cache_key)
  if hit is not None:
    return hit

  zero = False if dtypes.as_dtype(dtype).is_bool else 0
  fresh = _fast_fill(zero, shape, dtype)
  ctx.zeros_cache().put(cache_key, fresh)
  return fresh
692
693
def _ones(shape, dtype):
  """Helper to return a tensor of ones with the given shape and dtype.

  Args:
    shape: the shape of the ones to produce.
    dtype: the dtype of the ones to produce.

  Returns:
    None for the string dtype. Outside eager execution, the result of
    `array_ops.ones`. In eager execution, a constant for the `()` shape and a
    filled tensor otherwise; bool dtypes are filled with True.
  """
  as_dtype = dtypes.as_dtype(dtype)
  if as_dtype == dtypes.string:
    return None

  if not context.executing_eagerly():
    return array_ops.ones(shape, dtype)

  one = True if as_dtype.is_bool else 1

  is_scalar = shape == ()  # pylint: disable=g-explicit-bool-comparison
  if not is_scalar:
    return _fast_fill(one, shape, dtype)
  return constant_op.constant(one, dtype=dtype)
710
711
# Bundle this module's helpers into a VSpace and register it with the pywrap
# layer.
_default_vspace = imperative_grad.VSpace(
    aggregate_fn=_aggregate_grads,
    num_elements_fn=_num_elements,
    zeros_fn=_zeros,
    zeros_like_fn=default_gradient.zeros_like,
    ones_fn=_ones,
    ones_like_fn=default_gradient.ones_like,
    graph_shape_fn=gen_array_ops.shape)
pywrap_tfe.TFE_Py_RegisterVSpace(_default_vspace)
721
722
def _handle_or_self(x):
  """Returns `x.handle` for a resource variable and `x` itself otherwise."""
  return x.handle if resource_variable_ops.is_resource_variable(x) else x
728
729
730@tf_export("GradientTape", "autodiff.GradientTape", v1=["GradientTape"])
731class GradientTape(object):
732  """Record operations for automatic differentiation.
733
734  Operations are recorded if they are executed within this context manager and
735  at least one of their inputs is being "watched".
736
737  Trainable variables (created by `tf.Variable` or `tf.compat.v1.get_variable`,
738  where `trainable=True` is default in both cases) are automatically watched.
739  Tensors can be manually watched by invoking the `watch` method on this context
740  manager.
741
742  For example, consider the function `y = x * x`. The gradient at `x = 3.0` can
743  be computed as:
744
745  >>> x = tf.constant(3.0)
746  >>> with tf.GradientTape() as g:
747  ...   g.watch(x)
748  ...   y = x * x
749  >>> dy_dx = g.gradient(y, x)
750  >>> print(dy_dx)
751  tf.Tensor(6.0, shape=(), dtype=float32)
752
753  GradientTapes can be nested to compute higher-order derivatives. For example,
754
755  >>> x = tf.constant(5.0)
756  >>> with tf.GradientTape() as g:
757  ...   g.watch(x)
758  ...   with tf.GradientTape() as gg:
759  ...     gg.watch(x)
760  ...     y = x * x
761  ...   dy_dx = gg.gradient(y, x)  # dy_dx = 2 * x
762  >>> d2y_dx2 = g.gradient(dy_dx, x)  # d2y_dx2 = 2
763  >>> print(dy_dx)
764  tf.Tensor(10.0, shape=(), dtype=float32)
765  >>> print(d2y_dx2)
766  tf.Tensor(2.0, shape=(), dtype=float32)
767
768  By default, the resources held by a GradientTape are released as soon as
769  GradientTape.gradient() method is called. To compute multiple gradients over
770  the same computation, create a persistent gradient tape. This allows multiple
771  calls to the gradient() method as resources are released when the tape object
772  is garbage collected. For example:
773
774  >>> x = tf.constant(3.0)
775  >>> with tf.GradientTape(persistent=True) as g:
776  ...   g.watch(x)
777  ...   y = x * x
778  ...   z = y * y
779  >>> dz_dx = g.gradient(z, x)  # (4*x^3 at x = 3)
780  >>> print(dz_dx)
781  tf.Tensor(108.0, shape=(), dtype=float32)
782  >>> dy_dx = g.gradient(y, x)
783  >>> print(dy_dx)
784  tf.Tensor(6.0, shape=(), dtype=float32)
785
786  By default GradientTape will automatically watch any trainable variables that
787  are accessed inside the context. If you want fine grained control over which
788  variables are watched you can disable automatic tracking by passing
789  `watch_accessed_variables=False` to the tape constructor:
790
791  >>> x = tf.Variable(2.0)
792  >>> w = tf.Variable(5.0)
793  >>> with tf.GradientTape(
794  ...     watch_accessed_variables=False, persistent=True) as tape:
795  ...   tape.watch(x)
796  ...   y = x ** 2  # Gradients will be available for `x`.
797  ...   z = w ** 3  # No gradients will be available as `w` isn't being watched.
798  >>> dy_dx = tape.gradient(y, x)
799  >>> print(dy_dx)
800  tf.Tensor(4.0, shape=(), dtype=float32)
801  >>> # No gradients will be available as `w` isn't being watched.
802  >>> dz_dy = tape.gradient(z, w)
803  >>> print(dz_dy)
804  None
805
806  Note that when using models you should ensure that your variables exist when
807  using `watch_accessed_variables=False`. Otherwise it's quite easy to make your
808  first iteration not have any gradients:
809
810  ```python
811  a = tf.keras.layers.Dense(32)
812  b = tf.keras.layers.Dense(32)
813
814  with tf.GradientTape(watch_accessed_variables=False) as tape:
815    tape.watch(a.variables)  # Since `a.build` has not been called at this point
816                             # `a.variables` will return an empty list and the
817                             # tape will not be watching anything.
818    result = b(a(inputs))
819    tape.gradient(result, a.variables)  # The result of this computation will be
820                                        # a list of `None`s since a's variables
821                                        # are not being watched.
822  ```
823
824  Note that only tensors with real or complex dtypes are differentiable.
825  """
826
827  def __init__(self, persistent=False, watch_accessed_variables=True):
828    """Creates a new GradientTape.
829
830    Args:
831      persistent: Boolean controlling whether a persistent gradient tape
832        is created. False by default, which means at most one call can
833        be made to the gradient() method on this object.
834      watch_accessed_variables: Boolean controlling whether the tape will
835        automatically `watch` any (trainable) variables accessed while the tape
836        is active. Defaults to True meaning gradients can be requested from any
837        result computed in the tape derived from reading a trainable `Variable`.
838        If False users must explicitly `watch` any `Variable`s they want to
839        request gradients from.
840    """
841    self._tape = None
842    self._persistent = persistent
843    self._watch_accessed_variables = watch_accessed_variables
844    self._watched_variables = ()
845    self._recording = False
846
847  def __enter__(self):
848    """Enters a context inside which operations are recorded on this tape."""
849    self._push_tape()
850    return self
851
852  def __exit__(self, typ, value, traceback):
853    """Exits the recording context, no further operations are traced."""
854    if self._recording:
855      self._pop_tape()
856
857  def _push_tape(self):
858    """Pushes a new tape onto the tape stack."""
859    if self._recording:
860      raise ValueError("Tape is still recording, This can happen if you try to "
861                       "re-enter an already-active tape.")
862    if self._tape is None:
863      self._tape = tape.push_new_tape(
864          persistent=self._persistent,
865          watch_accessed_variables=self._watch_accessed_variables)
866    else:
867      tape.push_tape(self._tape)
868    self._recording = True
869
870  def _pop_tape(self):
871    if not self._recording:
872      raise ValueError("Tape is not recording.")
873    tape.pop_tape(self._tape)
874    self._recording = False
875
876  @tf_contextlib.contextmanager
877  def _ensure_recording(self):
878    """Ensures that this tape is recording."""
879    if not self._recording:
880      try:
881        self._push_tape()
882        yield
883      finally:
884        self._pop_tape()
885    else:
886      yield
887
888  def watch(self, tensor):
889    """Ensures that `tensor` is being traced by this tape.
890
891    Args:
892      tensor: a Tensor or list of Tensors.
893
894    Raises:
895      ValueError: if it encounters something that is not a tensor.
896    """
897    for t in nest.flatten(tensor, expand_composites=True):
898      if not (_pywrap_utils.IsTensor(t) or _pywrap_utils.IsVariable(t)):
899        raise ValueError("Passed in object of type {}, not tf.Tensor".format(
900            type(t)))
901      if not backprop_util.IsTrainable(t):
902        logging.log_first_n(
903            logging.WARN, "The dtype of the watched tensor must be "
904            "floating (e.g. tf.float32), got %r", 5, t.dtype)
905      if hasattr(t, "handle"):
906        # There are many variable-like objects, all of them currently have
907        # `handle` attribute that points to a tensor. If this changes, internals
908        # of watch_variable need to change as well.
909        tape.watch_variable(self._tape, t)
910      else:
911        tape.watch(self._tape, t)
912
913  @tf_contextlib.contextmanager
914  def stop_recording(self):
915    """Temporarily stops recording operations on this tape.
916
917    Operations executed while this context manager is active will not be
918    recorded on the tape. This is useful for reducing the memory used by tracing
919    all computations.
920
921    For example:
922
923    >>> x = tf.constant(4.0)
924    >>> with tf.GradientTape() as tape:
925    ...   with tape.stop_recording():
926    ...     y = x ** 2
927    >>> dy_dx = tape.gradient(y, x)
928    >>> print(dy_dx)
929    None
930
931    Yields:
932      None
933    Raises:
934      RuntimeError: if the tape is not currently recording.
935    """
936    if self._tape is None:
937      raise RuntimeError(
938          "Trying to stop recording a tape which is not recording.")
939    self._pop_tape()
940    try:
941      yield
942    finally:
943      self._push_tape()
944
945  def reset(self):
946    """Clears all information stored in this tape.
947
948    Equivalent to exiting and reentering the tape context manager with a new
949    tape. For example, the two following code blocks are equivalent:
950
951    ```
952    with tf.GradientTape() as t:
953      loss = loss_fn()
954    with tf.GradientTape() as t:
955      loss += other_loss_fn()
956    t.gradient(loss, ...)  # Only differentiates other_loss_fn, not loss_fn
957
958
959    # The following is equivalent to the above
960    with tf.GradientTape() as t:
961      loss = loss_fn()
962      t.reset()
963      loss += other_loss_fn()
964    t.gradient(loss, ...)  # Only differentiates other_loss_fn, not loss_fn
965    ```
966
967    This is useful if you don't want to exit the context manager for the tape,
968    or can't because the desired reset point is inside a control flow construct:
969
970    ```
971    with tf.GradientTape() as t:
972      loss = ...
973      if loss > k:
974        t.reset()
975    ```
976    """
977    self._pop_tape()
978    self._tape = None
979    self._push_tape()
980
981  def watched_variables(self):
982    """Returns variables watched by this tape in order of construction."""
983    if self._tape is not None:
984      self._watched_variables = self._tape.watched_variables()
985    return self._watched_variables
986
987  def gradient(self,
988               target,
989               sources,
990               output_gradients=None,
991               unconnected_gradients=UnconnectedGradients.NONE):
992    """Computes the gradient using operations recorded in context of this tape.
993
994    Note: Unless you set `persistent=True` a GradientTape can only be used to
995    compute one set of gradients (or jacobians).
996
997    Args:
998      target: a list or nested structure of Tensors or Variables to be
999        differentiated.
1000      sources: a list or nested structure of Tensors or Variables. `target`
1001        will be differentiated against elements in `sources`.
1002      output_gradients: a list of gradients, one for each element of
1003        target. Defaults to None.
1004      unconnected_gradients: a value which can either hold 'none' or 'zero' and
1005        alters the value which will be returned if the target and sources are
1006        unconnected. The possible values and effects are detailed in
1007        'UnconnectedGradients' and it defaults to 'none'.
1008
1009    Returns:
1010      a list or nested structure of Tensors (or IndexedSlices, or None),
1011      one for each element in `sources`. Returned structure is the same as
1012      the structure of `sources`.
1013
1014    Raises:
1015      RuntimeError: If called on a used, non-persistent tape.
1016      RuntimeError: If called inside the context of the tape.
1017      TypeError: If the target is a None object.
1018      ValueError: If the target is a variable or if unconnected gradients is
1019       called with an unknown value.
1020    """
1021    if self._tape is None:
1022      raise RuntimeError("A non-persistent GradientTape can only be used to "
1023                         "compute one set of gradients (or jacobians)")
1024    if self._recording:
1025      if not self._persistent:
1026        self._pop_tape()
1027      else:
1028        logging.log_first_n(
1029            logging.WARN, "Calling GradientTape.gradient on a persistent "
1030            "tape inside its context is significantly less "
1031            "efficient than calling it outside the context (it "
1032            "causes the gradient ops to be recorded on the "
1033            "tape, leading to increased CPU and memory usage). "
1034            "Only call GradientTape.gradient inside the "
1035            "context if you actually want to trace the "
1036            "gradient in order to compute higher order "
1037            "derivatives.", 1)
1038
1039    if target is None:
1040      raise TypeError("Target should be a list or nested structure"
1041                      " of Tensors or Variables to be differentiated,"
1042                      " but recieved %r" % (target))
1043
1044    flat_targets = []
1045    for t in nest.flatten(target):
1046      if not backprop_util.IsTrainable(t):
1047        logging.vlog(
1048            logging.WARN, "The dtype of the target tensor must be "
1049            "floating (e.g. tf.float32) when calling GradientTape.gradient, "
1050            "got %r", t.dtype)
1051      if resource_variable_ops.is_resource_variable(t):
1052        with self:
1053          t = ops.convert_to_tensor(t)
1054      flat_targets.append(t)
1055
1056    flat_sources = nest.flatten(sources)
1057    flat_sources_raw = flat_sources
1058    flat_sources = [_handle_or_self(x) for x in flat_sources]
1059    for t in flat_sources_raw:
1060      if not backprop_util.IsTrainable(t):
1061        logging.vlog(
1062            logging.WARN, "The dtype of the source tensor must be "
1063            "floating (e.g. tf.float32) when calling GradientTape.gradient, "
1064            "got %r", t.dtype)
1065      if getattr(t, "is_packed", False):
1066        raise ValueError(
1067            "GradientTape.gradient is not supported on packed EagerTensors yet."
1068        )
1069
1070    if output_gradients is not None:
1071      output_gradients = [None if x is None else ops.convert_to_tensor(x)
1072                          for x in nest.flatten(output_gradients)]
1073
1074    flat_grad = imperative_grad.imperative_grad(
1075        self._tape,
1076        flat_targets,
1077        flat_sources,
1078        output_gradients=output_gradients,
1079        sources_raw=flat_sources_raw,
1080        unconnected_gradients=unconnected_gradients)
1081
1082    if not self._persistent:
1083      # Keep track of watched variables before setting tape to None
1084      self._watched_variables = self._tape.watched_variables()
1085      self._tape = None
1086
1087    grad = nest.pack_sequence_as(sources, flat_grad)
1088    return grad
1089
1090  def jacobian(self,
1091               target,
1092               sources,
1093               unconnected_gradients=UnconnectedGradients.NONE,
1094               parallel_iterations=None,
1095               experimental_use_pfor=True):
1096    """Computes the jacobian using operations recorded in context of this tape.
1097
1098    Note: Unless you set `persistent=True` a GradientTape can only be used to
1099    compute one set of gradients (or jacobians).
1100
1101    Note: By default the jacobian implementation uses parallel for (pfor), which
1102    creates a tf.function under the hood for each jacobian call. For better
1103    performance, and to avoid recompilation and vectorization rewrites on each
1104    call, enclose GradientTape code in @tf.function.
1105
1106    See[wikipedia
1107    article](http://en.wikipedia.org/wiki/jacobian_matrix_and_determinant)
1108    for the definition of a Jacobian.
1109
1110    Example usage:
1111
1112    ```python
1113    with tf.GradientTape() as g:
1114      x  = tf.constant([1.0, 2.0])
1115      g.watch(x)
1116      y = x * x
1117    jacobian = g.jacobian(y, x)
1118    # jacobian value is [[2., 0.], [0., 4.]]
1119    ```
1120
1121    Args:
1122      target: Tensor to be differentiated.
1123      sources: a list or nested structure of Tensors or Variables. `target`
1124        will be differentiated against elements in `sources`.
1125      unconnected_gradients: a value which can either hold 'none' or 'zero' and
1126        alters the value which will be returned if the target and sources are
1127        unconnected. The possible values and effects are detailed in
1128        'UnconnectedGradients' and it defaults to 'none'.
1129      parallel_iterations: A knob to control how many iterations are dispatched
1130        in parallel. This knob can be used to control the total memory usage.
1131      experimental_use_pfor: If true, vectorizes the jacobian computation. Else
1132        falls back to a sequential while_loop. Vectorization can sometimes fail
1133        or lead to excessive memory usage. This option can be used to disable
1134        vectorization in such cases.
1135
1136    Returns:
1137      A list or nested structure of Tensors (or None), one for each element in
1138      `sources`. Returned structure is the same as the structure of `sources`.
1139      Note if any gradient is sparse (IndexedSlices), jacobian function
1140      currently makes it dense and returns a Tensor instead. This may change in
1141      the future.
1142
1143
1144    Raises:
1145      RuntimeError: If called on a used, non-persistent tape.
1146      RuntimeError: If called on a non-persistent tape with eager execution
1147        enabled and without enabling experimental_use_pfor.
1148      ValueError: If vectorization of jacobian computation fails.
1149    """
1150    if self._tape is None:
1151      raise RuntimeError("A non-persistent GradientTape can only be used to "
1152                         "compute one set of gradients (or jacobians)")
1153
1154    flat_sources = nest.flatten(sources)
1155    target_static_shape = target.shape
1156    target_shape = array_ops.shape(target)
1157    # Note that we push and pop the tape here and below. This is needed since we
1158    # need gradients through the enclosed operations.
1159    with self._ensure_recording():
1160      target = array_ops.reshape(target, [-1])
1161
1162    def loop_fn(i):
1163      with self._ensure_recording():
1164        y = array_ops.gather(target, i)
1165      return self.gradient(y, flat_sources,
1166                           unconnected_gradients=unconnected_gradients)
1167
1168    try:
1169      target_size = int(target.shape[0])
1170    except TypeError:
1171      target_size = array_ops.shape(target)[0]
1172
1173    if experimental_use_pfor:
1174      try:
1175        output = pfor_ops.pfor(loop_fn, target_size,
1176                               parallel_iterations=parallel_iterations)
1177      except ValueError as err:
1178        six.reraise(
1179            ValueError,
1180            ValueError(
1181                str(err) + "\nEncountered an exception while vectorizing the "
1182                "jacobian computation. Vectorization can be disabled by setting"
1183                " experimental_use_pfor to False."),
1184            sys.exc_info()[2])
1185    else:
1186      if context.executing_eagerly() and not self._persistent:
1187        raise RuntimeError(
1188            "GradientTape must be created with persistent=True"
1189            " to compute the jacobian with eager execution enabled and with "
1190            " experimental_use_pfor set to False.")
1191      output = pfor_ops.for_loop(
1192          loop_fn, [target.dtype] * len(flat_sources), target_size,
1193          parallel_iterations=parallel_iterations)
1194
1195    for i, out in enumerate(output):
1196      if out is not None:
1197        new_shape = array_ops.concat(
1198            [target_shape, array_ops.shape(out)[1:]], axis=0)
1199        out = array_ops.reshape(out, new_shape)
1200        if context.executing_eagerly():
1201          out.set_shape(target_static_shape.concatenate(flat_sources[i].shape))
1202      output[i] = out
1203
1204    return nest.pack_sequence_as(sources, output)
1205
1206  def batch_jacobian(self,
1207                     target,
1208                     source,
1209                     unconnected_gradients=UnconnectedGradients.NONE,
1210                     parallel_iterations=None,
1211                     experimental_use_pfor=True):
1212    """Computes and stacks per-example jacobians.
1213
1214    See [wikipedia article](http://en.wikipedia.org/wiki/jacobian_matrix_and_determinant)
1215    for the definition of a Jacobian. This function is essentially an efficient
1216    implementation of the following:
1217
1218    `tf.stack([self.jacobian(y[i], x[i]) for i in range(x.shape[0])])`.
1219
1220    Note that compared to `GradientTape.jacobian` which computes gradient of
1221    each output value w.r.t each input value, this function is useful when
1222    `target[i,...]` is independent of `source[j,...]` for `j != i`. This
1223    assumption allows more efficient computation as compared to
1224    `GradientTape.jacobian`. The output, as well as intermediate activations,
1225    are lower dimensional and avoid a bunch of redundant zeros which would
1226    result in the jacobian computation given the independence assumption.
1227
1228    Note: Unless you set `persistent=True` a GradientTape can only be used to
1229    compute one set of gradients (or jacobians).
1230
1231    Note: By default the batch_jacobian implementation uses parallel for (pfor),
1232    which creates a tf.function under the hood for each batch_jacobian call.
1233    For better performance, and to avoid recompilation and vectorization
1234    rewrites on each call, enclose GradientTape code in @tf.function.
1235
1236
1237    Example usage:
1238
1239    ```python
1240    with tf.GradientTape() as g:
1241      x = tf.constant([[1., 2.], [3., 4.]], dtype=tf.float32)
1242      g.watch(x)
1243      y = x * x
1244    batch_jacobian = g.batch_jacobian(y, x)
1245    # batch_jacobian is [[[2,  0], [0,  4]], [[6,  0], [0,  8]]]
1246    ```
1247
1248    Args:
1249      target: A tensor with rank 2 or higher and with shape [b, y1, ..., y_n].
1250        `target[i,...]` should only depend on `source[i,...]`.
1251      source: A tensor with rank 2 or higher and with shape [b, x1, ..., x_m].
1252      unconnected_gradients: a value which can either hold 'none' or 'zero' and
1253        alters the value which will be returned if the target and sources are
1254        unconnected. The possible values and effects are detailed in
1255        'UnconnectedGradients' and it defaults to 'none'.
1256      parallel_iterations: A knob to control how many iterations are dispatched
1257        in parallel. This knob can be used to control the total memory usage.
1258      experimental_use_pfor: If true, uses pfor for computing the Jacobian. Else
1259        uses a tf.while_loop.
1260
1261    Returns:
1262      A tensor `t` with shape [b, y_1, ..., y_n, x1, ..., x_m] where `t[i, ...]`
1263      is the jacobian of `target[i, ...]` w.r.t. `source[i, ...]`, i.e. stacked
1264      per-example jacobians.
1265
1266    Raises:
1267      RuntimeError: If called on a used, non-persistent tape.
1268      RuntimeError: If called on a non-persistent tape with eager execution
1269        enabled and without enabling experimental_use_pfor.
1270      ValueError: If vectorization of jacobian computation fails or if first
1271        dimension of `target` and `source` do not match.
1272    """
1273    if self._tape is None:
1274      raise RuntimeError("A non-persistent GradientTape can only be used to"
1275                         "compute one set of gradients (or jacobians)")
1276    target_shape = target.shape
1277    if target_shape.rank is None:
1278      dim = tensor_shape.Dimension(None)
1279    else:
1280      dim = target_shape.dims[0]
1281    if not (target_shape.with_rank_at_least(2) and
1282            source.shape.with_rank_at_least(2) and
1283            dim.is_compatible_with(source.shape[0])):
1284      raise ValueError(
1285          "Need first dimension of target shape (%s) and "
1286          "source shape (%s) to match." % (target.shape, source.shape))
1287    if target_shape.is_fully_defined():
1288      batch_size = int(target_shape[0])
1289      target_row_size = target_shape.num_elements() // batch_size
1290    else:
1291      target_shape = array_ops.shape(target)
1292      batch_size = target_shape[0]
1293      target_row_size = array_ops.size(target) // batch_size
1294    source_shape = array_ops.shape(source)
1295    # Flatten target to 2-D.
1296    # Note that we push and pop the tape here and below. This is needed since we
1297    # need gradients through the enclosed operations.
1298    with self._ensure_recording():
1299      with ops.control_dependencies(
1300          [check_ops.assert_equal(batch_size, source_shape[0])]):
1301        target = array_ops.reshape(target, [batch_size, target_row_size])
1302
1303    def loop_fn(i):
1304      with self._ensure_recording():
1305        y = array_ops.gather(target, i, axis=1)
1306      return self.gradient(y, source,
1307                           unconnected_gradients=unconnected_gradients)
1308
1309    if experimental_use_pfor:
1310      try:
1311        output = pfor_ops.pfor(loop_fn, target_row_size,
1312                               parallel_iterations=parallel_iterations)
1313      except ValueError as err:
1314        six.reraise(
1315            ValueError,
1316            ValueError(
1317                str(err) + "\nEncountered an exception while vectorizing the "
1318                "batch_jacobian computation. Vectorization can be disabled by "
1319                "setting experimental_use_pfor to False."),
1320            sys.exc_info()[2])
1321    else:
1322      if context.executing_eagerly() and not self._persistent:
1323        raise RuntimeError(
1324            "GradientTape must be created with persistent=True"
1325            " to compute the batch_jacobian with eager execution enabled and "
1326            " with experimental_use_pfor set to False.")
1327      output = pfor_ops.for_loop(loop_fn, target.dtype, target_row_size,
1328                                 parallel_iterations=parallel_iterations)
1329    new_shape = array_ops.concat([target_shape, source_shape[1:]], axis=0)
1330    if output is None:
1331      # Note that this block is returning zeros when it could use `None` to
1332      # represent unconnected gradients. This is to maintain compatibility with
1333      # the previous behavior, which ignored `unconnected_gradients`.
1334      output = array_ops.zeros(new_shape, target.dtype)
1335      return output
1336    else:
1337      output = array_ops.reshape(output,
1338                                 [target_row_size, batch_size, -1])
1339      output = array_ops.transpose(output, [1, 0, 2])
1340
1341      output = array_ops.reshape(output, new_shape)
1342      return output
1343