# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Library of TPU helper functions."""

import enum
import math
from typing import List, Optional, Text, Tuple

import numpy as np

from tensorflow.python.platform import tf_logging as logging
from tensorflow.python.tpu.topology import Topology
from tensorflow.python.util.tf_export import tf_export


SINGLE_CORE_ASSIGNMENT = [[[0, 0, 0, 0]]]
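# SINGLE_CORE_ASSIGNMENT has shape [replica, logical core, coordinate]: one
# replica with one logical core, placed at topology coordinates (0, 0, 0, 0).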


def _compute_task_and_cores_to_replicas(core_assignment, topology):
  """Computes a nested dict which maps task and logical core to replicas."""
  task_and_cores_to_replicas = {}
  for replica in range(core_assignment.shape[0]):
    for logical_core in range(core_assignment.shape[1]):
      coordinates = core_assignment[replica, logical_core, :]
      task_id = topology.task_ordinal_at_coordinates(coordinates)
      if task_id not in task_and_cores_to_replicas:
        task_and_cores_to_replicas[task_id] = {}
      if logical_core not in task_and_cores_to_replicas[task_id]:
        task_and_cores_to_replicas[task_id][logical_core] = set()

      task_and_cores_to_replicas[task_id][logical_core].add(replica)

  task_to_sorted_replica_id = {}

  for task, core_to_replicas in task_and_cores_to_replicas.items():
    core_to_sorted_replicas = {}
    for core, replicas in core_to_replicas.items():
      core_to_sorted_replicas[core] = sorted(replicas)

    task_to_sorted_replica_id[task] = core_to_sorted_replicas
  return task_to_sorted_replica_id
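
# For example, with core_assignment [[[0, 0, 0, 0]], [[1, 0, 0, 0]]] (two
# replicas, one logical core each) and both coordinates living on task 0, the
# helper above returns {0: {0: [0, 1]}}.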


@tf_export("tpu.experimental.DeviceAssignment")
class DeviceAssignment(object):
  """Mapping from logical cores in a computation to the physical TPU topology.

  Prefer to use the `DeviceAssignment.build()` helper to construct a
  `DeviceAssignment`; it is easier, if less flexible, than constructing a
  `DeviceAssignment` directly.
  """

  def __init__(self, topology: Topology, core_assignment: np.ndarray):
    """Constructs a `DeviceAssignment` object.

    Args:
      topology: A `Topology` object that describes the physical TPU topology.
      core_assignment: A logical to physical core mapping, represented as a
        rank 3 numpy array. See the description of the `core_assignment`
        property for more details.

    Raises:
      ValueError: If `topology` is not a `Topology` object.
      ValueError: If `core_assignment` is not a rank 3 numpy array.
    """
    if not isinstance(topology, Topology):
      raise ValueError("topology must be a Topology object, got {}".format(
          type(topology)))
    core_assignment = np.asarray(core_assignment, dtype=np.int32)

    self._topology = topology

    if core_assignment.ndim != 3:
      raise ValueError("core_assignment must be a rank 3 numpy array, "
                       f"got shape {core_assignment.shape}")

    self._num_replicas = core_assignment.shape[0]
    self._num_cores_per_replica = core_assignment.shape[1]

    if core_assignment.shape[-1] != topology.mesh_rank:
      raise ValueError(
          "core_assignment.shape[-1] must have size equal to topology "
          f"rank ({topology.mesh_rank}), got "
          f"core_assignment.shape={core_assignment.shape}")

    self._core_assignment = core_assignment
    self._task_and_cores_to_replicas = _compute_task_and_cores_to_replicas(
        self._core_assignment, topology)

  @property
  def topology(self) -> Topology:
    """A `Topology` that describes the TPU topology."""
    return self._topology

  @property
  def num_cores_per_replica(self) -> int:
    """The number of cores per replica."""
    return self._num_cores_per_replica

  @property
  def num_replicas(self) -> int:
    """The number of replicas of the computation."""
    return self._num_replicas

  @property
  def core_assignment(self) -> np.ndarray:
    """The logical to physical core mapping.

    Returns:
      An integer numpy array of rank 3, with shape
      `[num_replicas, num_cores_per_replica, topology_rank]`. Maps
      (replica, logical core) pairs to physical topology coordinates.
    """
    return self._core_assignment

  def coordinates(self, replica: int, logical_core: int) -> Tuple:  # pylint:disable=g-bare-generic
    """Returns the physical topology coordinates of a logical core."""
    return tuple(self.core_assignment[replica, logical_core, :])

  def lookup_replicas(self, task_id: int, logical_core: int) -> List[int]:
    """Lookup replica ids by task number and logical core.

    Args:
      task_id: TensorFlow task number.
      logical_core: An integer, identifying a logical core.
    Returns:
      A sorted list of the replicas that are attached to that task and
      logical_core.
    Raises:
      ValueError: If no replica exists in the task which contains the logical
        core.
    """
    try:
      return self._task_and_cores_to_replicas[task_id][logical_core]
    except KeyError:
      raise ValueError(
          "Cannot find any replica in task {} that contains logical_core "
          "{}".format(task_id, logical_core))

  def tpu_ordinal(self, replica: int = 0, logical_core: int = 0) -> int:
    """Returns the ordinal of the TPU device assigned to a logical core."""
    coordinates = self.coordinates(replica, logical_core)
    return self._topology.tpu_device_ordinal_at_coordinates(coordinates)

  def host_device(self,
                  replica: int = 0,
                  logical_core: int = 0,
                  job: Optional[Text] = None) -> Text:
    """Returns the CPU device attached to a logical core."""
    coordinates = self.coordinates(replica, logical_core)
    return self._topology.cpu_device_name_at_coordinates(coordinates, job=job)

  def tpu_device(self,
                 replica: int = 0,
                 logical_core: int = 0,
                 job: Optional[Text] = None) -> Text:
    """Returns the name of the TPU device assigned to a logical core."""
    coordinates = self.coordinates(replica, logical_core)
    return self._topology.tpu_device_name_at_coordinates(coordinates, job=job)

  @staticmethod
  def build(topology: Topology,
            computation_shape: Optional[np.ndarray] = None,
            computation_stride: Optional[np.ndarray] = None,
            num_replicas: int = 1) -> "DeviceAssignment":
    """Builds a `DeviceAssignment`; see `device_assignment` for details."""
    return device_assignment(topology, computation_shape, computation_stride,
                             num_replicas)

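# A minimal sketch of querying a `DeviceAssignment` (assumes `topology` was
# obtained from an initialized TPU system, e.g. via
# `tf.tpu.experimental.initialize_tpu_system`):
#
#   da = DeviceAssignment.build(topology, num_replicas=2)
#   da.coordinates(replica=0, logical_core=0)  # e.g. (0, 0, 0, 0)
#   da.tpu_ordinal(replica=0, logical_core=0)  # e.g. 0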

def _open_ring_2d(x_size: int, y_size: int,
                  z_coord: int) -> List[Tuple[int, int, int]]:
  """Ring-order of an X by Y mesh, with a fixed Z coordinate.

  For example, in a 4x4 mesh, this returns the following order.
    0 -- 1 -- 2 -- 3
    |    |    |    |
    15-- 6 -- 5 -- 4
    |    |    |    |
    14-- 7 -- 8 -- 9
    |    |    |    |
    13-- 12-- 11-- 10

  Note that chip 0 is not included in the output.

  Args:
    x_size: An integer representing the mesh size in the x-dimension. Must be
      larger than 1.
    y_size: An integer representing the mesh size in the y-dimension. Must be
      larger than 1.
    z_coord: An integer representing the z-coordinate to use for the chips in
      the ring.

  Returns:
    A list of (x, y, z) triples in ring order.
  """
  ret = []
  for i in range(y_size // 2):
    for j in range(1, x_size):
      ret.append((j, 2 * i, z_coord))
    for j in range(x_size - 1, 0, -1):
      ret.append((j, 2 * i + 1, z_coord))
  for i in range(y_size - 1, 0, -1):
    ret.append((0, i, z_coord))
  return ret

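# A quick sanity check (a sketch; values worked out by hand for a 4x2 mesh):
#
#   _open_ring_2d(4, 2, 0)
#   => [(1, 0, 0), (2, 0, 0), (3, 0, 0),
#       (3, 1, 0), (2, 1, 0), (1, 1, 0), (0, 1, 0)]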

def _ring_3d(x_size: int, y_size: int,
             z_size: int) -> List[Tuple[int, int, int]]:
  """Ring-order of an X by Y by Z mesh.

  Constructs the 3d ring from 2d rings that are stacked in the Z dimension and
  joined in one corner. For example, in a 4x4x4 mesh, this returns the
  following order.

  z == 0:
    0 -- 1 -- 2 -- 3
    |    |    |    |
    15 - 6 -- 5 -- 4
    |    |    |    |
    14 - 7 -- 8 -- 9
    |    |    |    |
    13 - 12 - 11 - 10
  z == 1:
    63 - 30 - 29 - 28
    |    |    |    |
    16 - 25 - 26 - 27
    |    |    |    |
    17 - 24 - 23 - 22
    |    |    |    |
    18 - 19 - 20 - 21
  z == 2:
    62 - 31 - 32 - 33
    |    |    |    |
    45 - 36 - 35 - 34
    |    |    |    |
    44 - 37 - 38 - 39
    |    |    |    |
    43 - 42 - 41 - 40
  z == 3:
    61 - 60 - 59 - 58
    |    |    |    |
    46 - 55 - 56 - 57
    |    |    |    |
    47 - 54 - 53 - 52
    |    |    |    |
    48 - 49 - 50 - 51

  Args:
    x_size: An integer representing the mesh size in the x-dimension. Must be
      larger than 1.
    y_size: An integer representing the mesh size in the y-dimension. Must be
      larger than 1.
    z_size: An integer representing the mesh size in the z-dimension. Must be
      larger than 1.

  Returns:
    A list of (x, y, z) triples in ring order.
  """

  # Handle the case where 2 dimensions are size 1.
  if x_size == 1 and y_size == 1:
    return [(0, 0, i) for i in range(z_size)]
  if x_size == 1 and z_size == 1:
    return [(0, i, 0) for i in range(y_size)]
  if y_size == 1 and z_size == 1:
    return [(i, 0, 0) for i in range(x_size)]

  # Handle odd mesh dimensions.  This never happens in practice, so we don't
  # bother to try building something optimal.
  if (x_size > 1 and x_size % 2 != 0) or (y_size > 1 and
                                          y_size % 2 != 0) or (z_size > 1 and
                                                               z_size % 2 != 0):
    logging.warning(
        "Odd mesh dimension; falling back to a simple scan order rather than "
        "a ring order.")
    ret = []
    for z in range(z_size):
      for y in range(y_size):
        ret.extend((x, y, z) for x in range(x_size))
    return ret

  # Always start with chip 0.
  ret = [(0, 0, 0)]
  # Handle the case where one dimension is size 1.  We just build a flat, 2d
  # ring.
  if z_size == 1:
    ret.extend(_open_ring_2d(x_size, y_size, 0))
    return ret
  if y_size == 1:
    ret.extend((x, y, z) for (x, z, y) in _open_ring_2d(x_size, z_size, 0))
    return ret
  if x_size == 1:
    ret.extend((x, y, z) for (y, z, x) in _open_ring_2d(y_size, z_size, 0))
    return ret

  # Handle the case where all dimensions are even and have size > 1: stack
  # the 2d rings in the Z dimension, alternating direction, then walk back
  # along the (0, 0, z) column to close the ring.
  for i in range(z_size):
    r = _open_ring_2d(x_size, y_size, i)
    if i % 2 == 0:
      ret.extend(r)
    else:
      ret.extend(reversed(r))
  for i in range(z_size - 1, 0, -1):
    ret.append((0, 0, i))
  return ret

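# A quick sanity check (a sketch; worked out by hand for a 2x2x2 mesh); note
# that consecutive entries, including the wrap-around, differ by one in
# exactly one coordinate:
#
#   _ring_3d(2, 2, 2)
#   => [(0, 0, 0), (1, 0, 0), (1, 1, 0), (0, 1, 0),
#       (0, 1, 1), (1, 1, 1), (1, 0, 1), (0, 0, 1)]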

class DeviceOrderMode(enum.IntEnum):
  """The way of determining device orders when computing device assignment."""
  # The default mode: the library chooses to form rings when that is possible.
  AUTO = 0
  # Form rings for replicas and model-parallel cores.
  RING = 1
  # Form meshes for replicas and/or model-parallel cores.
  MESH = 2

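# For example, to force ring ordering (a sketch; `device_assignment` below
# raises a ValueError if the topology cannot be ring-ordered):
#
#   device_assignment(topology, num_replicas=8,
#                     device_order_mode=DeviceOrderMode.RING)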

def device_assignment(
    topology: Topology,
    computation_shape: Optional[np.ndarray] = None,
    computation_stride: Optional[np.ndarray] = None,
    num_replicas: int = 1,
    device_order_mode: DeviceOrderMode = DeviceOrderMode.AUTO
) -> DeviceAssignment:
  """Computes a device_assignment of a computation across a TPU topology.

  Attempts to choose a compact grid of cores for locality.

  Returns a `DeviceAssignment` that maps each logical core of each replica to
  a physical core in the topology.

  `computation_shape` and `computation_stride` values should be powers of 2
  for optimal packing.

  Args:
    topology: A `Topology` object that describes the TPU cluster topology. To
      obtain a TPU topology, evaluate the `Tensor` returned by
      `initialize_system` using `Session.run`. Either a serialized
      `TopologyProto` or a `Topology` object may be passed. Note: you must
      evaluate the `Tensor` first; you cannot pass an unevaluated `Tensor`
      here.
    computation_shape: A rank 1 int32 numpy array with size equal to the
      topology rank, describing the shape of the computation's block of cores.
      If None, the `computation_shape` is `[1] * topology_rank`.
    computation_stride: A rank 1 int32 numpy array of size `topology_rank`,
      describing the inter-core spacing of the `computation_shape` cores in
      the TPU topology. If None, the `computation_stride` is
      `[1] * topology_rank`.
    num_replicas: The number of computation replicas to run. The replicas will
      be packed into the free spaces of the topology.
    device_order_mode: A `DeviceOrderMode` enum that indicates whether to
      assign devices so they form rings or meshes, or to let the library
      choose.

  Returns:
    A DeviceAssignment object, which describes the mapping between the logical
    cores in each computation replica and the physical cores in the TPU
    topology.

  Raises:
    ValueError: If `topology` is not a valid `Topology` object.
    ValueError: If `computation_shape` or `computation_stride` are not rank 1
      int32 numpy arrays of shape [topology_rank] where all values are
      positive.
    ValueError: If the computation's replicas cannot fit into the TPU
      topology.
  """
  # Deserialize the Topology proto, if it is a string.
  if isinstance(topology, bytes):
    topology = Topology(serialized=topology)

  if not isinstance(topology, Topology):
    raise ValueError(
        f"`topology` is not a Topology object; got {type(topology)}")

  topology_rank = len(topology.mesh_shape)
  mesh_shape = topology.mesh_shape
  if computation_shape is None:
    computation_shape = np.array([1] * topology_rank, dtype=np.int32)
  else:
    computation_shape = np.asarray(computation_shape, dtype=np.int32)

  if computation_stride is None:
    computation_stride = np.array([1] * topology_rank, dtype=np.int32)
  else:
    computation_stride = np.asarray(computation_stride, dtype=np.int32)

  if computation_shape.shape != (topology_rank,):
    raise ValueError(
        f"computation_shape must have shape [{topology_rank}]; "
        f"got {computation_shape.shape}"
    )
  if computation_stride.shape != (topology_rank,):
    raise ValueError(
        f"computation_stride must have shape [{topology_rank}]; "
        f"got {computation_stride.shape}"
    )

  if any(computation_shape < 1):
    raise ValueError(
        "computation_shape must be positive; got computation_shape={}".format(
            computation_shape))
  if any(computation_stride < 1):
    raise ValueError(
        "computation_stride must be positive; got computation_stride={}".format(
            computation_stride))

  # Computes the physical size of one computation instance.
  computation_footprint = computation_shape * computation_stride
  if any(computation_footprint > mesh_shape):
    raise ValueError(
        "computation footprint {} does not fit in TPU topology shape {}".format(
            computation_footprint, mesh_shape))

  # Computes how many copies of the computation footprint fit in the mesh.
  block_counts = mesh_shape // computation_footprint

  replica_counts = block_counts * computation_stride
  max_replicas = np.prod(replica_counts)
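  # For example, on a hypothetical 4x4x1x2 mesh with
  # computation_shape=[2, 2, 1, 1] and unit stride, the footprint is
  # [2, 2, 1, 1], block_counts is [2, 2, 1, 2], and max_replicas is 8.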
  if num_replicas > max_replicas:
    raise ValueError(
        "requested {} replicas but only {} replicas with shape {} and "
        "computation_stride {} fit in a TPU mesh of shape {}".format(
            num_replicas, max_replicas, computation_shape, computation_stride,
            mesh_shape))

  def ceil_of_ratio(n, m):
    return (n + m - 1) // m
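  # For example, ceil_of_ratio(5, 2) == 3.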

  if topology.missing_devices.size == 0:
    replica_shape = [0] * topology_rank
    if num_replicas > 0:
      remaining_replicas = num_replicas
      remaining_dims = topology_rank

      # Choose dimensions as close to an equal cube as possible,
      # in order of increasing dimension size. By visiting dimensions
      # in increasing size, we assign the most constrained dimension
      # first, so we won't make infeasible choices.
      #
      # As a secondary sort order, visit the last dimension (core index) first,
      # then the other dimensions in increasing order. This means we try to use
      # both cores on the same chip in preference to two cores on different
      # chips.  We visit the x dimension first, and the z dimension last, so
      # that we prefer to arrange adjacent replicas on the same machine when
      # possible.
      #
      # For example, if num_replicas == 4, we prefer to use a replica_shape of
      # (2,1,1,2) over (1,1,2,2).

      for x, ni in sorted(((x, ((i + 1) % topology_rank))
                           for (i, x) in enumerate(replica_counts))):
        i = (ni + topology_rank - 1) % topology_rank
        target_size = int(math.ceil(remaining_replicas**(1.0 / remaining_dims)))
        replica_shape[i] = min(target_size, x)
        remaining_replicas = ceil_of_ratio(remaining_replicas, replica_shape[i])
        remaining_dims -= 1

      assert remaining_replicas == 1 and remaining_dims == 0

    # Assigns an offset to each replica such that no two replicas overlap.
    replica_offsets = np.full([num_replicas, topology_rank], -1, dtype=np.int32)

    enable_3d_tiling = (
        topology_rank == 4 and
        computation_shape[-1] == mesh_shape[-1]  # Only handle 3D case.
        and np.prod(computation_stride) == 1  # Ensure no stride.
        and num_replicas == max_replicas)  # Full replication.

    if device_order_mode != DeviceOrderMode.AUTO:
      if device_order_mode == DeviceOrderMode.RING and not enable_3d_tiling:
        raise ValueError(
            "device_order_mode=DeviceOrderMode.RING is not compatible with "
            "3D tiling on the current topology. Try setting "
            "device_order_mode=DeviceOrderMode.AUTO")
      enable_3d_tiling = device_order_mode == DeviceOrderMode.RING

    if enable_3d_tiling:
      assignment = []
      inner_ring = _ring_3d(computation_shape[0], computation_shape[1],
                            computation_shape[2])
      outer_ring = _ring_3d(replica_shape[0], replica_shape[1],
                            replica_shape[2])

      for replica in range(num_replicas):
        outer_x, outer_y, outer_z = outer_ring[replica]
        per_replica_assignment = []
        for index in range(np.prod(computation_shape)):
          inner_x, inner_y, inner_z = inner_ring[index // mesh_shape[-1]]
          px = outer_x * computation_shape[0] + inner_x
          py = outer_y * computation_shape[1] + inner_y
          pz = outer_z * computation_shape[2] + inner_z
          pi = index % mesh_shape[-1]
          per_replica_assignment.append([px, py, pz, pi])
        assignment.append(per_replica_assignment)
    else:
      for replica in range(num_replicas):
        # Chooses a replica number in each axis.
        t = replica
        pos = []
        # Visit the core number first.
        for dim in np.concatenate([[replica_shape[-1]], replica_shape[:-1]]):
          pos.append(t % dim)
          t //= dim
        replica_pos = np.concatenate([pos[1:], [pos[0]]])

        # Determines where that replica starts in each axis.
        outer = replica_pos // computation_stride
        inner = replica_pos % computation_stride
        replica_offsets[replica, :] = outer * computation_footprint + inner

      # Computes a logical core -> physical core mapping for each replica.
      indices = [
          np.arange(0, computation_shape[i] * computation_stride[i],
                    computation_stride[i]) for i in range(topology_rank)
      ]
      indices = np.concatenate(
          [i[..., np.newaxis] for i in np.meshgrid(*indices, indexing="ij")],
          axis=-1)
      indices = indices.reshape((-1, topology_rank))
      assignment = indices + replica_offsets[:, np.newaxis, :]
  else:
    # We have a slice with missing chips. We define a simple assignment by
    # ignoring computation stride. This assignment should enable a consistent
    # and correct device assignment on degraded slices. It is optimal when
    # weights are not sharded, but may be sub-optimal for other model
    # parallelism scenarios.
    assert np.prod(computation_stride) == 1
    # Next, we check if we have sufficient devices.
    assert num_replicas * np.prod(
        computation_shape) <= topology.num_tasks * topology.num_tpus_per_task
    # Map replicas to physical devices in task order.
    device_coordinates = topology.device_coordinates
    assignment = []
    devices_per_replica = np.prod(computation_shape)
    for rindex in range(num_replicas):
      replica_assignment = []
      for index in range(devices_per_replica):
        logical_id = rindex * devices_per_replica + index
        # Pick logical cores in task order.
        task = logical_id // topology.num_tpus_per_task
        device = logical_id % topology.num_tpus_per_task
        # Append physical cores to the replica assignment.
        replica_assignment.append(device_coordinates[task, device, :])
      assignment.append(replica_assignment)

  return DeviceAssignment(topology, core_assignment=assignment)
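
# End-to-end usage sketch (assumes a reachable TPU runtime; the resolver
# arguments are placeholders):
#
#   import tensorflow as tf
#
#   resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="")
#   tf.config.experimental_connect_to_cluster(resolver)
#   topology = tf.tpu.experimental.initialize_tpu_system(resolver)
#
#   # Two replicas, each spanning a single core.
#   da = tf.tpu.experimental.DeviceAssignment.build(topology, num_replicas=2)
#   print(da.num_replicas, da.num_cores_per_replica)
#   print(da.tpu_device(replica=0), da.host_device(replica=1))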