# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for multi-gpu training."""

from tensorflow.python.framework import ops
from tensorflow.python.keras import backend
from tensorflow.python.keras.engine.training import Model
from tensorflow.python.keras.layers.core import Lambda
from tensorflow.python.keras.layers.merge import concatenate
from tensorflow.python.ops import array_ops


def _get_available_devices():
  return [x.name for x in backend.get_session().list_devices()]


def _normalize_device_name(name):
  name = '/' + name.lower().split('device:')[1]
  return name
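
# Illustrative note: raw session device names look like
# '/job:localhost/replica:0/task:0/device:GPU:0'; `_normalize_device_name`
# reduces such a name to '/gpu:0'.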


def multi_gpu_model(model, gpus, cpu_merge=True, cpu_relocation=False):
  """Replicates a model on different GPUs.

  Specifically, this function implements single-machine
  multi-GPU data parallelism. It works in the following way:

  - Divide the model's input(s) into multiple sub-batches.
  - Apply a model copy on each sub-batch. Every model copy
      is executed on a dedicated GPU.
  - Concatenate the results (on CPU) into one big batch.

  E.g. if your `batch_size` is 64 and you use `gpus=2`,
  then we will divide the input into 2 sub-batches of 32 samples,
  process each sub-batch on one GPU, then return the full
  batch of 64 processed samples.

  This induces quasi-linear speedup on up to 8 GPUs.

  This function is only available with the TensorFlow backend
  for the time being.
  Args:
      model: A Keras model instance. To avoid OOM errors,
          this model could have been built on CPU, for instance
          (see usage example below).
      gpus: Integer >= 2, number of GPUs on which to create
          model replicas.
      cpu_merge: A boolean indicating whether to force merging model
          weights under the scope of the CPU.
      cpu_relocation: A boolean indicating whether to create the model's
          weights under the scope of the CPU. If the model is not defined
          under any preceding device scope, you can still rescue it by
          activating this option.

  Returns:
      A Keras `Model` instance which can be used just like the initial
      `model` argument, but which distributes its workload on multiple GPUs.

  Example 1: Training models with weights merge on CPU

  ```python
      import tensorflow as tf
      from keras.applications import Xception
      from keras.utils import multi_gpu_model
      import numpy as np

      num_samples = 1000
      height = 224
      width = 224
      num_classes = 1000

      # Instantiate the base model (or "template" model).
      # We recommend doing this under a CPU device scope,
      # so that the model's weights are hosted on CPU memory.
      # Otherwise they may end up hosted on a GPU, which would
      # complicate weight sharing.
      with tf.device('/cpu:0'):
          model = Xception(weights=None,
                           input_shape=(height, width, 3),
                           classes=num_classes)

      # Replicate the model on 8 GPUs.
      # This assumes that your machine has 8 available GPUs.
      parallel_model = multi_gpu_model(model, gpus=8)
      parallel_model.compile(loss='categorical_crossentropy',
                             optimizer='rmsprop')

      # Generate dummy data.
      x = np.random.random((num_samples, height, width, 3))
      y = np.random.random((num_samples, num_classes))

      # This `fit` call will be distributed on 8 GPUs.
      # Since the batch size is 256, each GPU will process 32 samples.
      parallel_model.fit(x, y, epochs=20, batch_size=256)

      # Save the model via the template model (which shares the same weights):
      model.save('my_model.h5')
  ```

  Example 2: Training models with weights merge on CPU using cpu_relocation

  ```python
       ..
       # No need to change the device scope for model definition:
       model = Xception(weights=None, ..)

       try:
           model = multi_gpu_model(model, cpu_relocation=True)
           print("Training using multiple GPUs..")
       except ValueError:
           print("Training using single GPU or CPU..")

       model.compile(..)
       ..
  ```

  Example 3: Training models with weights merge on GPU (recommended for NVLink)

  ```python
       ..
       # No need to change the device scope for model definition:
       model = Xception(weights=None, ..)

       try:
           model = multi_gpu_model(model, cpu_merge=False)
           print("Training using multiple GPUs..")
       except ValueError:
           print("Training using single GPU or CPU..")
       model.compile(..)
       ..
  ```

  Raises:
    ValueError: if the `gpus` argument does not match available devices.
  """
  if isinstance(gpus, (list, tuple)):
    if len(gpus) <= 1:
      raise ValueError('For multi-gpu usage to be effective, '
                       'call `multi_gpu_model` with `len(gpus) >= 2`. '
                       'Received: `gpus=%s`' % gpus)
    num_gpus = len(gpus)
    target_gpu_ids = gpus
  else:
    if gpus <= 1:
      raise ValueError('For multi-gpu usage to be effective, '
                       'call `multi_gpu_model` with `gpus >= 2`. '
                       'Received: `gpus=%s`' % gpus)
    num_gpus = gpus
    target_gpu_ids = range(num_gpus)

  target_devices = ['/cpu:0'] + ['/gpu:%d' % i for i in target_gpu_ids]
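  # Illustrative example: `gpus=2` expands to
  # `target_devices == ['/cpu:0', '/gpu:0', '/gpu:1']`.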
  available_devices = _get_available_devices()
  available_devices = [
      _normalize_device_name(name) for name in available_devices
  ]
  for device in target_devices:
    if device not in available_devices:
      raise ValueError('To call `multi_gpu_model` with `gpus=%s`, '
                       'we expect the following devices to be available: %s. '
                       'However this machine only has: %s. '
                       'Try reducing `gpus`.' % (gpus, target_devices,
                                                 available_devices))

  def get_slice(data, i, parts):
    """Slice an array into `parts` slices and return slice `i`.

    Args:
      data: array to slice.
      i: index of slice to return.
      parts: number of slices to make.

    Returns:
      Slice `i` of `data`.
    """
    shape = array_ops.shape(data)
    batch_size = shape[:1]
    input_shape = shape[1:]
    # Each slice gets `batch_size // parts` samples; the last slice also
    # absorbs the remainder when the batch does not divide evenly.
    step = batch_size // parts
    if i == parts - 1:
      size = batch_size - step * i
    else:
      size = step
    size = array_ops.concat([size, input_shape], axis=0)
    # The start offset advances by `step` along the batch axis and stays at
    # zero on every other dimension.
    stride = array_ops.concat([step, input_shape * 0], axis=0)
    start = stride * i
    return array_ops.slice(data, start, size)
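  # Illustrative check of the slicing arithmetic: with a batch of 10 samples
  # and `parts=3`, `step` is 3, so the replicas receive slices of 3, 3 and 4
  # samples respectively.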

  # Relocate the model definition under CPU device scope if needed
  if cpu_relocation:
    from tensorflow.python.keras.models import clone_model  # pylint: disable=g-import-not-at-top
    with ops.device('/cpu:0'):
      model = clone_model(model)

  all_outputs = [[] for _ in range(len(model.outputs))]

  # Place a copy of the model on each GPU,
  # each getting a slice of the inputs.
  for i, gpu_id in enumerate(target_gpu_ids):
    with ops.device('/gpu:%d' % gpu_id):
      with backend.name_scope('replica_%d' % gpu_id):
        inputs = []
        # Retrieve a slice of the input.
        for x in model.inputs:
          input_shape = tuple(x.shape.as_list())[1:]
          slice_i = Lambda(
              get_slice,
              output_shape=input_shape,
              arguments={'i': i, 'parts': num_gpus})(x)
          inputs.append(slice_i)

        # Apply model on slice
        # (creating a model replica on the target device).
        outputs = model(inputs)
        if not isinstance(outputs, list):
          outputs = [outputs]

        # Save the outputs for merging back together later.
        for o, output in enumerate(outputs):
          all_outputs[o].append(output)
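
  # At this point `all_outputs[o]` holds one output tensor per replica for
  # model output `o`; these are concatenated along the batch axis below.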

  # Deduplicate output names to handle Siamese networks.
  occurrences = {}
  for n in model.output_names:
    if n not in occurrences:
      occurrences[n] = 1
    else:
      occurrences[n] += 1
  conflict_counter = {n: 0 for n, count in occurrences.items() if count > 1}
  output_names = []
  for n in model.output_names:
    if n in conflict_counter:
      conflict_counter[n] += 1
      n += '_%d' % conflict_counter[n]
    output_names.append(n)
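  # Illustrative example: a Siamese model with `output_names == ['out', 'out']`
  # produces merged layer names ['out_1', 'out_2'].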

  # Merge outputs under expected scope.
  with ops.device('/cpu:0' if cpu_merge else '/gpu:%d' % target_gpu_ids[0]):
    merged = []
    for name, outputs in zip(output_names, all_outputs):
      merged.append(concatenate(outputs, axis=0, name=name))
    return Model(model.inputs, merged)