# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for multi-GPU training."""

from tensorflow.python.framework import ops
from tensorflow.python.keras import backend
from tensorflow.python.keras.engine.training import Model
from tensorflow.python.keras.layers.core import Lambda
from tensorflow.python.keras.layers.merge import concatenate
from tensorflow.python.ops import array_ops


def _get_available_devices():
  return [x.name for x in backend.get_session().list_devices()]


def _normalize_device_name(name):
  name = '/' + name.lower().split('device:')[1]
  return name


def multi_gpu_model(model, gpus, cpu_merge=True, cpu_relocation=False):
  """Replicates a model on different GPUs.

  Specifically, this function implements single-machine
  multi-GPU data parallelism. It works in the following way:

  - Divide the model's input(s) into multiple sub-batches.
  - Apply a model copy on each sub-batch. Every model copy
    is executed on a dedicated GPU.
  - Concatenate the results (on CPU) into one big batch.

  E.g. if your `batch_size` is 64 and you use `gpus=2`,
  then we will divide the input into 2 sub-batches of 32 samples,
  process each sub-batch on one GPU, then return the full
  batch of 64 processed samples.

  This induces quasi-linear speedup on up to 8 GPUs.

  This function is only available with the TensorFlow backend
  for the time being.

  Args:
    model: A Keras model instance. To avoid OOM errors,
      this model could have been built on CPU, for instance
      (see usage example below).
    gpus: Integer >= 2, number of GPUs on which to create
      model replicas.
    cpu_merge: A boolean value to identify whether to force
      merging model weights under the scope of the CPU or not.
    cpu_relocation: A boolean value to identify whether to
      create the model's weights under the scope of the CPU.
      If the model is not defined under any preceding device
      scope, you can still rescue it by activating this option.

  Returns:
    A Keras `Model` instance which can be used just like the initial
    `model` argument, but which distributes its workload on multiple GPUs.

  Example 1: Training models with weights merge on CPU

  ```python
  import tensorflow as tf
  from keras.applications import Xception
  from keras.utils import multi_gpu_model
  import numpy as np

  num_samples = 1000
  height = 224
  width = 224
  num_classes = 1000

  # Instantiate the base model (or "template" model).
  # We recommend doing this under a CPU device scope,
  # so that the model's weights are hosted on CPU memory.
  # Otherwise they may end up hosted on a GPU, which would
  # complicate weight sharing.
  with tf.device('/cpu:0'):
    model = Xception(weights=None,
                     input_shape=(height, width, 3),
                     classes=num_classes)

  # Replicates the model on 8 GPUs.
  # This assumes that your machine has 8 available GPUs.
  parallel_model = multi_gpu_model(model, gpus=8)
  parallel_model.compile(loss='categorical_crossentropy',
                         optimizer='rmsprop')

  # Generate dummy data.
  x = np.random.random((num_samples, height, width, 3))
  y = np.random.random((num_samples, num_classes))

  # This `fit` call will be distributed on 8 GPUs.
  # Since the batch size is 256, each GPU will process 32 samples.
  parallel_model.fit(x, y, epochs=20, batch_size=256)

  # Save model via the template model (which shares the same weights):
  model.save('my_model.h5')
  ```

  Example 2: Training models with weights merge on CPU using cpu_relocation

  ```python
  ..
  # No need to change the device scope for model definition:
  model = Xception(weights=None, ..)

  try:
    model = multi_gpu_model(model, cpu_relocation=True)
    print("Training using multiple GPUs..")
  except ValueError:
    print("Training using single GPU or CPU..")

  model.compile(..)
  ..
  ```

  Example 3: Training models with weights merge on GPU (recommended for NVLink)

  ```python
  ..
  # No need to change the device scope for model definition:
  model = Xception(weights=None, ..)

  try:
    model = multi_gpu_model(model, cpu_merge=False)
    print("Training using multiple GPUs..")
  except ValueError:
    print("Training using single GPU or CPU..")

  model.compile(..)
  ..
  ```

  Raises:
    ValueError: if the `gpus` argument does not match available devices.
  """
  if isinstance(gpus, (list, tuple)):
    if len(gpus) <= 1:
      raise ValueError('For multi-gpu usage to be effective, '
                       'call `multi_gpu_model` with `len(gpus) >= 2`. '
                       'Received: `gpus=%s`' % gpus)
    num_gpus = len(gpus)
    target_gpu_ids = gpus
  else:
    if gpus <= 1:
      raise ValueError('For multi-gpu usage to be effective, '
                       'call `multi_gpu_model` with `gpus >= 2`. '
                       'Received: `gpus=%s`' % gpus)
    num_gpus = gpus
    target_gpu_ids = range(num_gpus)

  target_devices = ['/cpu:0'] + ['/gpu:%d' % i for i in target_gpu_ids]
  available_devices = _get_available_devices()
  available_devices = [
      _normalize_device_name(name) for name in available_devices
  ]
  for device in target_devices:
    if device not in available_devices:
      raise ValueError('To call `multi_gpu_model` with `gpus=%s`, '
                       'we expect the following devices to be available: %s. '
                       'However this machine only has: %s. '
                       'Try reducing `gpus`.' % (gpus, target_devices,
                                                 available_devices))

  def get_slice(data, i, parts):
    """Slice an array into `parts` slices and return slice `i`.

    Args:
      data: array to slice.
      i: index of slice to return.
      parts: number of slices to make.

    Returns:
      Slice `i` of `data`.
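      For example, slicing a batch of 10 samples into `parts=3` yields
      slices of sizes 3, 3 and 4: `step = 10 // 3 = 3`, and the last
      slice absorbs the remainder.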
187 """ 188 shape = array_ops.shape(data) 189 batch_size = shape[:1] 190 input_shape = shape[1:] 191 step = batch_size // parts 192 if i == parts - 1: 193 size = batch_size - step * i 194 else: 195 size = step 196 size = array_ops.concat([size, input_shape], axis=0) 197 stride = array_ops.concat([step, input_shape * 0], axis=0) 198 start = stride * i 199 return array_ops.slice(data, start, size) 200 201 # Relocate the model definition under CPU device scope if needed 202 if cpu_relocation: 203 from tensorflow.python.keras.models import clone_model # pylint: disable=g-import-not-at-top 204 with ops.device('/cpu:0'): 205 model = clone_model(model) 206 207 all_outputs = [[] for _ in range(len(model.outputs))] 208 209 # Place a copy of the model on each GPU, 210 # each getting a slice of the inputs. 211 for i, gpu_id in enumerate(target_gpu_ids): 212 with ops.device('/gpu:%d' % gpu_id): 213 with backend.name_scope('replica_%d' % gpu_id): 214 inputs = [] 215 # Retrieve a slice of the input. 216 for x in model.inputs: 217 input_shape = tuple(x.shape.as_list())[1:] 218 slice_i = Lambda( 219 get_slice, 220 output_shape=input_shape, 221 arguments={ 222 'i': i, 223 'parts': num_gpus 224 })( 225 x) 226 inputs.append(slice_i) 227 228 # Apply model on slice 229 # (creating a model replica on the target device). 230 outputs = model(inputs) 231 if not isinstance(outputs, list): 232 outputs = [outputs] 233 234 # Save the outputs for merging back together later. 235 for o, output in enumerate(outputs): 236 all_outputs[o].append(output) 237 238 # Deduplicate output names to handle Siamese networks. 239 occurrences = {} 240 for n in model.output_names: 241 if n not in occurrences: 242 occurrences[n] = 1 243 else: 244 occurrences[n] += 1 245 conflict_counter = {n: 0 for n, count in occurrences.items() if count > 1} 246 output_names = [] 247 for n in model.output_names: 248 if n in conflict_counter: 249 conflict_counter[n] += 1 250 n += '_%d' % conflict_counter[n] 251 output_names.append(n) 252 253 # Merge outputs under expected scope. 254 with ops.device('/cpu:0' if cpu_merge else '/gpu:%d' % target_gpu_ids[0]): 255 merged = [] 256 for name, outputs in zip(output_names, all_outputs): 257 merged.append(concatenate(outputs, axis=0, name=name)) 258 return Model(model.inputs, merged) 259