1# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15"""Benchmarks for autotuning performance knobs.""" 16from __future__ import absolute_import 17from __future__ import division 18from __future__ import print_function 19 20 21import numpy as np 22 23from tensorflow.python.data.benchmarks import benchmark_base 24from tensorflow.python.data.ops import dataset_ops 25from tensorflow.python.ops import math_ops 26 27 28class AutotuneBenchmark(benchmark_base.DatasetBenchmarkBase): 29 """Benchmarks for autotuning performance knobs.""" 30 31 def _run_benchmark(self, dataset, autotune, autotune_buffers, 32 benchmark_iters, benchmark_label): 33 options = dataset_ops.Options() 34 options.experimental_optimization.apply_default_optimizations = False 35 options.experimental_optimization.autotune = autotune 36 options.experimental_optimization.autotune_buffers = autotune_buffers 37 dataset = dataset.with_options(options) 38 39 autotune_string = "_autotune_{}".format( 40 "parallelism_and_buffer_sizes" 41 if autotune_buffers else "parallelism_only") 42 wall_time = self.run_and_report_benchmark( 43 dataset=dataset, 44 num_elements=benchmark_iters, 45 warmup=True, 46 iters=1, 47 name=benchmark_label + (autotune_string if autotune else "")) 48 return wall_time 49 50 def benchmark_batch(self): 51 a = self._benchmark_batch(autotune=False) 52 b = self._benchmark_batch(autotune=True, autotune_buffers=False) 53 c = self._benchmark_batch(autotune=True, autotune_buffers=True) 54 print("autotune parallelism vs no autotuning speedup: {}".format(a / b)) 55 print("autotune parallelism and buffer sizes vs no autotuning speedup: {}" 56 .format(a / c)) 57 58 def _benchmark_batch(self, autotune, autotune_buffers=False): 59 batch_size = 128 60 k = 1024 61 dataset = dataset_ops.Dataset.from_tensors( 62 (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))).repeat() 63 dataset = dataset.map(math_ops.matmul) 64 dataset = dataset.batch( 65 batch_size=batch_size, num_parallel_calls=dataset_ops.AUTOTUNE) 66 return self._run_benchmark( 67 dataset, 68 autotune, 69 autotune_buffers, 70 benchmark_iters=10000, 71 benchmark_label="batch") 72 73 def benchmark_map(self): 74 a = self._benchmark_map(autotune=False) 75 b = self._benchmark_map(autotune=True, autotune_buffers=False) 76 c = self._benchmark_map(autotune=True, autotune_buffers=True) 77 print("autotune parallelism vs no autotuning speedup: {}".format(a / b)) 78 print("autotune parallelism and buffer sizes vs no autotuning speedup: {}" 79 .format(a / c)) 80 81 def _benchmark_map(self, autotune, autotune_buffers=False): 82 k = 1024 * 1024 83 dataset = dataset_ops.Dataset.from_tensors( 84 (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))).repeat() 85 dataset = dataset.map( 86 math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE) 87 return self._run_benchmark( 88 dataset=dataset, 89 autotune=autotune, 90 autotune_buffers=autotune_buffers, 91 benchmark_iters=10000, 92 benchmark_label="map") 93 94 def benchmark_map_and_batch(self): 95 a = self._benchmark_map_and_batch(autotune=False) 96 b = self._benchmark_map_and_batch(autotune=True, autotune_buffers=False) 97 c = self._benchmark_map_and_batch(autotune=True, autotune_buffers=True) 98 print("autotune parallelism vs no autotuning speedup: {}".format(a / b)) 99 print("autotune parallelism and buffer sizes vs no autotuning speedup: {}" 100 .format(a / c)) 101 102 def _benchmark_map_and_batch(self, autotune, autotune_buffers=False): 103 batch_size = 16 104 k = 1024 * 1024 105 dataset = dataset_ops.Dataset.from_tensors( 106 (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))).repeat() 107 dataset = dataset.map( 108 math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE) 109 dataset = dataset.batch(batch_size=batch_size) 110 return self._run_benchmark( 111 dataset=dataset, 112 autotune=autotune, 113 autotune_buffers=autotune_buffers, 114 benchmark_iters=1000, 115 benchmark_label="map_and_batch") 116 117 def benchmark_interleave(self): 118 a = self._benchmark_interleave(autotune=False) 119 b = self._benchmark_interleave(autotune=True, autotune_buffers=False) 120 c = self._benchmark_interleave(autotune=True, autotune_buffers=True) 121 print("autotune parallelism vs no autotuning speedup: {}".format(a / b)) 122 print("autotune parallelism and buffer sizes vs no autotuning speedup: {}" 123 .format(a / c)) 124 125 def _benchmark_interleave(self, autotune, autotune_buffers=False): 126 k = 1024 * 1024 127 dataset = dataset_ops.Dataset.from_tensors( 128 (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))).repeat() 129 dataset = dataset.map(math_ops.matmul) 130 dataset = dataset_ops.Dataset.range(1).repeat().interleave( 131 lambda _: dataset, 132 cycle_length=10, 133 num_parallel_calls=dataset_ops.AUTOTUNE) 134 return self._run_benchmark( 135 dataset=dataset, 136 autotune=autotune, 137 autotune_buffers=autotune_buffers, 138 benchmark_iters=10000, 139 benchmark_label="interleave") 140 141 def benchmark_map_and_interleave(self): 142 a = self._benchmark_map_and_interleave(autotune=False) 143 b = self._benchmark_map_and_interleave( 144 autotune=True, autotune_buffers=False) 145 c = self._benchmark_map_and_interleave(autotune=True, autotune_buffers=True) 146 print("autotune parallelism vs no autotuning speedup: {}".format(a / b)) 147 print("autotune parallelism and buffer sizes vs no autotuning speedup: {}" 148 .format(a / c)) 149 150 def _benchmark_map_and_interleave(self, autotune, autotune_buffers=False): 151 k = 1024 * 1024 152 a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1)) 153 b = (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1)) 154 c = (np.random.rand(1, 2 * k), np.random.rand(2 * k, 1)) 155 dataset_a = dataset_ops.Dataset.from_tensors(a).repeat() 156 dataset_b = dataset_ops.Dataset.from_tensors(b).repeat() 157 dataset_c = dataset_ops.Dataset.from_tensors(c).repeat() 158 159 def f1(x, y): 160 return math_ops.matmul(x, y) 161 162 def f2(a, b): 163 x, y = b 164 return a, math_ops.matmul(x, y) 165 166 dataset = dataset_a 167 dataset = dataset.map(f1, num_parallel_calls=dataset_ops.AUTOTUNE) 168 dataset = dataset_ops.Dataset.range(1).repeat().interleave( 169 lambda _: dataset, 170 num_parallel_calls=dataset_ops.AUTOTUNE, 171 cycle_length=2) 172 173 dataset = dataset_ops.Dataset.zip((dataset, dataset_b)) 174 dataset = dataset.map(f2, num_parallel_calls=dataset_ops.AUTOTUNE) 175 dataset = dataset_ops.Dataset.range(1).repeat().interleave( 176 lambda _: dataset, 177 num_parallel_calls=dataset_ops.AUTOTUNE, 178 cycle_length=2) 179 180 dataset = dataset_ops.Dataset.zip((dataset, dataset_c)) 181 dataset = dataset.map(f2, num_parallel_calls=dataset_ops.AUTOTUNE) 182 return self._run_benchmark( 183 dataset=dataset, 184 autotune=autotune, 185 autotune_buffers=autotune_buffers, 186 benchmark_iters=10000, 187 benchmark_label="map_and_interleave") 188 189 def benchmark_map_batch_and_interleave(self): 190 a = self._benchmark_map_batch_and_interleave(autotune=False) 191 b = self._benchmark_map_batch_and_interleave( 192 autotune=True, autotune_buffers=False) 193 c = self._benchmark_map_batch_and_interleave( 194 autotune=True, autotune_buffers=True) 195 print("autotune parallelism vs no autotuning speedup: {}".format(a / b)) 196 print("autotune parallelism and buffer sizes vs no autotuning speedup: {}" 197 .format(a / c)) 198 199 def _benchmark_map_batch_and_interleave(self, 200 autotune, 201 autotune_buffers=False): 202 batch_size = 16 203 k = 1024 * 1024 204 a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1)) 205 b = (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1)) 206 c = (np.random.rand(1, 2 * k), np.random.rand(2 * k, 1)) 207 dataset_a = dataset_ops.Dataset.from_tensors(a).repeat() 208 dataset_b = dataset_ops.Dataset.from_tensors(b).repeat() 209 dataset_c = dataset_ops.Dataset.from_tensors(c).repeat() 210 211 dataset = dataset_a 212 dataset = dataset.map( 213 math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE) 214 dataset = dataset.batch(batch_size=batch_size) 215 dataset = dataset_ops.Dataset.range(1).repeat().interleave( 216 lambda _: dataset, 217 num_parallel_calls=dataset_ops.AUTOTUNE, 218 cycle_length=2) 219 220 dataset = dataset_ops.Dataset.zip((dataset, dataset_b)) 221 dataset = dataset_ops.Dataset.range(1).repeat().interleave( 222 lambda _: dataset, 223 num_parallel_calls=dataset_ops.AUTOTUNE, 224 cycle_length=2) 225 226 dataset_c = dataset_c.map( 227 math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE) 228 dataset_c = dataset_c.batch(batch_size=batch_size) 229 dataset = dataset_ops.Dataset.zip((dataset, dataset_c)) 230 return self._run_benchmark( 231 dataset=dataset, 232 autotune=autotune, 233 autotune_buffers=autotune_buffers, 234 benchmark_iters=1000, 235 benchmark_label="map_batch_and_interleave") 236 237 238if __name__ == "__main__": 239 benchmark_base.test.main() 240