1# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14# ============================================================================== 15# pylint: disable=invalid-name 16"""MobileNet v1 models for Keras. 17 18MobileNet is a general architecture and can be used for multiple use cases. 19Depending on the use case, it can use different input layer size and 20different width factors. This allows different width models to reduce 21the number of multiply-adds and thereby 22reduce inference cost on mobile devices. 23 24MobileNets support any input size greater than 32 x 32, with larger image sizes 25offering better performance. 26The number of parameters and number of multiply-adds 27can be modified by using the `alpha` parameter, 28which increases/decreases the number of filters in each layer. 29By altering the image size and `alpha` parameter, 30all 16 models from the paper can be built, with ImageNet weights provided. 31 32The paper demonstrates the performance of MobileNets using `alpha` values of 331.0 (also called 100 % MobileNet), 0.75, 0.5 and 0.25. 34For each of these `alpha` values, weights for 4 different input image sizes 35are provided (224, 192, 160, 128). 36 37The following table describes the size and accuracy of the 100% MobileNet 38on size 224 x 224: 39---------------------------------------------------------------------------- 40Width Multiplier (alpha) | ImageNet Acc | Multiply-Adds (M) | Params (M) 41---------------------------------------------------------------------------- 42| 1.0 MobileNet-224 | 70.6 % | 529 | 4.2 | 43| 0.75 MobileNet-224 | 68.4 % | 325 | 2.6 | 44| 0.50 MobileNet-224 | 63.7 % | 149 | 1.3 | 45| 0.25 MobileNet-224 | 50.6 % | 41 | 0.5 | 46---------------------------------------------------------------------------- 47 48The following table describes the performance of 49the 100 % MobileNet on various input sizes: 50------------------------------------------------------------------------ 51 Resolution | ImageNet Acc | Multiply-Adds (M) | Params (M) 52------------------------------------------------------------------------ 53| 1.0 MobileNet-224 | 70.6 % | 529 | 4.2 | 54| 1.0 MobileNet-192 | 69.1 % | 529 | 4.2 | 55| 1.0 MobileNet-160 | 67.2 % | 529 | 4.2 | 56| 1.0 MobileNet-128 | 64.4 % | 529 | 4.2 | 57------------------------------------------------------------------------ 58 59Reference paper: 60 - [MobileNets: Efficient Convolutional Neural Networks for 61 Mobile Vision Applications](https://arxiv.org/abs/1704.04861) 62""" 63from __future__ import absolute_import 64from __future__ import division 65from __future__ import print_function 66 67import os 68 69from tensorflow.python.keras import backend 70from tensorflow.python.keras import layers 71from tensorflow.python.keras.applications import imagenet_utils 72from tensorflow.python.keras.engine import training 73from tensorflow.python.keras.utils import data_utils 74from tensorflow.python.keras.utils import layer_utils 75from tensorflow.python.platform import tf_logging as logging 76from tensorflow.python.util.tf_export import keras_export 77 78BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/' 79 'keras-applications/mobilenet/') 80 81 82@keras_export('keras.applications.mobilenet.MobileNet', 83 'keras.applications.MobileNet') 84def MobileNet(input_shape=None, 85 alpha=1.0, 86 depth_multiplier=1, 87 dropout=1e-3, 88 include_top=True, 89 weights='imagenet', 90 input_tensor=None, 91 pooling=None, 92 classes=1000, 93 **kwargs): 94 """Instantiates the MobileNet architecture. 95 96 Reference paper: 97 - [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision 98 Applications](https://arxiv.org/abs/1704.04861) 99 100 Optionally loads weights pre-trained on ImageNet. 101 Note that the data format convention used by the model is 102 the one specified in the `tf.keras.backend.image_data_format()`. 103 104 Arguments: 105 input_shape: Optional shape tuple, only to be specified if `include_top` 106 is False (otherwise the input shape has to be `(224, 224, 3)` (with 107 `channels_last` data format) or (3, 224, 224) (with `channels_first` 108 data format). It should have exactly 3 inputs channels, and width and 109 height should be no smaller than 32. E.g. `(200, 200, 3)` would be one 110 valid value. Default to `None`. 111 `input_shape` will be ignored if the `input_tensor` is provided. 112 alpha: Controls the width of the network. This is known as the width 113 multiplier in the MobileNet paper. - If `alpha` < 1.0, proportionally 114 decreases the number of filters in each layer. - If `alpha` > 1.0, 115 proportionally increases the number of filters in each layer. - If 116 `alpha` = 1, default number of filters from the paper are used at each 117 layer. Default to 1.0. 118 depth_multiplier: Depth multiplier for depthwise convolution. This is 119 called the resolution multiplier in the MobileNet paper. Default to 1.0. 120 dropout: Dropout rate. Default to 0.001. 121 include_top: Boolean, whether to include the fully-connected layer at the 122 top of the network. Default to `True`. 123 weights: One of `None` (random initialization), 'imagenet' (pre-training 124 on ImageNet), or the path to the weights file to be loaded. Default to 125 `imagenet`. 126 input_tensor: Optional Keras tensor (i.e. output of `layers.Input()`) to 127 use as image input for the model. `input_tensor` is useful for sharing 128 inputs between multiple different networks. Default to None. 129 pooling: Optional pooling mode for feature extraction when `include_top` 130 is `False`. 131 - `None` (default) means that the output of the model will be 132 the 4D tensor output of the last convolutional block. 133 - `avg` means that global average pooling 134 will be applied to the output of the 135 last convolutional block, and thus 136 the output of the model will be a 2D tensor. 137 - `max` means that global max pooling will be applied. 138 classes: Optional number of classes to classify images into, only to be 139 specified if `include_top` is True, and if no `weights` argument is 140 specified. Defaults to 1000. 141 **kwargs: For backwards compatibility only. 142 143 Returns: 144 A `tf.keras.Model` instance. 145 146 Raises: 147 ValueError: in case of invalid argument for `weights`, 148 or invalid input shape. 149 """ 150 if 'layers' in kwargs: 151 global layers 152 layers = kwargs.pop('layers') 153 if kwargs: 154 raise ValueError('Unknown argument(s): %s' % (kwargs,)) 155 if not (weights in {'imagenet', None} or os.path.exists(weights)): 156 raise ValueError('The `weights` argument should be either ' 157 '`None` (random initialization), `imagenet` ' 158 '(pre-training on ImageNet), ' 159 'or the path to the weights file to be loaded.') 160 161 if weights == 'imagenet' and include_top and classes != 1000: 162 raise ValueError('If using `weights` as `"imagenet"` with `include_top` ' 163 'as true, `classes` should be 1000') 164 165 # Determine proper input shape and default size. 166 if input_shape is None: 167 default_size = 224 168 else: 169 if backend.image_data_format() == 'channels_first': 170 rows = input_shape[1] 171 cols = input_shape[2] 172 else: 173 rows = input_shape[0] 174 cols = input_shape[1] 175 176 if rows == cols and rows in [128, 160, 192, 224]: 177 default_size = rows 178 else: 179 default_size = 224 180 181 input_shape = imagenet_utils.obtain_input_shape( 182 input_shape, 183 default_size=default_size, 184 min_size=32, 185 data_format=backend.image_data_format(), 186 require_flatten=include_top, 187 weights=weights) 188 189 if backend.image_data_format() == 'channels_last': 190 row_axis, col_axis = (0, 1) 191 else: 192 row_axis, col_axis = (1, 2) 193 rows = input_shape[row_axis] 194 cols = input_shape[col_axis] 195 196 if weights == 'imagenet': 197 if depth_multiplier != 1: 198 raise ValueError('If imagenet weights are being loaded, ' 199 'depth multiplier must be 1') 200 201 if alpha not in [0.25, 0.50, 0.75, 1.0]: 202 raise ValueError('If imagenet weights are being loaded, ' 203 'alpha can be one of' 204 '`0.25`, `0.50`, `0.75` or `1.0` only.') 205 206 if rows != cols or rows not in [128, 160, 192, 224]: 207 rows = 224 208 logging.warning('`input_shape` is undefined or non-square, ' 209 'or `rows` is not in [128, 160, 192, 224]. ' 210 'Weights for input shape (224, 224) will be' 211 ' loaded as the default.') 212 213 if input_tensor is None: 214 img_input = layers.Input(shape=input_shape) 215 else: 216 if not backend.is_keras_tensor(input_tensor): 217 img_input = layers.Input(tensor=input_tensor, shape=input_shape) 218 else: 219 img_input = input_tensor 220 221 x = _conv_block(img_input, 32, alpha, strides=(2, 2)) 222 x = _depthwise_conv_block(x, 64, alpha, depth_multiplier, block_id=1) 223 224 x = _depthwise_conv_block( 225 x, 128, alpha, depth_multiplier, strides=(2, 2), block_id=2) 226 x = _depthwise_conv_block(x, 128, alpha, depth_multiplier, block_id=3) 227 228 x = _depthwise_conv_block( 229 x, 256, alpha, depth_multiplier, strides=(2, 2), block_id=4) 230 x = _depthwise_conv_block(x, 256, alpha, depth_multiplier, block_id=5) 231 232 x = _depthwise_conv_block( 233 x, 512, alpha, depth_multiplier, strides=(2, 2), block_id=6) 234 x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=7) 235 x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=8) 236 x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=9) 237 x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=10) 238 x = _depthwise_conv_block(x, 512, alpha, depth_multiplier, block_id=11) 239 240 x = _depthwise_conv_block( 241 x, 1024, alpha, depth_multiplier, strides=(2, 2), block_id=12) 242 x = _depthwise_conv_block(x, 1024, alpha, depth_multiplier, block_id=13) 243 244 if include_top: 245 if backend.image_data_format() == 'channels_first': 246 shape = (int(1024 * alpha), 1, 1) 247 else: 248 shape = (1, 1, int(1024 * alpha)) 249 250 x = layers.GlobalAveragePooling2D()(x) 251 x = layers.Reshape(shape, name='reshape_1')(x) 252 x = layers.Dropout(dropout, name='dropout')(x) 253 x = layers.Conv2D(classes, (1, 1), padding='same', name='conv_preds')(x) 254 x = layers.Reshape((classes,), name='reshape_2')(x) 255 x = layers.Activation('softmax', name='act_softmax')(x) 256 else: 257 if pooling == 'avg': 258 x = layers.GlobalAveragePooling2D()(x) 259 elif pooling == 'max': 260 x = layers.GlobalMaxPooling2D()(x) 261 262 # Ensure that the model takes into account 263 # any potential predecessors of `input_tensor`. 264 if input_tensor is not None: 265 inputs = layer_utils.get_source_inputs(input_tensor) 266 else: 267 inputs = img_input 268 269 # Create model. 270 model = training.Model(inputs, x, name='mobilenet_%0.2f_%s' % (alpha, rows)) 271 272 # Load weights. 273 if weights == 'imagenet': 274 if alpha == 1.0: 275 alpha_text = '1_0' 276 elif alpha == 0.75: 277 alpha_text = '7_5' 278 elif alpha == 0.50: 279 alpha_text = '5_0' 280 else: 281 alpha_text = '2_5' 282 283 if include_top: 284 model_name = 'mobilenet_%s_%d_tf.h5' % (alpha_text, rows) 285 weight_path = BASE_WEIGHT_PATH + model_name 286 weights_path = data_utils.get_file( 287 model_name, weight_path, cache_subdir='models') 288 else: 289 model_name = 'mobilenet_%s_%d_tf_no_top.h5' % (alpha_text, rows) 290 weight_path = BASE_WEIGHT_PATH + model_name 291 weights_path = data_utils.get_file( 292 model_name, weight_path, cache_subdir='models') 293 model.load_weights(weights_path) 294 elif weights is not None: 295 model.load_weights(weights) 296 297 return model 298 299 300def _conv_block(inputs, filters, alpha, kernel=(3, 3), strides=(1, 1)): 301 """Adds an initial convolution layer (with batch normalization and relu6). 302 303 Arguments: 304 inputs: Input tensor of shape `(rows, cols, 3)` (with `channels_last` 305 data format) or (3, rows, cols) (with `channels_first` data format). 306 It should have exactly 3 inputs channels, and width and height should 307 be no smaller than 32. E.g. `(224, 224, 3)` would be one valid value. 308 filters: Integer, the dimensionality of the output space (i.e. the 309 number of output filters in the convolution). 310 alpha: controls the width of the network. - If `alpha` < 1.0, 311 proportionally decreases the number of filters in each layer. - If 312 `alpha` > 1.0, proportionally increases the number of filters in each 313 layer. - If `alpha` = 1, default number of filters from the paper are 314 used at each layer. 315 kernel: An integer or tuple/list of 2 integers, specifying the width and 316 height of the 2D convolution window. Can be a single integer to 317 specify the same value for all spatial dimensions. 318 strides: An integer or tuple/list of 2 integers, specifying the strides 319 of the convolution along the width and height. Can be a single integer 320 to specify the same value for all spatial dimensions. Specifying any 321 stride value != 1 is incompatible with specifying any `dilation_rate` 322 value != 1. # Input shape 323 4D tensor with shape: `(samples, channels, rows, cols)` if 324 data_format='channels_first' 325 or 4D tensor with shape: `(samples, rows, cols, channels)` if 326 data_format='channels_last'. # Output shape 327 4D tensor with shape: `(samples, filters, new_rows, new_cols)` if 328 data_format='channels_first' 329 or 4D tensor with shape: `(samples, new_rows, new_cols, filters)` if 330 data_format='channels_last'. `rows` and `cols` values might have 331 changed due to stride. 332 333 Returns: 334 Output tensor of block. 335 """ 336 channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1 337 filters = int(filters * alpha) 338 x = layers.ZeroPadding2D(padding=((0, 1), (0, 1)), name='conv1_pad')(inputs) 339 x = layers.Conv2D( 340 filters, 341 kernel, 342 padding='valid', 343 use_bias=False, 344 strides=strides, 345 name='conv1')( 346 x) 347 x = layers.BatchNormalization(axis=channel_axis, name='conv1_bn')(x) 348 return layers.ReLU(6., name='conv1_relu')(x) 349 350 351def _depthwise_conv_block(inputs, 352 pointwise_conv_filters, 353 alpha, 354 depth_multiplier=1, 355 strides=(1, 1), 356 block_id=1): 357 """Adds a depthwise convolution block. 358 359 A depthwise convolution block consists of a depthwise conv, 360 batch normalization, relu6, pointwise convolution, 361 batch normalization and relu6 activation. 362 363 Arguments: 364 inputs: Input tensor of shape `(rows, cols, channels)` (with 365 `channels_last` data format) or (channels, rows, cols) (with 366 `channels_first` data format). 367 pointwise_conv_filters: Integer, the dimensionality of the output space 368 (i.e. the number of output filters in the pointwise convolution). 369 alpha: controls the width of the network. - If `alpha` < 1.0, 370 proportionally decreases the number of filters in each layer. - If 371 `alpha` > 1.0, proportionally increases the number of filters in each 372 layer. - If `alpha` = 1, default number of filters from the paper are 373 used at each layer. 374 depth_multiplier: The number of depthwise convolution output channels 375 for each input channel. The total number of depthwise convolution 376 output channels will be equal to `filters_in * depth_multiplier`. 377 strides: An integer or tuple/list of 2 integers, specifying the strides 378 of the convolution along the width and height. Can be a single integer 379 to specify the same value for all spatial dimensions. Specifying any 380 stride value != 1 is incompatible with specifying any `dilation_rate` 381 value != 1. 382 block_id: Integer, a unique identification designating the block number. 383 # Input shape 384 4D tensor with shape: `(batch, channels, rows, cols)` if 385 data_format='channels_first' 386 or 4D tensor with shape: `(batch, rows, cols, channels)` if 387 data_format='channels_last'. # Output shape 388 4D tensor with shape: `(batch, filters, new_rows, new_cols)` if 389 data_format='channels_first' 390 or 4D tensor with shape: `(batch, new_rows, new_cols, filters)` if 391 data_format='channels_last'. `rows` and `cols` values might have 392 changed due to stride. 393 394 Returns: 395 Output tensor of block. 396 """ 397 channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1 398 pointwise_conv_filters = int(pointwise_conv_filters * alpha) 399 400 if strides == (1, 1): 401 x = inputs 402 else: 403 x = layers.ZeroPadding2D(((0, 1), (0, 1)), name='conv_pad_%d' % block_id)( 404 inputs) 405 x = layers.DepthwiseConv2D((3, 3), 406 padding='same' if strides == (1, 1) else 'valid', 407 depth_multiplier=depth_multiplier, 408 strides=strides, 409 use_bias=False, 410 name='conv_dw_%d' % block_id)( 411 x) 412 x = layers.BatchNormalization( 413 axis=channel_axis, name='conv_dw_%d_bn' % block_id)( 414 x) 415 x = layers.ReLU(6., name='conv_dw_%d_relu' % block_id)(x) 416 417 x = layers.Conv2D( 418 pointwise_conv_filters, (1, 1), 419 padding='same', 420 use_bias=False, 421 strides=(1, 1), 422 name='conv_pw_%d' % block_id)( 423 x) 424 x = layers.BatchNormalization( 425 axis=channel_axis, name='conv_pw_%d_bn' % block_id)( 426 x) 427 return layers.ReLU(6., name='conv_pw_%d_relu' % block_id)(x) 428 429 430@keras_export('keras.applications.mobilenet.preprocess_input') 431def preprocess_input(x, data_format=None): 432 return imagenet_utils.preprocess_input(x, data_format=data_format, mode='tf') 433 434 435@keras_export('keras.applications.mobilenet.decode_predictions') 436def decode_predictions(preds, top=5): 437 return imagenet_utils.decode_predictions(preds, top=top) 438