• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14# ==============================================================================
15# pylint: disable=invalid-name
16# pylint: disable=missing-function-docstring
17"""MobileNet v3 models for Keras."""
18from __future__ import absolute_import
19from __future__ import division
20from __future__ import print_function
21
22
23from tensorflow.python.keras import backend
24from tensorflow.python.keras import models
25from tensorflow.python.keras.applications import imagenet_utils
26from tensorflow.python.keras.layers import VersionAwareLayers
27from tensorflow.python.keras.utils import data_utils
28from tensorflow.python.keras.utils import layer_utils
29from tensorflow.python.lib.io import file_io
30from tensorflow.python.platform import tf_logging as logging
31from tensorflow.python.util.tf_export import keras_export
32
33
# TODO(scottzhu): Change this to the GCS path.
# URL prefix under which the pre-trained weight files are published; the
# weight file name is appended to it when `weights='imagenet'` is requested.
BASE_WEIGHT_PATH = ('https://storage.googleapis.com/tensorflow/'
                    'keras-applications/mobilenet_v3/')
# File hashes of the published weight files, used to validate downloads.
# Keys follow '{model_type}[_minimalistic]_224_{alpha}_float'.
# Each value is a pair: index 0 is the hash of the full model (with the
# classification top), index 1 is the hash of the `_no_top` variant.
WEIGHTS_HASHES = {
    'large_224_0.75_float': ('765b44a33ad4005b3ac83185abf1d0eb',
                             'e7b4d1071996dd51a2c2ca2424570e20'),
    'large_224_1.0_float': ('59e551e166be033d707958cf9e29a6a7',
                            '037116398e07f018c0005ffcb0406831'),
    'large_minimalistic_224_1.0_float': ('675e7b876c45c57e9e63e6d90a36599c',
                                         'a2c33aed672524d1d0b4431808177695'),
    'small_224_0.75_float': ('cb65d4e5be93758266aa0a7f2c6708b7',
                             '4d2fe46f1c1f38057392514b0df1d673'),
    'small_224_1.0_float': ('8768d4c2e7dee89b9d02b2d03d65d862',
                            'be7100780f875c06bcab93d76641aa26'),
    'small_minimalistic_224_1.0_float': ('99cd97fb2fcdad2bf028eb838de69e37',
                                         '20d4e357df3f7a6361f3a288857b1051'),
}

# Shared accessor through which every Keras layer in this module is created.
layers = VersionAwareLayers()
53
54
55BASE_DOCSTRING = """Instantiates the {name} architecture.
56
57  Reference:
58  - [Searching for MobileNetV3](
59      https://arxiv.org/pdf/1905.02244.pdf) (ICCV 2019)
60
61  The following table describes the performance of MobileNets:
62  ------------------------------------------------------------------------
63  MACs stands for Multiply Adds
64
65  |Classification Checkpoint|MACs(M)|Parameters(M)|Top1 Accuracy|Pixel1 CPU(ms)|
66  |---|---|---|---|---|
67  | mobilenet_v3_large_1.0_224              | 217 | 5.4 |   75.6   |   51.2  |
68  | mobilenet_v3_large_0.75_224             | 155 | 4.0 |   73.3   |   39.8  |
69  | mobilenet_v3_large_minimalistic_1.0_224 | 209 | 3.9 |   72.3   |   44.1  |
70  | mobilenet_v3_small_1.0_224              | 66  | 2.9 |   68.1   |   15.8  |
71  | mobilenet_v3_small_0.75_224             | 44  | 2.4 |   65.4   |   12.8  |
72  | mobilenet_v3_small_minimalistic_1.0_224 | 65  | 2.0 |   61.9   |   12.2  |
73
  The weights for all 6 models are obtained and translated from the
  TensorFlow checkpoints found [here]
  (https://github.com/tensorflow/models/tree/master/research/slim/nets/mobilenet/README.md).
77
78  Optionally loads weights pre-trained on ImageNet.
79
80  Args:
81    input_shape: Optional shape tuple, to be specified if you would
82      like to use a model with an input image resolution that is not
83      (224, 224, 3).
      It should have exactly 3 input channels, as in (224, 224, 3).
85      You can also omit this option if you would like
86      to infer input_shape from an input_tensor.
87      If you choose to include both input_tensor and input_shape then
88      input_shape will be used if they match, if the shapes
89      do not match then we will throw an error.
90      E.g. `(160, 160, 3)` would be one valid value.
91    alpha: controls the width of the network. This is known as the
92      depth multiplier in the MobileNetV3 paper, but the name is kept for
93      consistency with MobileNetV1 in Keras.
94      - If `alpha` < 1.0, proportionally decreases the number
95          of filters in each layer.
96      - If `alpha` > 1.0, proportionally increases the number
97          of filters in each layer.
98      - If `alpha` = 1, default number of filters from the paper
99          are used at each layer.
100    minimalistic: In addition to large and small models this module also
101      contains so-called minimalistic models, these models have the same
102      per-layer dimensions characteristic as MobilenetV3 however, they don't
103      utilize any of the advanced blocks (squeeze-and-excite units, hard-swish,
104      and 5x5 convolutions). While these models are less efficient on CPU, they
105      are much more performant on GPU/DSP.
106    include_top: Boolean, whether to include the fully-connected
107      layer at the top of the network. Defaults to `True`.
108    weights: String, one of `None` (random initialization),
109      'imagenet' (pre-training on ImageNet),
110      or the path to the weights file to be loaded.
111    input_tensor: Optional Keras tensor (i.e. output of
112      `layers.Input()`)
113      to use as image input for the model.
114    pooling: String, optional pooling mode for feature extraction
115      when `include_top` is `False`.
116      - `None` means that the output of the model
117          will be the 4D tensor output of the
118          last convolutional block.
119      - `avg` means that global average pooling
120          will be applied to the output of the
121          last convolutional block, and thus
122          the output of the model will be a
123          2D tensor.
124      - `max` means that global max pooling will
125          be applied.
126    classes: Integer, optional number of classes to classify images
127      into, only to be specified if `include_top` is True, and
128      if no `weights` argument is specified.
129    dropout_rate: fraction of the input units to drop on the last layer.
130    classifier_activation: A `str` or callable. The activation function to use
131      on the "top" layer. Ignored unless `include_top=True`. Set
132      `classifier_activation=None` to return the logits of the "top" layer.
133
134  Call arguments:
135    inputs: A floating point `numpy.array` or a `tf.Tensor`, 4D with 3 color
136      channels, with values in the range [0, 255].
137
138  Returns:
139    A `keras.Model` instance.
140
141  Raises:
142    ValueError: in case of invalid argument for `weights`,
143      or invalid input shape or invalid alpha, rows when
144      weights='imagenet'
145    ValueError: if `classifier_activation` is not `softmax` or `None` when
146      using a pretrained top layer.
147"""
148
149
def MobileNetV3(stack_fn,
                last_point_ch,
                input_shape=None,
                alpha=1.0,
                model_type='large',
                minimalistic=False,
                include_top=True,
                weights='imagenet',
                input_tensor=None,
                classes=1000,
                pooling=None,
                dropout_rate=0.2,
                classifier_activation='softmax'):
  """Builds a MobileNetV3 model around a caller-supplied block stack.

  This is the shared builder behind `MobileNetV3Small` and
  `MobileNetV3Large`: it validates the arguments, creates the stem, delegates
  the inverted-residual blocks to `stack_fn`, appends the head and optionally
  loads weights.

  Args:
    stack_fn: Callable `(x, kernel, activation, se_ratio) -> tensor` that adds
      the inverted-residual blocks between the stem and the head.
    last_point_ch: Number of filters of the final 1x1 `Conv_2` layer. It is
      scaled by `alpha` only when `alpha > 1.0`.
    input_shape: Optional shape tuple without the batch dimension. When
      omitted it is inferred from `input_tensor`, or set to
      `(None, None, 3)` if no `input_tensor` is given either.
    alpha: Width multiplier. Only `0.75` and `1.0` (non-minimalistic) or
      `1.0` (minimalistic) are accepted together with `weights='imagenet'`.
    model_type: `'large'` or `'small'`; used in the model name and in the
      name of the weight file.
    minimalistic: If true, uses 3x3 kernels, ReLU activations and no
      squeeze-and-excite; otherwise 5x5 kernels, hard-swish and an SE ratio
      of 0.25 are passed to `stack_fn`.
    include_top: Whether to append the classification head.
    weights: `None`, `'imagenet'`, or a path to a weights file.
    input_tensor: Optional Keras tensor to use as the model input.
    classes: Number of output classes of the classification head.
    pooling: `None`, `'avg'` or `'max'`; only used when `include_top` is
      false.
    dropout_rate: Dropout rate applied before the `Logits` layer; no dropout
      layer is added when it is not greater than 0.
    classifier_activation: Activation of the `Predictions` layer.

  Returns:
    A `keras.Model` instance.

  Raises:
    ValueError: If `weights` is invalid, if `classes` is not 1000 while
      loading ImageNet weights with the top, if `input_shape` and
      `input_tensor` are inconsistent, if the input is smaller than 32x32,
      or if `alpha` has no matching ImageNet weights.
  """
  if not (weights in {'imagenet', None} or file_io.file_exists_v2(weights)):
    raise ValueError('The `weights` argument should be either '
                     '`None` (random initialization), `imagenet` '
                     '(pre-training on ImageNet), '
                     'or the path to the weights file to be loaded.')

  if weights == 'imagenet' and include_top and classes != 1000:
    raise ValueError('If using `weights` as `"imagenet"` with `include_top` '
                     'as true, `classes` should be 1000')

  # Determine proper input shape and default size.
  # If both input_shape and input_tensor are used, they should match
  if input_shape is not None and input_tensor is not None:
    try:
      is_input_t_tensor = backend.is_keras_tensor(input_tensor)
    except ValueError:
      try:
        is_input_t_tensor = backend.is_keras_tensor(
            layer_utils.get_source_inputs(input_tensor))
      except ValueError:
        raise ValueError('input_tensor: ', input_tensor,
                         'is not type input_tensor')
    if is_input_t_tensor:
      # `int_shape` includes the batch axis while `input_shape` does not, so
      # tensor axis 2 lines up with `input_shape[1]` in both data formats:
      # the height for 'channels_first' (N, C, H, W) vs (C, H, W), and the
      # width for 'channels_last' (N, H, W, C) vs (H, W, C). Comparing tensor
      # axis 1 for 'channels_first' would compare channels against height.
      if backend.int_shape(input_tensor)[2] != input_shape[1]:
        raise ValueError('input_shape: ', input_shape, 'and input_tensor: ',
                         input_tensor,
                         'do not meet the same shape requirements')
    else:
      raise ValueError('input_tensor specified: ', input_tensor,
                       'is not a keras tensor')

  # If input_shape is None, infer shape from input_tensor
  if input_shape is None and input_tensor is not None:

    try:
      backend.is_keras_tensor(input_tensor)
    except ValueError:
      raise ValueError('input_tensor: ', input_tensor, 'is type: ',
                       type(input_tensor), 'which is not a valid type')

    if backend.is_keras_tensor(input_tensor):
      # Keep rows (height) before cols (width) so the inferred shape agrees
      # with the row/col axes read back from `input_shape` below.
      if backend.image_data_format() == 'channels_first':
        rows = backend.int_shape(input_tensor)[2]
        cols = backend.int_shape(input_tensor)[3]
        input_shape = (3, rows, cols)
      else:
        rows = backend.int_shape(input_tensor)[1]
        cols = backend.int_shape(input_tensor)[2]
        input_shape = (rows, cols, 3)
  # If input_shape is None and input_tensor is None, use the standard shape
  if input_shape is None and input_tensor is None:
    input_shape = (None, None, 3)

  if backend.image_data_format() == 'channels_last':
    row_axis, col_axis = (0, 1)
  else:
    row_axis, col_axis = (1, 2)
  rows = input_shape[row_axis]
  cols = input_shape[col_axis]
  # Unknown (None) spatial dimensions skip the minimum-size check.
  if rows and cols and (rows < 32 or cols < 32):
    raise ValueError('Input size must be at least 32x32; got `input_shape=' +
                     str(input_shape) + '`')
  if weights == 'imagenet':
    if (not minimalistic and alpha not in [0.75, 1.0]
        or minimalistic and alpha != 1.0):
      raise ValueError('If imagenet weights are being loaded, '
                       'alpha can be one of `0.75`, `1.0` for non minimalistic'
                       ' or `1.0` for minimalistic only.')

    if rows != cols or rows != 224:
      logging.warning('`input_shape` is undefined or non-square, '
                      'or `rows` is not 224.'
                      ' Weights for input shape (224, 224) will be'
                      ' loaded as the default.')

  if input_tensor is None:
    img_input = layers.Input(shape=input_shape)
  else:
    if not backend.is_keras_tensor(input_tensor):
      img_input = layers.Input(tensor=input_tensor, shape=input_shape)
    else:
      img_input = input_tensor

  channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1

  # Settings handed to `stack_fn`; minimalistic models avoid 5x5 kernels,
  # hard-swish and squeeze-and-excite.
  if minimalistic:
    kernel = 3
    activation = relu
    se_ratio = None
  else:
    kernel = 5
    activation = hard_swish
    se_ratio = 0.25

  x = img_input
  # Preprocessing is part of the model: x / 127.5 - 1 maps [0, 255] inputs
  # to [-1, 1].
  x = layers.Rescaling(scale=1. / 127.5, offset=-1.)(x)
  x = layers.Conv2D(
      16,
      kernel_size=3,
      strides=(2, 2),
      padding='same',
      use_bias=False,
      name='Conv')(x)
  x = layers.BatchNormalization(
      axis=channel_axis, epsilon=1e-3,
      momentum=0.999, name='Conv/BatchNorm')(x)
  x = activation(x)

  x = stack_fn(x, kernel, activation, se_ratio)

  # The first head convolution widens the stack output by a factor of 6.
  last_conv_ch = _depth(backend.int_shape(x)[channel_axis] * 6)

  # if the width multiplier is greater than 1 we
  # increase the number of output channels
  if alpha > 1.0:
    last_point_ch = _depth(last_point_ch * alpha)
  x = layers.Conv2D(
      last_conv_ch,
      kernel_size=1,
      padding='same',
      use_bias=False,
      name='Conv_1')(x)
  x = layers.BatchNormalization(
      axis=channel_axis, epsilon=1e-3,
      momentum=0.999, name='Conv_1/BatchNorm')(x)
  x = activation(x)
  x = layers.Conv2D(
      last_point_ch,
      kernel_size=1,
      padding='same',
      use_bias=True,
      name='Conv_2')(x)
  x = activation(x)

  if include_top:
    x = layers.GlobalAveragePooling2D()(x)
    # Restore 1x1 spatial dimensions so the classifier can be a 1x1 conv.
    if channel_axis == 1:
      x = layers.Reshape((last_point_ch, 1, 1))(x)
    else:
      x = layers.Reshape((1, 1, last_point_ch))(x)
    if dropout_rate > 0:
      x = layers.Dropout(dropout_rate)(x)
    x = layers.Conv2D(classes, kernel_size=1, padding='same', name='Logits')(x)
    x = layers.Flatten()(x)
    imagenet_utils.validate_activation(classifier_activation, weights)
    x = layers.Activation(activation=classifier_activation,
                          name='Predictions')(x)
  else:
    if pooling == 'avg':
      x = layers.GlobalAveragePooling2D(name='avg_pool')(x)
    elif pooling == 'max':
      x = layers.GlobalMaxPooling2D(name='max_pool')(x)
  # Ensure that the model takes into account
  # any potential predecessors of `input_tensor`.
  if input_tensor is not None:
    inputs = layer_utils.get_source_inputs(input_tensor)
  else:
    inputs = img_input

  # Create model.
  model = models.Model(inputs, x, name='MobilenetV3' + model_type)

  # Load weights.
  if weights == 'imagenet':
    # Must match a key of WEIGHTS_HASHES, e.g. 'large_224_1.0_float'.
    model_name = '{}{}_224_{}_float'.format(
        model_type, '_minimalistic' if minimalistic else '', str(alpha))
    if include_top:
      file_name = 'weights_mobilenet_v3_' + model_name + '.h5'
      file_hash = WEIGHTS_HASHES[model_name][0]
    else:
      file_name = 'weights_mobilenet_v3_' + model_name + '_no_top.h5'
      file_hash = WEIGHTS_HASHES[model_name][1]
    weights_path = data_utils.get_file(
        file_name,
        BASE_WEIGHT_PATH + file_name,
        cache_subdir='models',
        file_hash=file_hash)
    model.load_weights(weights_path)
  elif weights is not None:
    model.load_weights(weights)

  return model
351
352
@keras_export('keras.applications.MobileNetV3Small')
def MobileNetV3Small(input_shape=None,
                     alpha=1.0,
                     minimalistic=False,
                     include_top=True,
                     weights='imagenet',
                     input_tensor=None,
                     classes=1000,
                     pooling=None,
                     dropout_rate=0.2,
                     classifier_activation='softmax'):
  # The public docstring is attached after the definition from
  # BASE_DOCSTRING, so none is written inline here.

  def stack_fn(x, kernel, activation, se_ratio):
    # One row per inverted-residual block, in network order:
    # (expansion, base filters, kernel size, stride, SE ratio, activation).
    # Base filter counts are scaled by `alpha` when each block is built.
    block_specs = (
        (1, 16, 3, 2, se_ratio, relu),
        (72. / 16, 24, 3, 2, None, relu),
        (88. / 24, 24, 3, 1, None, relu),
        (4, 40, kernel, 2, se_ratio, activation),
        (6, 40, kernel, 1, se_ratio, activation),
        (6, 40, kernel, 1, se_ratio, activation),
        (3, 48, kernel, 1, se_ratio, activation),
        (3, 48, kernel, 1, se_ratio, activation),
        (6, 96, kernel, 2, se_ratio, activation),
        (6, 96, kernel, 1, se_ratio, activation),
        (6, 96, kernel, 1, se_ratio, activation),
    )
    for block_id, spec in enumerate(block_specs):
      expansion, base_filters, kernel_size, stride, block_se, block_act = spec
      x = _inverted_res_block(
          x,
          expansion,
          _depth(base_filters * alpha),
          kernel_size,
          stride,
          block_se,
          block_act,
          block_id)
    return x

  return MobileNetV3(
      stack_fn,
      1024,
      input_shape=input_shape,
      alpha=alpha,
      model_type='small',
      minimalistic=minimalistic,
      include_top=include_top,
      weights=weights,
      input_tensor=input_tensor,
      classes=classes,
      pooling=pooling,
      dropout_rate=dropout_rate,
      classifier_activation=classifier_activation)
387
388
@keras_export('keras.applications.MobileNetV3Large')
def MobileNetV3Large(input_shape=None,
                     alpha=1.0,
                     minimalistic=False,
                     include_top=True,
                     weights='imagenet',
                     input_tensor=None,
                     classes=1000,
                     pooling=None,
                     dropout_rate=0.2,
                     classifier_activation='softmax'):
  # The public docstring is attached after the definition from
  # BASE_DOCSTRING, so none is written inline here.

  def stack_fn(x, kernel, activation, se_ratio):
    # One row per inverted-residual block, in network order:
    # (expansion, base filters, kernel size, stride, SE ratio, activation).
    # Base filter counts are scaled by `alpha` when each block is built.
    block_specs = (
        (1, 16, 3, 1, None, relu),
        (4, 24, 3, 2, None, relu),
        (3, 24, 3, 1, None, relu),
        (3, 40, kernel, 2, se_ratio, relu),
        (3, 40, kernel, 1, se_ratio, relu),
        (3, 40, kernel, 1, se_ratio, relu),
        (6, 80, 3, 2, None, activation),
        (2.5, 80, 3, 1, None, activation),
        (2.3, 80, 3, 1, None, activation),
        (2.3, 80, 3, 1, None, activation),
        (6, 112, 3, 1, se_ratio, activation),
        (6, 112, 3, 1, se_ratio, activation),
        (6, 160, kernel, 2, se_ratio, activation),
        (6, 160, kernel, 1, se_ratio, activation),
        (6, 160, kernel, 1, se_ratio, activation),
    )
    for block_id, spec in enumerate(block_specs):
      expansion, base_filters, kernel_size, stride, block_se, block_act = spec
      x = _inverted_res_block(
          x,
          expansion,
          _depth(base_filters * alpha),
          kernel_size,
          stride,
          block_se,
          block_act,
          block_id)
    return x

  return MobileNetV3(
      stack_fn,
      1280,
      input_shape=input_shape,
      alpha=alpha,
      model_type='large',
      minimalistic=minimalistic,
      include_top=include_top,
      weights=weights,
      input_tensor=input_tensor,
      classes=classes,
      pooling=pooling,
      dropout_rate=dropout_rate,
      classifier_activation=classifier_activation)
429
430
# Both public constructors share one docstring template; only the
# architecture name substituted for `{name}` differs.
MobileNetV3Small.__doc__ = BASE_DOCSTRING.format(name='MobileNetV3Small')
MobileNetV3Large.__doc__ = BASE_DOCSTRING.format(name='MobileNetV3Large')
433
434
def relu(x):
  """Applies a fresh, unnamed `ReLU` layer to `x` and returns the result."""
  relu_layer = layers.ReLU()
  return relu_layer(x)
437
438
def hard_sigmoid(x):
  """Returns `ReLU6(x + 3) / 6`, a piecewise-linear sigmoid approximation."""
  relu6 = layers.ReLU(6.)
  clipped = relu6(x + 3.)
  return clipped * (1. / 6.)
441
442
def hard_swish(x):
  """Returns `x * hard_sigmoid(x)` computed with a `Multiply` layer."""
  product = layers.Multiply()
  gate = hard_sigmoid(x)
  return product([gate, x])
445
446
447# This function is taken from the original tf repo.
448# It ensures that all layers have a channel number that is divisible by 8
449# It can be seen here:
450# https://github.com/tensorflow/models/blob/master/research/
451# slim/nets/mobilenet/mobilenet.py
452
453
454def _depth(v, divisor=8, min_value=None):
455  if min_value is None:
456    min_value = divisor
457  new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
458  # Make sure that round down does not go down by more than 10%.
459  if new_v < 0.9 * v:
460    new_v += divisor
461  return new_v
462
463
def _se_block(inputs, filters, se_ratio, prefix):
  """Adds a squeeze-and-excite gate on top of `inputs`.

  Args:
    inputs: Feature map to be re-weighted.
    filters: Channel count used for the reshape after pooling and for the
      second 1x1 convolution.
    se_ratio: Ratio applied to `filters` (then rounded with `_depth`) to size
      the bottleneck 1x1 convolution.
    prefix: Name prefix for the layers created here.

  Returns:
    `inputs` multiplied by the computed gate.
  """
  name_root = prefix + 'squeeze_excite/'

  # Squeeze: global average pool, then restore 1x1 spatial dimensions in the
  # layout required by the active data format.
  pooled = layers.GlobalAveragePooling2D(name=name_root + 'AvgPool')(inputs)
  channels_first = backend.image_data_format() == 'channels_first'
  target_shape = (filters, 1, 1) if channels_first else (1, 1, filters)
  gate = layers.Reshape(target_shape)(pooled)

  # Excite: bottleneck conv + ReLU, expansion conv, hard-sigmoid.
  reduce_conv = layers.Conv2D(
      _depth(filters * se_ratio),
      kernel_size=1,
      padding='same',
      name=name_root + 'Conv')
  gate = reduce_conv(gate)
  gate = layers.ReLU(name=name_root + 'Relu')(gate)
  expand_conv = layers.Conv2D(
      filters,
      kernel_size=1,
      padding='same',
      name=name_root + 'Conv_1')
  gate = expand_conv(gate)
  gate = hard_sigmoid(gate)

  return layers.Multiply(name=name_root + 'Mul')([inputs, gate])
487
488
def _inverted_res_block(x, expansion, filters, kernel_size, stride, se_ratio,
                        activation, block_id):
  """Adds one inverted-residual block to `x`.

  Args:
    x: Input tensor.
    expansion: Multiplier applied to the input channel count to size the
      expansion convolution (and the squeeze-and-excite block).
    filters: Number of output channels of the projection convolution.
    kernel_size: Kernel size of the depthwise convolution.
    stride: Stride of the depthwise convolution. A stride of 2 adds explicit
      zero padding followed by a 'valid' convolution.
    se_ratio: Squeeze-and-excite ratio; a falsy value disables the SE block.
    activation: Callable applied after the expansion and depthwise stages.
    block_id: Index of the block. A falsy value (block 0) skips the expansion
      stage and uses the un-numbered 'expanded_conv/' name prefix.

  Returns:
    The output tensor, with a residual connection added when the stride is 1
    and the input and output channel counts match.
  """
  channel_axis = 1 if backend.image_data_format() == 'channels_first' else -1
  residual = x
  infilters = backend.int_shape(x)[channel_axis]
  if block_id:
    prefix = 'expanded_conv_{}/'.format(block_id)
  else:
    prefix = 'expanded_conv/'

  def batch_norm(tensor, stage):
    # Every stage of the block uses the same batch-norm hyper-parameters.
    return layers.BatchNormalization(
        axis=channel_axis,
        epsilon=1e-3,
        momentum=0.999,
        name=prefix + stage + '/BatchNorm')(tensor)

  # Expand (skipped for the first block).
  if block_id:
    x = layers.Conv2D(
        _depth(infilters * expansion),
        kernel_size=1,
        padding='same',
        use_bias=False,
        name=prefix + 'expand')(x)
    x = batch_norm(x, 'expand')
    x = activation(x)

  # Depthwise.
  if stride == 2:
    pad_layer = layers.ZeroPadding2D(
        padding=imagenet_utils.correct_pad(x, kernel_size),
        name=prefix + 'depthwise/pad')
    x = pad_layer(x)
  depthwise_padding = 'same' if stride == 1 else 'valid'
  x = layers.DepthwiseConv2D(
      kernel_size,
      strides=stride,
      padding=depthwise_padding,
      use_bias=False,
      name=prefix + 'depthwise')(x)
  x = batch_norm(x, 'depthwise')
  x = activation(x)

  # Optional squeeze-and-excite on the expanded features.
  if se_ratio:
    x = _se_block(x, _depth(infilters * expansion), se_ratio, prefix)

  # Project back down; no activation follows the projection.
  x = layers.Conv2D(
      filters,
      kernel_size=1,
      padding='same',
      use_bias=False,
      name=prefix + 'project')(x)
  x = batch_norm(x, 'project')

  if stride != 1 or infilters != filters:
    return x
  return layers.Add(name=prefix + 'Add')([residual, x])
553
554
@keras_export('keras.applications.mobilenet_v3.preprocess_input')
def preprocess_input(x, data_format=None):  # pylint: disable=unused-argument
  """A placeholder method for backward compatibility.

  The preprocessing logic has been included in the mobilenet_v3 model
  implementation. Users are no longer required to call this method to normalize
  the input data. This method does nothing and is only kept as a placeholder to
  align the API surface between the old and new versions of the model.

  Args:
    x: A floating point `numpy.array` or a `tf.Tensor`.
    data_format: Optional data format of the image tensor/array. Defaults to
      None, in which case the global setting
      `tf.keras.backend.image_data_format()` is used (unless you changed it,
      it defaults to "channels_last"). The argument is accepted for API
      compatibility only and is ignored.

  Returns:
    Unchanged `numpy.array` or `tf.Tensor`.
  """

  return x
576
577
@keras_export('keras.applications.mobilenet_v3.decode_predictions')
def decode_predictions(preds, top=5):
  # Thin re-export of the shared ImageNet decoder under this module's API path.
  decoded = imagenet_utils.decode_predictions(preds, top=top)
  return decoded


# Reuse the shared decoder's documentation for the re-exported function.
decode_predictions.__doc__ = imagenet_utils.decode_predictions.__doc__
584