/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_

#include <algorithm>
#include <cstring>
#include <vector>

#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h"
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_hybrid_3x3_filter.h"
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
#include "tensorflow/lite/kernels/internal/types.h"

namespace tflite {
namespace optimized_integer_ops {
namespace depthwise_conv {

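// "Hybrid" here means: int8 input and filter data with float bias and float
// output. Each batch carries its own input scale and zero point
// (input_scales / input_offsets, typically produced by dynamically quantizing
// a float input), the filter carries per-channel scales, accumulation is done
// in int32, and results are dequantized back to float when stored.
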
// Initializes the accumulator buffer with zeros.
inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
                                       int32* acc_buffer) {
  memset(acc_buffer, 0,
         sizeof(acc_buffer[0]) * output_depth * num_output_pixels);
}

// General (non-specialized) implementation of the hybrid depthwise
// convolution: int8 inputs and filters, float bias and float outputs.
inline void DepthwiseConvHybridGeneral(
    const DepthwiseParams& params,
    const float* input_scales, const RuntimeShape& input_shape,
    const int8* input_data, const RuntimeShape& filter_shape,
    const int8* filter_data, const RuntimeShape& bias_shape,
    const float* bias_data, const RuntimeShape& output_shape,
    float* output_data, const float* per_channel_scales,
    const int32_t* input_offsets, int thread_start, int thread_end,
    int thread_dim) {
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;
  const int depth_multiplier = params.depth_multiplier;
  const float output_activation_min = params.float_activation_min;
  const float output_activation_max = params.float_activation_max;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  const int batches = MatchingDim(input_shape, 0, output_shape, 0);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_height = input_shape.Dims(1);
  const int input_width = input_shape.Dims(2);
  const int input_depth = input_shape.Dims(3);
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int output_rows = output_shape.Dims(1);
  const int output_width = output_shape.Dims(2);

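  // The accumulators live in a fixed-size stack buffer. Output pixels are
  // processed in groups of kOutputPixelsInAccBuffer = kAccBufferMaxSize /
  // output_depth, so that one such group of int32 accumulators always fits.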
  static const int kAccBufferMaxSize = 2048;
  int32 acc_buffer[kAccBufferMaxSize];
  TFLITE_DCHECK_GE(kAccBufferMaxSize, output_depth);
  const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
  const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
  TFLITE_DCHECK_LE(kOutputPixelsInAccBuffer * output_depth,
                   kAccBufferActualSize);
  TFLITE_DCHECK_LE(kAccBufferActualSize, kAccBufferMaxSize);
  TFLITE_DCHECK_GE(kOutputPixelsInAccBuffer, 1);
  TFLITE_DCHECK(thread_dim == 0 || thread_dim == 1);

  // row_accum_func will point to the core accumulation function to be used
  // for this DepthwiseConvHybrid op.
  using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
  row_accum_func_t row_accum_func = nullptr;

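  // Each TFMINI_USE_DEPTHWISECONV_KERNEL invocation below tries to pick a
  // specialized kernel for one (input depth, depth multiplier) combination;
  // FIXED_INPUT_DEPTH == 0 means "any input depth". The first invocation whose
  // constraints match wins, since later ones check !row_accum_func.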
#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, \
                                        FIXED_DEPTH_MULTIPLIER)           \
  if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
      (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
    row_accum_func =                                                      \
        QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,  \
                                       FIXED_DEPTH_MULTIPLIER>;           \
  }

#ifdef USE_NEON
  // We go over our list of kernels by decreasing order of preference
  // for the cases where multiple kernels could apply.

  // Start with the fastest kernels: AllowStrided=false, fixed input depth.

  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)

  // Next come the strided kernels: AllowStrided=true, fixed input depth.
  // They are a bit less efficient, but allow stride!=1.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)

  // Finally, the kernels allowing a variable input depth;
  // these are the least efficient but most general kernels.

  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
  TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
#endif  // USE_NEON

  // No matching fast kernel found, use slow fallback.
  if (!row_accum_func) {
    row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
  }

#undef TFMINI_USE_DEPTHWISECONV_KERNEL

  const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
  const int input_batch_stride = input_height_stride * input_shape.Dims(1);
  const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);

  // Now that we have determined row_accum_func, we can start work.
  int batch_start = 0;
  int batch_end = batches;
  int row_start = 0;
  int row_end = output_rows;
  int output_ptr_offset = 0;

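  // thread_dim == 0: this call handles batches [thread_start, thread_end).
  // thread_dim == 1: it handles output rows [thread_start, thread_end) of
  // every batch.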
  switch (thread_dim) {
    case 0:
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, batches);
      batch_start = thread_start;
      batch_end = thread_end;
      output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
      break;
    case 1:
      TFLITE_DCHECK_GE(thread_start, 0);
      TFLITE_DCHECK_LE(thread_end, output_rows);
      row_start = thread_start;
      row_end = thread_end;
      output_ptr_offset = row_start * output_width * output_depth;
      break;
  }

  float* output_ptr = output_data + output_ptr_offset;
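  // When threading over rows (thread_dim == 1), batch_step skips the output
  // rows of each batch that belong to other threads, so output_ptr lands on
  // this thread's first row of the next batch. When threading over batches it
  // is zero.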
  int batch_step =
      (output_rows + row_start - row_end) * output_width * output_depth;
  for (int b = batch_start; b < batch_end; ++b) {
    float input_scale = input_scales[b];
    int32_t input_offset = input_offsets[b];
    for (int out_y = row_start; out_y < row_end; ++out_y) {
      const int in_y_origin = (out_y * stride_height) - pad_height;
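      // Clip the filter rows so that in_y = in_y_origin +
      // dilation_height_factor * filter_y stays inside [0, input_height).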
      const int filter_y_start =
          std::max(0, (-in_y_origin + dilation_height_factor - 1) /
                          dilation_height_factor);
      const int filter_y_end =
          std::min(filter_height,
                   (input_height - in_y_origin + dilation_height_factor - 1) /
                       dilation_height_factor);
      for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
           out_x_buffer_start += kOutputPixelsInAccBuffer) {
        const int out_x_buffer_end = std::min(
            output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
        // We call a 'pixel' a group of activations that share all but the
        // 'depth'/'channel' coordinate. num_output_pixels is the number of
        // output pixels that we will accumulate in this loop iteration.
        const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
        DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, acc_buffer);

        // Accumulation loop. Most of the time should be spent in here.
        for (int filter_y = filter_y_start; filter_y < filter_y_end;
             ++filter_y) {
          const int in_y = in_y_origin + dilation_height_factor * filter_y;
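          // Note: input_offset is passed negated; the row accumulators add
          // their offset argument to each input value, which here subtracts
          // the per-batch zero point.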
          row_accum_func(
              stride_width, dilation_width_factor, input_depth, input_width,
              input_data + in_y * input_height_stride + b * input_batch_stride,
              -input_offset, pad_width, depth_multiplier, filter_width,
              filter_data + filter_y * filter_height_stride, out_x_buffer_start,
              out_x_buffer_end, output_depth, acc_buffer);
        }
        // Finished accumulating int32 values. Just store them as float values.
        ruy::profiler::ScopeLabel label("store");
        const int num_output_values = output_depth * num_output_pixels;
        int c = 0;
        while (c < output_depth) {
          int target_output_depth = output_depth;

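          // Dequantize and store: output = acc * per_channel_scale[c] *
          // input_scale + bias[c], clamped to the activation range. The NEON
          // path handles 4 channels at a time; the scalar loop below covers
          // the remaining channels.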
#ifdef USE_NEON
          const float32x4_t output_activation_min_vec =
              vdupq_n_f32(output_activation_min);
          const float32x4_t output_activation_max_vec =
              vdupq_n_f32(output_activation_max);
          const float32x4_t input_scale_32x4 = vdupq_n_f32(input_scale);
          for (; c <= output_depth - 4; c += 4) {
            if ((c + 4) > output_depth) {
              break;
            }
            const float32x4_t channel_scale_32x4 =
                vld1q_f32(per_channel_scales + c);
            const float32x4_t bias_32x4 = vld1q_f32(bias_data + c);
            for (int n = 0; n < num_output_pixels; ++n) {
              int loc = n * output_depth + c;
              int32x4_t acc = vld1q_s32(acc_buffer + loc);
              float32x4_t float_acc = vcvtq_f32_s32(acc);
              float_acc = vmulq_f32(float_acc, channel_scale_32x4);
              float_acc = vmulq_f32(float_acc, input_scale_32x4);
              float_acc = vaddq_f32(float_acc, bias_32x4);
              float_acc = vmaxq_f32(float_acc, output_activation_min_vec);
              float_acc = vminq_f32(float_acc, output_activation_max_vec);
              vst1q_f32(output_ptr + loc, float_acc);
            }
          }
#endif  // USE_NEON

          for (; c < target_output_depth; c++) {
            for (int n = 0; n < num_output_pixels; ++n) {
              int loc = n * output_depth + c;
              int32 acc = acc_buffer[loc];
              float float_acc = acc * input_scale * per_channel_scales[c];
              float_acc += bias_data[c];
              float_acc = std::max(float_acc, output_activation_min);
              float_acc = std::min(float_acc, output_activation_max);
              output_ptr[loc] = float_acc;
            }
          }
        }
        output_ptr += num_output_values;
      }
    }
    output_ptr += batch_step;
  }
}

}  // namespace depthwise_conv

template <DepthwiseConvOutputRounding kOutputRounding>
inline void DepthwiseConvHybridWithRounding(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, const int32_t* input_offsets,
    int thread_start, int thread_end, int thread_dim) {
  ruy::profiler::ScopeLabel label("DepthwiseConvHybridInt8/8bit");
  const int depth_multiplier = params.depth_multiplier;
  const int dilation_width_factor = params.dilation_width_factor;
  const int dilation_height_factor = params.dilation_height_factor;
  TFLITE_DCHECK_GE(dilation_width_factor, 1);
  TFLITE_DCHECK_GE(dilation_height_factor, 1);
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
  const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
  const int input_depth = input_shape.Dims(3);
  TFLITE_DCHECK_EQ(output_depth, input_depth * depth_multiplier);
  TFLITE_DCHECK_EQ(bias_shape.FlatSize(), output_depth);

// Enable for arm64, except for the Nvidia Linux 4 Tegra (L4T) toolchain used
// on the Jetson TX-2, whose compiler does not support the offsetof() macro.

#if defined(__aarch64__) && !defined(GOOGLE_L4T)
  const int stride_width = params.stride_width;
  const int stride_height = params.stride_height;
  const int pad_width = params.padding_values.width;
  const int pad_height = params.padding_values.height;

  // Call the kernel optimized for depthwise convolutions with 3x3 filters if
  // the parameters are supported by it.
  if (optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported<
      optimized_ops::depthwise_conv::QuantizationType::kNonPerChannelUint8>(
          input_shape, filter_shape, stride_width, stride_height,
          dilation_width_factor, dilation_height_factor, pad_width, pad_height,
          depth_multiplier, output_shape, 0, nullptr)) {
    ruy::profiler::ScopeLabel specialized_label(
        "DepthwiseConvHybridInt8/8bit/3x3");
    optimized_ops::depthwise_conv::DepthwiseConvHybrid3x3FilterPerChannel<
        DepthwiseConvOutputRounding::kUpward>(
            params, input_scales, input_shape, input_data,
            filter_shape, filter_data, bias_shape, bias_data, output_shape,
            output_data, per_channel_scales, input_offsets,
            thread_start, thread_end, thread_dim);
    return;
  }
#endif

  ruy::profiler::ScopeLabel specialized_label(
      "DepthwiseConvHybridInt8/8bit/General");
  depthwise_conv::DepthwiseConvHybridGeneral(
      params, input_scales, input_shape, input_data,
      filter_shape, filter_data, bias_shape, bias_data, output_shape,
      output_data, per_channel_scales, input_offsets,
      thread_start, thread_end, thread_dim);
}

inline void DepthwiseConvHybridImpl(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, const int32_t* input_offsets,
    int thread_start, int thread_end, int thread_dim) {
  return DepthwiseConvHybridWithRounding<
      DepthwiseConvOutputRounding::kAwayFromZero>(
          params, input_scales, input_shape, input_data,
          filter_shape, filter_data, bias_shape, bias_data, output_shape,
          output_data, per_channel_scales, input_offsets,
          thread_start, thread_end, thread_dim);
}

template <typename T, typename TS>
struct DepthwiseConvHybridWorkerTask : cpu_backend_threadpool::Task {
  DepthwiseConvHybridWorkerTask(const DepthwiseParams& params,
                                const float* input_scales,
                                const RuntimeShape& input_shape,
                                const T* input_data,
                                const RuntimeShape& filter_shape,
                                const T* filter_data,
                                const RuntimeShape& bias_shape,
                                const TS* bias_data,
                                const RuntimeShape& output_shape,
                                float* output_data,
                                const float* per_channel_scales,
                                const int32_t* input_offsets,
                                int thread_start, int thread_end,
                                int thread_dim)
      : params(params),
        input_scales(input_scales),
        input_shape(input_shape),
        input_data(input_data),
        filter_shape(filter_shape),
        filter_data(filter_data),
        bias_shape(bias_shape),
        bias_data(bias_data),
        output_shape(output_shape),
        output_data(output_data),
        per_channel_scales(per_channel_scales),
        input_offsets(input_offsets),
        thread_start(thread_start),
        thread_end(thread_end),
        thread_dim(thread_dim) {}

  void Run() override {
    DepthwiseConvHybridImpl(params, input_scales, input_shape,
                            input_data, filter_shape, filter_data,
                            bias_shape, bias_data, output_shape,
                            output_data, per_channel_scales, input_offsets,
                            thread_start, thread_end, thread_dim);
  }

 private:
  const DepthwiseParams& params;
  const float* input_scales;
  const RuntimeShape& input_shape;
  const T* input_data;
  const RuntimeShape& filter_shape;
  const T* filter_data;
  const RuntimeShape& bias_shape;
  const TS* bias_data;
  const RuntimeShape& output_shape;
  float* output_data;
  const float* per_channel_scales;
  const int32_t* input_offsets;
  int thread_start;
  int thread_end;
  int thread_dim;
};

inline void DepthwiseConvHybridPerChannel(
    const DepthwiseParams& params, const float* input_scales,
    const RuntimeShape& input_shape, const int8* input_data,
    const RuntimeShape& filter_shape, const int8* filter_data,
    const RuntimeShape& bias_shape, const float* bias_data,
    const RuntimeShape& output_shape, float* output_data,
    const float* per_channel_scales, int32_t* input_offsets,
    CpuBackendContext* cpu_backend_context) {
  ruy::profiler::ScopeLabel label("DepthwiseConvHybridInt8");
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  const int output_batches = output_shape.Dims(0);
  const int output_rows = output_shape.Dims(1);
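  // Thread over whichever dimension (batches or output rows) HowManyConvThreads
  // estimates supports the larger thread count.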
  int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
  int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
  int thread_dim, thread_count, thread_dim_size;
  if (thread_count_batch > thread_count_row) {
    thread_dim = 0;
    thread_dim_size = output_batches;
    thread_count = thread_count_batch;
  } else {
    thread_dim = 1;
    thread_dim_size = output_rows;
    thread_count = thread_count_row;
  }

  const int max_threads = cpu_backend_context->max_num_threads();
  thread_count = std::max(1, std::min(thread_count, max_threads));

  if (thread_count == 1) {
    DepthwiseConvHybridImpl(params, input_scales, input_shape,
                            input_data, filter_shape, filter_data, bias_shape,
                            bias_data, output_shape, output_data,
                            per_channel_scales, input_offsets,
                            /*thread_start=*/0, /*thread_end=*/output_rows,
                            /*thread_dim=*/1);
  } else {
    std::vector<DepthwiseConvHybridWorkerTask<int8, float>> tasks;
    // TODO(b/131746020) don't create new heap allocations every time.
    // At least we make it a single heap allocation by using reserve().
    tasks.reserve(thread_count);
    int thread_start = 0;
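    // Split thread_dim_size as evenly as possible: each task takes its share
    // of the remaining work, so task sizes differ by at most one.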
    for (int i = 0; i < thread_count; ++i) {
      int thread_end =
          thread_start + (thread_dim_size - thread_start) / (thread_count - i);
      tasks.emplace_back(params, input_scales, input_shape,
                         input_data, filter_shape, filter_data, bias_shape,
                         bias_data, output_shape, output_data,
                         per_channel_scales, input_offsets, thread_start,
                         thread_end, thread_dim);
      thread_start = thread_end;
    }
    cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
                                    cpu_backend_context);
  }
}

}  // namespace optimized_integer_ops
}  // namespace tflite

#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_HYBRID_H_