• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) Facebook, Inc. and its affiliates.
2 // All rights reserved.
3 //
4 // Copyright 2019 Google LLC
5 //
6 // This source code is licensed under the BSD-style license found in the
7 // LICENSE file in the root directory of this source tree.
8 
9 #include <stddef.h>
10 #include <math.h>
11 
12 #include <fxdiv.h>
13 
14 #include <xnnpack/indirection.h>
15 #include <xnnpack/operator.h>
16 #include <xnnpack/math.h>
17 
18 
// Populates op->indirection_buffer with input pointers for a 2D convolution.
//
// Output pixels are processed in tiles of output_tile_size. For each tile the
// buffer holds kernel_height * kernel_width groups of output_tile_size
// pointers: the entry for (tile, kernel element, position in tile) addresses
// the input pixel that the kernel element reads for that output pixel, or
// op->zero_buffer when the tap lies outside the input. Positions past the last
// output pixel reuse the last output pixel.
//
//   op                - source of the geometry, the input pointer, the zero
//                       buffer and the destination buffer.
//   output_tile_size  - number of output pixels per tile.
//   log2_element_size - left shift applied to op->input_pixel_stride to get
//                       the per-pixel stride added to the input address.
void xnn_indirection_init_conv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** entries          = op->indirection_buffer;
  const uintptr_t input_address = (uintptr_t) op->input;
  const void* zero_pixel        = op->zero_buffer;
  const size_t pixel_stride     = op->input_pixel_stride << log2_element_size;
  const size_t in_height        = op->input_height;
  const size_t in_width         = op->input_width;
  const size_t out_width        = op->output_width;
  const size_t taps_y           = op->kernel_height;
  const size_t taps_x           = op->kernel_width;
  const size_t step_y           = op->stride_height;
  const size_t step_x           = op->stride_width;
  const size_t dilate_y         = op->dilation_height;
  const size_t dilate_x         = op->dilation_width;
  const size_t pad_top          = op->padding_top;
  const size_t pad_left         = op->padding_left;

  const size_t pixel_count = op->output_height * out_width;
  const size_t padded_pixel_count = round_up(pixel_count, output_tile_size);
  const size_t tap_count = taps_y * taps_x;

  const struct fxdiv_divisor_size_t out_width_divisor = fxdiv_init_size_t(out_width);

  for (size_t tile_start = 0; tile_start < padded_pixel_count; tile_start += output_tile_size) {
    const size_t tile_base = tile_start * tap_count;
    for (size_t lane = 0; lane < output_tile_size; lane++) {
      // Lanes beyond the last output pixel are clamped onto it.
      const size_t pixel = min(tile_start + lane, pixel_count - 1);
      const struct fxdiv_result_size_t row_col = fxdiv_divide_size_t(pixel, out_width_divisor);
      const size_t out_y = row_col.quotient;
      const size_t out_x = row_col.remainder;

      // tap == ky * taps_x + kx throughout the two loops below.
      size_t tap = 0;
      for (size_t ky = 0; ky < taps_y; ky++) {
        // Unsigned wrap-around turns coordinates in the top/left padding into
        // huge values, so a single upper-bound comparison rejects both sides.
        const size_t in_y = out_y * step_y + ky * dilate_y - pad_top;
        for (size_t kx = 0; kx < taps_x; kx++, tap++) {
          const size_t in_x = out_x * step_x + kx * dilate_x - pad_left;
          const void* target = zero_pixel;
          if (in_y < in_height && in_x < in_width) {
            target = (const void*) (input_address + (in_y * in_width + in_x) * pixel_stride);
          }
          entries[tile_base + tap * output_tile_size + lane] = target;
        }
      }
    }
  }
}
78 
// Populates op->indirection_buffer with input pointers for a 2D deconvolution.
//
// The buffer layout matches the tiled layout used for convolution: tiles of
// output_tile_size output pixels, each tile holding one group of
// output_tile_size pointers per kernel element. A kernel element contributes
// an input pixel to an output pixel only when the offset
// (output + padding - kernel * dilation) is an exact multiple of the stride
// and the quotient lies inside the input; every other entry points at
// op->zero_buffer.
//
//   op                - source of the geometry, the input pointer, the zero
//                       buffer and the destination buffer.
//   output_tile_size  - number of output pixels per tile.
//   log2_element_size - left shift applied to op->input_pixel_stride to get
//                       the per-pixel stride added to the input address.
void xnn_indirection_init_deconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** entries          = op->indirection_buffer;
  const uintptr_t input_address = (uintptr_t) op->input;
  const size_t pixel_stride     = op->input_pixel_stride << log2_element_size;
  const void* zero_pixel        = op->zero_buffer;
  const size_t in_height        = op->input_height;
  const size_t in_width         = op->input_width;
  const size_t out_width        = op->output_width;
  const size_t taps_y           = op->kernel_height;
  const size_t taps_x           = op->kernel_width;
  const size_t step_y           = op->stride_height;
  const size_t step_x           = op->stride_width;
  const size_t dilate_y         = op->dilation_height;
  const size_t dilate_x         = op->dilation_width;
  const size_t pad_top          = op->padding_top;
  const size_t pad_left         = op->padding_left;

  const size_t pixel_count = op->output_height * out_width;
  const size_t padded_pixel_count = round_up(pixel_count, output_tile_size);
  const size_t tap_count = taps_y * taps_x;

  const struct fxdiv_divisor_size_t out_width_divisor = fxdiv_init_size_t(out_width);
  const struct fxdiv_divisor_size_t step_y_divisor = fxdiv_init_size_t(step_y);
  const struct fxdiv_divisor_size_t step_x_divisor = fxdiv_init_size_t(step_x);

  for (size_t tile_start = 0; tile_start < padded_pixel_count; tile_start += output_tile_size) {
    const size_t tile_base = tile_start * tap_count;
    for (size_t lane = 0; lane < output_tile_size; lane++) {
      // Lanes beyond the last output pixel are clamped onto it.
      const size_t pixel = min(tile_start + lane, pixel_count - 1);
      const struct fxdiv_result_size_t row_col = fxdiv_divide_size_t(pixel, out_width_divisor);
      const size_t out_y = row_col.quotient;
      const size_t out_x = row_col.remainder;

      // tap == ky * taps_x + kx throughout the two loops below.
      size_t tap = 0;
      for (size_t ky = 0; ky < taps_y; ky++) {
        const size_t y = out_y + pad_top - ky * dilate_y;
        const size_t in_y = fxdiv_quotient_size_t(y, step_y_divisor);
        for (size_t kx = 0; kx < taps_x; kx++, tap++) {
          const size_t x = out_x + pad_left - kx * dilate_x;
          const size_t in_x = fxdiv_quotient_size_t(x, step_x_divisor);
          const size_t slot = tile_base + tap * output_tile_size + lane;
          // Reject taps that are not stride-aligned or fall outside the input.
          if (in_y * step_y != y || in_y >= in_height || in_x * step_x != x || in_x >= in_width) {
            entries[slot] = zero_pixel;
          } else {
            entries[slot] = (const void*) (input_address + (in_y * in_width + in_x) * pixel_stride);
          }
        }
      }
    }
  }
}
133 
// Populates op->indirection_buffer for a deconvolution that is split into
// stride_height * stride_width sub-convolutions, and records where each
// sub-convolution's entries begin.
//
// For every (offset_y, offset_x) pair one record of op->subconvolution_buffer
// is updated, in order: its indirection_buffer is set to the current write
// position and its indirection_y_stride to indirection_x_stride multiplied by
// the sliced output width rounded up to output_tile_size. Pointers are then
// appended sequentially; each one addresses an input pixel, or op->zero_buffer
// when the computed input coordinate is outside the input.
//
//   op                - source of the geometry, the input pointer, the zero
//                       buffer, the sub-convolution records and the
//                       destination buffer.
//   output_tile_size  - number of output pixels per tile along a sliced row.
//   log2_element_size - left shift applied to op->input_pixel_stride to get
//                       the per-pixel stride added to the input address.
void xnn_indirection_init_subconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** next_entry              = op->indirection_buffer;
  struct subconvolution_params* params = op->subconvolution_buffer;
  const uintptr_t input_address        = (uintptr_t) op->input;
  const size_t pixel_stride            = op->input_pixel_stride << log2_element_size;
  const void* zero_pixel               = op->zero_buffer;
  const size_t in_height               = op->input_height;
  const size_t in_width                = op->input_width;
  const size_t out_height              = op->output_height;
  const size_t out_width               = op->output_width;
  const size_t taps_y                  = op->kernel_height;
  const size_t taps_x                  = op->kernel_width;
  const size_t step_y                  = op->stride_height;
  const size_t step_x                  = op->stride_width;
  const size_t pad_top                 = op->padding_top;
  const size_t pad_left                = op->padding_left;

  const size_t pad_top_residue = pad_top % step_y;
  const size_t pad_left_residue = pad_left % step_x;

  for (size_t offset_y = 0; offset_y < step_y; offset_y++) {
    const size_t first_out_y = subtract_modulo(offset_y, pad_top_residue, step_y);
    for (size_t offset_x = 0; offset_x < step_x; offset_x++) {
      const size_t first_out_x = subtract_modulo(offset_x, pad_left_residue, step_x);
      const size_t sliced_width = divide_round_up(out_width - first_out_x, step_x);

      // Record where this sub-convolution's entries start before writing them.
      params->indirection_buffer = next_entry;
      params->indirection_y_stride =
        params->indirection_x_stride * round_up(sliced_width, output_tile_size);
      params++;

      for (size_t out_y = first_out_y; out_y < out_height; out_y += step_y) {
        for (size_t tile_start = 0; tile_start < sliced_width; tile_start += output_tile_size) {
          for (size_t ky = offset_y; ky < taps_y; ky += step_y) {
            assert(doz(out_y + pad_top, ky) % step_y == 0);
            const size_t y = out_y + pad_top - ky;
            const size_t in_y = y / step_y;

            for (size_t kx = offset_x; kx < taps_x; kx += step_x) {
              for (size_t lane = 0; lane < output_tile_size; lane++) {
                // Lanes beyond the end of the sliced row repeat its last pixel.
                const size_t sliced_x = min(tile_start + lane, sliced_width - 1);
                const size_t out_x = first_out_x + sliced_x * step_x;

                assert(doz(out_x + pad_left, kx) % step_x == 0);
                const size_t x = out_x + pad_left - kx;
                const size_t in_x = x / step_x;

                if (in_y >= in_height || in_x >= in_width) {
                  *next_entry = zero_pixel;
                } else {
                  *next_entry =
                    (const void*) (input_address + (in_y * in_width + in_x) * pixel_stride);
                }
                next_entry++;
              }
            }
          }
        }
      }
    }
  }
}
198 
// Populates op->indirection_buffer with input pointers for a 2D depthwise
// convolution.
//
// The entry for output pixel (output_y, output_x) and kernel element
// (kernel_y, kernel_x) is stored at
//   output_y * step_height + output_x * step_width * kernel_height
//     + kernel_x * kernel_height + kernel_y
// and addresses the corresponding input pixel, or op->zero_buffer when the
// tap lies outside the input.
//
//   op                - source of the geometry, the input pointer, the zero
//                       buffer and the destination buffer.
//   step_height       - buffer index distance between consecutive output rows.
//   step_width        - multiplied by kernel_height, the buffer index distance
//                       between consecutive output columns.
//   log2_element_size - left shift applied to op->input_pixel_stride to get
//                       the per-pixel stride added to the input address.
void xnn_indirection_init_dwconv2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  uint32_t log2_element_size)
{
  const void** entries          = op->indirection_buffer;
  const uintptr_t input_address = (uintptr_t) op->input;
  const size_t pixel_stride     = op->input_pixel_stride << log2_element_size;
  const void* zero_pixel        = op->zero_buffer;
  const size_t in_height        = op->input_height;
  const size_t in_width         = op->input_width;
  const size_t out_height       = op->output_height;
  const size_t out_width        = op->output_width;
  const size_t taps_y           = op->kernel_height;
  const size_t taps_x           = op->kernel_width;
  const size_t step_y           = op->stride_height;
  const size_t step_x           = op->stride_width;
  const size_t dilate_y         = op->dilation_height;
  const size_t dilate_x         = op->dilation_width;
  const size_t pad_top          = op->padding_top;
  const size_t pad_left         = op->padding_left;

  for (size_t out_y = 0; out_y < out_height; out_y++) {
    const size_t row_base = out_y * step_height;
    for (size_t ky = 0; ky < taps_y; ky++) {
      // Unsigned wrap-around makes rows in the top padding compare as
      // out of range, so one comparison covers both sides.
      const size_t in_y = out_y * step_y + ky * dilate_y - pad_top;
      for (size_t out_x = 0; out_x < out_width; out_x++) {
        for (size_t kx = 0; kx < taps_x; kx++) {
          const size_t in_x = out_x * step_x + kx * dilate_x - pad_left;
          const size_t slot = row_base + out_x * step_width * taps_y + kx * taps_y + ky;
          if (in_y < in_height && in_x < in_width) {
            entries[slot] =
              (const void*) (input_address + (in_y * in_width + in_x) * pixel_stride);
          } else {
            entries[slot] = zero_pixel;
          }
        }
      }
    }
  }
}
249 
// Populates op->indirection_buffer with input pointers for 2D max pooling.
//
// The entry for output pixel (output_y, output_x) and pooling element
// (pooling_y, pooling_x) is stored at
//   output_y * step_height + output_x * step_width * pooling_height
//     + pooling_x * pooling_height + pooling_y.
// Every entry addresses an input pixel; no zero buffer is used. Taps that
// fall outside the input are redirected:
//   - without dilation, the coordinate is clamped to the nearest border pixel;
//   - with dilation, the coordinate is replaced by a fallback coordinate
//     derived from the first tap of the pooling window.
//
//   op                - source of the geometry, the input pointer and the
//                       destination buffer.
//   step_height       - buffer index distance between consecutive output rows.
//   step_width        - multiplied by pooling_height, the buffer index
//                       distance between consecutive output columns.
//   log2_element_size - left shift applied to op->input_pixel_stride to get
//                       the per-pixel stride added to the input address.
void xnn_indirection_init_maxpool2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input               = op->input;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height       = op->input_height;
  const size_t input_width        = op->input_width;
  const size_t output_height      = op->output_height;
  const size_t output_width       = op->output_width;
  const size_t pooling_height     = op->kernel_height;
  const size_t pooling_width      = op->kernel_width;
  const size_t stride_height      = op->stride_height;
  const size_t stride_width       = op->stride_width;
  const size_t dilation_height    = op->dilation_height;
  const size_t dilation_width     = op->dilation_width;
  const size_t input_padding_top  = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  const bool any_dilation = (dilation_height | dilation_width) > 1;

  if (any_dilation) {
    // Clamp to the border doesn't work for pooling with dilation.
    const size_t adjusted_padding_top = input_padding_top % dilation_height;
    const size_t adjusted_padding_left = input_padding_left % dilation_width;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      // The fallback row depends only on output_y, so it is computed once per
      // output row rather than once per pooling element.
      size_t safe_input_y = output_y * stride_height;
      if XNN_UNPREDICTABLE(safe_input_y < adjusted_padding_top) {
        safe_input_y += dilation_height;
      }
      safe_input_y -= adjusted_padding_top;

      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        // Unsigned wrap-around makes rows in the top padding compare as
        // out of range, so one comparison covers both sides.
        size_t input_y = output_y * stride_height + pooling_y * dilation_height - input_padding_top;
        if XNN_UNPREDICTABLE(input_y >= input_height) {
          input_y = safe_input_y;
        }

        for (size_t output_x = 0; output_x < output_width; output_x++) {
          // The fallback column depends only on output_x; compute it once per
          // output column rather than once per pooling element.
          size_t safe_input_x = output_x * stride_width;
          if XNN_UNPREDICTABLE(safe_input_x < adjusted_padding_left) {
            safe_input_x += dilation_width;
          }
          safe_input_x -= adjusted_padding_left;

          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            size_t input_x = output_x * stride_width + pooling_x * dilation_width - input_padding_left;
            if XNN_UNPREDICTABLE(input_x >= input_width) {
              input_x = safe_input_x;
            }

            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  } else {
    const size_t input_x_max = input_width - 1;
    const size_t input_y_max = input_height - 1;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        // Clamp the row into [0, input_y_max]: doz handles the top padding,
        // min handles the bottom.
        const size_t input_y = min(doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top), input_y_max);
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            const size_t input_x = min(doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left), input_x_max);
            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  }
}
327 
// Builds the indirection buffer and float interpolation weights for a
// bilinear resize that reads four input pixels per output pixel.
//
// For every output pixel, in row-major order, four pointers are appended to
// indirection_buffer (top-left, top-right, bottom-left, bottom-right input
// pixel) and two floats to packed_weights (horizontal fraction, then vertical
// fraction).
//
//   input_pixel_stride - multiplier applied to an input pixel index to get
//                        the offset added to the input address.
//   input_height, input_width, output_height, output_width
//                      - image sizes; each is asserted to be non-zero and
//                        below 2**24.
//   input              - base address of the input image.
//   indirection_buffer - receives 4 pointers per output pixel.
//   packed_weights     - receives 2 floats per output pixel.
//   align_corners      - when set and the output extent is not 1, the scale is
//                        (input - 1) / (output - 1) instead of input / output.
//   tensorflow_legacy  - when this or align_corners is set, the source
//                        coordinate is output * scale; otherwise it is
//                        output * scale + (0.5 * scale - 0.5), clamped to
//                        [0, input - 1].
void xnn_indirection_init_resize_bilinear2d_hwc_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t corner_adjust_x = (int32_t) (align_corners && output_width != 1);
  const int32_t corner_adjust_y = (int32_t) (align_corners && output_height != 1);
  const float scale_x =
    (float) ((int32_t) input_width - corner_adjust_x) / (float) ((int32_t) output_width - corner_adjust_x);
  const float scale_y =
    (float) ((int32_t) input_height - corner_adjust_y) / (float) ((int32_t) output_height - corner_adjust_y);

  const uintptr_t input_address = (uintptr_t) input;
  const uint32_t last_row = (uint32_t) input_height - 1;
  const uint32_t last_col = (uint32_t) input_width - 1;

  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float src_y = (float) (int32_t) output_y * scale_y;
      assert(src_y >= 0.0f);
      assert(src_y < (float) input_height);

      const uint32_t top = (uint32_t) (int32_t) src_y;
      const uint32_t bottom = math_min_u32(top + 1, last_row);
      const float frac_y = src_y - (float) top;
      // Pixel index of the first pixel in the top and bottom source rows.
      const size_t top_row = top * input_width;
      const size_t bottom_row = bottom * input_width;

      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float src_x = (float) (int32_t) output_x * scale_x;
        assert(src_x >= 0.0f);
        assert(src_x < (float) input_width);

        const uint32_t left = (uint32_t) (int32_t) src_x;
        const uint32_t right = math_min_u32(left + 1, last_col);
        const float frac_x = src_x - (float) left;

        *indirection_buffer++ = (const void*) (input_address + (top_row + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (top_row + right) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_row + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_row + right) * input_pixel_stride);
        *packed_weights++ = frac_x;
        *packed_weights++ = frac_y;
      }
    }
  } else {
    const float offset_y = 0.5f * scale_y - 0.5f;
    const float offset_x = 0.5f * scale_x - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float raw_y = (float) (int32_t) output_y * scale_y + offset_y;
      const float src_y = math_min_f32(math_max_f32(raw_y, 0.0f), (float) last_row);
      const uint32_t top = (uint32_t) (int32_t) src_y;
      assert((int32_t) top >= 0);
      const uint32_t bottom = math_min_u32(top + 1, last_row);
      const float frac_y = src_y - (float) top;
      // Pixel index of the first pixel in the top and bottom source rows.
      const size_t top_row = top * input_width;
      const size_t bottom_row = bottom * input_width;

      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float raw_x = (float) (int32_t) output_x * scale_x + offset_x;
        const float src_x = math_min_f32(math_max_f32(raw_x, 0.0f), (float) last_col);
        const uint32_t left = (uint32_t) (int32_t) src_x;
        assert((int32_t) left >= 0);
        const uint32_t right = math_min_u32(left + 1, last_col);
        const float frac_x = src_x - (float) left;

        *indirection_buffer++ = (const void*) (input_address + (top_row + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (top_row + right) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_row + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_row + right) * input_pixel_stride);
        *packed_weights++ = frac_x;
        *packed_weights++ = frac_y;
      }
    }
  }
}
422 
// Builds the indirection buffer and fixed-point interpolation weights for a
// bilinear resize that reads four input pixels per output pixel.
//
// For every output pixel, in row-major order, four pointers are appended to
// indirection_buffer (top-left, top-right, bottom-left, bottom-right input
// pixel) and two int16_t values to packed_weights: the horizontal and then
// the vertical fraction, each multiplied by 2**11 and rounded with lrintf.
//
//   input_pixel_stride - multiplier applied to an input pixel index to get
//                        the offset added to the input address.
//   input_height, input_width, output_height, output_width
//                      - image sizes; each is asserted to be non-zero and
//                        below 2**24.
//   input              - base address of the input image.
//   indirection_buffer - receives 4 pointers per output pixel.
//   packed_weights     - receives 2 int16_t weights per output pixel.
//   align_corners      - when set and the output extent is not 1, the scale is
//                        (input - 1) / (output - 1) instead of input / output.
//   tensorflow_legacy  - when this or align_corners is set, the source
//                        coordinate is output * scale; otherwise it is
//                        output * scale + (0.5 * scale - 0.5), clamped to
//                        [0, input - 1].
void xnn_indirection_init_resize_bilinear2d_hwc_q11(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  int16_t* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t corner_adjust_x = (int32_t) (align_corners && output_width != 1);
  const int32_t corner_adjust_y = (int32_t) (align_corners && output_height != 1);
  const float scale_x =
    (float) ((int32_t) input_width - corner_adjust_x) / (float) ((int32_t) output_width - corner_adjust_x);
  const float scale_y =
    (float) ((int32_t) input_height - corner_adjust_y) / (float) ((int32_t) output_height - corner_adjust_y);

  // Fractions are stored with 11 fractional bits.
  const float q11_one = 0x1.0p+11f;

  const uintptr_t input_address = (uintptr_t) input;
  const uint32_t last_row = (uint32_t) input_height - 1;
  const uint32_t last_col = (uint32_t) input_width - 1;

  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float src_y = (float) (int32_t) output_y * scale_y;
      assert(src_y >= 0.0f);
      assert(src_y < (float) input_height);

      const uint32_t top = (uint32_t) (int32_t) src_y;
      const uint32_t bottom = math_min_u32(top + 1, last_row);
      const float frac_y = src_y - (float) top;
      // Pixel index of the first pixel in the top and bottom source rows.
      const size_t top_row = top * input_width;
      const size_t bottom_row = bottom * input_width;

      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float src_x = (float) (int32_t) output_x * scale_x;
        assert(src_x >= 0.0f);
        assert(src_x < (float) input_width);

        const uint32_t left = (uint32_t) (int32_t) src_x;
        const uint32_t right = math_min_u32(left + 1, last_col);
        const float frac_x = src_x - (float) left;

        *indirection_buffer++ = (const void*) (input_address + (top_row + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (top_row + right) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_row + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_row + right) * input_pixel_stride);
        *packed_weights++ = (int16_t) lrintf(frac_x * q11_one);
        *packed_weights++ = (int16_t) lrintf(frac_y * q11_one);
      }
    }
  } else {
    const float offset_y = 0.5f * scale_y - 0.5f;
    const float offset_x = 0.5f * scale_x - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float raw_y = (float) (int32_t) output_y * scale_y + offset_y;
      const float src_y = math_min_f32(math_max_f32(raw_y, 0.0f), (float) last_row);
      const uint32_t top = (uint32_t) (int32_t) src_y;
      assert((int32_t) top >= 0);
      const uint32_t bottom = math_min_u32(top + 1, last_row);
      const float frac_y = src_y - (float) top;
      // Pixel index of the first pixel in the top and bottom source rows.
      const size_t top_row = top * input_width;
      const size_t bottom_row = bottom * input_width;

      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float raw_x = (float) (int32_t) output_x * scale_x + offset_x;
        const float src_x = math_min_f32(math_max_f32(raw_x, 0.0f), (float) last_col);
        const uint32_t left = (uint32_t) (int32_t) src_x;
        assert((int32_t) left >= 0);
        const uint32_t right = math_min_u32(left + 1, last_col);
        const float frac_x = src_x - (float) left;

        *indirection_buffer++ = (const void*) (input_address + (top_row + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (top_row + right) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_row + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_row + right) * input_pixel_stride);
        *packed_weights++ = (int16_t) lrintf(frac_x * q11_one);
        *packed_weights++ = (int16_t) lrintf(frac_y * q11_one);
      }
    }
  }
}
517 
// Builds the indirection buffer and float interpolation weights for a
// bilinear resize that stores two pointers per output pixel.
//
// For every output pixel, in row-major order, two pointers are appended to
// indirection_buffer (the left pixel in the top source row, then the left
// pixel in the bottom source row) and two floats to packed_weights
// (horizontal fraction, then vertical fraction). When the left pixel would be
// the last column, it is moved one column to the left and the horizontal
// fraction is set to 1, so that a pixel always exists to its right.
//
//   input_pixel_stride - multiplier applied to an input pixel index to get
//                        the offset added to the input address.
//   input_height, input_width
//                      - input size; each is asserted to be greater than 1
//                        and below 2**24.
//   output_height, output_width
//                      - output size; each is asserted to be non-zero and
//                        below 2**24.
//   input              - base address of the input image.
//   indirection_buffer - receives 2 pointers per output pixel.
//   packed_weights     - receives 2 floats per output pixel.
//   align_corners      - when set and the output extent is not 1, the scale is
//                        (input - 1) / (output - 1) instead of input / output.
//   tensorflow_legacy  - when this or align_corners is set, the source
//                        coordinate is output * scale; otherwise it is
//                        output * scale + (0.5 * scale - 0.5), clamped to
//                        [0, input - 1].
void xnn_indirection_init_resize_bilinear2d_chw_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height > 1);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width > 1);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t corner_adjust_x = (int32_t) (align_corners && output_width != 1);
  const int32_t corner_adjust_y = (int32_t) (align_corners && output_height != 1);
  const float scale_x =
    (float) ((int32_t) input_width - corner_adjust_x) / (float) ((int32_t) output_width - corner_adjust_x);
  const float scale_y =
    (float) ((int32_t) input_height - corner_adjust_y) / (float) ((int32_t) output_height - corner_adjust_y);

  const uintptr_t input_address = (uintptr_t) input;
  const uint32_t last_row = (uint32_t) input_height - 1;
  const uint32_t last_col = (uint32_t) input_width - 1;

  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float src_y = (float) (int32_t) output_y * scale_y;
      assert(src_y >= 0.0f);
      assert(src_y < (float) input_height);

      const uint32_t top = (uint32_t) (int32_t) src_y;
      const uint32_t bottom = math_min_u32(top + 1, last_row);
      const float frac_y = src_y - (float) top;
      // Pixel index of the first pixel in the top and bottom source rows.
      const size_t top_row = top * input_width;
      const size_t bottom_row = bottom * input_width;

      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float src_x = (float) (int32_t) output_x * scale_x;
        assert(src_x >= 0.0f);
        assert(src_x < (float) input_width);

        const uint32_t nearest = (uint32_t) (int32_t) src_x;
        // Ensure that there is a pixel to the right of the one pointed at,
        // as required by some CHW kernels.
        const bool at_last_col = nearest == last_col;
        const uint32_t left = at_last_col ? nearest - 1 : nearest;
        const float frac_x = at_last_col ? 1.0f : src_x - (float) nearest;

        *indirection_buffer++ = (const void*) (input_address + (top_row + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_row + left) * input_pixel_stride);
        *packed_weights++ = frac_x;
        *packed_weights++ = frac_y;
      }
    }
  } else {
    const float offset_y = 0.5f * scale_y - 0.5f;
    const float offset_x = 0.5f * scale_x - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float raw_y = (float) (int32_t) output_y * scale_y + offset_y;
      const float src_y = math_min_f32(math_max_f32(raw_y, 0.0f), (float) last_row);
      const uint32_t top = (uint32_t) (int32_t) src_y;
      assert((int32_t) top >= 0);
      const uint32_t bottom = math_min_u32(top + 1, last_row);
      const float frac_y = src_y - (float) top;
      // Pixel index of the first pixel in the top and bottom source rows.
      const size_t top_row = top * input_width;
      const size_t bottom_row = bottom * input_width;

      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float raw_x = (float) (int32_t) output_x * scale_x + offset_x;
        const float src_x = math_min_f32(math_max_f32(raw_x, 0.0f), (float) last_col);
        const uint32_t nearest = (uint32_t) (int32_t) src_x;
        assert((int32_t) nearest >= 0);

        // Ensure that there is a pixel to the right of the one pointed at,
        // as required by some CHW kernels.
        const bool at_last_col = nearest == last_col;
        const uint32_t left = at_last_col ? nearest - 1 : nearest;
        const float frac_x = at_last_col ? 1.0f : src_x - (float) nearest;

        *indirection_buffer++ = (const void*) (input_address + (top_row + left) * input_pixel_stride);
        *indirection_buffer++ = (const void*) (input_address + (bottom_row + left) * input_pixel_stride);
        *packed_weights++ = frac_x;
        *packed_weights++ = frac_y;
      }
    }
  }
}
617 
// Populates op->indirection_buffer with pointers into op->output for 2D
// unpooling.
//
// For every image from batch_start up to op->batch_size, every input pixel
// and every pooling element, the entry at
//   (((image * input_height + input_y) * input_width + input_x)
//       * pooling_width + pooling_x) * pooling_height + pooling_y
// addresses the output pixel at row
//   min(doz(input_y * pooling_height + pooling_y, padding_top), output_height - 1)
// and the analogous column of the same image.
//
//   op                - source of the geometry, the output pointer and the
//                       destination buffer.
//   batch_start       - index of the first image whose entries are written.
//   log2_element_size - left shift applied to op->output_pixel_stride to get
//                       the per-pixel stride added to the output address.
void xnn_indirection_init_unpool2d(
  xnn_operator_t op,
  size_t batch_start,
  uint32_t log2_element_size)
{
  const void** entries           = op->indirection_buffer;
  const uintptr_t output_address = (uintptr_t) op->output;
  const size_t pixel_stride      = op->output_pixel_stride << log2_element_size;
  const size_t image_count       = op->batch_size;
  const size_t in_height         = op->input_height;
  const size_t in_width          = op->input_width;
  const size_t out_height        = op->output_height;
  const size_t out_width         = op->output_width;
  const size_t pool_height       = op->kernel_height;
  const size_t pool_width        = op->kernel_width;
  const size_t pad_top           = op->padding_top;
  const size_t pad_left          = op->padding_left;

  for (size_t image = batch_start; image < image_count; image++) {
    for (size_t in_y = 0; in_y < in_height; in_y++) {
      // Index of this input row across the whole batch.
      const size_t in_row = image * in_height + in_y;
      for (size_t py = 0; py < pool_height; py++) {
        const size_t out_y = min(doz(in_y * pool_height + py, pad_top), out_height - 1);
        // Pixel index of the first pixel in the selected output row.
        const size_t out_row_start = (image * out_height + out_y) * out_width;
        for (size_t in_x = 0; in_x < in_width; in_x++) {
          const size_t in_pixel = in_row * in_width + in_x;
          for (size_t px = 0; px < pool_width; px++) {
            const size_t out_x = min(doz(in_x * pool_width + px, pad_left), out_width - 1);
            const size_t slot = (in_pixel * pool_width + px) * pool_height + py;
            entries[slot] =
              (const void*) (output_address + (out_row_start + out_x) * pixel_stride);
          }
        }
      }
    }
  }
}
651