• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-ibilinear-chw/neon.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2020 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <arm_neon.h>
13 
14 #include <xnnpack/ibilinear.h>
15 
16 
// Bilinear interpolation micro-kernel (NEON with FMA), up to 8 output pixels
// per main-loop iteration.
//
// For every output pixel the indirection buffer `input` supplies two row
// pointers (top, bottom); a 2-float load from each yields the left and right
// samples. `weights` stores one (alpha_h, alpha_v) pair per output pixel.
// Each pixel is computed as
//     left  = top_left  + alpha_v * (bottom_left  - top_left)
//     right = top_right + alpha_v * (bottom_right - top_right)
//     out   = left      + alpha_h * (right        - left)
//
// The same `input` pointers and `weights` are replayed for each of the
// `channels` passes; `input_offset` (a byte offset added to every row pointer)
// grows by `input_increment` after each pass, and `output` advances
// continuously across passes.
void xnn_f32_ibilinear_chw_ukernel__neonfma_p8(
    size_t output_pixels,
    size_t channels,
    const float**restrict input,
    size_t input_offset,
    const float*restrict weights,
    float*restrict output,
    size_t input_increment) XNN_DISABLE_TSAN
{
  assert(output_pixels != 0);
  assert(channels != 0);
  assert(input_increment % sizeof(float) == 0);

  do {
    const float** rows = input;
    const float* wts = weights;
    size_t remaining = output_pixels;

    // Main loop: 8 output pixels at a time.
    while (remaining >= 8) {
      // Even row index -> top row, odd row index -> bottom row of a pixel.
      // Each load fetches the {left, right} pair of that row.
      const float32x2_t vtop0 = vld1_f32((const float*) ((uintptr_t) rows[0] + input_offset));
      const float32x2_t vbot0 = vld1_f32((const float*) ((uintptr_t) rows[1] + input_offset));
      const float32x2_t vtop1 = vld1_f32((const float*) ((uintptr_t) rows[2] + input_offset));
      const float32x2_t vbot1 = vld1_f32((const float*) ((uintptr_t) rows[3] + input_offset));
      const float32x2_t vtop2 = vld1_f32((const float*) ((uintptr_t) rows[4] + input_offset));
      const float32x2_t vbot2 = vld1_f32((const float*) ((uintptr_t) rows[5] + input_offset));
      const float32x2_t vtop3 = vld1_f32((const float*) ((uintptr_t) rows[6] + input_offset));
      const float32x2_t vbot3 = vld1_f32((const float*) ((uintptr_t) rows[7] + input_offset));
      const float32x2_t vtop4 = vld1_f32((const float*) ((uintptr_t) rows[8] + input_offset));
      const float32x2_t vbot4 = vld1_f32((const float*) ((uintptr_t) rows[9] + input_offset));
      const float32x2_t vtop5 = vld1_f32((const float*) ((uintptr_t) rows[10] + input_offset));
      const float32x2_t vbot5 = vld1_f32((const float*) ((uintptr_t) rows[11] + input_offset));
      const float32x2_t vtop6 = vld1_f32((const float*) ((uintptr_t) rows[12] + input_offset));
      const float32x2_t vbot6 = vld1_f32((const float*) ((uintptr_t) rows[13] + input_offset));
      const float32x2_t vtop7 = vld1_f32((const float*) ((uintptr_t) rows[14] + input_offset));
      const float32x2_t vbot7 = vld1_f32((const float*) ((uintptr_t) rows[15] + input_offset));
      rows += 16;

      // De-interleaving load: val[0] holds alpha_h, val[1] holds alpha_v.
      const float32x4x2_t vw_lo = vld2q_f32(wts);
      const float32x4x2_t vw_hi = vld2q_f32(wts + 8);
      wts += 16;

      // Pack two pixels per register: {l0, r0, l1, r1}.
      const float32x4_t vtop01 = vcombine_f32(vtop0, vtop1);
      const float32x4_t vtop23 = vcombine_f32(vtop2, vtop3);
      const float32x4_t vtop45 = vcombine_f32(vtop4, vtop5);
      const float32x4_t vtop67 = vcombine_f32(vtop6, vtop7);
      const float32x4_t vbot01 = vcombine_f32(vbot0, vbot1);
      const float32x4_t vbot23 = vcombine_f32(vbot2, vbot3);
      const float32x4_t vbot45 = vcombine_f32(vbot4, vbot5);
      const float32x4_t vbot67 = vcombine_f32(vbot6, vbot7);

      // Vertical differences (bottom - top) for both left and right samples.
      const float32x4_t vdiff01 = vsubq_f32(vbot01, vtop01);
      const float32x4_t vdiff23 = vsubq_f32(vbot23, vtop23);
      const float32x4_t vdiff45 = vsubq_f32(vbot45, vtop45);
      const float32x4_t vdiff67 = vsubq_f32(vbot67, vtop67);

      // Unzip: val[0] collects the left samples, val[1] the right samples.
      const float32x4x2_t vdiff_lo = vuzpq_f32(vdiff01, vdiff23);
      const float32x4x2_t vdiff_hi = vuzpq_f32(vdiff45, vdiff67);
      const float32x4x2_t vtop_lo = vuzpq_f32(vtop01, vtop23);
      const float32x4x2_t vtop_hi = vuzpq_f32(vtop45, vtop67);

      // Vertical interpolation of the left and right columns.
      const float32x4_t vleft_lo = vfmaq_f32(vtop_lo.val[0], vdiff_lo.val[0], vw_lo.val[1]);
      const float32x4_t vright_lo = vfmaq_f32(vtop_lo.val[1], vdiff_lo.val[1], vw_lo.val[1]);
      const float32x4_t vleft_hi = vfmaq_f32(vtop_hi.val[0], vdiff_hi.val[0], vw_hi.val[1]);
      const float32x4_t vright_hi = vfmaq_f32(vtop_hi.val[1], vdiff_hi.val[1], vw_hi.val[1]);

      // Horizontal interpolation between the two columns, then store.
      const float32x4_t vspan_lo = vsubq_f32(vright_lo, vleft_lo);
      const float32x4_t vspan_hi = vsubq_f32(vright_hi, vleft_hi);
      vst1q_f32(output, vfmaq_f32(vleft_lo, vspan_lo, vw_lo.val[0]));
      vst1q_f32(output + 4, vfmaq_f32(vleft_hi, vspan_hi, vw_hi.val[0]));
      output += 8;

      remaining -= 8;
    }

    // Same scheme for a group of 4 output pixels.
    while (remaining >= 4) {
      const float32x2_t vtop0 = vld1_f32((const float*) ((uintptr_t) rows[0] + input_offset));
      const float32x2_t vbot0 = vld1_f32((const float*) ((uintptr_t) rows[1] + input_offset));
      const float32x2_t vtop1 = vld1_f32((const float*) ((uintptr_t) rows[2] + input_offset));
      const float32x2_t vbot1 = vld1_f32((const float*) ((uintptr_t) rows[3] + input_offset));
      const float32x2_t vtop2 = vld1_f32((const float*) ((uintptr_t) rows[4] + input_offset));
      const float32x2_t vbot2 = vld1_f32((const float*) ((uintptr_t) rows[5] + input_offset));
      const float32x2_t vtop3 = vld1_f32((const float*) ((uintptr_t) rows[6] + input_offset));
      const float32x2_t vbot3 = vld1_f32((const float*) ((uintptr_t) rows[7] + input_offset));
      rows += 8;

      // val[0]: alpha_h, val[1]: alpha_v.
      const float32x4x2_t vw = vld2q_f32(wts);
      wts += 8;

      const float32x4_t vtop01 = vcombine_f32(vtop0, vtop1);
      const float32x4_t vtop23 = vcombine_f32(vtop2, vtop3);
      const float32x4_t vbot01 = vcombine_f32(vbot0, vbot1);
      const float32x4_t vbot23 = vcombine_f32(vbot2, vbot3);

      const float32x4_t vdiff01 = vsubq_f32(vbot01, vtop01);
      const float32x4_t vdiff23 = vsubq_f32(vbot23, vtop23);

      const float32x4x2_t vdiff = vuzpq_f32(vdiff01, vdiff23);
      const float32x4x2_t vtop = vuzpq_f32(vtop01, vtop23);

      const float32x4_t vleft = vfmaq_f32(vtop.val[0], vdiff.val[0], vw.val[1]);
      const float32x4_t vright = vfmaq_f32(vtop.val[1], vdiff.val[1], vw.val[1]);

      const float32x4_t vspan = vsubq_f32(vright, vleft);
      vst1q_f32(output, vfmaq_f32(vleft, vspan, vw.val[0]));
      output += 4;

      remaining -= 4;
    }

    // Tail: at most 3 pixels are left at this point.
    if XNN_UNLIKELY(remaining != 0) {
      if (remaining & 2) {
        // Two pixels, using 64-bit vectors throughout.
        const float32x2x2_t vw = vld2_f32(wts);
        wts += 4;

        const float32x2_t vtop0 = vld1_f32((const float*) ((uintptr_t) rows[0] + input_offset));
        const float32x2_t vbot0 = vld1_f32((const float*) ((uintptr_t) rows[1] + input_offset));
        const float32x2_t vtop1 = vld1_f32((const float*) ((uintptr_t) rows[2] + input_offset));
        const float32x2_t vbot1 = vld1_f32((const float*) ((uintptr_t) rows[3] + input_offset));
        rows += 4;

        const float32x2_t vdiff0 = vsub_f32(vbot0, vtop0);
        const float32x2_t vdiff1 = vsub_f32(vbot1, vtop1);

        // val[0]: left samples of both pixels, val[1]: right samples.
        const float32x2x2_t vdiff = vuzp_f32(vdiff0, vdiff1);
        const float32x2x2_t vtop = vuzp_f32(vtop0, vtop1);

        const float32x2_t vleft = vfma_f32(vtop.val[0], vdiff.val[0], vw.val[1]);
        const float32x2_t vright = vfma_f32(vtop.val[1], vdiff.val[1], vw.val[1]);

        const float32x2_t vspan = vsub_f32(vright, vleft);
        vst1_f32(output, vfma_f32(vleft, vspan, vw.val[0]));
        output += 2;
      }

      if (remaining & 1) {
        // Single pixel. The full bilinear expression
        //   (1 - alpha_h) * (1 - alpha_v) * top_left +
        //        alpha_h  * (1 - alpha_v) * top_right +
        //   (1 - alpha_h) *      alpha_v  * bottom_left +
        //        alpha_h  *      alpha_v  * bottom_right
        // is evaluated in its rearranged form:
        //   left   = top_left  + alpha_v * (bottom_left  - top_left)
        //   right  = top_right + alpha_v * (bottom_right - top_right)
        //   result = left      + alpha_h * (right        - left)
        const float alpha_h = *wts;
        const float32x2_t valpha_v = vld1_dup_f32(wts + 1);
        wts += 2;

        const float32x2_t vtop = vld1_f32((const float*) ((uintptr_t) rows[0] + input_offset));
        const float32x2_t vbot = vld1_f32((const float*) ((uintptr_t) rows[1] + input_offset));
        rows += 2;

        // One vector op handles the left (lane 0) and right (lane 1) columns together.
        const float32x2_t vdiff = vsub_f32(vbot, vtop);
        const float32x2_t vleft_right = vfma_f32(vtop, vdiff, valpha_v);

        // Finish the horizontal blend in scalar code.
        const float left = vget_lane_f32(vleft_right, 0);
        const float right = vget_lane_f32(vleft_right, 1);

        *output++ = left + alpha_h * (right - left);
      }
    }

    // Move on to the next channel's data.
    input_offset += input_increment;
  } while (--channels != 0);
}
256