// Auto-generated file. Do not edit!
// Template: src/f32-dwconv2d-chw/3x3p1-scalar.c.in
// Generator: tools/xngen
//
// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
9
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <xnnpack/dwconv.h>
#include <xnnpack/math.h>
14
15
xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1(size_t input_height,size_t input_width,const float * input,const float * weights,const float * zero,float * output,uint32_t padding_top,const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS (1)])16 void xnn_f32_dwconv2d_chw_ukernel_3x3p1__scalar_4x1(
17 size_t input_height,
18 size_t input_width,
19 const float* input,
20 const float* weights,
21 const float* zero,
22 float* output,
23 uint32_t padding_top,
24 const union xnn_f32_chw_params params[restrict XNN_MIN_ELEMENTS(1)])
25 {
26 assert(input_height != 0);
27 assert(input_width != 0);
28 assert(input_width % sizeof(float) == 0);
29 assert(padding_top == 1);
30
31 const float vmin = params->scalar.min;
32 const float vmax = params->scalar.max;
33
34 const float vbias = weights[0];
35 const float vk00 = weights[1];
36 const float vk01 = weights[2];
37 const float vk02 = weights[3];
38 const float vk10 = weights[4];
39 const float vk11 = weights[5];
40 const float vk12 = weights[6];
41 const float vk20 = weights[7];
42 const float vk21 = weights[8];
43 const float vk22 = weights[9];
44
45 const float* i0 = zero;
46 const float* i1 = input;
47 const float* i2 = (const float*) ((uintptr_t) i1 + input_width);
48 const float* i3 = (const float*) ((uintptr_t) i2 + input_width);
49 const float* i4 = (const float*) ((uintptr_t) i3 + input_width);
50 const float* i5 = (const float*) ((uintptr_t) i4 + input_width);
51
52 float* o0 = output;
53 float* o1 = (float*) ((uintptr_t) o0 + input_width);
54 float* o2 = (float*) ((uintptr_t) o1 + input_width);
55 float* o3 = (float*) ((uintptr_t) o2 + input_width);
56
57 size_t output_height = input_height;
58 do {
59 if XNN_UNPREDICTABLE(output_height < 2) {
60 i2 = zero;
61 o1 = o0;
62 }
63 if XNN_UNPREDICTABLE(output_height < 3) {
64 i3 = zero;
65 o2 = o1;
66 }
67 if XNN_UNPREDICTABLE(output_height < 4) {
68 i4 = zero;
69 o3 = o2;
70 }
71 if XNN_UNPREDICTABLE(output_height < 5) {
72 i5 = zero;
73 }
74
75 float vi0x0 = 0.0f;
76 float vi1x0 = 0.0f;
77 float vi2x0 = 0.0f;
78 float vi3x0 = 0.0f;
79 float vi4x0 = 0.0f;
80 float vi5x0 = 0.0f;
81
82 float vi0x1 = *i0++;
83 float vi1x1 = *i1++;
84 float vi2x1 = *i2++;
85 float vi3x1 = *i3++;
86 float vi4x1 = *i4++;
87 float vi5x1 = *i5++;
88
89 size_t w = input_width;
90 for (; w > 1 * sizeof(float); w -= 1 * sizeof(float)) {
91 const float vi0x2 = *i0++;
92 const float vi1x2 = *i1++;
93 const float vi2x2 = *i2++;
94 const float vi3x2 = *i3++;
95 const float vi4x2 = *i4++;
96 const float vi5x2 = *i5++;
97
98 float vo0p0 = vbias + vi0x0 * vk00;
99 float vo1p0 = vbias + vi1x0 * vk00;
100 float vo2p0 = vbias + vi2x0 * vk00;
101 float vo3p0 = vbias + vi3x0 * vk00;
102 vo0p0 += vi1x0 * vk10;
103 vo1p0 += vi2x0 * vk10;
104 vo2p0 += vi3x0 * vk10;
105 vo3p0 += vi4x0 * vk10;
106 vo0p0 += vi2x0 * vk20;
107 vo1p0 += vi3x0 * vk20;
108 vo2p0 += vi4x0 * vk20;
109 vo3p0 += vi5x0 * vk20;
110
111 vi0x0 = vi0x1;
112 vi1x0 = vi1x1;
113 vi2x0 = vi2x1;
114 vi3x0 = vi3x1;
115 vi4x0 = vi4x1;
116 vi5x0 = vi5x1;
117
118 vo0p0 += vi0x1 * vk01;
119 vo1p0 += vi1x1 * vk01;
120 vo2p0 += vi2x1 * vk01;
121 vo3p0 += vi3x1 * vk01;
122 vo0p0 += vi1x1 * vk11;
123 vo1p0 += vi2x1 * vk11;
124 vo2p0 += vi3x1 * vk11;
125 vo3p0 += vi4x1 * vk11;
126 vo0p0 += vi2x1 * vk21;
127 vo1p0 += vi3x1 * vk21;
128 vo2p0 += vi4x1 * vk21;
129 vo3p0 += vi5x1 * vk21;
130
131 vi0x1 = vi0x2;
132 vi1x1 = vi1x2;
133 vi2x1 = vi2x2;
134 vi3x1 = vi3x2;
135 vi4x1 = vi4x2;
136 vi5x1 = vi5x2;
137
138 vo0p0 += vi0x2 * vk02;
139 vo1p0 += vi1x2 * vk02;
140 vo2p0 += vi2x2 * vk02;
141 vo3p0 += vi3x2 * vk02;
142 vo0p0 += vi1x2 * vk12;
143 vo1p0 += vi2x2 * vk12;
144 vo2p0 += vi3x2 * vk12;
145 vo3p0 += vi4x2 * vk12;
146 vo0p0 += vi2x2 * vk22;
147 vo1p0 += vi3x2 * vk22;
148 vo2p0 += vi4x2 * vk22;
149 vo3p0 += vi5x2 * vk22;
150
151
152 float vo0 = math_max_f32(vo0p0, vmin);
153 float vo1 = math_max_f32(vo1p0, vmin);
154 float vo2 = math_max_f32(vo2p0, vmin);
155 float vo3 = math_max_f32(vo3p0, vmin);
156
157 vo0 = math_min_f32(vo0, vmax);
158 vo1 = math_min_f32(vo1, vmax);
159 vo2 = math_min_f32(vo2, vmax);
160 vo3 = math_min_f32(vo3, vmax);
161
162 *o3++ = vo3;
163 *o2++ = vo2;
164 *o1++ = vo1;
165 *o0++ = vo0;
166 }
167 // Always process the last pixel separately to account for right edge.
168 assert(w == 1 * sizeof(float));
169 {
170 float vo0p0 = vbias + vi0x0 * vk00;
171 float vo1p0 = vbias + vi1x0 * vk00;
172 float vo2p0 = vbias + vi2x0 * vk00;
173 float vo3p0 = vbias + vi3x0 * vk00;
174 vo0p0 += vi1x0 * vk10;
175 vo1p0 += vi2x0 * vk10;
176 vo2p0 += vi3x0 * vk10;
177 vo3p0 += vi4x0 * vk10;
178 vo0p0 += vi2x0 * vk20;
179 vo1p0 += vi3x0 * vk20;
180 vo2p0 += vi4x0 * vk20;
181 vo3p0 += vi5x0 * vk20;
182
183 vo0p0 += vi0x1 * vk01;
184 vo1p0 += vi1x1 * vk01;
185 vo2p0 += vi2x1 * vk01;
186 vo3p0 += vi3x1 * vk01;
187 vo0p0 += vi1x1 * vk11;
188 vo1p0 += vi2x1 * vk11;
189 vo2p0 += vi3x1 * vk11;
190 vo3p0 += vi4x1 * vk11;
191 vo0p0 += vi2x1 * vk21;
192 vo1p0 += vi3x1 * vk21;
193 vo2p0 += vi4x1 * vk21;
194 vo3p0 += vi5x1 * vk21;
195
196
197 float vo0 = math_max_f32(vo0p0, vmin);
198 float vo1 = math_max_f32(vo1p0, vmin);
199 float vo2 = math_max_f32(vo2p0, vmin);
200 float vo3 = math_max_f32(vo3p0, vmin);
201
202 vo0 = math_min_f32(vo0, vmax);
203 vo1 = math_min_f32(vo1, vmax);
204 vo2 = math_min_f32(vo2, vmax);
205 vo3 = math_min_f32(vo3, vmax);
206
207 *o3++ = vo3;
208 *o2++ = vo2;
209 *o1++ = vo1;
210 *o0++ = vo0;
211 }
212
213 i0 = (const float*) ((uintptr_t) i4 - input_width);
214 i1 = (const float*) ((uintptr_t) i5 - input_width);
215 i2 = (const float*) ((uintptr_t) i1 + input_width);
216 i3 = (const float*) ((uintptr_t) i2 + input_width);
217 i4 = (const float*) ((uintptr_t) i3 + input_width);
218 i5 = (const float*) ((uintptr_t) i4 + input_width);
219
220 o0 = o3;
221 o1 = (float*) ((uintptr_t) o0 + input_width);
222 o2 = (float*) ((uintptr_t) o1 + input_width);
223 o3 = (float*) ((uintptr_t) o2 + input_width);
224
225 output_height = doz(output_height, 4);
226 } while (output_height != 0);
227 }
228