• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Auto-generated file. Do not edit!
2 //   Template: src/f32-dwconv/up-avx512.c.in
3 //   Generator: tools/xngen
4 //
5 // Copyright 2019 Google LLC
6 //
7 // This source code is licensed under the BSD-style license found in the
8 // LICENSE file in the root directory of this source tree.
9 
10 #include <assert.h>
11 
12 #include <immintrin.h>
13 
14 #include <xnnpack/dwconv.h>
15 #include <xnnpack/intrinsics-polyfill.h>
16 
17 
xnn_f32_dwconv_ukernel_up32x25__avx512f_acc2(size_t channels,size_t output_width,const float ** input,const float * weights,float * output,size_t input_stride,size_t output_increment,const union xnn_f32_output_params params[restrict static1])18 void xnn_f32_dwconv_ukernel_up32x25__avx512f_acc2(
19     size_t channels,
20     size_t output_width,
21     const float** input,
22     const float* weights,
23     float* output,
24     size_t input_stride,
25     size_t output_increment,
26     const union xnn_f32_output_params params[restrict static 1])
27 {
28   assert(channels != 0);
29   assert(output_width != 0);
30 
31   const __m512 vmax = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.max));
32   const __m512 vmin = _mm512_broadcast_f32x4(_mm_load_ps(params->sse.min));
33   do {
34     const float* i0 = input[0];
35     assert(i0 != NULL);
36     const float* i1 = input[1];
37     assert(i1 != NULL);
38     const float* i2 = input[2];
39     assert(i2 != NULL);
40     const float* i3 = input[3];
41     assert(i3 != NULL);
42     const float* i4 = input[4];
43     assert(i4 != NULL);
44     const float* i5 = input[5];
45     assert(i5 != NULL);
46     const float* i6 = input[6];
47     assert(i6 != NULL);
48     const float* i7 = input[7];
49     assert(i7 != NULL);
50     const float* i8 = input[8];
51     assert(i8 != NULL);
52     const float* i9 = input[9];
53     assert(i9 != NULL);
54     const float* i10 = input[10];
55     assert(i10 != NULL);
56     const float* i11 = input[11];
57     assert(i11 != NULL);
58     const float* i12 = input[12];
59     assert(i12 != NULL);
60     const float* i13 = input[13];
61     assert(i13 != NULL);
62     const float* i14 = input[14];
63     assert(i14 != NULL);
64     const float* i15 = input[15];
65     assert(i15 != NULL);
66     const float* i16 = input[16];
67     assert(i16 != NULL);
68     const float* i17 = input[17];
69     assert(i17 != NULL);
70     const float* i18 = input[18];
71     assert(i18 != NULL);
72     const float* i19 = input[19];
73     assert(i19 != NULL);
74     const float* i20 = input[20];
75     assert(i20 != NULL);
76     const float* i21 = input[21];
77     assert(i21 != NULL);
78     const float* i22 = input[22];
79     assert(i22 != NULL);
80     const float* i23 = input[23];
81     assert(i23 != NULL);
82     const float* i24 = input[24];
83     assert(i24 != NULL);
84     input = (const float**) ((uintptr_t) input + input_stride);
85 
86     size_t c = channels;
87     const float* w = weights;
88     for (; c >= 32; c -= 32) {
89       __m512 vacc0123456789ABCDEFp0 = _mm512_load_ps(w);
90       __m512 vaccGHIJKLMNOPQRSTUVp0 = _mm512_load_ps(w + 16);
91 
92 
93       const __m512 vi0x0123456789ABCDEF = _mm512_loadu_ps(i0);
94       const __m512 vi0xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i0 + 16);
95       i0 += 32;
96 
97       const __m512 vk0x0123456789ABCDEF = _mm512_load_ps(w + 32);
98       const __m512 vk0xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 48);
99       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF, vacc0123456789ABCDEFp0);
100       vaccGHIJKLMNOPQRSTUVp0 = _mm512_fmadd_ps(vi0xGHIJKLMNOPQRSTUV, vk0xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp0);
101 
102       const __m512 vi1x0123456789ABCDEF = _mm512_loadu_ps(i1);
103       const __m512 vi1xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i1 + 16);
104       i1 += 32;
105 
106       const __m512 vk1x0123456789ABCDEF = _mm512_load_ps(w + 64);
107       const __m512 vk1xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 80);
108       __m512 vacc0123456789ABCDEFp1 = _mm512_mul_ps(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF);
109       __m512 vaccGHIJKLMNOPQRSTUVp1 = _mm512_mul_ps(vi1xGHIJKLMNOPQRSTUV, vk1xGHIJKLMNOPQRSTUV);
110 
111       const __m512 vi2x0123456789ABCDEF = _mm512_loadu_ps(i2);
112       const __m512 vi2xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i2 + 16);
113       i2 += 32;
114 
115       const __m512 vk2x0123456789ABCDEF = _mm512_load_ps(w + 96);
116       const __m512 vk2xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 112);
117       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF, vacc0123456789ABCDEFp0);
118       vaccGHIJKLMNOPQRSTUVp0 = _mm512_fmadd_ps(vi2xGHIJKLMNOPQRSTUV, vk2xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp0);
119 
120       const __m512 vi3x0123456789ABCDEF = _mm512_loadu_ps(i3);
121       const __m512 vi3xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i3 + 16);
122       i3 += 32;
123 
124       const __m512 vk3x0123456789ABCDEF = _mm512_load_ps(w + 128);
125       const __m512 vk3xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 144);
126       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF, vacc0123456789ABCDEFp1);
127       vaccGHIJKLMNOPQRSTUVp1 = _mm512_fmadd_ps(vi3xGHIJKLMNOPQRSTUV, vk3xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp1);
128 
129       const __m512 vi4x0123456789ABCDEF = _mm512_loadu_ps(i4);
130       const __m512 vi4xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i4 + 16);
131       i4 += 32;
132 
133       const __m512 vk4x0123456789ABCDEF = _mm512_load_ps(w + 160);
134       const __m512 vk4xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 176);
135       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF, vacc0123456789ABCDEFp0);
136       vaccGHIJKLMNOPQRSTUVp0 = _mm512_fmadd_ps(vi4xGHIJKLMNOPQRSTUV, vk4xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp0);
137 
138       const __m512 vi5x0123456789ABCDEF = _mm512_loadu_ps(i5);
139       const __m512 vi5xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i5 + 16);
140       i5 += 32;
141 
142       const __m512 vk5x0123456789ABCDEF = _mm512_load_ps(w + 192);
143       const __m512 vk5xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 208);
144       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF, vacc0123456789ABCDEFp1);
145       vaccGHIJKLMNOPQRSTUVp1 = _mm512_fmadd_ps(vi5xGHIJKLMNOPQRSTUV, vk5xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp1);
146 
147       const __m512 vi6x0123456789ABCDEF = _mm512_loadu_ps(i6);
148       const __m512 vi6xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i6 + 16);
149       i6 += 32;
150 
151       const __m512 vk6x0123456789ABCDEF = _mm512_load_ps(w + 224);
152       const __m512 vk6xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 240);
153       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF, vacc0123456789ABCDEFp0);
154       vaccGHIJKLMNOPQRSTUVp0 = _mm512_fmadd_ps(vi6xGHIJKLMNOPQRSTUV, vk6xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp0);
155 
156       const __m512 vi7x0123456789ABCDEF = _mm512_loadu_ps(i7);
157       const __m512 vi7xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i7 + 16);
158       i7 += 32;
159 
160       const __m512 vk7x0123456789ABCDEF = _mm512_load_ps(w + 256);
161       const __m512 vk7xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 272);
162       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF, vacc0123456789ABCDEFp1);
163       vaccGHIJKLMNOPQRSTUVp1 = _mm512_fmadd_ps(vi7xGHIJKLMNOPQRSTUV, vk7xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp1);
164 
165       const __m512 vi8x0123456789ABCDEF = _mm512_loadu_ps(i8);
166       const __m512 vi8xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i8 + 16);
167       i8 += 32;
168 
169       const __m512 vk8x0123456789ABCDEF = _mm512_load_ps(w + 288);
170       const __m512 vk8xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 304);
171       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF, vacc0123456789ABCDEFp0);
172       vaccGHIJKLMNOPQRSTUVp0 = _mm512_fmadd_ps(vi8xGHIJKLMNOPQRSTUV, vk8xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp0);
173 
174       const __m512 vi9x0123456789ABCDEF = _mm512_loadu_ps(i9);
175       const __m512 vi9xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i9 + 16);
176       i9 += 32;
177 
178       const __m512 vk9x0123456789ABCDEF = _mm512_load_ps(w + 320);
179       const __m512 vk9xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 336);
180       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi9x0123456789ABCDEF, vk9x0123456789ABCDEF, vacc0123456789ABCDEFp1);
181       vaccGHIJKLMNOPQRSTUVp1 = _mm512_fmadd_ps(vi9xGHIJKLMNOPQRSTUV, vk9xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp1);
182 
183       const __m512 vi10x0123456789ABCDEF = _mm512_loadu_ps(i10);
184       const __m512 vi10xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i10 + 16);
185       i10 += 32;
186 
187       const __m512 vk10x0123456789ABCDEF = _mm512_load_ps(w + 352);
188       const __m512 vk10xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 368);
189       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi10x0123456789ABCDEF, vk10x0123456789ABCDEF, vacc0123456789ABCDEFp0);
190       vaccGHIJKLMNOPQRSTUVp0 = _mm512_fmadd_ps(vi10xGHIJKLMNOPQRSTUV, vk10xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp0);
191 
192       const __m512 vi11x0123456789ABCDEF = _mm512_loadu_ps(i11);
193       const __m512 vi11xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i11 + 16);
194       i11 += 32;
195 
196       const __m512 vk11x0123456789ABCDEF = _mm512_load_ps(w + 384);
197       const __m512 vk11xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 400);
198       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi11x0123456789ABCDEF, vk11x0123456789ABCDEF, vacc0123456789ABCDEFp1);
199       vaccGHIJKLMNOPQRSTUVp1 = _mm512_fmadd_ps(vi11xGHIJKLMNOPQRSTUV, vk11xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp1);
200 
201       const __m512 vi12x0123456789ABCDEF = _mm512_loadu_ps(i12);
202       const __m512 vi12xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i12 + 16);
203       i12 += 32;
204 
205       const __m512 vk12x0123456789ABCDEF = _mm512_load_ps(w + 416);
206       const __m512 vk12xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 432);
207       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi12x0123456789ABCDEF, vk12x0123456789ABCDEF, vacc0123456789ABCDEFp0);
208       vaccGHIJKLMNOPQRSTUVp0 = _mm512_fmadd_ps(vi12xGHIJKLMNOPQRSTUV, vk12xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp0);
209 
210       const __m512 vi13x0123456789ABCDEF = _mm512_loadu_ps(i13);
211       const __m512 vi13xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i13 + 16);
212       i13 += 32;
213 
214       const __m512 vk13x0123456789ABCDEF = _mm512_load_ps(w + 448);
215       const __m512 vk13xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 464);
216       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi13x0123456789ABCDEF, vk13x0123456789ABCDEF, vacc0123456789ABCDEFp1);
217       vaccGHIJKLMNOPQRSTUVp1 = _mm512_fmadd_ps(vi13xGHIJKLMNOPQRSTUV, vk13xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp1);
218 
219       const __m512 vi14x0123456789ABCDEF = _mm512_loadu_ps(i14);
220       const __m512 vi14xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i14 + 16);
221       i14 += 32;
222 
223       const __m512 vk14x0123456789ABCDEF = _mm512_load_ps(w + 480);
224       const __m512 vk14xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 496);
225       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi14x0123456789ABCDEF, vk14x0123456789ABCDEF, vacc0123456789ABCDEFp0);
226       vaccGHIJKLMNOPQRSTUVp0 = _mm512_fmadd_ps(vi14xGHIJKLMNOPQRSTUV, vk14xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp0);
227 
228       const __m512 vi15x0123456789ABCDEF = _mm512_loadu_ps(i15);
229       const __m512 vi15xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i15 + 16);
230       i15 += 32;
231 
232       const __m512 vk15x0123456789ABCDEF = _mm512_load_ps(w + 512);
233       const __m512 vk15xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 528);
234       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi15x0123456789ABCDEF, vk15x0123456789ABCDEF, vacc0123456789ABCDEFp1);
235       vaccGHIJKLMNOPQRSTUVp1 = _mm512_fmadd_ps(vi15xGHIJKLMNOPQRSTUV, vk15xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp1);
236 
237       const __m512 vi16x0123456789ABCDEF = _mm512_loadu_ps(i16);
238       const __m512 vi16xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i16 + 16);
239       i16 += 32;
240 
241       const __m512 vk16x0123456789ABCDEF = _mm512_load_ps(w + 544);
242       const __m512 vk16xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 560);
243       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi16x0123456789ABCDEF, vk16x0123456789ABCDEF, vacc0123456789ABCDEFp0);
244       vaccGHIJKLMNOPQRSTUVp0 = _mm512_fmadd_ps(vi16xGHIJKLMNOPQRSTUV, vk16xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp0);
245 
246       const __m512 vi17x0123456789ABCDEF = _mm512_loadu_ps(i17);
247       const __m512 vi17xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i17 + 16);
248       i17 += 32;
249 
250       const __m512 vk17x0123456789ABCDEF = _mm512_load_ps(w + 576);
251       const __m512 vk17xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 592);
252       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi17x0123456789ABCDEF, vk17x0123456789ABCDEF, vacc0123456789ABCDEFp1);
253       vaccGHIJKLMNOPQRSTUVp1 = _mm512_fmadd_ps(vi17xGHIJKLMNOPQRSTUV, vk17xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp1);
254 
255       const __m512 vi18x0123456789ABCDEF = _mm512_loadu_ps(i18);
256       const __m512 vi18xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i18 + 16);
257       i18 += 32;
258 
259       const __m512 vk18x0123456789ABCDEF = _mm512_load_ps(w + 608);
260       const __m512 vk18xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 624);
261       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi18x0123456789ABCDEF, vk18x0123456789ABCDEF, vacc0123456789ABCDEFp0);
262       vaccGHIJKLMNOPQRSTUVp0 = _mm512_fmadd_ps(vi18xGHIJKLMNOPQRSTUV, vk18xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp0);
263 
264       const __m512 vi19x0123456789ABCDEF = _mm512_loadu_ps(i19);
265       const __m512 vi19xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i19 + 16);
266       i19 += 32;
267 
268       const __m512 vk19x0123456789ABCDEF = _mm512_load_ps(w + 640);
269       const __m512 vk19xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 656);
270       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi19x0123456789ABCDEF, vk19x0123456789ABCDEF, vacc0123456789ABCDEFp1);
271       vaccGHIJKLMNOPQRSTUVp1 = _mm512_fmadd_ps(vi19xGHIJKLMNOPQRSTUV, vk19xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp1);
272 
273       const __m512 vi20x0123456789ABCDEF = _mm512_loadu_ps(i20);
274       const __m512 vi20xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i20 + 16);
275       i20 += 32;
276 
277       const __m512 vk20x0123456789ABCDEF = _mm512_load_ps(w + 672);
278       const __m512 vk20xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 688);
279       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi20x0123456789ABCDEF, vk20x0123456789ABCDEF, vacc0123456789ABCDEFp0);
280       vaccGHIJKLMNOPQRSTUVp0 = _mm512_fmadd_ps(vi20xGHIJKLMNOPQRSTUV, vk20xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp0);
281 
282       const __m512 vi21x0123456789ABCDEF = _mm512_loadu_ps(i21);
283       const __m512 vi21xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i21 + 16);
284       i21 += 32;
285 
286       const __m512 vk21x0123456789ABCDEF = _mm512_load_ps(w + 704);
287       const __m512 vk21xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 720);
288       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi21x0123456789ABCDEF, vk21x0123456789ABCDEF, vacc0123456789ABCDEFp1);
289       vaccGHIJKLMNOPQRSTUVp1 = _mm512_fmadd_ps(vi21xGHIJKLMNOPQRSTUV, vk21xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp1);
290 
291       const __m512 vi22x0123456789ABCDEF = _mm512_loadu_ps(i22);
292       const __m512 vi22xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i22 + 16);
293       i22 += 32;
294 
295       const __m512 vk22x0123456789ABCDEF = _mm512_load_ps(w + 736);
296       const __m512 vk22xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 752);
297       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi22x0123456789ABCDEF, vk22x0123456789ABCDEF, vacc0123456789ABCDEFp0);
298       vaccGHIJKLMNOPQRSTUVp0 = _mm512_fmadd_ps(vi22xGHIJKLMNOPQRSTUV, vk22xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp0);
299 
300       const __m512 vi23x0123456789ABCDEF = _mm512_loadu_ps(i23);
301       const __m512 vi23xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i23 + 16);
302       i23 += 32;
303 
304       const __m512 vk23x0123456789ABCDEF = _mm512_load_ps(w + 768);
305       const __m512 vk23xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 784);
306       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi23x0123456789ABCDEF, vk23x0123456789ABCDEF, vacc0123456789ABCDEFp1);
307       vaccGHIJKLMNOPQRSTUVp1 = _mm512_fmadd_ps(vi23xGHIJKLMNOPQRSTUV, vk23xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp1);
308 
309       const __m512 vi24x0123456789ABCDEF = _mm512_loadu_ps(i24);
310       const __m512 vi24xGHIJKLMNOPQRSTUV = _mm512_loadu_ps(i24 + 16);
311       i24 += 32;
312 
313       const __m512 vk24x0123456789ABCDEF = _mm512_load_ps(w + 800);
314       const __m512 vk24xGHIJKLMNOPQRSTUV = _mm512_load_ps(w + 816);
315       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi24x0123456789ABCDEF, vk24x0123456789ABCDEF, vacc0123456789ABCDEFp0);
316       vaccGHIJKLMNOPQRSTUVp0 = _mm512_fmadd_ps(vi24xGHIJKLMNOPQRSTUV, vk24xGHIJKLMNOPQRSTUV, vaccGHIJKLMNOPQRSTUVp0);
317 
318       w += 832;
319 
320       // Add up all accumulators to vacc0123456789ABCDEFGHIJKLMNOPQRSTUVp0
321       vacc0123456789ABCDEFp0 = _mm512_add_ps(vacc0123456789ABCDEFp0, vacc0123456789ABCDEFp1);
322       vaccGHIJKLMNOPQRSTUVp0 = _mm512_add_ps(vaccGHIJKLMNOPQRSTUVp0, vaccGHIJKLMNOPQRSTUVp1);
323 
324       __m512 vacc0123456789ABCDEF = _mm512_max_ps(vacc0123456789ABCDEFp0, vmin);
325       __m512 vaccGHIJKLMNOPQRSTUV = _mm512_max_ps(vaccGHIJKLMNOPQRSTUVp0, vmin);
326       vacc0123456789ABCDEF = _mm512_min_ps(vacc0123456789ABCDEF, vmax);
327       vaccGHIJKLMNOPQRSTUV = _mm512_min_ps(vaccGHIJKLMNOPQRSTUV, vmax);
328 
329       _mm512_storeu_ps(output, vacc0123456789ABCDEF);
330       _mm512_storeu_ps(output + 16, vaccGHIJKLMNOPQRSTUV);
331       output += 32;
332     }
333     for (; c >= 16; c -= 16) {
334       __m512 vacc0123456789ABCDEFp0 = _mm512_load_ps(w);
335 
336       const __m512 vi0x0123456789ABCDEF = _mm512_loadu_ps(i0);
337       i0 += 16;
338 
339       const __m512 vk0x0123456789ABCDEF = _mm512_load_ps(w + 32);
340       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF, vacc0123456789ABCDEFp0);
341 
342       const __m512 vi1x0123456789ABCDEF = _mm512_loadu_ps(i1);
343       i1 += 16;
344 
345       const __m512 vk1x0123456789ABCDEF = _mm512_load_ps(w + 64);
346       __m512 vacc0123456789ABCDEFp1 = _mm512_mul_ps(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF);
347 
348       const __m512 vi2x0123456789ABCDEF = _mm512_loadu_ps(i2);
349       i2 += 16;
350 
351       const __m512 vk2x0123456789ABCDEF = _mm512_load_ps(w + 96);
352       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF, vacc0123456789ABCDEFp0);
353 
354       const __m512 vi3x0123456789ABCDEF = _mm512_loadu_ps(i3);
355       i3 += 16;
356 
357       const __m512 vk3x0123456789ABCDEF = _mm512_load_ps(w + 128);
358       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF, vacc0123456789ABCDEFp1);
359 
360       const __m512 vi4x0123456789ABCDEF = _mm512_loadu_ps(i4);
361       i4 += 16;
362 
363       const __m512 vk4x0123456789ABCDEF = _mm512_load_ps(w + 160);
364       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF, vacc0123456789ABCDEFp0);
365 
366       const __m512 vi5x0123456789ABCDEF = _mm512_loadu_ps(i5);
367       i5 += 16;
368 
369       const __m512 vk5x0123456789ABCDEF = _mm512_load_ps(w + 192);
370       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF, vacc0123456789ABCDEFp1);
371 
372       const __m512 vi6x0123456789ABCDEF = _mm512_loadu_ps(i6);
373       i6 += 16;
374 
375       const __m512 vk6x0123456789ABCDEF = _mm512_load_ps(w + 224);
376       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF, vacc0123456789ABCDEFp0);
377 
378       const __m512 vi7x0123456789ABCDEF = _mm512_loadu_ps(i7);
379       i7 += 16;
380 
381       const __m512 vk7x0123456789ABCDEF = _mm512_load_ps(w + 256);
382       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF, vacc0123456789ABCDEFp1);
383 
384       const __m512 vi8x0123456789ABCDEF = _mm512_loadu_ps(i8);
385       i8 += 16;
386 
387       const __m512 vk8x0123456789ABCDEF = _mm512_load_ps(w + 288);
388       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF, vacc0123456789ABCDEFp0);
389 
390       const __m512 vi9x0123456789ABCDEF = _mm512_loadu_ps(i9);
391       i9 += 16;
392 
393       const __m512 vk9x0123456789ABCDEF = _mm512_load_ps(w + 320);
394       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi9x0123456789ABCDEF, vk9x0123456789ABCDEF, vacc0123456789ABCDEFp1);
395 
396       const __m512 vi10x0123456789ABCDEF = _mm512_loadu_ps(i10);
397       i10 += 16;
398 
399       const __m512 vk10x0123456789ABCDEF = _mm512_load_ps(w + 352);
400       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi10x0123456789ABCDEF, vk10x0123456789ABCDEF, vacc0123456789ABCDEFp0);
401 
402       const __m512 vi11x0123456789ABCDEF = _mm512_loadu_ps(i11);
403       i11 += 16;
404 
405       const __m512 vk11x0123456789ABCDEF = _mm512_load_ps(w + 384);
406       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi11x0123456789ABCDEF, vk11x0123456789ABCDEF, vacc0123456789ABCDEFp1);
407 
408       const __m512 vi12x0123456789ABCDEF = _mm512_loadu_ps(i12);
409       i12 += 16;
410 
411       const __m512 vk12x0123456789ABCDEF = _mm512_load_ps(w + 416);
412       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi12x0123456789ABCDEF, vk12x0123456789ABCDEF, vacc0123456789ABCDEFp0);
413 
414       const __m512 vi13x0123456789ABCDEF = _mm512_loadu_ps(i13);
415       i13 += 16;
416 
417       const __m512 vk13x0123456789ABCDEF = _mm512_load_ps(w + 448);
418       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi13x0123456789ABCDEF, vk13x0123456789ABCDEF, vacc0123456789ABCDEFp1);
419 
420       const __m512 vi14x0123456789ABCDEF = _mm512_loadu_ps(i14);
421       i14 += 16;
422 
423       const __m512 vk14x0123456789ABCDEF = _mm512_load_ps(w + 480);
424       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi14x0123456789ABCDEF, vk14x0123456789ABCDEF, vacc0123456789ABCDEFp0);
425 
426       const __m512 vi15x0123456789ABCDEF = _mm512_loadu_ps(i15);
427       i15 += 16;
428 
429       const __m512 vk15x0123456789ABCDEF = _mm512_load_ps(w + 512);
430       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi15x0123456789ABCDEF, vk15x0123456789ABCDEF, vacc0123456789ABCDEFp1);
431 
432       const __m512 vi16x0123456789ABCDEF = _mm512_loadu_ps(i16);
433       i16 += 16;
434 
435       const __m512 vk16x0123456789ABCDEF = _mm512_load_ps(w + 544);
436       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi16x0123456789ABCDEF, vk16x0123456789ABCDEF, vacc0123456789ABCDEFp0);
437 
438       const __m512 vi17x0123456789ABCDEF = _mm512_loadu_ps(i17);
439       i17 += 16;
440 
441       const __m512 vk17x0123456789ABCDEF = _mm512_load_ps(w + 576);
442       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi17x0123456789ABCDEF, vk17x0123456789ABCDEF, vacc0123456789ABCDEFp1);
443 
444       const __m512 vi18x0123456789ABCDEF = _mm512_loadu_ps(i18);
445       i18 += 16;
446 
447       const __m512 vk18x0123456789ABCDEF = _mm512_load_ps(w + 608);
448       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi18x0123456789ABCDEF, vk18x0123456789ABCDEF, vacc0123456789ABCDEFp0);
449 
450       const __m512 vi19x0123456789ABCDEF = _mm512_loadu_ps(i19);
451       i19 += 16;
452 
453       const __m512 vk19x0123456789ABCDEF = _mm512_load_ps(w + 640);
454       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi19x0123456789ABCDEF, vk19x0123456789ABCDEF, vacc0123456789ABCDEFp1);
455 
456       const __m512 vi20x0123456789ABCDEF = _mm512_loadu_ps(i20);
457       i20 += 16;
458 
459       const __m512 vk20x0123456789ABCDEF = _mm512_load_ps(w + 672);
460       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi20x0123456789ABCDEF, vk20x0123456789ABCDEF, vacc0123456789ABCDEFp0);
461 
462       const __m512 vi21x0123456789ABCDEF = _mm512_loadu_ps(i21);
463       i21 += 16;
464 
465       const __m512 vk21x0123456789ABCDEF = _mm512_load_ps(w + 704);
466       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi21x0123456789ABCDEF, vk21x0123456789ABCDEF, vacc0123456789ABCDEFp1);
467 
468       const __m512 vi22x0123456789ABCDEF = _mm512_loadu_ps(i22);
469       i22 += 16;
470 
471       const __m512 vk22x0123456789ABCDEF = _mm512_load_ps(w + 736);
472       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi22x0123456789ABCDEF, vk22x0123456789ABCDEF, vacc0123456789ABCDEFp0);
473 
474       const __m512 vi23x0123456789ABCDEF = _mm512_loadu_ps(i23);
475       i23 += 16;
476 
477       const __m512 vk23x0123456789ABCDEF = _mm512_load_ps(w + 768);
478       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi23x0123456789ABCDEF, vk23x0123456789ABCDEF, vacc0123456789ABCDEFp1);
479 
480       const __m512 vi24x0123456789ABCDEF = _mm512_loadu_ps(i24);
481       i24 += 16;
482 
483       const __m512 vk24x0123456789ABCDEF = _mm512_load_ps(w + 800);
484       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi24x0123456789ABCDEF, vk24x0123456789ABCDEF, vacc0123456789ABCDEFp0);
485 
486       w += 16;
487 
488       // Add up all accumulators to vacc0123456789ABCDEFp0
489       vacc0123456789ABCDEFp0 = _mm512_add_ps(vacc0123456789ABCDEFp0, vacc0123456789ABCDEFp1);
490 
491       __m512 vacc0123456789ABCDEF = _mm512_max_ps(vacc0123456789ABCDEFp0, vmin);
492       vacc0123456789ABCDEF = _mm512_min_ps(vacc0123456789ABCDEF, vmax);
493 
494       _mm512_storeu_ps(output, vacc0123456789ABCDEF);
495       output += 16;
496     }
497     if XNN_UNLIKELY(c != 0) {
498       assert(c >= 1);
499       assert(c <= 16);
500       // Prepare mask for valid 32-bit elements (depends on nc).
501       const __mmask16 vmask = _cvtu32_mask16((uint16_t) ((uint32_t) (UINT32_C(1) << c) - UINT32_C(1)));
502 
503       __m512 vacc0123456789ABCDEFp0 = _mm512_maskz_loadu_ps(vmask, w);
504 
505       const __m512 vi0x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i0);
506       const __m512 vk0x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 32);
507       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi0x0123456789ABCDEF, vk0x0123456789ABCDEF, vacc0123456789ABCDEFp0);
508 
509       const __m512 vi1x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i1);
510       const __m512 vk1x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 64);
511       __m512 vacc0123456789ABCDEFp1 = _mm512_mul_ps(vi1x0123456789ABCDEF, vk1x0123456789ABCDEF);
512 
513       const __m512 vi2x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i2);
514       const __m512 vk2x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 96);
515       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi2x0123456789ABCDEF, vk2x0123456789ABCDEF, vacc0123456789ABCDEFp0);
516 
517       const __m512 vi3x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i3);
518       const __m512 vk3x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 128);
519       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi3x0123456789ABCDEF, vk3x0123456789ABCDEF, vacc0123456789ABCDEFp1);
520 
521       const __m512 vi4x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i4);
522       const __m512 vk4x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 160);
523       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi4x0123456789ABCDEF, vk4x0123456789ABCDEF, vacc0123456789ABCDEFp0);
524 
525       const __m512 vi5x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i5);
526       const __m512 vk5x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 192);
527       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi5x0123456789ABCDEF, vk5x0123456789ABCDEF, vacc0123456789ABCDEFp1);
528 
529       const __m512 vi6x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i6);
530       const __m512 vk6x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 224);
531       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi6x0123456789ABCDEF, vk6x0123456789ABCDEF, vacc0123456789ABCDEFp0);
532 
533       const __m512 vi7x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i7);
534       const __m512 vk7x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 256);
535       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi7x0123456789ABCDEF, vk7x0123456789ABCDEF, vacc0123456789ABCDEFp1);
536 
537       const __m512 vi8x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i8);
538       const __m512 vk8x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 288);
539       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi8x0123456789ABCDEF, vk8x0123456789ABCDEF, vacc0123456789ABCDEFp0);
540 
541       const __m512 vi9x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i9);
542       const __m512 vk9x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 320);
543       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi9x0123456789ABCDEF, vk9x0123456789ABCDEF, vacc0123456789ABCDEFp1);
544 
545       const __m512 vi10x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i10);
546       const __m512 vk10x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 352);
547       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi10x0123456789ABCDEF, vk10x0123456789ABCDEF, vacc0123456789ABCDEFp0);
548 
549       const __m512 vi11x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i11);
550       const __m512 vk11x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 384);
551       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi11x0123456789ABCDEF, vk11x0123456789ABCDEF, vacc0123456789ABCDEFp1);
552 
553       const __m512 vi12x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i12);
554       const __m512 vk12x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 416);
555       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi12x0123456789ABCDEF, vk12x0123456789ABCDEF, vacc0123456789ABCDEFp0);
556 
557       const __m512 vi13x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i13);
558       const __m512 vk13x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 448);
559       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi13x0123456789ABCDEF, vk13x0123456789ABCDEF, vacc0123456789ABCDEFp1);
560 
561       const __m512 vi14x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i14);
562       const __m512 vk14x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 480);
563       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi14x0123456789ABCDEF, vk14x0123456789ABCDEF, vacc0123456789ABCDEFp0);
564 
565       const __m512 vi15x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i15);
566       const __m512 vk15x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 512);
567       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi15x0123456789ABCDEF, vk15x0123456789ABCDEF, vacc0123456789ABCDEFp1);
568 
569       const __m512 vi16x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i16);
570       const __m512 vk16x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 544);
571       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi16x0123456789ABCDEF, vk16x0123456789ABCDEF, vacc0123456789ABCDEFp0);
572 
573       const __m512 vi17x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i17);
574       const __m512 vk17x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 576);
575       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi17x0123456789ABCDEF, vk17x0123456789ABCDEF, vacc0123456789ABCDEFp1);
576 
577       const __m512 vi18x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i18);
578       const __m512 vk18x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 608);
579       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi18x0123456789ABCDEF, vk18x0123456789ABCDEF, vacc0123456789ABCDEFp0);
580 
581       const __m512 vi19x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i19);
582       const __m512 vk19x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 640);
583       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi19x0123456789ABCDEF, vk19x0123456789ABCDEF, vacc0123456789ABCDEFp1);
584 
585       const __m512 vi20x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i20);
586       const __m512 vk20x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 672);
587       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi20x0123456789ABCDEF, vk20x0123456789ABCDEF, vacc0123456789ABCDEFp0);
588 
589       const __m512 vi21x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i21);
590       const __m512 vk21x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 704);
591       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi21x0123456789ABCDEF, vk21x0123456789ABCDEF, vacc0123456789ABCDEFp1);
592 
593       const __m512 vi22x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i22);
594       const __m512 vk22x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 736);
595       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi22x0123456789ABCDEF, vk22x0123456789ABCDEF, vacc0123456789ABCDEFp0);
596 
597       const __m512 vi23x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i23);
598       const __m512 vk23x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 768);
599       vacc0123456789ABCDEFp1 = _mm512_fmadd_ps(vi23x0123456789ABCDEF, vk23x0123456789ABCDEF, vacc0123456789ABCDEFp1);
600 
601       const __m512 vi24x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, i24);
602       const __m512 vk24x0123456789ABCDEF = _mm512_maskz_loadu_ps(vmask, w + 800);
603       vacc0123456789ABCDEFp0 = _mm512_fmadd_ps(vi24x0123456789ABCDEF, vk24x0123456789ABCDEF, vacc0123456789ABCDEFp0);
604 
605       // Add up all accumulators to vacc0123456789ABCDEFp0
606       vacc0123456789ABCDEFp0 = _mm512_add_ps(vacc0123456789ABCDEFp0, vacc0123456789ABCDEFp1);
607 
608       __m512 vacc0123456789ABCDEF = _mm512_max_ps(vacc0123456789ABCDEFp0, vmin);
609       vacc0123456789ABCDEF = _mm512_min_ps(vacc0123456789ABCDEF, vmax);
610 
611       _mm512_mask_storeu_ps(output, vmask, vacc0123456789ABCDEF);
612       output += c;
613     }
614 
615     output = (float*) ((uintptr_t) output + output_increment);
616   } while (--output_width != 0);
617 }
618