// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>

#include <emmintrin.h>

#include <xnnpack/avgpool.h>

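// 9p8x QU8 average-pooling microkernel for SSE2: the first pass accumulates 9
// kernel elements into a 32-bit scratch buffer, each intermediate pass
// accumulates 8 more, and the final pass consumes the remaining 1-8 elements
// and requantizes the accumulators to uint8. Channels are processed 8 at a
// time ("c8").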
void xnn_qu8_avgpool_minmax_ukernel_9p8x__sse2_c8(
    size_t output_pixels,
    size_t kernel_elements,
    size_t channels,
    const uint8_t** input,
    size_t input_offset,
    const uint8_t* zero,
    int32_t* buffer,
    uint8_t* output,
    size_t input_increment,
    size_t output_increment,
    const union xnn_qu8_avgpool_params params[restrict XNN_MIN_ELEMENTS(1)]) XNN_DISABLE_TSAN
{
  assert(output_pixels != 0);
  assert(kernel_elements > 9);
  assert(channels != 0);

  const __m128i vbias = _mm_load_si128((const __m128i*) &params->sse2.bias);
  const __m128i vzero = _mm_setzero_si128();
  const __m128i vmultiplier = _mm_load_si128((const __m128i*) params->sse2.multiplier);
  const __m128i vrounding = _mm_load_si128((const __m128i*) params->sse2.rounding);
  const __m128i vright_shift = _mm_loadl_epi64((const __m128i*) params->sse2.right_shift);

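  // Each iteration of this loop produces one output pixel.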
  do {
    {
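      // First pass: sum the first 9 kernel elements and store the 32-bit
      // partial sums, seeded with the bias, in the scratch buffer.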
      const uint8_t* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
      }
      const uint8_t* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
      }
      const uint8_t* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
      }
      const uint8_t* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
      }
      const uint8_t* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
      }
      const uint8_t* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
      }
      const uint8_t* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
      }
      const uint8_t* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
      }
      const uint8_t* i8 = *input++;
      assert(i8 != NULL);
      if XNN_UNPREDICTABLE(i8 != zero) {
        i8 = (const uint8_t*) ((uintptr_t) i8 + input_offset);
      }

      int32_t* b = buffer;
      for (size_t c = 0; c < channels; c += 8) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7); i7 += 8;
        const __m128i vi8 = _mm_loadl_epi64((const __m128i*) i8); i8 += 8;

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);
        const __m128i vxi8 = _mm_unpacklo_epi8(vi8, vzero);

        const __m128i vsum018 = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1), vxi8);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);
        const __m128i vsum01678 = _mm_add_epi16(vsum018, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum2345, vsum01678);

        const __m128i vacc_lo = _mm_add_epi32(vbias, _mm_unpacklo_epi16(vsum, vzero));
        const __m128i vacc_hi = _mm_add_epi32(vbias, _mm_unpackhi_epi16(vsum, vzero));

        _mm_store_si128((__m128i*) b, vacc_lo);
        _mm_store_si128((__m128i*) b + 1, vacc_hi);
        b += 8;
      }
    }

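    // Intermediate passes: accumulate 8 more kernel elements per pass into
    // the existing 32-bit partial sums until at most 8 elements remain.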
    size_t k = kernel_elements;
    for (k -= 9; k > 8; k -= 8) {
      const uint8_t* i0 = *input++;
      assert(i0 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
      }
      const uint8_t* i1 = *input++;
      assert(i1 != NULL);
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
      }
      const uint8_t* i2 = *input++;
      assert(i2 != NULL);
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
      }
      const uint8_t* i3 = *input++;
      assert(i3 != NULL);
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
      }
      const uint8_t* i4 = *input++;
      assert(i4 != NULL);
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
      }
      const uint8_t* i5 = *input++;
      assert(i5 != NULL);
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
      }
      const uint8_t* i6 = *input++;
      assert(i6 != NULL);
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
      }
      const uint8_t* i7 = *input++;
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
      }

      int32_t* b = buffer;
      for (size_t c = 0; c < channels; c += 8) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7); i7 += 8;
        __m128i vacc_lo = _mm_load_si128((const __m128i*) b);
        __m128i vacc_hi = _mm_load_si128((const __m128i*) b + 1);

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

        _mm_store_si128((__m128i*) b, vacc_lo);
        _mm_store_si128((__m128i*) b + 1, vacc_hi);
        b += 8;
      }
    }

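    // Final pass: consume the last 1-8 kernel elements, requantize, and write
    // the uint8 output row.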
    {
      const uint8_t* i0 = input[0];
      assert(i0 != NULL);
      const uint8_t* i1 = input[1];
      const uint8_t* i2 = input[2];
      const uint8_t* i3 = input[3];
      const uint8_t* i4 = input[4];
      const uint8_t* i5 = input[5];
      const uint8_t* i6 = input[6];
      const uint8_t* i7 = input[7];
      input = (const uint8_t**) ((uintptr_t) input + input_increment);
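      // Fewer than 8 elements may remain; point the unused taps at the zero
      // vector so they contribute nothing to the sums.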
      if (k < 2) {
        i1 = zero;
      }
      assert(i1 != NULL);
      if (k <= 2) {
        i2 = zero;
      }
      assert(i2 != NULL);
      if (k < 4) {
        i3 = zero;
      }
      assert(i3 != NULL);
      if (k <= 4) {
        i4 = zero;
      }
      assert(i4 != NULL);
      if (k < 6) {
        i5 = zero;
      }
      assert(i5 != NULL);
      if (k <= 6) {
        i6 = zero;
      }
      assert(i6 != NULL);
      if (k < 8) {
        i7 = zero;
      }
      assert(i7 != NULL);
      if XNN_UNPREDICTABLE(i0 != zero) {
        i0 = (const uint8_t*) ((uintptr_t) i0 + input_offset);
      }
      if XNN_UNPREDICTABLE(i1 != zero) {
        i1 = (const uint8_t*) ((uintptr_t) i1 + input_offset);
      }
      if XNN_UNPREDICTABLE(i2 != zero) {
        i2 = (const uint8_t*) ((uintptr_t) i2 + input_offset);
      }
      if XNN_UNPREDICTABLE(i3 != zero) {
        i3 = (const uint8_t*) ((uintptr_t) i3 + input_offset);
      }
      if XNN_UNPREDICTABLE(i4 != zero) {
        i4 = (const uint8_t*) ((uintptr_t) i4 + input_offset);
      }
      if XNN_UNPREDICTABLE(i5 != zero) {
        i5 = (const uint8_t*) ((uintptr_t) i5 + input_offset);
      }
      if XNN_UNPREDICTABLE(i6 != zero) {
        i6 = (const uint8_t*) ((uintptr_t) i6 + input_offset);
      }
      if XNN_UNPREDICTABLE(i7 != zero) {
        i7 = (const uint8_t*) ((uintptr_t) i7 + input_offset);
      }

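      // Main channel loop: 8 channels per iteration, with full requantization
      // and an 8-byte store.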
      size_t c = channels;
      int32_t* b = buffer;
      while (c >= 8) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0); i0 += 8;
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1); i1 += 8;
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2); i2 += 8;
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3); i3 += 8;
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4); i4 += 8;
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5); i5 += 8;
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6); i6 += 8;
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7); i7 += 8;
        __m128i vacc_lo = _mm_load_si128((const __m128i*) b);
        __m128i vacc_hi = _mm_load_si128((const __m128i*) b + 1);
        b += 8;

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

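        // Requantize: take |acc|, multiply by the 32-bit fixed-point
        // multiplier using 32x32->64-bit products (_mm_mul_epu32 only
        // multiplies the even lanes, so the odd lanes are shuffled into even
        // position first), add the rounding constant, shift right, then
        // restore the sign via the XOR/subtract trick.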
        const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
        const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);

        const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
        const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);

        const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
        const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));

        const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
        const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);

        const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
        const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);

        const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
        const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
        const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
        const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);

        const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
            _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
        const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
            _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));

        const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
        const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));

        const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
        const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);

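        // Pack to int16 with saturation, add the output zero point, pack to
        // uint8, and clamp to [output_min, output_max].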
        __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
        vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) &params->sse2.output_zero_point));
        vout = _mm_packus_epi16(vout, vout);
        vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_max));
        vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_min));

        _mm_storel_epi64((__m128i*) output, vout);
        output += 8;

        c -= 8;
      }
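      // Remainder: the last 1-7 channels follow the same path, but the final
      // store is split into 4-, 2-, and 1-byte pieces.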
      if (c != 0) {
        const __m128i vi0 = _mm_loadl_epi64((const __m128i*) i0);
        const __m128i vi1 = _mm_loadl_epi64((const __m128i*) i1);
        const __m128i vi2 = _mm_loadl_epi64((const __m128i*) i2);
        const __m128i vi3 = _mm_loadl_epi64((const __m128i*) i3);
        const __m128i vi4 = _mm_loadl_epi64((const __m128i*) i4);
        const __m128i vi5 = _mm_loadl_epi64((const __m128i*) i5);
        const __m128i vi6 = _mm_loadl_epi64((const __m128i*) i6);
        const __m128i vi7 = _mm_loadl_epi64((const __m128i*) i7);
        __m128i vacc_lo = _mm_load_si128((const __m128i*) b);
        __m128i vacc_hi = _mm_load_si128((const __m128i*) b + 1);

        const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
        const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
        const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
        const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
        const __m128i vxi4 = _mm_unpacklo_epi8(vi4, vzero);
        const __m128i vxi5 = _mm_unpacklo_epi8(vi5, vzero);
        const __m128i vxi6 = _mm_unpacklo_epi8(vi6, vzero);
        const __m128i vxi7 = _mm_unpacklo_epi8(vi7, vzero);

        const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);
        const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);
        const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);
        const __m128i vsum67 = _mm_add_epi16(vxi6, vxi7);

        const __m128i vsum0123 = _mm_add_epi16(vsum01, vsum23);
        const __m128i vsum4567 = _mm_add_epi16(vsum45, vsum67);
        const __m128i vsum = _mm_add_epi16(vsum0123, vsum4567);

        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));

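        // Requantize exactly as in the main channel loop above.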
        const __m128i vneg_mask_lo = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_lo);
        const __m128i vneg_mask_hi = _mm_cmpgt_epi32(_mm_setzero_si128(), vacc_hi);

        const __m128i vabs_lo0123 = _mm_sub_epi32(_mm_xor_si128(vacc_lo, vneg_mask_lo), vneg_mask_lo);
        const __m128i vabs_hi0123 = _mm_sub_epi32(_mm_xor_si128(vacc_hi, vneg_mask_hi), vneg_mask_hi);

        const __m128i vabs_lo1032 = _mm_shuffle_epi32(vabs_lo0123, _MM_SHUFFLE(2, 3, 0, 1));
        const __m128i vabs_hi1032 = _mm_shuffle_epi32(vabs_hi0123, _MM_SHUFFLE(2, 3, 0, 1));

        const __m128i vabsmul_lo02 = _mm_mul_epu32(vabs_lo0123, vmultiplier);
        const __m128i vabsmul_hi02 = _mm_mul_epu32(vabs_hi0123, vmultiplier);

        const __m128i vabsmul_lo13 = _mm_mul_epu32(vabs_lo1032, vmultiplier);
        const __m128i vabsmul_hi13 = _mm_mul_epu32(vabs_hi1032, vmultiplier);

        const __m128i vabs_scaled_lo02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo02, vrounding), vright_shift);
        const __m128i vabs_scaled_lo13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_lo13, vrounding), vright_shift);
        const __m128i vabs_scaled_hi02 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi02, vrounding), vright_shift);
        const __m128i vabs_scaled_hi13 = _mm_srl_epi64(_mm_add_epi64(vabsmul_hi13, vrounding), vright_shift);

        const __m128i vabs_scaled_lo0213 = _mm_castps_si128(
            _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_lo02), _mm_castsi128_ps(vabs_scaled_lo13), _MM_SHUFFLE(2, 0, 2, 0)));
        const __m128i vabs_scaled_hi0213 = _mm_castps_si128(
            _mm_shuffle_ps(_mm_castsi128_ps(vabs_scaled_hi02), _mm_castsi128_ps(vabs_scaled_hi13), _MM_SHUFFLE(2, 0, 2, 0)));

        const __m128i vabs_scaled_lo = _mm_shuffle_epi32(vabs_scaled_lo0213, _MM_SHUFFLE(3, 1, 2, 0));
        const __m128i vabs_scaled_hi = _mm_shuffle_epi32(vabs_scaled_hi0213, _MM_SHUFFLE(3, 1, 2, 0));

        const __m128i vscaled_lo = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_lo, vneg_mask_lo), vneg_mask_lo);
        const __m128i vscaled_hi = _mm_sub_epi32(_mm_xor_si128(vabs_scaled_hi, vneg_mask_hi), vneg_mask_hi);

        __m128i vout = _mm_packs_epi32(vscaled_lo, vscaled_hi);
        vout = _mm_adds_epi16(vout, _mm_load_si128((const __m128i*) &params->sse2.output_zero_point));
        vout = _mm_packus_epi16(vout, vout);
        vout = _mm_min_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_max));
        vout = _mm_max_epu8(vout, _mm_load_si128((const __m128i*) &params->sse2.output_min));

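        // Store the remaining channels 4, 2, and 1 bytes at a time, shifting
        // the consumed bytes out of vout after each partial store.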
        if (c & 4) {
          *((uint32_t*) output) = (uint32_t) _mm_cvtsi128_si32(vout);
          output += 4;
          vout = _mm_srli_epi64(vout, 32);
        }
        if (c & 2) {
          *((uint16_t*) output) = (uint16_t) _mm_extract_epi16(vout, 0);
          output += 2;
          vout = _mm_srli_epi32(vout, 16);
        }
        if (c & 1) {
          *((uint8_t*) output) = (uint8_t) _mm_cvtsi128_si32(vout);
          output += 1;
        }
      }
    }
    output = (uint8_t*) ((uintptr_t) output + output_increment);
  } while (--output_pixels != 0);
}