1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <assert.h>
7
8 #include <xmmintrin.h>
9
10 #include <xnnpack/fill.h>
11
12
xnn_x32_fill_ukernel__sse(size_t rows,size_t channels,uint32_t * output,size_t output_stride,const uint32_t * fill_value)13 void xnn_x32_fill_ukernel__sse(
14 size_t rows,
15 size_t channels,
16 uint32_t* output,
17 size_t output_stride,
18 const uint32_t* fill_value)
19 {
20 assert(rows != 0);
21 assert(channels != 0);
22 assert(channels % sizeof(uint32_t) == 0);
23 assert(fill_value != NULL);
24
25 const size_t output_increment = output_stride - channels;
26
27 const __m128 vfill = _mm_load1_ps((const float*) fill_value);
28 float* o = (float*) output;
29 do {
30 size_t c = channels;
31 for (; c >= 16 * sizeof(uint32_t); c -= 16 * sizeof(uint32_t)) {
32 _mm_storeu_ps(o, vfill);
33 _mm_storeu_ps(o + 4, vfill);
34 _mm_storeu_ps(o + 8, vfill);
35 _mm_storeu_ps(o + 12, vfill);
36 o += 16;
37 }
38 for (; c >= 4 * sizeof(uint32_t); c -= 4 * sizeof(uint32_t)) {
39 _mm_storeu_ps(o, vfill);
40 o += 4;
41 }
42 if XNN_UNLIKELY(c != 0) {
43 if XNN_LIKELY(c & (2 * sizeof(uint32_t))) {
44 _mm_storel_pi((__m64*) o, vfill);
45 o += 2;
46 }
47 if XNN_LIKELY(c & (1 * sizeof(uint32_t))) {
48 _mm_store_ss(o, vfill);
49 o += 1;
50 }
51 }
52 o = (void*) ((uintptr_t) o + output_increment);
53 } while (--rows != 0);
54 }
55