1 // Copyright 2019 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5
6 #include <assert.h>
7
8 #include <emmintrin.h>
9
10 #include <xnnpack/pad.h>
11
12
// Pads up to two rows of 32-bit elements with a constant value.
//
// For each processed row the output receives, in order: `l` bytes filled with
// `c`, `n` bytes copied from the input row, and `r` bytes filled with `c`.
//
//   m         - number of rows; must be at most 2. Any value other than 2 is
//               handled as a single row (the second-row pointers alias the
//               first row, so that row is simply written twice).
//   n         - bytes to copy from each input row; must be a multiple of 4.
//   l         - bytes of padding written before the copied data; multiple of 4.
//   r         - bytes of padding written after the copied data; multiple of 4.
//   c         - 32-bit fill value used for the padding.
//   x         - pointer to the first input row.
//   x_stride  - distance in bytes from the first input row to the second.
//   y         - pointer to the first output row.
//   y_stride  - distance in bytes from the first output row to the second.
//
// All loads and stores are unaligned-safe, and exactly `n` bytes are read from
// each input row (no reads past the end of the row).
void xnn_x32_pad_x2__sse2(
    size_t m,
    size_t n,
    size_t l,
    size_t r,
    uint32_t c,
    const void* x,
    size_t x_stride,
    void* y,
    size_t y_stride)
{
  assert(m <= 2);
  assert(l % 4 == 0);
  assert(n % 4 == 0);
  assert(r % 4 == 0);

  const uint32_t* x0 = x;
  uint32_t* y0 = y;

  // Strides are byte offsets, so the second-row pointers are derived through
  // integer arithmetic rather than element-wise pointer arithmetic.
  const uint32_t* x1 = (const uint32_t*) ((uintptr_t) x0 + x_stride);
  uint32_t* y1 = (uint32_t*) ((uintptr_t) y0 + y_stride);
  if (m != 2) {
    // Single row: alias row 1 onto row 0 so the code below stays branch-free.
    x1 = x0;
    y1 = y0;
  }
  const __m128i vc = _mm_set1_epi32((int) c);

  // Pre-pad input channels: 16 bytes at a time, then an 8- and a 4-byte tail.
  for (; l >= 16; l -= 16) {
    _mm_storeu_si128((__m128i*) y0, vc); y0 += 4;
    _mm_storeu_si128((__m128i*) y1, vc); y1 += 4;
  }
  if (l & 8) {
    _mm_storel_epi64((__m128i*) y0, vc); y0 += 2;
    _mm_storel_epi64((__m128i*) y1, vc); y1 += 2;
  }
  if (l & 4) {
    *y0++ = c;
    *y1++ = c;
  }

  // Copy input channels.
  for (; n >= 16; n -= 16) {
    const __m128i vt0 = _mm_loadu_si128((const __m128i*) x0); x0 += 4;
    const __m128i vt1 = _mm_loadu_si128((const __m128i*) x1); x1 += 4;
    _mm_storeu_si128((__m128i*) y0, vt0); y0 += 4;
    _mm_storeu_si128((__m128i*) y1, vt1); y1 += 4;
  }
  // Tail of 4, 8 or 12 bytes: load exactly as many bytes as are stored, so the
  // kernel never reads beyond the end of an input row.
  if (n & 8) {
    const __m128i vt0 = _mm_loadl_epi64((const __m128i*) x0); x0 += 2;
    const __m128i vt1 = _mm_loadl_epi64((const __m128i*) x1); x1 += 2;
    _mm_storel_epi64((__m128i*) y0, vt0); y0 += 2;
    _mm_storel_epi64((__m128i*) y1, vt1); y1 += 2;
  }
  if (n & 4) {
    const uint32_t vt0 = *x0;
    const uint32_t vt1 = *x1;
    *y0++ = vt0;
    *y1++ = vt1;
  }

  // Post-pad input channels.
  for (; r >= 16; r -= 16) {
    _mm_storeu_si128((__m128i*) y0, vc); y0 += 4;
    _mm_storeu_si128((__m128i*) y1, vc); y1 += 4;
  }
  if (r & 8) {
    _mm_storel_epi64((__m128i*) y0, vc); y0 += 2;
    _mm_storel_epi64((__m128i*) y1, vc); y1 += 2;
  }
  if (r & 4) {
    *y0 = c;
    *y1 = c;
  }
}
90