// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#include <emmintrin.h>

#include <xnnpack/pad.h>


xnn_x32_pad_x2__sse2(size_t m,size_t n,size_t l,size_t r,uint32_t c,const void * x,size_t x_stride,void * y,size_t y_stride)13 void xnn_x32_pad_x2__sse2(
14     size_t m,
15     size_t n,
16     size_t l,
17     size_t r,
18     uint32_t c,
19     const void* x,
20     size_t x_stride,
21     void* y,
22     size_t y_stride)
23 {
24   assert(m <= 2);
25   assert(l % 4 == 0);
26   assert(n % 4 == 0);
27   assert(r % 4 == 0);
28 
29   const uint32_t* x0 = x;
30   uint32_t* y0 = y;
31 
32   const uint32_t* x1 = (const uint32_t*) ((uintptr_t) x0 + x_stride);
33   uint32_t* y1 = (uint32_t*) ((uintptr_t) y0 + y_stride);
34   if (m != 2) {
35     x1 = x0;
36     y1 = y0;
37   }
38   const __m128i vc = _mm_set1_epi32((int) c);
39 
40   // Pre-pad input channels.
41   for (; l >= 16; l -= 16) {
42     _mm_storeu_si128((__m128i*) y0, vc); y0 += 4;
43     _mm_storeu_si128((__m128i*) y1, vc); y1 += 4;
44   }
45   if (l & 8) {
46     _mm_storel_epi64((__m128i*) y0, vc); y0 += 2;
47     _mm_storel_epi64((__m128i*) y1, vc); y1 += 2;
48   }
49   if (l & 4) {
50     *((uint32_t*) y0) = (uint32_t) _mm_cvtsi128_si32(vc); y0 += 1;
51     *((uint32_t*) y1) = (uint32_t) _mm_cvtsi128_si32(vc); y1 += 1;
52   }
53 
54   // Copy input channels.
55   for (; n >= 16; n -= 16) {
56     const __m128i vt0 = _mm_loadu_si128((const __m128i*) x0); x0 += 4;
57     const __m128i vt1 = _mm_loadu_si128((const __m128i*) x1); x1 += 4;
58     _mm_storeu_si128((__m128i*) y0, vt0); y0 += 4;
59     _mm_storeu_si128((__m128i*) y1, vt1); y1 += 4;
60   }
61   if (n != 0) {
62     __m128i vt0 = _mm_loadu_si128((const __m128i*) x0);
63     __m128i vt1 = _mm_loadu_si128((const __m128i*) x1);
64     if (n & 8) {
65       _mm_storel_epi64((__m128i*) y0, vt0); y0 += 2;
66       _mm_storel_epi64((__m128i*) y1, vt1); y1 += 2;
67       vt0 = _mm_unpackhi_epi64(vt0, vt0);
68       vt1 = _mm_unpackhi_epi64(vt1, vt1);
69     }
70     if (n & 4) {
71       *((uint32_t*) y0) = (uint32_t) _mm_cvtsi128_si32(vt0); y0 += 1;
72       *((uint32_t*) y1) = (uint32_t) _mm_cvtsi128_si32(vt1); y1 += 1;
73     }
74   }
75 
76   // Post-pad input channels.
77   for (; r >= 16; r -= 16) {
78     _mm_storeu_si128((__m128i*) y0, vc); y0 += 4;
79     _mm_storeu_si128((__m128i*) y1, vc); y1 += 4;
80   }
81   if (r & 8) {
82     _mm_storel_epi64((__m128i*) y0, vc); y0 += 2;
83     _mm_storel_epi64((__m128i*) y1, vc); y1 += 2;
84   }
85   if (r & 4) {
86     *((uint32_t*) y0) = (uint32_t) _mm_cvtsi128_si32(vc);
87     *((uint32_t*) y1) = (uint32_t) _mm_cvtsi128_si32(vc);
88   }
89 }
90