1 /*
2 * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12 #include <string.h>
13
14 #include "config/aom_dsp_rtcd.h"
15
// Copies a block of 8-bit samples from src to dst, one row at a time, for h
// rows. src_stride and dst_stride are the distances (in bytes) between the
// starts of consecutive rows.
//
// The amount copied per row is selected from the low bits of w:
//   - w a multiple of 16: w bytes, as 16-byte NEON load/store pairs;
//   - otherwise a multiple of 8: exactly 8 bytes;
//   - otherwise a multiple of 4: exactly 4 bytes;
//   - otherwise a multiple of 2: exactly 2 bytes;
//   - odd w: nothing is copied.
void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  if ((w & 0x0F) == 0) {
    // Wide rows: walk each row in 16-byte vectors.
    const int vectors_per_row = w >> 4;
    for (int rows_left = h; rows_left > 0; --rows_left) {
      const uint8_t *in = src;
      uint8_t *out = dst;
      for (int i = 0; i < vectors_per_row; ++i, in += 16, out += 16) {
        const uint8x16_t pixels = vld1q_u8(in);
        vst1q_u8(out, pixels);
      }
      src += src_stride;
      dst += dst_stride;
    }
    return;
  }

  if ((w & 0x07) == 0) {
    // One 8-byte vector per row.
    for (int rows_left = h; rows_left > 0; --rows_left) {
      const uint8x8_t pixels = vld1_u8(src);
      vst1_u8(dst, pixels);
      src += src_stride;
      dst += dst_stride;
    }
    return;
  }

  if ((w & 0x03) == 0) {
    // Four bytes per row; memcpy avoids any alignment assumption.
    const size_t row_bytes = sizeof(uint32_t);
    for (int rows_left = h; rows_left > 0; --rows_left) {
      memcpy(dst, src, row_bytes);
      src += src_stride;
      dst += dst_stride;
    }
    return;
  }

  if ((w & 0x01) == 0) {
    // Two bytes per row.
    const size_t row_bytes = sizeof(uint16_t);
    for (int rows_left = h; rows_left > 0; --rows_left) {
      memcpy(dst, src, row_bytes);
      src += src_stride;
      dst += dst_stride;
    }
  }
}
54
55 #if CONFIG_AV1_HIGHBITDEPTH
// Copies a block of 16-bit samples from src to dst for h rows. src_stride and
// dst_stride are the distances, in uint16_t elements, between the starts of
// consecutive rows.
//
// The width selects one of several fixed-size copy paths:
//   - w == 2:           2 samples per row;
//   - other w < 8:      4 samples per row;
//   - w == 8:           8 samples per row;
//   - 8 < w < 32:      16 samples per row;
//   - w == 32:         32 samples per row;
//   - anything larger: whole 64-sample chunks until w is covered.
//
// The paths narrower than 32 process two rows per iteration and loop until h
// reaches exactly zero, so h must be a positive even number for those widths.
// The wider paths process one row per iteration and require h > 0.
void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
                                   int h) {
  if (w == 2) {  // copy2
    // A 4-lane vector store would overwrite two samples past the right edge of
    // the block on every row, so copy exactly two samples instead.
    do {
      memcpy(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;

      memcpy(dst, src, 2 * sizeof(*src));
      src += src_stride;
      dst += dst_stride;
      h -= 2;
    } while (h != 0);
  } else if (w < 8) {  // copy4
    uint16x4_t s0, s1;
    do {
      s0 = vld1_u16(src);
      src += src_stride;
      s1 = vld1_u16(src);
      src += src_stride;

      vst1_u16(dst, s0);
      dst += dst_stride;
      vst1_u16(dst, s1);
      dst += dst_stride;
      h -= 2;
    } while (h != 0);
  } else if (w == 8) {  // copy8
    uint16x8_t s0, s1;
    do {
      s0 = vld1q_u16(src);
      src += src_stride;
      s1 = vld1q_u16(src);
      src += src_stride;

      vst1q_u16(dst, s0);
      dst += dst_stride;
      vst1q_u16(dst, s1);
      dst += dst_stride;
      h -= 2;
    } while (h != 0);
  } else if (w < 32) {  // copy16
    uint16x8_t s0, s1, s2, s3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      src += src_stride;
      s2 = vld1q_u16(src);
      s3 = vld1q_u16(src + 8);
      src += src_stride;

      vst1q_u16(dst, s0);
      vst1q_u16(dst + 8, s1);
      dst += dst_stride;
      vst1q_u16(dst, s2);
      vst1q_u16(dst + 8, s3);
      dst += dst_stride;
      h -= 2;
    } while (h != 0);
  } else if (w == 32) {  // copy32
    uint16x8_t s0, s1, s2, s3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      src += src_stride;

      vst1q_u16(dst, s0);
      vst1q_u16(dst + 8, s1);
      vst1q_u16(dst + 16, s2);
      vst1q_u16(dst + 24, s3);
      dst += dst_stride;
    } while (--h != 0);
  } else {  // copy64
    uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
    do {
      // Walk the row in 64-sample chunks; s/d track the position within the
      // current row while src/dst stay at the row start.
      const uint16_t *s = src;
      uint16_t *d = dst;
      int width = w;
      do {
        s0 = vld1q_u16(s);
        s1 = vld1q_u16(s + 8);
        s2 = vld1q_u16(s + 16);
        s3 = vld1q_u16(s + 24);
        s4 = vld1q_u16(s + 32);
        s5 = vld1q_u16(s + 40);
        s6 = vld1q_u16(s + 48);
        s7 = vld1q_u16(s + 56);

        vst1q_u16(d, s0);
        vst1q_u16(d + 8, s1);
        vst1q_u16(d + 16, s2);
        vst1q_u16(d + 24, s3);
        vst1q_u16(d + 32, s4);
        vst1q_u16(d + 40, s5);
        vst1q_u16(d + 48, s6);
        vst1q_u16(d + 56, s7);
        s += 64;
        d += 64;
        width -= 64;
      } while (width > 0);
      src += src_stride;
      dst += dst_stride;
    } while (--h != 0);
  }
}
153
154 #endif // CONFIG_AV1_HIGHBITDEPTH
155