// Copyright 2019 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/intra_edge.h"
#include "src/utils/cpu.h"

#if LIBGAV1_ENABLE_NEON

#include <arm_neon.h>

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>  // memcpy

#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"  // RightShiftWithRounding()

namespace libgav1 {
namespace dsp {
namespace {

// Simplified version of intra_edge.cc:kKernels[][]. Only |strength| 1 and 2
// are required.
constexpr int kKernelsNEON[2][2] = {{4, 8}, {5, 6}};
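// For reference, the 3-tap filter computed below is
//   out[i] = (k0 * in[i - 1] + k1 * in[i] + k0 * in[i + 1] + 8) >> 4,
// where (k0, k1) is (4, 8) for |strength| 1 and (5, 6) for |strength| 2; each
// kernel's taps sum to 16, hence the rounding right shift by 4.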

void IntraEdgeFilter_NEON(void* buffer, const int size, const int strength) {
  assert(strength == 1 || strength == 2 || strength == 3);
  const int kernel_index = strength - 1;
  auto* const dst_buffer = static_cast<uint8_t*>(buffer);

  // The first element is not written out (but it is input), so the number of
  // elements written is |size| - 1.
  if (size == 1) return;

  // |strength| 1 and 2 use a 3-tap filter.
  if (strength < 3) {
    // The last value requires extending the buffer (duplicating
    // |dst_buffer[size - 1]|). Calculate it here to avoid extra processing in
    // NEON.
    const uint8_t last_val = RightShiftWithRounding(
        kKernelsNEON[kernel_index][0] * dst_buffer[size - 2] +
            kKernelsNEON[kernel_index][1] * dst_buffer[size - 1] +
            kKernelsNEON[kernel_index][0] * dst_buffer[size - 1],
        4);
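    // For example, with |strength| 1 this evaluates to
    //   (4 * dst_buffer[size - 2] + 12 * dst_buffer[size - 1] + 8) >> 4,
    // since the duplicated edge pixel receives both the center tap and one
    // outer tap (8 + 4 == 12).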

    const uint8x8_t krn1 = vdup_n_u8(kKernelsNEON[kernel_index][1]);

    // The first value we need gets overwritten by the output from the
    // previous iteration.
    uint8x16_t src_0 = vld1q_u8(dst_buffer);
    int i = 1;

    // Process blocks until fewer than 16 values remain.
    for (; i < size - 15; i += 16) {
      // Loading these at the end of the block with |src_0| will read past the
      // end of |top_row_data[160]|, the source of |buffer|.
      const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
      const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);
      uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2));
      sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]);
      sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1);
      uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2));
      sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]);
      sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1);

      const uint8x16_t result =
          vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));

      // Load the next row before overwriting. This loads an extra 15 values
      // past |size| on the trailing iteration.
      src_0 = vld1q_u8(dst_buffer + i + 15);

      vst1q_u8(dst_buffer + i, result);
    }

    // The last output value |last_val| was already calculated, so if
    // |remainder| == 1 we don't have to do anything.
    const int remainder = (size - 1) & 0xf;
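    // For example, with |size| 24 the loop above runs once (writing indices 1
    // through 16), leaving |remainder| == (23 & 0xf) == 7 values, the last of
    // which is |last_val|.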
    if (remainder > 1) {
      uint8_t temp[16];
      const uint8x16_t src_1 = vld1q_u8(dst_buffer + i);
      const uint8x16_t src_2 = vld1q_u8(dst_buffer + i + 1);

      uint16x8_t sum_lo = vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_2));
      sum_lo = vmulq_n_u16(sum_lo, kKernelsNEON[kernel_index][0]);
      sum_lo = vmlal_u8(sum_lo, vget_low_u8(src_1), krn1);
      uint16x8_t sum_hi = vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_2));
      sum_hi = vmulq_n_u16(sum_hi, kKernelsNEON[kernel_index][0]);
      sum_hi = vmlal_u8(sum_hi, vget_high_u8(src_1), krn1);

      const uint8x16_t result =
          vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));

      vst1q_u8(temp, result);
      memcpy(dst_buffer + i, temp, remainder);
    }

    dst_buffer[size - 1] = last_val;
    return;
  }

  assert(strength == 3);
  // 5-tap filter. The first element requires duplicating |buffer[0]| and the
  // last two elements require duplicating |buffer[size - 1]|.
  uint8_t special_vals[3];
  special_vals[0] = RightShiftWithRounding(
      (dst_buffer[0] << 1) + (dst_buffer[0] << 2) + (dst_buffer[1] << 2) +
          (dst_buffer[2] << 2) + (dst_buffer[3] << 1),
      4);
  // Clamp indices for very small |size| values.
  const int first_index_min = std::max(size - 4, 0);
  const int second_index_min = std::max(size - 3, 0);
  const int third_index_min = std::max(size - 2, 0);
  special_vals[1] = RightShiftWithRounding(
      (dst_buffer[first_index_min] << 1) + (dst_buffer[second_index_min] << 2) +
          (dst_buffer[third_index_min] << 2) + (dst_buffer[size - 1] << 2) +
          (dst_buffer[size - 1] << 1),
      4);
  special_vals[2] = RightShiftWithRounding(
      (dst_buffer[second_index_min] << 1) + (dst_buffer[third_index_min] << 2) +
          // (x << 2) + (x << 2) == x << 3
          (dst_buffer[size - 1] << 3) + (dst_buffer[size - 1] << 1),
      4);
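  // The shift-and-add sequences above implement the 5-tap kernel
  // {2, 4, 4, 4, 2} (taps sum to 16), with out-of-range inputs replaced by the
  // duplicated edge pixels as described above.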

  // The first two values we need get overwritten by the output from the
  // previous iteration.
  uint8x16_t src_0 = vld1q_u8(dst_buffer - 1);
  uint8x16_t src_1 = vld1q_u8(dst_buffer);
  int i = 1;

  for (; i < size - 15; i += 16) {
    // Loading these at the end of the block with |src_[01]| will read past
    // the end of |top_row_data[160]|, the source of |buffer|.
    const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
    const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
    const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);

    uint16x8_t sum_lo =
        vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1);
    const uint16x8_t sum_123_lo = vaddw_u8(
        vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3));
    sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2));

    uint16x8_t sum_hi =
        vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1);
    const uint16x8_t sum_123_hi =
        vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)),
                 vget_high_u8(src_3));
    sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2));

    const uint8x16_t result =
        vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));

    src_0 = vld1q_u8(dst_buffer + i + 14);
    src_1 = vld1q_u8(dst_buffer + i + 15);

    vst1q_u8(dst_buffer + i, result);
  }

  const int remainder = (size - 1) & 0xf;
  // As in the 3-tap case, but here the last two output values were already
  // calculated, so nothing remains to do when |remainder| <= 2.
  if (remainder > 2) {
    uint8_t temp[16];
    const uint8x16_t src_2 = vld1q_u8(dst_buffer + i);
    const uint8x16_t src_3 = vld1q_u8(dst_buffer + i + 1);
    const uint8x16_t src_4 = vld1q_u8(dst_buffer + i + 2);

    uint16x8_t sum_lo =
        vshlq_n_u16(vaddl_u8(vget_low_u8(src_0), vget_low_u8(src_4)), 1);
    const uint16x8_t sum_123_lo = vaddw_u8(
        vaddl_u8(vget_low_u8(src_1), vget_low_u8(src_2)), vget_low_u8(src_3));
    sum_lo = vaddq_u16(sum_lo, vshlq_n_u16(sum_123_lo, 2));

    uint16x8_t sum_hi =
        vshlq_n_u16(vaddl_u8(vget_high_u8(src_0), vget_high_u8(src_4)), 1);
    const uint16x8_t sum_123_hi =
        vaddw_u8(vaddl_u8(vget_high_u8(src_1), vget_high_u8(src_2)),
                 vget_high_u8(src_3));
    sum_hi = vaddq_u16(sum_hi, vshlq_n_u16(sum_123_hi, 2));

    const uint8x16_t result =
        vcombine_u8(vrshrn_n_u16(sum_lo, 4), vrshrn_n_u16(sum_hi, 4));

    vst1q_u8(temp, result);
    memcpy(dst_buffer + i, temp, remainder);
  }

  dst_buffer[1] = special_vals[0];
  // Avoid overwriting |dst_buffer[0]|.
  if (size > 2) dst_buffer[size - 2] = special_vals[1];
  dst_buffer[size - 1] = special_vals[2];
}

// (-|src0| + |src1| * 9 + |src2| * 9 - |src3|) >> 4
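// vqrshrun_n_s16() rounds, shifts right by 4, and saturates to [0, 255] while
// narrowing, which provides the clamping required by the upsampling filter.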
uint8x8_t Upsample(const uint8x8_t src0, const uint8x8_t src1,
                   const uint8x8_t src2, const uint8x8_t src3) {
  const uint16x8_t middle = vmulq_n_u16(vaddl_u8(src1, src2), 9);
  const uint16x8_t ends = vaddl_u8(src0, src3);
  const int16x8_t sum =
      vsubq_s16(vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(ends));
  return vqrshrun_n_s16(sum, 4);
}

void IntraEdgeUpsampler_NEON(void* buffer, const int size) {
  assert(size % 4 == 0 && size <= 16);
  auto* const pixel_buffer = static_cast<uint8_t*>(buffer);
  // This is OK because we don't read this value for |size| 4 or 8, but
  // writing |pixel_buffer[size]| and then vld()ing it seems to introduce some
  // latency.
  pixel_buffer[-2] = pixel_buffer[-1];
  if (size == 4) {
    // This uses one load and two vtbl()s, which is better than 4x
    // Load{Lo,Hi}4().
    const uint8x8_t src = vld1_u8(pixel_buffer - 1);
    // The outside values are negated, so put those in the same vector.
    const uint8x8_t src03 = vtbl1_u8(src, vcreate_u8(0x0404030202010000));
    // Reverse |src1| and |src2| so we can use |src2| for the interleave at the
    // end.
    const uint8x8_t src21 = vtbl1_u8(src, vcreate_u8(0x0302010004030201));
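    // vcreate_u8() builds the table-index vector from a little-endian
    // uint64_t, so the low byte selects lane 0: |src03| gathers lanes
    // {0, 0, 1, 2, 2, 3, 4, 4} of |src| and |src21| gathers lanes
    // {1, 2, 3, 4, 0, 1, 2, 3}.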

    const uint16x8_t middle = vmull_u8(src21, vdup_n_u8(9));
    const int16x8_t half_sum = vsubq_s16(
        vreinterpretq_s16_u16(middle), vreinterpretq_s16_u16(vmovl_u8(src03)));
    const int16x4_t sum =
        vadd_s16(vget_low_s16(half_sum), vget_high_s16(half_sum));
    const uint8x8_t result = vqrshrun_n_s16(vcombine_s16(sum, sum), 4);

    vst1_u8(pixel_buffer - 1, InterleaveLow8(result, src21));
    return;
  } else if (size == 8) {
    // Likewise, one load plus multiple vtbl()s seems preferable to multiple
    // loads.
    const uint8x16_t src = vld1q_u8(pixel_buffer - 1);
    const uint8x8_t src0 = VQTbl1U8(src, vcreate_u8(0x0605040302010000));
    const uint8x8_t src1 = vget_low_u8(src);
    const uint8x8_t src2 = VQTbl1U8(src, vcreate_u8(0x0807060504030201));
    const uint8x8_t src3 = VQTbl1U8(src, vcreate_u8(0x0808070605040302));

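    // vst2_u8() interleaves its two vectors on store, placing the filtered
    // values between the original pixels.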
    const uint8x8x2_t output = {Upsample(src0, src1, src2, src3), src2};
    vst2_u8(pixel_buffer - 1, output);
    return;
  }
  assert(size == 12 || size == 16);
  // Extend the input borders to avoid branching later.
  pixel_buffer[size] = pixel_buffer[size - 1];
  const uint8x16_t src0 = vld1q_u8(pixel_buffer - 2);
  const uint8x16_t src1 = vld1q_u8(pixel_buffer - 1);
  const uint8x16_t src2 = vld1q_u8(pixel_buffer);
  const uint8x16_t src3 = vld1q_u8(pixel_buffer + 1);
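  // These loads are staggered by one byte each, so lane i of |src0| through
  // |src3| holds the 4-tap window used to compute output i.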

  const uint8x8_t result_lo = Upsample(vget_low_u8(src0), vget_low_u8(src1),
                                       vget_low_u8(src2), vget_low_u8(src3));

  const uint8x8x2_t output_lo = {result_lo, vget_low_u8(src2)};
  vst2_u8(pixel_buffer - 1, output_lo);

  const uint8x8_t result_hi = Upsample(vget_high_u8(src0), vget_high_u8(src1),
                                       vget_high_u8(src2), vget_high_u8(src3));

  if (size == 12) {
    vst1_u8(pixel_buffer + 15, InterleaveLow8(result_hi, vget_high_u8(src2)));
  } else /* size == 16 */ {
    const uint8x8x2_t output_hi = {result_hi, vget_high_u8(src2)};
    vst2_u8(pixel_buffer + 15, output_hi);
  }
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  dsp->intra_edge_filter = IntraEdgeFilter_NEON;
  dsp->intra_edge_upsampler = IntraEdgeUpsampler_NEON;
}

}  // namespace

void IntraEdgeInit_NEON() { Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1

#else  // !LIBGAV1_ENABLE_NEON
namespace libgav1 {
namespace dsp {

void IntraEdgeInit_NEON() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_NEON