• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/super_res.h"
16 #include "src/utils/cpu.h"
17 
18 #if LIBGAV1_ENABLE_NEON
19 
#include <arm_neon.h>

#include <cassert>

#include "src/dsp/arm/common_neon.h"
#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"
27 
28 namespace libgav1 {
29 namespace dsp {
30 
31 namespace low_bitdepth {
32 namespace {
33 
SuperResCoefficients_NEON(const int upscaled_width,const int initial_subpixel_x,const int step,void * const coefficients)34 void SuperResCoefficients_NEON(const int upscaled_width,
35                                const int initial_subpixel_x, const int step,
36                                void* const coefficients) {
37   auto* dst = static_cast<uint8_t*>(coefficients);
38   int subpixel_x = initial_subpixel_x;
39   int x = RightShiftWithCeiling(upscaled_width, 3);
40   do {
41     uint8x8_t filter[8];
42     uint8x16_t d[kSuperResFilterTaps / 2];
43     for (int i = 0; i < 8; ++i, subpixel_x += step) {
44       filter[i] =
45           vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
46                                          kSuperResExtraBits]);
47     }
48     Transpose8x8(filter, d);
49     vst1q_u8(dst, d[0]);
50     dst += 16;
51     vst1q_u8(dst, d[1]);
52     dst += 16;
53     vst1q_u8(dst, d[2]);
54     dst += 16;
55     vst1q_u8(dst, d[3]);
56     dst += 16;
57   } while (--x != 0);
58 }
59 
60 // Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
61 // Maximum sum: 255*171 == 0xAA55
62 // The sum is clipped to [0, 255], so adding all positive and then
63 // subtracting all negative with saturation is sufficient.
64 //           0 1 2 3 4 5 6 7
65 // tap sign: - + - + + - + -
inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
                          const uint8_t** coefficients) {
  // Filters eight output pixels at once. |src[k]| supplies the samples that
  // are multiplied by tap k. Four 16-byte coefficient vectors are consumed and
  // |*coefficients| is advanced past them (64 bytes). Vector i carries tap 2*i
  // in its low half and tap 2*i+1 in its high half.
  uint8x16_t taps[kSuperResFilterTaps / 2];
  for (int i = 0; i < kSuperResFilterTaps / 2; ++i) {
    taps[i] = vld1q_u8(*coefficients);
    *coefficients += 16;
  }

  // Taps 0, 2, 5 and 7 are the negative ones; their magnitudes are summed
  // separately so that everything stays unsigned.
  uint16x8_t negative = vmull_u8(src[0], vget_low_u8(taps[0]));
  negative = vmlal_u8(negative, src[2], vget_low_u8(taps[1]));
  negative = vmlal_u8(negative, src[5], vget_high_u8(taps[2]));
  negative = vmlal_u8(negative, src[7], vget_high_u8(taps[3]));

  // Taps 1, 3, 4 and 6 are the positive ones.
  uint16x8_t positive = vmull_u8(src[1], vget_high_u8(taps[0]));
  positive = vmlal_u8(positive, src[3], vget_high_u8(taps[1]));
  positive = vmlal_u8(positive, src[4], vget_low_u8(taps[2]));
  positive = vmlal_u8(positive, src[6], vget_low_u8(taps[3]));

  // The saturating subtract floors the sum at zero; the saturating, rounding
  // narrow caps it at 255.
  const uint16x8_t sum = vqsubq_u16(positive, negative);
  return vqrshrn_n_u16(sum, kFilterBits);
}
83 
SuperRes_NEON(const void * const coefficients,void * const source,const ptrdiff_t source_stride,const int height,const int downscaled_width,const int upscaled_width,const int initial_subpixel_x,const int step,void * const dest,const ptrdiff_t dest_stride)84 void SuperRes_NEON(const void* const coefficients, void* const source,
85                    const ptrdiff_t source_stride, const int height,
86                    const int downscaled_width, const int upscaled_width,
87                    const int initial_subpixel_x, const int step,
88                    void* const dest, const ptrdiff_t dest_stride) {
89   auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
90   auto* dst = static_cast<uint8_t*>(dest);
91   int y = height;
92   do {
93     const auto* filter = static_cast<const uint8_t*>(coefficients);
94     uint8_t* dst_ptr = dst;
95     ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
96                         kSuperResHorizontalBorder, kSuperResHorizontalBorder);
97     int subpixel_x = initial_subpixel_x;
98     uint8x8_t sr[8];
99     uint8x16_t s[8];
100     int x = RightShiftWithCeiling(upscaled_width, 4);
101     // The below code calculates up to 15 extra upscaled
102     // pixels which will over-read up to 15 downscaled pixels in the end of each
103     // row. kSuperResHorizontalPadding accounts for this.
104     do {
105       for (int i = 0; i < 8; ++i, subpixel_x += step) {
106         sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
107       }
108       for (int i = 0; i < 8; ++i, subpixel_x += step) {
109         const uint8x8_t s_hi = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
110         s[i] = vcombine_u8(sr[i], s_hi);
111       }
112       Transpose8x16(s);
113       // Do not use loop for the following 8 instructions, since the compiler
114       // will generate redundant code.
115       sr[0] = vget_low_u8(s[0]);
116       sr[1] = vget_low_u8(s[1]);
117       sr[2] = vget_low_u8(s[2]);
118       sr[3] = vget_low_u8(s[3]);
119       sr[4] = vget_low_u8(s[4]);
120       sr[5] = vget_low_u8(s[5]);
121       sr[6] = vget_low_u8(s[6]);
122       sr[7] = vget_low_u8(s[7]);
123       const uint8x8_t d0 = SuperRes(sr, &filter);
124       // Do not use loop for the following 8 instructions, since the compiler
125       // will generate redundant code.
126       sr[0] = vget_high_u8(s[0]);
127       sr[1] = vget_high_u8(s[1]);
128       sr[2] = vget_high_u8(s[2]);
129       sr[3] = vget_high_u8(s[3]);
130       sr[4] = vget_high_u8(s[4]);
131       sr[5] = vget_high_u8(s[5]);
132       sr[6] = vget_high_u8(s[6]);
133       sr[7] = vget_high_u8(s[7]);
134       const uint8x8_t d1 = SuperRes(sr, &filter);
135       vst1q_u8(dst_ptr, vcombine_u8(d0, d1));
136       dst_ptr += 16;
137     } while (--x != 0);
138     src += source_stride;
139     dst += dest_stride;
140   } while (--y != 0);
141 }
142 
Init8bpp()143 void Init8bpp() {
144   Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
145   dsp->super_res_coefficients = SuperResCoefficients_NEON;
146   dsp->super_res = SuperRes_NEON;
147 }
148 
149 }  // namespace
150 }  // namespace low_bitdepth
151 
152 //------------------------------------------------------------------------------
153 #if LIBGAV1_MAX_BITDEPTH >= 10
154 namespace high_bitdepth {
155 namespace {
156 
SuperResCoefficients_NEON(const int upscaled_width,const int initial_subpixel_x,const int step,void * const coefficients)157 void SuperResCoefficients_NEON(const int upscaled_width,
158                                const int initial_subpixel_x, const int step,
159                                void* const coefficients) {
160   auto* dst = static_cast<uint16_t*>(coefficients);
161   int subpixel_x = initial_subpixel_x;
162   int x = RightShiftWithCeiling(upscaled_width, 3);
163   do {
164     uint16x8_t filter[8];
165     for (int i = 0; i < 8; ++i, subpixel_x += step) {
166       const uint8x8_t filter_8 =
167           vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
168                                          kSuperResExtraBits]);
169       // uint8_t -> uint16_t
170       filter[i] = vmovl_u8(filter_8);
171     }
172 
173     Transpose8x8(filter);
174 
175     vst1q_u16(dst, filter[0]);
176     dst += 8;
177     vst1q_u16(dst, filter[1]);
178     dst += 8;
179     vst1q_u16(dst, filter[2]);
180     dst += 8;
181     vst1q_u16(dst, filter[3]);
182     dst += 8;
183     vst1q_u16(dst, filter[4]);
184     dst += 8;
185     vst1q_u16(dst, filter[5]);
186     dst += 8;
187     vst1q_u16(dst, filter[6]);
188     dst += 8;
189     vst1q_u16(dst, filter[7]);
190     dst += 8;
191   } while (--x != 0);
192 }
193 
194 // The sum is clipped to [0, ((1 << bitdepth) -1)]. Adding all positive and then
195 // subtracting all negative with saturation will clip to zero.
196 //           0 1 2 3 4 5 6 7
197 // tap sign: - + - + + - + -
inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps],
                           const uint16_t** coefficients, int bitdepth) {
  // Filters eight output pixels at once. |src[k]| supplies the samples that
  // are multiplied by tap k, and one 8-lane coefficient vector is consumed per
  // tap, advancing |*coefficients| accordingly.
  uint16x8_t taps[kSuperResFilterTaps];
  for (int k = 0; k < kSuperResFilterTaps; ++k) {
    taps[k] = vld1q_u16(*coefficients);
    *coefficients += 8;
  }

  // Taps 1, 3, 4 and 6 are the positive ones. The products are 32 bits wide,
  // so lanes 0..3 (lo) and 4..7 (hi) are accumulated separately.
  uint32x4_t positive_lo =
      vmull_u16(vget_low_u16(src[1]), vget_low_u16(taps[1]));
  uint32x4_t positive_hi =
      vmull_u16(vget_high_u16(src[1]), vget_high_u16(taps[1]));
  positive_lo =
      vmlal_u16(positive_lo, vget_low_u16(src[3]), vget_low_u16(taps[3]));
  positive_hi =
      vmlal_u16(positive_hi, vget_high_u16(src[3]), vget_high_u16(taps[3]));
  positive_lo =
      vmlal_u16(positive_lo, vget_low_u16(src[4]), vget_low_u16(taps[4]));
  positive_hi =
      vmlal_u16(positive_hi, vget_high_u16(src[4]), vget_high_u16(taps[4]));
  positive_lo =
      vmlal_u16(positive_lo, vget_low_u16(src[6]), vget_low_u16(taps[6]));
  positive_hi =
      vmlal_u16(positive_hi, vget_high_u16(src[6]), vget_high_u16(taps[6]));

  // Taps 0, 2, 5 and 7 are the negative ones; sum their magnitudes.
  uint32x4_t negative_lo =
      vmull_u16(vget_low_u16(src[0]), vget_low_u16(taps[0]));
  uint32x4_t negative_hi =
      vmull_u16(vget_high_u16(src[0]), vget_high_u16(taps[0]));
  negative_lo =
      vmlal_u16(negative_lo, vget_low_u16(src[2]), vget_low_u16(taps[2]));
  negative_hi =
      vmlal_u16(negative_hi, vget_high_u16(src[2]), vget_high_u16(taps[2]));
  negative_lo =
      vmlal_u16(negative_lo, vget_low_u16(src[5]), vget_low_u16(taps[5]));
  negative_hi =
      vmlal_u16(negative_hi, vget_high_u16(src[5]), vget_high_u16(taps[5]));
  negative_lo =
      vmlal_u16(negative_lo, vget_low_u16(src[7]), vget_low_u16(taps[7]));
  negative_hi =
      vmlal_u16(negative_hi, vget_high_u16(src[7]), vget_high_u16(taps[7]));

  // The saturating subtract floors each lane at zero.
  const uint32x4_t sum_lo = vqsubq_u32(positive_lo, negative_lo);
  const uint32x4_t sum_hi = vqsubq_u32(positive_hi, negative_hi);

  const uint16x8_t rounded = vcombine_u16(vqrshrn_n_u32(sum_lo, kFilterBits),
                                          vqrshrn_n_u32(sum_hi, kFilterBits));

  // Cap at the largest value representable in |bitdepth| bits.
  const uint16x8_t max_pixel = vdupq_n_u16((1 << bitdepth) - 1);
  return vminq_u16(rounded, max_pixel);
}
235 
236 template <int bitdepth>
SuperRes_NEON(const void * const coefficients,void * const source,const ptrdiff_t source_stride,const int height,const int downscaled_width,const int upscaled_width,const int initial_subpixel_x,const int step,void * const dest,const ptrdiff_t dest_stride)237 void SuperRes_NEON(const void* const coefficients, void* const source,
238                    const ptrdiff_t source_stride, const int height,
239                    const int downscaled_width, const int upscaled_width,
240                    const int initial_subpixel_x, const int step,
241                    void* const dest, const ptrdiff_t dest_stride) {
242   auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
243   auto* dst = static_cast<uint16_t*>(dest);
244   int y = height;
245   do {
246     const auto* filter = static_cast<const uint16_t*>(coefficients);
247     uint16_t* dst_ptr = dst;
248     ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
249                          kSuperResHorizontalBorder, kSuperResHorizontalBorder);
250     int subpixel_x = initial_subpixel_x;
251     uint16x8_t sr[8];
252     int x = RightShiftWithCeiling(upscaled_width, 3);
253     // The below code calculates up to 7 extra upscaled
254     // pixels which will over-read up to 7 downscaled pixels in the end of each
255     // row. kSuperResHorizontalBorder accounts for this.
256     do {
257       for (int i = 0; i < 8; ++i, subpixel_x += step) {
258         sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]);
259       }
260 
261       Transpose8x8(sr);
262 
263       const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth);
264       vst1q_u16(dst_ptr, d0);
265       dst_ptr += 8;
266     } while (--x != 0);
267     src += source_stride;
268     dst += dest_stride;
269   } while (--y != 0);
270 }
271 
Init10bpp()272 void Init10bpp() {
273   Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
274   assert(dsp != nullptr);
275   dsp->super_res_coefficients = SuperResCoefficients_NEON;
276   dsp->super_res = SuperRes_NEON<10>;
277 }
278 
279 }  // namespace
280 }  // namespace high_bitdepth
281 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
282 
SuperResInit_NEON()283 void SuperResInit_NEON() {
284   low_bitdepth::Init8bpp();
285 #if LIBGAV1_MAX_BITDEPTH >= 10
286   high_bitdepth::Init10bpp();
287 #endif
288 }
289 }  // namespace dsp
290 }  // namespace libgav1
291 
292 #else   // !LIBGAV1_ENABLE_NEON
293 
294 namespace libgav1 {
295 namespace dsp {
296 
// NEON is disabled in this build, so there is nothing to register.
void SuperResInit_NEON() {
}
298 
299 }  // namespace dsp
300 }  // namespace libgav1
301 #endif  // LIBGAV1_ENABLE_NEON
302