1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/super_res.h"
16 #include "src/utils/cpu.h"
17
18 #if LIBGAV1_ENABLE_NEON
19
20 #include <arm_neon.h>
21
22 #include "src/dsp/arm/common_neon.h"
23 #include "src/dsp/constants.h"
24 #include "src/dsp/dsp.h"
25 #include "src/utils/common.h"
26 #include "src/utils/constants.h"
27
28 namespace libgav1 {
29 namespace dsp {
30
31 namespace low_bitdepth {
32 namespace {
33
SuperResCoefficients_NEON(const int upscaled_width,const int initial_subpixel_x,const int step,void * const coefficients)34 void SuperResCoefficients_NEON(const int upscaled_width,
35 const int initial_subpixel_x, const int step,
36 void* const coefficients) {
37 auto* dst = static_cast<uint8_t*>(coefficients);
38 int subpixel_x = initial_subpixel_x;
39 int x = RightShiftWithCeiling(upscaled_width, 3);
40 do {
41 uint8x8_t filter[8];
42 uint8x16_t d[kSuperResFilterTaps / 2];
43 for (int i = 0; i < 8; ++i, subpixel_x += step) {
44 filter[i] =
45 vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
46 kSuperResExtraBits]);
47 }
48 Transpose8x8(filter, d);
49 vst1q_u8(dst, d[0]);
50 dst += 16;
51 vst1q_u8(dst, d[1]);
52 dst += 16;
53 vst1q_u8(dst, d[2]);
54 dst += 16;
55 vst1q_u8(dst, d[3]);
56 dst += 16;
57 } while (--x != 0);
58 }
59
60 // Maximum sum of positive taps: 171 = 7 + 86 + 71 + 7
61 // Maximum sum: 255*171 == 0xAA55
62 // The sum is clipped to [0, 255], so adding all positive and then
63 // subtracting all negative with saturation is sufficient.
64 // 0 1 2 3 4 5 6 7
65 // tap sign: - + - + + - + -
SuperRes(const uint8x8_t src[kSuperResFilterTaps],const uint8_t ** coefficients)66 inline uint8x8_t SuperRes(const uint8x8_t src[kSuperResFilterTaps],
67 const uint8_t** coefficients) {
68 uint8x16_t f[kSuperResFilterTaps / 2];
69 for (int i = 0; i < kSuperResFilterTaps / 2; ++i, *coefficients += 16) {
70 f[i] = vld1q_u8(*coefficients);
71 }
72 uint16x8_t res = vmull_u8(src[1], vget_high_u8(f[0]));
73 res = vmlal_u8(res, src[3], vget_high_u8(f[1]));
74 res = vmlal_u8(res, src[4], vget_low_u8(f[2]));
75 res = vmlal_u8(res, src[6], vget_low_u8(f[3]));
76 uint16x8_t temp = vmull_u8(src[0], vget_low_u8(f[0]));
77 temp = vmlal_u8(temp, src[2], vget_low_u8(f[1]));
78 temp = vmlal_u8(temp, src[5], vget_high_u8(f[2]));
79 temp = vmlal_u8(temp, src[7], vget_high_u8(f[3]));
80 res = vqsubq_u16(res, temp);
81 return vqrshrn_n_u16(res, kFilterBits);
82 }
83
SuperRes_NEON(const void * const coefficients,void * const source,const ptrdiff_t source_stride,const int height,const int downscaled_width,const int upscaled_width,const int initial_subpixel_x,const int step,void * const dest,const ptrdiff_t dest_stride)84 void SuperRes_NEON(const void* const coefficients, void* const source,
85 const ptrdiff_t source_stride, const int height,
86 const int downscaled_width, const int upscaled_width,
87 const int initial_subpixel_x, const int step,
88 void* const dest, const ptrdiff_t dest_stride) {
89 auto* src = static_cast<uint8_t*>(source) - DivideBy2(kSuperResFilterTaps);
90 auto* dst = static_cast<uint8_t*>(dest);
91 int y = height;
92 do {
93 const auto* filter = static_cast<const uint8_t*>(coefficients);
94 uint8_t* dst_ptr = dst;
95 ExtendLine<uint8_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
96 kSuperResHorizontalBorder, kSuperResHorizontalBorder);
97 int subpixel_x = initial_subpixel_x;
98 uint8x8_t sr[8];
99 uint8x16_t s[8];
100 int x = RightShiftWithCeiling(upscaled_width, 4);
101 // The below code calculates up to 15 extra upscaled
102 // pixels which will over-read up to 15 downscaled pixels in the end of each
103 // row. kSuperResHorizontalPadding accounts for this.
104 do {
105 for (int i = 0; i < 8; ++i, subpixel_x += step) {
106 sr[i] = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
107 }
108 for (int i = 0; i < 8; ++i, subpixel_x += step) {
109 const uint8x8_t s_hi = vld1_u8(&src[subpixel_x >> kSuperResScaleBits]);
110 s[i] = vcombine_u8(sr[i], s_hi);
111 }
112 Transpose8x16(s);
113 // Do not use loop for the following 8 instructions, since the compiler
114 // will generate redundant code.
115 sr[0] = vget_low_u8(s[0]);
116 sr[1] = vget_low_u8(s[1]);
117 sr[2] = vget_low_u8(s[2]);
118 sr[3] = vget_low_u8(s[3]);
119 sr[4] = vget_low_u8(s[4]);
120 sr[5] = vget_low_u8(s[5]);
121 sr[6] = vget_low_u8(s[6]);
122 sr[7] = vget_low_u8(s[7]);
123 const uint8x8_t d0 = SuperRes(sr, &filter);
124 // Do not use loop for the following 8 instructions, since the compiler
125 // will generate redundant code.
126 sr[0] = vget_high_u8(s[0]);
127 sr[1] = vget_high_u8(s[1]);
128 sr[2] = vget_high_u8(s[2]);
129 sr[3] = vget_high_u8(s[3]);
130 sr[4] = vget_high_u8(s[4]);
131 sr[5] = vget_high_u8(s[5]);
132 sr[6] = vget_high_u8(s[6]);
133 sr[7] = vget_high_u8(s[7]);
134 const uint8x8_t d1 = SuperRes(sr, &filter);
135 vst1q_u8(dst_ptr, vcombine_u8(d0, d1));
136 dst_ptr += 16;
137 } while (--x != 0);
138 src += source_stride;
139 dst += dest_stride;
140 } while (--y != 0);
141 }
142
Init8bpp()143 void Init8bpp() {
144 Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
145 dsp->super_res_coefficients = SuperResCoefficients_NEON;
146 dsp->super_res = SuperRes_NEON;
147 }
148
149 } // namespace
150 } // namespace low_bitdepth
151
152 //------------------------------------------------------------------------------
153 #if LIBGAV1_MAX_BITDEPTH >= 10
154 namespace high_bitdepth {
155 namespace {
156
SuperResCoefficients_NEON(const int upscaled_width,const int initial_subpixel_x,const int step,void * const coefficients)157 void SuperResCoefficients_NEON(const int upscaled_width,
158 const int initial_subpixel_x, const int step,
159 void* const coefficients) {
160 auto* dst = static_cast<uint16_t*>(coefficients);
161 int subpixel_x = initial_subpixel_x;
162 int x = RightShiftWithCeiling(upscaled_width, 3);
163 do {
164 uint16x8_t filter[8];
165 for (int i = 0; i < 8; ++i, subpixel_x += step) {
166 const uint8x8_t filter_8 =
167 vld1_u8(kUpscaleFilterUnsigned[(subpixel_x & kSuperResScaleMask) >>
168 kSuperResExtraBits]);
169 // uint8_t -> uint16_t
170 filter[i] = vmovl_u8(filter_8);
171 }
172
173 Transpose8x8(filter);
174
175 vst1q_u16(dst, filter[0]);
176 dst += 8;
177 vst1q_u16(dst, filter[1]);
178 dst += 8;
179 vst1q_u16(dst, filter[2]);
180 dst += 8;
181 vst1q_u16(dst, filter[3]);
182 dst += 8;
183 vst1q_u16(dst, filter[4]);
184 dst += 8;
185 vst1q_u16(dst, filter[5]);
186 dst += 8;
187 vst1q_u16(dst, filter[6]);
188 dst += 8;
189 vst1q_u16(dst, filter[7]);
190 dst += 8;
191 } while (--x != 0);
192 }
193
194 // The sum is clipped to [0, ((1 << bitdepth) -1)]. Adding all positive and then
195 // subtracting all negative with saturation will clip to zero.
196 // 0 1 2 3 4 5 6 7
197 // tap sign: - + - + + - + -
SuperRes(const uint16x8_t src[kSuperResFilterTaps],const uint16_t ** coefficients,int bitdepth)198 inline uint16x8_t SuperRes(const uint16x8_t src[kSuperResFilterTaps],
199 const uint16_t** coefficients, int bitdepth) {
200 uint16x8_t f[kSuperResFilterTaps];
201 for (int i = 0; i < kSuperResFilterTaps; ++i, *coefficients += 8) {
202 f[i] = vld1q_u16(*coefficients);
203 }
204
205 uint32x4_t res_lo = vmull_u16(vget_low_u16(src[1]), vget_low_u16(f[1]));
206 res_lo = vmlal_u16(res_lo, vget_low_u16(src[3]), vget_low_u16(f[3]));
207 res_lo = vmlal_u16(res_lo, vget_low_u16(src[4]), vget_low_u16(f[4]));
208 res_lo = vmlal_u16(res_lo, vget_low_u16(src[6]), vget_low_u16(f[6]));
209
210 uint32x4_t temp_lo = vmull_u16(vget_low_u16(src[0]), vget_low_u16(f[0]));
211 temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[2]), vget_low_u16(f[2]));
212 temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[5]), vget_low_u16(f[5]));
213 temp_lo = vmlal_u16(temp_lo, vget_low_u16(src[7]), vget_low_u16(f[7]));
214
215 res_lo = vqsubq_u32(res_lo, temp_lo);
216
217 uint32x4_t res_hi = vmull_u16(vget_high_u16(src[1]), vget_high_u16(f[1]));
218 res_hi = vmlal_u16(res_hi, vget_high_u16(src[3]), vget_high_u16(f[3]));
219 res_hi = vmlal_u16(res_hi, vget_high_u16(src[4]), vget_high_u16(f[4]));
220 res_hi = vmlal_u16(res_hi, vget_high_u16(src[6]), vget_high_u16(f[6]));
221
222 uint32x4_t temp_hi = vmull_u16(vget_high_u16(src[0]), vget_high_u16(f[0]));
223 temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[2]), vget_high_u16(f[2]));
224 temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[5]), vget_high_u16(f[5]));
225 temp_hi = vmlal_u16(temp_hi, vget_high_u16(src[7]), vget_high_u16(f[7]));
226
227 res_hi = vqsubq_u32(res_hi, temp_hi);
228
229 const uint16x8_t res = vcombine_u16(vqrshrn_n_u32(res_lo, kFilterBits),
230 vqrshrn_n_u32(res_hi, kFilterBits));
231
232 // Clip the result at (1 << bd) - 1.
233 return vminq_u16(res, vdupq_n_u16((1 << bitdepth) - 1));
234 }
235
236 template <int bitdepth>
SuperRes_NEON(const void * const coefficients,void * const source,const ptrdiff_t source_stride,const int height,const int downscaled_width,const int upscaled_width,const int initial_subpixel_x,const int step,void * const dest,const ptrdiff_t dest_stride)237 void SuperRes_NEON(const void* const coefficients, void* const source,
238 const ptrdiff_t source_stride, const int height,
239 const int downscaled_width, const int upscaled_width,
240 const int initial_subpixel_x, const int step,
241 void* const dest, const ptrdiff_t dest_stride) {
242 auto* src = static_cast<uint16_t*>(source) - DivideBy2(kSuperResFilterTaps);
243 auto* dst = static_cast<uint16_t*>(dest);
244 int y = height;
245 do {
246 const auto* filter = static_cast<const uint16_t*>(coefficients);
247 uint16_t* dst_ptr = dst;
248 ExtendLine<uint16_t>(src + DivideBy2(kSuperResFilterTaps), downscaled_width,
249 kSuperResHorizontalBorder, kSuperResHorizontalBorder);
250 int subpixel_x = initial_subpixel_x;
251 uint16x8_t sr[8];
252 int x = RightShiftWithCeiling(upscaled_width, 3);
253 // The below code calculates up to 7 extra upscaled
254 // pixels which will over-read up to 7 downscaled pixels in the end of each
255 // row. kSuperResHorizontalBorder accounts for this.
256 do {
257 for (int i = 0; i < 8; ++i, subpixel_x += step) {
258 sr[i] = vld1q_u16(&src[subpixel_x >> kSuperResScaleBits]);
259 }
260
261 Transpose8x8(sr);
262
263 const uint16x8_t d0 = SuperRes(sr, &filter, bitdepth);
264 vst1q_u16(dst_ptr, d0);
265 dst_ptr += 8;
266 } while (--x != 0);
267 src += source_stride;
268 dst += dest_stride;
269 } while (--y != 0);
270 }
271
Init10bpp()272 void Init10bpp() {
273 Dsp* dsp = dsp_internal::GetWritableDspTable(kBitdepth10);
274 assert(dsp != nullptr);
275 dsp->super_res_coefficients = SuperResCoefficients_NEON;
276 dsp->super_res = SuperRes_NEON<10>;
277 }
278
279 } // namespace
280 } // namespace high_bitdepth
281 #endif // LIBGAV1_MAX_BITDEPTH >= 10
282
SuperResInit_NEON()283 void SuperResInit_NEON() {
284 low_bitdepth::Init8bpp();
285 #if LIBGAV1_MAX_BITDEPTH >= 10
286 high_bitdepth::Init10bpp();
287 #endif
288 }
289 } // namespace dsp
290 } // namespace libgav1
291
292 #else // !LIBGAV1_ENABLE_NEON
293
294 namespace libgav1 {
295 namespace dsp {
296
// NEON is disabled in this build (LIBGAV1_ENABLE_NEON is false), so there is
// nothing to register.
void SuperResInit_NEON() {
}
298
299 } // namespace dsp
300 } // namespace libgav1
301 #endif // LIBGAV1_ENABLE_NEON
302