1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/dsp/warp.h"
16
17 #include <algorithm>
18 #include <cassert>
19 #include <cstddef>
20 #include <cstdint>
21 #include <cstdlib>
22 #include <type_traits>
23
24 #include "src/dsp/constants.h"
25 #include "src/dsp/dsp.h"
26 #include "src/utils/common.h"
27 #include "src/utils/constants.h"
28 #include "src/utils/memory.h"
29
30 namespace libgav1 {
31 namespace dsp {
32 namespace {
33
34 // Number of extra bits of precision in warped filtering.
35 constexpr int kWarpedDiffPrecisionBits = 10;
36
37 // Warp prediction output ranges from WarpTest.ShowRange.
38 // Bitdepth: 8 Input range: [ 0, 255]
39 // 8bpp intermediate offset: 16384.
40 // intermediate range: [ 4399, 61009]
41 // first pass output range: [ 550, 7626]
42 // 8bpp intermediate offset removal: 262144.
43 // intermediate range: [ -620566, 1072406]
44 // second pass output range: [ 0, 255]
45 // compound second pass output range: [ -4848, 8378]
46 //
47 // Bitdepth: 10 Input range: [ 0, 1023]
48 // intermediate range: [ -48081, 179025]
49 // first pass output range: [ -6010, 22378]
50 // intermediate range: [-2103516, 4198620]
51 // second pass output range: [ 0, 1023]
52 // compound second pass output range: [ 8142, 57378]
53 //
54 // Bitdepth: 12 Input range: [ 0, 4095]
55 // intermediate range: [ -192465, 716625]
56 // first pass output range: [ -6015, 22395]
57 // intermediate range: [-2105190, 4201830]
58 // second pass output range: [ 0, 4095]
59 // compound second pass output range: [ 8129, 57403]
60
61 template <bool is_compound, int bitdepth, typename Pixel>
Warp_C(const void * const source,ptrdiff_t source_stride,const int source_width,const int source_height,const int * const warp_params,const int subsampling_x,const int subsampling_y,const int block_start_x,const int block_start_y,const int block_width,const int block_height,const int16_t alpha,const int16_t beta,const int16_t gamma,const int16_t delta,void * dest,ptrdiff_t dest_stride)62 void Warp_C(const void* const source, ptrdiff_t source_stride,
63 const int source_width, const int source_height,
64 const int* const warp_params, const int subsampling_x,
65 const int subsampling_y, const int block_start_x,
66 const int block_start_y, const int block_width,
67 const int block_height, const int16_t alpha, const int16_t beta,
68 const int16_t gamma, const int16_t delta, void* dest,
69 ptrdiff_t dest_stride) {
70 assert(block_width >= 8 && block_height >= 8);
71 if (is_compound) {
72 assert(dest_stride == block_width);
73 }
74 constexpr int kRoundBitsHorizontal = (bitdepth == 12)
75 ? kInterRoundBitsHorizontal12bpp
76 : kInterRoundBitsHorizontal;
77 constexpr int kRoundBitsVertical =
78 is_compound ? kInterRoundBitsCompoundVertical
79 : (bitdepth == 12) ? kInterRoundBitsVertical12bpp
80 : kInterRoundBitsVertical;
81
82 // Only used for 8bpp. Allows for keeping the first pass intermediates within
83 // uint16_t. With 10/12bpp the intermediate value will always require int32_t.
84 constexpr int first_pass_offset = (bitdepth == 8) ? 1 << 14 : 0;
85 constexpr int offset_removal =
86 (first_pass_offset >> kRoundBitsHorizontal) * 128;
87
88 constexpr int kMaxPixel = (1 << bitdepth) - 1;
89 union {
90 // |intermediate_result| is the output of the horizontal filtering and
91 // rounding. The range is within int16_t.
92 int16_t intermediate_result[15][8]; // 15 rows, 8 columns.
93 // In the simple special cases where the samples in each row are all the
94 // same, store one sample per row in a column vector.
95 int16_t intermediate_result_column[15];
96 };
97 const auto* const src = static_cast<const Pixel*>(source);
98 source_stride /= sizeof(Pixel);
99 using DestType =
100 typename std::conditional<is_compound, uint16_t, Pixel>::type;
101 auto* dst = static_cast<DestType*>(dest);
102 if (!is_compound) dest_stride /= sizeof(dst[0]);
103
104 assert(block_width >= 8);
105 assert(block_height >= 8);
106
107 // Warp process applies for each 8x8 block (or smaller).
108 for (int start_y = block_start_y; start_y < block_start_y + block_height;
109 start_y += 8) {
110 for (int start_x = block_start_x; start_x < block_start_x + block_width;
111 start_x += 8) {
112 const int src_x = (start_x + 4) << subsampling_x;
113 const int src_y = (start_y + 4) << subsampling_y;
114 const int dst_x =
115 src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
116 const int dst_y =
117 src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
118 const int x4 = dst_x >> subsampling_x;
119 const int y4 = dst_y >> subsampling_y;
120 const int ix4 = x4 >> kWarpedModelPrecisionBits;
121 const int iy4 = y4 >> kWarpedModelPrecisionBits;
122
123 // A prediction block may fall outside the frame's boundaries. If a
124 // prediction block is calculated using only samples outside the frame's
125 // boundary, the filtering can be simplified. We can divide the plane
126 // into several regions and handle them differently.
127 //
128 // | |
129 // 1 | 3 | 1
130 // | |
131 // -------+-----------+-------
132 // |***********|
133 // 2 |*****4*****| 2
134 // |***********|
135 // -------+-----------+-------
136 // | |
137 // 1 | 3 | 1
138 // | |
139 //
140 // At the center, region 4 represents the frame and is the general case.
141 //
142 // In regions 1 and 2, the prediction block is outside the frame's
143 // boundary horizontally. Therefore the horizontal filtering can be
144 // simplified. Furthermore, in the region 1 (at the four corners), the
145 // prediction is outside the frame's boundary both horizontally and
146 // vertically, so we get a constant prediction block.
147 //
148 // In region 3, the prediction block is outside the frame's boundary
149 // vertically. Unfortunately because we apply the horizontal filters
150 // first, by the time we apply the vertical filters, they no longer see
151 // simple inputs. So the only simplification is that all the rows are
152 // the same, but we still need to apply all the horizontal and vertical
153 // filters.
154
155 // Check for two simple special cases, where the horizontal filter can
156 // be significantly simplified.
157 //
158 // In general, for each row, the horizontal filter is calculated as
159 // follows:
160 // for (int x = -4; x < 4; ++x) {
161 // const int offset = ...;
162 // int sum = first_pass_offset;
163 // for (int k = 0; k < 8; ++k) {
164 // const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
165 // sum += kWarpedFilters[offset][k] * src_row[column];
166 // }
167 // ...
168 // }
169 // The column index before clipping, ix4 + x + k - 3, varies in the range
170 // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
171 // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
172 // border index (source_width - 1 or 0, respectively). Then for each x,
173 // the inner for loop of the horizontal filter is reduced to multiplying
174 // the border pixel by the sum of the filter coefficients.
175 if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
176 // Regions 1 and 2.
177 // Points to the left or right border of the first row of |src|.
178 const Pixel* first_row_border =
179 (ix4 + 7 <= 0) ? src : src + source_width - 1;
180 // In general, for y in [-7, 8), the row number iy4 + y is clipped:
181 // const int row = Clip3(iy4 + y, 0, source_height - 1);
182 // In two special cases, iy4 + y is clipped to either 0 or
183 // source_height - 1 for all y. In the rest of the cases, iy4 + y is
184 // bounded and we can avoid clipping iy4 + y by relying on a reference
185 // frame's boundary extension on the top and bottom.
186 if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
187 // Region 1.
188 // Every sample used to calculate the prediction block has the same
189 // value. So the whole prediction block has the same value.
190 const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
191 const Pixel row_border_pixel = first_row_border[row * source_stride];
192 DestType* dst_row = dst + start_x - block_start_x;
193 if (is_compound) {
194 int sum = row_border_pixel
195 << ((14 - kRoundBitsHorizontal) - kRoundBitsVertical);
196 sum += (bitdepth == 8) ? 0 : kCompoundOffset;
197 Memset(dst_row, sum, 8);
198 } else {
199 Memset(dst_row, row_border_pixel, 8);
200 }
201 const DestType* const first_dst_row = dst_row;
202 dst_row += dest_stride;
203 for (int y = 1; y < 8; ++y) {
204 memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
205 dst_row += dest_stride;
206 }
207 // End of region 1. Continue the |start_x| for loop.
208 continue;
209 }
210
211 // Region 2.
212 // Horizontal filter.
213 // The input values in this region are generated by extending the border
214 // which makes them identical in the horizontal direction. This
215 // computation could be inlined in the vertical pass but most
216 // implementations will need a transpose of some sort.
217 // It is not necessary to use the offset values here because the
218 // horizontal pass is a simple shift and the vertical pass will always
219 // require using 32 bits.
220 for (int y = -7; y < 8; ++y) {
221 // We may over-read up to 13 pixels above the top source row, or up
222 // to 13 pixels below the bottom source row. This is proved below.
223 const int row = iy4 + y;
224 int sum = first_row_border[row * source_stride];
225 sum <<= kFilterBits - kRoundBitsHorizontal;
226 intermediate_result_column[y + 7] = sum;
227 }
228 // Vertical filter.
229 DestType* dst_row = dst + start_x - block_start_x;
230 int sy4 =
231 (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
232 for (int y = 0; y < 8; ++y) {
233 int sy = sy4 - MultiplyBy4(gamma);
234 for (int x = 0; x < 8; ++x) {
235 const int offset =
236 RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
237 kWarpedPixelPrecisionShifts;
238 assert(offset >= 0);
239 assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
240 int sum = 0;
241 for (int k = 0; k < 8; ++k) {
242 sum +=
243 kWarpedFilters[offset][k] * intermediate_result_column[y + k];
244 }
245 sum = RightShiftWithRounding(sum, kRoundBitsVertical);
246 if (is_compound) {
247 sum += (bitdepth == 8) ? 0 : kCompoundOffset;
248 dst_row[x] = static_cast<DestType>(sum);
249 } else {
250 dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
251 }
252 sy += gamma;
253 }
254 dst_row += dest_stride;
255 sy4 += delta;
256 }
257 // End of region 2. Continue the |start_x| for loop.
258 continue;
259 }
260
261 // Regions 3 and 4.
262 // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
263 // It follows that -6 <= ix4 <= source_width + 5. This inequality is
264 // used below.
265
266 // In general, for y in [-7, 8), the row number iy4 + y is clipped:
267 // const int row = Clip3(iy4 + y, 0, source_height - 1);
268 // In two special cases, iy4 + y is clipped to either 0 or
269 // source_height - 1 for all y. In the rest of the cases, iy4 + y is
270 // bounded and we can avoid clipping iy4 + y by relying on a reference
271 // frame's boundary extension on the top and bottom.
272 if (iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0) {
273 // Region 3.
274 // Horizontal filter.
275 const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
276 const Pixel* const src_row = src + row * source_stride;
277 int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
278 for (int y = -7; y < 8; ++y) {
279 int sx = sx4 - MultiplyBy4(alpha);
280 for (int x = -4; x < 4; ++x) {
281 const int offset =
282 RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
283 kWarpedPixelPrecisionShifts;
284 // Since alpha and beta have been validated by SetupShear(), one
285 // can prove that 0 <= offset <= 3 * 2^6.
286 assert(offset >= 0);
287 assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
288 // For SIMD optimization:
289 // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
290 // For 10/12 bit, the range of sum requires 32 bits.
291 int sum = first_pass_offset;
292 for (int k = 0; k < 8; ++k) {
293 // We assume the source frame has left and right borders of at
294 // least 13 pixels that extend the frame boundary pixels.
295 //
296 // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
297 // ix4 above, we have
298 // -13 <= ix4 + x + k - 3 <= source_width + 12,
299 // or
300 // -13 <= column <= (source_width - 1) + 13.
301 // Therefore we may over-read up to 13 pixels before the source
302 // row, or up to 13 pixels after the source row.
303 const int column = ix4 + x + k - 3;
304 sum += kWarpedFilters[offset][k] * src_row[column];
305 }
306 intermediate_result[y + 7][x + 4] =
307 RightShiftWithRounding(sum, kRoundBitsHorizontal);
308 sx += alpha;
309 }
310 sx4 += beta;
311 }
312 } else {
313 // Region 4.
314 // Horizontal filter.
315 // At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0.
316 // It follows that -6 <= iy4 <= source_height + 5. This inequality is
317 // used below.
318 int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
319 for (int y = -7; y < 8; ++y) {
320 // We assume the source frame has top and bottom borders of at least
321 // 13 pixels that extend the frame boundary pixels.
322 //
323 // Since -7 <= y <= 7, using the inequality on iy4 above, we have
324 // -13 <= iy4 + y <= source_height + 12,
325 // or
326 // -13 <= row <= (source_height - 1) + 13.
327 // Therefore we may over-read up to 13 pixels above the top source
328 // row, or up to 13 pixels below the bottom source row.
329 const int row = iy4 + y;
330 const Pixel* const src_row = src + row * source_stride;
331 int sx = sx4 - MultiplyBy4(alpha);
332 for (int x = -4; x < 4; ++x) {
333 const int offset =
334 RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
335 kWarpedPixelPrecisionShifts;
336 // Since alpha and beta have been validated by SetupShear(), one
337 // can prove that 0 <= offset <= 3 * 2^6.
338 assert(offset >= 0);
339 assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
340 // For SIMD optimization:
341 // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
342 // For 10/12 bit, the range of sum requires 32 bits.
343 int sum = first_pass_offset;
344 for (int k = 0; k < 8; ++k) {
345 // We assume the source frame has left and right borders of at
346 // least 13 pixels that extend the frame boundary pixels.
347 //
348 // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
349 // ix4 above, we have
350 // -13 <= ix4 + x + k - 3 <= source_width + 12,
351 // or
352 // -13 <= column <= (source_width - 1) + 13.
353 // Therefore we may over-read up to 13 pixels before the source
354 // row, or up to 13 pixels after the source row.
355 const int column = ix4 + x + k - 3;
356 sum += kWarpedFilters[offset][k] * src_row[column];
357 }
358 intermediate_result[y + 7][x + 4] =
359 RightShiftWithRounding(sum, kRoundBitsHorizontal) -
360 offset_removal;
361 sx += alpha;
362 }
363 sx4 += beta;
364 }
365 }
366
367 // Regions 3 and 4.
368 // Vertical filter.
369 DestType* dst_row = dst + start_x - block_start_x;
370 int sy4 =
371 (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
372 // The spec says we should use the following loop condition:
373 // y < std::min(4, block_start_y + block_height - start_y - 4);
374 // We can prove that block_start_y + block_height - start_y >= 8, which
375 // implies std::min(4, block_start_y + block_height - start_y - 4) = 4.
376 // So the loop condition is simply y < 4.
377 //
378 // Proof:
379 // start_y < block_start_y + block_height
380 // => block_start_y + block_height - start_y > 0
381 // => block_height - (start_y - block_start_y) > 0
382 //
383 // Since block_height >= 8 and is a power of 2, it follows that
384 // block_height is a multiple of 8. start_y - block_start_y is also a
385 // multiple of 8. Therefore their difference is a multiple of 8. Since
386 // their difference is > 0, their difference must be >= 8.
387 //
388 // We then add an offset of 4 to y so that the loop starts with y = 0
389 // and continues if y < 8.
390 for (int y = 0; y < 8; ++y) {
391 int sy = sy4 - MultiplyBy4(gamma);
392 // The spec says we should use the following loop condition:
393 // x < std::min(4, block_start_x + block_width - start_x - 4);
394 // Similar to the above, we can prove that the loop condition can be
395 // simplified to x < 4.
396 //
397 // We then add an offset of 4 to x so that the loop starts with x = 0
398 // and continues if x < 8.
399 for (int x = 0; x < 8; ++x) {
400 const int offset =
401 RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
402 kWarpedPixelPrecisionShifts;
403 // Since gamma and delta have been validated by SetupShear(), one can
404 // prove that 0 <= offset <= 3 * 2^6.
405 assert(offset >= 0);
406 assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
407 int sum = 0;
408 for (int k = 0; k < 8; ++k) {
409 sum += kWarpedFilters[offset][k] * intermediate_result[y + k][x];
410 }
411 sum -= offset_removal;
412 sum = RightShiftWithRounding(sum, kRoundBitsVertical);
413 if (is_compound) {
414 sum += (bitdepth == 8) ? 0 : kCompoundOffset;
415 dst_row[x] = static_cast<DestType>(sum);
416 } else {
417 dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
418 }
419 sy += gamma;
420 }
421 dst_row += dest_stride;
422 sy4 += delta;
423 }
424 }
425 dst += 8 * dest_stride;
426 }
427 }
428
Init8bpp()429 void Init8bpp() {
430 Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
431 assert(dsp != nullptr);
432 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
433 dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
434 dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
435 #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
436 static_cast<void>(dsp);
437 #ifndef LIBGAV1_Dsp8bpp_Warp
438 dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
439 #endif
440 #ifndef LIBGAV1_Dsp8bpp_WarpCompound
441 dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
442 #endif
443 #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
444 }
445
446 #if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp()447 void Init10bpp() {
448 Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
449 assert(dsp != nullptr);
450 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
451 dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
452 dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
453 #else // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
454 static_cast<void>(dsp);
455 #ifndef LIBGAV1_Dsp10bpp_Warp
456 dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
457 #endif
458 #ifndef LIBGAV1_Dsp10bpp_WarpCompound
459 dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
460 #endif
461 #endif // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
462 }
463 #endif
464
465 } // namespace
466
WarpInit_C()467 void WarpInit_C() {
468 Init8bpp();
469 #if LIBGAV1_MAX_BITDEPTH >= 10
470 Init10bpp();
471 #endif
472 }
473
474 } // namespace dsp
475 } // namespace libgav1
476