• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "src/dsp/warp.h"
16 
17 #include <algorithm>
18 #include <cassert>
19 #include <cstddef>
20 #include <cstdint>
21 #include <cstdlib>
22 #include <type_traits>
23 
24 #include "src/dsp/constants.h"
25 #include "src/dsp/dsp.h"
26 #include "src/utils/common.h"
27 #include "src/utils/constants.h"
28 #include "src/utils/memory.h"
29 
30 namespace libgav1 {
31 namespace dsp {
32 namespace {
33 
34 // Number of extra bits of precision in warped filtering.
35 constexpr int kWarpedDiffPrecisionBits = 10;
36 
37 // Warp prediction output ranges from WarpTest.ShowRange.
38 // Bitdepth:  8 Input range:            [       0,      255]
39 //   8bpp intermediate offset: 16384.
40 //   intermediate range:                [    4399,    61009]
41 //   first pass output range:           [     550,     7626]
42 //   8bpp intermediate offset removal: 262144.
43 //   intermediate range:                [ -620566,  1072406]
44 //   second pass output range:          [       0,      255]
45 //   compound second pass output range: [   -4848,     8378]
46 //
47 // Bitdepth: 10 Input range:            [       0,     1023]
48 //   intermediate range:                [  -48081,   179025]
49 //   first pass output range:           [   -6010,    22378]
50 //   intermediate range:                [-2103516,  4198620]
51 //   second pass output range:          [       0,     1023]
52 //   compound second pass output range: [    8142,    57378]
53 //
54 // Bitdepth: 12 Input range:            [       0,     4095]
55 //   intermediate range:                [ -192465,   716625]
56 //   first pass output range:           [   -6015,    22395]
57 //   intermediate range:                [-2105190,  4201830]
58 //   second pass output range:          [       0,     4095]
59 //   compound second pass output range: [    8129,    57403]
60 
61 template <bool is_compound, int bitdepth, typename Pixel>
Warp_C(const void * LIBGAV1_RESTRICT const source,ptrdiff_t source_stride,const int source_width,const int source_height,const int * LIBGAV1_RESTRICT const warp_params,const int subsampling_x,const int subsampling_y,const int block_start_x,const int block_start_y,const int block_width,const int block_height,const int16_t alpha,const int16_t beta,const int16_t gamma,const int16_t delta,void * LIBGAV1_RESTRICT dest,ptrdiff_t dest_stride)62 void Warp_C(const void* LIBGAV1_RESTRICT const source, ptrdiff_t source_stride,
63             const int source_width, const int source_height,
64             const int* LIBGAV1_RESTRICT const warp_params,
65             const int subsampling_x, const int subsampling_y,
66             const int block_start_x, const int block_start_y,
67             const int block_width, const int block_height, const int16_t alpha,
68             const int16_t beta, const int16_t gamma, const int16_t delta,
69             void* LIBGAV1_RESTRICT dest, ptrdiff_t dest_stride) {
70   assert(block_width >= 8 && block_height >= 8);
71   if (is_compound) {
72     assert(dest_stride == block_width);
73   }
74   constexpr int kRoundBitsHorizontal = (bitdepth == 12)
75                                            ? kInterRoundBitsHorizontal12bpp
76                                            : kInterRoundBitsHorizontal;
77   constexpr int kRoundBitsVertical =
78       is_compound        ? kInterRoundBitsCompoundVertical
79       : (bitdepth == 12) ? kInterRoundBitsVertical12bpp
80                          : kInterRoundBitsVertical;
81 
82   // Only used for 8bpp. Allows for keeping the first pass intermediates within
83   // uint16_t. With 10/12bpp the intermediate value will always require int32_t.
84   constexpr int first_pass_offset = (bitdepth == 8) ? 1 << 14 : 0;
85   constexpr int offset_removal =
86       (first_pass_offset >> kRoundBitsHorizontal) * 128;
87 
88   constexpr int kMaxPixel = (1 << bitdepth) - 1;
89   union {
90     // |intermediate_result| is the output of the horizontal filtering and
91     // rounding. The range is within int16_t.
92     int16_t intermediate_result[15][8];  // 15 rows, 8 columns.
93     // In the simple special cases where the samples in each row are all the
94     // same, store one sample per row in a column vector.
95     int16_t intermediate_result_column[15];
96   };
97   const auto* const src = static_cast<const Pixel*>(source);
98   source_stride /= sizeof(Pixel);
99   using DestType =
100       typename std::conditional<is_compound, uint16_t, Pixel>::type;
101   auto* dst = static_cast<DestType*>(dest);
102   if (!is_compound) dest_stride /= sizeof(dst[0]);
103 
104   assert(block_width >= 8);
105   assert(block_height >= 8);
106 
107   // Warp process applies for each 8x8 block (or smaller).
108   for (int start_y = block_start_y; start_y < block_start_y + block_height;
109        start_y += 8) {
110     for (int start_x = block_start_x; start_x < block_start_x + block_width;
111          start_x += 8) {
112       const int src_x = (start_x + 4) << subsampling_x;
113       const int src_y = (start_y + 4) << subsampling_y;
114       const WarpFilterParams filter_params = GetWarpFilterParams(
115           src_x, src_y, subsampling_x, subsampling_y, warp_params);
116 
117       // A prediction block may fall outside the frame's boundaries. If a
118       // prediction block is calculated using only samples outside the frame's
119       // boundary, the filtering can be simplified. We can divide the plane
120       // into several regions and handle them differently.
121       //
122       //                |           |
123       //            1   |     3     |   1
124       //                |           |
125       //         -------+-----------+-------
126       //                |***********|
127       //            2   |*****4*****|   2
128       //                |***********|
129       //         -------+-----------+-------
130       //                |           |
131       //            1   |     3     |   1
132       //                |           |
133       //
134       // At the center, region 4 represents the frame and is the general case.
135       //
136       // In regions 1 and 2, the prediction block is outside the frame's
137       // boundary horizontally. Therefore the horizontal filtering can be
138       // simplified. Furthermore, in the region 1 (at the four corners), the
139       // prediction is outside the frame's boundary both horizontally and
140       // vertically, so we get a constant prediction block.
141       //
142       // In region 3, the prediction block is outside the frame's boundary
143       // vertically. Unfortunately because we apply the horizontal filters
144       // first, by the time we apply the vertical filters, they no longer see
145       // simple inputs. So the only simplification is that all the rows are
146       // the same, but we still need to apply all the horizontal and vertical
147       // filters.
148 
149       // Check for two simple special cases, where the horizontal filter can
150       // be significantly simplified.
151       //
152       // In general, for each row, the horizontal filter is calculated as
153       // follows:
154       //   for (int x = -4; x < 4; ++x) {
155       //     const int offset = ...;
156       //     int sum = first_pass_offset;
157       //     for (int k = 0; k < 8; ++k) {
158       //       const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
159       //       sum += kWarpedFilters[offset][k] * src_row[column];
160       //     }
161       //     ...
162       //   }
163       // The column index before clipping, ix4 + x + k - 3, varies in the range
164       // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
165       // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
166       // border index (source_width - 1 or 0, respectively). Then for each x,
167       // the inner for loop of the horizontal filter is reduced to multiplying
168       // the border pixel by the sum of the filter coefficients.
169       if (filter_params.ix4 - 7 >= source_width - 1 ||
170           filter_params.ix4 + 7 <= 0) {
171         // Regions 1 and 2.
172         // Points to the left or right border of the first row of |src|.
173         const Pixel* first_row_border =
174             (filter_params.ix4 + 7 <= 0) ? src : src + source_width - 1;
175         // In general, for y in [-7, 8), the row number iy4 + y is clipped:
176         //   const int row = Clip3(iy4 + y, 0, source_height - 1);
177         // In two special cases, iy4 + y is clipped to either 0 or
178         // source_height - 1 for all y. In the rest of the cases, iy4 + y is
179         // bounded and we can avoid clipping iy4 + y by relying on a reference
180         // frame's boundary extension on the top and bottom.
181         if (filter_params.iy4 - 7 >= source_height - 1 ||
182             filter_params.iy4 + 7 <= 0) {
183           // Region 1.
184           // Every sample used to calculate the prediction block has the same
185           // value. So the whole prediction block has the same value.
186           const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
187           const Pixel row_border_pixel = first_row_border[row * source_stride];
188           DestType* dst_row = dst + start_x - block_start_x;
189           if (is_compound) {
190             int sum = row_border_pixel
191                       << ((14 - kRoundBitsHorizontal) - kRoundBitsVertical);
192             sum += (bitdepth == 8) ? 0 : kCompoundOffset;
193             Memset(dst_row, sum, 8);
194           } else {
195             Memset(dst_row, row_border_pixel, 8);
196           }
197           const DestType* const first_dst_row = dst_row;
198           dst_row += dest_stride;
199           for (int y = 1; y < 8; ++y) {
200             memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
201             dst_row += dest_stride;
202           }
203           // End of region 1. Continue the |start_x| for loop.
204           continue;
205         }
206 
207         // Region 2.
208         // Horizontal filter.
209         // The input values in this region are generated by extending the border
210         // which makes them identical in the horizontal direction. This
211         // computation could be inlined in the vertical pass but most
212         // implementations will need a transpose of some sort.
213         // It is not necessary to use the offset values here because the
214         // horizontal pass is a simple shift and the vertical pass will always
215         // require using 32 bits.
216         for (int y = -7; y < 8; ++y) {
217           // We may over-read up to 13 pixels above the top source row, or up
218           // to 13 pixels below the bottom source row. This is proved below.
219           const int row = filter_params.iy4 + y;
220           int sum = first_row_border[row * source_stride];
221           sum <<= kFilterBits - kRoundBitsHorizontal;
222           intermediate_result_column[y + 7] = sum;
223         }
224         // Vertical filter.
225         DestType* dst_row = dst + start_x - block_start_x;
226         int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
227                   MultiplyBy4(delta);
228         for (int y = 0; y < 8; ++y) {
229           int sy = sy4 - MultiplyBy4(gamma);
230           for (int x = 0; x < 8; ++x) {
231             const int offset =
232                 RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
233                 kWarpedPixelPrecisionShifts;
234             assert(offset >= 0);
235             assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
236             int sum = 0;
237             for (int k = 0; k < 8; ++k) {
238               sum +=
239                   kWarpedFilters[offset][k] * intermediate_result_column[y + k];
240             }
241             sum = RightShiftWithRounding(sum, kRoundBitsVertical);
242             if (is_compound) {
243               sum += (bitdepth == 8) ? 0 : kCompoundOffset;
244               dst_row[x] = static_cast<DestType>(sum);
245             } else {
246               dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
247             }
248             sy += gamma;
249           }
250           dst_row += dest_stride;
251           sy4 += delta;
252         }
253         // End of region 2. Continue the |start_x| for loop.
254         continue;
255       }
256 
257       // Regions 3 and 4.
258       // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.
259       // It follows that -6 <= ix4 <= source_width + 5. This inequality is
260       // used below.
261 
262       // In general, for y in [-7, 8), the row number iy4 + y is clipped:
263       //   const int row = Clip3(iy4 + y, 0, source_height - 1);
264       // In two special cases, iy4 + y is clipped to either 0 or
265       // source_height - 1 for all y. In the rest of the cases, iy4 + y is
266       // bounded and we can avoid clipping iy4 + y by relying on a reference
267       // frame's boundary extension on the top and bottom.
268       if (filter_params.iy4 - 7 >= source_height - 1 ||
269           filter_params.iy4 + 7 <= 0) {
270         // Region 3.
271         // Horizontal filter.
272         const int row = (filter_params.iy4 + 7 <= 0) ? 0 : source_height - 1;
273         const Pixel* const src_row = src + row * source_stride;
274         int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
275                   beta * 7;
276         for (int y = -7; y < 8; ++y) {
277           int sx = sx4 - MultiplyBy4(alpha);
278           for (int x = -4; x < 4; ++x) {
279             const int offset =
280                 RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
281                 kWarpedPixelPrecisionShifts;
282             // Since alpha and beta have been validated by SetupShear(), one
283             // can prove that 0 <= offset <= 3 * 2^6.
284             assert(offset >= 0);
285             assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
286             // For SIMD optimization:
287             // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
288             // For 10/12 bit, the range of sum requires 32 bits.
289             int sum = first_pass_offset;
290             for (int k = 0; k < 8; ++k) {
291               // We assume the source frame has left and right borders of at
292               // least 13 pixels that extend the frame boundary pixels.
293               //
294               // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
295               // ix4 above, we have
296               //   -13 <= ix4 + x + k - 3 <= source_width + 12,
297               // or
298               //   -13 <= column <= (source_width - 1) + 13.
299               // Therefore we may over-read up to 13 pixels before the source
300               // row, or up to 13 pixels after the source row.
301               const int column = filter_params.ix4 + x + k - 3;
302               sum += kWarpedFilters[offset][k] * src_row[column];
303             }
304             intermediate_result[y + 7][x + 4] =
305                 RightShiftWithRounding(sum, kRoundBitsHorizontal);
306             sx += alpha;
307           }
308           sx4 += beta;
309         }
310       } else {
311         // Region 4.
312         // Horizontal filter.
313         // At this point, we know iy4 - 7 < source_height - 1 and iy4 + 7 > 0.
314         // It follows that -6 <= iy4 <= source_height + 5. This inequality is
315         // used below.
316         int sx4 = (filter_params.x4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
317                   beta * 7;
318         for (int y = -7; y < 8; ++y) {
319           // We assume the source frame has top and bottom borders of at least
320           // 13 pixels that extend the frame boundary pixels.
321           //
322           // Since -7 <= y <= 7, using the inequality on iy4 above, we have
323           //   -13 <= iy4 + y <= source_height + 12,
324           // or
325           //   -13 <= row <= (source_height - 1) + 13.
326           // Therefore we may over-read up to 13 pixels above the top source
327           // row, or up to 13 pixels below the bottom source row.
328           const int row = filter_params.iy4 + y;
329           const Pixel* const src_row = src + row * source_stride;
330           int sx = sx4 - MultiplyBy4(alpha);
331           for (int x = -4; x < 4; ++x) {
332             const int offset =
333                 RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
334                 kWarpedPixelPrecisionShifts;
335             // Since alpha and beta have been validated by SetupShear(), one
336             // can prove that 0 <= offset <= 3 * 2^6.
337             assert(offset >= 0);
338             assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
339             // For SIMD optimization:
340             // |first_pass_offset| guarantees the sum fits in uint16_t for 8bpp.
341             // For 10/12 bit, the range of sum requires 32 bits.
342             int sum = first_pass_offset;
343             for (int k = 0; k < 8; ++k) {
344               // We assume the source frame has left and right borders of at
345               // least 13 pixels that extend the frame boundary pixels.
346               //
347               // Since -4 <= x <= 3 and 0 <= k <= 7, using the inequality on
348               // ix4 above, we have
349               //   -13 <= ix4 + x + k - 3 <= source_width + 12,
350               // or
351               //   -13 <= column <= (source_width - 1) + 13.
352               // Therefore we may over-read up to 13 pixels before the source
353               // row, or up to 13 pixels after the source row.
354               const int column = filter_params.ix4 + x + k - 3;
355               sum += kWarpedFilters[offset][k] * src_row[column];
356             }
357             intermediate_result[y + 7][x + 4] =
358                 RightShiftWithRounding(sum, kRoundBitsHorizontal) -
359                 offset_removal;
360             sx += alpha;
361           }
362           sx4 += beta;
363         }
364       }
365 
366       // Regions 3 and 4.
367       // Vertical filter.
368       DestType* dst_row = dst + start_x - block_start_x;
369       int sy4 = (filter_params.y4 & ((1 << kWarpedModelPrecisionBits) - 1)) -
370                 MultiplyBy4(delta);
371       // The spec says we should use the following loop condition:
372       //   y < std::min(4, block_start_y + block_height - start_y - 4);
373       // We can prove that block_start_y + block_height - start_y >= 8, which
374       // implies std::min(4, block_start_y + block_height - start_y - 4) = 4.
375       // So the loop condition is simply y < 4.
376       //
377       //   Proof:
378       //      start_y < block_start_y + block_height
379       //   => block_start_y + block_height - start_y > 0
380       //   => block_height - (start_y - block_start_y) > 0
381       //
382       //   Since block_height >= 8 and is a power of 2, it follows that
383       //   block_height is a multiple of 8. start_y - block_start_y is also a
384       //   multiple of 8. Therefore their difference is a multiple of 8. Since
385       //   their difference is > 0, their difference must be >= 8.
386       //
387       // We then add an offset of 4 to y so that the loop starts with y = 0
388       // and continues if y < 8.
389       for (int y = 0; y < 8; ++y) {
390         int sy = sy4 - MultiplyBy4(gamma);
391         // The spec says we should use the following loop condition:
392         //   x < std::min(4, block_start_x + block_width - start_x - 4);
393         // Similar to the above, we can prove that the loop condition can be
394         // simplified to x < 4.
395         //
396         // We then add an offset of 4 to x so that the loop starts with x = 0
397         // and continues if x < 8.
398         for (int x = 0; x < 8; ++x) {
399           const int offset =
400               RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
401               kWarpedPixelPrecisionShifts;
402           // Since gamma and delta have been validated by SetupShear(), one can
403           // prove that 0 <= offset <= 3 * 2^6.
404           assert(offset >= 0);
405           assert(offset < 3 * kWarpedPixelPrecisionShifts + 1);
406           int sum = 0;
407           for (int k = 0; k < 8; ++k) {
408             sum += kWarpedFilters[offset][k] * intermediate_result[y + k][x];
409           }
410           sum -= offset_removal;
411           sum = RightShiftWithRounding(sum, kRoundBitsVertical);
412           if (is_compound) {
413             sum += (bitdepth == 8) ? 0 : kCompoundOffset;
414             dst_row[x] = static_cast<DestType>(sum);
415           } else {
416             dst_row[x] = static_cast<DestType>(Clip3(sum, 0, kMaxPixel));
417           }
418           sy += gamma;
419         }
420         dst_row += dest_stride;
421         sy4 += delta;
422       }
423     }
424     dst += 8 * dest_stride;
425   }
426 }
427 
Init8bpp()428 void Init8bpp() {
429   Dsp* const dsp = dsp_internal::GetWritableDspTable(8);
430   assert(dsp != nullptr);
431 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
432   dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
433   dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
434 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
435   static_cast<void>(dsp);
436 #ifndef LIBGAV1_Dsp8bpp_Warp
437   dsp->warp = Warp_C</*is_compound=*/false, 8, uint8_t>;
438 #endif
439 #ifndef LIBGAV1_Dsp8bpp_WarpCompound
440   dsp->warp_compound = Warp_C</*is_compound=*/true, 8, uint8_t>;
441 #endif
442 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
443 }
444 
445 #if LIBGAV1_MAX_BITDEPTH >= 10
Init10bpp()446 void Init10bpp() {
447   Dsp* const dsp = dsp_internal::GetWritableDspTable(10);
448   assert(dsp != nullptr);
449 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
450   dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
451   dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
452 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
453   static_cast<void>(dsp);
454 #ifndef LIBGAV1_Dsp10bpp_Warp
455   dsp->warp = Warp_C</*is_compound=*/false, 10, uint16_t>;
456 #endif
457 #ifndef LIBGAV1_Dsp10bpp_WarpCompound
458   dsp->warp_compound = Warp_C</*is_compound=*/true, 10, uint16_t>;
459 #endif
460 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
461 }
462 #endif  // LIBGAV1_MAX_BITDEPTH >= 10
463 
464 #if LIBGAV1_MAX_BITDEPTH == 12
Init12bpp()465 void Init12bpp() {
466   Dsp* const dsp = dsp_internal::GetWritableDspTable(12);
467   assert(dsp != nullptr);
468 #if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
469   dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>;
470   dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>;
471 #else  // !LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
472   static_cast<void>(dsp);
473 #ifndef LIBGAV1_Dsp12bpp_Warp
474   dsp->warp = Warp_C</*is_compound=*/false, 12, uint16_t>;
475 #endif
476 #ifndef LIBGAV1_Dsp12bpp_WarpCompound
477   dsp->warp_compound = Warp_C</*is_compound=*/true, 12, uint16_t>;
478 #endif
479 #endif  // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
480 }
481 #endif  // LIBGAV1_MAX_BITDEPTH == 12
482 
483 }  // namespace
484 
WarpInit_C()485 void WarpInit_C() {
486   Init8bpp();
487 #if LIBGAV1_MAX_BITDEPTH >= 10
488   Init10bpp();
489 #endif
490 #if LIBGAV1_MAX_BITDEPTH == 12
491   Init12bpp();
492 #endif
493 }
494 
495 }  // namespace dsp
496 }  // namespace libgav1
497