/*
 * Copyright 2019 The libgav1 Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #ifndef LIBGAV1_SRC_DSP_DSP_H_
18 #define LIBGAV1_SRC_DSP_DSP_H_
19 
20 #include <cstddef>
21 #include <cstdint>
22 #include <cstdlib>
23 
24 #include "src/dsp/common.h"
25 #include "src/dsp/constants.h"
26 #include "src/dsp/film_grain_common.h"
27 #include "src/utils/cpu.h"
28 #include "src/utils/reference_info.h"
29 #include "src/utils/types.h"
30 
31 namespace libgav1 {
32 namespace dsp {
33 
// Default LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS to 0 unless it has already been
// defined (e.g. by the build configuration).
#if !defined(LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS)
#define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 0
#endif
37 
// List of intra predictors. The enumerators are consecutive starting at 0 so
// they can be used as array indices (see IntraPredictorFuncs).
enum IntraPredictor : uint8_t {
  kIntraPredictorDcFill,
  kIntraPredictorDcTop,
  kIntraPredictorDcLeft,
  kIntraPredictorDc,
  kIntraPredictorVertical,
  kIntraPredictorHorizontal,
  kIntraPredictorPaeth,
  kIntraPredictorSmooth,
  kIntraPredictorSmoothVertical,
  kIntraPredictorSmoothHorizontal,
  kNumIntraPredictors  // Number of predictors; must remain last.
};
51 
// List of valid 1D transforms. The enumerators are consecutive starting at 0
// so they can be used as array indices (see InverseTransformAddFuncs).
enum Transform1D : uint8_t {
  k1DTransformDct,   // Discrete Cosine Transform.
  k1DTransformAdst,  // Asymmetric Discrete Sine Transform.
  k1DTransformIdentity,
  k1DTransformWht,  // Walsh Hadamard Transform.
  kNum1DTransforms  // Number of 1D transforms; must remain last.
};
60 
// List of valid 1D transform sizes. Not all transforms may be available for all
// the sizes. The enumerators are consecutive starting at 0 so they can be used
// as array indices (see InverseTransformAddFuncs).
enum TransformSize1D : uint8_t {
  k1DTransformSize4,
  k1DTransformSize8,
  k1DTransformSize16,
  k1DTransformSize32,
  k1DTransformSize64,
  kNum1DTransformSizes  // Number of 1D transform sizes; must remain last.
};
71 
// The maximum width of the loop filter, fewer pixels may be filtered depending
// on strength thresholds. The enumerators are consecutive starting at 0 so
// they can be used as array indices (see LoopFilterFuncs).
enum LoopFilterSize : uint8_t {
  kLoopFilterSize4,
  kLoopFilterSize6,
  kLoopFilterSize8,
  kLoopFilterSize14,
  kNumLoopFilterSizes  // Number of loop filter sizes; must remain last.
};
81 
// Indices for the final dimension of InverseTransformAddFuncs, selecting the
// row or the column transform.
enum : uint8_t {
  kRow = 0,
  kColumn = 1,
};
86 
87 //------------------------------------------------------------------------------
88 // ToString()
89 //
90 // These functions are meant to be used only in debug logging and within tests.
91 // They are defined inline to avoid including the strings in the release
92 // library when logging is disabled; unreferenced functions will not be added to
93 // any object file in that case.
94 
ToString(const IntraPredictor predictor)95 inline const char* ToString(const IntraPredictor predictor) {
96   switch (predictor) {
97     case kIntraPredictorDcFill:
98       return "kIntraPredictorDcFill";
99     case kIntraPredictorDcTop:
100       return "kIntraPredictorDcTop";
101     case kIntraPredictorDcLeft:
102       return "kIntraPredictorDcLeft";
103     case kIntraPredictorDc:
104       return "kIntraPredictorDc";
105     case kIntraPredictorVertical:
106       return "kIntraPredictorVertical";
107     case kIntraPredictorHorizontal:
108       return "kIntraPredictorHorizontal";
109     case kIntraPredictorPaeth:
110       return "kIntraPredictorPaeth";
111     case kIntraPredictorSmooth:
112       return "kIntraPredictorSmooth";
113     case kIntraPredictorSmoothVertical:
114       return "kIntraPredictorSmoothVertical";
115     case kIntraPredictorSmoothHorizontal:
116       return "kIntraPredictorSmoothHorizontal";
117     case kNumIntraPredictors:
118       return "kNumIntraPredictors";
119   }
120   abort();
121 }
122 
ToString(const Transform1D transform)123 inline const char* ToString(const Transform1D transform) {
124   switch (transform) {
125     case k1DTransformDct:
126       return "k1DTransformDct";
127     case k1DTransformAdst:
128       return "k1DTransformAdst";
129     case k1DTransformIdentity:
130       return "k1DTransformIdentity";
131     case k1DTransformWht:
132       return "k1DTransformWht";
133     case kNum1DTransforms:
134       return "kNum1DTransforms";
135   }
136   abort();
137 }
138 
ToString(const TransformSize1D transform_size)139 inline const char* ToString(const TransformSize1D transform_size) {
140   switch (transform_size) {
141     case k1DTransformSize4:
142       return "k1DTransformSize4";
143     case k1DTransformSize8:
144       return "k1DTransformSize8";
145     case k1DTransformSize16:
146       return "k1DTransformSize16";
147     case k1DTransformSize32:
148       return "k1DTransformSize32";
149     case k1DTransformSize64:
150       return "k1DTransformSize64";
151     case kNum1DTransformSizes:
152       return "kNum1DTransformSizes";
153   }
154   abort();
155 }
156 
ToString(const LoopFilterSize filter_size)157 inline const char* ToString(const LoopFilterSize filter_size) {
158   switch (filter_size) {
159     case kLoopFilterSize4:
160       return "kLoopFilterSize4";
161     case kLoopFilterSize6:
162       return "kLoopFilterSize6";
163     case kLoopFilterSize8:
164       return "kLoopFilterSize8";
165     case kLoopFilterSize14:
166       return "kLoopFilterSize14";
167     case kNumLoopFilterSizes:
168       return "kNumLoopFilterSizes";
169   }
170   abort();
171 }
172 
ToString(const LoopFilterType filter_type)173 inline const char* ToString(const LoopFilterType filter_type) {
174   switch (filter_type) {
175     case kLoopFilterTypeVertical:
176       return "kLoopFilterTypeVertical";
177     case kLoopFilterTypeHorizontal:
178       return "kLoopFilterTypeHorizontal";
179     case kNumLoopFilterTypes:
180       return "kNumLoopFilterTypes";
181   }
182   abort();
183 }
184 
185 //------------------------------------------------------------------------------
186 // Intra predictors. Section 7.11.2.
187 // These require access to one or both of the top row and left column. Some may
188 // access the top-left (top[-1]), top-right (top[width+N]), bottom-left
189 // (left[height+N]) or upper-left (left[-1]).
190 
191 // Intra predictor function signature. Sections 7.11.2.2, 7.11.2.4 (#10,#11),
192 // 7.11.2.5, 7.11.2.6.
193 // |dst| is an unaligned pointer to the output block. Pixel size is determined
194 // by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
195 // the row above |dst|. |left| is an aligned vector of the column to the left
196 // of |dst|. top-left and bottom-left may be accessed.
197 using IntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
198                                     const void* top, const void* left);
199 using IntraPredictorFuncs =
200     IntraPredictorFunc[kNumTransformSizes][kNumIntraPredictors];
201 
// Directional intra predictor function signature, zone 1 (0 < angle < 90).
// Section 7.11.2.4 (#7).
// |dst| is an unaligned pointer to the output block. Pixel size is determined
// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
// the row above |dst|. |width| and |height| give the dimensions of the block.
// |xstep| is the scaled starting index to |top| from
// kDirectionalIntraPredictorDerivative. |upsampled_top| indicates whether
// |top| has been upsampled as described in '7.11.2.11. Intra edge upsample
// process'. This can occur in cases with |width| + |height| <= 16. top-right
// is accessed.
using DirectionalIntraPredictorZone1Func = void (*)(void* dst, ptrdiff_t stride,
                                                    const void* top, int width,
                                                    int height, int xstep,
                                                    bool upsampled_top);
216 
// Directional intra predictor function signature, zone 2 (90 < angle < 180).
// Section 7.11.2.4 (#8).
// |dst| is an unaligned pointer to the output block. Pixel size is determined
// by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
// the row above |dst|. |left| is an aligned vector of the column to the left of
// |dst|. |width| and |height| give the dimensions of the block. |xstep| and
// |ystep| are the scaled starting index to |top| and |left|, respectively,
// from kDirectionalIntraPredictorDerivative. |upsampled_top| and
// |upsampled_left| indicate whether |top| and |left| have been upsampled as
// described in '7.11.2.11. Intra edge upsample process'. This can occur in
// cases with |width| + |height| <= 16. top-left and upper-left are accessed,
// up to [-2] in each if |upsampled_top/left| are set.
using DirectionalIntraPredictorZone2Func = void (*)(
    void* dst, ptrdiff_t stride, const void* top, const void* left, int width,
    int height, int xstep, int ystep, bool upsampled_top, bool upsampled_left);
232 
// Directional intra predictor function signature, zone 3 (180 < angle < 270).
// Section 7.11.2.4 (#9).
// |dst| is an unaligned pointer to the output block. Pixel size is determined
// by bitdepth with |stride| given in bytes. |left| is an aligned vector of the
// column to the left of |dst|. |width| and |height| give the dimensions of the
// block. |ystep| is the scaled starting index to |left| from
// kDirectionalIntraPredictorDerivative. |upsampled_left| indicates whether
// |left| has been upsampled as described in '7.11.2.11. Intra edge upsample
// process'. This can occur in cases with |width| + |height| <= 16. bottom-left
// is accessed.
using DirectionalIntraPredictorZone3Func = void (*)(void* dst, ptrdiff_t stride,
                                                    const void* left, int width,
                                                    int height, int ystep,
                                                    bool upsampled_left);
247 
248 // Filter intra predictor function signature. Section 7.11.2.3.
249 // |dst| is an unaligned pointer to the output block. Pixel size is determined
250 // by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
251 // the row above |dst|. |left| is an aligned vector of the column to the left
252 // of |dst|. |width| and |height| are the size of the block in pixels.
253 using FilterIntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
254                                           const void* top, const void* left,
255                                           FilterIntraPredictor pred, int width,
256                                           int height);
257 
258 //------------------------------------------------------------------------------
259 // Chroma from Luma (Cfl) prediction. Section 7.11.5.
260 
261 // Chroma from Luma (Cfl) intra prediction function signature. |dst| is an
262 // unaligned pointer to the output block. Pixel size is determined by bitdepth
263 // with |stride| given in bytes. |luma| contains subsampled luma pixels with 3
264 // fractional bits of precision. |alpha| is the signed Cfl alpha value for the
265 // appropriate plane.
266 using CflIntraPredictorFunc = void (*)(
267     void* dst, ptrdiff_t stride,
268     const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], int alpha);
269 using CflIntraPredictorFuncs = CflIntraPredictorFunc[kNumTransformSizes];
270 
271 // Chroma from Luma (Cfl) subsampler function signature. |luma| is an unaligned
272 // pointer to the output block. |src| is an unaligned pointer to the input
273 // block. Pixel size is determined by bitdepth with |stride| given in bytes.
274 using CflSubsamplerFunc =
275     void (*)(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
276              int max_luma_width, int max_luma_height, const void* source,
277              ptrdiff_t stride);
278 using CflSubsamplerFuncs =
279     CflSubsamplerFunc[kNumTransformSizes][kNumSubsamplingTypes];
280 
//------------------------------------------------------------------------------
// Intra Edge Filtering and Upsampling. Step 4 in section 7.11.2.4.

// Intra edge filter function signature. |buffer| is a pointer to the top_row or
// left_column that needs to be filtered. Typically the -1'th index of |top_row|
// and |left_column| need to be filtered as well, so the caller can merely pass
// the |buffer| as top_row[-1] or left_column[-1]. Pixel size is determined by
// bitdepth. |size| is the number of pixels to be filtered. |strength| is the
// filter strength. Section 7.11.2.12 in the spec.
using IntraEdgeFilterFunc = void (*)(void* buffer, int size, int strength);
291 
// Intra edge upsampler function signature. |buffer| is a pointer to the top_row
// or left_column that needs to be upsampled. Pixel size is determined by
// bitdepth. |size| is the number of pixels to be upsampled; valid values are:
// 4, 8, 12, 16. This function needs access to negative indices -1 and -2 of
// the |buffer|. Section 7.11.2.11 in the spec.
using IntraEdgeUpsamplerFunc = void (*)(void* buffer, int size);
298 
299 //------------------------------------------------------------------------------
300 // Inverse transform add function signature.
301 //
302 // Steps 2 and 3 of section 7.12.3 (contains the implementation of section
303 // 7.13.3).
304 // Apply the inverse transforms and add the residual to the destination frame
305 // for the transform type and block size |tx_size| starting at position
306 // |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D.
307 // |adjusted_tx_height| is the number of rows to process based on the non-zero
308 // coefficient count in the block. It will be 1 (non-zero coefficient count ==
309 // 1), 4 or a multiple of 8 up to 32 or the original transform height,
310 // whichever is less.
311 using InverseTransformAddFunc = void (*)(TransformType tx_type,
312                                          TransformSize tx_size,
313                                          int adjusted_tx_height,
314                                          void* src_buffer, int start_x,
315                                          int start_y, void* dst_frame);
316 // The final dimension holds row and column transforms indexed with kRow and
317 // kColumn.
318 using InverseTransformAddFuncs =
319     InverseTransformAddFunc[kNum1DTransforms][kNum1DTransformSizes][2];
320 
321 //------------------------------------------------------------------------------
322 // Post processing.
323 
324 // Loop filter function signature. Section 7.14.
325 // |dst| is an unaligned pointer to the output block. Pixel size is determined
326 // by bitdepth with |stride| given in bytes.
327 using LoopFilterFunc = void (*)(void* dst, ptrdiff_t stride, int outer_thresh,
328                                 int inner_thresh, int hev_thresh);
329 using LoopFilterFuncs =
330     LoopFilterFunc[kNumLoopFilterSizes][kNumLoopFilterTypes];
331 
// Cdef direction function signature. Section 7.15.2.
// |src| is a pointer to the source block. Pixel size is determined by bitdepth
// with |stride| given in bytes. |direction| and |variance| are output
// parameters and must not be nullptr.
using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride,
                                   uint8_t* direction, int* variance);
338 
// Cdef filtering function signature. Section 7.15.3.
// |source| is a pointer to the input block padded with kCdefLargeValue if at a
// frame border. |source_stride| is given in units of uint16_t.
// |block_height| is the height of the input block. The block width is not a
// parameter of this signature; it selects the function through the first
// index of CdefFilteringFuncs.
// |primary_strength|, |secondary_strength|, and |damping| are Cdef filtering
// parameters.
// |direction| is the filtering direction.
// |dest| is the output buffer. |dest_stride| is given in bytes.
using CdefFilteringFunc = void (*)(const uint16_t* source,
                                   ptrdiff_t source_stride, int block_height,
                                   int primary_strength, int secondary_strength,
                                   int damping, int direction, void* dest,
                                   ptrdiff_t dest_stride);

// The first index is block width: [0]: 4, [1]: 8. The second is based on
// non-zero strengths: [0]: |primary_strength| and |secondary_strength|, [1]:
// |primary_strength| only, [2]: |secondary_strength| only.
using CdefFilteringFuncs = CdefFilteringFunc[2][3];
357 
// Upscaling coefficients function signature. Section 7.16.
// This is an auxiliary function for SIMD optimizations and has no corresponding
// C function. Different SIMD versions may have different outputs. So it must
// pair with the corresponding version of SuperResFunc.
// |upscaled_width| is the width of the output frame.
// |initial_subpixel_x| is a base offset from which |step| increments.
// |step| is the number of subpixels to move the kernel for the next destination
// pixel.
// |coefficients| is the upscale filter used by each pixel in a row.
using SuperResCoefficientsFunc = void (*)(int upscaled_width,
                                          int initial_subpixel_x, int step,
                                          void* coefficients);
370 
// Upscaling process function signature. Section 7.16.
// |coefficients| is the upscale filter used by each pixel in a row. It is not
// used by the C function.
// |source| is the input frame buffer. It will be line extended.
// |source_stride| is given in pixels.
// |height| is the height of the block to be processed.
// |downscaled_width| is the width of the input frame.
// |upscaled_width| is the width of the output frame.
// |initial_subpixel_x| is a base offset from which |step| increments.
// |step| is the number of subpixels to move the kernel for the next destination
// pixel.
// |dest| is the output buffer.
// |dest_stride| is given in pixels.
using SuperResFunc = void (*)(const void* coefficients, void* source,
                              ptrdiff_t source_stride, int height,
                              int downscaled_width, int upscaled_width,
                              int initial_subpixel_x, int step, void* dest,
                              ptrdiff_t dest_stride);
389 
390 // Loop restoration function signature. Sections 7.16, 7.17.
391 // |restoration_info| contains loop restoration information, such as filter
392 // type, strength.
393 // |source| is the input frame buffer, which is deblocked and cdef filtered.
394 // |top_border| and |bottom_border| are the top and bottom borders.
395 // |dest| is the output.
396 // |stride| is given in pixels, and shared by |source| and |dest|.
397 // |top_border_stride| and |bottom_border_stride| are given in pixels.
398 // |restoration_buffer| contains buffers required for self guided filter and
399 // wiener filter. They must be initialized before calling.
400 using LoopRestorationFunc = void (*)(
401     const RestorationUnitInfo& restoration_info, const void* source,
402     ptrdiff_t stride, const void* top_border, ptrdiff_t top_border_stride,
403     const void* bottom_border, ptrdiff_t bottom_border_stride, int width,
404     int height, RestorationBuffer* restoration_buffer, void* dest);
405 
406 // Index 0 is Wiener Filter.
407 // Index 1 is Self Guided Restoration Filter.
408 // This can be accessed as LoopRestorationType - 2.
409 using LoopRestorationFuncs = LoopRestorationFunc[2];
410 
// Convolve function signature. Section 7.11.3.4.
// This function applies a horizontal filter followed by a vertical filter.
// |reference| is the input block (reference frame buffer). |reference_stride|
// is the corresponding frame stride.
// |vertical_filter_index|/|horizontal_filter_index| is the index to
// retrieve the type of filter to be applied for vertical/horizontal direction
// from the filter lookup table 'kSubPixelFilters'.
// |horizontal_filter_id| and |vertical_filter_id| are the filter ids.
// |width| and |height| are width and height of the block to be filtered.
// |prediction| is the output block (output frame buffer).
// Rounding precision is derived from the function being called. For horizontal
// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
// be used.
using ConvolveFunc = void (*)(const void* reference, ptrdiff_t reference_stride,
                              int horizontal_filter_index,
                              int vertical_filter_index,
                              int horizontal_filter_id, int vertical_filter_id,
                              int width, int height, void* prediction,
                              ptrdiff_t pred_stride);

// Convolve functions signature. Each points to one convolve function with
// a specific setting:
// ConvolveFunc[is_intra_block_copy][is_compound][has_vertical_filter]
// [has_horizontal_filter].
// If is_compound is false, the prediction is clipped to Pixel.
// If is_compound is true, the range of prediction is:
//   8bpp:  [-5132,  9212] (int16_t)
//   10bpp: [ 3988, 61532] (uint16_t)
//   12bpp: [ 3974, 61559] (uint16_t)
// See src/dsp/convolve.cc
using ConvolveFuncs = ConvolveFunc[2][2][2][2];
446 
// Convolve + scale function signature. Section 7.11.3.4.
// This function applies a horizontal filter followed by a vertical filter.
// |reference| is the input block (reference frame buffer). |reference_stride|
// is the corresponding frame stride.
// |vertical_filter_index|/|horizontal_filter_index| is the index to
// retrieve the type of filter to be applied for vertical/horizontal direction
// from the filter lookup table 'kSubPixelFilters'.
// |subpixel_x| and |subpixel_y| are starting positions in units of 1/1024.
// |step_x| and |step_y| are step sizes in units of 1/1024 of a pixel.
// |width| and |height| are width and height of the block to be filtered.
// |prediction| is the output block (output frame buffer).
// Rounding precision is derived from the function being called. For horizontal
// filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
// used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
// used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
// be used.
using ConvolveScaleFunc = void (*)(const void* reference,
                                   ptrdiff_t reference_stride,
                                   int horizontal_filter_index,
                                   int vertical_filter_index, int subpixel_x,
                                   int subpixel_y, int step_x, int step_y,
                                   int width, int height, void* prediction,
                                   ptrdiff_t pred_stride);

// Convolve functions signature for scaling version.
// 0: single predictor. 1: compound predictor.
using ConvolveScaleFuncs = ConvolveScaleFunc[2];
476 
// Weight mask function signature. Section 7.11.3.12.
// |prediction_0| is the first input block.
// |prediction_1| is the second input block. Both blocks are int16_t* when
// bitdepth == 8 and uint16_t* otherwise.
// |width| and |height| are the prediction width and height. They are not
// parameters of this signature; they select the function through the
// WeightMaskFuncs indices.
// The stride for the input buffers is equal to |width|.
// The valid range of block size is [8x8, 128x128] for the luma plane.
// |mask| is the output buffer. |mask_stride| is the output buffer stride.
using WeightMaskFunc = void (*)(const void* prediction_0,
                                const void* prediction_1, uint8_t* mask,
                                ptrdiff_t mask_stride);

// Weight mask functions signature. The dimensions (in order) are:
//   * Width index (4 => 0, 8 => 1, 16 => 2 and so on).
//   * Height index (4 => 0, 8 => 1, 16 => 2 and so on).
//   * mask_is_inverse.
using WeightMaskFuncs = WeightMaskFunc[6][6][2];
494 
// Average blending function signature.
// Two predictors are averaged to generate the output.
// Input predictor values are int16_t. Output type is uint8_t, with actual
// range of Pixel value.
// Average blending is in the bottom of Section 7.11.3.1 (COMPOUND_AVERAGE).
// |prediction_0| is the first input block.
// |prediction_1| is the second input block. Both blocks are int16_t* when
// bitdepth == 8 and uint16_t* otherwise.
// |width| and |height| are the same for the first and second input blocks.
// The stride for the input buffers is equal to |width|.
// The valid range of block size is [8x8, 128x128] for the luma plane.
// |dest| is the output buffer. |dest_stride| is the output buffer stride.
using AverageBlendFunc = void (*)(const void* prediction_0,
                                  const void* prediction_1, int width,
                                  int height, void* dest,
                                  ptrdiff_t dest_stride);
511 
// Distance weighted blending function signature.
// Weights are generated in Section 7.11.3.15.
// Weighted blending is in the bottom of Section 7.11.3.1 (COMPOUND_DISTANCE).
// This function takes two blocks (inter frame prediction) and produces a
// weighted output.
// |prediction_0| is the first input block.
// |prediction_1| is the second input block. Both blocks are int16_t* when
// bitdepth == 8 and uint16_t* otherwise.
// |weight_0| is the weight for the first block. It is derived from the relative
// distance of the first reference frame and the current frame.
// |weight_1| is the weight for the second block. It is derived from the
// relative distance of the second reference frame and the current frame.
// |width| and |height| are the same for the first and second input blocks.
// The stride for the input buffers is equal to |width|.
// The valid range of block size is [8x8, 128x128] for the luma plane.
// |dest| is the output buffer. |dest_stride| is the output buffer stride.
using DistanceWeightedBlendFunc = void (*)(const void* prediction_0,
                                           const void* prediction_1,
                                           uint8_t weight_0, uint8_t weight_1,
                                           int width, int height, void* dest,
                                           ptrdiff_t dest_stride);
533 
// Mask blending function signature. Section 7.11.3.14.
// This function takes two blocks and produces a blended output stored into the
// output block |dest|. The blending is a weighted average process, controlled
// by values of the mask.
// |prediction_0| is the first input block. When prediction mode is inter_intra
// (or wedge_inter_intra), this refers to the inter frame prediction. It is
// int16_t* when bitdepth == 8 and uint16_t* otherwise.
// The stride for |prediction_0| is equal to |width|.
// |prediction_1| is the second input block. When prediction mode is inter_intra
// (or wedge_inter_intra), this refers to the intra frame prediction and uses
// Pixel values. It is only used for intra frame prediction when bitdepth >= 10.
// It is int16_t* when bitdepth == 8 and uint16_t* otherwise.
// |prediction_stride_1| is the stride, given in units of [u]int16_t. When
// |is_inter_intra| is false (compound prediction) then |prediction_stride_1| is
// equal to |width|.
// |mask| is an integer array, whose value indicates the weight of the blending.
// |mask_stride| is corresponding stride.
// |width|, |height| are the same for both input blocks.
// If it's inter_intra (or wedge_inter_intra), the valid range of block size is
// [8x8, 32x32]. Otherwise (including difference weighted prediction and
// compound average prediction), the valid range is [8x8, 128x128].
// If there's subsampling, the corresponding width and height are halved for
// chroma planes.
// |dest| is the output block.
// |dest_stride| is the corresponding stride for dest.
//
// The following are not parameters of this signature. |subsampling_x|,
// |subsampling_y| and |is_inter_intra| select the function through the
// MaskBlendFuncs indices.
// |subsampling_x|, |subsampling_y| are the subsampling factors.
// |is_inter_intra| stands for the prediction mode. If it is true, one of the
// prediction blocks is from intra prediction of current frame. Otherwise, two
// prediction blocks are both inter frame predictions.
// |is_wedge_inter_intra| indicates if the mask is for the wedge prediction.
using MaskBlendFunc = void (*)(const void* prediction_0,
                               const void* prediction_1,
                               ptrdiff_t prediction_stride_1,
                               const uint8_t* mask, ptrdiff_t mask_stride,
                               int width, int height, void* dest,
                               ptrdiff_t dest_stride);

// Mask blending functions signature. Each points to one function with
// a specific setting:
// MaskBlendFunc[subsampling_x + subsampling_y][is_inter_intra].
using MaskBlendFuncs = MaskBlendFunc[3][2];
575 
// This function is similar to the MaskBlendFunc. It is only used when
// |is_inter_intra| is true and |bitdepth| == 8.
// |prediction_[01]| are Pixel values (uint8_t).
// |prediction_1| is also the output buffer.
using InterIntraMaskBlendFunc8bpp = void (*)(const uint8_t* prediction_0,
                                             uint8_t* prediction_1,
                                             ptrdiff_t prediction_stride_1,
                                             const uint8_t* mask,
                                             ptrdiff_t mask_stride, int width,
                                             int height);

// InterIntra8bpp mask blending functions signature. When is_wedge_inter_intra
// is false, the function at index 0 must be used. Otherwise, the function at
// index subsampling_x + subsampling_y must be used.
using InterIntraMaskBlendFuncs8bpp = InterIntraMaskBlendFunc8bpp[3];
591 
592 // Obmc (overlapped block motion compensation) blending function signature.
593 // Section 7.11.3.10.
594 // This function takes two blocks and produces a blended output stored into the
595 // first input block. The blending is a weighted average process, controlled by
596 // values of the mask.
597 // Obmc is not a compound mode. It differs from other compound blending in
598 // terms of precision: the current block is computed using convolution with
599 // clipping to the range of pixel values, and its above and left blocks are
600 // also clipped. Therefore obmc blending doesn't need to clip the output.
601 // |prediction| is the first input block, which will be overwritten.
602 // |prediction_stride| is the stride, given in bytes.
603 // |width|, |height| are the same for both input blocks.
604 // |obmc_prediction| is the second input block.
605 // |obmc_prediction_stride| is its stride, given in bytes.
606 using ObmcBlendFunc = void (*)(void* prediction, ptrdiff_t prediction_stride,
607                                int width, int height,
608                                const void* obmc_prediction,
609                                ptrdiff_t obmc_prediction_stride);
610 using ObmcBlendFuncs = ObmcBlendFunc[kNumObmcDirections];
611 
612 // Warp function signature. Section 7.11.3.5.
613 // This function applies warp filtering for each 8x8 block inside the current
614 // coding block. The filtering process is similar to 2d convolve filtering.
615 // The horizontal filter is applied followed by the vertical filter.
616 // The function has to calculate corresponding pixel positions before and
617 // after warping.
618 // |source| is the input reference frame buffer.
619 // |source_stride|, |source_width|, |source_height| are corresponding frame
620 // stride, width, and height. |source_stride| is given in bytes.
621 // |warp_params| is the matrix of warp motion: warp_params[i] = mN.
622 //         [x'     (m2 m3 m0   [x
623 //     z .  y'  =   m4 m5 m1 *  y
624 //          1]      m6 m7 1)    1]
625 // |subsampling_x/y| is the current frame's plane subsampling factor.
626 // |block_start_x| and |block_start_y| are the starting position of the current
627 // coding block.
628 // |block_width| and |block_height| are width and height of the current coding
629 // block. |block_width| and |block_height| are at least 8.
630 // |alpha|, |beta|, |gamma|, |delta| are valid warp parameters. See the
631 // comments in the definition of struct GlobalMotion for the range of their
632 // values.
633 // |dest| is the output buffer of type Pixel. The output values are clipped to
634 // Pixel values.
635 // |dest_stride| is the stride, in units of bytes.
636 // Rounding precision is derived from the function being called. For horizontal
637 // filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
638 // used. For vertical filtering kInterRoundBitsVertical &
639 // kInterRoundBitsVertical12bpp will be used.
640 //
641 // NOTE: WarpFunc assumes the source frame has left, right, top, and bottom
642 // borders that extend the frame boundary pixels.
643 // * The left and right borders must be at least 13 pixels wide. In addition,
644 //   Warp_NEON() may read up to 14 bytes after a row in the |source| buffer.
645 //   Therefore, there must be at least one extra padding byte after the right
646 //   border of the last row in the source buffer.
647 // * The top and bottom borders must be at least 13 pixels high.
648 using WarpFunc = void (*)(const void* source, ptrdiff_t source_stride,
649                           int source_width, int source_height,
650                           const int* warp_params, int subsampling_x,
651                           int subsampling_y, int block_start_x,
652                           int block_start_y, int block_width, int block_height,
653                           int16_t alpha, int16_t beta, int16_t gamma,
654                           int16_t delta, void* dest, ptrdiff_t dest_stride);
655 
656 // Warp for compound predictions. Section 7.11.3.5.
657 // Similar to WarpFunc, but |dest| is a uint16_t predictor buffer,
658 // |dest_stride| is given in units of uint16_t and the vertical rounding is
659 // always 7 bits (kInterRoundBitsCompoundVertical; NOTE(review): confirm name).
660 // Rounding precision is derived from the function being called. For horizontal
661 // filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
662 // used. For vertical filtering kInterRoundBitsCompoundVertical will be used.
663 using WarpCompoundFunc = WarpFunc;
664 
665 constexpr int kNumAutoRegressionLags = 4;
666 // Applies an auto-regressive filter to the white noise in |luma_grain_buffer|.
667 // Section 7.18.3.3, second code block.
668 // |params| are parameters read from the frame header, mainly providing
669 // auto_regression_coeff_y for the filter and auto_regression_shift to right
670 // shift the filter sum by. Note: This method assumes
671 // params.auto_regression_coeff_lag is not 0. Do not call this method if
672 // params.auto_regression_coeff_lag is 0.
673 using LumaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
674                                         void* luma_grain_buffer);
675 // Function index is auto_regression_coeff_lag - 1 (lag 0 has no function).
676 using LumaAutoRegressionFuncs =
677     LumaAutoRegressionFunc[kNumAutoRegressionLags - 1];
678 
679 // Applies an auto-regressive filter to the white noise in the chroma buffers.
680 // Section 7.18.3.3, third code block.
681 // The |luma_grain_buffer| provides samples that are added to the autoregressive
682 // sum when num_y_points > 0.
683 // |u_grain_buffer| and |v_grain_buffer| point to the buffers of chroma noise
684 // that were generated from the stored Gaussian sequence, and are overwritten
685 // with the results of the autoregressive filter. |params| are parameters read
686 // from the frame header, mainly providing auto_regression_coeff_u and
687 // auto_regression_coeff_v for each chroma plane's filter, and
688 // auto_regression_shift to right shift the filter sums by.
689 using ChromaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
690                                           const void* luma_grain_buffer,
691                                           int subsampling_x, int subsampling_y,
692                                           void* u_grain_buffer,
693                                           void* v_grain_buffer);
694 using ChromaAutoRegressionFuncs =
695     ChromaAutoRegressionFunc[/*use_luma*/ 2][kNumAutoRegressionLags];
696 
697 // Builds an image-wide "stripe" of grain noise for every 32 rows in the image.
698 // Section 7.18.3.5, first code block.
699 // Each 32x32 luma block is copied at a random offset specified via
700 // |grain_seed| from the grain template produced by autoregression, and the same
701 // is done for chroma grains, subject to subsampling.
702 // |width| and |height| are the dimensions of the overall image.
703 // |noise_stripes_buffer| points to an Array2DView with one row for each stripe.
704 // Because this function treats all planes identically and independently, it is
705 // simplified to take one grain buffer at a time. This means duplicating some
706 // random number generations, but that work can be reduced in other ways.
707 using ConstructNoiseStripesFunc = void (*)(const void* grain_buffer,
708                                            int grain_seed, int width,
709                                            int height, int subsampling_x,
710                                            int subsampling_y,
711                                            void* noise_stripes_buffer);
712 using ConstructNoiseStripesFuncs =
713     ConstructNoiseStripesFunc[/*overlap_flag*/ 2];
714 
715 // Computes the one or two overlap rows for each stripe copied to the noise
716 // image.
717 // Section 7.18.3.5, second code block. |width| and |height| are the
718 // dimensions of the overall image. |noise_stripes_buffer| points to an
719 // Array2DView with one row for each stripe. |noise_image_buffer| points to an
720 // Array2D containing the allocated plane for this frame. Because this function
721 // treats all planes identically and independently, it is simplified to take one
722 // grain buffer at a time.
723 using ConstructNoiseImageOverlapFunc =
724     void (*)(const void* noise_stripes_buffer, int width, int height,
725              int subsampling_x, int subsampling_y, void* noise_image_buffer);
726 
727 // Populates the scaling lookup table |scaling_lut| with interpolated values of
728 // a piecewise linear function where values in |point_value| are mapped to the
729 // values in |point_scaling|.
730 // |num_points| can be between 0 and 15. When 0, the lookup table is set to
731 // zero.
732 // |point_value| and |point_scaling| have |num_points| valid elements.
733 using InitializeScalingLutFunc = void (*)(
734     int num_points, const uint8_t point_value[], const uint8_t point_scaling[],
735     uint8_t scaling_lut[kScalingLookupTableSize]);
736 
737 // Blend noise with image. Section 7.18.3.5, third code block.
738 // |width| is the width of each row, while |height| is how many rows to compute.
739 // |start_height| is an offset for the noise image, to support multithreading.
740 // |min_value| and |max_value| (presumably max_luma or max_chroma, depending on
741 // the plane) are computed by the caller, according to the code in the spec.
742 // |source_plane_y| and |source_plane_uv| are the plane buffers of the decoded
743 // frame. They are blended with the film grain noise and written to
744 // |dest_plane_y| and |dest_plane_uv| as final output for display.
745 // source_plane_* and dest_plane_* may point to the same buffer, in which case
746 // the film grain noise is added in place.
747 // |scaling_lut_y| and |scaling_lut| represent a piecewise linear mapping from
748 // the frame's raw pixel value, to a scaling factor for the noise sample.
749 // |scaling_shift| is applied as a right shift after scaling, so that scaling
750 // down is possible. It is found in FilmGrainParams, but supplied directly to
751 // BlendNoiseWithImageLumaFunc because it's the only member used.
752 using BlendNoiseWithImageLumaFunc =
753     void (*)(const void* noise_image_ptr, int min_value, int max_value,
754              int scaling_shift, int width, int height, int start_height,
755              const uint8_t scaling_lut_y[kScalingLookupTableSize],
756              const void* source_plane_y, ptrdiff_t source_stride_y,
757              void* dest_plane_y, ptrdiff_t dest_stride_y);
758 
759 using BlendNoiseWithImageChromaFunc = void (*)(
760     Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
761     int min_value, int max_value, int width, int height, int start_height,
762     int subsampling_x, int subsampling_y,
763     const uint8_t scaling_lut[kScalingLookupTableSize],
764     const void* source_plane_y, ptrdiff_t source_stride_y,
765     const void* source_plane_uv, ptrdiff_t source_stride_uv,
766     void* dest_plane_uv, ptrdiff_t dest_stride_uv);
767 
768 using BlendNoiseWithImageChromaFuncs =
769     BlendNoiseWithImageChromaFunc[/*chroma_scaling_from_luma*/ 2];
770 
771 //------------------------------------------------------------------------------
772 
773 struct FilmGrainFuncs {  // Film grain synthesis function pointers.
774   LumaAutoRegressionFuncs luma_auto_regression;  // [coeff_lag - 1]
775   ChromaAutoRegressionFuncs chroma_auto_regression;  // [use_luma][...]
776   ConstructNoiseStripesFuncs construct_noise_stripes;  // [overlap_flag]
777   ConstructNoiseImageOverlapFunc construct_noise_image_overlap;
778   InitializeScalingLutFunc initialize_scaling_lut;
779   BlendNoiseWithImageLumaFunc blend_noise_luma;
780   BlendNoiseWithImageChromaFuncs blend_noise_chroma;  // [scaling_from_luma]
781 };
782 
783 // Motion field projection function signature. Section 7.9.
784 // |reference_info| provides reference information for motion field projection.
785 // |reference_to_current_with_sign| is the precalculated reference frame id
786 // distance from the current frame.
787 // |dst_sign| is -1 for LAST_FRAME and LAST2_FRAME, or 0 (1 in spec) for others.
788 // |y8_start| and |y8_end| are the start and end 8x8 rows of the current tile.
789 // |x8_start| and |x8_end| are the start and end 8x8 columns of the current
790 // tile.
791 // |motion_field| is the output that stores the projected motion field
792 // information.
793 using MotionFieldProjectionKernelFunc = void (*)(
794     const ReferenceInfo& reference_info, int reference_to_current_with_sign,
795     int dst_sign, int y8_start, int y8_end, int x8_start, int x8_end,
796     TemporalMotionField* motion_field);
797 
798 // Compound temporal motion vector projection function signature.
799 // Section 7.9.3 and 7.10.2.10.
800 // |temporal_mvs| is the set of temporal reference motion vectors.
801 // |temporal_reference_offsets| specifies the number of frames covered by the
802 // original motion vector.
803 // |reference_offsets| specifies, as two values, the number of frames to be
804 // covered by the projected motion vector.
805 // |count| is the number of temporal motion vectors.
806 // |candidate_mvs| is the output set of projected motion vectors.
807 using MvProjectionCompoundFunc = void (*)(
808     const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
809     const int reference_offsets[2], int count,
810     CompoundMotionVector* candidate_mvs);
811 
812 // Single temporal motion vector projection function signature.
813 // Section 7.9.3 and 7.10.2.10.
814 // |temporal_mvs| is the set of temporal reference motion vectors.
815 // |temporal_reference_offsets| specifies the number of frames covered by the
816 // original motion vector.
817 // |reference_offset| specifies the number of frames to be covered by the
818 // projected motion vector.
819 // |count| is the number of temporal motion vectors.
820 // |candidate_mvs| is the output set of projected motion vectors.
821 using MvProjectionSingleFunc = void (*)(
822     const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
823     int reference_offset, int count, MotionVector* candidate_mvs);
824 
825 struct Dsp {  // Table of DSP function pointers; see GetDspTable().
826   AverageBlendFunc average_blend;
827   CdefDirectionFunc cdef_direction;
828   CdefFilteringFuncs cdef_filters;
829   CflIntraPredictorFuncs cfl_intra_predictors;
830   CflSubsamplerFuncs cfl_subsamplers;
831   ConvolveFuncs convolve;
832   ConvolveScaleFuncs convolve_scale;
833   DirectionalIntraPredictorZone1Func directional_intra_predictor_zone1;
834   DirectionalIntraPredictorZone2Func directional_intra_predictor_zone2;
835   DirectionalIntraPredictorZone3Func directional_intra_predictor_zone3;
836   DistanceWeightedBlendFunc distance_weighted_blend;
837   FilmGrainFuncs film_grain;
838   FilterIntraPredictorFunc filter_intra_predictor;
839   InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp;  // 8bpp only.
840   IntraEdgeFilterFunc intra_edge_filter;
841   IntraEdgeUpsamplerFunc intra_edge_upsampler;
842   IntraPredictorFuncs intra_predictors;
843   InverseTransformAddFuncs inverse_transforms;
844   LoopFilterFuncs loop_filters;
845   LoopRestorationFuncs loop_restorations;
846   MaskBlendFuncs mask_blend;
847   MotionFieldProjectionKernelFunc motion_field_projection_kernel;
848   MvProjectionCompoundFunc mv_projection_compound[3];
849   MvProjectionSingleFunc mv_projection_single[3];
850   ObmcBlendFuncs obmc_blend;  // One entry per obmc direction.
851   SuperResCoefficientsFunc super_res_coefficients;
852   SuperResFunc super_res;
853   WarpCompoundFunc warp_compound;  // Same signature as |warp|.
854   WarpFunc warp;
855   WeightMaskFuncs weight_mask;
856 };
857 
858 // Initializes the Dsp function pointers based on the build configuration and
859 // the runtime environment. Must be called once before first use. This function
860 // is thread-safe.
861 void DspInit();
862 
863 // Returns the read-only Dsp table for |bitdepth|, or nullptr if one doesn't
864 // exist.
865 const Dsp* GetDspTable(int bitdepth);
866 
867 }  // namespace dsp
868 
869 namespace dsp_internal {
870 
871 // Visual Studio x86/x64 builds can't detect SSE4_1 at compile time. Only
872 // exclude the C functions there if /arch:AVX2 is used across all sources.
873 #if !LIBGAV1_TARGETING_AVX2 && \
874     (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64)))
875 #undef LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
876 #define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 1
877 #endif
878 
879 // Returns true if a more highly optimized version of |func| is not defined for
880 // the associated bitdepth or if it is forcibly enabled with
881 // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS. The define checked for |func| corresponds
882 // to the LIBGAV1_Dsp<bitdepth>bpp_|func| define in the header file associated
883 // with the module.
884 // |func| is one of:
885 //   - FunctionName, e.g., SelfGuidedFilter.
886 //   - [sub-table-index1][...-indexN] e.g.,
887 //     TransformSize4x4_IntraPredictorDc. The indices correspond to enum values
888 //     used as lookups with leading 'k' removed.
889 //
890 // NEON support is the only extension available for ARM and it is always
891 // required. Because of this restriction DSP_ENABLED_8BPP_NEON(func) is always
892 // true and can be omitted.
893 #define DSP_ENABLED_8BPP_AVX2(func)    \
894   (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
895    LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_AVX2)
896 #define DSP_ENABLED_10BPP_AVX2(func)   \
897   (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
898    LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_AVX2)
899 #define DSP_ENABLED_8BPP_SSE4_1(func)  \
900   (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
901    LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_SSE4_1)
902 #define DSP_ENABLED_10BPP_SSE4_1(func) \
903   (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
904    LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1)
905 
906 // Initializes C-only function pointers. Note some entries may be set to
907 // nullptr if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is 0 (its default). This is
908 // meant for use in tests only; it is not thread-safe.
909 void DspInit_C();
910 
911 // Returns the writable Dsp table for |bitdepth|, or nullptr if one doesn't
912 // exist. This version is meant for use by test or dsp/*Init() functions only.
913 dsp::Dsp* GetWritableDspTable(int bitdepth);
914 
915 }  // namespace dsp_internal
916 }  // namespace libgav1
917 
918 #endif  // LIBGAV1_SRC_DSP_DSP_H_
919