1 /*
2 * Copyright 2019 The libgav1 Authors
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #ifndef LIBGAV1_SRC_DSP_DSP_H_
18 #define LIBGAV1_SRC_DSP_DSP_H_
19
20 #include <cstddef>
21 #include <cstdint>
22 #include <cstdlib>
23
24 #include "src/dsp/common.h"
25 #include "src/dsp/constants.h"
26 #include "src/dsp/film_grain_common.h"
27 #include "src/utils/cpu.h"
28 #include "src/utils/reference_info.h"
29 #include "src/utils/types.h"
30
31 namespace libgav1 {
32 namespace dsp {
33
34 #if !defined(LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS)
35 #define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 0
36 #endif
37
// Identifies an intra predictor. The enumerators are consecutive values
// starting at zero; kNumIntraPredictors sizes the last dimension of
// IntraPredictorFuncs.
enum IntraPredictor : uint8_t {
  kIntraPredictorDcFill = 0,
  kIntraPredictorDcTop,
  kIntraPredictorDcLeft,
  kIntraPredictorDc,
  kIntraPredictorVertical,
  kIntraPredictorHorizontal,
  kIntraPredictorPaeth,
  kIntraPredictorSmooth,
  kIntraPredictorSmoothVertical,
  kIntraPredictorSmoothHorizontal,
  // Count of the enumerators above. Keep this entry last.
  kNumIntraPredictors
};
51
// The valid 1D transforms. The enumerators are consecutive values starting at
// zero; kNum1DTransforms sizes the first dimension of
// InverseTransformAddFuncs.
enum Transform1D : uint8_t {
  // Discrete Cosine Transform.
  k1DTransformDct = 0,
  // Asymmetric Discrete Sine Transform.
  k1DTransformAdst,
  k1DTransformIdentity,
  // Walsh Hadamard Transform.
  k1DTransformWht,
  // Count of the enumerators above. Keep this entry last.
  kNum1DTransforms
};
60
// The valid 1D transform sizes. A given transform may not be available for
// every size. The enumerators are consecutive values starting at zero;
// kNum1DTransformSizes sizes the second dimension of InverseTransformAddFuncs.
enum TransformSize1D : uint8_t {
  k1DTransformSize4 = 0,
  k1DTransformSize8,
  k1DTransformSize16,
  k1DTransformSize32,
  k1DTransformSize64,
  // Count of the enumerators above. Keep this entry last.
  kNum1DTransformSizes
};
71
// The maximum width of the loop filter. Fewer pixels may be filtered, depending
// on strength thresholds. The enumerators are consecutive values starting at
// zero; kNumLoopFilterSizes sizes the first dimension of LoopFilterFuncs.
enum LoopFilterSize : uint8_t {
  kLoopFilterSize4 = 0,
  kLoopFilterSize6,
  kLoopFilterSize8,
  kLoopFilterSize14,
  // Count of the enumerators above. Keep this entry last.
  kNumLoopFilterSizes
};
81
// Indices for the final dimension of InverseTransformAddFuncs, which holds the
// row and column transforms.
enum : uint8_t { kRow = 0, kColumn = 1 };
86
87 //------------------------------------------------------------------------------
88 // ToString()
89 //
90 // These functions are meant to be used only in debug logging and within tests.
91 // They are defined inline to avoid including the strings in the release
92 // library when logging is disabled; unreferenced functions will not be added to
93 // any object file in that case.
94
ToString(const IntraPredictor predictor)95 inline const char* ToString(const IntraPredictor predictor) {
96 switch (predictor) {
97 case kIntraPredictorDcFill:
98 return "kIntraPredictorDcFill";
99 case kIntraPredictorDcTop:
100 return "kIntraPredictorDcTop";
101 case kIntraPredictorDcLeft:
102 return "kIntraPredictorDcLeft";
103 case kIntraPredictorDc:
104 return "kIntraPredictorDc";
105 case kIntraPredictorVertical:
106 return "kIntraPredictorVertical";
107 case kIntraPredictorHorizontal:
108 return "kIntraPredictorHorizontal";
109 case kIntraPredictorPaeth:
110 return "kIntraPredictorPaeth";
111 case kIntraPredictorSmooth:
112 return "kIntraPredictorSmooth";
113 case kIntraPredictorSmoothVertical:
114 return "kIntraPredictorSmoothVertical";
115 case kIntraPredictorSmoothHorizontal:
116 return "kIntraPredictorSmoothHorizontal";
117 case kNumIntraPredictors:
118 return "kNumIntraPredictors";
119 }
120 abort();
121 }
122
ToString(const Transform1D transform)123 inline const char* ToString(const Transform1D transform) {
124 switch (transform) {
125 case k1DTransformDct:
126 return "k1DTransformDct";
127 case k1DTransformAdst:
128 return "k1DTransformAdst";
129 case k1DTransformIdentity:
130 return "k1DTransformIdentity";
131 case k1DTransformWht:
132 return "k1DTransformWht";
133 case kNum1DTransforms:
134 return "kNum1DTransforms";
135 }
136 abort();
137 }
138
ToString(const TransformSize1D transform_size)139 inline const char* ToString(const TransformSize1D transform_size) {
140 switch (transform_size) {
141 case k1DTransformSize4:
142 return "k1DTransformSize4";
143 case k1DTransformSize8:
144 return "k1DTransformSize8";
145 case k1DTransformSize16:
146 return "k1DTransformSize16";
147 case k1DTransformSize32:
148 return "k1DTransformSize32";
149 case k1DTransformSize64:
150 return "k1DTransformSize64";
151 case kNum1DTransformSizes:
152 return "kNum1DTransformSizes";
153 }
154 abort();
155 }
156
ToString(const LoopFilterSize filter_size)157 inline const char* ToString(const LoopFilterSize filter_size) {
158 switch (filter_size) {
159 case kLoopFilterSize4:
160 return "kLoopFilterSize4";
161 case kLoopFilterSize6:
162 return "kLoopFilterSize6";
163 case kLoopFilterSize8:
164 return "kLoopFilterSize8";
165 case kLoopFilterSize14:
166 return "kLoopFilterSize14";
167 case kNumLoopFilterSizes:
168 return "kNumLoopFilterSizes";
169 }
170 abort();
171 }
172
ToString(const LoopFilterType filter_type)173 inline const char* ToString(const LoopFilterType filter_type) {
174 switch (filter_type) {
175 case kLoopFilterTypeVertical:
176 return "kLoopFilterTypeVertical";
177 case kLoopFilterTypeHorizontal:
178 return "kLoopFilterTypeHorizontal";
179 case kNumLoopFilterTypes:
180 return "kNumLoopFilterTypes";
181 }
182 abort();
183 }
184
185 //------------------------------------------------------------------------------
186 // Intra predictors. Section 7.11.2.
187 // These require access to one or both of the top row and left column. Some may
188 // access the top-left (top[-1]), top-right (top[width+N]), bottom-left
189 // (left[height+N]) or upper-left (left[-1]).
190
191 // Intra predictor function signature. Sections 7.11.2.2, 7.11.2.4 (#10,#11),
192 // 7.11.2.5, 7.11.2.6.
193 // |dst| is an unaligned pointer to the output block. Pixel size is determined
194 // by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
195 // the row above |dst|. |left| is an aligned vector of the column to the left
196 // of |dst|. top-left and bottom-left may be accessed.
197 using IntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
198 const void* top, const void* left);
199 using IntraPredictorFuncs =
200 IntraPredictorFunc[kNumTransformSizes][kNumIntraPredictors];
201
202 // Directional intra predictor function signature, zone 1 (0 < angle < 90).
203 // Section 7.11.2.4 (#7).
204 // |dst| is an unaligned pointer to the output block. Pixel size is determined
205 // by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
206 // the row above |dst|. |width| and |height| give the dimensions of the block.
207 // |xstep| is the scaled starting index to |top| from
208 // kDirectionalIntraPredictorDerivative. |upsampled_top| indicates whether
209 // |top| has been upsampled as described in '7.11.2.11. Intra edge upsample
210 // process'. This can occur in cases with |width| + |height| <= 16. top-right
211 // is accessed.
212 using DirectionalIntraPredictorZone1Func = void (*)(void* dst, ptrdiff_t stride,
213 const void* top, int width,
214 int height, int xstep,
215 bool upsampled_top);
216
217 // Directional intra predictor function signature, zone 2 (90 < angle < 180).
218 // Section 7.11.2.4 (#8).
219 // |dst| is an unaligned pointer to the output block. Pixel size is determined
220 // by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
221 // the row above |dst|. |left| is an aligned vector of the column to the left of
222 // |dst|. |width| and |height| give the dimensions of the block. |xstep| and
223 // |ystep| are the scaled starting index to |top| and |left|, respectively,
224 // from kDirectionalIntraPredictorDerivative. |upsampled_top| and
225 // |upsampled_left| indicate whether |top| and |left| have been upsampled as
226 // described in '7.11.2.11. Intra edge upsample process'. This can occur in
227 // cases with |width| + |height| <= 16. top-left and upper-left are accessed,
228 // up to [-2] in each if |upsampled_top/left| are set.
229 using DirectionalIntraPredictorZone2Func = void (*)(
230 void* dst, ptrdiff_t stride, const void* top, const void* left, int width,
231 int height, int xstep, int ystep, bool upsampled_top, bool upsampled_left);
232
233 // Directional intra predictor function signature, zone 3 (180 < angle < 270).
234 // Section 7.11.2.4 (#9).
235 // |dst| is an unaligned pointer to the output block. Pixel size is determined
236 // by bitdepth with |stride| given in bytes. |left| is an aligned vector of the
237 // column to the left of |dst|. |width| and |height| give the dimensions of the
238 // block. |ystep| is the scaled starting index to |left| from
239 // kDirectionalIntraPredictorDerivative. |upsampled_left| indicates whether
240 // |left| has been upsampled as described in '7.11.2.11. Intra edge upsample
241 // process'. This can occur in cases with |width| + |height| <= 16. bottom-left
242 // is accessed.
243 using DirectionalIntraPredictorZone3Func = void (*)(void* dst, ptrdiff_t stride,
244 const void* left, int width,
245 int height, int ystep,
246 bool upsampled_left);
247
248 // Filter intra predictor function signature. Section 7.11.2.3.
249 // |dst| is an unaligned pointer to the output block. Pixel size is determined
250 // by bitdepth with |stride| given in bytes. |top| is an unaligned pointer to
251 // the row above |dst|. |left| is an aligned vector of the column to the left
252 // of |dst|. |width| and |height| are the size of the block in pixels.
253 using FilterIntraPredictorFunc = void (*)(void* dst, ptrdiff_t stride,
254 const void* top, const void* left,
255 FilterIntraPredictor pred, int width,
256 int height);
257
258 //------------------------------------------------------------------------------
259 // Chroma from Luma (Cfl) prediction. Section 7.11.5.
260
261 // Chroma from Luma (Cfl) intra prediction function signature. |dst| is an
262 // unaligned pointer to the output block. Pixel size is determined by bitdepth
263 // with |stride| given in bytes. |luma| contains subsampled luma pixels with 3
264 // fractional bits of precision. |alpha| is the signed Cfl alpha value for the
265 // appropriate plane.
266 using CflIntraPredictorFunc = void (*)(
267 void* dst, ptrdiff_t stride,
268 const int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride], int alpha);
269 using CflIntraPredictorFuncs = CflIntraPredictorFunc[kNumTransformSizes];
270
// Chroma from Luma (Cfl) subsampler function signature. |luma| is an unaligned
// pointer to the output block. |source| is an unaligned pointer to the input
// block. Pixel size is determined by bitdepth with |stride| given in bytes.
274 using CflSubsamplerFunc =
275 void (*)(int16_t luma[kCflLumaBufferStride][kCflLumaBufferStride],
276 int max_luma_width, int max_luma_height, const void* source,
277 ptrdiff_t stride);
278 using CflSubsamplerFuncs =
279 CflSubsamplerFunc[kNumTransformSizes][kNumSubsamplingTypes];
280
281 //------------------------------------------------------------------------------
282 // Intra Edge Filtering and Upsampling. Step 4 in section 7.11.2.4.
283
284 // Intra edge filter function signature. |buffer| is a pointer to the top_row or
285 // left_column that needs to be filtered. Typically the -1'th index of |top_row|
286 // and |left_column| need to be filtered as well, so the caller can merely pass
287 // the |buffer| as top_row[-1] or left_column[-1]. Pixel size is determined by
288 // bitdepth. |size| is the number of pixels to be filtered. |strength| is the
289 // filter strength. Section 7.11.2.12 in the spec.
290 using IntraEdgeFilterFunc = void (*)(void* buffer, int size, int strength);
291
292 // Intra edge upsampler function signature. |buffer| is a pointer to the top_row
293 // or left_column that needs to be upsampled. Pixel size is determined by
294 // bitdepth. |size| is the number of pixels to be upsampled; valid values are:
295 // 4, 8, 12, 16. This function needs access to negative indices -1 and -2 of
296 // the |buffer|. Section 7.11.2.11 in the spec.
297 using IntraEdgeUpsamplerFunc = void (*)(void* buffer, int size);
298
299 //------------------------------------------------------------------------------
300 // Inverse transform add function signature.
301 //
302 // Steps 2 and 3 of section 7.12.3 (contains the implementation of section
303 // 7.13.3).
304 // Apply the inverse transforms and add the residual to the destination frame
305 // for the transform type and block size |tx_size| starting at position
306 // |start_x| and |start_y|. |dst_frame| is a pointer to an Array2D.
307 // |adjusted_tx_height| is the number of rows to process based on the non-zero
308 // coefficient count in the block. It will be 1 (non-zero coefficient count ==
309 // 1), 4 or a multiple of 8 up to 32 or the original transform height,
310 // whichever is less.
311 using InverseTransformAddFunc = void (*)(TransformType tx_type,
312 TransformSize tx_size,
313 int adjusted_tx_height,
314 void* src_buffer, int start_x,
315 int start_y, void* dst_frame);
316 // The final dimension holds row and column transforms indexed with kRow and
317 // kColumn.
318 using InverseTransformAddFuncs =
319 InverseTransformAddFunc[kNum1DTransforms][kNum1DTransformSizes][2];
320
321 //------------------------------------------------------------------------------
322 // Post processing.
323
324 // Loop filter function signature. Section 7.14.
325 // |dst| is an unaligned pointer to the output block. Pixel size is determined
326 // by bitdepth with |stride| given in bytes.
327 using LoopFilterFunc = void (*)(void* dst, ptrdiff_t stride, int outer_thresh,
328 int inner_thresh, int hev_thresh);
329 using LoopFilterFuncs =
330 LoopFilterFunc[kNumLoopFilterSizes][kNumLoopFilterTypes];
331
332 // Cdef direction function signature. Section 7.15.2.
333 // |src| is a pointer to the source block. Pixel size is determined by bitdepth
334 // with |stride| given in bytes. |direction| and |variance| are output
335 // parameters and must not be nullptr.
336 using CdefDirectionFunc = void (*)(const void* src, ptrdiff_t stride,
337 uint8_t* direction, int* variance);
338
339 // Cdef filtering function signature. Section 7.15.3.
340 // |source| is a pointer to the input block padded with kCdefLargeValue if at a
341 // frame border. |source_stride| is given in units of uint16_t.
// |block_height| is the height of the input block. The block width is not a
// parameter; it is the first index of CdefFilteringFuncs.
343 // |primary_strength|, |secondary_strength|, and |damping| are Cdef filtering
344 // parameters.
345 // |direction| is the filtering direction.
346 // |dest| is the output buffer. |dest_stride| is given in bytes.
347 using CdefFilteringFunc = void (*)(const uint16_t* source,
348 ptrdiff_t source_stride, int block_height,
349 int primary_strength, int secondary_strength,
350 int damping, int direction, void* dest,
351 ptrdiff_t dest_stride);
352
353 // The first index is block width: [0]: 4, [1]: 8. The second is based on
354 // non-zero strengths: [0]: |primary_strength| and |secondary_strength|, [1]:
355 // |primary_strength| only, [2]: |secondary_strength| only.
356 using CdefFilteringFuncs = CdefFilteringFunc[2][3];
357
358 // Upscaling coefficients function signature. Section 7.16.
359 // This is an auxiliary function for SIMD optimizations and has no corresponding
360 // C function. Different SIMD versions may have different outputs. So it must
361 // pair with the corresponding version of SuperResFunc.
362 // |upscaled_width| is the width of the output frame.
363 // |step| is the number of subpixels to move the kernel for the next destination
364 // pixel.
365 // |initial_subpixel_x| is a base offset from which |step| increments.
366 // |coefficients| is the upscale filter used by each pixel in a row.
367 using SuperResCoefficientsFunc = void (*)(int upscaled_width,
368 int initial_subpixel_x, int step,
369 void* coefficients);
370
371 // Upscaling process function signature. Section 7.16.
372 // |coefficients| is the upscale filter used by each pixel in a row. It is not
373 // used by the C function.
374 // |source| is the input frame buffer. It will be line extended.
375 // |source_stride| is given in pixels.
376 // |dest| is the output buffer.
377 // |dest_stride| is given in pixels.
378 // |height| is the height of the block to be processed.
379 // |downscaled_width| is the width of the input frame.
380 // |upscaled_width| is the width of the output frame.
381 // |step| is the number of subpixels to move the kernel for the next destination
382 // pixel.
383 // |initial_subpixel_x| is a base offset from which |step| increments.
384 using SuperResFunc = void (*)(const void* coefficients, void* source,
385 ptrdiff_t source_stride, int height,
386 int downscaled_width, int upscaled_width,
387 int initial_subpixel_x, int step, void* dest,
388 ptrdiff_t dest_stride);
389
390 // Loop restoration function signature. Sections 7.16, 7.17.
391 // |restoration_info| contains loop restoration information, such as filter
392 // type, strength.
393 // |source| is the input frame buffer, which is deblocked and cdef filtered.
394 // |top_border| and |bottom_border| are the top and bottom borders.
395 // |dest| is the output.
396 // |stride| is given in pixels, and shared by |source| and |dest|.
397 // |top_border_stride| and |bottom_border_stride| are given in pixels.
398 // |restoration_buffer| contains buffers required for self guided filter and
399 // wiener filter. They must be initialized before calling.
400 using LoopRestorationFunc = void (*)(
401 const RestorationUnitInfo& restoration_info, const void* source,
402 ptrdiff_t stride, const void* top_border, ptrdiff_t top_border_stride,
403 const void* bottom_border, ptrdiff_t bottom_border_stride, int width,
404 int height, RestorationBuffer* restoration_buffer, void* dest);
405
406 // Index 0 is Wiener Filter.
407 // Index 1 is Self Guided Restoration Filter.
408 // This can be accessed as LoopRestorationType - 2.
409 using LoopRestorationFuncs = LoopRestorationFunc[2];
410
411 // Convolve function signature. Section 7.11.3.4.
412 // This function applies a horizontal filter followed by a vertical filter.
413 // |reference| is the input block (reference frame buffer). |reference_stride|
414 // is the corresponding frame stride.
415 // |vertical_filter_index|/|horizontal_filter_index| is the index to
416 // retrieve the type of filter to be applied for vertical/horizontal direction
417 // from the filter lookup table 'kSubPixelFilters'.
418 // |horizontal_filter_id| and |vertical_filter_id| are the filter ids.
419 // |width| and |height| are width and height of the block to be filtered.
// |ref_last_x| and |ref_last_y| (the last pixel of the reference frame in x/y
// direction) are not parameters of this signature.
422 // |prediction| is the output block (output frame buffer).
423 // Rounding precision is derived from the function being called. For horizontal
424 // filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
425 // used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
426 // used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
427 // be used.
428 using ConvolveFunc = void (*)(const void* reference, ptrdiff_t reference_stride,
429 int horizontal_filter_index,
430 int vertical_filter_index,
431 int horizontal_filter_id, int vertical_filter_id,
432 int width, int height, void* prediction,
433 ptrdiff_t pred_stride);
434
435 // Convolve functions signature. Each points to one convolve function with
436 // a specific setting:
437 // ConvolveFunc[is_intra_block_copy][is_compound][has_vertical_filter]
438 // [has_horizontal_filter].
439 // If is_compound is false, the prediction is clipped to Pixel.
440 // If is_compound is true, the range of prediction is:
441 // 8bpp: [-5132, 9212] (int16_t)
442 // 10bpp: [ 3988, 61532] (uint16_t)
443 // 12bpp: [ 3974, 61559] (uint16_t)
444 // See src/dsp/convolve.cc
445 using ConvolveFuncs = ConvolveFunc[2][2][2][2];
446
447 // Convolve + scale function signature. Section 7.11.3.4.
448 // This function applies a horizontal filter followed by a vertical filter.
449 // |reference| is the input block (reference frame buffer). |reference_stride|
450 // is the corresponding frame stride.
451 // |vertical_filter_index|/|horizontal_filter_index| is the index to
452 // retrieve the type of filter to be applied for vertical/horizontal direction
453 // from the filter lookup table 'kSubPixelFilters'.
454 // |subpixel_x| and |subpixel_y| are starting positions in units of 1/1024.
455 // |step_x| and |step_y| are step sizes in units of 1/1024 of a pixel.
456 // |width| and |height| are width and height of the block to be filtered.
// |ref_last_x| and |ref_last_y| (the last pixel of the reference frame in x/y
// direction) are not parameters of this signature.
459 // |prediction| is the output block (output frame buffer).
460 // Rounding precision is derived from the function being called. For horizontal
461 // filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
462 // used. For compound vertical filtering kInterRoundBitsCompoundVertical will be
463 // used. Otherwise kInterRoundBitsVertical & kInterRoundBitsVertical12bpp will
464 // be used.
465 using ConvolveScaleFunc = void (*)(const void* reference,
466 ptrdiff_t reference_stride,
467 int horizontal_filter_index,
468 int vertical_filter_index, int subpixel_x,
469 int subpixel_y, int step_x, int step_y,
470 int width, int height, void* prediction,
471 ptrdiff_t pred_stride);
472
473 // Convolve functions signature for scaling version.
474 // 0: single predictor. 1: compound predictor.
475 using ConvolveScaleFuncs = ConvolveScaleFunc[2];
476
477 // Weight mask function signature. Section 7.11.3.12.
478 // |prediction_0| is the first input block.
479 // |prediction_1| is the second input block. Both blocks are int16_t* when
480 // bitdepth == 8 and uint16_t* otherwise.
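// |width| and |height| are the prediction width and height. They are not
// parameters; they select the function through the width and height indices
// of WeightMaskFuncs.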
482 // The stride for the input buffers is equal to |width|.
483 // The valid range of block size is [8x8, 128x128] for the luma plane.
484 // |mask| is the output buffer. |mask_stride| is the output buffer stride.
485 using WeightMaskFunc = void (*)(const void* prediction_0,
486 const void* prediction_1, uint8_t* mask,
487 ptrdiff_t mask_stride);
488
489 // Weight mask functions signature. The dimensions (in order) are:
490 // * Width index (4 => 0, 8 => 1, 16 => 2 and so on).
491 // * Height index (4 => 0, 8 => 1, 16 => 2 and so on).
492 // * mask_is_inverse.
493 using WeightMaskFuncs = WeightMaskFunc[6][6][2];
494
495 // Average blending function signature.
496 // Two predictors are averaged to generate the output.
497 // Input predictor values are int16_t. Output type is uint8_t, with actual
498 // range of Pixel value.
499 // Average blending is in the bottom of Section 7.11.3.1 (COMPOUND_AVERAGE).
500 // |prediction_0| is the first input block.
501 // |prediction_1| is the second input block. Both blocks are int16_t* when
502 // bitdepth == 8 and uint16_t* otherwise.
503 // |width| and |height| are the same for the first and second input blocks.
504 // The stride for the input buffers is equal to |width|.
505 // The valid range of block size is [8x8, 128x128] for the luma plane.
506 // |dest| is the output buffer. |dest_stride| is the output buffer stride.
507 using AverageBlendFunc = void (*)(const void* prediction_0,
508 const void* prediction_1, int width,
509 int height, void* dest,
510 ptrdiff_t dest_stride);
511
512 // Distance weighted blending function signature.
513 // Weights are generated in Section 7.11.3.15.
514 // Weighted blending is in the bottom of Section 7.11.3.1 (COMPOUND_DISTANCE).
515 // This function takes two blocks (inter frame prediction) and produces a
516 // weighted output.
517 // |prediction_0| is the first input block.
518 // |prediction_1| is the second input block. Both blocks are int16_t* when
519 // bitdepth == 8 and uint16_t* otherwise.
520 // |weight_0| is the weight for the first block. It is derived from the relative
521 // distance of the first reference frame and the current frame.
522 // |weight_1| is the weight for the second block. It is derived from the
523 // relative distance of the second reference frame and the current frame.
524 // |width| and |height| are the same for the first and second input blocks.
525 // The stride for the input buffers is equal to |width|.
526 // The valid range of block size is [8x8, 128x128] for the luma plane.
527 // |dest| is the output buffer. |dest_stride| is the output buffer stride.
528 using DistanceWeightedBlendFunc = void (*)(const void* prediction_0,
529 const void* prediction_1,
530 uint8_t weight_0, uint8_t weight_1,
531 int width, int height, void* dest,
532 ptrdiff_t dest_stride);
533
534 // Mask blending function signature. Section 7.11.3.14.
535 // This function takes two blocks and produces a blended output stored into the
536 // output block |dest|. The blending is a weighted average process, controlled
537 // by values of the mask.
538 // |prediction_0| is the first input block. When prediction mode is inter_intra
539 // (or wedge_inter_intra), this refers to the inter frame prediction. It is
540 // int16_t* when bitdepth == 8 and uint16_t* otherwise.
541 // The stride for |prediction_0| is equal to |width|.
542 // |prediction_1| is the second input block. When prediction mode is inter_intra
543 // (or wedge_inter_intra), this refers to the intra frame prediction and uses
544 // Pixel values. It is only used for intra frame prediction when bitdepth >= 10.
545 // It is int16_t* when bitdepth == 8 and uint16_t* otherwise.
546 // |prediction_stride_1| is the stride, given in units of [u]int16_t. When
547 // |is_inter_intra| is false (compound prediction) then |prediction_stride_1| is
548 // equal to |width|.
549 // |mask| is an integer array, whose value indicates the weight of the blending.
550 // |mask_stride| is corresponding stride.
551 // |width|, |height| are the same for both input blocks.
552 // If it's inter_intra (or wedge_inter_intra), the valid range of block size is
553 // [8x8, 32x32]. Otherwise (including difference weighted prediction and
554 // compound average prediction), the valid range is [8x8, 128x128].
555 // If there's subsampling, the corresponding width and height are halved for
556 // chroma planes.
// |subsampling_x|, |subsampling_y| are the subsampling factors. They are not
// parameters; their sum is the first index of MaskBlendFuncs.
558 // |is_inter_intra| stands for the prediction mode. If it is true, one of the
559 // prediction blocks is from intra prediction of current frame. Otherwise, two
560 // prediction blocks are both inter frame predictions.
561 // |is_wedge_inter_intra| indicates if the mask is for the wedge prediction.
562 // |dest| is the output block.
563 // |dest_stride| is the corresponding stride for dest.
564 using MaskBlendFunc = void (*)(const void* prediction_0,
565 const void* prediction_1,
566 ptrdiff_t prediction_stride_1,
567 const uint8_t* mask, ptrdiff_t mask_stride,
568 int width, int height, void* dest,
569 ptrdiff_t dest_stride);
570
571 // Mask blending functions signature. Each points to one function with
572 // a specific setting:
573 // MaskBlendFunc[subsampling_x + subsampling_y][is_inter_intra].
574 using MaskBlendFuncs = MaskBlendFunc[3][2];
575
576 // This function is similar to the MaskBlendFunc. It is only used when
577 // |is_inter_intra| is true and |bitdepth| == 8.
578 // |prediction_[01]| are Pixel values (uint8_t).
579 // |prediction_1| is also the output buffer.
580 using InterIntraMaskBlendFunc8bpp = void (*)(const uint8_t* prediction_0,
581 uint8_t* prediction_1,
582 ptrdiff_t prediction_stride_1,
583 const uint8_t* mask,
584 ptrdiff_t mask_stride, int width,
585 int height);
586
587 // InterIntra8bpp mask blending functions signature. When is_wedge_inter_intra
588 // is false, the function at index 0 must be used. Otherwise, the function at
589 // index subsampling_x + subsampling_y must be used.
590 using InterIntraMaskBlendFuncs8bpp = InterIntraMaskBlendFunc8bpp[3];
591
592 // Obmc (overlapped block motion compensation) blending function signature.
593 // Section 7.11.3.10.
594 // This function takes two blocks and produces a blended output stored into the
595 // first input block. The blending is a weighted average process, controlled by
596 // values of the mask.
597 // Obmc is not a compound mode. It is different from other compound blending,
598 // in terms of precision. The current block is computed using convolution with
599 // clipping to the range of pixel values. Its above and left blocks are also
600 // clipped. Therefore obmc blending process doesn't need to clip the output.
601 // |prediction| is the first input block, which will be overwritten.
602 // |prediction_stride| is the stride, given in bytes.
603 // |width|, |height| are the same for both input blocks.
604 // |obmc_prediction| is the second input block.
605 // |obmc_prediction_stride| is its stride, given in bytes.
606 using ObmcBlendFunc = void (*)(void* prediction, ptrdiff_t prediction_stride,
607 int width, int height,
608 const void* obmc_prediction,
609 ptrdiff_t obmc_prediction_stride);
610 using ObmcBlendFuncs = ObmcBlendFunc[kNumObmcDirections];
611
612 // Warp function signature. Section 7.11.3.5.
613 // This function applies warp filtering for each 8x8 block inside the current
614 // coding block. The filtering process is similar to 2d convolve filtering.
615 // The horizontal filter is applied followed by the vertical filter.
616 // The function has to calculate corresponding pixel positions before and
617 // after warping.
618 // |source| is the input reference frame buffer.
619 // |source_stride|, |source_width|, |source_height| are corresponding frame
620 // stride, width, and height. |source_stride| is given in bytes.
621 // |warp_params| is the matrix of warp motion: warp_params[i] = mN.
//         [x'     (m2 m3 m0   [x
//     z .  y'  =   m4 m5 m1 *  y
//          1]      m6 m7 1)    1]
625 // |subsampling_x/y| is the current frame's plane subsampling factor.
// |block_start_x| and |block_start_y| are the starting position of the current
// coding block.
628 // |block_width| and |block_height| are width and height of the current coding
629 // block. |block_width| and |block_height| are at least 8.
630 // |alpha|, |beta|, |gamma|, |delta| are valid warp parameters. See the
631 // comments in the definition of struct GlobalMotion for the range of their
632 // values.
633 // |dest| is the output buffer of type Pixel. The output values are clipped to
634 // Pixel values.
635 // |dest_stride| is the stride, in units of bytes.
636 // Rounding precision is derived from the function being called. For horizontal
637 // filtering kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp will be
638 // used. For vertical filtering kInterRoundBitsVertical &
639 // kInterRoundBitsVertical12bpp will be used.
640 //
641 // NOTE: WarpFunc assumes the source frame has left, right, top, and bottom
642 // borders that extend the frame boundary pixels.
643 // * The left and right borders must be at least 13 pixels wide. In addition,
644 // Warp_NEON() may read up to 14 bytes after a row in the |source| buffer.
645 // Therefore, there must be at least one extra padding byte after the right
646 // border of the last row in the source buffer.
647 // * The top and bottom borders must be at least 13 pixels high.
using WarpFunc = void (*)(
    const void* source, ptrdiff_t source_stride,  // Stride is in bytes.
    int source_width, int source_height,
    const int* warp_params,  // warp_params[i] = mN of the warp matrix.
    int subsampling_x, int subsampling_y,
    int block_start_x, int block_start_y,
    int block_width, int block_height,  // Both are at least 8.
    int16_t alpha, int16_t beta, int16_t gamma, int16_t delta,
    void* dest, ptrdiff_t dest_stride);  // For WarpFunc the stride is in bytes.

// Warp function signature for compound predictions. Section 7.11.3.5.
// The signature is shared with WarpFunc. The differences are:
//   * |dest| is a uint16_t predictor buffer.
//   * |dest_stride| is given in units of uint16_t.
//   * |inter_round_bits_vertical| is always 7
//     (kCompoundInterRoundBitsVertical).
// The rounding precision is derived from the function being called:
// kInterRoundBitsHorizontal & kInterRoundBitsHorizontal12bpp are used for
// horizontal filtering and kInterRoundBitsCompoundVertical is used for
// vertical filtering.
using WarpCompoundFunc = WarpFunc;
664
665 constexpr int kNumAutoRegressionLags = 4;
666 // Applies an auto-regressive filter to the white noise in |luma_grain_buffer|.
667 // Section 7.18.3.3, second code block
668 // |params| are parameters read from frame header, mainly providing
669 // auto_regression_coeff_y for the filter and auto_regression_shift to right
670 // shift the filter sum by. Note: This method assumes
671 // params.auto_regression_coeff_lag is not 0. Do not call this method if
672 // params.auto_regression_coeff_lag is 0.
673 using LumaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
674 void* luma_grain_buffer);
675 // Function index is auto_regression_coeff_lag - 1.
676 using LumaAutoRegressionFuncs =
677 LumaAutoRegressionFunc[kNumAutoRegressionLags - 1];
678
679 // Applies an auto-regressive filter to the white noise in u_grain and v_grain.
680 // Section 7.18.3.3, third code block
681 // The |luma_grain_buffer| provides samples that are added to the autoregressive
682 // sum when num_y_points > 0.
683 // |u_grain_buffer| and |v_grain_buffer| point to the buffers of chroma noise
684 // that were generated from the stored Gaussian sequence, and are overwritten
685 // with the results of the autoregressive filter. |params| are parameters read
686 // from frame header, mainly providing auto_regression_coeff_u and
687 // auto_regression_coeff_v for each chroma plane's filter, and
688 // auto_regression_shift to right shift the filter sums by.
689 using ChromaAutoRegressionFunc = void (*)(const FilmGrainParams& params,
690 const void* luma_grain_buffer,
691 int subsampling_x, int subsampling_y,
692 void* u_grain_buffer,
693 void* v_grain_buffer);
694 using ChromaAutoRegressionFuncs =
695 ChromaAutoRegressionFunc[/*use_luma*/ 2][kNumAutoRegressionLags];
696
// Noise stripe construction function signature. Section 7.18.3.5, first code
// block.
// Builds an image-wide "stripe" of grain noise for every 32 rows in the image.
// Each 32x32 luma block is copied, at a random offset specified via
// |grain_seed|, from the grain template produced by autoregression. The same
// is done for chroma grains, subject to subsampling.
// |width| and |height| are the dimensions of the overall image.
// |noise_stripes_buffer| points to an Array2DView with one row for each stripe.
// All planes are treated identically and independently, so the function is
// simplified to take one grain buffer at a time. This duplicates some random
// number generation, but that work can be reduced in other ways.
using ConstructNoiseStripesFunc = void (*)(
    const void* grain_buffer, int grain_seed,
    int width, int height,
    int subsampling_x, int subsampling_y,
    void* noise_stripes_buffer);

// Table of ConstructNoiseStripesFunc indexed by overlap_flag.
using ConstructNoiseStripesFuncs =
    ConstructNoiseStripesFunc[/*overlap_flag*/ 2];
714
// Noise image overlap function signature. Section 7.18.3.5, second code block.
// Computes the one or two overlap rows for each stripe copied to the noise
// image.
// |noise_stripes_buffer| points to an Array2DView with one row for each stripe.
// |width| and |height| are the dimensions of the overall image.
// |noise_image_buffer| points to an Array2D containing the allocated plane for
// this frame.
// All planes are treated identically and independently, so the function is
// simplified to take one grain buffer at a time.
using ConstructNoiseImageOverlapFunc = void (*)(
    const void* noise_stripes_buffer,
    int width, int height,
    int subsampling_x, int subsampling_y,
    void* noise_image_buffer);
726
727 // Populate a scaling lookup table with interpolated values of a piecewise
728 // linear function where values in |point_value| are mapped to the values in
729 // |point_scaling|.
730 // |num_points| can be between 0 and 15. When 0, the lookup table is set to
731 // zero.
732 // |point_value| and |point_scaling| have |num_points| valid elements.
733 using InitializeScalingLutFunc = void (*)(
734 int num_points, const uint8_t point_value[], const uint8_t point_scaling[],
735 uint8_t scaling_lut[kScalingLookupTableSize]);
736
737 // Blend noise with image. Section 7.18.3.5, third code block.
738 // |width| is the width of each row, while |height| is how many rows to compute.
739 // |start_height| is an offset for the noise image, to support multithreading.
740 // |min_value|, |max_luma|, and |max_chroma| are computed by the caller of these
741 // functions, according to the code in the spec.
742 // |source_plane_y| and |source_plane_uv| are the plane buffers of the decoded
743 // frame. They are blended with the film grain noise and written to
744 // |dest_plane_y| and |dest_plane_uv| as final output for display.
745 // source_plane_* and dest_plane_* may point to the same buffer, in which case
746 // the film grain noise is added in place.
747 // |scaling_lut_y| and |scaling_lut| represent a piecewise linear mapping from
748 // the frame's raw pixel value, to a scaling factor for the noise sample.
749 // |scaling_shift| is applied as a right shift after scaling, so that scaling
750 // down is possible. It is found in FilmGrainParams, but supplied directly to
751 // BlendNoiseWithImageLumaFunc because it's the only member used.
752 using BlendNoiseWithImageLumaFunc =
753 void (*)(const void* noise_image_ptr, int min_value, int max_value,
754 int scaling_shift, int width, int height, int start_height,
755 const uint8_t scaling_lut_y[kScalingLookupTableSize],
756 const void* source_plane_y, ptrdiff_t source_stride_y,
757 void* dest_plane_y, ptrdiff_t dest_stride_y);
758
759 using BlendNoiseWithImageChromaFunc = void (*)(
760 Plane plane, const FilmGrainParams& params, const void* noise_image_ptr,
761 int min_value, int max_value, int width, int height, int start_height,
762 int subsampling_x, int subsampling_y,
763 const uint8_t scaling_lut[kScalingLookupTableSize],
764 const void* source_plane_y, ptrdiff_t source_stride_y,
765 const void* source_plane_uv, ptrdiff_t source_stride_uv,
766 void* dest_plane_uv, ptrdiff_t dest_stride_uv);
767
768 using BlendNoiseWithImageChromaFuncs =
769 BlendNoiseWithImageChromaFunc[/*chroma_scaling_from_luma*/ 2];
770
771 //------------------------------------------------------------------------------
772
773 struct FilmGrainFuncs {
774 LumaAutoRegressionFuncs luma_auto_regression;
775 ChromaAutoRegressionFuncs chroma_auto_regression;
776 ConstructNoiseStripesFuncs construct_noise_stripes;
777 ConstructNoiseImageOverlapFunc construct_noise_image_overlap;
778 InitializeScalingLutFunc initialize_scaling_lut;
779 BlendNoiseWithImageLumaFunc blend_noise_luma;
780 BlendNoiseWithImageChromaFuncs blend_noise_chroma;
781 };
782
783 // Motion field projection function signature. Section 7.9.
784 // |reference_info| provides reference information for motion field projection.
785 // |reference_to_current_with_sign| is the precalculated reference frame id
786 // distance from current frame.
787 // |dst_sign| is -1 for LAST_FRAME and LAST2_FRAME, or 0 (1 in spec) for others.
788 // |y8_start| and |y8_end| are the start and end 8x8 rows of the current tile.
789 // |x8_start| and |x8_end| are the start and end 8x8 columns of the current
790 // tile.
791 // |motion_field| is the output which saves the projected motion field
792 // information.
793 using MotionFieldProjectionKernelFunc = void (*)(
794 const ReferenceInfo& reference_info, int reference_to_current_with_sign,
795 int dst_sign, int y8_start, int y8_end, int x8_start, int x8_end,
796 TemporalMotionField* motion_field);
797
798 // Compound temporal motion vector projection function signature.
799 // Section 7.9.3 and 7.10.2.10.
800 // |temporal_mvs| is the set of temporal reference motion vectors.
801 // |temporal_reference_offsets| specifies the number of frames covered by the
802 // original motion vector.
803 // |reference_offsets| specifies the number of frames to be covered by the
804 // projected motion vector.
805 // |count| is the number of the temporal motion vectors.
806 // |candidate_mvs| is the set of projected motion vectors.
807 using MvProjectionCompoundFunc = void (*)(
808 const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
809 const int reference_offsets[2], int count,
810 CompoundMotionVector* candidate_mvs);
811
812 // Single temporal motion vector projection function signature.
813 // Section 7.9.3 and 7.10.2.10.
814 // |temporal_mvs| is the set of temporal reference motion vectors.
815 // |temporal_reference_offsets| specifies the number of frames covered by the
816 // original motion vector.
817 // |reference_offset| specifies the number of frames to be covered by the
818 // projected motion vector.
819 // |count| is the number of the temporal motion vectors.
820 // |candidate_mvs| is the set of projected motion vectors.
821 using MvProjectionSingleFunc = void (*)(
822 const MotionVector* temporal_mvs, const int8_t* temporal_reference_offsets,
823 int reference_offset, int count, MotionVector* candidate_mvs);
824
825 struct Dsp {
826 AverageBlendFunc average_blend;
827 CdefDirectionFunc cdef_direction;
828 CdefFilteringFuncs cdef_filters;
829 CflIntraPredictorFuncs cfl_intra_predictors;
830 CflSubsamplerFuncs cfl_subsamplers;
831 ConvolveFuncs convolve;
832 ConvolveScaleFuncs convolve_scale;
833 DirectionalIntraPredictorZone1Func directional_intra_predictor_zone1;
834 DirectionalIntraPredictorZone2Func directional_intra_predictor_zone2;
835 DirectionalIntraPredictorZone3Func directional_intra_predictor_zone3;
836 DistanceWeightedBlendFunc distance_weighted_blend;
837 FilmGrainFuncs film_grain;
838 FilterIntraPredictorFunc filter_intra_predictor;
839 InterIntraMaskBlendFuncs8bpp inter_intra_mask_blend_8bpp;
840 IntraEdgeFilterFunc intra_edge_filter;
841 IntraEdgeUpsamplerFunc intra_edge_upsampler;
842 IntraPredictorFuncs intra_predictors;
843 InverseTransformAddFuncs inverse_transforms;
844 LoopFilterFuncs loop_filters;
845 LoopRestorationFuncs loop_restorations;
846 MaskBlendFuncs mask_blend;
847 MotionFieldProjectionKernelFunc motion_field_projection_kernel;
848 MvProjectionCompoundFunc mv_projection_compound[3];
849 MvProjectionSingleFunc mv_projection_single[3];
850 ObmcBlendFuncs obmc_blend;
851 SuperResCoefficientsFunc super_res_coefficients;
852 SuperResFunc super_res;
853 WarpCompoundFunc warp_compound;
854 WarpFunc warp;
855 WeightMaskFuncs weight_mask;
856 };
857
// Sets up the function pointers according to the build config and the runtime
// environment. This must be called once before the first use of the function
// pointers. This function is thread-safe.
void DspInit();
862
863 // Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
864 // exist.
865 const Dsp* GetDspTable(int bitdepth);
866
867 } // namespace dsp
868
869 namespace dsp_internal {
870
871 // Visual Studio builds don't have a way to detect SSE4_1. Only exclude the C
872 // functions if /arch:AVX2 is used across all sources.
873 #if !LIBGAV1_TARGETING_AVX2 && \
874 (defined(_MSC_VER) || (defined(_M_IX86) || defined(_M_X64)))
875 #undef LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS
876 #define LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS 1
877 #endif
878
879 // Returns true if a more highly optimized version of |func| is not defined for
880 // the associated bitdepth or if it is forcibly enabled with
881 // LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS. The define checked for |func| corresponds
882 // to the LIBGAV1_Dsp<bitdepth>bpp_|func| define in the header file associated
883 // with the module.
884 // |func| is one of:
885 // - FunctionName, e.g., SelfGuidedFilter.
886 // - [sub-table-index1][...-indexN] e.g.,
887 // TransformSize4x4_IntraPredictorDc. The indices correspond to enum values
888 // used as lookups with leading 'k' removed.
889 //
890 // NEON support is the only extension available for ARM and it is always
891 // required. Because of this restriction DSP_ENABLED_8BPP_NEON(func) is always
892 // true and can be omitted.
893 #define DSP_ENABLED_8BPP_AVX2(func) \
894 (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
895 LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_AVX2)
896 #define DSP_ENABLED_10BPP_AVX2(func) \
897 (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
898 LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_AVX2)
899 #define DSP_ENABLED_8BPP_SSE4_1(func) \
900 (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
901 LIBGAV1_Dsp8bpp_##func == LIBGAV1_CPU_SSE4_1)
902 #define DSP_ENABLED_10BPP_SSE4_1(func) \
903 (LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS || \
904 LIBGAV1_Dsp10bpp_##func == LIBGAV1_CPU_SSE4_1)
905
906 // Initializes C-only function pointers. Note some entries may be set to
907 // nullptr if LIBGAV1_ENABLE_ALL_DSP_FUNCTIONS is not defined. This is meant
908 // for use in tests only, it is not thread-safe.
909 void DspInit_C();
910
911 // Returns the appropriate Dsp table for |bitdepth| or nullptr if one doesn't
912 // exist. This version is meant for use by test or dsp/*Init() functions only.
913 dsp::Dsp* GetWritableDspTable(int bitdepth);
914
915 } // namespace dsp_internal
916 } // namespace libgav1
917
918 #endif // LIBGAV1_SRC_DSP_DSP_H_
919