1 // Copyright 2020 The libgav1 Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "src/film_grain.h"
16
17 #include <algorithm>
18 #include <atomic>
19 #include <cassert>
20 #include <cstddef>
21 #include <cstdint>
22 #include <cstring>
23 #include <new>
24
25 #include "src/dsp/common.h"
26 #include "src/dsp/constants.h"
27 #include "src/dsp/dsp.h"
28 #include "src/dsp/film_grain_common.h"
29 #include "src/utils/array_2d.h"
30 #include "src/utils/blocking_counter.h"
31 #include "src/utils/common.h"
32 #include "src/utils/compiler_attributes.h"
33 #include "src/utils/constants.h"
34 #include "src/utils/logging.h"
35 #include "src/utils/threadpool.h"
36
37 namespace libgav1 {
38
39 namespace {
40
41 // The kGaussianSequence array contains random samples from a Gaussian
42 // distribution with zero mean and standard deviation of about 512 clipped to
43 // the range of [-2048, 2047] (representable by a signed integer using 12 bits
44 // of precision) and rounded to the nearest multiple of 4.
45 //
46 // Note: It is important that every element in the kGaussianSequence array be
47 // less than 2040, so that RightShiftWithRounding(kGaussianSequence[i], 4) is
48 // less than 128 for bitdepth=8 (GrainType=int8_t).
49 constexpr int16_t kGaussianSequence[/*2048*/] = {
50 56, 568, -180, 172, 124, -84, 172, -64, -900, 24, 820,
51 224, 1248, 996, 272, -8, -916, -388, -732, -104, -188, 800,
52 112, -652, -320, -376, 140, -252, 492, -168, 44, -788, 588,
53 -584, 500, -228, 12, 680, 272, -476, 972, -100, 652, 368,
54 432, -196, -720, -192, 1000, -332, 652, -136, -552, -604, -4,
55 192, -220, -136, 1000, -52, 372, -96, -624, 124, -24, 396,
56 540, -12, -104, 640, 464, 244, -208, -84, 368, -528, -740,
57 248, -968, -848, 608, 376, -60, -292, -40, -156, 252, -292,
58 248, 224, -280, 400, -244, 244, -60, 76, -80, 212, 532,
59 340, 128, -36, 824, -352, -60, -264, -96, -612, 416, -704,
60 220, -204, 640, -160, 1220, -408, 900, 336, 20, -336, -96,
61 -792, 304, 48, -28, -1232, -1172, -448, 104, -292, -520, 244,
62 60, -948, 0, -708, 268, 108, 356, -548, 488, -344, -136,
63 488, -196, -224, 656, -236, -1128, 60, 4, 140, 276, -676,
64 -376, 168, -108, 464, 8, 564, 64, 240, 308, -300, -400,
65 -456, -136, 56, 120, -408, -116, 436, 504, -232, 328, 844,
66 -164, -84, 784, -168, 232, -224, 348, -376, 128, 568, 96,
67 -1244, -288, 276, 848, 832, -360, 656, 464, -384, -332, -356,
68 728, -388, 160, -192, 468, 296, 224, 140, -776, -100, 280,
69 4, 196, 44, -36, -648, 932, 16, 1428, 28, 528, 808,
70 772, 20, 268, 88, -332, -284, 124, -384, -448, 208, -228,
71 -1044, -328, 660, 380, -148, -300, 588, 240, 540, 28, 136,
72 -88, -436, 256, 296, -1000, 1400, 0, -48, 1056, -136, 264,
73 -528, -1108, 632, -484, -592, -344, 796, 124, -668, -768, 388,
74 1296, -232, -188, -200, -288, -4, 308, 100, -168, 256, -500,
75 204, -508, 648, -136, 372, -272, -120, -1004, -552, -548, -384,
76 548, -296, 428, -108, -8, -912, -324, -224, -88, -112, -220,
77 -100, 996, -796, 548, 360, -216, 180, 428, -200, -212, 148,
78 96, 148, 284, 216, -412, -320, 120, -300, -384, -604, -572,
79 -332, -8, -180, -176, 696, 116, -88, 628, 76, 44, -516,
80 240, -208, -40, 100, -592, 344, -308, -452, -228, 20, 916,
81 -1752, -136, -340, -804, 140, 40, 512, 340, 248, 184, -492,
82 896, -156, 932, -628, 328, -688, -448, -616, -752, -100, 560,
83 -1020, 180, -800, -64, 76, 576, 1068, 396, 660, 552, -108,
84 -28, 320, -628, 312, -92, -92, -472, 268, 16, 560, 516,
85 -672, -52, 492, -100, 260, 384, 284, 292, 304, -148, 88,
86 -152, 1012, 1064, -228, 164, -376, -684, 592, -392, 156, 196,
87 -524, -64, -884, 160, -176, 636, 648, 404, -396, -436, 864,
88 424, -728, 988, -604, 904, -592, 296, -224, 536, -176, -920,
89 436, -48, 1176, -884, 416, -776, -824, -884, 524, -548, -564,
90 -68, -164, -96, 692, 364, -692, -1012, -68, 260, -480, 876,
91 -1116, 452, -332, -352, 892, -1088, 1220, -676, 12, -292, 244,
92 496, 372, -32, 280, 200, 112, -440, -96, 24, -644, -184,
93 56, -432, 224, -980, 272, -260, 144, -436, 420, 356, 364,
94 -528, 76, 172, -744, -368, 404, -752, -416, 684, -688, 72,
95 540, 416, 92, 444, 480, -72, -1416, 164, -1172, -68, 24,
96 424, 264, 1040, 128, -912, -524, -356, 64, 876, -12, 4,
97 -88, 532, 272, -524, 320, 276, -508, 940, 24, -400, -120,
98 756, 60, 236, -412, 100, 376, -484, 400, -100, -740, -108,
99 -260, 328, -268, 224, -200, -416, 184, -604, -564, -20, 296,
100 60, 892, -888, 60, 164, 68, -760, 216, -296, 904, -336,
101 -28, 404, -356, -568, -208, -1480, -512, 296, 328, -360, -164,
102 -1560, -776, 1156, -428, 164, -504, -112, 120, -216, -148, -264,
103 308, 32, 64, -72, 72, 116, 176, -64, -272, 460, -536,
104 -784, -280, 348, 108, -752, -132, 524, -540, -776, 116, -296,
105 -1196, -288, -560, 1040, -472, 116, -848, -1116, 116, 636, 696,
106 284, -176, 1016, 204, -864, -648, -248, 356, 972, -584, -204,
107 264, 880, 528, -24, -184, 116, 448, -144, 828, 524, 212,
108 -212, 52, 12, 200, 268, -488, -404, -880, 824, -672, -40,
109 908, -248, 500, 716, -576, 492, -576, 16, 720, -108, 384,
110 124, 344, 280, 576, -500, 252, 104, -308, 196, -188, -8,
111 1268, 296, 1032, -1196, 436, 316, 372, -432, -200, -660, 704,
112 -224, 596, -132, 268, 32, -452, 884, 104, -1008, 424, -1348,
113 -280, 4, -1168, 368, 476, 696, 300, -8, 24, 180, -592,
114 -196, 388, 304, 500, 724, -160, 244, -84, 272, -256, -420,
115 320, 208, -144, -156, 156, 364, 452, 28, 540, 316, 220,
116 -644, -248, 464, 72, 360, 32, -388, 496, -680, -48, 208,
117 -116, -408, 60, -604, -392, 548, -840, 784, -460, 656, -544,
118 -388, -264, 908, -800, -628, -612, -568, 572, -220, 164, 288,
119 -16, -308, 308, -112, -636, -760, 280, -668, 432, 364, 240,
120 -196, 604, 340, 384, 196, 592, -44, -500, 432, -580, -132,
121 636, -76, 392, 4, -412, 540, 508, 328, -356, -36, 16,
122 -220, -64, -248, -60, 24, -192, 368, 1040, 92, -24, -1044,
123 -32, 40, 104, 148, 192, -136, -520, 56, -816, -224, 732,
124 392, 356, 212, -80, -424, -1008, -324, 588, -1496, 576, 460,
125 -816, -848, 56, -580, -92, -1372, -112, -496, 200, 364, 52,
126 -140, 48, -48, -60, 84, 72, 40, 132, -356, -268, -104,
127 -284, -404, 732, -520, 164, -304, -540, 120, 328, -76, -460,
128 756, 388, 588, 236, -436, -72, -176, -404, -316, -148, 716,
129 -604, 404, -72, -88, -888, -68, 944, 88, -220, -344, 960,
130 472, 460, -232, 704, 120, 832, -228, 692, -508, 132, -476,
131 844, -748, -364, -44, 1116, -1104, -1056, 76, 428, 552, -692,
132 60, 356, 96, -384, -188, -612, -576, 736, 508, 892, 352,
133 -1132, 504, -24, -352, 324, 332, -600, -312, 292, 508, -144,
134 -8, 484, 48, 284, -260, -240, 256, -100, -292, -204, -44,
135 472, -204, 908, -188, -1000, -256, 92, 1164, -392, 564, 356,
136 652, -28, -884, 256, 484, -192, 760, -176, 376, -524, -452,
137 -436, 860, -736, 212, 124, 504, -476, 468, 76, -472, 552,
138 -692, -944, -620, 740, -240, 400, 132, 20, 192, -196, 264,
139 -668, -1012, -60, 296, -316, -828, 76, -156, 284, -768, -448,
140 -832, 148, 248, 652, 616, 1236, 288, -328, -400, -124, 588,
141 220, 520, -696, 1032, 768, -740, -92, -272, 296, 448, -464,
142 412, -200, 392, 440, -200, 264, -152, -260, 320, 1032, 216,
143 320, -8, -64, 156, -1016, 1084, 1172, 536, 484, -432, 132,
144 372, -52, -256, 84, 116, -352, 48, 116, 304, -384, 412,
145 924, -300, 528, 628, 180, 648, 44, -980, -220, 1320, 48,
146 332, 748, 524, -268, -720, 540, -276, 564, -344, -208, -196,
147 436, 896, 88, -392, 132, 80, -964, -288, 568, 56, -48,
148 -456, 888, 8, 552, -156, -292, 948, 288, 128, -716, -292,
149 1192, -152, 876, 352, -600, -260, -812, -468, -28, -120, -32,
150 -44, 1284, 496, 192, 464, 312, -76, -516, -380, -456, -1012,
151 -48, 308, -156, 36, 492, -156, -808, 188, 1652, 68, -120,
152 -116, 316, 160, -140, 352, 808, -416, 592, 316, -480, 56,
153 528, -204, -568, 372, -232, 752, -344, 744, -4, 324, -416,
154 -600, 768, 268, -248, -88, -132, -420, -432, 80, -288, 404,
155 -316, -1216, -588, 520, -108, 92, -320, 368, -480, -216, -92,
156 1688, -300, 180, 1020, -176, 820, -68, -228, -260, 436, -904,
157 20, 40, -508, 440, -736, 312, 332, 204, 760, -372, 728,
158 96, -20, -632, -520, -560, 336, 1076, -64, -532, 776, 584,
159 192, 396, -728, -520, 276, -188, 80, -52, -612, -252, -48,
160 648, 212, -688, 228, -52, -260, 428, -412, -272, -404, 180,
161 816, -796, 48, 152, 484, -88, -216, 988, 696, 188, -528,
162 648, -116, -180, 316, 476, 12, -564, 96, 476, -252, -364,
163 -376, -392, 556, -256, -576, 260, -352, 120, -16, -136, -260,
164 -492, 72, 556, 660, 580, 616, 772, 436, 424, -32, -324,
165 -1268, 416, -324, -80, 920, 160, 228, 724, 32, -516, 64,
166 384, 68, -128, 136, 240, 248, -204, -68, 252, -932, -120,
167 -480, -628, -84, 192, 852, -404, -288, -132, 204, 100, 168,
168 -68, -196, -868, 460, 1080, 380, -80, 244, 0, 484, -888,
169 64, 184, 352, 600, 460, 164, 604, -196, 320, -64, 588,
170 -184, 228, 12, 372, 48, -848, -344, 224, 208, -200, 484,
171 128, -20, 272, -468, -840, 384, 256, -720, -520, -464, -580,
172 112, -120, 644, -356, -208, -608, -528, 704, 560, -424, 392,
173 828, 40, 84, 200, -152, 0, -144, 584, 280, -120, 80,
174 -556, -972, -196, -472, 724, 80, 168, -32, 88, 160, -688,
175 0, 160, 356, 372, -776, 740, -128, 676, -248, -480, 4,
176 -364, 96, 544, 232, -1032, 956, 236, 356, 20, -40, 300,
177 24, -676, -596, 132, 1120, -104, 532, -1096, 568, 648, 444,
178 508, 380, 188, -376, -604, 1488, 424, 24, 756, -220, -192,
179 716, 120, 920, 688, 168, 44, -460, 568, 284, 1144, 1160,
180 600, 424, 888, 656, -356, -320, 220, 316, -176, -724, -188,
181 -816, -628, -348, -228, -380, 1012, -452, -660, 736, 928, 404,
182 -696, -72, -268, -892, 128, 184, -344, -780, 360, 336, 400,
183 344, 428, 548, -112, 136, -228, -216, -820, -516, 340, 92,
184 -136, 116, -300, 376, -244, 100, -316, -520, -284, -12, 824,
185 164, -548, -180, -128, 116, -924, -828, 268, -368, -580, 620,
186 192, 160, 0, -1676, 1068, 424, -56, -360, 468, -156, 720,
187 288, -528, 556, -364, 548, -148, 504, 316, 152, -648, -620,
188 -684, -24, -376, -384, -108, -920, -1032, 768, 180, -264, -508,
189 -1268, -260, -60, 300, -240, 988, 724, -376, -576, -212, -736,
190 556, 192, 1092, -620, -880, 376, -56, -4, -216, -32, 836,
191 268, 396, 1332, 864, -600, 100, 56, -412, -92, 356, 180,
192 884, -468, -436, 292, -388, -804, -704, -840, 368, -348, 140,
193 -724, 1536, 940, 372, 112, -372, 436, -480, 1136, 296, -32,
194 -228, 132, -48, -220, 868, -1016, -60, -1044, -464, 328, 916,
195 244, 12, -736, -296, 360, 468, -376, -108, -92, 788, 368,
196 -56, 544, 400, -672, -420, 728, 16, 320, 44, -284, -380,
197 -796, 488, 132, 204, -596, -372, 88, -152, -908, -636, -572,
198 -624, -116, -692, -200, -56, 276, -88, 484, -324, 948, 864,
199 1000, -456, -184, -276, 292, -296, 156, 676, 320, 160, 908,
200 -84, -1236, -288, -116, 260, -372, -644, 732, -756, -96, 84,
201 344, -520, 348, -688, 240, -84, 216, -1044, -136, -676, -396,
202 -1500, 960, -40, 176, 168, 1516, 420, -504, -344, -364, -360,
203 1216, -940, -380, -212, 252, -660, -708, 484, -444, -152, 928,
204 -120, 1112, 476, -260, 560, -148, -344, 108, -196, 228, -288,
205 504, 560, -328, -88, 288, -1008, 460, -228, 468, -836, -196,
206 76, 388, 232, 412, -1168, -716, -644, 756, -172, -356, -504,
207 116, 432, 528, 48, 476, -168, -608, 448, 160, -532, -272,
208 28, -676, -12, 828, 980, 456, 520, 104, -104, 256, -344,
209 -4, -28, -368, -52, -524, -572, -556, -200, 768, 1124, -208,
210 -512, 176, 232, 248, -148, -888, 604, -600, -304, 804, -156,
211 -212, 488, -192, -804, -256, 368, -360, -916, -328, 228, -240,
212 -448, -472, 856, -556, -364, 572, -12, -156, -368, -340, 432,
213 252, -752, -152, 288, 268, -580, -848, -592, 108, -76, 244,
214 312, -716, 592, -80, 436, 360, 4, -248, 160, 516, 584,
215 732, 44, -468, -280, -292, -156, -588, 28, 308, 912, 24,
216 124, 156, 180, -252, 944, -924, -772, -520, -428, -624, 300,
217 -212, -1144, 32, -724, 800, -1128, -212, -1288, -848, 180, -416,
218 440, 192, -576, -792, -76, -1080, 80, -532, -352, -132, 380,
219 -820, 148, 1112, 128, 164, 456, 700, -924, 144, -668, -384,
220 648, -832, 508, 552, -52, -100, -656, 208, -568, 748, -88,
221 680, 232, 300, 192, -408, -1012, -152, -252, -268, 272, -876,
222 -664, -648, -332, -136, 16, 12, 1152, -28, 332, -536, 320,
223 -672, -460, -316, 532, -260, 228, -40, 1052, -816, 180, 88,
224 -496, -556, -672, -368, 428, 92, 356, 404, -408, 252, 196,
225 -176, -556, 792, 268, 32, 372, 40, 96, -332, 328, 120,
226 372, -900, -40, 472, -264, -592, 952, 128, 656, 112, 664,
227 -232, 420, 4, -344, -464, 556, 244, -416, -32, 252, 0,
228 -412, 188, -696, 508, -476, 324, -1096, 656, -312, 560, 264,
229 -136, 304, 160, -64, -580, 248, 336, -720, 560, -348, -288,
230 -276, -196, -500, 852, -544, -236, -1128, -992, -776, 116, 56,
231 52, 860, 884, 212, -12, 168, 1020, 512, -552, 924, -148,
232 716, 188, 164, -340, -520, -184, 880, -152, -680, -208, -1156,
233 -300, -528, -472, 364, 100, -744, -1056, -32, 540, 280, 144,
234 -676, -32, -232, -280, -224, 96, 568, -76, 172, 148, 148,
235 104, 32, -296, -32, 788, -80, 32, -16, 280, 288, 944,
236 428, -484};
237 static_assert(sizeof(kGaussianSequence) / sizeof(kGaussianSequence[0]) == 2048,
238 "");
239
// The number of rows in a contiguous group computed by a single worker thread
// before checking for the next available group.
constexpr int kFrameChunkHeight = 8;
// The blend workers compute the height of the trailing partial chunk as
// |height_ & (kFrameChunkHeight - 1)|. That only matches
// |height_ % kFrameChunkHeight| when the chunk height is a power of two.
static_assert(kFrameChunkHeight > 0 &&
                  (kFrameChunkHeight & (kFrameChunkHeight - 1)) == 0,
              "kFrameChunkHeight must be a positive power of two.");
243
// Copies |height| rows of |width| samples of type |Pixel| from |source_plane|
// to |dest_plane|. |source_stride| and |dest_stride| are added to the byte
// pointers after each row, so they are expressed in bytes.
//
// |width| and |height| refer to the plane, not the frame, meaning any
// subsampling should be applied by the caller.
//
// Nothing is copied if both pointers refer to the same buffer or if the plane
// is empty (|width| or |height| is not positive).
template <typename Pixel>
inline void CopyImagePlane(const uint8_t* source_plane, ptrdiff_t source_stride,
                           int width, int height, uint8_t* dest_plane,
                           ptrdiff_t dest_stride) {
  // If it's the same buffer there's nothing to do.
  if (source_plane == dest_plane) return;
  // An empty plane has no rows. Returning early also keeps a non-positive
  // |width| from being converted into a huge unsigned copy length.
  if (width <= 0 || height <= 0) return;

  const size_t row_size = static_cast<size_t>(width) * sizeof(Pixel);
  for (int y = 0; y < height; ++y) {
    memcpy(dest_plane, source_plane, row_size);
    source_plane += source_stride;
    dest_plane += dest_stride;
  }
}
260
261 } // namespace
262
263 template <int bitdepth>
FilmGrain(const FilmGrainParams & params,bool is_monochrome,bool color_matrix_is_identity,int subsampling_x,int subsampling_y,int width,int height,ThreadPool * thread_pool)264 FilmGrain<bitdepth>::FilmGrain(const FilmGrainParams& params,
265 bool is_monochrome,
266 bool color_matrix_is_identity, int subsampling_x,
267 int subsampling_y, int width, int height,
268 ThreadPool* thread_pool)
269 : params_(params),
270 is_monochrome_(is_monochrome),
271 color_matrix_is_identity_(color_matrix_is_identity),
272 subsampling_x_(subsampling_x),
273 subsampling_y_(subsampling_y),
274 width_(width),
275 height_(height),
276 template_uv_width_((subsampling_x != 0) ? kMinChromaWidth
277 : kMaxChromaWidth),
278 template_uv_height_((subsampling_y != 0) ? kMinChromaHeight
279 : kMaxChromaHeight),
280 thread_pool_(thread_pool) {}
281
282 template <int bitdepth>
~FilmGrain()283 FilmGrain<bitdepth>::~FilmGrain() {
284 // Clear the earlier poisoning to avoid false reports when the memory range
285 // is reused.
286 ASAN_UNPOISON_MEMORY_REGION(luma_grain_, sizeof(luma_grain_));
287 ASAN_UNPOISON_MEMORY_REGION(scaling_lut_y_, sizeof(scaling_lut_y_));
288 }
289
290 template <int bitdepth>
Init()291 bool FilmGrain<bitdepth>::Init() {
292 // Section 7.18.3.3. Generate grain process.
293 const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth);
294 // If params_.num_y_points is 0, luma_grain_ will never be read, so we don't
295 // need to generate it.
296 const bool use_luma = params_.num_y_points > 0;
297 if (use_luma) {
298 GenerateLumaGrain(params_, luma_grain_);
299 // If params_.auto_regression_coeff_lag is 0, the filter is the identity
300 // filter and therefore can be skipped.
301 if (params_.auto_regression_coeff_lag > 0) {
302 dsp.film_grain
303 .luma_auto_regression[params_.auto_regression_coeff_lag - 1](
304 params_, luma_grain_);
305 }
306 } else {
307 // Have AddressSanitizer warn if luma_grain_ is used.
308 ASAN_POISON_MEMORY_REGION(luma_grain_, sizeof(luma_grain_));
309 }
310 if (!is_monochrome_) {
311 GenerateChromaGrains(params_, template_uv_width_, template_uv_height_,
312 u_grain_, v_grain_);
313 if (params_.auto_regression_coeff_lag > 0 || use_luma) {
314 dsp.film_grain.chroma_auto_regression[static_cast<int>(
315 use_luma)][params_.auto_regression_coeff_lag](
316 params_, luma_grain_, subsampling_x_, subsampling_y_, u_grain_,
317 v_grain_);
318 }
319 }
320
321 // Section 7.18.3.4. Scaling lookup initialization process.
322
323 // Initialize scaling_lut_y_. If params_.num_y_points > 0, scaling_lut_y_
324 // is used for the Y plane. If params_.chroma_scaling_from_luma is true,
325 // scaling_lut_u_ and scaling_lut_v_ are the same as scaling_lut_y_ and are
326 // set up as aliases. So we need to initialize scaling_lut_y_ under these
327 // two conditions.
328 //
329 // Note: Although it does not seem to make sense, there are test vectors
330 // with chroma_scaling_from_luma=true and params_.num_y_points=0.
331 #if LIBGAV1_MSAN
332 // Quiet film grain / md5 msan warnings.
333 memset(scaling_lut_y_, 0, sizeof(scaling_lut_y_));
334 #endif
335 if (use_luma || params_.chroma_scaling_from_luma) {
336 dsp.film_grain.initialize_scaling_lut(
337 params_.num_y_points, params_.point_y_value, params_.point_y_scaling,
338 scaling_lut_y_, kScalingLutLength);
339 } else {
340 ASAN_POISON_MEMORY_REGION(scaling_lut_y_, sizeof(scaling_lut_y_));
341 }
342 if (!is_monochrome_) {
343 if (params_.chroma_scaling_from_luma) {
344 scaling_lut_u_ = scaling_lut_y_;
345 scaling_lut_v_ = scaling_lut_y_;
346 } else if (params_.num_u_points > 0 || params_.num_v_points > 0) {
347 const size_t buffer_size =
348 kScalingLutLength * (static_cast<int>(params_.num_u_points > 0) +
349 static_cast<int>(params_.num_v_points > 0));
350 scaling_lut_chroma_buffer_.reset(new (std::nothrow) int16_t[buffer_size]);
351 if (scaling_lut_chroma_buffer_ == nullptr) return false;
352
353 int16_t* buffer = scaling_lut_chroma_buffer_.get();
354 #if LIBGAV1_MSAN
355 // Quiet film grain / md5 msan warnings.
356 memset(buffer, 0, buffer_size * 2);
357 #endif
358 if (params_.num_u_points > 0) {
359 scaling_lut_u_ = buffer;
360 dsp.film_grain.initialize_scaling_lut(
361 params_.num_u_points, params_.point_u_value,
362 params_.point_u_scaling, scaling_lut_u_, kScalingLutLength);
363 buffer += kScalingLutLength;
364 }
365 if (params_.num_v_points > 0) {
366 scaling_lut_v_ = buffer;
367 dsp.film_grain.initialize_scaling_lut(
368 params_.num_v_points, params_.point_v_value,
369 params_.point_v_scaling, scaling_lut_v_, kScalingLutLength);
370 }
371 }
372 }
373 return true;
374 }
375
376 template <int bitdepth>
GenerateLumaGrain(const FilmGrainParams & params,GrainType * luma_grain)377 void FilmGrain<bitdepth>::GenerateLumaGrain(const FilmGrainParams& params,
378 GrainType* luma_grain) {
379 // If params.num_y_points is equal to 0, Section 7.18.3.3 specifies we set
380 // the luma_grain array to all zeros. But the Note at the end of Section
381 // 7.18.3.3 says luma_grain "will never be read in this case". So we don't
382 // call GenerateLumaGrain if params.num_y_points is equal to 0.
383 assert(params.num_y_points > 0);
384 const int shift = kBitdepth12 - bitdepth + params.grain_scale_shift;
385 uint16_t seed = params.grain_seed;
386 GrainType* luma_grain_row = luma_grain;
387 for (int y = 0; y < kLumaHeight; ++y) {
388 for (int x = 0; x < kLumaWidth; ++x) {
389 luma_grain_row[x] = RightShiftWithRounding(
390 kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
391 }
392 luma_grain_row += kLumaWidth;
393 }
394 }
395
396 template <int bitdepth>
GenerateChromaGrains(const FilmGrainParams & params,int chroma_width,int chroma_height,GrainType * u_grain,GrainType * v_grain)397 void FilmGrain<bitdepth>::GenerateChromaGrains(const FilmGrainParams& params,
398 int chroma_width,
399 int chroma_height,
400 GrainType* u_grain,
401 GrainType* v_grain) {
402 const int shift = kBitdepth12 - bitdepth + params.grain_scale_shift;
403 if (params.num_u_points == 0 && !params.chroma_scaling_from_luma) {
404 memset(u_grain, 0, chroma_height * chroma_width * sizeof(*u_grain));
405 } else {
406 uint16_t seed = params.grain_seed ^ 0xb524;
407 GrainType* u_grain_row = u_grain;
408 assert(chroma_width > 0);
409 assert(chroma_height > 0);
410 int y = 0;
411 do {
412 int x = 0;
413 do {
414 u_grain_row[x] = RightShiftWithRounding(
415 kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
416 } while (++x < chroma_width);
417
418 u_grain_row += chroma_width;
419 } while (++y < chroma_height);
420 }
421 if (params.num_v_points == 0 && !params.chroma_scaling_from_luma) {
422 memset(v_grain, 0, chroma_height * chroma_width * sizeof(*v_grain));
423 } else {
424 GrainType* v_grain_row = v_grain;
425 uint16_t seed = params.grain_seed ^ 0x49d8;
426 int y = 0;
427 do {
428 int x = 0;
429 do {
430 v_grain_row[x] = RightShiftWithRounding(
431 kGaussianSequence[GetFilmGrainRandomNumber(11, &seed)], shift);
432 } while (++x < chroma_width);
433
434 v_grain_row += chroma_width;
435 } while (++y < chroma_height);
436 }
437 }
438
439 template <int bitdepth>
AllocateNoiseStripes()440 bool FilmGrain<bitdepth>::AllocateNoiseStripes() {
441 const int half_height = DivideBy2(height_ + 1);
442 assert(half_height > 0);
443 // ceil(half_height / 16.0)
444 const int max_luma_num = DivideBy16(half_height + 15);
445 constexpr int kNoiseStripeHeight = 34;
446 size_t noise_buffer_size = kNoiseStripePadding;
447 if (params_.num_y_points > 0) {
448 noise_buffer_size += max_luma_num * kNoiseStripeHeight * width_;
449 }
450 if (!is_monochrome_) {
451 noise_buffer_size += 2 * max_luma_num *
452 (kNoiseStripeHeight >> subsampling_y_) *
453 SubsampledValue(width_, subsampling_x_);
454 }
455 noise_buffer_.reset(new (std::nothrow) GrainType[noise_buffer_size]);
456 if (noise_buffer_ == nullptr) return false;
457 GrainType* noise_buffer = noise_buffer_.get();
458 if (params_.num_y_points > 0) {
459 noise_stripes_[kPlaneY].Reset(max_luma_num, kNoiseStripeHeight * width_,
460 noise_buffer);
461 noise_buffer += max_luma_num * kNoiseStripeHeight * width_;
462 }
463 if (!is_monochrome_) {
464 noise_stripes_[kPlaneU].Reset(max_luma_num,
465 (kNoiseStripeHeight >> subsampling_y_) *
466 SubsampledValue(width_, subsampling_x_),
467 noise_buffer);
468 noise_buffer += max_luma_num * (kNoiseStripeHeight >> subsampling_y_) *
469 SubsampledValue(width_, subsampling_x_);
470 noise_stripes_[kPlaneV].Reset(max_luma_num,
471 (kNoiseStripeHeight >> subsampling_y_) *
472 SubsampledValue(width_, subsampling_x_),
473 noise_buffer);
474 }
475 return true;
476 }
477
478 template <int bitdepth>
AllocateNoiseImage()479 bool FilmGrain<bitdepth>::AllocateNoiseImage() {
480 // When LIBGAV1_MSAN is enabled, zero initialize to quiet optimized film grain
481 // msan warnings.
482 constexpr bool zero_initialize = LIBGAV1_MSAN == 1;
483 if (params_.num_y_points > 0 &&
484 !noise_image_[kPlaneY].Reset(height_, width_ + kNoiseImagePadding,
485 zero_initialize)) {
486 return false;
487 }
488 if (!is_monochrome_) {
489 if (!noise_image_[kPlaneU].Reset(
490 (height_ + subsampling_y_) >> subsampling_y_,
491 ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding,
492 zero_initialize)) {
493 return false;
494 }
495 if (!noise_image_[kPlaneV].Reset(
496 (height_ + subsampling_y_) >> subsampling_y_,
497 ((width_ + subsampling_x_) >> subsampling_x_) + kNoiseImagePadding,
498 zero_initialize)) {
499 return false;
500 }
501 }
502 return true;
503 }
504
505 // Uses |overlap_flag| to skip rows that are covered by the overlap computation.
506 template <int bitdepth>
ConstructNoiseImage(const Array2DView<GrainType> * noise_stripes,int width,int height,int subsampling_x,int subsampling_y,int stripe_start_offset,Array2D<GrainType> * noise_image)507 void FilmGrain<bitdepth>::ConstructNoiseImage(
508 const Array2DView<GrainType>* noise_stripes, int width, int height,
509 int subsampling_x, int subsampling_y, int stripe_start_offset,
510 Array2D<GrainType>* noise_image) {
511 const int plane_width = (width + subsampling_x) >> subsampling_x;
512 const int plane_height = (height + subsampling_y) >> subsampling_y;
513 const int stripe_height = 32 >> subsampling_y;
514 const int stripe_mask = stripe_height - 1;
515 int y = 0;
516 // |luma_num| = y >> (5 - |subsampling_y|). Hence |luma_num| == 0 for all y up
517 // to either 16 or 32.
518 const GrainType* first_noise_stripe = (*noise_stripes)[0];
519 do {
520 memcpy((*noise_image)[y], first_noise_stripe + y * plane_width,
521 plane_width * sizeof(first_noise_stripe[0]));
522 } while (++y < std::min(stripe_height, plane_height));
523 // End special iterations for luma_num == 0.
524
525 int luma_num = 1;
526 for (; y < (plane_height & ~stripe_mask); ++luma_num, y += stripe_height) {
527 const GrainType* noise_stripe = (*noise_stripes)[luma_num];
528 int i = stripe_start_offset;
529 do {
530 memcpy((*noise_image)[y + i], noise_stripe + i * plane_width,
531 plane_width * sizeof(noise_stripe[0]));
532 } while (++i < stripe_height);
533 }
534
535 // If there is a partial stripe, copy any rows beyond the overlap rows.
536 const int remaining_height = plane_height - y;
537 if (remaining_height > stripe_start_offset) {
538 assert(luma_num < noise_stripes->rows());
539 const GrainType* noise_stripe = (*noise_stripes)[luma_num];
540 int i = stripe_start_offset;
541 do {
542 memcpy((*noise_image)[y + i], noise_stripe + i * plane_width,
543 plane_width * sizeof(noise_stripe[0]));
544 } while (++i < remaining_height);
545 }
546 }
547
548 template <int bitdepth>
BlendNoiseChromaWorker(const dsp::Dsp & dsp,const Plane * planes,int num_planes,std::atomic<int> * job_counter,int min_value,int max_chroma,const uint8_t * source_plane_y,ptrdiff_t source_stride_y,const uint8_t * source_plane_u,const uint8_t * source_plane_v,ptrdiff_t source_stride_uv,uint8_t * dest_plane_u,uint8_t * dest_plane_v,ptrdiff_t dest_stride_uv)549 void FilmGrain<bitdepth>::BlendNoiseChromaWorker(
550 const dsp::Dsp& dsp, const Plane* planes, int num_planes,
551 std::atomic<int>* job_counter, int min_value, int max_chroma,
552 const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
553 const uint8_t* source_plane_u, const uint8_t* source_plane_v,
554 ptrdiff_t source_stride_uv, uint8_t* dest_plane_u, uint8_t* dest_plane_v,
555 ptrdiff_t dest_stride_uv) {
556 assert(num_planes > 0);
557 const int full_jobs_per_plane = height_ / kFrameChunkHeight;
558 const int remainder_job_height = height_ & (kFrameChunkHeight - 1);
559 const int total_full_jobs = full_jobs_per_plane * num_planes;
560 // If the frame height is not a multiple of kFrameChunkHeight, one job with
561 // a smaller number of rows is necessary at the end of each plane.
562 const int total_jobs =
563 total_full_jobs + ((remainder_job_height == 0) ? 0 : num_planes);
564 int job_index;
565 // Each job corresponds to a slice of kFrameChunkHeight rows in the luma
566 // plane. dsp->blend_noise_chroma handles subsampling.
567 // This loop body handles a slice of one plane or the other, depending on
568 // which are active. That way, threads working on consecutive jobs will keep
569 // the same region of luma source in working memory.
570 while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
571 total_jobs) {
572 const Plane plane = planes[job_index % num_planes];
573 const int slice_index = job_index / num_planes;
574 const int start_height = slice_index * kFrameChunkHeight;
575 const int job_height = std::min(height_ - start_height, kFrameChunkHeight);
576
577 const auto* source_cursor_y = reinterpret_cast<const Pixel*>(
578 source_plane_y + start_height * source_stride_y);
579 const int16_t* scaling_lut_uv;
580 const uint8_t* source_plane_uv;
581 uint8_t* dest_plane_uv;
582
583 if (plane == kPlaneU) {
584 scaling_lut_uv = scaling_lut_u_;
585 source_plane_uv = source_plane_u;
586 dest_plane_uv = dest_plane_u;
587 } else {
588 assert(plane == kPlaneV);
589 scaling_lut_uv = scaling_lut_v_;
590 source_plane_uv = source_plane_v;
591 dest_plane_uv = dest_plane_v;
592 }
593 const auto* source_cursor_uv = reinterpret_cast<const Pixel*>(
594 source_plane_uv + (start_height >> subsampling_y_) * source_stride_uv);
595 auto* dest_cursor_uv = reinterpret_cast<Pixel*>(
596 dest_plane_uv + (start_height >> subsampling_y_) * dest_stride_uv);
597 dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
598 plane, params_, noise_image_, min_value, max_chroma, width_, job_height,
599 start_height, subsampling_x_, subsampling_y_, scaling_lut_uv,
600 source_cursor_y, source_stride_y, source_cursor_uv, source_stride_uv,
601 dest_cursor_uv, dest_stride_uv);
602 }
603 }
604
605 template <int bitdepth>
BlendNoiseLumaWorker(const dsp::Dsp & dsp,std::atomic<int> * job_counter,int min_value,int max_luma,const uint8_t * source_plane_y,ptrdiff_t source_stride_y,uint8_t * dest_plane_y,ptrdiff_t dest_stride_y)606 void FilmGrain<bitdepth>::BlendNoiseLumaWorker(
607 const dsp::Dsp& dsp, std::atomic<int>* job_counter, int min_value,
608 int max_luma, const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
609 uint8_t* dest_plane_y, ptrdiff_t dest_stride_y) {
610 const int total_full_jobs = height_ / kFrameChunkHeight;
611 const int remainder_job_height = height_ & (kFrameChunkHeight - 1);
612 const int total_jobs =
613 total_full_jobs + static_cast<int>(remainder_job_height > 0);
614 int job_index;
615 // Each job is some number of rows in a plane.
616 while ((job_index = job_counter->fetch_add(1, std::memory_order_relaxed)) <
617 total_jobs) {
618 const int start_height = job_index * kFrameChunkHeight;
619 const int job_height = std::min(height_ - start_height, kFrameChunkHeight);
620
621 const auto* source_cursor_y = reinterpret_cast<const Pixel*>(
622 source_plane_y + start_height * source_stride_y);
623 auto* dest_cursor_y =
624 reinterpret_cast<Pixel*>(dest_plane_y + start_height * dest_stride_y);
625 dsp.film_grain.blend_noise_luma(
626 noise_image_, min_value, max_luma, params_.chroma_scaling, width_,
627 job_height, start_height, scaling_lut_y_, source_cursor_y,
628 source_stride_y, dest_cursor_y, dest_stride_y);
629 }
630 }
631
632 template <int bitdepth>
AddNoise(const uint8_t * source_plane_y,ptrdiff_t source_stride_y,const uint8_t * source_plane_u,const uint8_t * source_plane_v,ptrdiff_t source_stride_uv,uint8_t * dest_plane_y,ptrdiff_t dest_stride_y,uint8_t * dest_plane_u,uint8_t * dest_plane_v,ptrdiff_t dest_stride_uv)633 bool FilmGrain<bitdepth>::AddNoise(
634 const uint8_t* source_plane_y, ptrdiff_t source_stride_y,
635 const uint8_t* source_plane_u, const uint8_t* source_plane_v,
636 ptrdiff_t source_stride_uv, uint8_t* dest_plane_y, ptrdiff_t dest_stride_y,
637 uint8_t* dest_plane_u, uint8_t* dest_plane_v, ptrdiff_t dest_stride_uv) {
638 if (!Init()) {
639 LIBGAV1_DLOG(ERROR, "Init() failed.");
640 return false;
641 }
642 if (!AllocateNoiseStripes()) {
643 LIBGAV1_DLOG(ERROR, "AllocateNoiseStripes() failed.");
644 return false;
645 }
646
647 const dsp::Dsp& dsp = *dsp::GetDspTable(bitdepth);
648 const bool use_luma = params_.num_y_points > 0;
649
650 // Construct noise stripes.
651 if (use_luma) {
652 // The luma plane is never subsampled.
653 dsp.film_grain
654 .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
655 luma_grain_, params_.grain_seed, width_, height_,
656 /*subsampling_x=*/0, /*subsampling_y=*/0, &noise_stripes_[kPlaneY]);
657 }
658 if (!is_monochrome_) {
659 dsp.film_grain
660 .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
661 u_grain_, params_.grain_seed, width_, height_, subsampling_x_,
662 subsampling_y_, &noise_stripes_[kPlaneU]);
663 dsp.film_grain
664 .construct_noise_stripes[static_cast<int>(params_.overlap_flag)](
665 v_grain_, params_.grain_seed, width_, height_, subsampling_x_,
666 subsampling_y_, &noise_stripes_[kPlaneV]);
667 }
668
669 if (!AllocateNoiseImage()) {
670 LIBGAV1_DLOG(ERROR, "AllocateNoiseImage() failed.");
671 return false;
672 }
673
674 // Construct noise image.
675 if (use_luma) {
676 ConstructNoiseImage(
677 &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0,
678 /*subsampling_y=*/0, static_cast<int>(params_.overlap_flag) << 1,
679 &noise_image_[kPlaneY]);
680 if (params_.overlap_flag) {
681 dsp.film_grain.construct_noise_image_overlap(
682 &noise_stripes_[kPlaneY], width_, height_, /*subsampling_x=*/0,
683 /*subsampling_y=*/0, &noise_image_[kPlaneY]);
684 }
685 }
686 if (!is_monochrome_) {
687 ConstructNoiseImage(&noise_stripes_[kPlaneU], width_, height_,
688 subsampling_x_, subsampling_y_,
689 static_cast<int>(params_.overlap_flag)
690 << (1 - subsampling_y_),
691 &noise_image_[kPlaneU]);
692 ConstructNoiseImage(&noise_stripes_[kPlaneV], width_, height_,
693 subsampling_x_, subsampling_y_,
694 static_cast<int>(params_.overlap_flag)
695 << (1 - subsampling_y_),
696 &noise_image_[kPlaneV]);
697 if (params_.overlap_flag) {
698 dsp.film_grain.construct_noise_image_overlap(
699 &noise_stripes_[kPlaneU], width_, height_, subsampling_x_,
700 subsampling_y_, &noise_image_[kPlaneU]);
701 dsp.film_grain.construct_noise_image_overlap(
702 &noise_stripes_[kPlaneV], width_, height_, subsampling_x_,
703 subsampling_y_, &noise_image_[kPlaneV]);
704 }
705 }
706
707 // Blend noise image.
708 int min_value;
709 int max_luma;
710 int max_chroma;
711 if (params_.clip_to_restricted_range) {
712 min_value = 16 << (bitdepth - kBitdepth8);
713 max_luma = 235 << (bitdepth - kBitdepth8);
714 if (color_matrix_is_identity_) {
715 max_chroma = max_luma;
716 } else {
717 max_chroma = 240 << (bitdepth - kBitdepth8);
718 }
719 } else {
720 min_value = 0;
721 max_luma = (256 << (bitdepth - kBitdepth8)) - 1;
722 max_chroma = max_luma;
723 }
724
725 // Handle all chroma planes first because luma source may be altered in place.
726 if (!is_monochrome_) {
727 // This is done in a strange way but Vector can't be passed by copy to the
728 // lambda capture that spawns the thread.
729 Plane planes_to_blend[2];
730 int num_planes = 0;
731 if (params_.chroma_scaling_from_luma) {
732 // Both noise planes are computed from the luma scaling lookup table.
733 planes_to_blend[num_planes++] = kPlaneU;
734 planes_to_blend[num_planes++] = kPlaneV;
735 } else {
736 const int height_uv = SubsampledValue(height_, subsampling_y_);
737 const int width_uv = SubsampledValue(width_, subsampling_x_);
738
739 // Noise is applied according to a lookup table defined by pieceiwse
740 // linear "points." If the lookup table is empty, that corresponds to
741 // outputting zero noise.
742 if (params_.num_u_points == 0) {
743 CopyImagePlane<Pixel>(source_plane_u, source_stride_uv, width_uv,
744 height_uv, dest_plane_u, dest_stride_uv);
745 } else {
746 planes_to_blend[num_planes++] = kPlaneU;
747 }
748 if (params_.num_v_points == 0) {
749 CopyImagePlane<Pixel>(source_plane_v, source_stride_uv, width_uv,
750 height_uv, dest_plane_v, dest_stride_uv);
751 } else {
752 planes_to_blend[num_planes++] = kPlaneV;
753 }
754 }
755 if (thread_pool_ != nullptr && num_planes > 0) {
756 const int num_workers = thread_pool_->num_threads();
757 BlockingCounter pending_workers(num_workers);
758 std::atomic<int> job_counter(0);
759 for (int i = 0; i < num_workers; ++i) {
760 thread_pool_->Schedule([this, dsp, &pending_workers, &planes_to_blend,
761 num_planes, &job_counter, min_value, max_chroma,
762 source_plane_y, source_stride_y, source_plane_u,
763 source_plane_v, source_stride_uv, dest_plane_u,
764 dest_plane_v, dest_stride_uv]() {
765 BlendNoiseChromaWorker(dsp, planes_to_blend, num_planes, &job_counter,
766 min_value, max_chroma, source_plane_y,
767 source_stride_y, source_plane_u,
768 source_plane_v, source_stride_uv, dest_plane_u,
769 dest_plane_v, dest_stride_uv);
770 pending_workers.Decrement();
771 });
772 }
773 BlendNoiseChromaWorker(
774 dsp, planes_to_blend, num_planes, &job_counter, min_value, max_chroma,
775 source_plane_y, source_stride_y, source_plane_u, source_plane_v,
776 source_stride_uv, dest_plane_u, dest_plane_v, dest_stride_uv);
777
778 pending_workers.Wait();
779 } else {
780 // Single threaded.
781 if (params_.num_u_points > 0 || params_.chroma_scaling_from_luma) {
782 dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
783 kPlaneU, params_, noise_image_, min_value, max_chroma, width_,
784 height_, /*start_height=*/0, subsampling_x_, subsampling_y_,
785 scaling_lut_u_, source_plane_y, source_stride_y, source_plane_u,
786 source_stride_uv, dest_plane_u, dest_stride_uv);
787 }
788 if (params_.num_v_points > 0 || params_.chroma_scaling_from_luma) {
789 dsp.film_grain.blend_noise_chroma[params_.chroma_scaling_from_luma](
790 kPlaneV, params_, noise_image_, min_value, max_chroma, width_,
791 height_, /*start_height=*/0, subsampling_x_, subsampling_y_,
792 scaling_lut_v_, source_plane_y, source_stride_y, source_plane_v,
793 source_stride_uv, dest_plane_v, dest_stride_uv);
794 }
795 }
796 }
797 if (use_luma) {
798 if (thread_pool_ != nullptr) {
799 const int num_workers = thread_pool_->num_threads();
800 BlockingCounter pending_workers(num_workers);
801 std::atomic<int> job_counter(0);
802 for (int i = 0; i < num_workers; ++i) {
803 thread_pool_->Schedule(
804 [this, dsp, &pending_workers, &job_counter, min_value, max_luma,
805 source_plane_y, source_stride_y, dest_plane_y, dest_stride_y]() {
806 BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma,
807 source_plane_y, source_stride_y,
808 dest_plane_y, dest_stride_y);
809 pending_workers.Decrement();
810 });
811 }
812
813 BlendNoiseLumaWorker(dsp, &job_counter, min_value, max_luma,
814 source_plane_y, source_stride_y, dest_plane_y,
815 dest_stride_y);
816 pending_workers.Wait();
817 } else {
818 dsp.film_grain.blend_noise_luma(
819 noise_image_, min_value, max_luma, params_.chroma_scaling, width_,
820 height_, /*start_height=*/0, scaling_lut_y_, source_plane_y,
821 source_stride_y, dest_plane_y, dest_stride_y);
822 }
823 } else {
824 CopyImagePlane<Pixel>(source_plane_y, source_stride_y, width_, height_,
825 dest_plane_y, dest_stride_y);
826 }
827
828 return true;
829 }
830
831 // Explicit instantiations.
832 template class FilmGrain<kBitdepth8>;
833 #if LIBGAV1_MAX_BITDEPTH >= 10
834 template class FilmGrain<kBitdepth10>;
835 #endif
836 #if LIBGAV1_MAX_BITDEPTH == 12
837 template class FilmGrain<kBitdepth12>;
838 #endif
839
840 } // namespace libgav1
841