1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <memory.h>
15 #include <math.h>
16 #include <assert.h>
17
18 #include "config/av1_rtcd.h"
19
20 #include "av1/common/av1_common_int.h"
21 #include "av1/common/warped_motion.h"
22 #include "av1/common/scale.h"
23
24 // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
25 // at a time. The zoom/rotation/shear in the model are applied to the
26 // "fractional" position of each pixel, which therefore varies within
27 // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
28 // We need an extra 2 taps to fit this in, for a total of 8 taps.
29 /* clang-format off */
30 const WarpedFilterCoeff av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1]
31 [8] = {
32 // [-1, 0)
33 { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
34 { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 },
35 { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 },
36 { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 },
37 { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 },
38 { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 },
39 { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 },
40 { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 },
41 { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 },
42 { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 },
43 { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 },
44 { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 },
45 { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 },
46 { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 },
47 { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 },
48 { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 },
49 { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 },
50 { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 },
51 { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 },
52 { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 },
53 { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 },
54 { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 },
55 { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 },
56 { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 },
57 { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 },
58 { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 },
59 { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 },
60 { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 },
61 { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 },
62 { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 },
63 { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 },
64 { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 },
65
66 // [0, 1)
67 { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0},
68 { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0},
69 { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1},
70 {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1},
71 {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1},
72 {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1},
73 {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1},
74 {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1},
75 {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2},
76 {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2},
77 {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2},
78 {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2},
79 {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2},
80 {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2},
81 {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2},
82 {-2, 8, -21, 84, 74, -21, 8, -2}, {-2, 8, -22, 82, 77, -21, 8, -2},
83 {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 77, 82, -22, 8, -2},
84 {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2},
85 {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2},
86 {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2},
87 {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2},
88 {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2},
89 {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2},
90 {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2},
91 {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2},
92 {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1},
93 {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2},
94 {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1},
95 {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1},
96 {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1},
97 { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0},
98 { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0},
99
100 // [1, 2)
101 { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 },
102 { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 },
103 { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 },
104 { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 },
105 { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 },
106 { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 },
107 { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 },
108 { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 },
109 { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 },
110 { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 },
111 { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 },
112 { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 },
113 { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 },
114 { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 },
115 { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 },
116 { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 },
117 { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 },
118 { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 },
119 { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 },
120 { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 },
121 { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 },
122 { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 },
123 { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 },
124 { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 },
125 { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 },
126 { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 },
127 { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 },
128 { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 },
129 { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 },
130 { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 },
131 { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 },
132 { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 },
133 // dummy (replicate row index 191)
134 { 0, 0, 0, 0, 2, 127, - 1, 0 },
135 };
136
137 /* clang-format on */
138
139 #define DIV_LUT_PREC_BITS 14
140 #define DIV_LUT_BITS 8
141 #define DIV_LUT_NUM (1 << DIV_LUT_BITS)
142
143 static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
144 16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
145 15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
146 15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
147 14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
148 13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
149 13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
150 13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
151 12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
152 12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
153 11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
154 11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
155 11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
156 10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
157 10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
158 10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
159 9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
160 9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
161 9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
162 9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
163 9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
164 8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
165 8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
166 8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
167 8240, 8224, 8208, 8192,
168 };
169
170 // Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
171 // at precision of DIV_LUT_PREC_BITS along with the shift.
resolve_divisor_64(uint64_t D,int16_t * shift)172 static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
173 int64_t f;
174 *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
175 : get_msb((unsigned int)D));
176 // e is obtained from D after resetting the most significant 1 bit.
177 const int64_t e = D - ((uint64_t)1 << *shift);
178 // Get the most significant DIV_LUT_BITS (8) bits of e into f
179 if (*shift > DIV_LUT_BITS)
180 f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
181 else
182 f = e << (DIV_LUT_BITS - *shift);
183 assert(f <= DIV_LUT_NUM);
184 *shift += DIV_LUT_PREC_BITS;
185 // Use f as lookup into the precomputed table of multipliers
186 return div_lut[f];
187 }
188
resolve_divisor_32(uint32_t D,int16_t * shift)189 static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
190 int32_t f;
191 *shift = get_msb(D);
192 // e is obtained from D after resetting the most significant 1 bit.
193 const int32_t e = D - ((uint32_t)1 << *shift);
194 // Get the most significant DIV_LUT_BITS (8) bits of e into f
195 if (*shift > DIV_LUT_BITS)
196 f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
197 else
198 f = e << (DIV_LUT_BITS - *shift);
199 assert(f <= DIV_LUT_NUM);
200 *shift += DIV_LUT_PREC_BITS;
201 // Use f as lookup into the precomputed table of multipliers
202 return div_lut[f];
203 }
204
is_affine_valid(const WarpedMotionParams * const wm)205 static int is_affine_valid(const WarpedMotionParams *const wm) {
206 const int32_t *mat = wm->wmmat;
207 return (mat[2] > 0);
208 }
209
is_affine_shear_allowed(int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)210 static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
211 int16_t delta) {
212 if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
213 (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
214 return 0;
215 else
216 return 1;
217 }
218
219 #ifndef NDEBUG
220 // Check that the given warp model satisfies the relevant constraints for
221 // its stated model type
check_model_consistency(WarpedMotionParams * wm)222 static void check_model_consistency(WarpedMotionParams *wm) {
223 switch (wm->wmtype) {
224 case IDENTITY:
225 assert(wm->wmmat[0] == 0);
226 assert(wm->wmmat[1] == 0);
227 AOM_FALLTHROUGH_INTENDED;
228 case TRANSLATION:
229 assert(wm->wmmat[2] == 1 << WARPEDMODEL_PREC_BITS);
230 assert(wm->wmmat[3] == 0);
231 AOM_FALLTHROUGH_INTENDED;
232 case ROTZOOM:
233 assert(wm->wmmat[4] == -wm->wmmat[3]);
234 assert(wm->wmmat[5] == wm->wmmat[2]);
235 AOM_FALLTHROUGH_INTENDED;
236 case AFFINE: break;
237 default: assert(0 && "Bad wmtype");
238 }
239 }
240 #endif // NDEBUG
241
242 // Returns 1 on success or 0 on an invalid affine set
av1_get_shear_params(WarpedMotionParams * wm)243 int av1_get_shear_params(WarpedMotionParams *wm) {
244 #ifndef NDEBUG
245 // Check that models have been constructed sensibly
246 // This is a good place to check, because this function does not need to
247 // be called until after model construction is complete, but must be called
248 // before the model can be used for prediction.
249 check_model_consistency(wm);
250 #endif // NDEBUG
251
252 const int32_t *mat = wm->wmmat;
253 if (!is_affine_valid(wm)) return 0;
254
255 wm->alpha =
256 clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
257 wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
258 int16_t shift;
259 int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
260 int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
261 wm->gamma =
262 clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
263 v = ((int64_t)mat[3] * mat[4]) * y;
264 wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
265 (1 << WARPEDMODEL_PREC_BITS),
266 INT16_MIN, INT16_MAX);
267
268 wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
269 (1 << WARP_PARAM_REDUCE_BITS);
270 wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
271 (1 << WARP_PARAM_REDUCE_BITS);
272 wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
273 (1 << WARP_PARAM_REDUCE_BITS);
274 wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
275 (1 << WARP_PARAM_REDUCE_BITS);
276
277 if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
278 return 0;
279
280 return 1;
281 }
282
283 #if CONFIG_AV1_HIGHBITDEPTH
284 /* Note: For an explanation of the warp algorithm, and some notes on bit widths
285 for hardware implementations, see the comments above av1_warp_affine_c
286 */
av1_highbd_warp_affine_c(const int32_t * mat,const uint16_t * ref,int width,int height,int stride,uint16_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,int bd,ConvolveParams * conv_params,int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)287 void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
288 int width, int height, int stride, uint16_t *pred,
289 int p_col, int p_row, int p_width, int p_height,
290 int p_stride, int subsampling_x,
291 int subsampling_y, int bd,
292 ConvolveParams *conv_params, int16_t alpha,
293 int16_t beta, int16_t gamma, int16_t delta) {
294 int32_t tmp[15 * 8];
295 const int reduce_bits_horiz = conv_params->round_0;
296 const int reduce_bits_vert = conv_params->is_compound
297 ? conv_params->round_1
298 : 2 * FILTER_BITS - reduce_bits_horiz;
299 const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
300 const int offset_bits_horiz = bd + FILTER_BITS - 1;
301 const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
302 const int round_bits =
303 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
304 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
305 (void)max_bits_horiz;
306 assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
307
308 // Check that, even with 12-bit input, the intermediate values will fit
309 // into an unsigned 16-bit intermediate array.
310 assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
311
312 for (int i = p_row; i < p_row + p_height; i += 8) {
313 for (int j = p_col; j < p_col + p_width; j += 8) {
314 // Calculate the center of this 8x8 block,
315 // project to luma coordinates (if in a subsampled chroma plane),
316 // apply the affine transformation,
317 // then convert back to the original coordinates (if necessary)
318 const int32_t src_x = (j + 4) << subsampling_x;
319 const int32_t src_y = (i + 4) << subsampling_y;
320 const int64_t dst_x =
321 (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
322 const int64_t dst_y =
323 (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
324 const int64_t x4 = dst_x >> subsampling_x;
325 const int64_t y4 = dst_y >> subsampling_y;
326
327 const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
328 int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
329 const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
330 int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
331
332 sx4 += alpha * (-4) + beta * (-4);
333 sy4 += gamma * (-4) + delta * (-4);
334
335 sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
336 sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
337
338 // Horizontal filter
339 for (int k = -7; k < 8; ++k) {
340 const int iy = clamp(iy4 + k, 0, height - 1);
341
342 int sx = sx4 + beta * (k + 4);
343 for (int l = -4; l < 4; ++l) {
344 int ix = ix4 + l - 3;
345 const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
346 WARPEDPIXEL_PREC_SHIFTS;
347 assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
348 const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
349
350 int32_t sum = 1 << offset_bits_horiz;
351 for (int m = 0; m < 8; ++m) {
352 const int sample_x = clamp(ix + m, 0, width - 1);
353 sum += ref[iy * stride + sample_x] * coeffs[m];
354 }
355 sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
356 assert(0 <= sum && sum < (1 << max_bits_horiz));
357 tmp[(k + 7) * 8 + (l + 4)] = sum;
358 sx += alpha;
359 }
360 }
361
362 // Vertical filter
363 for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
364 int sy = sy4 + delta * (k + 4);
365 for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
366 const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
367 WARPEDPIXEL_PREC_SHIFTS;
368 assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
369 const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
370
371 int32_t sum = 1 << offset_bits_vert;
372 for (int m = 0; m < 8; ++m) {
373 sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
374 }
375
376 if (conv_params->is_compound) {
377 CONV_BUF_TYPE *p =
378 &conv_params
379 ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
380 (j - p_col + l + 4)];
381 sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
382 if (conv_params->do_average) {
383 uint16_t *dst16 =
384 &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
385 int32_t tmp32 = *p;
386 if (conv_params->use_dist_wtd_comp_avg) {
387 tmp32 = tmp32 * conv_params->fwd_offset +
388 sum * conv_params->bck_offset;
389 tmp32 = tmp32 >> DIST_PRECISION_BITS;
390 } else {
391 tmp32 += sum;
392 tmp32 = tmp32 >> 1;
393 }
394 tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
395 (1 << (offset_bits - conv_params->round_1 - 1));
396 *dst16 =
397 clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
398 } else {
399 *p = sum;
400 }
401 } else {
402 uint16_t *p =
403 &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
404 sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
405 assert(0 <= sum && sum < (1 << (bd + 2)));
406 *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
407 }
408 sy += gamma;
409 }
410 }
411 }
412 }
413 }
414
highbd_warp_plane(WarpedMotionParams * wm,const uint16_t * const ref,int width,int height,int stride,uint16_t * const pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,int bd,ConvolveParams * conv_params)415 void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
416 int width, int height, int stride, uint16_t *const pred,
417 int p_col, int p_row, int p_width, int p_height,
418 int p_stride, int subsampling_x, int subsampling_y,
419 int bd, ConvolveParams *conv_params) {
420 const int32_t *const mat = wm->wmmat;
421 const int16_t alpha = wm->alpha;
422 const int16_t beta = wm->beta;
423 const int16_t gamma = wm->gamma;
424 const int16_t delta = wm->delta;
425
426 av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
427 p_width, p_height, p_stride, subsampling_x,
428 subsampling_y, bd, conv_params, alpha, beta, gamma,
429 delta);
430 }
431 #endif // CONFIG_AV1_HIGHBITDEPTH
432
433 /* The warp filter for ROTZOOM and AFFINE models works as follows:
434 * Split the input into 8x8 blocks
435 * For each block, project the point (4, 4) within the block, to get the
436 overall block position. Split into integer and fractional coordinates,
437 maintaining full WARPEDMODEL precision
438 * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
439 variable horizontal offset. This means that, while the rows of the
440 intermediate buffer align with the rows of the *reference* image, the
441 columns align with the columns of the *destination* image.
442 * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
443 destination is too small we crop the output at this stage). Each pixel has
444 a variable vertical offset, so that the resulting rows are aligned with
445 the rows of the destination image.
446
447 To accomplish these alignments, we factor the warp matrix as a
448 product of two shear / asymmetric zoom matrices:
449 / a b \ = / 1 0 \ * / 1+alpha beta \
450 \ c d / \ gamma 1+delta / \ 0 1 /
451 where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
452 The horizontal shear (with alpha and beta) is applied first,
453 then the vertical shear (with gamma and delta) is applied second.
454
455 The only limitation is that, to fit this in a fixed 8-tap filter size,
456 the fractional pixel offsets must be at most +-1. Since the horizontal filter
457 generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
458 within the block, the parameters must satisfy
459 4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 4 * |delta| <= 1
460 for this filter to be applicable.
461
462 Note: This function assumes that the caller has done all of the relevant
463 checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
464 are set appropriately (if using a ROTZOOM model), and that alpha, beta,
465 gamma, delta are all in range.
466
467 TODO(rachelbarker): Maybe support scaled references?
468 */
469 /* A note on hardware implementation:
470 The warp filter is intended to be implementable using the same hardware as
471 the high-precision convolve filters from the loop-restoration and
472 convolve-round experiments.
473
474 For a single filter stage, considering all of the coefficient sets for the
475 warp filter and the regular convolution filter, an input in the range
476 [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
477 before rounding.
478
479 Allowing for some changes to the filter coefficient sets, call the range
480 [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
481 we can replace this by the range [0, 256 * 2^k], which can be stored in an
482 unsigned value with 8 + k bits.
483
484 This allows the derivation of the appropriate bit widths and offsets for
485 the various intermediate values: If
486
487 F := FILTER_BITS = 7 (or else the above ranges need adjusting)
488 So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
489 intermediate value.
490 H := ROUND0_BITS
491 V := VERSHEAR_REDUCE_PREC_BITS
492 (and note that we must have H + V = 2*F for the output to have the same
493 scale as the input)
494
495 then we end up with the following offsets and ranges:
496 Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
497 uint{bd + F + 1}
498 After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
499 Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
500 uint{bd + 2*F + 2 - H}
501 After rounding: The final value, before undoing the offset, fits into a
502 uint{bd + 2}.
503
504 Then we need to undo the offsets before clamping to a pixel. Note that,
505 if we do this at the end, the amount to subtract is actually independent
506 of H and V:
507
508 offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
509 (1 << ((bd + 2*F - H) - V))
510 == (1 << (bd - 1)) + (1 << bd)
511
512 This allows us to entirely avoid clamping in both the warp filter and
513 the convolve-round experiment. As of the time of writing, the Wiener filter
514 from loop-restoration can encode a central coefficient up to 216, which
515 leads to a maximum value of about 282 * 2^k after applying the offset.
516 So in that case we still need to clamp.
517 */
av1_warp_affine_c(const int32_t * mat,const uint8_t * ref,int width,int height,int stride,uint8_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,ConvolveParams * conv_params,int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)518 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
519 int height, int stride, uint8_t *pred, int p_col,
520 int p_row, int p_width, int p_height, int p_stride,
521 int subsampling_x, int subsampling_y,
522 ConvolveParams *conv_params, int16_t alpha, int16_t beta,
523 int16_t gamma, int16_t delta) {
524 int32_t tmp[15 * 8];
525 const int bd = 8;
526 const int reduce_bits_horiz = conv_params->round_0;
527 const int reduce_bits_vert = conv_params->is_compound
528 ? conv_params->round_1
529 : 2 * FILTER_BITS - reduce_bits_horiz;
530 const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
531 const int offset_bits_horiz = bd + FILTER_BITS - 1;
532 const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
533 const int round_bits =
534 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
535 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
536 (void)max_bits_horiz;
537 assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
538 assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
539
540 for (int i = p_row; i < p_row + p_height; i += 8) {
541 for (int j = p_col; j < p_col + p_width; j += 8) {
542 // Calculate the center of this 8x8 block,
543 // project to luma coordinates (if in a subsampled chroma plane),
544 // apply the affine transformation,
545 // then convert back to the original coordinates (if necessary)
546 const int32_t src_x = (j + 4) << subsampling_x;
547 const int32_t src_y = (i + 4) << subsampling_y;
548 const int64_t dst_x =
549 (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
550 const int64_t dst_y =
551 (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
552 const int64_t x4 = dst_x >> subsampling_x;
553 const int64_t y4 = dst_y >> subsampling_y;
554
555 int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
556 int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
557 int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
558 int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
559
560 sx4 += alpha * (-4) + beta * (-4);
561 sy4 += gamma * (-4) + delta * (-4);
562
563 sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
564 sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
565
566 // Horizontal filter
567 for (int k = -7; k < 8; ++k) {
568 // Clamp to top/bottom edge of the frame
569 const int iy = clamp(iy4 + k, 0, height - 1);
570
571 int sx = sx4 + beta * (k + 4);
572
573 for (int l = -4; l < 4; ++l) {
574 int ix = ix4 + l - 3;
575 // At this point, sx = sx4 + alpha * l + beta * k
576 const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
577 WARPEDPIXEL_PREC_SHIFTS;
578 assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
579 const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
580
581 int32_t sum = 1 << offset_bits_horiz;
582 for (int m = 0; m < 8; ++m) {
583 // Clamp to left/right edge of the frame
584 const int sample_x = clamp(ix + m, 0, width - 1);
585
586 sum += ref[iy * stride + sample_x] * coeffs[m];
587 }
588 sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
589 assert(0 <= sum && sum < (1 << max_bits_horiz));
590 tmp[(k + 7) * 8 + (l + 4)] = sum;
591 sx += alpha;
592 }
593 }
594
595 // Vertical filter
596 for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
597 int sy = sy4 + delta * (k + 4);
598 for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
599 // At this point, sy = sy4 + gamma * l + delta * k
600 const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
601 WARPEDPIXEL_PREC_SHIFTS;
602 assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
603 const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
604
605 int32_t sum = 1 << offset_bits_vert;
606 for (int m = 0; m < 8; ++m) {
607 sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
608 }
609
610 if (conv_params->is_compound) {
611 CONV_BUF_TYPE *p =
612 &conv_params
613 ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
614 (j - p_col + l + 4)];
615 sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
616 if (conv_params->do_average) {
617 uint8_t *dst8 =
618 &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
619 int32_t tmp32 = *p;
620 if (conv_params->use_dist_wtd_comp_avg) {
621 tmp32 = tmp32 * conv_params->fwd_offset +
622 sum * conv_params->bck_offset;
623 tmp32 = tmp32 >> DIST_PRECISION_BITS;
624 } else {
625 tmp32 += sum;
626 tmp32 = tmp32 >> 1;
627 }
628 tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
629 (1 << (offset_bits - conv_params->round_1 - 1));
630 *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
631 } else {
632 *p = sum;
633 }
634 } else {
635 uint8_t *p =
636 &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
637 sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
638 assert(0 <= sum && sum < (1 << (bd + 2)));
639 *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
640 }
641 sy += gamma;
642 }
643 }
644 }
645 }
646 }
647
warp_plane(WarpedMotionParams * wm,const uint8_t * const ref,int width,int height,int stride,uint8_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,ConvolveParams * conv_params)648 void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
649 int height, int stride, uint8_t *pred, int p_col, int p_row,
650 int p_width, int p_height, int p_stride, int subsampling_x,
651 int subsampling_y, ConvolveParams *conv_params) {
652 const int32_t *const mat = wm->wmmat;
653 const int16_t alpha = wm->alpha;
654 const int16_t beta = wm->beta;
655 const int16_t gamma = wm->gamma;
656 const int16_t delta = wm->delta;
657 av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
658 p_height, p_stride, subsampling_x, subsampling_y, conv_params,
659 alpha, beta, gamma, delta);
660 }
661
av1_warp_plane(WarpedMotionParams * wm,int use_hbd,int bd,const uint8_t * ref,int width,int height,int stride,uint8_t * pred,int p_col,int p_row,int p_width,int p_height,int p_stride,int subsampling_x,int subsampling_y,ConvolveParams * conv_params)662 void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
663 const uint8_t *ref, int width, int height, int stride,
664 uint8_t *pred, int p_col, int p_row, int p_width,
665 int p_height, int p_stride, int subsampling_x,
666 int subsampling_y, ConvolveParams *conv_params) {
667 #if CONFIG_AV1_HIGHBITDEPTH
668 if (use_hbd)
669 highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
670 CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
671 p_height, p_stride, subsampling_x, subsampling_y, bd,
672 conv_params);
673 else
674 warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
675 p_height, p_stride, subsampling_x, subsampling_y, conv_params);
676 #else
677 (void)use_hbd;
678 (void)bd;
679 warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
680 p_height, p_stride, subsampling_x, subsampling_y, conv_params);
681 #endif
682 }
683
684 #define LS_MV_MAX 256 // max mv in 1/8-pel
685 // Use LS_STEP = 8 so that 2 less bits needed for A, Bx, By.
686 #define LS_STEP 8
687
688 // Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
689 // the precision needed is:
690 // (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
691 // (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
692 // 1 [for sign] +
693 // LEAST_SQUARES_SAMPLES_MAX_BITS
694 // [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
695 // The value is 23
696 #define LS_MAT_RANGE_BITS \
697 ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
698
699 // Bit-depth reduction from the full-range
700 #define LS_MAT_DOWN_BITS 2
701
702 // bits range of A, Bx and By after downshifting
703 #define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
704 #define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
705 #define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
706
707 // By setting LS_STEP = 8, the least 2 bits of every elements in A, Bx, By are
708 // 0. So, we can reduce LS_MAT_RANGE_BITS(2) bits here.
709 #define LS_SQUARE(a) \
710 (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
711 (2 + LS_MAT_DOWN_BITS))
712 #define LS_PRODUCT1(a, b) \
713 (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
714 (2 + LS_MAT_DOWN_BITS))
715 #define LS_PRODUCT2(a, b) \
716 (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
717 (2 + LS_MAT_DOWN_BITS))
718
719 #define USE_LIMITED_PREC_MULT 0
720
721 #if USE_LIMITED_PREC_MULT
722
723 #define MUL_PREC_BITS 16
resolve_multiplier_64(uint64_t D,int16_t * shift)724 static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
725 int msb = 0;
726 uint16_t mult = 0;
727 *shift = 0;
728 if (D != 0) {
729 msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
730 : get_msb((unsigned int)D));
731 if (msb >= MUL_PREC_BITS) {
732 mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
733 *shift = msb + 1 - MUL_PREC_BITS;
734 } else {
735 mult = (uint16_t)D;
736 *shift = 0;
737 }
738 }
739 return mult;
740 }
741
get_mult_shift_ndiag(int64_t Px,int16_t iDet,int shift)742 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
743 int32_t ret;
744 int16_t mshift;
745 uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
746 int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
747 shift -= mshift;
748 if (shift > 0) {
749 return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
750 -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
751 WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
752 } else {
753 return (int32_t)clamp(v * (1 << (-shift)),
754 -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
755 WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
756 }
757 return ret;
758 }
759
get_mult_shift_diag(int64_t Px,int16_t iDet,int shift)760 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
761 int16_t mshift;
762 uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
763 int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
764 shift -= mshift;
765 if (shift > 0) {
766 return (int32_t)clamp(
767 ROUND_POWER_OF_TWO_SIGNED(v, shift),
768 (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
769 (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
770 } else {
771 return (int32_t)clamp(
772 v * (1 << (-shift)),
773 (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
774 (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
775 }
776 }
777
778 #else
779
get_mult_shift_ndiag(int64_t Px,int16_t iDet,int shift)780 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
781 int64_t v = Px * (int64_t)iDet;
782 return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
783 -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
784 WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
785 }
786
get_mult_shift_diag(int64_t Px,int16_t iDet,int shift)787 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
788 int64_t v = Px * (int64_t)iDet;
789 return (int32_t)clamp64(
790 ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
791 (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
792 (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
793 }
794 #endif // USE_LIMITED_PREC_MULT
795
find_affine_int(int np,const int * pts1,const int * pts2,BLOCK_SIZE bsize,int mvy,int mvx,WarpedMotionParams * wm,int mi_row,int mi_col)796 static int find_affine_int(int np, const int *pts1, const int *pts2,
797 BLOCK_SIZE bsize, int mvy, int mvx,
798 WarpedMotionParams *wm, int mi_row, int mi_col) {
799 int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
800 int32_t Bx[2] = { 0, 0 };
801 int32_t By[2] = { 0, 0 };
802
803 const int bw = block_size_wide[bsize];
804 const int bh = block_size_high[bsize];
805 const int rsuy = bh / 2 - 1;
806 const int rsux = bw / 2 - 1;
807 const int suy = rsuy * 8;
808 const int sux = rsux * 8;
809 const int duy = suy + mvy;
810 const int dux = sux + mvx;
811
812 // Assume the center pixel of the block has exactly the same motion vector
813 // as transmitted for the block. First shift the origin of the source
814 // points to the block center, and the origin of the destination points to
815 // the block center added to the motion vector transmitted.
816 // Let (xi, yi) denote the source points and (xi', yi') denote destination
817 // points after origin shfifting, for i = 0, 1, 2, .... n-1.
818 // Then if P = [x0, y0,
819 // x1, y1
820 // x2, y1,
821 // ....
822 // ]
823 // q = [x0', x1', x2', ... ]'
824 // r = [y0', y1', y2', ... ]'
825 // the least squares problems that need to be solved are:
826 // [h1, h2]' = inv(P'P)P'q and
827 // [h3, h4]' = inv(P'P)P'r
828 // where the affine transformation is given by:
829 // x' = h1.x + h2.y
830 // y' = h3.x + h4.y
831 //
832 // The loop below computes: A = P'P, Bx = P'q, By = P'r
833 // We need to just compute inv(A).Bx and inv(A).By for the solutions.
834 // Contribution from neighbor block
835 for (int i = 0; i < np; i++) {
836 const int dx = pts2[i * 2] - dux;
837 const int dy = pts2[i * 2 + 1] - duy;
838 const int sx = pts1[i * 2] - sux;
839 const int sy = pts1[i * 2 + 1] - suy;
840 // (TODO)yunqing: This comparison wouldn't be necessary if the sample
841 // selection is done in find_samples(). Also, global offset can be removed
842 // while collecting samples.
843 if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
844 A[0][0] += LS_SQUARE(sx);
845 A[0][1] += LS_PRODUCT1(sx, sy);
846 A[1][1] += LS_SQUARE(sy);
847 Bx[0] += LS_PRODUCT2(sx, dx);
848 Bx[1] += LS_PRODUCT1(sy, dx);
849 By[0] += LS_PRODUCT1(sx, dy);
850 By[1] += LS_PRODUCT2(sy, dy);
851 }
852 }
853
854 // Just for debugging, and can be removed later.
855 assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
856 assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
857 assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
858 assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
859 assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
860 assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
861 assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);
862
863 // Compute Determinant of A
864 const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
865 if (Det == 0) return 1;
866
867 int16_t shift;
868 int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
869 shift -= WARPEDMODEL_PREC_BITS;
870 if (shift < 0) {
871 iDet <<= (-shift);
872 shift = 0;
873 }
874
875 int64_t Px[2], Py[2];
876 // These divided by the Det, are the least squares solutions
877 Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
878 Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
879 Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
880 Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];
881
882 wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
883 wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
884 wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
885 wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);
886
887 const int isuy = (mi_row * MI_SIZE + rsuy);
888 const int isux = (mi_col * MI_SIZE + rsux);
889 // Note: In the vx, vy expressions below, the max value of each of the
890 // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
891 // for the first term so that the overall sum in the worst case fits
892 // within 32 bits overall.
893 const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
894 (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
895 isuy * wm->wmmat[3]);
896 const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
897 (isux * wm->wmmat[4] +
898 isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
899 wm->wmmat[0] =
900 clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
901 wm->wmmat[1] =
902 clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
903 return 0;
904 }
905
av1_find_projection(int np,const int * pts1,const int * pts2,BLOCK_SIZE bsize,int mvy,int mvx,WarpedMotionParams * wm_params,int mi_row,int mi_col)906 int av1_find_projection(int np, const int *pts1, const int *pts2,
907 BLOCK_SIZE bsize, int mvy, int mvx,
908 WarpedMotionParams *wm_params, int mi_row, int mi_col) {
909 assert(wm_params->wmtype == AFFINE);
910
911 if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
912 mi_col))
913 return 1;
914
915 // check compatibility with the fast warp filter
916 if (!av1_get_shear_params(wm_params)) return 1;
917
918 return 0;
919 }
920