1 /*
2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <memory.h>
15 #include <math.h>
16 #include <assert.h>
17
18 #include "config/av1_rtcd.h"
19
20 #include "av1/common/warped_motion.h"
21 #include "av1/common/scale.h"
22
23 #define WARP_ERROR_BLOCK 32
24
25 /* clang-format off */
// Lookup table for the pow-0.7 error measure used by error_measure() and
// highbd_error_measure().  Indexed by (255 + err) for err in [-255, 256]:
// entry 255 (err == 0) is 0 and the table grows symmetrically towards
// 16384 (== 2^14) as |err| approaches 256, approximating
// 2^14 * (|err| / 256)^0.7.  The final entry (index 511) duplicates 16384
// so that the two-point interpolation in highbd_error_measure() can safely
// read index (256 + e1) when e1 == 255.
static const int error_measure_lut[512] = {
  // pow 0.7
  16384, 16339, 16294, 16249, 16204, 16158, 16113, 16068,
  16022, 15977, 15932, 15886, 15840, 15795, 15749, 15703,
  15657, 15612, 15566, 15520, 15474, 15427, 15381, 15335,
  15289, 15242, 15196, 15149, 15103, 15056, 15010, 14963,
  14916, 14869, 14822, 14775, 14728, 14681, 14634, 14587,
  14539, 14492, 14445, 14397, 14350, 14302, 14254, 14206,
  14159, 14111, 14063, 14015, 13967, 13918, 13870, 13822,
  13773, 13725, 13676, 13628, 13579, 13530, 13481, 13432,
  13383, 13334, 13285, 13236, 13187, 13137, 13088, 13038,
  12988, 12939, 12889, 12839, 12789, 12739, 12689, 12639,
  12588, 12538, 12487, 12437, 12386, 12335, 12285, 12234,
  12183, 12132, 12080, 12029, 11978, 11926, 11875, 11823,
  11771, 11719, 11667, 11615, 11563, 11511, 11458, 11406,
  11353, 11301, 11248, 11195, 11142, 11089, 11036, 10982,
  10929, 10875, 10822, 10768, 10714, 10660, 10606, 10552,
  10497, 10443, 10388, 10333, 10279, 10224, 10168, 10113,
  10058, 10002, 9947, 9891, 9835, 9779, 9723, 9666,
  9610, 9553, 9497, 9440, 9383, 9326, 9268, 9211,
  9153, 9095, 9037, 8979, 8921, 8862, 8804, 8745,
  8686, 8627, 8568, 8508, 8449, 8389, 8329, 8269,
  8208, 8148, 8087, 8026, 7965, 7903, 7842, 7780,
  7718, 7656, 7593, 7531, 7468, 7405, 7341, 7278,
  7214, 7150, 7086, 7021, 6956, 6891, 6826, 6760,
  6695, 6628, 6562, 6495, 6428, 6361, 6293, 6225,
  6157, 6089, 6020, 5950, 5881, 5811, 5741, 5670,
  5599, 5527, 5456, 5383, 5311, 5237, 5164, 5090,
  5015, 4941, 4865, 4789, 4713, 4636, 4558, 4480,
  4401, 4322, 4242, 4162, 4080, 3998, 3916, 3832,
  3748, 3663, 3577, 3490, 3402, 3314, 3224, 3133,
  3041, 2948, 2854, 2758, 2661, 2562, 2461, 2359,
  2255, 2148, 2040, 1929, 1815, 1698, 1577, 1452,
  // Index 255 corresponds to err == 0.
  1323, 1187, 1045, 894, 731, 550, 339, 0,
  339, 550, 731, 894, 1045, 1187, 1323, 1452,
  1577, 1698, 1815, 1929, 2040, 2148, 2255, 2359,
  2461, 2562, 2661, 2758, 2854, 2948, 3041, 3133,
  3224, 3314, 3402, 3490, 3577, 3663, 3748, 3832,
  3916, 3998, 4080, 4162, 4242, 4322, 4401, 4480,
  4558, 4636, 4713, 4789, 4865, 4941, 5015, 5090,
  5164, 5237, 5311, 5383, 5456, 5527, 5599, 5670,
  5741, 5811, 5881, 5950, 6020, 6089, 6157, 6225,
  6293, 6361, 6428, 6495, 6562, 6628, 6695, 6760,
  6826, 6891, 6956, 7021, 7086, 7150, 7214, 7278,
  7341, 7405, 7468, 7531, 7593, 7656, 7718, 7780,
  7842, 7903, 7965, 8026, 8087, 8148, 8208, 8269,
  8329, 8389, 8449, 8508, 8568, 8627, 8686, 8745,
  8804, 8862, 8921, 8979, 9037, 9095, 9153, 9211,
  9268, 9326, 9383, 9440, 9497, 9553, 9610, 9666,
  9723, 9779, 9835, 9891, 9947, 10002, 10058, 10113,
  10168, 10224, 10279, 10333, 10388, 10443, 10497, 10552,
  10606, 10660, 10714, 10768, 10822, 10875, 10929, 10982,
  11036, 11089, 11142, 11195, 11248, 11301, 11353, 11406,
  11458, 11511, 11563, 11615, 11667, 11719, 11771, 11823,
  11875, 11926, 11978, 12029, 12080, 12132, 12183, 12234,
  12285, 12335, 12386, 12437, 12487, 12538, 12588, 12639,
  12689, 12739, 12789, 12839, 12889, 12939, 12988, 13038,
  13088, 13137, 13187, 13236, 13285, 13334, 13383, 13432,
  13481, 13530, 13579, 13628, 13676, 13725, 13773, 13822,
  13870, 13918, 13967, 14015, 14063, 14111, 14159, 14206,
  14254, 14302, 14350, 14397, 14445, 14492, 14539, 14587,
  14634, 14681, 14728, 14775, 14822, 14869, 14916, 14963,
  15010, 15056, 15103, 15149, 15196, 15242, 15289, 15335,
  15381, 15427, 15474, 15520, 15566, 15612, 15657, 15703,
  15749, 15795, 15840, 15886, 15932, 15977, 16022, 16068,
  // Duplicate final 16384: overflow guard for interpolation at e1 == 255.
  16113, 16158, 16204, 16249, 16294, 16339, 16384, 16384,
};
93 /* clang-format on */
94
95 // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
96 // at a time. The zoom/rotation/shear in the model are applied to the
97 // "fractional" position of each pixel, which therefore varies within
98 // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
99 // We need an extra 2 taps to fit this in, for a total of 8 taps.
100 /* clang-format off */
// 8-tap warp filter coefficients, one row per fractional offset.  Rows are
// grouped by the integer part of the offset ([-1, 0), [0, 1), [1, 2)), with
// WARPEDPIXEL_PREC_SHIFTS fractional positions per group and one trailing
// dummy row (a copy of the last real row) so that rounding in the offset
// computation can safely index one past the final position.  Each row of 8
// coefficients sums to 128 (== 1 << FILTER_BITS).
const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
#if WARPEDPIXEL_PREC_BITS == 6
  // [-1, 0)
  { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
  { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 },
  { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 },
  { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 },
  { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 },
  { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 },
  { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 },
  { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 },
  { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 },
  { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 },
  { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 },
  { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 },
  { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 },
  { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 },
  { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 },
  { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 },
  { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 },
  { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 },
  { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 },
  { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 },
  { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 },
  { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 },
  { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 },
  { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 },
  { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 },
  { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 },
  { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 },
  { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 },
  { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 },
  { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 },
  { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 },
  { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 },

  // [0, 1)
  { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0},
  { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0},
  { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1},
  {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1},
  {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1},
  {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1},
  {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1},
  {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1},
  {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2},
  {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2},
  {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2},
  {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2},
  {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2},
  {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2},
  {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2},
  {-2, 8, -21, 84, 74, -21, 8, -2}, {-2, 8, -22, 82, 77, -21, 8, -2},
  {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 77, 82, -22, 8, -2},
  {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2},
  {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2},
  {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2},
  {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2},
  {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2},
  {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2},
  {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2},
  {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2},
  {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1},
  {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2},
  {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1},
  {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1},
  {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1},
  { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0},
  { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0},

  // [1, 2)
  { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 },
  { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 },
  { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 },
  { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 },
  { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 },
  { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 },
  { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 },
  { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 },
  { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 },
  { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 },
  { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 },
  { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 },
  { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 },
  { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 },
  { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 },
  { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 },
  { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 },
  { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 },
  { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 },
  { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 },
  { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 },
  { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 },
  { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 },
  { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 },
  { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 },
  { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 },
  { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 },
  { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 },
  { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 },
  { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 },
  { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 },
  { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 },
  // dummy (replicate row index 191)
  { 0, 0, 0, 0, 2, 127, - 1, 0 },

#elif WARPEDPIXEL_PREC_BITS == 5
  // [-1, 0)
  {0, 0, 127, 1, 0, 0, 0, 0}, {1, -3, 127, 4, -1, 0, 0, 0},
  {1, -5, 126, 8, -3, 1, 0, 0}, {1, -7, 124, 13, -4, 1, 0, 0},
  {2, -9, 122, 18, -6, 1, 0, 0}, {2, -11, 120, 22, -7, 2, 0, 0},
  {3, -13, 117, 27, -8, 2, 0, 0}, {3, -14, 114, 32, -10, 3, 0, 0},
  {3, -15, 111, 37, -11, 3, 0, 0}, {3, -16, 108, 42, -12, 3, 0, 0},
  {4, -17, 104, 47, -13, 3, 0, 0}, {4, -17, 100, 52, -14, 3, 0, 0},
  {4, -18, 96, 58, -15, 3, 0, 0}, {4, -18, 91, 63, -16, 4, 0, 0},
  {4, -18, 87, 68, -17, 4, 0, 0}, {4, -18, 82, 73, -17, 4, 0, 0},
  {4, -18, 78, 78, -18, 4, 0, 0}, {4, -17, 73, 82, -18, 4, 0, 0},
  {4, -17, 68, 87, -18, 4, 0, 0}, {4, -16, 63, 91, -18, 4, 0, 0},
  {3, -15, 58, 96, -18, 4, 0, 0}, {3, -14, 52, 100, -17, 4, 0, 0},
  {3, -13, 47, 104, -17, 4, 0, 0}, {3, -12, 42, 108, -16, 3, 0, 0},
  {3, -11, 37, 111, -15, 3, 0, 0}, {3, -10, 32, 114, -14, 3, 0, 0},
  {2, -8, 27, 117, -13, 3, 0, 0}, {2, -7, 22, 120, -11, 2, 0, 0},
  {1, -6, 18, 122, -9, 2, 0, 0}, {1, -4, 13, 124, -7, 1, 0, 0},
  {1, -3, 8, 126, -5, 1, 0, 0}, {0, -1, 4, 127, -3, 1, 0, 0},
  // [0, 1)
  { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 1, -3, 127, 4, -2, 1, 0},
  { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 3, -8, 125, 13, -5, 2, -1},
  {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -13, 121, 23, -8, 3, -1},
  {-1, 5, -15, 119, 27, -10, 4, -1}, {-2, 6, -17, 116, 33, -12, 5, -1},
  {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 110, 43, -15, 6, -2},
  {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 102, 54, -17, 7, -2},
  {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 94, 64, -19, 7, -2},
  {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -21, 84, 74, -21, 8, -2},
  {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 74, 84, -21, 8, -2},
  {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 7, -19, 64, 94, -22, 8, -2},
  {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -17, 54, 102, -21, 7, -2},
  {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 43, 110, -19, 7, -2},
  {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 33, 116, -17, 6, -2},
  {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 3, -8, 23, 121, -13, 4, -1},
  {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 2, -5, 13, 125, -8, 3, -1},
  { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 4, 127, -3, 1, 0},
  // [1, 2)
  {0, 0, 0, 1, 127, 0, 0, 0}, {0, 0, 1, -3, 127, 4, -1, 0},
  {0, 0, 1, -5, 126, 8, -3, 1}, {0, 0, 1, -7, 124, 13, -4, 1},
  {0, 0, 2, -9, 122, 18, -6, 1}, {0, 0, 2, -11, 120, 22, -7, 2},
  {0, 0, 3, -13, 117, 27, -8, 2}, {0, 0, 3, -14, 114, 32, -10, 3},
  {0, 0, 3, -15, 111, 37, -11, 3}, {0, 0, 3, -16, 108, 42, -12, 3},
  {0, 0, 4, -17, 104, 47, -13, 3}, {0, 0, 4, -17, 100, 52, -14, 3},
  {0, 0, 4, -18, 96, 58, -15, 3}, {0, 0, 4, -18, 91, 63, -16, 4},
  {0, 0, 4, -18, 87, 68, -17, 4}, {0, 0, 4, -18, 82, 73, -17, 4},
  {0, 0, 4, -18, 78, 78, -18, 4}, {0, 0, 4, -17, 73, 82, -18, 4},
  {0, 0, 4, -17, 68, 87, -18, 4}, {0, 0, 4, -16, 63, 91, -18, 4},
  {0, 0, 3, -15, 58, 96, -18, 4}, {0, 0, 3, -14, 52, 100, -17, 4},
  {0, 0, 3, -13, 47, 104, -17, 4}, {0, 0, 3, -12, 42, 108, -16, 3},
  {0, 0, 3, -11, 37, 111, -15, 3}, {0, 0, 3, -10, 32, 114, -14, 3},
  {0, 0, 2, -8, 27, 117, -13, 3}, {0, 0, 2, -7, 22, 120, -11, 2},
  {0, 0, 1, -6, 18, 122, -9, 2}, {0, 0, 1, -4, 13, 124, -7, 1},
  {0, 0, 1, -3, 8, 126, -5, 1}, {0, 0, 0, -1, 4, 127, -3, 1},
  // dummy (replicate row index 95)
  {0, 0, 0, -1, 4, 127, -3, 1},

#endif  // WARPEDPIXEL_PREC_BITS == 6
};
264
265 /* clang-format on */
266
267 #define DIV_LUT_PREC_BITS 14
268 #define DIV_LUT_BITS 8
269 #define DIV_LUT_NUM (1 << DIV_LUT_BITS)
270
// Reciprocal lookup table used by resolve_divisor_32/64.  Indexed by the top
// DIV_LUT_BITS (8) bits of a divisor's mantissa (after removing its leading
// 1 bit); entries span 1.0 down to 0.5 at DIV_LUT_PREC_BITS (14-bit)
// precision: div_lut[0] == 16384 (== 1 << 14) and div_lut[256] == 8192.
// The extra (DIV_LUT_NUM + 1)-th entry covers the rounded-up index produced
// by ROUND_POWER_OF_TWO in the resolvers.
static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
  9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
  9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
  9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
  9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
  9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
  8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
  8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
  8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
  8240, 8224, 8208, 8192,
};
297
298 // Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
299 // at precision of DIV_LUT_PREC_BITS along with the shift.
resolve_divisor_64(uint64_t D,int16_t * shift)300 static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
301 int64_t f;
302 *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
303 : get_msb((unsigned int)D));
304 // e is obtained from D after resetting the most significant 1 bit.
305 const int64_t e = D - ((uint64_t)1 << *shift);
306 // Get the most significant DIV_LUT_BITS (8) bits of e into f
307 if (*shift > DIV_LUT_BITS)
308 f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
309 else
310 f = e << (DIV_LUT_BITS - *shift);
311 assert(f <= DIV_LUT_NUM);
312 *shift += DIV_LUT_PREC_BITS;
313 // Use f as lookup into the precomputed table of multipliers
314 return div_lut[f];
315 }
316
resolve_divisor_32(uint32_t D,int16_t * shift)317 static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
318 int32_t f;
319 *shift = get_msb(D);
320 // e is obtained from D after resetting the most significant 1 bit.
321 const int32_t e = D - ((uint32_t)1 << *shift);
322 // Get the most significant DIV_LUT_BITS (8) bits of e into f
323 if (*shift > DIV_LUT_BITS)
324 f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
325 else
326 f = e << (DIV_LUT_BITS - *shift);
327 assert(f <= DIV_LUT_NUM);
328 *shift += DIV_LUT_PREC_BITS;
329 // Use f as lookup into the precomputed table of multipliers
330 return div_lut[f];
331 }
332
is_affine_valid(const WarpedMotionParams * const wm)333 static int is_affine_valid(const WarpedMotionParams *const wm) {
334 const int32_t *mat = wm->wmmat;
335 return (mat[2] > 0);
336 }
337
is_affine_shear_allowed(int16_t alpha,int16_t beta,int16_t gamma,int16_t delta)338 static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
339 int16_t delta) {
340 if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
341 (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
342 return 0;
343 else
344 return 1;
345 }
346
347 // Returns 1 on success or 0 on an invalid affine set
get_shear_params(WarpedMotionParams * wm)348 int get_shear_params(WarpedMotionParams *wm) {
349 const int32_t *mat = wm->wmmat;
350 if (!is_affine_valid(wm)) return 0;
351 wm->alpha =
352 clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
353 wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
354 int16_t shift;
355 int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
356 int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
357 wm->gamma =
358 clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
359 v = ((int64_t)mat[3] * mat[4]) * y;
360 wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
361 (1 << WARPEDMODEL_PREC_BITS),
362 INT16_MIN, INT16_MAX);
363
364 wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
365 (1 << WARP_PARAM_REDUCE_BITS);
366 wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
367 (1 << WARP_PARAM_REDUCE_BITS);
368 wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
369 (1 << WARP_PARAM_REDUCE_BITS);
370 wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
371 (1 << WARP_PARAM_REDUCE_BITS);
372
373 if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
374 return 0;
375
376 return 1;
377 }
378
highbd_error_measure(int err,int bd)379 static INLINE int highbd_error_measure(int err, int bd) {
380 const int b = bd - 8;
381 const int bmask = (1 << b) - 1;
382 const int v = (1 << b);
383 err = abs(err);
384 const int e1 = err >> b;
385 const int e2 = err & bmask;
386 return error_measure_lut[255 + e1] * (v - e2) +
387 error_measure_lut[256 + e1] * e2;
388 }
389
390 /* Note: For an explanation of the warp algorithm, and some notes on bit widths
391 for hardware implementations, see the comments above av1_warp_affine_c
392 */
// C reference implementation of the high-bitdepth affine warp filter.
// Warps the region [p_col, p_col + p_width) x [p_row, p_row + p_height) of
// `ref` through the affine model `mat` into `pred`, processing 8x8 blocks
// with a horizontal then a vertical shear pass.  When conv_params->is_compound
// is set, intermediate-precision results are written to / averaged with
// conv_params->dst instead.  See the comments above av1_warp_affine_c (and at
// the end of this file) for the algorithm and bit-width derivations.
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                              int width, int height, int stride, uint16_t *pred,
                              int p_col, int p_row, int p_width, int p_height,
                              int p_stride, int subsampling_x,
                              int subsampling_y, int bd,
                              ConvolveParams *conv_params, int16_t alpha,
                              int16_t beta, int16_t gamma, int16_t delta) {
  // Intermediate buffer: 15 rows (8 output rows + 7 taps of context) of 8
  // horizontally-filtered samples.
  int32_t tmp[15 * 8];
  const int reduce_bits_horiz =
      conv_params->round_0 +
      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      // Split the projected position into integer and fractional parts.
      const int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      const int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Step back from the block center (4, 4) to its top-left sample.
      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      // Drop the low bits so hardware can use reduced-precision offsets.
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        // Clamp the source row: samples outside the frame replicate the edge.
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);
        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          // Offset keeps the accumulator non-negative (see end-of-file note).
          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            const int sample_x = clamp(ix + m, 0, width - 1);
            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            // Compound path: write/average at intermediate precision in the
            // CONV_BUF, optionally distance-weighted, then round to pixels.
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint16_t *dst16 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              // Undo the horizontal + vertical offsets before clipping.
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst16 =
                  clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
            } else {
              *p = sum;
            }
          } else {
            // Single-reference path: round, undo the offsets, clip to bd.
            uint16_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
          }
          sy += gamma;
        }
      }
    }
  }
}
516
// Warps one high-bitdepth plane through the model in `wm`.  For a ROTZOOM
// model only wmmat[2]/wmmat[3] are meaningful, so the bottom row of the 2x2
// matrix is first reconstructed from the rotation/zoom constraint.
static void highbd_warp_plane(WarpedMotionParams *wm, const uint8_t *const ref8,
                              int width, int height, int stride,
                              const uint8_t *const pred8, int p_col, int p_row,
                              int p_width, int p_height, int p_stride,
                              int subsampling_x, int subsampling_y, int bd,
                              ConvolveParams *conv_params) {
  assert(wm->wmtype <= AFFINE);
  if (wm->wmtype == ROTZOOM) {
    wm->wmmat[5] = wm->wmmat[2];
    wm->wmmat[4] = -wm->wmmat[3];
  }
  av1_highbd_warp_affine(wm->wmmat, CONVERT_TO_SHORTPTR(ref8), width, height,
                         stride, CONVERT_TO_SHORTPTR(pred8), p_col, p_row,
                         p_width, p_height, p_stride, subsampling_x,
                         subsampling_y, bd, conv_params, wm->alpha, wm->beta,
                         wm->gamma, wm->delta);
}
541
// Accumulates the pow-0.7 error measure between a p_width x p_height region
// of `dst` and `ref` (each with its own row stride).
static int64_t highbd_frame_error(const uint16_t *const ref, int stride,
                                  const uint16_t *const dst, int p_width,
                                  int p_height, int p_stride, int bd) {
  int64_t total = 0;
  for (int row = 0; row < p_height; ++row) {
    const uint16_t *const ref_row = ref + row * stride;
    const uint16_t *const dst_row = dst + row * p_stride;
    for (int col = 0; col < p_width; ++col) {
      total += highbd_error_measure(dst_row[col] - ref_row[col], bd);
    }
  }
  return total;
}
554
// Warps `ref8` through `wm` one WARP_ERROR_BLOCK tile at a time and sums the
// pow-0.7 error against `dst8`.  Bails out early once the running total
// exceeds `best_error`, so callers can prune bad models cheaply.
static int64_t highbd_warp_error(
    WarpedMotionParams *wm, const uint8_t *const ref8, int width, int height,
    int stride, const uint8_t *const dst8, int p_col, int p_row, int p_width,
    int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd,
    int64_t best_error) {
  int64_t total_error = 0;
  const int blk_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
  const int blk_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
  // Scratch tile for the warped output of one block.
  uint16_t tmp[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];

  ConvolveParams conv_params = get_conv_params(0, 0, bd);
  conv_params.use_dist_wtd_comp_avg = 0;
  for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
    for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
      // avoid warping extra 8x8 blocks in the padded region of the frame
      // when p_width and p_height are not multiples of WARP_ERROR_BLOCK
      const int warp_w = AOMMIN(blk_w, p_col + p_width - j);
      const int warp_h = AOMMIN(blk_h, p_row + p_height - i);
      highbd_warp_plane(wm, ref8, width, height, stride,
                        CONVERT_TO_BYTEPTR(tmp), j, i, warp_w, warp_h,
                        WARP_ERROR_BLOCK, subsampling_x, subsampling_y, bd,
                        &conv_params);

      total_error += highbd_frame_error(
          tmp, WARP_ERROR_BLOCK, CONVERT_TO_SHORTPTR(dst8) + j + i * p_stride,
          warp_w, warp_h, p_stride, bd);
      if (total_error > best_error) return total_error;
    }
  }
  return total_error;
}
586
error_measure(int err)587 static INLINE int error_measure(int err) {
588 return error_measure_lut[255 + err];
589 }
590
591 /* The warp filter for ROTZOOM and AFFINE models works as follows:
592 * Split the input into 8x8 blocks
593 * For each block, project the point (4, 4) within the block, to get the
594 overall block position. Split into integer and fractional coordinates,
595 maintaining full WARPEDMODEL precision
596 * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
597 variable horizontal offset. This means that, while the rows of the
598 intermediate buffer align with the rows of the *reference* image, the
599 columns align with the columns of the *destination* image.
600 * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
601 destination is too small we crop the output at this stage). Each pixel has
602 a variable vertical offset, so that the resulting rows are aligned with
603 the rows of the destination image.
604
605 To accomplish these alignments, we factor the warp matrix as a
606 product of two shear / asymmetric zoom matrices:
607 / a b \ = / 1 0 \ * / 1+alpha beta \
608 \ c d / \ gamma 1+delta / \ 0 1 /
609 where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
610 The horizontal shear (with alpha and beta) is applied first,
611 then the vertical shear (with gamma and delta) is applied second.
612
613 The only limitation is that, to fit this in a fixed 8-tap filter size,
614 the fractional pixel offsets must be at most +-1. Since the horizontal filter
615 generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
616 within the block, the parameters must satisfy
617 4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 4 * |delta| <= 1
618 for this filter to be applicable.
619
620 Note: This function assumes that the caller has done all of the relevant
621 checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
622 are set appropriately (if using a ROTZOOM model), and that alpha, beta,
623 gamma, delta are all in range.
624
625 TODO(david.barker): Maybe support scaled references?
626 */
627 /* A note on hardware implementation:
628 The warp filter is intended to be implementable using the same hardware as
629 the high-precision convolve filters from the loop-restoration and
630 convolve-round experiments.
631
632 For a single filter stage, considering all of the coefficient sets for the
633 warp filter and the regular convolution filter, an input in the range
634 [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
635 before rounding.
636
637 Allowing for some changes to the filter coefficient sets, call the range
638 [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
639 we can replace this by the range [0, 256 * 2^k], which can be stored in an
640 unsigned value with 8 + k bits.
641
642 This allows the derivation of the appropriate bit widths and offsets for
643 the various intermediate values: If
644
645 F := FILTER_BITS = 7 (or else the above ranges need adjusting)
646 So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
647 intermediate value.
648 H := ROUND0_BITS
649 V := VERSHEAR_REDUCE_PREC_BITS
650 (and note that we must have H + V = 2*F for the output to have the same
651 scale as the input)
652
653 then we end up with the following offsets and ranges:
654 Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
655 uint{bd + F + 1}
656 After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
657 Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
658 uint{bd + 2*F + 2 - H}
659 After rounding: The final value, before undoing the offset, fits into a
660 uint{bd + 2}.
661
662 Then we need to undo the offsets before clamping to a pixel. Note that,
663 if we do this at the end, the amount to subtract is actually independent
664 of H and V:
665
666 offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
667 (1 << ((bd + 2*F - H) - V))
668 == (1 << (bd - 1)) + (1 << bd)
669
670 This allows us to entirely avoid clamping in both the warp filter and
671 the convolve-round experiment. As of the time of writing, the Wiener filter
672 from loop-restoration can encode a central coefficient up to 216, which
673 leads to a maximum value of about 282 * 2^k after applying the offset.
674 So in that case we still need to clamp.
675 */
// C reference implementation of the warp filter described in the two large
// comments above. Operates on 8x8 output tiles: for each tile, project the
// tile center through the warp matrix, then apply the horizontal shear
// (alpha, beta) producing a 15x8 intermediate buffer, followed by the
// vertical shear (gamma, delta) producing the output tile.
//
// The caller must have verified that the model is ROTZOOM or AFFINE and that
// alpha/beta/gamma/delta satisfy the range constraints described above.
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                       int height, int stride, uint8_t *pred, int p_col,
                       int p_row, int p_width, int p_height, int p_stride,
                       int subsampling_x, int subsampling_y,
                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
                       int16_t gamma, int16_t delta) {
  int32_t tmp[15 * 8];  // Horizontal-pass output: 15 rows x 8 columns.
  const int bd = 8;     // This path handles 8-bit pixels only.
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;  // Only referenced by the assert below.
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      // Split the projected position into integer and fractional
      // (WARPEDMODEL precision) parts.
      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Move the fractional offsets from the block center (4, 4) to the
      // block's top-left sample.
      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      // Round the offsets down to a multiple of (1 << WARP_PARAM_REDUCE_BITS).
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        // Clamp to top/bottom edge of the frame
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);

        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // At this point, sx = sx4 + alpha * l + beta * k
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            // Clamp to left/right edge of the frame
            const int sample_x = clamp(ix + m, 0, width - 1);

            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          // At this point, sy = sy4 + gamma * l + delta * k
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            // Compound path: store (or average) into the CONV_BUF_TYPE
            // intermediate buffer.
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint8_t *dst8 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                // Distance-weighted blend of the two predictions.
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              // Undo the accumulated offsets before rounding to a pixel
              // (see the bit-width analysis in the comment above).
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
            } else {
              *p = sum;
            }
          } else {
            // Single-reference path: round, remove the offset
            // (1 << (bd - 1)) + (1 << bd), and clamp to an 8-bit pixel.
            uint8_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
          }
          sy += gamma;
        }
      }
    }
  }
}
803
// Warp an 8-bit plane with the given motion model. For a ROTZOOM model,
// the lower row of the 2x2 matrix is completed from the transmitted
// parameters before dispatching to the affine warp filter.
static void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref,
                       int width, int height, int stride, uint8_t *pred,
                       int p_col, int p_row, int p_width, int p_height,
                       int p_stride, int subsampling_x, int subsampling_y,
                       ConvolveParams *conv_params) {
  assert(wm->wmtype <= AFFINE);
  if (wm->wmtype == ROTZOOM) {
    // Derive wmmat[4], wmmat[5] so the affine path can be reused as-is.
    wm->wmmat[5] = wm->wmmat[2];
    wm->wmmat[4] = -wm->wmmat[3];
  }
  av1_warp_affine(wm->wmmat, ref, width, height, stride, pred, p_col, p_row,
                  p_width, p_height, p_stride, subsampling_x, subsampling_y,
                  conv_params, wm->alpha, wm->beta, wm->gamma, wm->delta);
}
823
// Accumulate the per-pixel distortion between dst and ref over a
// p_width x p_height block, mapping each pixel difference through
// error_measure().
static int64_t frame_error(const uint8_t *const ref, int stride,
                           const uint8_t *const dst, int p_width, int p_height,
                           int p_stride) {
  int64_t total = 0;
  for (int row = 0; row < p_height; ++row) {
    const uint8_t *const ref_row = ref + row * stride;
    const uint8_t *const dst_row = dst + row * p_stride;
    for (int col = 0; col < p_width; ++col) {
      total += (int64_t)error_measure(dst_row[col] - ref_row[col]);
    }
  }
  return total;
}
836
// Measure the distortion of warping ref with model wm against dst, block by
// block. Returns early once the accumulated error exceeds best_error.
static int64_t warp_error(WarpedMotionParams *wm, const uint8_t *const ref,
                          int width, int height, int stride,
                          const uint8_t *const dst, int p_col, int p_row,
                          int p_width, int p_height, int p_stride,
                          int subsampling_x, int subsampling_y,
                          int64_t best_error) {
  int64_t accum = 0;
  const int blk_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
  const int blk_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
  uint8_t warped[WARP_ERROR_BLOCK * WARP_ERROR_BLOCK];
  ConvolveParams conv_params = get_conv_params(0, 0, 8);
  conv_params.use_dist_wtd_comp_avg = 0;

  for (int i = p_row; i < p_row + p_height; i += WARP_ERROR_BLOCK) {
    for (int j = p_col; j < p_col + p_width; j += WARP_ERROR_BLOCK) {
      // Clip the block so we never warp into the padded region when
      // p_width / p_height are not multiples of WARP_ERROR_BLOCK.
      const int warp_w = AOMMIN(blk_w, p_col + p_width - j);
      const int warp_h = AOMMIN(blk_h, p_row + p_height - i);
      warp_plane(wm, ref, width, height, stride, warped, j, i, warp_w, warp_h,
                 WARP_ERROR_BLOCK, subsampling_x, subsampling_y, &conv_params);
      accum += frame_error(warped, WARP_ERROR_BLOCK, dst + j + i * p_stride,
                           warp_w, warp_h, p_stride);
      // Early out: this model can no longer beat the best one seen so far.
      if (accum > best_error) return accum;
    }
  }
  return accum;
}
867
// Dispatch frame-error computation to the high-bitdepth or 8-bit path.
int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
                        uint8_t *dst, int p_width, int p_height, int p_stride) {
  if (!use_hbd) {
    return frame_error(ref, stride, dst, p_width, p_height, p_stride);
  }
  return highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
                            CONVERT_TO_SHORTPTR(dst), p_width, p_height,
                            p_stride, bd);
}
877
// Compute the warp distortion for model wm, dispatching to the high-bitdepth
// or 8-bit path. Returns 1 when the shear parameters cannot be derived.
int64_t av1_warp_error(WarpedMotionParams *wm, int use_hbd, int bd,
                       const uint8_t *ref, int width, int height, int stride,
                       uint8_t *dst, int p_col, int p_row, int p_width,
                       int p_height, int p_stride, int subsampling_x,
                       int subsampling_y, int64_t best_error) {
  if (wm->wmtype <= AFFINE && !get_shear_params(wm)) return 1;
  if (use_hbd) {
    return highbd_warp_error(wm, ref, width, height, stride, dst, p_col, p_row,
                             p_width, p_height, p_stride, subsampling_x,
                             subsampling_y, bd, best_error);
  }
  return warp_error(wm, ref, width, height, stride, dst, p_col, p_row, p_width,
                    p_height, p_stride, subsampling_x, subsampling_y,
                    best_error);
}
893
// Warp a plane with model wm, dispatching to the high-bitdepth or 8-bit
// implementation.
void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
                    const uint8_t *ref, int width, int height, int stride,
                    uint8_t *pred, int p_col, int p_row, int p_width,
                    int p_height, int p_stride, int subsampling_x,
                    int subsampling_y, ConvolveParams *conv_params) {
  if (!use_hbd) {
    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
    return;
  }
  highbd_warp_plane(wm, ref, width, height, stride, pred, p_col, p_row,
                    p_width, p_height, p_stride, subsampling_x, subsampling_y,
                    bd, conv_params);
}
907
#define LS_MV_MAX 256  // max mv in 1/8-pel units
// Use LS_STEP = 8 so that 2 fewer bits are needed for A, Bx, By.
#define LS_STEP 8

// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
// the precision needed is:
//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
//   1 [for sign] +
//   LEAST_SQUARES_SAMPLES_MAX_BITS
//        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
// The value is 23
#define LS_MAT_RANGE_BITS \
  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)

// Bit-depth reduction from the full range above
#define LS_MAT_DOWN_BITS 2

// Bit range of A, Bx and By after downshifting
#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)

// Because LS_STEP = 8, the least-significant 2 bits of every element of
// A, Bx, By are 0, so the macros below can shift by an extra 2 bits on top
// of LS_MAT_DOWN_BITS without losing information.
#define LS_SQUARE(a)                                          \
  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT1(a, b)                                           \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT2(a, b)                                               \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))

// Select between the limited-precision (16-bit multiplier) and full 64-bit
// implementations of get_mult_shift_{diag,ndiag} below.
#define USE_LIMITED_PREC_MULT 0
944
945 #if USE_LIMITED_PREC_MULT
946
947 #define MUL_PREC_BITS 16
resolve_multiplier_64(uint64_t D,int16_t * shift)948 static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
949 int msb = 0;
950 uint16_t mult = 0;
951 *shift = 0;
952 if (D != 0) {
953 msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
954 : get_msb((unsigned int)D));
955 if (msb >= MUL_PREC_BITS) {
956 mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
957 *shift = msb + 1 - MUL_PREC_BITS;
958 } else {
959 mult = (uint16_t)D;
960 *shift = 0;
961 }
962 }
963 return mult;
964 }
965
get_mult_shift_ndiag(int64_t Px,int16_t iDet,int shift)966 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
967 int32_t ret;
968 int16_t mshift;
969 uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
970 int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
971 shift -= mshift;
972 if (shift > 0) {
973 return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
974 -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
975 WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
976 } else {
977 return (int32_t)clamp(v * (1 << (-shift)),
978 -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
979 WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
980 }
981 return ret;
982 }
983
get_mult_shift_diag(int64_t Px,int16_t iDet,int shift)984 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
985 int16_t mshift;
986 uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
987 int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
988 shift -= mshift;
989 if (shift > 0) {
990 return (int32_t)clamp(
991 ROUND_POWER_OF_TWO_SIGNED(v, shift),
992 (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
993 (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
994 } else {
995 return (int32_t)clamp(
996 v * (1 << (-shift)),
997 (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
998 (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
999 }
1000 }
1001
1002 #else
1003
get_mult_shift_ndiag(int64_t Px,int16_t iDet,int shift)1004 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
1005 int64_t v = Px * (int64_t)iDet;
1006 return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
1007 -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1008 WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1009 }
1010
get_mult_shift_diag(int64_t Px,int16_t iDet,int shift)1011 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
1012 int64_t v = Px * (int64_t)iDet;
1013 return (int32_t)clamp64(
1014 ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
1015 (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
1016 (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
1017 }
1018 #endif // USE_LIMITED_PREC_MULT
1019
// Fit an integer affine model (wmmat[2..5] plus the translation wmmat[0..1])
// to the np sample correspondences pts1 -> pts2 by least squares, anchored
// so that the block center moves exactly by the transmitted (mvx, mvy).
// Returns 0 on success, 1 when the normal-equation matrix is singular.
static int find_affine_int(int np, const int *pts1, const int *pts2,
                           BLOCK_SIZE bsize, int mvy, int mvx,
                           WarpedMotionParams *wm, int mi_row, int mi_col) {
  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
  int32_t Bx[2] = { 0, 0 };
  int32_t By[2] = { 0, 0 };
  int i;

  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  // Block-center offsets, in full-pel (rsux/rsuy) and 1/8-pel (sux/suy).
  const int rsuy = (AOMMAX(bh, MI_SIZE) / 2 - 1);
  const int rsux = (AOMMAX(bw, MI_SIZE) / 2 - 1);
  const int suy = rsuy * 8;
  const int sux = rsux * 8;
  // Destination-side center: source center moved by the transmitted mv.
  const int duy = suy + mvy;
  const int dux = sux + mvx;
  // Block-center position in full-pel frame coordinates.
  const int isuy = (mi_row * MI_SIZE + rsuy);
  const int isux = (mi_col * MI_SIZE + rsux);

  // Assume the center pixel of the block has exactly the same motion vector
  // as transmitted for the block. First shift the origin of the source
  // points to the block center, and the origin of the destination points to
  // the block center added to the motion vector transmitted.
  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  // points after origin shifting, for i = 0, 1, 2, .... n-1.
  // Then if P = [x0, y0,
  //              x1, y1
  //              x2, y1,
  //               ....
  //             ]
  //      q = [x0', x1', x2', ... ]'
  //      r = [y0', y1', y2', ... ]'
  // the least squares problems that need to be solved are:
  //      [h1, h2]' = inv(P'P)P'q and
  //      [h3, h4]' = inv(P'P)P'r
  // where the affine transformation is given by:
  //      x' = h1.x + h2.y
  //      y' = h3.x + h4.y
  //
  // The loop below computes: A = P'P, Bx = P'q, By = P'r
  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
  // Contribution from neighbor block
  for (i = 0; i < np; i++) {
    const int dx = pts2[i * 2] - dux;
    const int dy = pts2[i * 2 + 1] - duy;
    const int sx = pts1[i * 2] - sux;
    const int sy = pts1[i * 2 + 1] - suy;
    // (TODO)yunqing: This comparison wouldn't be necessary if the sample
    // selection is done in find_samples(). Also, global offset can be removed
    // while collecting samples.
    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
      A[0][0] += LS_SQUARE(sx);
      A[0][1] += LS_PRODUCT1(sx, sy);
      A[1][1] += LS_SQUARE(sy);
      Bx[0] += LS_PRODUCT2(sx, dx);
      Bx[1] += LS_PRODUCT1(sy, dx);
      By[0] += LS_PRODUCT1(sx, dy);
      By[1] += LS_PRODUCT2(sy, dy);
    }
  }

  // Just for debugging, and can be removed later.
  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);

  int64_t Det;
  int16_t iDet, shift;

  // Compute Determinant of A
  Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  if (Det == 0) return 1;
  // iDet approximates 1/Det as a fixed-point multiplier-and-shift pair.
  iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  shift -= WARPEDMODEL_PREC_BITS;
  if (shift < 0) {
    iDet <<= (-shift);
    shift = 0;
  }

  int64_t Px[2], Py[2];

  // These divided by the Det, are the least squares solutions
  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];

  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);

  // Derive the translation so the block center maps as transmitted.
  // Note: In the vx, vy expressions below, the max value of each of the
  // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
  // for the first term so that the overall sum in the worst case fits
  // within 32 bits overall.
  int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
               (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
                isuy * wm->wmmat[3]);
  int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
               (isux * wm->wmmat[4] +
                isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  wm->wmmat[0] =
      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  wm->wmmat[1] =
      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);

  // wmmat[6], wmmat[7] are unused by an affine model.
  wm->wmmat[6] = wm->wmmat[7] = 0;
  return 0;
}
1134
// Fit an affine model to the given sample correspondences and verify it can
// be realized by the shear-based warp filter. Returns 0 on success, 1 on
// failure (singular fit or out-of-range shear parameters).
int find_projection(int np, int *pts1, int *pts2, BLOCK_SIZE bsize, int mvy,
                    int mvx, WarpedMotionParams *wm_params, int mi_row,
                    int mi_col) {
  assert(wm_params->wmtype == AFFINE);

  const int fit_failed = find_affine_int(np, pts1, pts2, bsize, mvy, mvx,
                                         wm_params, mi_row, mi_col);
  if (fit_failed) return 1;

  // The fitted model must also decompose into horizontal + vertical shears
  // for the fast warp filter to be applicable.
  return get_shear_params(wm_params) ? 0 : 1;
}
1149