/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <math.h>
#include <assert.h>

#include "config/av1_rtcd.h"

#include "av1/common/av1_common_int.h"
#include "av1/common/warped_motion.h"
#include "av1/common/scale.h"

// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
// at a time. The zoom/rotation/shear in the model are applied to the
// "fractional" position of each pixel, which therefore varies within
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
// We need an extra 2 taps to fit this in, for a total of 8 taps.
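// For example, with WARPEDPIXEL_PREC_SHIFTS = 64, a fractional position of
// -64 (i.e. -1.0 pel) selects row 0 of the table below, 0 selects row 64,
// and +127 selects row 191; the final row (index 192) is a dummy that
// replicates row 191 so that a fractional position which rounds up never
// indexes out of bounds.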
/* clang-format off */
const WarpedFilterCoeff av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1]
                                         [8] = {
  // [-1, 0)
  { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
  { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
  { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
  { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
  { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
  { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
  { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
  { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
  { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
  { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
  { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
  { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
  { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
  { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
  { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
  { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
  { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
  { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
  { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
  { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
  { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
  { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
  { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
  { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
  { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
  { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
  { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
  { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
  { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
  { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
  { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
  { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },

  // [0, 1)
  { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
  { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
  { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
  {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
  {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
  {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
  {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
  {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
  {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
  {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
  {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
  {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
  {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
  {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
  {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
  {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
  {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
  {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
  {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
  {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
  {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
  {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
  {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
  {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
  {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
  {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
  {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
  {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
  {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
  {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
  { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
  { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},

  // [1, 2)
  { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
  { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
  { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
  { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
  { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
  { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
  { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
  { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
  { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
  { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
  { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
  { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
  { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
  { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
  { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
  { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
  { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
  { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
  { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
  { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
  { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
  { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
  { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
  { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
  { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
  { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
  { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
  { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
  { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
  { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
  { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
  { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
  // dummy (replicate row index 191)
  { 0, 0, 0,   0,   2, 127, - 1, 0 },
};

/* clang-format on */
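
// Sanity-check sketch (illustrative only, not part of the library): every
// row of av1_warped_filter sums to 128 (i.e. 1 << FILTER_BITS), so each
// 8-tap kernel has unit DC gain. A hypothetical helper to verify this:
//
//   static void check_warped_filter_gain(void) {
//     for (int i = 0; i < WARPEDPIXEL_PREC_SHIFTS * 3 + 1; ++i) {
//       int sum = 0;
//       for (int m = 0; m < 8; ++m) sum += av1_warped_filter[i][m];
//       assert(sum == (1 << FILTER_BITS));
//     }
//   }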

#define DIV_LUT_PREC_BITS 14
#define DIV_LUT_BITS 8
#define DIV_LUT_NUM (1 << DIV_LUT_BITS)

static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
  9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
  9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
  9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
  9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
  9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
  8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
  8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
  8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
  8240,  8224,  8208,  8192,
};
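
// The table above can be regenerated with a sketch like the following
// (illustrative only; assumes entry i is the rounded value of
// 2^(DIV_LUT_PREC_BITS + DIV_LUT_BITS) / (DIV_LUT_NUM + i), so that
// div_lut[0] = 2^14 and div_lut[DIV_LUT_NUM] = 2^13):
//
//   static void build_div_lut(uint16_t lut[DIV_LUT_NUM + 1]) {
//     for (int i = 0; i <= DIV_LUT_NUM; ++i) {
//       const uint32_t d = DIV_LUT_NUM + i;
//       // Round-to-nearest division: round(2^22 / d)
//       lut[i] = (uint16_t)(((1u << (DIV_LUT_PREC_BITS + DIV_LUT_BITS)) +
//                            (d >> 1)) / d);
//     }
//   }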

// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
// at precision of DIV_LUT_PREC_BITS along with the shift.
static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
  int64_t f;
  *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                               : get_msb((unsigned int)D));
  // e is obtained from D after resetting the most significant 1 bit.
  const int64_t e = D - ((uint64_t)1 << *shift);
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  if (*shift > DIV_LUT_BITS)
    f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
  else
    f = e << (DIV_LUT_BITS - *shift);
  assert(f <= DIV_LUT_NUM);
  *shift += DIV_LUT_PREC_BITS;
  // Use f as lookup into the precomputed table of multipliers
  return div_lut[f];
}

static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
  int32_t f;
  *shift = get_msb(D);
  // e is obtained from D after resetting the most significant 1 bit.
  const int32_t e = D - ((uint32_t)1 << *shift);
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  if (*shift > DIV_LUT_BITS)
    f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
  else
    f = e << (DIV_LUT_BITS - *shift);
  assert(f <= DIV_LUT_NUM);
  *shift += DIV_LUT_PREC_BITS;
  // Use f as lookup into the precomputed table of multipliers
  return div_lut[f];
}
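
// Usage sketch (illustrative): to divide a signed value x by a signed
// divisor d without a hardware divider, mirror what av1_get_shear_params()
// does below:
//
//   int16_t shift;
//   const int16_t y = resolve_divisor_32((uint32_t)abs(d), &shift) *
//                     (d < 0 ? -1 : 1);
//   // x / d ~= ROUND_POWER_OF_TWO_SIGNED_64((int64_t)x * y, shift)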

static int is_affine_valid(const WarpedMotionParams *const wm) {
  const int32_t *mat = wm->wmmat;
  return (mat[2] > 0);
}

static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
                                   int16_t delta) {
  if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
      (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
    return 0;
  else
    return 1;
}
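
// Worked example: with WARPEDMODEL_PREC_BITS = 16, the checks above require
// 4 * |alpha| + 7 * |beta| < 2^16 and 4 * |gamma| + 4 * |delta| < 2^16,
// i.e. 4|alpha| + 7|beta| < 1.0 and 4|gamma| + 4|delta| < 1.0 in real
// units, which is exactly the applicability constraint derived in the
// comment above av1_warp_affine_c().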

#ifndef NDEBUG
// Check that the given warp model satisfies the relevant constraints for
// its stated model type
static void check_model_consistency(WarpedMotionParams *wm) {
  switch (wm->wmtype) {
    case IDENTITY:
      assert(wm->wmmat[0] == 0);
      assert(wm->wmmat[1] == 0);
      AOM_FALLTHROUGH_INTENDED;
    case TRANSLATION:
      assert(wm->wmmat[2] == 1 << WARPEDMODEL_PREC_BITS);
      assert(wm->wmmat[3] == 0);
      AOM_FALLTHROUGH_INTENDED;
    case ROTZOOM:
      assert(wm->wmmat[4] == -wm->wmmat[3]);
      assert(wm->wmmat[5] == wm->wmmat[2]);
      AOM_FALLTHROUGH_INTENDED;
    case AFFINE: break;
    default: assert(0 && "Bad wmtype");
  }
}
#endif  // NDEBUG

// Returns 1 on success or 0 on an invalid affine set
int av1_get_shear_params(WarpedMotionParams *wm) {
#ifndef NDEBUG
  // Check that models have been constructed sensibly
  // This is a good place to check, because this function does not need to
  // be called until after model construction is complete, but must be called
  // before the model can be used for prediction.
  check_model_consistency(wm);
#endif  // NDEBUG

  const int32_t *mat = wm->wmmat;
  if (!is_affine_valid(wm)) return 0;

  wm->alpha =
      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
  int16_t shift;
  int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
  int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
  wm->gamma =
      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
  v = ((int64_t)mat[3] * mat[4]) * y;
  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
                        (1 << WARPEDMODEL_PREC_BITS),
                    INT16_MIN, INT16_MAX);

  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
             (1 << WARP_PARAM_REDUCE_BITS);
  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);

  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
    return 0;

  return 1;
}
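
// For reference, av1_get_shear_params() implements, in fixed point, the
// factorization described above av1_warp_affine_c(). In floating point
// (illustrative only), with a = mat[2], b = mat[3], c = mat[4], d = mat[5]
// expressed in units of 2^WARPEDMODEL_PREC_BITS:
//
//   alpha = a - 1
//   beta  = b
//   gamma = c / a
//   delta = d - (b * c) / a - 1
//
// each then rounded to a multiple of 2^WARP_PARAM_REDUCE_BITS.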

#if CONFIG_AV1_HIGHBITDEPTH
/* Note: For an explanation of the warp algorithm, and some notes on bit widths
    for hardware implementations, see the comments above av1_warp_affine_c
*/
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                              int width, int height, int stride, uint16_t *pred,
                              int p_col, int p_row, int p_width, int p_height,
                              int p_stride, int subsampling_x,
                              int subsampling_y, int bd,
                              ConvolveParams *conv_params, int16_t alpha,
                              int16_t beta, int16_t gamma, int16_t delta) {
  int32_t tmp[15 * 8];
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);
        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            const int sample_x = clamp(ix + m, 0, width - 1);
            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint16_t *dst16 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst16 =
                  clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
            } else {
              *p = sum;
            }
          } else {
            uint16_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
          }
          sy += gamma;
        }
      }
    }
  }
}

void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
                       int width, int height, int stride, uint16_t *const pred,
                       int p_col, int p_row, int p_width, int p_height,
                       int p_stride, int subsampling_x, int subsampling_y,
                       int bd, ConvolveParams *conv_params) {
  const int32_t *const mat = wm->wmmat;
  const int16_t alpha = wm->alpha;
  const int16_t beta = wm->beta;
  const int16_t gamma = wm->gamma;
  const int16_t delta = wm->delta;

  av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
                         p_width, p_height, p_stride, subsampling_x,
                         subsampling_y, bd, conv_params, alpha, beta, gamma,
                         delta);
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

/* The warp filter for ROTZOOM and AFFINE models works as follows:
   * Split the input into 8x8 blocks
   * For each block, project the point (4, 4) within the block, to get the
     overall block position. Split into integer and fractional coordinates,
     maintaining full WARPEDMODEL precision
   * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
     variable horizontal offset. This means that, while the rows of the
     intermediate buffer align with the rows of the *reference* image, the
     columns align with the columns of the *destination* image.
   * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
     destination is too small we crop the output at this stage). Each pixel has
     a variable vertical offset, so that the resulting rows are aligned with
     the rows of the destination image.

   To accomplish these alignments, we factor the warp matrix as a
   product of two shear / asymmetric zoom matrices:
   / a b \  = /   1       0    \ * / 1+alpha  beta \
   \ c d /    \ gamma  1+delta /   \    0      1   /
   where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
   The horizontal shear (with alpha and beta) is applied first,
   then the vertical shear (with gamma and delta) is applied second.

   The only limitation is that, to fit this in a fixed 8-tap filter size,
   the fractional pixel offsets must be at most +-1. Since the horizontal filter
   generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
   within the block, the parameters must satisfy
   4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
   for this filter to be applicable.

   Note: This function assumes that the caller has done all of the relevant
   checks, i.e. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
   are set appropriately (if using a ROTZOOM model), and that alpha, beta,
   gamma, delta are all in range.

   TODO(rachelbarker): Maybe support scaled references?
*/
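
/* Concretely (for illustration): for the output sample at offset (l, k)
   from the block center, the horizontal pass filters at fractional position
   sx = sx4 + alpha * l + beta * k and the vertical pass at
   sy = sy4 + gamma * l + delta * k, each rounded by WARPEDDIFF_PREC_BITS
   and used to index av1_warped_filter. With |l| <= 4 and |k| <= 7 in the
   horizontal pass, the worst-case deviation from the block center is
   4 * |alpha| + 7 * |beta|, which is where the constraint above comes from
   (and similarly 4 * |gamma| + 4 * |delta| for the vertical pass). */
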
/* A note on hardware implementation:
    The warp filter is intended to be implementable using the same hardware as
    the high-precision convolve filters from the loop-restoration and
    convolve-round experiments.

    For a single filter stage, considering all of the coefficient sets for the
    warp filter and the regular convolution filter, an input in the range
    [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
    before rounding.

    Allowing for some changes to the filter coefficient sets, call the range
    [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
    we can replace this by the range [0, 256 * 2^k], which can be stored in an
    unsigned value with 8 + k bits.

    This allows the derivation of the appropriate bit widths and offsets for
    the various intermediate values: If

    F := FILTER_BITS = 7 (or else the above ranges need adjusting)
         So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
         intermediate value.
    H := ROUND0_BITS
    V := VERSHEAR_REDUCE_PREC_BITS
    (and note that we must have H + V = 2*F for the output to have the same
     scale as the input)

    then we end up with the following offsets and ranges:
    Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
                       uint{bd + F + 1}
    After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
    Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
                     uint{bd + 2*F + 2 - H}
    After rounding: The final value, before undoing the offset, fits into a
                    uint{bd + 2}.

    Then we need to undo the offsets before clamping to a pixel. Note that,
    if we do this at the end, the amount to subtract is actually independent
    of H and V:

    offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
                         (1 << ((bd + 2*F - H) - V))
                      == (1 << (bd - 1)) + (1 << bd)

    This allows us to entirely avoid clamping in both the warp filter and
    the convolve-round experiment. As of the time of writing, the Wiener filter
    from loop-restoration can encode a central coefficient up to 216, which
    leads to a maximum value of about 282 * 2^k after applying the offset.
    So in that case we still need to clamp.
*/
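/* Worked example (illustrative, assuming the usual H = 3): for 8-bit input
   (bd = 8) with F = 7, H = 3 and V = 11, the horizontal sum gets an offset
   of 1 << 14 and fits in a uint16; after rounding by H, the values in 'tmp'
   fit in 13 bits; the vertical sum gets an offset of 1 << 19 and fits in
   21 bits; and the final offset to subtract is (1 << 7) + (1 << 8) = 384,
   independent of H and V as shown above. */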
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                       int height, int stride, uint8_t *pred, int p_col,
                       int p_row, int p_width, int p_height, int p_stride,
                       int subsampling_x, int subsampling_y,
                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
                       int16_t gamma, int16_t delta) {
  int32_t tmp[15 * 8];
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        // Clamp to top/bottom edge of the frame
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);

        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // At this point, sx = sx4 + alpha * l + beta * k
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            // Clamp to left/right edge of the frame
            const int sample_x = clamp(ix + m, 0, width - 1);

            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          // At this point, sy = sy4 + gamma * l + delta * k
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint8_t *dst8 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
            } else {
              *p = sum;
            }
          } else {
            uint8_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
          }
          sy += gamma;
        }
      }
    }
  }
}

void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
                int height, int stride, uint8_t *pred, int p_col, int p_row,
                int p_width, int p_height, int p_stride, int subsampling_x,
                int subsampling_y, ConvolveParams *conv_params) {
  const int32_t *const mat = wm->wmmat;
  const int16_t alpha = wm->alpha;
  const int16_t beta = wm->beta;
  const int16_t gamma = wm->gamma;
  const int16_t delta = wm->delta;
  av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
                  p_height, p_stride, subsampling_x, subsampling_y, conv_params,
                  alpha, beta, gamma, delta);
}

void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
                    const uint8_t *ref, int width, int height, int stride,
                    uint8_t *pred, int p_col, int p_row, int p_width,
                    int p_height, int p_stride, int subsampling_x,
                    int subsampling_y, ConvolveParams *conv_params) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd)
    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
                      p_height, p_stride, subsampling_x, subsampling_y, bd,
                      conv_params);
  else
    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
#else
  (void)use_hbd;
  (void)bd;
  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
#endif
}

#define LS_MV_MAX 256  // max mv in 1/8-pel
// Use LS_STEP = 8 so that 2 fewer bits are needed for A, Bx, By.
#define LS_STEP 8

// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
// the precision needed is:
//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
//   1 [for sign] +
//   LEAST_SQUARES_SAMPLES_MAX_BITS
//        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
// This sums to 25 bits; the values are then brought down to LS_MAT_BITS (23)
// bits by the LS_MAT_DOWN_BITS (2) reduction below.
#define LS_MAT_RANGE_BITS \
  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)

// Bit-depth reduction from the full range
#define LS_MAT_DOWN_BITS 2

// Bit range of A, Bx and By after downshifting
#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)

// By setting LS_STEP = 8, the least significant 2 bits of every element in
// A, Bx and By are 0. So the shifts below drop those 2 bits in addition to
// LS_MAT_DOWN_BITS.
#define LS_SQUARE(a)                                          \
  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT1(a, b)                                           \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT2(a, b)                                               \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))
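
// For reference (illustrative): with s = LS_STEP, the numerators above
// factor as
//   LS_SQUARE:   4*a*a + 4*a*s + 2*s*s     = (2a + s)^2 + s^2
//   LS_PRODUCT1: 4*a*b + 2*(a+b)*s + s*s   = (2a + s) * (2b + s)
//   LS_PRODUCT2: 4*a*b + 2*(a+b)*s + 2*s*s = (2a + s) * (2b + s) + s^2
// With s = 8 every term is divisible by 4, which is why the low 2 bits can
// be shifted out losslessly before the LS_MAT_DOWN_BITS reduction.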

#define USE_LIMITED_PREC_MULT 0

#if USE_LIMITED_PREC_MULT

#define MUL_PREC_BITS 16
static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
  int msb = 0;
  uint16_t mult = 0;
  *shift = 0;
  if (D != 0) {
    msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                              : get_msb((unsigned int)D));
    if (msb >= MUL_PREC_BITS) {
      mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
      *shift = msb + 1 - MUL_PREC_BITS;
    } else {
      mult = (uint16_t)D;
      *shift = 0;
    }
  }
  return mult;
}

static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
  int16_t mshift;
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
  shift -= mshift;
  if (shift > 0) {
    return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  } else {
    return (int32_t)clamp(v * (1 << (-shift)),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  }
}

static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  int16_t mshift;
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
  shift -= mshift;
  if (shift > 0) {
    return (int32_t)clamp(
        ROUND_POWER_OF_TWO_SIGNED(v, shift),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  } else {
    return (int32_t)clamp(
        v * (1 << (-shift)),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  }
}

#else

static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
  int64_t v = Px * (int64_t)iDet;
  return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
}

static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  int64_t v = Px * (int64_t)iDet;
  return (int32_t)clamp64(
      ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
}
#endif  // USE_LIMITED_PREC_MULT

static int find_affine_int(int np, const int *pts1, const int *pts2,
                           BLOCK_SIZE bsize, int mvy, int mvx,
                           WarpedMotionParams *wm, int mi_row, int mi_col) {
  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
  int32_t Bx[2] = { 0, 0 };
  int32_t By[2] = { 0, 0 };

  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  const int rsuy = bh / 2 - 1;
  const int rsux = bw / 2 - 1;
  const int suy = rsuy * 8;
  const int sux = rsux * 8;
  const int duy = suy + mvy;
  const int dux = sux + mvx;

  // Assume the center pixel of the block has exactly the same motion vector
  // as transmitted for the block. First shift the origin of the source
  // points to the block center, and the origin of the destination points to
  // the block center added to the motion vector transmitted.
  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  // points after origin shifting, for i = 0, 1, 2, ..., n-1.
  // Then if  P = [x0, y0,
  //               x1, y1,
  //               x2, y2,
  //                ....
  //              ]
  //          q = [x0', x1', x2', ... ]'
  //          r = [y0', y1', y2', ... ]'
  // the least squares problems that need to be solved are:
  //          [h1, h2]' = inv(P'P)P'q and
  //          [h3, h4]' = inv(P'P)P'r
  // where the affine transformation is given by:
  //          x' = h1.x + h2.y
  //          y' = h3.x + h4.y
  //
  // The loop below computes: A = P'P, Bx = P'q, By = P'r
  // We then only need to compute inv(A).Bx and inv(A).By for the solutions.
  // Contribution from neighboring blocks
  for (int i = 0; i < np; i++) {
    const int dx = pts2[i * 2] - dux;
    const int dy = pts2[i * 2 + 1] - duy;
    const int sx = pts1[i * 2] - sux;
    const int sy = pts1[i * 2 + 1] - suy;
    // TODO(yunqing): This comparison wouldn't be necessary if the sample
    // selection is done in find_samples(). Also, the global offset can be
    // removed while collecting samples.
    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
      A[0][0] += LS_SQUARE(sx);
      A[0][1] += LS_PRODUCT1(sx, sy);
      A[1][1] += LS_SQUARE(sy);
      Bx[0] += LS_PRODUCT2(sx, dx);
      Bx[1] += LS_PRODUCT1(sy, dx);
      By[0] += LS_PRODUCT1(sx, dy);
      By[1] += LS_PRODUCT2(sy, dy);
    }
  }

  // Just for debugging, and can be removed later.
  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);

  // Compute the determinant of A
  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  if (Det == 0) return 1;

  int16_t shift;
  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  shift -= WARPEDMODEL_PREC_BITS;
  if (shift < 0) {
    iDet <<= (-shift);
    shift = 0;
  }

  int64_t Px[2], Py[2];
  // These, divided by Det, are the least squares solutions
  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];

  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);

  const int isuy = (mi_row * MI_SIZE + rsuy);
  const int isux = (mi_col * MI_SIZE + rsux);
  // Note: In the vx, vy expressions below, the max value of each of the
  // 2nd and 3rd terms is (2^16 - 1) * (2^13 - 1). That leaves enough room
  // for the first term so that the overall sum in the worst case fits
  // within 32 bits overall.
  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
                      isuy * wm->wmmat[3]);
  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * wm->wmmat[4] +
                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  wm->wmmat[0] =
      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  wm->wmmat[1] =
      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  return 0;
}

int av1_find_projection(int np, const int *pts1, const int *pts2,
                        BLOCK_SIZE bsize, int mvy, int mvx,
                        WarpedMotionParams *wm_params, int mi_row, int mi_col) {
  assert(wm_params->wmtype == AFFINE);

  if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
                      mi_col))
    return 1;

  // Check compatibility with the fast warp filter
  if (!av1_get_shear_params(wm_params)) return 1;

  return 0;
}
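
/* Typical call flow (an illustrative sketch, not code from this file):
   build an AFFINE model from motion-vector samples, validate it, then warp.
   'default_warp_params' is the library's identity model.

     WarpedMotionParams wm = default_warp_params;
     wm.wmtype = AFFINE;
     if (!av1_find_projection(np, pts1, pts2, bsize, mvy, mvx, &wm,
                              mi_row, mi_col)) {
       // Model is valid and shear-compatible; warp one plane.
       av1_warp_plane(&wm, use_hbd, bd, ref, width, height, stride, pred,
                      p_col, p_row, p_width, p_height, p_stride,
                      subsampling_x, subsampling_y, conv_params);
     }
*/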