/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <math.h>
#include <assert.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"
#include "av1/common/scale.h"

// For warping, we really use a 6-tap filter, but we process blocks of 8
// pixels at a time. The zoom/rotation/shear in the model are applied to the
// "fractional" position of each pixel, which therefore varies within
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
// We need an extra 2 taps to fit this in, for a total of 8 taps.
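// Illustrative note (added; not part of the original comment): a horizontal
// offset sx at WARPEDMODEL precision selects the coefficient row
//   ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) + WARPEDPIXEL_PREC_SHIFTS,
// so with WARPEDPIXEL_PREC_BITS == 6 the 193 rows cover fractional positions
// [-1, 2) in 1/64-pel steps plus one dummy row, and sx == 0 maps to row 64,
// the zero-phase filter { 0, 0, 0, 127, 1, 0, 0, 0 }.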
/* clang-format off */
const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
#if WARPEDPIXEL_PREC_BITS == 6
  // [-1, 0)
  { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
  { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
  { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
  { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
  { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
  { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
  { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
  { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
  { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
  { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
  { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
  { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
  { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
  { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
  { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
  { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
  { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
  { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
  { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
  { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
  { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
  { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
  { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
  { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
  { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
  { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
  { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
  { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
  { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
  { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
  { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
  { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },

  // [0, 1)
  { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
  { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
  { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
  {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
  {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
  {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
  {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
  {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
  {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
  {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
  {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
  {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
  {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
  {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
  {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
  {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
  {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
  {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
  {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
  {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
  {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
  {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
  {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
  {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
  {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
  {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
  {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
  {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
  {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
  {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
  { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
  { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},

  // [1, 2)
  { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
  { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
  { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
  { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
  { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
  { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
  { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
  { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
  { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
  { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
  { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
  { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
  { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
  { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
  { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
  { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
  { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
  { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
  { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
  { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
  { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
  { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
  { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
  { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
  { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
  { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
  { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
  { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
  { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
  { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
  { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
  { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
  // dummy (replicate row index 191)
  { 0, 0, 0,   0,   2, 127, - 1, 0 },

#elif WARPEDPIXEL_PREC_BITS == 5
  // [-1, 0)
  {0,   0, 127,   1,   0, 0, 0, 0}, {1,  -3, 127,   4,  -1, 0, 0, 0},
  {1,  -5, 126,   8,  -3, 1, 0, 0}, {1,  -7, 124,  13,  -4, 1, 0, 0},
  {2,  -9, 122,  18,  -6, 1, 0, 0}, {2, -11, 120,  22,  -7, 2, 0, 0},
  {3, -13, 117,  27,  -8, 2, 0, 0}, {3, -14, 114,  32, -10, 3, 0, 0},
  {3, -15, 111,  37, -11, 3, 0, 0}, {3, -16, 108,  42, -12, 3, 0, 0},
  {4, -17, 104,  47, -13, 3, 0, 0}, {4, -17, 100,  52, -14, 3, 0, 0},
  {4, -18,  96,  58, -15, 3, 0, 0}, {4, -18,  91,  63, -16, 4, 0, 0},
  {4, -18,  87,  68, -17, 4, 0, 0}, {4, -18,  82,  73, -17, 4, 0, 0},
  {4, -18,  78,  78, -18, 4, 0, 0}, {4, -17,  73,  82, -18, 4, 0, 0},
  {4, -17,  68,  87, -18, 4, 0, 0}, {4, -16,  63,  91, -18, 4, 0, 0},
  {3, -15,  58,  96, -18, 4, 0, 0}, {3, -14,  52, 100, -17, 4, 0, 0},
  {3, -13,  47, 104, -17, 4, 0, 0}, {3, -12,  42, 108, -16, 3, 0, 0},
  {3, -11,  37, 111, -15, 3, 0, 0}, {3, -10,  32, 114, -14, 3, 0, 0},
  {2,  -8,  27, 117, -13, 3, 0, 0}, {2,  -7,  22, 120, -11, 2, 0, 0},
  {1,  -6,  18, 122,  -9, 2, 0, 0}, {1,  -4,  13, 124,  -7, 1, 0, 0},
  {1,  -3,   8, 126,  -5, 1, 0, 0}, {0,  -1,   4, 127,  -3, 1, 0, 0},
  // [0, 1)
  { 0,  0,   0, 127,   1,   0,   0,  0}, { 0,  1,  -3, 127,   4,  -2,   1,  0},
  { 0,  2,  -6, 126,   8,  -3,   1,  0}, {-1,  3,  -8, 125,  13,  -5,   2, -1},
  {-1,  4, -11, 123,  18,  -7,   3, -1}, {-1,  4, -13, 121,  23,  -8,   3, -1},
  {-1,  5, -15, 119,  27, -10,   4, -1}, {-2,  6, -17, 116,  33, -12,   5, -1},
  {-2,  6, -18, 113,  38, -13,   5, -1}, {-2,  7, -19, 110,  43, -15,   6, -2},
  {-2,  7, -20, 106,  49, -16,   6, -2}, {-2,  7, -21, 102,  54, -17,   7, -2},
  {-2,  8, -22,  98,  59, -18,   7, -2}, {-2,  8, -22,  94,  64, -19,   7, -2},
  {-2,  8, -22,  89,  69, -20,   8, -2}, {-2,  8, -21,  84,  74, -21,   8, -2},
  {-2,  8, -21,  79,  79, -21,   8, -2}, {-2,  8, -21,  74,  84, -21,   8, -2},
  {-2,  8, -20,  69,  89, -22,   8, -2}, {-2,  7, -19,  64,  94, -22,   8, -2},
  {-2,  7, -18,  59,  98, -22,   8, -2}, {-2,  7, -17,  54, 102, -21,   7, -2},
  {-2,  6, -16,  49, 106, -20,   7, -2}, {-2,  6, -15,  43, 110, -19,   7, -2},
  {-1,  5, -13,  38, 113, -18,   6, -2}, {-1,  5, -12,  33, 116, -17,   6, -2},
  {-1,  4, -10,  27, 119, -15,   5, -1}, {-1,  3,  -8,  23, 121, -13,   4, -1},
  {-1,  3,  -7,  18, 123, -11,   4, -1}, {-1,  2,  -5,  13, 125,  -8,   3, -1},
  { 0,  1,  -3,   8, 126,  -6,   2,  0}, { 0,  1,  -2,   4, 127,  -3,   1,  0},
  // [1, 2)
  {0, 0, 0,   1, 127,   0,   0, 0}, {0, 0, 1,  -3, 127,   4,  -1, 0},
  {0, 0, 1,  -5, 126,   8,  -3, 1}, {0, 0, 1,  -7, 124,  13,  -4, 1},
  {0, 0, 2,  -9, 122,  18,  -6, 1}, {0, 0, 2, -11, 120,  22,  -7, 2},
  {0, 0, 3, -13, 117,  27,  -8, 2}, {0, 0, 3, -14, 114,  32, -10, 3},
  {0, 0, 3, -15, 111,  37, -11, 3}, {0, 0, 3, -16, 108,  42, -12, 3},
  {0, 0, 4, -17, 104,  47, -13, 3}, {0, 0, 4, -17, 100,  52, -14, 3},
  {0, 0, 4, -18,  96,  58, -15, 3}, {0, 0, 4, -18,  91,  63, -16, 4},
  {0, 0, 4, -18,  87,  68, -17, 4}, {0, 0, 4, -18,  82,  73, -17, 4},
  {0, 0, 4, -18,  78,  78, -18, 4}, {0, 0, 4, -17,  73,  82, -18, 4},
  {0, 0, 4, -17,  68,  87, -18, 4}, {0, 0, 4, -16,  63,  91, -18, 4},
  {0, 0, 3, -15,  58,  96, -18, 4}, {0, 0, 3, -14,  52, 100, -17, 4},
  {0, 0, 3, -13,  47, 104, -17, 4}, {0, 0, 3, -12,  42, 108, -16, 3},
  {0, 0, 3, -11,  37, 111, -15, 3}, {0, 0, 3, -10,  32, 114, -14, 3},
  {0, 0, 2,  -8,  27, 117, -13, 3}, {0, 0, 2,  -7,  22, 120, -11, 2},
  {0, 0, 1,  -6,  18, 122,  -9, 2}, {0, 0, 1,  -4,  13, 124,  -7, 1},
  {0, 0, 1,  -3,   8, 126,  -5, 1}, {0, 0, 0,  -1,   4, 127,  -3, 1},
  // dummy (replicate row index 95)
  {0, 0, 0,  -1,   4, 127,  -3, 1},

#endif  // WARPEDPIXEL_PREC_BITS == 6
};

/* clang-format on */

#define DIV_LUT_PREC_BITS 14
#define DIV_LUT_BITS 8
#define DIV_LUT_NUM (1 << DIV_LUT_BITS)

static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
  9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
  9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
  9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
  9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
  9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
  8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
  8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
  8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
  8240,  8224,  8208,  8192,
};
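
// Illustrative sketch (added; not used at runtime): each entry above is the
// rounded fixed-point reciprocal
//   div_lut[f] = round(2^(DIV_LUT_PREC_BITS + DIV_LUT_BITS) /
//                      (2^DIV_LUT_BITS + f)),
// so the table could be regenerated with:
//   for (int f = 0; f <= DIV_LUT_NUM; ++f) {
//     const uint32_t den = (1u << DIV_LUT_BITS) + f;
//     lut[f] = (uint16_t)(((1u << (DIV_LUT_PREC_BITS + DIV_LUT_BITS)) +
//                          den / 2) / den);
//   }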

// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
// at precision of DIV_LUT_PREC_BITS along with the shift.
static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
  int64_t f;
  *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                               : get_msb((unsigned int)D));
  // e is obtained from D after resetting the most significant 1 bit.
  const int64_t e = D - ((uint64_t)1 << *shift);
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  if (*shift > DIV_LUT_BITS)
    f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
  else
    f = e << (DIV_LUT_BITS - *shift);
  assert(f <= DIV_LUT_NUM);
  *shift += DIV_LUT_PREC_BITS;
  // Use f as lookup into the precomputed table of multipliers
  return div_lut[f];
}

static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
  int32_t f;
  *shift = get_msb(D);
  // e is obtained from D after resetting the most significant 1 bit.
  const int32_t e = D - ((uint32_t)1 << *shift);
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  if (*shift > DIV_LUT_BITS)
    f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
  else
    f = e << (DIV_LUT_BITS - *shift);
  assert(f <= DIV_LUT_NUM);
  *shift += DIV_LUT_PREC_BITS;
  // Use f as lookup into the precomputed table of multipliers
  return div_lut[f];
}
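
// Worked example (added for clarity): for D = 3, get_msb(3) = 1, so
// e = 3 - 2 = 1 and f = 1 << (DIV_LUT_BITS - 1) = 128. div_lut[128] = 10923
// and the returned shift is 1 + DIV_LUT_PREC_BITS = 15, giving
// 1/3 ~= 10923 / 2^15 = 0.33334.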

static int is_affine_valid(const WarpedMotionParams *const wm) {
  const int32_t *mat = wm->wmmat;
  return (mat[2] > 0);
}

static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
                                   int16_t delta) {
  if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
      (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
    return 0;
  else
    return 1;
}

// Returns 1 on success or 0 on an invalid affine set
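// Derivation note (added for clarity): with u = (1 << WARPEDMODEL_PREC_BITS)
// and a = mat[2], b = mat[3], c = mat[4], d = mat[5], the shear factorization
// documented above av1_warp_affine_c gives, in units of u,
//   alpha = a - u, beta = b, gamma = u * c / a, delta = d - b * c / a - u.
// The divisions are carried out in fixed point via resolve_divisor_32, and
// each parameter is then rounded to a multiple of
// (1 << WARP_PARAM_REDUCE_BITS).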
int av1_get_shear_params(WarpedMotionParams *wm) {
  const int32_t *mat = wm->wmmat;
  if (!is_affine_valid(wm)) return 0;
  wm->alpha =
      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
  int16_t shift;
  int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
  int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
  wm->gamma =
      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
  v = ((int64_t)mat[3] * mat[4]) * y;
  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
                        (1 << WARPEDMODEL_PREC_BITS),
                    INT16_MIN, INT16_MAX);

  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
             (1 << WARP_PARAM_REDUCE_BITS);
  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);

  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
    return 0;

  return 1;
}

#if CONFIG_AV1_HIGHBITDEPTH
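// Note (added for clarity): for bd > 8 the absolute error is split into a
// high part e1, in 8-bit units, and a low b-bit remainder e2. The result
// linearly interpolates between two adjacent error_measure_lut entries,
// using e2 as the interpolation weight.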
static INLINE int highbd_error_measure(int err, int bd) {
  const int b = bd - 8;
  const int bmask = (1 << b) - 1;
  const int v = (1 << b);
  err = abs(err);
  const int e1 = err >> b;
  const int e2 = err & bmask;
  return error_measure_lut[255 + e1] * (v - e2) +
         error_measure_lut[256 + e1] * e2;
}

/* Note: For an explanation of the warp algorithm, and some notes on bit widths
    for hardware implementations, see the comments above av1_warp_affine_c
*/
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                              int width, int height, int stride, uint16_t *pred,
                              int p_col, int p_row, int p_width, int p_height,
                              int p_stride, int subsampling_x,
                              int subsampling_y, int bd,
                              ConvolveParams *conv_params, int16_t alpha,
                              int16_t beta, int16_t gamma, int16_t delta) {
  int32_t tmp[15 * 8];
  const int reduce_bits_horiz =
      conv_params->round_0 +
      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);
        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            const int sample_x = clamp(ix + m, 0, width - 1);
            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint16_t *dst16 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst16 =
                  clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
            } else {
              *p = sum;
            }
          } else {
            uint16_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
          }
          sy += gamma;
        }
      }
    }
  }
}

void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
                       int width, int height, int stride, uint16_t *const pred,
                       int p_col, int p_row, int p_width, int p_height,
                       int p_stride, int subsampling_x, int subsampling_y,
                       int bd, ConvolveParams *conv_params) {
  assert(wm->wmtype <= AFFINE);
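  // Note (added for clarity): ROTZOOM is a 4-parameter subset of AFFINE, so
  // complete the 2x2 matrix as [[a, b], [-b, a]] and reuse the general
  // affine path below.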
  if (wm->wmtype == ROTZOOM) {
    wm->wmmat[5] = wm->wmmat[2];
    wm->wmmat[4] = -wm->wmmat[3];
  }
  const int32_t *const mat = wm->wmmat;
  const int16_t alpha = wm->alpha;
  const int16_t beta = wm->beta;
  const int16_t gamma = wm->gamma;
  const int16_t delta = wm->delta;

  av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
                         p_width, p_height, p_stride, subsampling_x,
                         subsampling_y, bd, conv_params, alpha, beta, gamma,
                         delta);
}

int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
                                    const uint16_t *const dst, int p_width,
                                    int p_height, int p_stride, int bd) {
  int64_t sum_error = 0;
  for (int i = 0; i < p_height; ++i) {
    for (int j = 0; j < p_width; ++j) {
      sum_error +=
          highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd);
    }
  }
  return sum_error;
}

static int64_t highbd_segmented_frame_error(
    const uint16_t *const ref, int stride, const uint16_t *const dst,
    int p_width, int p_height, int p_stride, int bd, uint8_t *segment_map,
    int segment_map_stride) {
  int patch_w, patch_h;
  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
  int64_t sum_error = 0;
  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
      // Only compute the error if this block contains inliers from the motion
      // model
      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;

      // avoid computing error into the frame padding
      patch_w = AOMMIN(error_bsize_w, p_width - j);
      patch_h = AOMMIN(error_bsize_h, p_height - i);
      sum_error += av1_calc_highbd_frame_error(ref + j + i * stride, stride,
                                               dst + j + i * p_stride, patch_w,
                                               patch_h, p_stride, bd);
    }
  }
  return sum_error;
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

/* The warp filter for ROTZOOM and AFFINE models works as follows:
   * Split the input into 8x8 blocks
   * For each block, project the point (4, 4) within the block, to get the
     overall block position. Split into integer and fractional coordinates,
     maintaining full WARPEDMODEL precision
   * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
     variable horizontal offset. This means that, while the rows of the
     intermediate buffer align with the rows of the *reference* image, the
     columns align with the columns of the *destination* image.
   * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
     destination is too small we crop the output at this stage). Each pixel has
     a variable vertical offset, so that the resulting rows are aligned with
     the rows of the destination image.

   To accomplish these alignments, we factor the warp matrix as a
   product of two shear / asymmetric zoom matrices:
   / a b \  = /   1       0    \ * / 1+alpha  beta \
   \ c d /    \ gamma  1+delta /   \    0      1   /
   where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
   The horizontal shear (with alpha and beta) is applied first,
   then the vertical shear (with gamma and delta) is applied second.

   The only limitation is that, to fit this in a fixed 8-tap filter size,
   the fractional pixel offsets must be at most +-1. Since the horizontal filter
   generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
   within the block, the parameters must satisfy
   4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
   for this filter to be applicable.

   Note: This function assumes that the caller has done all of the relevant
   checks, i.e. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
   are set appropriately (if using a ROTZOOM model), and that alpha, beta,
   gamma, delta are all in range.

   TODO(rachelbarker): Maybe support scaled references?
*/
/* A note on hardware implementation:
    The warp filter is intended to be implementable using the same hardware as
    the high-precision convolve filters from the loop-restoration and
    convolve-round experiments.

    For a single filter stage, considering all of the coefficient sets for the
    warp filter and the regular convolution filter, an input in the range
    [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
    before rounding.

    Allowing for some changes to the filter coefficient sets, call the range
    [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
    we can replace this by the range [0, 256 * 2^k], which can be stored in an
    unsigned value with 8 + k bits.

    This allows the derivation of the appropriate bit widths and offsets for
    the various intermediate values: If

    F := FILTER_BITS = 7 (or else the above ranges need adjusting)
         So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
         intermediate value.
    H := ROUND0_BITS
    V := VERSHEAR_REDUCE_PREC_BITS
    (and note that we must have H + V = 2*F for the output to have the same
     scale as the input)

    then we end up with the following offsets and ranges:
    Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
                       uint{bd + F + 1}
    After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
    Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
                     uint{bd + 2*F + 2 - H}
    After rounding: The final value, before undoing the offset, fits into a
                    uint{bd + 2}.

    Then we need to undo the offsets before clamping to a pixel. Note that,
    if we do this at the end, the amount to subtract is actually independent
    of H and V:

    offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
                         (1 << ((bd + 2*F - H) - V))
                      == (1 << (bd - 1)) + (1 << bd)

    This allows us to entirely avoid clamping in both the warp filter and
    the convolve-round experiment. As of the time of writing, the Wiener filter
    from loop-restoration can encode a central coefficient up to 216, which
    leads to a maximum value of about 282 * 2^k after applying the offset.
    So in that case we still need to clamp.
*/
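/* Worked instantiation (added; assumes the usual ROUND0_BITS == 3): with
   bd = 8, F = 7, H = 3 and V = 11 (so H + V = 2*F), horizontal sums fit in
   a uint16 (bd + F + 1 = 16 bits), the values in 'tmp' fit in 13 bits, the
   vertical sums fit in 21 bits (bd + 2*F + 2 - H), and the offset subtracted
   at the end is (1 << 7) + (1 << 8) = 384.
*/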
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                       int height, int stride, uint8_t *pred, int p_col,
                       int p_row, int p_width, int p_height, int p_stride,
                       int subsampling_x, int subsampling_y,
                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
                       int16_t gamma, int16_t delta) {
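  // Note (added for clarity): the horizontal pass produces 15 rows because
  // the 8-tap vertical filter needs 7 extra rows around the up-to-8 output
  // rows.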
  int32_t tmp[15 * 8];
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        // Clamp to top/bottom edge of the frame
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);

        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // At this point, sx = sx4 + alpha * l + beta * k
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            // Clamp to left/right edge of the frame
            const int sample_x = clamp(ix + m, 0, width - 1);

            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          // At this point, sy = sy4 + gamma * l + delta * k
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint8_t *dst8 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
            } else {
              *p = sum;
            }
          } else {
            uint8_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
          }
          sy += gamma;
        }
      }
    }
  }
}

void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
                int height, int stride, uint8_t *pred, int p_col, int p_row,
                int p_width, int p_height, int p_stride, int subsampling_x,
                int subsampling_y, ConvolveParams *conv_params) {
  assert(wm->wmtype <= AFFINE);
  if (wm->wmtype == ROTZOOM) {
    wm->wmmat[5] = wm->wmmat[2];
    wm->wmmat[4] = -wm->wmmat[3];
  }
  const int32_t *const mat = wm->wmmat;
  const int16_t alpha = wm->alpha;
  const int16_t beta = wm->beta;
  const int16_t gamma = wm->gamma;
  const int16_t delta = wm->delta;
  av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
                  p_height, p_stride, subsampling_x, subsampling_y, conv_params,
                  alpha, beta, gamma, delta);
}

int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride,
                               const uint8_t *const dst, int p_width,
                               int p_height, int p_stride) {
  int64_t sum_error = 0;
  for (int i = 0; i < p_height; ++i) {
    for (int j = 0; j < p_width; ++j) {
      sum_error +=
          (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]);
    }
  }
  return sum_error;
}

static int64_t segmented_frame_error(const uint8_t *const ref, int stride,
                                     const uint8_t *const dst, int p_width,
                                     int p_height, int p_stride,
                                     uint8_t *segment_map,
                                     int segment_map_stride) {
  int patch_w, patch_h;
  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
  int64_t sum_error = 0;
  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
      // Only compute the error if this block contains inliers from the motion
      // model
      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;

      // avoid computing error into the frame padding
      patch_w = AOMMIN(error_bsize_w, p_width - j);
      patch_h = AOMMIN(error_bsize_h, p_height - i);
      sum_error += av1_calc_frame_error(ref + j + i * stride, stride,
                                        dst + j + i * p_stride, patch_w,
                                        patch_h, p_stride);
    }
  }
  return sum_error;
}

int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
                        uint8_t *dst, int p_width, int p_height, int p_stride) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd) {
    return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
                                       CONVERT_TO_SHORTPTR(dst), p_width,
                                       p_height, p_stride, bd);
  }
#endif
  (void)use_hbd;
  (void)bd;
  return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
}

int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
                                  int stride, uint8_t *dst, int p_width,
                                  int p_height, int p_stride,
                                  uint8_t *segment_map,
                                  int segment_map_stride) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd) {
    return highbd_segmented_frame_error(
        CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width,
        p_height, p_stride, bd, segment_map, segment_map_stride);
  }
#endif
  (void)use_hbd;
  (void)bd;
  return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride,
                               segment_map, segment_map_stride);
}

void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
                    const uint8_t *ref, int width, int height, int stride,
                    uint8_t *pred, int p_col, int p_row, int p_width,
                    int p_height, int p_stride, int subsampling_x,
                    int subsampling_y, ConvolveParams *conv_params) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd)
    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
                      p_height, p_stride, subsampling_x, subsampling_y, bd,
                      conv_params);
  else
    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
#else
  (void)use_hbd;
  (void)bd;
  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
#endif
}

#define LS_MV_MAX 256  // max mv in 1/8-pel
// Use LS_STEP = 8 so that 2 fewer bits are needed for A, Bx, By.
#define LS_STEP 8

// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
// the precision needed is:
//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
//   1 [for sign] +
//   LEAST_SQUARES_SAMPLES_MAX_BITS
//        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
// The value is 25 with MAX_SB_SIZE_LOG2 == 7 and
// LEAST_SQUARES_SAMPLES_MAX_BITS == 3.
#define LS_MAT_RANGE_BITS \
  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)

// Bit-depth reduction from the full-range
#define LS_MAT_DOWN_BITS 2

// bits range of A, Bx and By after downshifting
#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)

// By setting LS_STEP = 8, the least significant 2 bits of every element in
// A, Bx, By are 0, so LS_MAT_RANGE_BITS can be reduced by
// LS_MAT_DOWN_BITS (2) bits here.
#define LS_SQUARE(a)                                          \
  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT1(a, b)                                           \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT2(a, b)                                               \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))
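
// Algebraic note (added for clarity): with LS_STEP == 8 and
// LS_MAT_DOWN_BITS == 2, these macros reduce to
//   LS_SQUARE(a)      == (((a) + 4) * ((a) + 4) + 16) >> 2
//   LS_PRODUCT1(a, b) == (((a) + 4) * ((b) + 4)) >> 2
//   LS_PRODUCT2(a, b) == (((a) + 4) * ((b) + 4) + 16) >> 2
// i.e. each sample is offset by LS_STEP / 2 before the product is formed.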

#define USE_LIMITED_PREC_MULT 0

#if USE_LIMITED_PREC_MULT

#define MUL_PREC_BITS 16
static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
  int msb = 0;
  uint16_t mult = 0;
  *shift = 0;
  if (D != 0) {
    msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                              : get_msb((unsigned int)D));
    if (msb >= MUL_PREC_BITS) {
      mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
      *shift = msb + 1 - MUL_PREC_BITS;
    } else {
      mult = (uint16_t)D;
      *shift = 0;
    }
  }
  return mult;
}

static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
  int16_t mshift;
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
  shift -= mshift;
  if (shift > 0) {
    return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  } else {
    return (int32_t)clamp(v * (1 << (-shift)),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  }
}

static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  int16_t mshift;
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
  shift -= mshift;
  if (shift > 0) {
    return (int32_t)clamp(
        ROUND_POWER_OF_TWO_SIGNED(v, shift),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  } else {
    return (int32_t)clamp(
        v * (1 << (-shift)),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  }
}

#else

static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
  int64_t v = Px * (int64_t)iDet;
  return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
}

static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  int64_t v = Px * (int64_t)iDet;
  return (int32_t)clamp64(
      ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
}
#endif  // USE_LIMITED_PREC_MULT

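// Returns 0 on success, or 1 if the normal-equation matrix is singular
// (Det == 0).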
static int find_affine_int(int np, const int *pts1, const int *pts2,
                           BLOCK_SIZE bsize, int mvy, int mvx,
                           WarpedMotionParams *wm, int mi_row, int mi_col) {
  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
  int32_t Bx[2] = { 0, 0 };
  int32_t By[2] = { 0, 0 };

  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  const int rsuy = bh / 2 - 1;
  const int rsux = bw / 2 - 1;
  const int suy = rsuy * 8;
  const int sux = rsux * 8;
  const int duy = suy + mvy;
  const int dux = sux + mvx;

  // Assume the center pixel of the block has exactly the same motion vector
  // as transmitted for the block. First shift the origin of the source
  // points to the block center, and the origin of the destination points to
  // the block center added to the motion vector transmitted.
  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  // points after origin shifting, for i = 0, 1, 2, ..., n-1.
  // Then if  P = [x0, y0,
  //               x1, y1,
  //               x2, y2,
  //                ....
  //              ]
  //          q = [x0', x1', x2', ... ]'
  //          r = [y0', y1', y2', ... ]'
  // the least squares problems that need to be solved are:
  //          [h1, h2]' = inv(P'P)P'q and
  //          [h3, h4]' = inv(P'P)P'r
  // where the affine transformation is given by:
  //          x' = h1.x + h2.y
  //          y' = h3.x + h4.y
  //
  // The loop below computes: A = P'P, Bx = P'q, By = P'r
  // We then just need to compute inv(A).Bx and inv(A).By for the solutions.
  // Contribution from the neighboring blocks' samples:
  for (int i = 0; i < np; i++) {
    const int dx = pts2[i * 2] - dux;
    const int dy = pts2[i * 2 + 1] - duy;
    const int sx = pts1[i * 2] - sux;
    const int sy = pts1[i * 2 + 1] - suy;
    // TODO(yunqing): This comparison wouldn't be necessary if the sample
    // selection is done in find_samples(). Also, the global offset can be
    // removed while collecting samples.
    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
      A[0][0] += LS_SQUARE(sx);
      A[0][1] += LS_PRODUCT1(sx, sy);
      A[1][1] += LS_SQUARE(sy);
      Bx[0] += LS_PRODUCT2(sx, dx);
      Bx[1] += LS_PRODUCT1(sy, dx);
      By[0] += LS_PRODUCT1(sx, dy);
      By[1] += LS_PRODUCT2(sy, dy);
    }
  }

  // Just for debugging, and can be removed later.
  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);

  // Compute the determinant of A
  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  if (Det == 0) return 1;

  int16_t shift;
  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  shift -= WARPEDMODEL_PREC_BITS;
  if (shift < 0) {
    iDet <<= (-shift);
    shift = 0;
  }

  int64_t Px[2], Py[2];
  // These, divided by the Det, are the least squares solutions
  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];

  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);

  const int isuy = (mi_row * MI_SIZE + rsuy);
  const int isux = (mi_col * MI_SIZE + rsux);
  // Note: In the vx, vy expressions below, the max value of each of the
  // 2nd and 3rd terms is (2^16 - 1) * (2^13 - 1). That leaves enough room
  // for the first term, so that the overall sum in the worst case fits
  // within 32 bits.
  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
                      isuy * wm->wmmat[3]);
  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * wm->wmmat[4] +
                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  wm->wmmat[0] =
      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  wm->wmmat[1] =
      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  return 0;
}

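// Returns 1 on failure, 0 on success. Failure means either the least-squares
// system was degenerate, or the fitted parameters cannot be handled by the
// shear-based warp filter (see av1_get_shear_params).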
int av1_find_projection(int np, const int *pts1, const int *pts2,
                        BLOCK_SIZE bsize, int mvy, int mvx,
                        WarpedMotionParams *wm_params, int mi_row, int mi_col) {
  assert(wm_params->wmtype == AFFINE);

  if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
                      mi_col))
    return 1;

  // check compatibility with the fast warp filter
  if (!av1_get_shear_params(wm_params)) return 1;

  return 0;
}