/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <assert.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"
#include "av1/common/scale.h"

// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
// at a time. The zoom/rotation/shear in the model are applied to the
// "fractional" position of each pixel, which therefore varies within
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
// We need an extra 2 taps to fit this in, for a total of 8 taps.
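// With WARPEDPIXEL_PREC_BITS == 6, a fractional position p in [-1, 2) maps
// to row round((p + 1) * 64) of this table: -1 -> row 0, 0 -> row 64 and
// 1 -> row 128. The final row (index 192) replicates row 191 so that a
// position rounding up from just below 2 still lands inside the table.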
/* clang-format off */
const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
#if WARPEDPIXEL_PREC_BITS == 6
  // [-1, 0)
  { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
  { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 },
  { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 },
  { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 },
  { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 },
  { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 },
  { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 },
  { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 },
  { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 },
  { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 },
  { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 },
  { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 },
  { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 },
  { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 },
  { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 },
  { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 },
  { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 },
  { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 },
  { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 },
  { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 },
  { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 },
  { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 },
  { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 },
  { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 },
  { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 },
  { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 },
  { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 },
  { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 },
  { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 },
  { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 },
  { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 },
  { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 },

  // [0, 1)
  { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0},
  { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0},
  { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1},
  {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1},
  {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1},
  {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1},
  {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1},
  {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1},
  {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2},
  {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2},
  {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2},
  {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2},
  {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2},
  {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2},
  {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2},
  {-2, 8, -21, 84, 74, -21, 8, -2}, {-2, 8, -22, 82, 77, -21, 8, -2},
  {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 77, 82, -22, 8, -2},
  {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2},
  {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2},
  {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2},
  {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2},
  {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2},
  {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2},
  {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2},
  {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2},
  {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1},
  {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2},
  {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1},
  {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1},
  {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1},
  { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0},
  { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0},

  // [1, 2)
  { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 },
  { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 },
  { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 },
  { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 },
  { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 },
  { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 },
  { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 },
  { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 },
  { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 },
  { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 },
  { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 },
  { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 },
  { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 },
  { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 },
  { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 },
  { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 },
  { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 },
  { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 },
  { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 },
  { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 },
  { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 },
  { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 },
  { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 },
  { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 },
  { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 },
  { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 },
  { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 },
  { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 },
  { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 },
  { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 },
  { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 },
  { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 },
  // dummy (replicate row index 191)
  { 0, 0, 0, 0, 2, 127, - 1, 0 },

#elif WARPEDPIXEL_PREC_BITS == 5
  // [-1, 0)
  {0, 0, 127, 1, 0, 0, 0, 0}, {1, -3, 127, 4, -1, 0, 0, 0},
  {1, -5, 126, 8, -3, 1, 0, 0}, {1, -7, 124, 13, -4, 1, 0, 0},
  {2, -9, 122, 18, -6, 1, 0, 0}, {2, -11, 120, 22, -7, 2, 0, 0},
  {3, -13, 117, 27, -8, 2, 0, 0}, {3, -14, 114, 32, -10, 3, 0, 0},
  {3, -15, 111, 37, -11, 3, 0, 0}, {3, -16, 108, 42, -12, 3, 0, 0},
  {4, -17, 104, 47, -13, 3, 0, 0}, {4, -17, 100, 52, -14, 3, 0, 0},
  {4, -18, 96, 58, -15, 3, 0, 0}, {4, -18, 91, 63, -16, 4, 0, 0},
  {4, -18, 87, 68, -17, 4, 0, 0}, {4, -18, 82, 73, -17, 4, 0, 0},
  {4, -18, 78, 78, -18, 4, 0, 0}, {4, -17, 73, 82, -18, 4, 0, 0},
  {4, -17, 68, 87, -18, 4, 0, 0}, {4, -16, 63, 91, -18, 4, 0, 0},
  {3, -15, 58, 96, -18, 4, 0, 0}, {3, -14, 52, 100, -17, 4, 0, 0},
  {3, -13, 47, 104, -17, 4, 0, 0}, {3, -12, 42, 108, -16, 3, 0, 0},
  {3, -11, 37, 111, -15, 3, 0, 0}, {3, -10, 32, 114, -14, 3, 0, 0},
  {2, -8, 27, 117, -13, 3, 0, 0}, {2, -7, 22, 120, -11, 2, 0, 0},
  {1, -6, 18, 122, -9, 2, 0, 0}, {1, -4, 13, 124, -7, 1, 0, 0},
  {1, -3, 8, 126, -5, 1, 0, 0}, {0, -1, 4, 127, -3, 1, 0, 0},
  // [0, 1)
  { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 1, -3, 127, 4, -2, 1, 0},
  { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 3, -8, 125, 13, -5, 2, -1},
  {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -13, 121, 23, -8, 3, -1},
  {-1, 5, -15, 119, 27, -10, 4, -1}, {-2, 6, -17, 116, 33, -12, 5, -1},
  {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 110, 43, -15, 6, -2},
  {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 102, 54, -17, 7, -2},
  {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 94, 64, -19, 7, -2},
  {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -21, 84, 74, -21, 8, -2},
  {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 74, 84, -21, 8, -2},
  {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 7, -19, 64, 94, -22, 8, -2},
  {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -17, 54, 102, -21, 7, -2},
  {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 43, 110, -19, 7, -2},
  {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 33, 116, -17, 6, -2},
  {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 3, -8, 23, 121, -13, 4, -1},
  {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 2, -5, 13, 125, -8, 3, -1},
  { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 4, 127, -3, 1, 0},
  // [1, 2)
  {0, 0, 0, 1, 127, 0, 0, 0}, {0, 0, 1, -3, 127, 4, -1, 0},
  {0, 0, 1, -5, 126, 8, -3, 1}, {0, 0, 1, -7, 124, 13, -4, 1},
  {0, 0, 2, -9, 122, 18, -6, 1}, {0, 0, 2, -11, 120, 22, -7, 2},
  {0, 0, 3, -13, 117, 27, -8, 2}, {0, 0, 3, -14, 114, 32, -10, 3},
  {0, 0, 3, -15, 111, 37, -11, 3}, {0, 0, 3, -16, 108, 42, -12, 3},
  {0, 0, 4, -17, 104, 47, -13, 3}, {0, 0, 4, -17, 100, 52, -14, 3},
  {0, 0, 4, -18, 96, 58, -15, 3}, {0, 0, 4, -18, 91, 63, -16, 4},
  {0, 0, 4, -18, 87, 68, -17, 4}, {0, 0, 4, -18, 82, 73, -17, 4},
  {0, 0, 4, -18, 78, 78, -18, 4}, {0, 0, 4, -17, 73, 82, -18, 4},
  {0, 0, 4, -17, 68, 87, -18, 4}, {0, 0, 4, -16, 63, 91, -18, 4},
  {0, 0, 3, -15, 58, 96, -18, 4}, {0, 0, 3, -14, 52, 100, -17, 4},
  {0, 0, 3, -13, 47, 104, -17, 4}, {0, 0, 3, -12, 42, 108, -16, 3},
  {0, 0, 3, -11, 37, 111, -15, 3}, {0, 0, 3, -10, 32, 114, -14, 3},
  {0, 0, 2, -8, 27, 117, -13, 3}, {0, 0, 2, -7, 22, 120, -11, 2},
  {0, 0, 1, -6, 18, 122, -9, 2}, {0, 0, 1, -4, 13, 124, -7, 1},
  {0, 0, 1, -3, 8, 126, -5, 1}, {0, 0, 0, -1, 4, 127, -3, 1},
  // dummy (replicate row index 95)
  {0, 0, 0, -1, 4, 127, -3, 1},

#endif // WARPEDPIXEL_PREC_BITS == 6
};

/* clang-format on */

#define DIV_LUT_PREC_BITS 14
#define DIV_LUT_BITS 8
#define DIV_LUT_NUM (1 << DIV_LUT_BITS)
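
// For reference: each entry below is, up to rounding,
//   div_lut[i] = (1 << (DIV_LUT_PREC_BITS + DIV_LUT_BITS)) / (DIV_LUT_NUM + i)
// i.e. the reciprocal of 1 + i/256 with DIV_LUT_PREC_BITS bits of fractional
// precision (so div_lut[0] == 1 << DIV_LUT_PREC_BITS).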
static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
  9963, 9939, 9916, 9892, 9869, 9846, 9823, 9800, 9777, 9754, 9732,
  9709, 9687, 9664, 9642, 9620, 9598, 9576, 9554, 9533, 9511, 9489,
  9468, 9447, 9425, 9404, 9383, 9362, 9341, 9321, 9300, 9279, 9259,
  9239, 9218, 9198, 9178, 9158, 9138, 9118, 9098, 9079, 9059, 9039,
  9020, 9001, 8981, 8962, 8943, 8924, 8905, 8886, 8867, 8849, 8830,
  8812, 8793, 8775, 8756, 8738, 8720, 8702, 8684, 8666, 8648, 8630,
  8613, 8595, 8577, 8560, 8542, 8525, 8508, 8490, 8473, 8456, 8439,
  8422, 8405, 8389, 8372, 8355, 8339, 8322, 8306, 8289, 8273, 8257,
  8240, 8224, 8208, 8192,
};

// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
// at precision of DIV_LUT_PREC_BITS along with the shift.
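// Worked example (illustrative): D = 1000 has its most significant bit at
// position 9, so e = 1000 - 512 = 488 and, since 9 > DIV_LUT_BITS,
// f = ROUND_POWER_OF_TWO_64(488, 1) = 244. The lookup returns
// div_lut[244] = 8389 with *shift = 9 + 14 = 23, and indeed
// (x * 8389) >> 23 is a close approximation of x / 1000.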
static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
  int64_t f;
  *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                               : get_msb((unsigned int)D));
  // e is obtained from D after resetting the most significant 1 bit.
  const int64_t e = D - ((uint64_t)1 << *shift);
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  if (*shift > DIV_LUT_BITS)
    f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
  else
    f = e << (DIV_LUT_BITS - *shift);
  assert(f <= DIV_LUT_NUM);
  *shift += DIV_LUT_PREC_BITS;
  // Use f as lookup into the precomputed table of multipliers
  return div_lut[f];
}

static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
  int32_t f;
  *shift = get_msb(D);
  // e is obtained from D after resetting the most significant 1 bit.
  const int32_t e = D - ((uint32_t)1 << *shift);
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  if (*shift > DIV_LUT_BITS)
    f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
  else
    f = e << (DIV_LUT_BITS - *shift);
  assert(f <= DIV_LUT_NUM);
  *shift += DIV_LUT_PREC_BITS;
  // Use f as lookup into the precomputed table of multipliers
  return div_lut[f];
}

static int is_affine_valid(const WarpedMotionParams *const wm) {
  const int32_t *mat = wm->wmmat;
  return (mat[2] > 0);
}

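// Bounds check on the shear parameters: the derivation in the comment above
// av1_warp_affine_c shows that the 8-tap warp filter is only applicable when
// 4 * |alpha| + 7 * |beta| and 4 * |gamma| + 4 * |delta| both stay below
// (1 << WARPEDMODEL_PREC_BITS), i.e. below 1.0 in model units.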
static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
                                   int16_t delta) {
  if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
      (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
    return 0;
  else
    return 1;
}

// Returns 1 on success or 0 on an invalid affine set
int av1_get_shear_params(WarpedMotionParams *wm) {
  const int32_t *mat = wm->wmmat;
  if (!is_affine_valid(wm)) return 0;
  wm->alpha =
      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
  int16_t shift;
  int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
  int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
  wm->gamma =
      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
  v = ((int64_t)mat[3] * mat[4]) * y;
  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
                        (1 << WARPEDMODEL_PREC_BITS),
                    INT16_MIN, INT16_MAX);

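  // In real-number terms, with a = mat[2], b = mat[3], c = mat[4], d = mat[5]
  // all taken relative to (1 << WARPEDMODEL_PREC_BITS) == 1.0, the code above
  // computes alpha = a - 1, beta = b, gamma = c / a and
  // delta = d - b * c / a - 1: the shear factorization described in the
  // comment above av1_warp_affine_c. Next, each parameter is rounded to a
  // multiple of (1 << WARP_PARAM_REDUCE_BITS).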
  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
             (1 << WARP_PARAM_REDUCE_BITS);
  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);

  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
    return 0;

  return 1;
}

#if CONFIG_AV1_HIGHBITDEPTH
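// Error metric for high bitdepth residuals: |err| is split into a high part
// e1 (the top bits) and a low part e2 (the low b = bd - 8 bits), and the
// result linearly interpolates error_measure_lut, blending lut[255 + e1] and
// lut[256 + e1] with weights ((1 << b) - e2) and e2.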
static INLINE int highbd_error_measure(int err, int bd) {
  const int b = bd - 8;
  const int bmask = (1 << b) - 1;
  const int v = (1 << b);
  err = abs(err);
  const int e1 = err >> b;
  const int e2 = err & bmask;
  return error_measure_lut[255 + e1] * (v - e2) +
         error_measure_lut[256 + e1] * e2;
}

/* Note: For an explanation of the warp algorithm, and some notes on bit widths
   for hardware implementations, see the comments above av1_warp_affine_c
*/
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                              int width, int height, int stride, uint16_t *pred,
                              int p_col, int p_row, int p_width, int p_height,
                              int p_stride, int subsampling_x,
                              int subsampling_y, int bd,
                              ConvolveParams *conv_params, int16_t alpha,
                              int16_t beta, int16_t gamma, int16_t delta) {
  int32_t tmp[15 * 8];
  const int reduce_bits_horiz =
      conv_params->round_0 +
      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
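  // The AOMMAX() term above increases the horizontal rounding for larger bit
  // depths so that max_bits_horiz never exceeds 15, i.e. so that the
  // horizontal sums stored in 'tmp' always fit in 16-bit integers.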
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      const int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      const int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);
        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            const int sample_x = clamp(ix + m, 0, width - 1);
            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint16_t *dst16 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst16 =
                  clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
            } else {
              *p = sum;
            }
          } else {
            uint16_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
          }
          sy += gamma;
        }
      }
    }
  }
}

void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
                       int width, int height, int stride, uint16_t *const pred,
                       int p_col, int p_row, int p_width, int p_height,
                       int p_stride, int subsampling_x, int subsampling_y,
                       int bd, ConvolveParams *conv_params) {
  assert(wm->wmtype <= AFFINE);
  if (wm->wmtype == ROTZOOM) {
    wm->wmmat[5] = wm->wmmat[2];
    wm->wmmat[4] = -wm->wmmat[3];
  }
  const int32_t *const mat = wm->wmmat;
  const int16_t alpha = wm->alpha;
  const int16_t beta = wm->beta;
  const int16_t gamma = wm->gamma;
  const int16_t delta = wm->delta;

  av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
                         p_width, p_height, p_stride, subsampling_x,
                         subsampling_y, bd, conv_params, alpha, beta, gamma,
                         delta);
}

int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
                                    const uint16_t *const dst, int p_width,
                                    int p_height, int p_stride, int bd) {
  int64_t sum_error = 0;
  for (int i = 0; i < p_height; ++i) {
    for (int j = 0; j < p_width; ++j) {
      sum_error +=
          highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd);
    }
  }
  return sum_error;
}

static int64_t highbd_segmented_frame_error(
    const uint16_t *const ref, int stride, const uint16_t *const dst,
    int p_width, int p_height, int p_stride, int bd, uint8_t *segment_map,
    int segment_map_stride) {
  int patch_w, patch_h;
  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
  int64_t sum_error = 0;
  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
      // Only compute the error if this block contains inliers from the motion
      // model
      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;

      // avoid computing error into the frame padding
      patch_w = AOMMIN(error_bsize_w, p_width - j);
      patch_h = AOMMIN(error_bsize_h, p_height - i);
      sum_error += av1_calc_highbd_frame_error(ref + j + i * stride, stride,
                                               dst + j + i * p_stride, patch_w,
                                               patch_h, p_stride, bd);
    }
  }
  return sum_error;
}
#endif // CONFIG_AV1_HIGHBITDEPTH

/* The warp filter for ROTZOOM and AFFINE models works as follows:
   * Split the input into 8x8 blocks
   * For each block, project the point (4, 4) within the block, to get the
     overall block position. Split into integer and fractional coordinates,
     maintaining full WARPEDMODEL precision
   * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
     variable horizontal offset. This means that, while the rows of the
     intermediate buffer align with the rows of the *reference* image, the
     columns align with the columns of the *destination* image.
   * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
     destination is too small we crop the output at this stage). Each pixel has
     a variable vertical offset, so that the resulting rows are aligned with
     the rows of the destination image.

   To accomplish these alignments, we factor the warp matrix as a
   product of two shear / asymmetric zoom matrices:
   / a b \  = /   1       0    \ * / 1+alpha  beta \
   \ c d /    \ gamma  1+delta /   \    0      1   /
   where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
   The horizontal shear (with alpha and beta) is applied first,
   then the vertical shear (with gamma and delta) is applied second.

   The only limitation is that, to fit this in a fixed 8-tap filter size,
   the fractional pixel offsets must be at most +-1. Since the horizontal
   filter generates 15 rows of 8 columns, and the initial point we project
   is at (4, 4) within the block, the parameters must satisfy
   4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 4 * |delta| <= 1
   for this filter to be applicable.

   Note: This function assumes that the caller has done all of the relevant
   checks, i.e. that we have a ROTZOOM or AFFINE model, that wmmat[4] and
   wmmat[5] are set appropriately (if using a ROTZOOM model), and that
   alpha, beta, gamma, delta are all in range.

   TODO(david.barker): Maybe support scaled references?
*/
/* A note on hardware implementation:
   The warp filter is intended to be implementable using the same hardware as
   the high-precision convolve filters from the loop-restoration and
   convolve-round experiments.

   For a single filter stage, considering all of the coefficient sets for the
   warp filter and the regular convolution filter, an input in the range
   [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
   before rounding.

   Allowing for some changes to the filter coefficient sets, call the range
   [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
   we can replace this by the range [0, 256 * 2^k], which can be stored in an
   unsigned value with 8 + k bits.

   This allows the derivation of the appropriate bit widths and offsets for
   the various intermediate values: If

   F := FILTER_BITS = 7 (or else the above ranges need adjusting)
        So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
        intermediate value.
   H := ROUND0_BITS
   V := VERSHEAR_REDUCE_PREC_BITS
        (and note that we must have H + V = 2*F for the output to have the
        same scale as the input)

   then we end up with the following offsets and ranges:
   Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
                      uint{bd + F + 1}
   After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
   Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
                    uint{bd + 2*F + 2 - H}
   After rounding: The final value, before undoing the offset, fits into a
                   uint{bd + 2}.

   Then we need to undo the offsets before clamping to a pixel. Note that,
   if we do this at the end, the amount to subtract is actually independent
   of H and V:

   offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
                        (1 << ((bd + 2*F - H) - V))
                     == (1 << (bd - 1)) + (1 << bd)

   This allows us to entirely avoid clamping in both the warp filter and
   the convolve-round experiment. As of the time of writing, the Wiener filter
   from loop-restoration can encode a central coefficient up to 216, which
   leads to a maximum value of about 282 * 2^k after applying the offset.
   So in that case we still need to clamp.
*/
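/* Concretely, in the 8-bit path of av1_warp_affine_c below (bd = 8, F = 7),
   and assuming the usual non-compound rounding split H = round_0 = 3 and
   V = 11: the horizontal stage adds an offset of 1 << 14 and 'tmp' holds
   13-bit values; the vertical stage adds 1 << 19 and, after shifting by
   reduce_bits_vert = 11, yields a 10-bit value from which
   (1 << 7) + (1 << 8) is subtracted before clamping to [0, 255]. */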
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                       int height, int stride, uint8_t *pred, int p_col,
                       int p_row, int p_width, int p_height, int p_stride,
                       int subsampling_x, int subsampling_y,
                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
                       int16_t gamma, int16_t delta) {
  int32_t tmp[15 * 8];
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        // Clamp to top/bottom edge of the frame
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);

        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // At this point, sx = sx4 + alpha * l + beta * k
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            // Clamp to left/right edge of the frame
            const int sample_x = clamp(ix + m, 0, width - 1);

            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          // At this point, sy = sy4 + gamma * l + delta * k
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint8_t *dst8 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
            } else {
              *p = sum;
            }
          } else {
            uint8_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
          }
          sy += gamma;
        }
      }
    }
  }
}

void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
                int height, int stride, uint8_t *pred, int p_col, int p_row,
                int p_width, int p_height, int p_stride, int subsampling_x,
                int subsampling_y, ConvolveParams *conv_params) {
  assert(wm->wmtype <= AFFINE);
  if (wm->wmtype == ROTZOOM) {
    wm->wmmat[5] = wm->wmmat[2];
    wm->wmmat[4] = -wm->wmmat[3];
  }
  const int32_t *const mat = wm->wmmat;
  const int16_t alpha = wm->alpha;
  const int16_t beta = wm->beta;
  const int16_t gamma = wm->gamma;
  const int16_t delta = wm->delta;
  av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
                  p_height, p_stride, subsampling_x, subsampling_y, conv_params,
                  alpha, beta, gamma, delta);
}

int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride,
                               const uint8_t *const dst, int p_width,
                               int p_height, int p_stride) {
  int64_t sum_error = 0;
  for (int i = 0; i < p_height; ++i) {
    for (int j = 0; j < p_width; ++j) {
      sum_error +=
          (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]);
    }
  }
  return sum_error;
}

static int64_t segmented_frame_error(const uint8_t *const ref, int stride,
                                     const uint8_t *const dst, int p_width,
                                     int p_height, int p_stride,
                                     uint8_t *segment_map,
                                     int segment_map_stride) {
  int patch_w, patch_h;
  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
  int64_t sum_error = 0;
  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
      // Only compute the error if this block contains inliers from the motion
      // model
      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;

      // avoid computing error into the frame padding
      patch_w = AOMMIN(error_bsize_w, p_width - j);
      patch_h = AOMMIN(error_bsize_h, p_height - i);
      sum_error += av1_calc_frame_error(ref + j + i * stride, stride,
                                        dst + j + i * p_stride, patch_w,
                                        patch_h, p_stride);
    }
  }
  return sum_error;
}

int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
                        uint8_t *dst, int p_width, int p_height, int p_stride) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd) {
    return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
                                       CONVERT_TO_SHORTPTR(dst), p_width,
                                       p_height, p_stride, bd);
  }
#endif
  (void)use_hbd;
  (void)bd;
  return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
}

int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
                                  int stride, uint8_t *dst, int p_width,
                                  int p_height, int p_stride,
                                  uint8_t *segment_map,
                                  int segment_map_stride) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd) {
    return highbd_segmented_frame_error(
        CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width,
        p_height, p_stride, bd, segment_map, segment_map_stride);
  }
#endif
  (void)use_hbd;
  (void)bd;
  return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride,
                               segment_map, segment_map_stride);
}

void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
                    const uint8_t *ref, int width, int height, int stride,
                    uint8_t *pred, int p_col, int p_row, int p_width,
                    int p_height, int p_stride, int subsampling_x,
                    int subsampling_y, ConvolveParams *conv_params) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd)
    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
                      p_height, p_stride, subsampling_x, subsampling_y, bd,
                      conv_params);
  else
    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
#else
  (void)use_hbd;
  (void)bd;
  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
#endif
}

#define LS_MV_MAX 256  // max mv in 1/8-pel
// Use LS_STEP = 8 so that 2 fewer bits are needed for A, Bx, By.
#define LS_STEP 8

// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
// the precision needed is:
//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
//   1 [for sign] +
//   LEAST_SQUARES_SAMPLES_MAX_BITS
//     [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
// With MAX_SB_SIZE_LOG2 == 7 and LEAST_SQUARES_SAMPLES_MAX_BITS == 3,
// this comes to 25.
#define LS_MAT_RANGE_BITS \
  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)

// Bit-depth reduction from the full range
#define LS_MAT_DOWN_BITS 2

// Bit range of A, Bx and By after downshifting
#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
// By setting LS_STEP = 8, the 2 least significant bits of every element in
// A, Bx and By are 0. So, we can reduce LS_MAT_RANGE_BITS by 2 here.
#define LS_SQUARE(a)                                          \
  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT1(a, b)                                           \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT2(a, b)                                               \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))
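// Algebraically, LS_PRODUCT1(a, b) evaluates
//   (4 * (a + LS_STEP / 2) * (b + LS_STEP / 2)) >> (2 + LS_MAT_DOWN_BITS),
// i.e. the product of the two inputs offset by half a step; LS_SQUARE and
// LS_PRODUCT2 add one extra LS_STEP * LS_STEP inside the shift, which acts
// as a rounding bias.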

#define USE_LIMITED_PREC_MULT 0

#if USE_LIMITED_PREC_MULT

#define MUL_PREC_BITS 16
static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
  int msb = 0;
  uint16_t mult = 0;
  *shift = 0;
  if (D != 0) {
    msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                              : get_msb((unsigned int)D));
    if (msb >= MUL_PREC_BITS) {
      mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
      *shift = msb + 1 - MUL_PREC_BITS;
    } else {
      mult = (uint16_t)D;
      *shift = 0;
    }
  }
  return mult;
}

static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
  int16_t mshift;
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
  shift -= mshift;
  if (shift > 0) {
    return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  } else {
    return (int32_t)clamp(v * (1 << (-shift)),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  }
}

static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  int16_t mshift;
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
  shift -= mshift;
  if (shift > 0) {
    return (int32_t)clamp(
        ROUND_POWER_OF_TWO_SIGNED(v, shift),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  } else {
    return (int32_t)clamp(
        v * (1 << (-shift)),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  }
}

#else

static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
  int64_t v = Px * (int64_t)iDet;
  return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
}

static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  int64_t v = Px * (int64_t)iDet;
  return (int32_t)clamp64(
      ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
}
#endif // USE_LIMITED_PREC_MULT

static int find_affine_int(int np, const int *pts1, const int *pts2,
                           BLOCK_SIZE bsize, int mvy, int mvx,
                           WarpedMotionParams *wm, int mi_row, int mi_col) {
  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
  int32_t Bx[2] = { 0, 0 };
  int32_t By[2] = { 0, 0 };

  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  const int rsuy = bh / 2 - 1;
  const int rsux = bw / 2 - 1;
  const int suy = rsuy * 8;
  const int sux = rsux * 8;
  const int duy = suy + mvy;
  const int dux = sux + mvx;

  // Assume the center pixel of the block has exactly the same motion vector
  // as transmitted for the block. First shift the origin of the source
  // points to the block center, and the origin of the destination points to
  // the block center added to the motion vector transmitted.
  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  // points after origin shifting, for i = 0, 1, 2, ..., n-1.
  // Then if    P = [x0, y0,
  //                 x1, y1,
  //                 x2, y2,
  //                 ...
  //                ]
  //            q = [x0', x1', x2', ...]'
  //            r = [y0', y1', y2', ...]'
  // the least squares problems that need to be solved are:
  //          [h1, h2]' = inv(P'P)P'q and
  //          [h3, h4]' = inv(P'P)P'r
  // where the affine transformation is given by:
  //          x' = h1.x + h2.y
  //          y' = h3.x + h4.y
  //
  // The loop below computes: A = P'P, Bx = P'q, By = P'r.
  // We then just need to compute inv(A).Bx and inv(A).By for the solutions.
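  //
  // For this 2x2 system, Cramer's rule gives the solutions directly:
  //   Det = A[0][0] * A[1][1] - A[0][1] * A[0][1]
  //   h1  = ( A[1][1] * Bx[0] - A[0][1] * Bx[1]) / Det
  //   h2  = (-A[0][1] * Bx[0] + A[0][0] * Bx[1]) / Det
  // (likewise [h3, h4] with By), which is what the Px[] / Py[] expressions
  // further down implement, deferring the division to resolve_divisor_64().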
  // Contribution from neighbor block
  for (int i = 0; i < np; i++) {
    const int dx = pts2[i * 2] - dux;
    const int dy = pts2[i * 2 + 1] - duy;
    const int sx = pts1[i * 2] - sux;
    const int sy = pts1[i * 2 + 1] - suy;
    // TODO(yunqing): This comparison wouldn't be necessary if the sample
    // selection is done in find_samples(). Also, the global offset can be
    // removed while collecting samples.
    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
      A[0][0] += LS_SQUARE(sx);
      A[0][1] += LS_PRODUCT1(sx, sy);
      A[1][1] += LS_SQUARE(sy);
      Bx[0] += LS_PRODUCT2(sx, dx);
      Bx[1] += LS_PRODUCT1(sy, dx);
      By[0] += LS_PRODUCT1(sx, dy);
      By[1] += LS_PRODUCT2(sy, dy);
    }
  }

  // Just for debugging, and can be removed later.
  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);

  // Compute the determinant of A
  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  if (Det == 0) return 1;

  int16_t shift;
  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  shift -= WARPEDMODEL_PREC_BITS;
  if (shift < 0) {
    iDet <<= (-shift);
    shift = 0;
  }

  int64_t Px[2], Py[2];
  // These, divided by Det, are the least squares solutions
  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];

  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);

  const int isuy = (mi_row * MI_SIZE + rsuy);
  const int isux = (mi_col * MI_SIZE + rsux);
  // Note: In the vx, vy expressions below, the max value of each of the
  // 2nd and 3rd terms is (2^16 - 1) * (2^13 - 1). That leaves enough room
  // for the first term so that the overall sum in the worst case fits
  // within 32 bits overall.
  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
                      isuy * wm->wmmat[3]);
  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * wm->wmmat[4] +
                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  wm->wmmat[0] =
      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  wm->wmmat[1] =
      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);

  wm->wmmat[6] = wm->wmmat[7] = 0;
  return 0;
}

int av1_find_projection(int np, const int *pts1, const int *pts2,
                        BLOCK_SIZE bsize, int mvy, int mvx,
                        WarpedMotionParams *wm_params, int mi_row, int mi_col) {
  assert(wm_params->wmtype == AFFINE);

  if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
                      mi_col))
    return 1;

  // check compatibility with the fast warp filter
  if (!av1_get_shear_params(wm_params)) return 1;

  return 0;
}
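
// Typical call sequence (illustrative sketch, not an API contract;
// default_warp_params and the surrounding variables are assumed from the
// caller's context):
//
//   WarpedMotionParams wm = default_warp_params;
//   wm.wmtype = AFFINE;
//   if (!av1_find_projection(np, pts1, pts2, bsize, mv.row, mv.col, &wm,
//                            mi_row, mi_col)) {
//     // The model also passed av1_get_shear_params(), so it can be warped:
//     av1_warp_plane(&wm, use_hbd, bd, ref, width, height, stride, pred,
//                    p_col, p_row, p_width, p_height, p_stride,
//                    subsampling_x, subsampling_y, &conv_params);
//   }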