/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include <math.h>
#include <assert.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"
#include "av1/common/scale.h"

// For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
// at a time. The zoom/rotation/shear in the model are applied to the
// "fractional" position of each pixel, which therefore varies within
// [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
// We need an extra 2 taps to fit this in, for a total of 8 taps.
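//
// As a worked illustration (not from the original comments): with
// WARPEDPIXEL_PREC_BITS == 6 there are WARPEDPIXEL_PREC_SHIFTS == 64 phases
// per pixel, so the fractional range [-1, 2) covers 3 * 64 rows, plus one
// dummy row, giving the 193 rows below. The filter loops index this table as
//   offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
//          WARPEDPIXEL_PREC_SHIFTS;
//   coeffs = av1_warped_filter[offs];
// so fractional offsets spanning [-1, 2) pixels map to rows [0, 3 * 64].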
/* clang-format off */
const int16_t av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8] = {
#if WARPEDPIXEL_PREC_BITS == 6
  // [-1, 0)
  { 0, 0, 127, 1, 0, 0, 0, 0 }, { 0, - 1, 127, 2, 0, 0, 0, 0 },
  { 1, - 3, 127, 4, - 1, 0, 0, 0 }, { 1, - 4, 126, 6, - 2, 1, 0, 0 },
  { 1, - 5, 126, 8, - 3, 1, 0, 0 }, { 1, - 6, 125, 11, - 4, 1, 0, 0 },
  { 1, - 7, 124, 13, - 4, 1, 0, 0 }, { 2, - 8, 123, 15, - 5, 1, 0, 0 },
  { 2, - 9, 122, 18, - 6, 1, 0, 0 }, { 2, -10, 121, 20, - 6, 1, 0, 0 },
  { 2, -11, 120, 22, - 7, 2, 0, 0 }, { 2, -12, 119, 25, - 8, 2, 0, 0 },
  { 3, -13, 117, 27, - 8, 2, 0, 0 }, { 3, -13, 116, 29, - 9, 2, 0, 0 },
  { 3, -14, 114, 32, -10, 3, 0, 0 }, { 3, -15, 113, 35, -10, 2, 0, 0 },
  { 3, -15, 111, 37, -11, 3, 0, 0 }, { 3, -16, 109, 40, -11, 3, 0, 0 },
  { 3, -16, 108, 42, -12, 3, 0, 0 }, { 4, -17, 106, 45, -13, 3, 0, 0 },
  { 4, -17, 104, 47, -13, 3, 0, 0 }, { 4, -17, 102, 50, -14, 3, 0, 0 },
  { 4, -17, 100, 52, -14, 3, 0, 0 }, { 4, -18, 98, 55, -15, 4, 0, 0 },
  { 4, -18, 96, 58, -15, 3, 0, 0 }, { 4, -18, 94, 60, -16, 4, 0, 0 },
  { 4, -18, 91, 63, -16, 4, 0, 0 }, { 4, -18, 89, 65, -16, 4, 0, 0 },
  { 4, -18, 87, 68, -17, 4, 0, 0 }, { 4, -18, 85, 70, -17, 4, 0, 0 },
  { 4, -18, 82, 73, -17, 4, 0, 0 }, { 4, -18, 80, 75, -17, 4, 0, 0 },
  { 4, -18, 78, 78, -18, 4, 0, 0 }, { 4, -17, 75, 80, -18, 4, 0, 0 },
  { 4, -17, 73, 82, -18, 4, 0, 0 }, { 4, -17, 70, 85, -18, 4, 0, 0 },
  { 4, -17, 68, 87, -18, 4, 0, 0 }, { 4, -16, 65, 89, -18, 4, 0, 0 },
  { 4, -16, 63, 91, -18, 4, 0, 0 }, { 4, -16, 60, 94, -18, 4, 0, 0 },
  { 3, -15, 58, 96, -18, 4, 0, 0 }, { 4, -15, 55, 98, -18, 4, 0, 0 },
  { 3, -14, 52, 100, -17, 4, 0, 0 }, { 3, -14, 50, 102, -17, 4, 0, 0 },
  { 3, -13, 47, 104, -17, 4, 0, 0 }, { 3, -13, 45, 106, -17, 4, 0, 0 },
  { 3, -12, 42, 108, -16, 3, 0, 0 }, { 3, -11, 40, 109, -16, 3, 0, 0 },
  { 3, -11, 37, 111, -15, 3, 0, 0 }, { 2, -10, 35, 113, -15, 3, 0, 0 },
  { 3, -10, 32, 114, -14, 3, 0, 0 }, { 2, - 9, 29, 116, -13, 3, 0, 0 },
  { 2, - 8, 27, 117, -13, 3, 0, 0 }, { 2, - 8, 25, 119, -12, 2, 0, 0 },
  { 2, - 7, 22, 120, -11, 2, 0, 0 }, { 1, - 6, 20, 121, -10, 2, 0, 0 },
  { 1, - 6, 18, 122, - 9, 2, 0, 0 }, { 1, - 5, 15, 123, - 8, 2, 0, 0 },
  { 1, - 4, 13, 124, - 7, 1, 0, 0 }, { 1, - 4, 11, 125, - 6, 1, 0, 0 },
  { 1, - 3, 8, 126, - 5, 1, 0, 0 }, { 1, - 2, 6, 126, - 4, 1, 0, 0 },
  { 0, - 1, 4, 127, - 3, 1, 0, 0 }, { 0, 0, 2, 127, - 1, 0, 0, 0 },

  // [0, 1)
  { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 0, -1, 127, 2, 0, 0, 0},
  { 0, 1, -3, 127, 4, -2, 1, 0}, { 0, 1, -5, 127, 6, -2, 1, 0},
  { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 2, -7, 126, 11, -4, 2, -1},
  {-1, 3, -8, 125, 13, -5, 2, -1}, {-1, 3, -10, 124, 16, -6, 3, -1},
  {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -12, 122, 20, -7, 3, -1},
  {-1, 4, -13, 121, 23, -8, 3, -1}, {-2, 5, -14, 120, 25, -9, 4, -1},
  {-1, 5, -15, 119, 27, -10, 4, -1}, {-1, 5, -16, 118, 30, -11, 4, -1},
  {-2, 6, -17, 116, 33, -12, 5, -1}, {-2, 6, -17, 114, 35, -12, 5, -1},
  {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 111, 41, -14, 6, -2},
  {-2, 7, -19, 110, 43, -15, 6, -2}, {-2, 7, -20, 108, 46, -15, 6, -2},
  {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 104, 51, -16, 7, -2},
  {-2, 7, -21, 102, 54, -17, 7, -2}, {-2, 8, -21, 100, 56, -18, 7, -2},
  {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 96, 62, -19, 7, -2},
  {-2, 8, -22, 94, 64, -19, 7, -2}, {-2, 8, -22, 91, 67, -20, 8, -2},
  {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -22, 87, 72, -21, 8, -2},
  {-2, 8, -21, 84, 74, -21, 8, -2}, {-2, 8, -22, 82, 77, -21, 8, -2},
  {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 77, 82, -22, 8, -2},
  {-2, 8, -21, 74, 84, -21, 8, -2}, {-2, 8, -21, 72, 87, -22, 8, -2},
  {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 8, -20, 67, 91, -22, 8, -2},
  {-2, 7, -19, 64, 94, -22, 8, -2}, {-2, 7, -19, 62, 96, -22, 8, -2},
  {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -18, 56, 100, -21, 8, -2},
  {-2, 7, -17, 54, 102, -21, 7, -2}, {-2, 7, -16, 51, 104, -21, 7, -2},
  {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 46, 108, -20, 7, -2},
  {-2, 6, -15, 43, 110, -19, 7, -2}, {-2, 6, -14, 41, 111, -19, 7, -2},
  {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 35, 114, -17, 6, -2},
  {-1, 5, -12, 33, 116, -17, 6, -2}, {-1, 4, -11, 30, 118, -16, 5, -1},
  {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 4, -9, 25, 120, -14, 5, -2},
  {-1, 3, -8, 23, 121, -13, 4, -1}, {-1, 3, -7, 20, 122, -12, 4, -1},
  {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 3, -6, 16, 124, -10, 3, -1},
  {-1, 2, -5, 13, 125, -8, 3, -1}, {-1, 2, -4, 11, 126, -7, 2, -1},
  { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 6, 127, -5, 1, 0},
  { 0, 1, -2, 4, 127, -3, 1, 0}, { 0, 0, 0, 2, 127, -1, 0, 0},

  // [1, 2)
  { 0, 0, 0, 1, 127, 0, 0, 0 }, { 0, 0, 0, - 1, 127, 2, 0, 0 },
  { 0, 0, 1, - 3, 127, 4, - 1, 0 }, { 0, 0, 1, - 4, 126, 6, - 2, 1 },
  { 0, 0, 1, - 5, 126, 8, - 3, 1 }, { 0, 0, 1, - 6, 125, 11, - 4, 1 },
  { 0, 0, 1, - 7, 124, 13, - 4, 1 }, { 0, 0, 2, - 8, 123, 15, - 5, 1 },
  { 0, 0, 2, - 9, 122, 18, - 6, 1 }, { 0, 0, 2, -10, 121, 20, - 6, 1 },
  { 0, 0, 2, -11, 120, 22, - 7, 2 }, { 0, 0, 2, -12, 119, 25, - 8, 2 },
  { 0, 0, 3, -13, 117, 27, - 8, 2 }, { 0, 0, 3, -13, 116, 29, - 9, 2 },
  { 0, 0, 3, -14, 114, 32, -10, 3 }, { 0, 0, 3, -15, 113, 35, -10, 2 },
  { 0, 0, 3, -15, 111, 37, -11, 3 }, { 0, 0, 3, -16, 109, 40, -11, 3 },
  { 0, 0, 3, -16, 108, 42, -12, 3 }, { 0, 0, 4, -17, 106, 45, -13, 3 },
  { 0, 0, 4, -17, 104, 47, -13, 3 }, { 0, 0, 4, -17, 102, 50, -14, 3 },
  { 0, 0, 4, -17, 100, 52, -14, 3 }, { 0, 0, 4, -18, 98, 55, -15, 4 },
  { 0, 0, 4, -18, 96, 58, -15, 3 }, { 0, 0, 4, -18, 94, 60, -16, 4 },
  { 0, 0, 4, -18, 91, 63, -16, 4 }, { 0, 0, 4, -18, 89, 65, -16, 4 },
  { 0, 0, 4, -18, 87, 68, -17, 4 }, { 0, 0, 4, -18, 85, 70, -17, 4 },
  { 0, 0, 4, -18, 82, 73, -17, 4 }, { 0, 0, 4, -18, 80, 75, -17, 4 },
  { 0, 0, 4, -18, 78, 78, -18, 4 }, { 0, 0, 4, -17, 75, 80, -18, 4 },
  { 0, 0, 4, -17, 73, 82, -18, 4 }, { 0, 0, 4, -17, 70, 85, -18, 4 },
  { 0, 0, 4, -17, 68, 87, -18, 4 }, { 0, 0, 4, -16, 65, 89, -18, 4 },
  { 0, 0, 4, -16, 63, 91, -18, 4 }, { 0, 0, 4, -16, 60, 94, -18, 4 },
  { 0, 0, 3, -15, 58, 96, -18, 4 }, { 0, 0, 4, -15, 55, 98, -18, 4 },
  { 0, 0, 3, -14, 52, 100, -17, 4 }, { 0, 0, 3, -14, 50, 102, -17, 4 },
  { 0, 0, 3, -13, 47, 104, -17, 4 }, { 0, 0, 3, -13, 45, 106, -17, 4 },
  { 0, 0, 3, -12, 42, 108, -16, 3 }, { 0, 0, 3, -11, 40, 109, -16, 3 },
  { 0, 0, 3, -11, 37, 111, -15, 3 }, { 0, 0, 2, -10, 35, 113, -15, 3 },
  { 0, 0, 3, -10, 32, 114, -14, 3 }, { 0, 0, 2, - 9, 29, 116, -13, 3 },
  { 0, 0, 2, - 8, 27, 117, -13, 3 }, { 0, 0, 2, - 8, 25, 119, -12, 2 },
  { 0, 0, 2, - 7, 22, 120, -11, 2 }, { 0, 0, 1, - 6, 20, 121, -10, 2 },
  { 0, 0, 1, - 6, 18, 122, - 9, 2 }, { 0, 0, 1, - 5, 15, 123, - 8, 2 },
  { 0, 0, 1, - 4, 13, 124, - 7, 1 }, { 0, 0, 1, - 4, 11, 125, - 6, 1 },
  { 0, 0, 1, - 3, 8, 126, - 5, 1 }, { 0, 0, 1, - 2, 6, 126, - 4, 1 },
  { 0, 0, 0, - 1, 4, 127, - 3, 1 }, { 0, 0, 0, 0, 2, 127, - 1, 0 },
  // dummy (replicate row index 191)
  { 0, 0, 0, 0, 2, 127, - 1, 0 },

#elif WARPEDPIXEL_PREC_BITS == 5
  // [-1, 0)
  {0, 0, 127, 1, 0, 0, 0, 0}, {1, -3, 127, 4, -1, 0, 0, 0},
  {1, -5, 126, 8, -3, 1, 0, 0}, {1, -7, 124, 13, -4, 1, 0, 0},
  {2, -9, 122, 18, -6, 1, 0, 0}, {2, -11, 120, 22, -7, 2, 0, 0},
  {3, -13, 117, 27, -8, 2, 0, 0}, {3, -14, 114, 32, -10, 3, 0, 0},
  {3, -15, 111, 37, -11, 3, 0, 0}, {3, -16, 108, 42, -12, 3, 0, 0},
  {4, -17, 104, 47, -13, 3, 0, 0}, {4, -17, 100, 52, -14, 3, 0, 0},
  {4, -18, 96, 58, -15, 3, 0, 0}, {4, -18, 91, 63, -16, 4, 0, 0},
  {4, -18, 87, 68, -17, 4, 0, 0}, {4, -18, 82, 73, -17, 4, 0, 0},
  {4, -18, 78, 78, -18, 4, 0, 0}, {4, -17, 73, 82, -18, 4, 0, 0},
  {4, -17, 68, 87, -18, 4, 0, 0}, {4, -16, 63, 91, -18, 4, 0, 0},
  {3, -15, 58, 96, -18, 4, 0, 0}, {3, -14, 52, 100, -17, 4, 0, 0},
  {3, -13, 47, 104, -17, 4, 0, 0}, {3, -12, 42, 108, -16, 3, 0, 0},
  {3, -11, 37, 111, -15, 3, 0, 0}, {3, -10, 32, 114, -14, 3, 0, 0},
  {2, -8, 27, 117, -13, 3, 0, 0}, {2, -7, 22, 120, -11, 2, 0, 0},
  {1, -6, 18, 122, -9, 2, 0, 0}, {1, -4, 13, 124, -7, 1, 0, 0},
  {1, -3, 8, 126, -5, 1, 0, 0}, {0, -1, 4, 127, -3, 1, 0, 0},
  // [0, 1)
  { 0, 0, 0, 127, 1, 0, 0, 0}, { 0, 1, -3, 127, 4, -2, 1, 0},
  { 0, 2, -6, 126, 8, -3, 1, 0}, {-1, 3, -8, 125, 13, -5, 2, -1},
  {-1, 4, -11, 123, 18, -7, 3, -1}, {-1, 4, -13, 121, 23, -8, 3, -1},
  {-1, 5, -15, 119, 27, -10, 4, -1}, {-2, 6, -17, 116, 33, -12, 5, -1},
  {-2, 6, -18, 113, 38, -13, 5, -1}, {-2, 7, -19, 110, 43, -15, 6, -2},
  {-2, 7, -20, 106, 49, -16, 6, -2}, {-2, 7, -21, 102, 54, -17, 7, -2},
  {-2, 8, -22, 98, 59, -18, 7, -2}, {-2, 8, -22, 94, 64, -19, 7, -2},
  {-2, 8, -22, 89, 69, -20, 8, -2}, {-2, 8, -21, 84, 74, -21, 8, -2},
  {-2, 8, -21, 79, 79, -21, 8, -2}, {-2, 8, -21, 74, 84, -21, 8, -2},
  {-2, 8, -20, 69, 89, -22, 8, -2}, {-2, 7, -19, 64, 94, -22, 8, -2},
  {-2, 7, -18, 59, 98, -22, 8, -2}, {-2, 7, -17, 54, 102, -21, 7, -2},
  {-2, 6, -16, 49, 106, -20, 7, -2}, {-2, 6, -15, 43, 110, -19, 7, -2},
  {-1, 5, -13, 38, 113, -18, 6, -2}, {-1, 5, -12, 33, 116, -17, 6, -2},
  {-1, 4, -10, 27, 119, -15, 5, -1}, {-1, 3, -8, 23, 121, -13, 4, -1},
  {-1, 3, -7, 18, 123, -11, 4, -1}, {-1, 2, -5, 13, 125, -8, 3, -1},
  { 0, 1, -3, 8, 126, -6, 2, 0}, { 0, 1, -2, 4, 127, -3, 1, 0},
  // [1, 2)
  {0, 0, 0, 1, 127, 0, 0, 0}, {0, 0, 1, -3, 127, 4, -1, 0},
  {0, 0, 1, -5, 126, 8, -3, 1}, {0, 0, 1, -7, 124, 13, -4, 1},
  {0, 0, 2, -9, 122, 18, -6, 1}, {0, 0, 2, -11, 120, 22, -7, 2},
  {0, 0, 3, -13, 117, 27, -8, 2}, {0, 0, 3, -14, 114, 32, -10, 3},
  {0, 0, 3, -15, 111, 37, -11, 3}, {0, 0, 3, -16, 108, 42, -12, 3},
  {0, 0, 4, -17, 104, 47, -13, 3}, {0, 0, 4, -17, 100, 52, -14, 3},
  {0, 0, 4, -18, 96, 58, -15, 3}, {0, 0, 4, -18, 91, 63, -16, 4},
  {0, 0, 4, -18, 87, 68, -17, 4}, {0, 0, 4, -18, 82, 73, -17, 4},
  {0, 0, 4, -18, 78, 78, -18, 4}, {0, 0, 4, -17, 73, 82, -18, 4},
  {0, 0, 4, -17, 68, 87, -18, 4}, {0, 0, 4, -16, 63, 91, -18, 4},
  {0, 0, 3, -15, 58, 96, -18, 4}, {0, 0, 3, -14, 52, 100, -17, 4},
  {0, 0, 3, -13, 47, 104, -17, 4}, {0, 0, 3, -12, 42, 108, -16, 3},
  {0, 0, 3, -11, 37, 111, -15, 3}, {0, 0, 3, -10, 32, 114, -14, 3},
  {0, 0, 2, -8, 27, 117, -13, 3}, {0, 0, 2, -7, 22, 120, -11, 2},
  {0, 0, 1, -6, 18, 122, -9, 2}, {0, 0, 1, -4, 13, 124, -7, 1},
  {0, 0, 1, -3, 8, 126, -5, 1}, {0, 0, 0, -1, 4, 127, -3, 1},
  // dummy (replicate row index 95)
  {0, 0, 0, -1, 4, 127, -3, 1},

#endif  // WARPEDPIXEL_PREC_BITS == 6
};

/* clang-format on */

#define DIV_LUT_PREC_BITS 14
#define DIV_LUT_BITS 8
#define DIV_LUT_NUM (1 << DIV_LUT_BITS)

static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
  9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
  9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
  9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
  9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
  9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
  8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
  8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
  8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
  8240,  8224,  8208,  8192,
};
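
// Worked example (illustrative only): resolve_divisor_32(5, &shift) finds
// the MSB of 5 at position 2, so e = 5 - 4 = 1 and, since 2 <= DIV_LUT_BITS,
// f = 1 << (8 - 2) = 64. The table gives div_lut[64] = 13107 and the shift
// becomes 2 + 14 = 16, so 1/5 is approximated as 13107 / 2^16 ~= 0.19999.
// The entries themselves follow div_lut[f] = round(2^14 * 256 / (256 + f)),
// e.g. f = 0 -> 16384 and f = 256 -> 8192.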

// Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
// at precision of DIV_LUT_PREC_BITS along with the shift.
static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
  int64_t f;
  *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                               : get_msb((unsigned int)D));
  // e is obtained from D after resetting the most significant 1 bit.
  const int64_t e = D - ((uint64_t)1 << *shift);
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  if (*shift > DIV_LUT_BITS)
    f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
  else
    f = e << (DIV_LUT_BITS - *shift);
  assert(f <= DIV_LUT_NUM);
  *shift += DIV_LUT_PREC_BITS;
  // Use f as lookup into the precomputed table of multipliers
  return div_lut[f];
}

static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
  int32_t f;
  *shift = get_msb(D);
  // e is obtained from D after resetting the most significant 1 bit.
  const int32_t e = D - ((uint32_t)1 << *shift);
  // Get the most significant DIV_LUT_BITS (8) bits of e into f
  if (*shift > DIV_LUT_BITS)
    f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
  else
    f = e << (DIV_LUT_BITS - *shift);
  assert(f <= DIV_LUT_NUM);
  *shift += DIV_LUT_PREC_BITS;
  // Use f as lookup into the precomputed table of multipliers
  return div_lut[f];
}

static int is_affine_valid(const WarpedMotionParams *const wm) {
  const int32_t *mat = wm->wmmat;
  return (mat[2] > 0);
}

static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
                                   int16_t delta) {
  if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
      (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
    return 0;
  else
    return 1;
}
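
// In WARPEDMODEL_PREC_BITS (== 16) fixed point, (1 << WARPEDMODEL_PREC_BITS)
// represents 1.0, so the test above is the applicability condition
//   4 * |alpha| + 7 * |beta| < 1  and  4 * |gamma| + 4 * |delta| < 1
// derived in the comment above av1_warp_affine_c.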

// Returns 1 on success or 0 on an invalid affine set
int av1_get_shear_params(WarpedMotionParams *wm) {
  const int32_t *mat = wm->wmmat;
  if (!is_affine_valid(wm)) return 0;
  wm->alpha =
      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
  int16_t shift;
  int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
  int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
  wm->gamma =
      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
  v = ((int64_t)mat[3] * mat[4]) * y;
  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
                        (1 << WARPEDMODEL_PREC_BITS),
                    INT16_MIN, INT16_MAX);

  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
             (1 << WARP_PARAM_REDUCE_BITS);
  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);

  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
    return 0;

  return 1;
}
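
// In real-number terms, what av1_get_shear_params() computes (a sketch of
// the fixed-point code above, with 1.0 == (1 << WARPEDMODEL_PREC_BITS) and
// a = mat[2], b = mat[3], c = mat[4], d = mat[5]) is
//   alpha = a - 1,  beta = b,  gamma = c / a,  delta = d - b * c / a - 1,
// matching the shear factorization described above av1_warp_affine_c. The
// final rounding to multiples of (1 << WARP_PARAM_REDUCE_BITS) limits the
// number of distinct per-pixel filter phases, which presumably simplifies
// the SIMD and hardware implementations.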

#if CONFIG_AV1_HIGHBITDEPTH
static INLINE int highbd_error_measure(int err, int bd) {
  const int b = bd - 8;
  const int bmask = (1 << b) - 1;
  const int v = (1 << b);
  err = abs(err);
  const int e1 = err >> b;
  const int e2 = err & bmask;
  return error_measure_lut[255 + e1] * (v - e2) +
         error_measure_lut[256 + e1] * e2;
}
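
// For example (illustrative only): at bd == 10, b == 2, so an error err is
// split into a coarse part e1 = err >> 2 and a fractional part e2 = err & 3,
// and the result linearly interpolates two adjacent error_measure_lut
// entries with weights (4 - e2) and e2. This reuses the 8-bit lut for
// high-bitdepth errors instead of requiring a table with 1 << bd entries.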

/* Note: For an explanation of the warp algorithm, and some notes on bit widths
   for hardware implementations, see the comments above av1_warp_affine_c
*/
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                              int width, int height, int stride, uint16_t *pred,
                              int p_col, int p_row, int p_width, int p_height,
                              int p_stride, int subsampling_x,
                              int subsampling_y, int bd,
                              ConvolveParams *conv_params, int16_t alpha,
                              int16_t beta, int16_t gamma, int16_t delta) {
  int32_t tmp[15 * 8];
  const int reduce_bits_horiz =
      conv_params->round_0 +
      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);
        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            const int sample_x = clamp(ix + m, 0, width - 1);
            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint16_t *dst16 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst16 =
                  clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
            } else {
              *p = sum;
            }
          } else {
            uint16_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
          }
          sy += gamma;
        }
      }
    }
  }
}

void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
                       int width, int height, int stride, uint16_t *const pred,
                       int p_col, int p_row, int p_width, int p_height,
                       int p_stride, int subsampling_x, int subsampling_y,
                       int bd, ConvolveParams *conv_params) {
  assert(wm->wmtype <= AFFINE);
  if (wm->wmtype == ROTZOOM) {
    wm->wmmat[5] = wm->wmmat[2];
    wm->wmmat[4] = -wm->wmmat[3];
  }
  const int32_t *const mat = wm->wmmat;
  const int16_t alpha = wm->alpha;
  const int16_t beta = wm->beta;
  const int16_t gamma = wm->gamma;
  const int16_t delta = wm->delta;

  av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
                         p_width, p_height, p_stride, subsampling_x,
                         subsampling_y, bd, conv_params, alpha, beta, gamma,
                         delta);
}

int64_t av1_calc_highbd_frame_error(const uint16_t *const ref, int stride,
                                    const uint16_t *const dst, int p_width,
                                    int p_height, int p_stride, int bd) {
  int64_t sum_error = 0;
  for (int i = 0; i < p_height; ++i) {
    for (int j = 0; j < p_width; ++j) {
      sum_error +=
          highbd_error_measure(dst[j + i * p_stride] - ref[j + i * stride], bd);
    }
  }
  return sum_error;
}

static int64_t highbd_segmented_frame_error(
    const uint16_t *const ref, int stride, const uint16_t *const dst,
    int p_width, int p_height, int p_stride, int bd, uint8_t *segment_map,
    int segment_map_stride) {
  int patch_w, patch_h;
  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
  int64_t sum_error = 0;
  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
      // Only compute the error if this block contains inliers from the motion
      // model
      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;

      // Avoid computing error into the frame padding
      patch_w = AOMMIN(error_bsize_w, p_width - j);
      patch_h = AOMMIN(error_bsize_h, p_height - i);
      sum_error += av1_calc_highbd_frame_error(ref + j + i * stride, stride,
                                               dst + j + i * p_stride, patch_w,
                                               patch_h, p_stride, bd);
    }
  }
  return sum_error;
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

/* The warp filter for ROTZOOM and AFFINE models works as follows:
   * Split the input into 8x8 blocks
   * For each block, project the point (4, 4) within the block, to get the
     overall block position. Split into integer and fractional coordinates,
     maintaining full WARPEDMODEL precision
   * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
     variable horizontal offset. This means that, while the rows of the
     intermediate buffer align with the rows of the *reference* image, the
     columns align with the columns of the *destination* image.
   * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
     destination is too small we crop the output at this stage). Each pixel has
     a variable vertical offset, so that the resulting rows are aligned with
     the rows of the destination image.

   To accomplish these alignments, we factor the warp matrix as a
   product of two shear / asymmetric zoom matrices:
   / a b \  = /   1       0    \ * / 1+alpha  beta \
   \ c d /    \ gamma  1+delta /   \    0      1   /
   where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
   The horizontal shear (with alpha and beta) is applied first,
   then the vertical shear (with gamma and delta) is applied second.

   The only limitation is that, to fit this in a fixed 8-tap filter size,
   the fractional pixel offsets must be at most +-1. Since the horizontal
   filter generates 15 rows of 8 columns, and the initial point we project
   is at (4, 4) within the block, the parameters must satisfy
   4 * |alpha| + 7 * |beta| <= 1 and 4 * |gamma| + 4 * |delta| <= 1
   for this filter to be applicable.

   Note: This function assumes that the caller has done all of the relevant
   checks, i.e. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
   are set appropriately (if using a ROTZOOM model), and that alpha, beta,
   gamma, delta are all in range.

   TODO(rachelbarker): Maybe support scaled references?
*/
/* A note on hardware implementation:
   The warp filter is intended to be implementable using the same hardware as
   the high-precision convolve filters from the loop-restoration and
   convolve-round experiments.

   For a single filter stage, considering all of the coefficient sets for the
   warp filter and the regular convolution filter, an input in the range
   [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
   before rounding.

   Allowing for some changes to the filter coefficient sets, call the range
   [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
   we can replace this by the range [0, 256 * 2^k], which can be stored in an
   unsigned value with 8 + k bits.

   This allows the derivation of the appropriate bit widths and offsets for
   the various intermediate values: If

   F := FILTER_BITS = 7 (or else the above ranges need adjusting)
        So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
        intermediate value.
   H := ROUND0_BITS
   V := VERSHEAR_REDUCE_PREC_BITS
   (and note that we must have H + V = 2*F for the output to have the same
    scale as the input)

   then we end up with the following offsets and ranges:
   Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
                      uint{bd + F + 1}
   After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
   Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
                    uint{bd + 2*F + 2 - H}
   After rounding: The final value, before undoing the offset, fits into a
                   uint{bd + 2}.

   Then we need to undo the offsets before clamping to a pixel. Note that,
   if we do this at the end, the amount to subtract is actually independent
   of H and V:

     offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
                          (1 << ((bd + 2*F - H) - V))
                       == (1 << (bd - 1)) + (1 << bd)

   This allows us to entirely avoid clamping in both the warp filter and
   the convolve-round experiment. As of the time of writing, the Wiener filter
   from loop-restoration can encode a central coefficient up to 216, which
   leads to a maximum value of about 282 * 2^k after applying the offset.
   So in that case we still need to clamp.
*/
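/* Worked example of the bit-width derivation above (illustrative): for
   bd == 8 and F == 7 with a typical H == conv_params->round_0 == 3, the
   horizontal stage applies an offset of 1 << 14 and its sums fit in 16
   unsigned bits; after rounding by H the 'tmp' values fit in 13 bits. The
   vertical stage applies an offset of 1 << 19 and its sums fit in 21 bits;
   with V == 2*F - H == 11 the rounded result fits in bd + 2 == 10 bits, and
   subtracting (1 << 7) + (1 << 8) recovers the pixel range before clipping.
*/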
void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
                       int height, int stride, uint8_t *pred, int p_col,
                       int p_row, int p_width, int p_height, int p_stride,
                       int subsampling_x, int subsampling_y,
                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
                       int16_t gamma, int16_t delta) {
  int32_t tmp[15 * 8];
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        // Clamp to top/bottom edge of the frame
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);

        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // At this point, sx = sx4 + alpha * l + beta * k
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            // Clamp to left/right edge of the frame
            const int sample_x = clamp(ix + m, 0, width - 1);

            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          // At this point, sy = sy4 + gamma * l + delta * k
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const int16_t *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint8_t *dst8 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
            } else {
              *p = sum;
            }
          } else {
            uint8_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
          }
          sy += gamma;
        }
      }
    }
  }
}

void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
                int height, int stride, uint8_t *pred, int p_col, int p_row,
                int p_width, int p_height, int p_stride, int subsampling_x,
                int subsampling_y, ConvolveParams *conv_params) {
  assert(wm->wmtype <= AFFINE);
  if (wm->wmtype == ROTZOOM) {
    wm->wmmat[5] = wm->wmmat[2];
    wm->wmmat[4] = -wm->wmmat[3];
  }
  const int32_t *const mat = wm->wmmat;
  const int16_t alpha = wm->alpha;
  const int16_t beta = wm->beta;
  const int16_t gamma = wm->gamma;
  const int16_t delta = wm->delta;
  av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
                  p_height, p_stride, subsampling_x, subsampling_y, conv_params,
                  alpha, beta, gamma, delta);
}

int64_t av1_calc_frame_error_c(const uint8_t *const ref, int stride,
                               const uint8_t *const dst, int p_width,
                               int p_height, int p_stride) {
  int64_t sum_error = 0;
  for (int i = 0; i < p_height; ++i) {
    for (int j = 0; j < p_width; ++j) {
      sum_error +=
          (int64_t)error_measure(dst[j + i * p_stride] - ref[j + i * stride]);
    }
  }
  return sum_error;
}

static int64_t segmented_frame_error(const uint8_t *const ref, int stride,
                                     const uint8_t *const dst, int p_width,
                                     int p_height, int p_stride,
                                     uint8_t *segment_map,
                                     int segment_map_stride) {
  int patch_w, patch_h;
  const int error_bsize_w = AOMMIN(p_width, WARP_ERROR_BLOCK);
  const int error_bsize_h = AOMMIN(p_height, WARP_ERROR_BLOCK);
  int64_t sum_error = 0;
  for (int i = 0; i < p_height; i += WARP_ERROR_BLOCK) {
    for (int j = 0; j < p_width; j += WARP_ERROR_BLOCK) {
      int seg_x = j >> WARP_ERROR_BLOCK_LOG;
      int seg_y = i >> WARP_ERROR_BLOCK_LOG;
      // Only compute the error if this block contains inliers from the motion
      // model
      if (!segment_map[seg_y * segment_map_stride + seg_x]) continue;

      // Avoid computing error into the frame padding
      patch_w = AOMMIN(error_bsize_w, p_width - j);
      patch_h = AOMMIN(error_bsize_h, p_height - i);
      sum_error += av1_calc_frame_error(ref + j + i * stride, stride,
                                        dst + j + i * p_stride, patch_w,
                                        patch_h, p_stride);
    }
  }
  return sum_error;
}

int64_t av1_frame_error(int use_hbd, int bd, const uint8_t *ref, int stride,
                        uint8_t *dst, int p_width, int p_height, int p_stride) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd) {
    return av1_calc_highbd_frame_error(CONVERT_TO_SHORTPTR(ref), stride,
                                       CONVERT_TO_SHORTPTR(dst), p_width,
                                       p_height, p_stride, bd);
  }
#endif
  (void)use_hbd;
  (void)bd;
  return av1_calc_frame_error(ref, stride, dst, p_width, p_height, p_stride);
}

int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref,
                                  int stride, uint8_t *dst, int p_width,
                                  int p_height, int p_stride,
                                  uint8_t *segment_map,
                                  int segment_map_stride) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd) {
    return highbd_segmented_frame_error(
        CONVERT_TO_SHORTPTR(ref), stride, CONVERT_TO_SHORTPTR(dst), p_width,
        p_height, p_stride, bd, segment_map, segment_map_stride);
  }
#endif
  (void)use_hbd;
  (void)bd;
  return segmented_frame_error(ref, stride, dst, p_width, p_height, p_stride,
                               segment_map, segment_map_stride);
}

void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
                    const uint8_t *ref, int width, int height, int stride,
                    uint8_t *pred, int p_col, int p_row, int p_width,
                    int p_height, int p_stride, int subsampling_x,
                    int subsampling_y, ConvolveParams *conv_params) {
#if CONFIG_AV1_HIGHBITDEPTH
  if (use_hbd)
    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
                      p_height, p_stride, subsampling_x, subsampling_y, bd,
                      conv_params);
  else
    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
#else
  (void)use_hbd;
  (void)bd;
  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
#endif
}

#define LS_MV_MAX 256  // max mv in 1/8-pel
// Use LS_STEP = 8 so that 2 fewer bits are needed for A, Bx, By.
#define LS_STEP 8

// Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
// the precision needed is:
//   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
//   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
//   1 [for sign] +
//   LEAST_SQUARES_SAMPLES_MAX_BITS
//      [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
// With MAX_SB_SIZE_LOG2 == 7 and LEAST_SQUARES_SAMPLES_MAX_BITS == 3, the
// macro below evaluates to 25.
#define LS_MAT_RANGE_BITS \
  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)

// Bit-depth reduction from the full-range
#define LS_MAT_DOWN_BITS 2

// Bit range of A, Bx and By after downshifting
#define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
#define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
#define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)

// By setting LS_STEP = 8, the lowest 2 bits of every element in A, Bx, By
// are 0, so we can shift out those 2 bits together with LS_MAT_DOWN_BITS
// here.
#define LS_SQUARE(a)                                          \
  (((a) * (a)*4 + (a)*4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT1(a, b)                                           \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
   (2 + LS_MAT_DOWN_BITS))
#define LS_PRODUCT2(a, b)                                               \
  (((a) * (b)*4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
   (2 + LS_MAT_DOWN_BITS))
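
// A note on the macros above (a derivation, not from the original comments):
// expanding the LS_PRODUCT1 numerator gives
//   (a)*(b)*4 + ((a)+(b))*2*LS_STEP + LS_STEP*LS_STEP
//       == (2*(a) + LS_STEP) * (2*(b) + LS_STEP),
// i.e. the product of both coordinates offset by half an LS_STEP, scaled by
// 4. LS_SQUARE and LS_PRODUCT2 are the same product (with b == a for
// LS_SQUARE) plus one extra LS_STEP * LS_STEP as a rounding term. The shift
// by (2 + LS_MAT_DOWN_BITS) then removes the factor of 4 and applies the
// LS_MAT_DOWN_BITS range reduction.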

#define USE_LIMITED_PREC_MULT 0

#if USE_LIMITED_PREC_MULT

#define MUL_PREC_BITS 16
static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
  int msb = 0;
  uint16_t mult = 0;
  *shift = 0;
  if (D != 0) {
    msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
                              : get_msb((unsigned int)D));
    if (msb >= MUL_PREC_BITS) {
      mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
      *shift = msb + 1 - MUL_PREC_BITS;
    } else {
      mult = (uint16_t)D;
      *shift = 0;
    }
  }
  return mult;
}

static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
  int16_t mshift;
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
  shift -= mshift;
  if (shift > 0) {
    return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  } else {
    return (int32_t)clamp(v * (1 << (-shift)),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  }
}

static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  int16_t mshift;
  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
  shift -= mshift;
  if (shift > 0) {
    return (int32_t)clamp(
        ROUND_POWER_OF_TWO_SIGNED(v, shift),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  } else {
    return (int32_t)clamp(
        v * (1 << (-shift)),
        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
  }
}

#else

static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
  int64_t v = Px * (int64_t)iDet;
  return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
}

static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
  int64_t v = Px * (int64_t)iDet;
  return (int32_t)clamp64(
      ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
}
#endif  // USE_LIMITED_PREC_MULT

static int find_affine_int(int np, const int *pts1, const int *pts2,
                           BLOCK_SIZE bsize, int mvy, int mvx,
                           WarpedMotionParams *wm, int mi_row, int mi_col) {
  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
  int32_t Bx[2] = { 0, 0 };
  int32_t By[2] = { 0, 0 };

  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  const int rsuy = bh / 2 - 1;
  const int rsux = bw / 2 - 1;
  const int suy = rsuy * 8;
  const int sux = rsux * 8;
  const int duy = suy + mvy;
  const int dux = sux + mvx;

  // Assume the center pixel of the block has exactly the same motion vector
  // as transmitted for the block. First shift the origin of the source
  // points to the block center, and the origin of the destination points to
  // the block center added to the motion vector transmitted.
  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  // points after origin shifting, for i = 0, 1, 2, ..., n-1.
  // Then if P = [x0, y0,
  //              x1, y1,
  //              x2, y2,
  //              ....
  //             ]
  //      q = [x0', x1', x2', ... ]'
  //      r = [y0', y1', y2', ... ]'
  // the least squares problems that need to be solved are:
  //      [h1, h2]' = inv(P'P)P'q and
  //      [h3, h4]' = inv(P'P)P'r
  // where the affine transformation is given by:
  //      x' = h1.x + h2.y
  //      y' = h3.x + h4.y
  //
  // The loop below computes: A = P'P, Bx = P'q, By = P'r
  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
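  // For the 2x2 symmetric A = [ A00 A01; A01 A11 ] this has the closed form
  //   inv(A) = 1/Det * [ A11 -A01; -A01 A00 ],  Det = A00*A11 - A01*A01,
  // which is what the Det, Px[] and Py[] computations further below
  // implement, with the division by Det deferred to resolve_divisor_64().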
  // Contribution from neighbor block
  for (int i = 0; i < np; i++) {
    const int dx = pts2[i * 2] - dux;
    const int dy = pts2[i * 2 + 1] - duy;
    const int sx = pts1[i * 2] - sux;
    const int sy = pts1[i * 2 + 1] - suy;
    // TODO(yunqing): This comparison wouldn't be necessary if the sample
    // selection is done in find_samples(). Also, the global offset can be
    // removed while collecting samples.
    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
      A[0][0] += LS_SQUARE(sx);
      A[0][1] += LS_PRODUCT1(sx, sy);
      A[1][1] += LS_SQUARE(sy);
      Bx[0] += LS_PRODUCT2(sx, dx);
      Bx[1] += LS_PRODUCT1(sy, dx);
      By[0] += LS_PRODUCT1(sx, dy);
      By[1] += LS_PRODUCT2(sy, dy);
    }
  }

  // Just for debugging, and can be removed later.
  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);

  // Compute Determinant of A
  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  if (Det == 0) return 1;

  int16_t shift;
  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  shift -= WARPEDMODEL_PREC_BITS;
  if (shift < 0) {
    iDet <<= (-shift);
    shift = 0;
  }

  int64_t Px[2], Py[2];
  // These, divided by the Det, are the least squares solutions
  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];

  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);

  const int isuy = (mi_row * MI_SIZE + rsuy);
  const int isux = (mi_col * MI_SIZE + rsux);
  // Note: In the vx, vy expressions below, the max value of each of the
  // 2nd and 3rd terms is (2^16 - 1) * (2^13 - 1). That leaves enough room
  // for the first term so that the overall sum in the worst case fits
  // within 32 bits overall.
  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
                      isuy * wm->wmmat[3]);
  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * wm->wmmat[4] +
                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  wm->wmmat[0] =
      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  wm->wmmat[1] =
      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  return 0;
}

int av1_find_projection(int np, const int *pts1, const int *pts2,
                        BLOCK_SIZE bsize, int mvy, int mvx,
                        WarpedMotionParams *wm_params, int mi_row, int mi_col) {
  assert(wm_params->wmtype == AFFINE);

  if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
                      mi_col))
    return 1;

  // Check compatibility with the fast warp filter
  if (!av1_get_shear_params(wm_params)) return 1;

  return 0;
}
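
/* Typical usage (a sketch, not a prescribed API sequence; the np/pts1/pts2
   names below are placeholders): an encoder collects up to
   LEAST_SQUARES_SAMPLES_MAX candidate correspondences for a block, then

     WarpedMotionParams wm = default_warp_params;
     wm.wmtype = AFFINE;
     if (!av1_find_projection(np, pts1, pts2, bsize, mv.row, mv.col, &wm,
                              mi_row, mi_col)) {
       // wm now holds a model accepted by the shear checks and can be fed
       // to av1_warp_plane() / av1_warp_affine().
     }

   A non-zero return means either a singular least-squares system or a model
   rejected by av1_get_shear_params(). */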