/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
   * Each coefficient is stored in 8 bits instead of 16 bits
   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7

   This is done in order to avoid overflow: since the tap with the largest
   coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
   order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
   convolve functions.

   Instead, we use the summation order
   ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
   The rearrangement of coefficients in this table is so that we can get the
   coefficients into the correct order more quickly.
*/
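/* Note on the pairing: with the column order above, the horizontal kernel in
   filter_src_pixels() pairs taps (0, 2), (4, 6), (1, 3) and (5, 7) inside
   _mm_maddubs_epi16 and then accumulates (0 + 2) + (4 + 6) and
   (1 + 3) + (5 + 7) in 16-bit lanes. For every row of this table the positive
   coefficients within each of those two groups sum to at most 128, so the
   worst case is 255 * 128 = 32640, which still fits in an int16. With the
   regular order, a group such as (0 + 1) + (4 + 5) can combine the ~127 tap
   with another positive tap (e.g. taps 4 and 5 equal to 127 and 4), giving
   255 * 131 = 33405 and overflowing an int16. */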
/* clang-format off */
DECLARE_ALIGNED(8, const int8_t,
                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
#if WARPEDPIXEL_PREC_BITS == 6
  // [-1, 0)
  { 0, 127, 0, 0, 0, 1, 0, 0}, { 0, 127, 0, 0, -1, 2, 0, 0},
  { 1, 127, -1, 0, -3, 4, 0, 0}, { 1, 126, -2, 0, -4, 6, 1, 0},
  { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 125, -4, 0, -6, 11, 1, 0},
  { 1, 124, -4, 0, -7, 13, 1, 0}, { 2, 123, -5, 0, -8, 15, 1, 0},
  { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 121, -6, 0, -10, 20, 1, 0},
  { 2, 120, -7, 0, -11, 22, 2, 0}, { 2, 119, -8, 0, -12, 25, 2, 0},
  { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 116, -9, 0, -13, 29, 2, 0},
  { 3, 114, -10, 0, -14, 32, 3, 0}, { 3, 113, -10, 0, -15, 35, 2, 0},
  { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 109, -11, 0, -16, 40, 3, 0},
  { 3, 108, -12, 0, -16, 42, 3, 0}, { 4, 106, -13, 0, -17, 45, 3, 0},
  { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 102, -14, 0, -17, 50, 3, 0},
  { 4, 100, -14, 0, -17, 52, 3, 0}, { 4, 98, -15, 0, -18, 55, 4, 0},
  { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 94, -16, 0, -18, 60, 4, 0},
  { 4, 91, -16, 0, -18, 63, 4, 0}, { 4, 89, -16, 0, -18, 65, 4, 0},
  { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 85, -17, 0, -18, 70, 4, 0},
  { 4, 82, -17, 0, -18, 73, 4, 0}, { 4, 80, -17, 0, -18, 75, 4, 0},
  { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 75, -18, 0, -17, 80, 4, 0},
  { 4, 73, -18, 0, -17, 82, 4, 0}, { 4, 70, -18, 0, -17, 85, 4, 0},
  { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 65, -18, 0, -16, 89, 4, 0},
  { 4, 63, -18, 0, -16, 91, 4, 0}, { 4, 60, -18, 0, -16, 94, 4, 0},
  { 3, 58, -18, 0, -15, 96, 4, 0}, { 4, 55, -18, 0, -15, 98, 4, 0},
  { 3, 52, -17, 0, -14, 100, 4, 0}, { 3, 50, -17, 0, -14, 102, 4, 0},
  { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 45, -17, 0, -13, 106, 4, 0},
  { 3, 42, -16, 0, -12, 108, 3, 0}, { 3, 40, -16, 0, -11, 109, 3, 0},
  { 3, 37, -15, 0, -11, 111, 3, 0}, { 2, 35, -15, 0, -10, 113, 3, 0},
  { 3, 32, -14, 0, -10, 114, 3, 0}, { 2, 29, -13, 0, -9, 116, 3, 0},
  { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 25, -12, 0, -8, 119, 2, 0},
  { 2, 22, -11, 0, -7, 120, 2, 0}, { 1, 20, -10, 0, -6, 121, 2, 0},
  { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 15, -8, 0, -5, 123, 2, 0},
  { 1, 13, -7, 0, -4, 124, 1, 0}, { 1, 11, -6, 0, -4, 125, 1, 0},
  { 1, 8, -5, 0, -3, 126, 1, 0}, { 1, 6, -4, 0, -2, 126, 1, 0},
  { 0, 4, -3, 0, -1, 127, 1, 0}, { 0, 2, -1, 0, 0, 127, 0, 0},
  // [0, 1)
  { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -1, 2, 0, 0, 127, 0, 0},
  { 0, -3, 4, 1, 1, 127, -2, 0}, { 0, -5, 6, 1, 1, 127, -2, 0},
  { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -7, 11, 2, 2, 126, -4, -1},
  {-1, -8, 13, 2, 3, 125, -5, -1}, {-1, -10, 16, 3, 3, 124, -6, -1},
  {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -12, 20, 3, 4, 122, -7, -1},
  {-1, -13, 23, 3, 4, 121, -8, -1}, {-2, -14, 25, 4, 5, 120, -9, -1},
  {-1, -15, 27, 4, 5, 119, -10, -1}, {-1, -16, 30, 4, 5, 118, -11, -1},
  {-2, -17, 33, 5, 6, 116, -12, -1}, {-2, -17, 35, 5, 6, 114, -12, -1},
  {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 41, 6, 7, 111, -14, -2},
  {-2, -19, 43, 6, 7, 110, -15, -2}, {-2, -20, 46, 6, 7, 108, -15, -2},
  {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 51, 7, 7, 104, -16, -2},
  {-2, -21, 54, 7, 7, 102, -17, -2}, {-2, -21, 56, 7, 8, 100, -18, -2},
  {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 62, 7, 8, 96, -19, -2},
  {-2, -22, 64, 7, 8, 94, -19, -2}, {-2, -22, 67, 8, 8, 91, -20, -2},
  {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -22, 72, 8, 8, 87, -21, -2},
  {-2, -21, 74, 8, 8, 84, -21, -2}, {-2, -22, 77, 8, 8, 82, -21, -2},
  {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 82, 8, 8, 77, -22, -2},
  {-2, -21, 84, 8, 8, 74, -21, -2}, {-2, -21, 87, 8, 8, 72, -22, -2},
  {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -20, 91, 8, 8, 67, -22, -2},
  {-2, -19, 94, 8, 7, 64, -22, -2}, {-2, -19, 96, 8, 7, 62, -22, -2},
  {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -18, 100, 8, 7, 56, -21, -2},
  {-2, -17, 102, 7, 7, 54, -21, -2}, {-2, -16, 104, 7, 7, 51, -21, -2},
  {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 108, 7, 6, 46, -20, -2},
  {-2, -15, 110, 7, 6, 43, -19, -2}, {-2, -14, 111, 7, 6, 41, -19, -2},
  {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 114, 6, 5, 35, -17, -2},
  {-1, -12, 116, 6, 5, 33, -17, -2}, {-1, -11, 118, 5, 4, 30, -16, -1},
  {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -9, 120, 5, 4, 25, -14, -2},
  {-1, -8, 121, 4, 3, 23, -13, -1}, {-1, -7, 122, 4, 3, 20, -12, -1},
  {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -6, 124, 3, 3, 16, -10, -1},
  {-1, -5, 125, 3, 2, 13, -8, -1}, {-1, -4, 126, 2, 2, 11, -7, -1},
  { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 6, -5, 0},
  { 0, -2, 127, 1, 1, 4, -3, 0}, { 0, 0, 127, 0, 0, 2, -1, 0},
  // [1, 2)
  { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 0, 127, 0, 0, -1, 2, 0},
  { 0, 1, 127, -1, 0, -3, 4, 0}, { 0, 1, 126, -2, 0, -4, 6, 1},
  { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 125, -4, 0, -6, 11, 1},
  { 0, 1, 124, -4, 0, -7, 13, 1}, { 0, 2, 123, -5, 0, -8, 15, 1},
  { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 121, -6, 0, -10, 20, 1},
  { 0, 2, 120, -7, 0, -11, 22, 2}, { 0, 2, 119, -8, 0, -12, 25, 2},
  { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 116, -9, 0, -13, 29, 2},
  { 0, 3, 114, -10, 0, -14, 32, 3}, { 0, 3, 113, -10, 0, -15, 35, 2},
  { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 109, -11, 0, -16, 40, 3},
  { 0, 3, 108, -12, 0, -16, 42, 3}, { 0, 4, 106, -13, 0, -17, 45, 3},
  { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 102, -14, 0, -17, 50, 3},
  { 0, 4, 100, -14, 0, -17, 52, 3}, { 0, 4, 98, -15, 0, -18, 55, 4},
  { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 94, -16, 0, -18, 60, 4},
  { 0, 4, 91, -16, 0, -18, 63, 4}, { 0, 4, 89, -16, 0, -18, 65, 4},
  { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 85, -17, 0, -18, 70, 4},
  { 0, 4, 82, -17, 0, -18, 73, 4}, { 0, 4, 80, -17, 0, -18, 75, 4},
  { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 75, -18, 0, -17, 80, 4},
  { 0, 4, 73, -18, 0, -17, 82, 4}, { 0, 4, 70, -18, 0, -17, 85, 4},
  { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 65, -18, 0, -16, 89, 4},
  { 0, 4, 63, -18, 0, -16, 91, 4}, { 0, 4, 60, -18, 0, -16, 94, 4},
  { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 4, 55, -18, 0, -15, 98, 4},
  { 0, 3, 52, -17, 0, -14, 100, 4}, { 0, 3, 50, -17, 0, -14, 102, 4},
  { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 45, -17, 0, -13, 106, 4},
  { 0, 3, 42, -16, 0, -12, 108, 3}, { 0, 3, 40, -16, 0, -11, 109, 3},
  { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 2, 35, -15, 0, -10, 113, 3},
  { 0, 3, 32, -14, 0, -10, 114, 3}, { 0, 2, 29, -13, 0, -9, 116, 3},
  { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 25, -12, 0, -8, 119, 2},
  { 0, 2, 22, -11, 0, -7, 120, 2}, { 0, 1, 20, -10, 0, -6, 121, 2},
  { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 15, -8, 0, -5, 123, 2},
  { 0, 1, 13, -7, 0, -4, 124, 1}, { 0, 1, 11, -6, 0, -4, 125, 1},
  { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 1, 6, -4, 0, -2, 126, 1},
  { 0, 0, 4, -3, 0, -1, 127, 1}, { 0, 0, 2, -1, 0, 0, 127, 0},
  // dummy (replicate row index 191)
  { 0, 0, 2, -1, 0, 0, 127, 0},

#else
  // [-1, 0)
  { 0, 127, 0, 0, 0, 1, 0, 0}, { 1, 127, -1, 0, -3, 4, 0, 0},
  { 1, 126, -3, 0, -5, 8, 1, 0}, { 1, 124, -4, 0, -7, 13, 1, 0},
  { 2, 122, -6, 0, -9, 18, 1, 0}, { 2, 120, -7, 0, -11, 22, 2, 0},
  { 3, 117, -8, 0, -13, 27, 2, 0}, { 3, 114, -10, 0, -14, 32, 3, 0},
  { 3, 111, -11, 0, -15, 37, 3, 0}, { 3, 108, -12, 0, -16, 42, 3, 0},
  { 4, 104, -13, 0, -17, 47, 3, 0}, { 4, 100, -14, 0, -17, 52, 3, 0},
  { 4, 96, -15, 0, -18, 58, 3, 0}, { 4, 91, -16, 0, -18, 63, 4, 0},
  { 4, 87, -17, 0, -18, 68, 4, 0}, { 4, 82, -17, 0, -18, 73, 4, 0},
  { 4, 78, -18, 0, -18, 78, 4, 0}, { 4, 73, -18, 0, -17, 82, 4, 0},
  { 4, 68, -18, 0, -17, 87, 4, 0}, { 4, 63, -18, 0, -16, 91, 4, 0},
  { 3, 58, -18, 0, -15, 96, 4, 0}, { 3, 52, -17, 0, -14, 100, 4, 0},
  { 3, 47, -17, 0, -13, 104, 4, 0}, { 3, 42, -16, 0, -12, 108, 3, 0},
  { 3, 37, -15, 0, -11, 111, 3, 0}, { 3, 32, -14, 0, -10, 114, 3, 0},
  { 2, 27, -13, 0, -8, 117, 3, 0}, { 2, 22, -11, 0, -7, 120, 2, 0},
  { 1, 18, -9, 0, -6, 122, 2, 0}, { 1, 13, -7, 0, -4, 124, 1, 0},
  { 1, 8, -5, 0, -3, 126, 1, 0}, { 0, 4, -3, 0, -1, 127, 1, 0},
  // [0, 1)
  { 0, 0, 1, 0, 0, 127, 0, 0}, { 0, -3, 4, 1, 1, 127, -2, 0},
  { 0, -6, 8, 1, 2, 126, -3, 0}, {-1, -8, 13, 2, 3, 125, -5, -1},
  {-1, -11, 18, 3, 4, 123, -7, -1}, {-1, -13, 23, 3, 4, 121, -8, -1},
  {-1, -15, 27, 4, 5, 119, -10, -1}, {-2, -17, 33, 5, 6, 116, -12, -1},
  {-2, -18, 38, 5, 6, 113, -13, -1}, {-2, -19, 43, 6, 7, 110, -15, -2},
  {-2, -20, 49, 6, 7, 106, -16, -2}, {-2, -21, 54, 7, 7, 102, -17, -2},
  {-2, -22, 59, 7, 8, 98, -18, -2}, {-2, -22, 64, 7, 8, 94, -19, -2},
  {-2, -22, 69, 8, 8, 89, -20, -2}, {-2, -21, 74, 8, 8, 84, -21, -2},
  {-2, -21, 79, 8, 8, 79, -21, -2}, {-2, -21, 84, 8, 8, 74, -21, -2},
  {-2, -20, 89, 8, 8, 69, -22, -2}, {-2, -19, 94, 8, 7, 64, -22, -2},
  {-2, -18, 98, 8, 7, 59, -22, -2}, {-2, -17, 102, 7, 7, 54, -21, -2},
  {-2, -16, 106, 7, 6, 49, -20, -2}, {-2, -15, 110, 7, 6, 43, -19, -2},
  {-1, -13, 113, 6, 5, 38, -18, -2}, {-1, -12, 116, 6, 5, 33, -17, -2},
  {-1, -10, 119, 5, 4, 27, -15, -1}, {-1, -8, 121, 4, 3, 23, -13, -1},
  {-1, -7, 123, 4, 3, 18, -11, -1}, {-1, -5, 125, 3, 2, 13, -8, -1},
  { 0, -3, 126, 2, 1, 8, -6, 0}, { 0, -2, 127, 1, 1, 4, -3, 0},
  // [1, 2)
  { 0, 0, 127, 0, 0, 1, 0, 0}, { 0, 1, 127, -1, 0, -3, 4, 0},
  { 0, 1, 126, -3, 0, -5, 8, 1}, { 0, 1, 124, -4, 0, -7, 13, 1},
  { 0, 2, 122, -6, 0, -9, 18, 1}, { 0, 2, 120, -7, 0, -11, 22, 2},
  { 0, 3, 117, -8, 0, -13, 27, 2}, { 0, 3, 114, -10, 0, -14, 32, 3},
  { 0, 3, 111, -11, 0, -15, 37, 3}, { 0, 3, 108, -12, 0, -16, 42, 3},
  { 0, 4, 104, -13, 0, -17, 47, 3}, { 0, 4, 100, -14, 0, -17, 52, 3},
  { 0, 4, 96, -15, 0, -18, 58, 3}, { 0, 4, 91, -16, 0, -18, 63, 4},
  { 0, 4, 87, -17, 0, -18, 68, 4}, { 0, 4, 82, -17, 0, -18, 73, 4},
  { 0, 4, 78, -18, 0, -18, 78, 4}, { 0, 4, 73, -18, 0, -17, 82, 4},
  { 0, 4, 68, -18, 0, -17, 87, 4}, { 0, 4, 63, -18, 0, -16, 91, 4},
  { 0, 3, 58, -18, 0, -15, 96, 4}, { 0, 3, 52, -17, 0, -14, 100, 4},
  { 0, 3, 47, -17, 0, -13, 104, 4}, { 0, 3, 42, -16, 0, -12, 108, 3},
  { 0, 3, 37, -15, 0, -11, 111, 3}, { 0, 3, 32, -14, 0, -10, 114, 3},
  { 0, 2, 27, -13, 0, -8, 117, 3}, { 0, 2, 22, -11, 0, -7, 120, 2},
  { 0, 1, 18, -9, 0, -6, 122, 2}, { 0, 1, 13, -7, 0, -4, 124, 1},
  { 0, 1, 8, -5, 0, -3, 126, 1}, { 0, 0, 4, -3, 0, -1, 127, 1},
  // dummy (replicate row index 95)
  { 0, 0, 4, -3, 0, -1, 127, 1},
#endif  // WARPEDPIXEL_PREC_BITS == 6
};
/* clang-format on */

// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
// in an SSE register into two sequences:
// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
DECLARE_ALIGNED(16, static const uint8_t,
                even_mask[16]) = { 0, 2, 2, 4, 4, 6, 6, 8,
                                   8, 10, 10, 12, 12, 14, 14, 0 };

DECLARE_ALIGNED(16, static const uint8_t,
                odd_mask[16]) = { 1, 3, 3, 5, 5, 7, 7, 9,
                                  9, 11, 11, 13, 13, 15, 15, 0 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
                                               0, 1, 0, 1, 0, 1, 0, 1 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
                                               2, 3, 2, 3, 2, 3, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
                                               4, 5, 4, 5, 4, 5, 4, 5 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
                                               6, 7, 6, 7, 6, 7, 6, 7 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
                                              0, 1, 2, 3, 0, 1, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
                                              4, 5, 6, 7, 4, 5, 6, 7 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
                                              8, 9, 10, 11, 8, 9, 10, 11 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
                                              12, 13, 14, 15, 12, 13, 14, 15 };

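// Apply the horizontal 8-tap filter to one row of source pixels held in
// 'src', using the 8-bit coefficient registers built by the
// prepare_horizontal_filter_coeff*() helpers below. The eight filtered,
// offset and right-shifted 16-bit results are written to tmp[k + 7] in the
// column order 0 2 4 6 1 3 5 7, which is the order the vertical pass expects.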
static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz, int k) {
  const __m128i src_even =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
  const __m128i src_odd =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
  // The pixel order we need for 'src' is:
  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
                                            _mm_srli_si128(src_odd, 4));
  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
  const __m128i src_13 =
      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
                                            _mm_srli_si128(src_even, 6));
  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);

  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  // Note: The values res_02 + res_46 and res_13 + res_57 both
  // fit into int16s at this point, but their sum may be too wide to fit
  // into an int16. However, once we also add round_const, the sum of
  // all of these fits into a uint16.
  //
  // The wrapping behaviour of _mm_add_* is used here to make sure we
  // get the correct result despite converting between different
  // (implicit) types.
  const __m128i res_even = _mm_add_epi16(res_02, res_46);
  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
  const __m128i res =
      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
}

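// Build the four 8-bit coefficient registers for one filtered row: the
// filter index advances by 'alpha' per output pixel, and the eight loaded
// filters are transposed so that coeff[0..3] hold the tap pairs (0,2), (4,6),
// (1,3) and (5,7) for the pixels in the order 0 2 4 6 1 3 5 7.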
static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
                                                   __m128i *coeff) {
  // Load the 8-bit filter (already in the rearranged column order) for each
  // of the eight output pixels.
  const __m128i tmp_0 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_1 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_2 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_3 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_4 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_5 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_6 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_7 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);

  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

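// alpha == 0 means every output pixel in the row uses the same filter, so a
// single 8-byte load is enough; the tap pairs are then broadcast into the
// four coefficient registers with the shuffle_alpha0_* masks.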
static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
                                                          __m128i *coeff) {
  // Load the single filter row shared by all eight output pixels.
  const __m128i tmp_0 =
      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
}

static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
                                     int alpha, int k,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz) {
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
}

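// Horizontally filter the (up to) 15 source rows k = -7 .. 7 needed by the
// vertical pass, clamping the source row index to the frame and advancing
// the filter position by 'beta' per row. The *_alpha0 / *_beta0 variants
// below simplify or hoist the coefficient setup when the corresponding
// parameter is zero.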
static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
                                          int stride, int32_t ix4, int32_t iy4,
                                          int32_t sx4, int alpha, int beta,
                                          int p_height, int height, int i,
                                          const int offset_bits_horiz,
                                          const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                      reduce_bits_horiz);
  }
}

static INLINE void warp_horizontal_filter_alpha0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));

    __m128i coeff[4];
    prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static INLINE void warp_horizontal_filter_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static INLINE void warp_horizontal_filter_alpha0_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[4];
  prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

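// Set up the constants used when averaging two compound-prediction sources:
// the distance weights interleaved for _mm_madd_epi16, the offset removed
// after averaging, and the rounding constant for the final shift.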
static INLINE void unpack_weights_and_set_round_const(
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
  *res_sub_const =
      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi16((int16_t)w0);
  const __m128i wt1 = _mm_set1_epi16((int16_t)w1);
  *wt = _mm_unpacklo_epi16(wt0, wt1);
}

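// Load the 16-bit vertical filter for each of the eight output columns (the
// filter index advances by 'gamma' per column) and transpose them so that
// coeffs[0..3] hold the tap pairs (0,1), (2,3), (4,5), (6,7) for the even
// columns 0 2 4 6 and coeffs[4..7] hold the same pairs for the odd columns
// 1 3 5 7.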
static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
                                                  __m128i *coeffs) {
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // even coeffs
  coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  // odd coeffs
  coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
                                                         __m128i *coeffs) {
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));

  // even coeffs
  coeffs[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
  coeffs[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
  coeffs[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
  coeffs[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));

  // odd coeffs
  coeffs[4] = coeffs[0];
  coeffs[5] = coeffs[1];
  coeffs[6] = coeffs[2];
  coeffs[7] = coeffs[3];
}

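// Apply the vertical 8-tap filter at output row k: combine the eight
// horizontally filtered rows tmp[k + 4] .. tmp[k + 11] with the per-column
// coefficients, returning 32-bit sums for output columns 0..3 in *res_lo and
// columns 4..7 in *res_hi.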
static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
                                              __m128i *res_lo, __m128i *res_hi,
                                              int k) {
  // Load from tmp and rearrange pairs of consecutive rows into the
  // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
  const __m128i *src = tmp + (k + 4);
  const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
  const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
  const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
  const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

  const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
  const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
  const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
  const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);

  const __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));

  // Filter odd-index pixels
  const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
  const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
  const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
  const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

  const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
  const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
  const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
  const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);

  const __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));

  // Rearrange pixels back into the order 0 ... 7
  *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
  *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
}

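// Round and store one (up to) 8-pixel output row. For compound prediction
// the 16-bit intermediate is written to conv_params->dst, or, when
// averaging, blended with the value already there and written to 'pred' as
// 8 bits; otherwise the result is rounded, packed to 8 bits and written to
// 'pred' directly.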
static INLINE void store_vertical_filter_output(
    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
    const int reduce_bits_vert, int p_stride, int p_width,
    const int round_bits) {
  __m128i res_lo_1 = *res_lo;
  __m128i res_hi_1 = *res_hi;

  if (conv_params->is_compound) {
    __m128i *const p =
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
                              reduce_bits_vert);
    const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
    __m128i res_lo_16;
    if (conv_params->do_average) {
      __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
      const __m128i p_16 = _mm_loadl_epi64(p);

      if (conv_params->use_dist_wtd_comp_avg) {
        const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
        const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
        const __m128i shifted_32 =
            _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
        res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
      } else {
        res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
      }

      res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);

      res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
                                 round_bits);
      __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
      *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
    } else {
      _mm_storel_epi64(p, temp_lo_16);
    }
    if (p_width > 4) {
      __m128i *const p4 =
          (__m128i *)&conv_params
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
      res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
                                reduce_bits_vert);
      const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
      __m128i res_hi_16;

      if (conv_params->do_average) {
        __m128i *const dst8_4 =
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
        const __m128i p4_16 = _mm_loadl_epi64(p4);

        if (conv_params->use_dist_wtd_comp_avg) {
          const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
          const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
          const __m128i shifted_32 =
              _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
          res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
        } else {
          res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
        }
        res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);

        res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
                                   round_bits);
        __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
        *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);

      } else {
        _mm_storel_epi64(p4, temp_hi_16);
      }
    }
  } else {
    const __m128i res_lo_round = _mm_srai_epi32(
        _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
    const __m128i res_hi_round = _mm_srai_epi32(
        _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);

    const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
    __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

    // Store, blending with 'pred' if needed
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

    // Note: If we're outputting a 4x4 block, we need to be very careful
    // to only output 4 pixels at this point, to avoid encode/decode
    // mismatches when encoding with multiple threads.
    if (p_width == 4) {
      *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
    } else {
      _mm_storel_epi64(p, res_8bit);
    }
  }
}

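// Generic vertical pass: for each of the (up to) eight output rows, build
// the per-column coefficients (the filter position advances by 'delta' per
// row) and filter the 8-row window from 'tmp'. The *_gamma0 / *_delta0
// variants below simplify or hoist the coefficient setup when the
// corresponding parameter is zero.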
static INLINE void warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs(gamma, sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void warp_vertical_filter_gamma0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  (void)gamma;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void warp_vertical_filter_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void warp_vertical_filter_gamma0_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  (void)gamma;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

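// Dispatch to the vertical-filter specialization matching which of
// gamma / delta are zero.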
static INLINE void prepare_warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  if (gamma == 0 && delta == 0)
    warp_vertical_filter_gamma0_delta0(
        pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
        sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
  else if (gamma == 0 && delta != 0)
    warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else if (gamma != 0 && delta == 0)
    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else
    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
                         res_add_const, round_bits, offset_bits);
}

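// Dispatch to the horizontal-filter specialization matching which of
// alpha / beta are zero.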
static INLINE void prepare_warp_horizontal_filter(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else if (alpha == 0 && beta != 0)
    warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
  else if (alpha != 0 && beta == 0)
    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                 p_height, height, i, offset_bits_horiz,
                                 reduce_bits_horiz);
  else
    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                           p_height, height, i, offset_bits_horiz,
                           reduce_bits_horiz);
}

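// 8-bit warp filter. The output is processed in 8x8 blocks: for each block
// the warped source position is derived from the affine model 'mat', 15
// source rows are horizontally filtered into 'tmp', and the vertical pass
// then produces up to 8 output rows. Rows whose clamped source range lies
// entirely in the left/right border are synthesized from the edge pixel, and
// partially out-of-range rows are padded via the warp_pad_left/right
// shuffles.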
void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
                            int height, int stride, uint8_t *pred, int p_col,
                            int p_row, int p_width, int p_height, int p_stride,
                            int subsampling_x, int subsampling_y,
                            ConvolveParams *conv_params, int16_t alpha,
                            int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/
  __m128i res_add_const_1;
  if (conv_params->is_compound == 1) {
    res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
  } else {
    res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                                     ((1 << reduce_bits_vert) >> 1));
  }

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src = _mm_shuffle_epi8(src, shuffle_reg_left);
          }
          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src = _mm_shuffle_epi8(src, shuffle_reg_right);
          }
          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                            reduce_bits_horiz);
        }
      } else {
        prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                       beta, p_height, height, i,
                                       offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      prepare_warp_vertical_filter(
          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
          j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
    }
  }
}