/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>
#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

/* This is a modified version of 'av1_warped_filter' from warped_motion.c:
   * Each coefficient is stored in 8 bits instead of 16 bits
   * The coefficients are rearranged in the column order 0, 2, 4, 6, 1, 3, 5, 7

     This is done in order to avoid overflow: Since the tap with the largest
     coefficient could be any of taps 2, 3, 4 or 5, we can't use the summation
     order ((0 + 1) + (4 + 5)) + ((2 + 3) + (6 + 7)) used in the regular
     convolve functions.

     Instead, we use the summation order
     ((0 + 2) + (4 + 6)) + ((1 + 3) + (5 + 7)).
     The coefficients in this table are pre-rearranged so that they can be
     loaded into the required order more quickly.
*/
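/* For example, the first entry below, { 0, 127, 0, 0, 0, 1, 0, 0}, stores the
   regular 8-tap kernel {0, 0, 127, 1, 0, 0, 0, 0} (taps summing to
   1 << FILTER_BITS = 128) with its taps reordered as
   {c0, c2, c4, c6, c1, c3, c5, c7}. */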
/* clang-format off */
DECLARE_ALIGNED(8, const int8_t,
                av1_filter_8bit[WARPEDPIXEL_PREC_SHIFTS * 3 + 1][8]) = {
#if WARPEDPIXEL_PREC_BITS == 6
  // [-1, 0)
  { 0, 127,   0, 0,   0,   1, 0, 0}, { 0, 127,   0, 0,  -1,   2, 0, 0},
  { 1, 127,  -1, 0,  -3,   4, 0, 0}, { 1, 126,  -2, 0,  -4,   6, 1, 0},
  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 125,  -4, 0,  -6,  11, 1, 0},
  { 1, 124,  -4, 0,  -7,  13, 1, 0}, { 2, 123,  -5, 0,  -8,  15, 1, 0},
  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 121,  -6, 0, -10,  20, 1, 0},
  { 2, 120,  -7, 0, -11,  22, 2, 0}, { 2, 119,  -8, 0, -12,  25, 2, 0},
  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 116,  -9, 0, -13,  29, 2, 0},
  { 3, 114, -10, 0, -14,  32, 3, 0}, { 3, 113, -10, 0, -15,  35, 2, 0},
  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 109, -11, 0, -16,  40, 3, 0},
  { 3, 108, -12, 0, -16,  42, 3, 0}, { 4, 106, -13, 0, -17,  45, 3, 0},
  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 102, -14, 0, -17,  50, 3, 0},
  { 4, 100, -14, 0, -17,  52, 3, 0}, { 4,  98, -15, 0, -18,  55, 4, 0},
  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  94, -16, 0, -18,  60, 4, 0},
  { 4,  91, -16, 0, -18,  63, 4, 0}, { 4,  89, -16, 0, -18,  65, 4, 0},
  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  85, -17, 0, -18,  70, 4, 0},
  { 4,  82, -17, 0, -18,  73, 4, 0}, { 4,  80, -17, 0, -18,  75, 4, 0},
  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  75, -18, 0, -17,  80, 4, 0},
  { 4,  73, -18, 0, -17,  82, 4, 0}, { 4,  70, -18, 0, -17,  85, 4, 0},
  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  65, -18, 0, -16,  89, 4, 0},
  { 4,  63, -18, 0, -16,  91, 4, 0}, { 4,  60, -18, 0, -16,  94, 4, 0},
  { 3,  58, -18, 0, -15,  96, 4, 0}, { 4,  55, -18, 0, -15,  98, 4, 0},
  { 3,  52, -17, 0, -14, 100, 4, 0}, { 3,  50, -17, 0, -14, 102, 4, 0},
  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  45, -17, 0, -13, 106, 4, 0},
  { 3,  42, -16, 0, -12, 108, 3, 0}, { 3,  40, -16, 0, -11, 109, 3, 0},
  { 3,  37, -15, 0, -11, 111, 3, 0}, { 2,  35, -15, 0, -10, 113, 3, 0},
  { 3,  32, -14, 0, -10, 114, 3, 0}, { 2,  29, -13, 0,  -9, 116, 3, 0},
  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  25, -12, 0,  -8, 119, 2, 0},
  { 2,  22, -11, 0,  -7, 120, 2, 0}, { 1,  20, -10, 0,  -6, 121, 2, 0},
  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  15,  -8, 0,  -5, 123, 2, 0},
  { 1,  13,  -7, 0,  -4, 124, 1, 0}, { 1,  11,  -6, 0,  -4, 125, 1, 0},
  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 1,   6,  -4, 0,  -2, 126, 1, 0},
  { 0,   4,  -3, 0,  -1, 127, 1, 0}, { 0,   2,  -1, 0,   0, 127, 0, 0},
  // [0, 1)
  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -1,   2, 0, 0, 127,   0,  0},
  { 0,  -3,   4, 1, 1, 127,  -2,  0}, { 0,  -5,   6, 1, 1, 127,  -2,  0},
  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -7,  11, 2, 2, 126,  -4, -1},
  {-1,  -8,  13, 2, 3, 125,  -5, -1}, {-1, -10,  16, 3, 3, 124,  -6, -1},
  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -12,  20, 3, 4, 122,  -7, -1},
  {-1, -13,  23, 3, 4, 121,  -8, -1}, {-2, -14,  25, 4, 5, 120,  -9, -1},
  {-1, -15,  27, 4, 5, 119, -10, -1}, {-1, -16,  30, 4, 5, 118, -11, -1},
  {-2, -17,  33, 5, 6, 116, -12, -1}, {-2, -17,  35, 5, 6, 114, -12, -1},
  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  41, 6, 7, 111, -14, -2},
  {-2, -19,  43, 6, 7, 110, -15, -2}, {-2, -20,  46, 6, 7, 108, -15, -2},
  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  51, 7, 7, 104, -16, -2},
  {-2, -21,  54, 7, 7, 102, -17, -2}, {-2, -21,  56, 7, 8, 100, -18, -2},
  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  62, 7, 8,  96, -19, -2},
  {-2, -22,  64, 7, 8,  94, -19, -2}, {-2, -22,  67, 8, 8,  91, -20, -2},
  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -22,  72, 8, 8,  87, -21, -2},
  {-2, -21,  74, 8, 8,  84, -21, -2}, {-2, -22,  77, 8, 8,  82, -21, -2},
  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  82, 8, 8,  77, -22, -2},
  {-2, -21,  84, 8, 8,  74, -21, -2}, {-2, -21,  87, 8, 8,  72, -22, -2},
  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -20,  91, 8, 8,  67, -22, -2},
  {-2, -19,  94, 8, 7,  64, -22, -2}, {-2, -19,  96, 8, 7,  62, -22, -2},
  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -18, 100, 8, 7,  56, -21, -2},
  {-2, -17, 102, 7, 7,  54, -21, -2}, {-2, -16, 104, 7, 7,  51, -21, -2},
  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 108, 7, 6,  46, -20, -2},
  {-2, -15, 110, 7, 6,  43, -19, -2}, {-2, -14, 111, 7, 6,  41, -19, -2},
  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 114, 6, 5,  35, -17, -2},
  {-1, -12, 116, 6, 5,  33, -17, -2}, {-1, -11, 118, 5, 4,  30, -16, -1},
  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -9, 120, 5, 4,  25, -14, -2},
  {-1,  -8, 121, 4, 3,  23, -13, -1}, {-1,  -7, 122, 4, 3,  20, -12, -1},
  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -6, 124, 3, 3,  16, -10, -1},
  {-1,  -5, 125, 3, 2,  13,  -8, -1}, {-1,  -4, 126, 2, 2,  11,  -7, -1},
  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   6,  -5,  0},
  { 0,  -2, 127, 1, 1,   4,  -3,  0}, { 0,   0, 127, 0, 0,   2,  -1,  0},
  // [1, 2)
  { 0, 0, 127,   0, 0,   1,   0, 0}, { 0, 0, 127,   0, 0,  -1,   2, 0},
  { 0, 1, 127,  -1, 0,  -3,   4, 0}, { 0, 1, 126,  -2, 0,  -4,   6, 1},
  { 0, 1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 125,  -4, 0,  -6,  11, 1},
  { 0, 1, 124,  -4, 0,  -7,  13, 1}, { 0, 2, 123,  -5, 0,  -8,  15, 1},
  { 0, 2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 121,  -6, 0, -10,  20, 1},
  { 0, 2, 120,  -7, 0, -11,  22, 2}, { 0, 2, 119,  -8, 0, -12,  25, 2},
  { 0, 3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 116,  -9, 0, -13,  29, 2},
  { 0, 3, 114, -10, 0, -14,  32, 3}, { 0, 3, 113, -10, 0, -15,  35, 2},
  { 0, 3, 111, -11, 0, -15,  37, 3}, { 0, 3, 109, -11, 0, -16,  40, 3},
  { 0, 3, 108, -12, 0, -16,  42, 3}, { 0, 4, 106, -13, 0, -17,  45, 3},
  { 0, 4, 104, -13, 0, -17,  47, 3}, { 0, 4, 102, -14, 0, -17,  50, 3},
  { 0, 4, 100, -14, 0, -17,  52, 3}, { 0, 4,  98, -15, 0, -18,  55, 4},
  { 0, 4,  96, -15, 0, -18,  58, 3}, { 0, 4,  94, -16, 0, -18,  60, 4},
  { 0, 4,  91, -16, 0, -18,  63, 4}, { 0, 4,  89, -16, 0, -18,  65, 4},
  { 0, 4,  87, -17, 0, -18,  68, 4}, { 0, 4,  85, -17, 0, -18,  70, 4},
  { 0, 4,  82, -17, 0, -18,  73, 4}, { 0, 4,  80, -17, 0, -18,  75, 4},
  { 0, 4,  78, -18, 0, -18,  78, 4}, { 0, 4,  75, -18, 0, -17,  80, 4},
  { 0, 4,  73, -18, 0, -17,  82, 4}, { 0, 4,  70, -18, 0, -17,  85, 4},
  { 0, 4,  68, -18, 0, -17,  87, 4}, { 0, 4,  65, -18, 0, -16,  89, 4},
  { 0, 4,  63, -18, 0, -16,  91, 4}, { 0, 4,  60, -18, 0, -16,  94, 4},
  { 0, 3,  58, -18, 0, -15,  96, 4}, { 0, 4,  55, -18, 0, -15,  98, 4},
  { 0, 3,  52, -17, 0, -14, 100, 4}, { 0, 3,  50, -17, 0, -14, 102, 4},
  { 0, 3,  47, -17, 0, -13, 104, 4}, { 0, 3,  45, -17, 0, -13, 106, 4},
  { 0, 3,  42, -16, 0, -12, 108, 3}, { 0, 3,  40, -16, 0, -11, 109, 3},
  { 0, 3,  37, -15, 0, -11, 111, 3}, { 0, 2,  35, -15, 0, -10, 113, 3},
  { 0, 3,  32, -14, 0, -10, 114, 3}, { 0, 2,  29, -13, 0,  -9, 116, 3},
  { 0, 2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  25, -12, 0,  -8, 119, 2},
  { 0, 2,  22, -11, 0,  -7, 120, 2}, { 0, 1,  20, -10, 0,  -6, 121, 2},
  { 0, 1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  15,  -8, 0,  -5, 123, 2},
  { 0, 1,  13,  -7, 0,  -4, 124, 1}, { 0, 1,  11,  -6, 0,  -4, 125, 1},
  { 0, 1,   8,  -5, 0,  -3, 126, 1}, { 0, 1,   6,  -4, 0,  -2, 126, 1},
  { 0, 0,   4,  -3, 0,  -1, 127, 1}, { 0, 0,   2,  -1, 0,   0, 127, 0},
  // dummy (replicate row index 191)
  { 0, 0,   2,  -1, 0,   0, 127, 0},

#else
  // [-1, 0)
  { 0, 127,   0, 0,   0,   1, 0, 0}, { 1, 127,  -1, 0,  -3,   4, 0, 0},
  { 1, 126,  -3, 0,  -5,   8, 1, 0}, { 1, 124,  -4, 0,  -7,  13, 1, 0},
  { 2, 122,  -6, 0,  -9,  18, 1, 0}, { 2, 120,  -7, 0, -11,  22, 2, 0},
  { 3, 117,  -8, 0, -13,  27, 2, 0}, { 3, 114, -10, 0, -14,  32, 3, 0},
  { 3, 111, -11, 0, -15,  37, 3, 0}, { 3, 108, -12, 0, -16,  42, 3, 0},
  { 4, 104, -13, 0, -17,  47, 3, 0}, { 4, 100, -14, 0, -17,  52, 3, 0},
  { 4,  96, -15, 0, -18,  58, 3, 0}, { 4,  91, -16, 0, -18,  63, 4, 0},
  { 4,  87, -17, 0, -18,  68, 4, 0}, { 4,  82, -17, 0, -18,  73, 4, 0},
  { 4,  78, -18, 0, -18,  78, 4, 0}, { 4,  73, -18, 0, -17,  82, 4, 0},
  { 4,  68, -18, 0, -17,  87, 4, 0}, { 4,  63, -18, 0, -16,  91, 4, 0},
  { 3,  58, -18, 0, -15,  96, 4, 0}, { 3,  52, -17, 0, -14, 100, 4, 0},
  { 3,  47, -17, 0, -13, 104, 4, 0}, { 3,  42, -16, 0, -12, 108, 3, 0},
  { 3,  37, -15, 0, -11, 111, 3, 0}, { 3,  32, -14, 0, -10, 114, 3, 0},
  { 2,  27, -13, 0,  -8, 117, 3, 0}, { 2,  22, -11, 0,  -7, 120, 2, 0},
  { 1,  18,  -9, 0,  -6, 122, 2, 0}, { 1,  13,  -7, 0,  -4, 124, 1, 0},
  { 1,   8,  -5, 0,  -3, 126, 1, 0}, { 0,   4,  -3, 0,  -1, 127, 1, 0},
  // [0, 1)
  { 0,   0,   1, 0, 0, 127,   0,  0}, { 0,  -3,   4, 1, 1, 127,  -2,  0},
  { 0,  -6,   8, 1, 2, 126,  -3,  0}, {-1,  -8,  13, 2, 3, 125,  -5, -1},
  {-1, -11,  18, 3, 4, 123,  -7, -1}, {-1, -13,  23, 3, 4, 121,  -8, -1},
  {-1, -15,  27, 4, 5, 119, -10, -1}, {-2, -17,  33, 5, 6, 116, -12, -1},
  {-2, -18,  38, 5, 6, 113, -13, -1}, {-2, -19,  43, 6, 7, 110, -15, -2},
  {-2, -20,  49, 6, 7, 106, -16, -2}, {-2, -21,  54, 7, 7, 102, -17, -2},
  {-2, -22,  59, 7, 8,  98, -18, -2}, {-2, -22,  64, 7, 8,  94, -19, -2},
  {-2, -22,  69, 8, 8,  89, -20, -2}, {-2, -21,  74, 8, 8,  84, -21, -2},
  {-2, -21,  79, 8, 8,  79, -21, -2}, {-2, -21,  84, 8, 8,  74, -21, -2},
  {-2, -20,  89, 8, 8,  69, -22, -2}, {-2, -19,  94, 8, 7,  64, -22, -2},
  {-2, -18,  98, 8, 7,  59, -22, -2}, {-2, -17, 102, 7, 7,  54, -21, -2},
  {-2, -16, 106, 7, 6,  49, -20, -2}, {-2, -15, 110, 7, 6,  43, -19, -2},
  {-1, -13, 113, 6, 5,  38, -18, -2}, {-1, -12, 116, 6, 5,  33, -17, -2},
  {-1, -10, 119, 5, 4,  27, -15, -1}, {-1,  -8, 121, 4, 3,  23, -13, -1},
  {-1,  -7, 123, 4, 3,  18, -11, -1}, {-1,  -5, 125, 3, 2,  13,  -8, -1},
  { 0,  -3, 126, 2, 1,   8,  -6,  0}, { 0,  -2, 127, 1, 1,   4,  -3,  0},
  // [1, 2)
  { 0,  0, 127,   0, 0,   1,   0, 0}, { 0, 1, 127,  -1, 0,  -3,   4, 0},
  { 0,  1, 126,  -3, 0,  -5,   8, 1}, { 0, 1, 124,  -4, 0,  -7,  13, 1},
  { 0,  2, 122,  -6, 0,  -9,  18, 1}, { 0, 2, 120,  -7, 0, -11,  22, 2},
  { 0,  3, 117,  -8, 0, -13,  27, 2}, { 0, 3, 114, -10, 0, -14,  32, 3},
  { 0,  3, 111, -11, 0, -15,  37, 3}, { 0, 3, 108, -12, 0, -16,  42, 3},
  { 0,  4, 104, -13, 0, -17,  47, 3}, { 0, 4, 100, -14, 0, -17,  52, 3},
  { 0,  4,  96, -15, 0, -18,  58, 3}, { 0, 4,  91, -16, 0, -18,  63, 4},
  { 0,  4,  87, -17, 0, -18,  68, 4}, { 0, 4,  82, -17, 0, -18,  73, 4},
  { 0,  4,  78, -18, 0, -18,  78, 4}, { 0, 4,  73, -18, 0, -17,  82, 4},
  { 0,  4,  68, -18, 0, -17,  87, 4}, { 0, 4,  63, -18, 0, -16,  91, 4},
  { 0,  3,  58, -18, 0, -15,  96, 4}, { 0, 3,  52, -17, 0, -14, 100, 4},
  { 0,  3,  47, -17, 0, -13, 104, 4}, { 0, 3,  42, -16, 0, -12, 108, 3},
  { 0,  3,  37, -15, 0, -11, 111, 3}, { 0, 3,  32, -14, 0, -10, 114, 3},
  { 0,  2,  27, -13, 0,  -8, 117, 3}, { 0, 2,  22, -11, 0,  -7, 120, 2},
  { 0,  1,  18,  -9, 0,  -6, 122, 2}, { 0, 1,  13,  -7, 0,  -4, 124, 1},
  { 0,  1,   8,  -5, 0,  -3, 126, 1}, { 0, 0,   4,  -3, 0,  -1, 127, 1},
  // dummy (replicate row index 95)
  { 0, 0,   4,  -3, 0,  -1, 127, 1},
#endif  // WARPEDPIXEL_PREC_BITS == 6
};
/* clang-format on */

// Shuffle masks: we want to convert a sequence of bytes 0, 1, 2, ..., 15
// in an SSE register into two sequences:
// 0, 2, 2, 4, ..., 12, 12, 14, <don't care>
// 1, 3, 3, 5, ..., 13, 13, 15, <don't care>
DECLARE_ALIGNED(16, static const uint8_t,
                even_mask[16]) = { 0, 2,  2,  4,  4,  6,  6,  8,
                                   8, 10, 10, 12, 12, 14, 14, 0 };

DECLARE_ALIGNED(16, static const uint8_t,
                odd_mask[16]) = { 1, 3,  3,  5,  5,  7,  7,  9,
                                  9, 11, 11, 13, 13, 15, 15, 0 };

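// The masks below are used on the specialised alpha == 0 / gamma == 0 paths,
// where every output pixel shares the same filter: shuffle_alpha0_mask*
// broadcast one pair of 8-bit coefficients from av1_filter_8bit across the
// register, and shuffle_gamma0_mask* broadcast one pair of 16-bit
// coefficients from av1_warped_filter.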
DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask01[16]) = { 0, 1, 0, 1, 0, 1, 0, 1,
                                               0, 1, 0, 1, 0, 1, 0, 1 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask23[16]) = { 2, 3, 2, 3, 2, 3, 2, 3,
                                               2, 3, 2, 3, 2, 3, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask45[16]) = { 4, 5, 4, 5, 4, 5, 4, 5,
                                               4, 5, 4, 5, 4, 5, 4, 5 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_alpha0_mask67[16]) = { 6, 7, 6, 7, 6, 7, 6, 7,
                                               6, 7, 6, 7, 6, 7, 6, 7 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask0[16]) = { 0, 1, 2, 3, 0, 1, 2, 3,
                                              0, 1, 2, 3, 0, 1, 2, 3 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask1[16]) = { 4, 5, 6, 7, 4, 5, 6, 7,
                                              4, 5, 6, 7, 4, 5, 6, 7 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask2[16]) = { 8, 9, 10, 11, 8, 9, 10, 11,
                                              8, 9, 10, 11, 8, 9, 10, 11 };

DECLARE_ALIGNED(16, static const uint8_t,
                shuffle_gamma0_mask3[16]) = { 12, 13, 14, 15, 12, 13, 14, 15,
                                              12, 13, 14, 15, 12, 13, 14, 15 };

static INLINE void filter_src_pixels(__m128i src, __m128i *tmp, __m128i *coeff,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz, int k) {
  const __m128i src_even =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)even_mask));
  const __m128i src_odd =
      _mm_shuffle_epi8(src, _mm_load_si128((__m128i *)odd_mask));
  // The pixel order we need for 'src' is:
  // 0 2 2 4 4 6 6 8 1 3 3 5 5 7 7 9
  const __m128i src_02 = _mm_unpacklo_epi64(src_even, src_odd);
  const __m128i res_02 = _mm_maddubs_epi16(src_02, coeff[0]);
  // 4 6 6 8 8 10 10 12 5 7 7 9 9 11 11 13
  const __m128i src_46 = _mm_unpacklo_epi64(_mm_srli_si128(src_even, 4),
                                            _mm_srli_si128(src_odd, 4));
  const __m128i res_46 = _mm_maddubs_epi16(src_46, coeff[1]);
  // 1 3 3 5 5 7 7 9 2 4 4 6 6 8 8 10
  const __m128i src_13 =
      _mm_unpacklo_epi64(src_odd, _mm_srli_si128(src_even, 2));
  const __m128i res_13 = _mm_maddubs_epi16(src_13, coeff[2]);
  // 5 7 7 9 9 11 11 13 6 8 8 10 10 12 12 14
  const __m128i src_57 = _mm_unpacklo_epi64(_mm_srli_si128(src_odd, 4),
                                            _mm_srli_si128(src_even, 6));
  const __m128i res_57 = _mm_maddubs_epi16(src_57, coeff[3]);

  const __m128i round_const = _mm_set1_epi16((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  // Note: The values res_02 + res_46 and res_13 + res_57 both
  // fit into int16s at this point, but their sum may be too wide to fit
  // into an int16. However, once we also add round_const, the sum of
  // all of these fits into a uint16.
  //
  // The wrapping behaviour of _mm_add_* is used here to make sure we
  // get the correct result despite converting between different
  // (implicit) types.
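  //
  // As a rough worked bound (an added illustration, assuming bd == 8 so that
  // offset_bits_horiz == 14): pixels are at most 255, each 8-tap kernel sums
  // to 128, and the negative taps total well under 64 in magnitude, so
  // res_even + res_odd lies roughly in [-64 * 255, 192 * 255] =
  // [-16320, 48960]. Adding round_const (at least 1 << 14) shifts this to
  // approximately [64, 65350], which still fits in a uint16.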
  const __m128i res_even = _mm_add_epi16(res_02, res_46);
  const __m128i res_odd = _mm_add_epi16(res_13, res_57);
  const __m128i res =
      _mm_add_epi16(_mm_add_epi16(res_even, res_odd), round_const);
  tmp[k + 7] = _mm_srl_epi16(res, _mm_cvtsi32_si128(reduce_bits_horiz));
}

static INLINE void prepare_horizontal_filter_coeff(int alpha, int sx,
                                                   __m128i *coeff) {
  // Load the 8-tap filter row for each of the 8 output pixels
  const __m128i tmp_0 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_1 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_2 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_3 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_4 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_5 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_6 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS]);
  const __m128i tmp_7 = _mm_loadl_epi64(
      (__m128i *)&av1_filter_8bit[(sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 0 2
  const __m128i tmp_8 = _mm_unpacklo_epi16(tmp_0, tmp_2);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 1 3
  const __m128i tmp_9 = _mm_unpacklo_epi16(tmp_1, tmp_3);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 4 6
  const __m128i tmp_10 = _mm_unpacklo_epi16(tmp_4, tmp_6);
  // Coeffs 0 2 0 2 4 6 4 6 1 3 1 3 5 7 5 7 for pixels 5 7
  const __m128i tmp_11 = _mm_unpacklo_epi16(tmp_5, tmp_7);

  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 0 2 4 6
  const __m128i tmp_12 = _mm_unpacklo_epi32(tmp_8, tmp_10);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 0 2 4 6
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_8, tmp_10);
  // Coeffs 0 2 0 2 0 2 0 2 4 6 4 6 4 6 4 6 for pixels 1 3 5 7
  const __m128i tmp_14 = _mm_unpacklo_epi32(tmp_9, tmp_11);
  // Coeffs 1 3 1 3 1 3 1 3 5 7 5 7 5 7 5 7 for pixels 1 3 5 7
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_9, tmp_11);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] = _mm_unpackhi_epi64(tmp_12, tmp_14);
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

static INLINE void prepare_horizontal_filter_coeff_alpha0(int sx,
                                                          __m128i *coeff) {
  // alpha == 0: all 8 output pixels share the same filter row
  const __m128i tmp_0 =
      _mm_loadl_epi64((__m128i *)&av1_filter_8bit[sx >> WARPEDDIFF_PREC_BITS]);

  // Coeffs 0 2 for pixels 0 2 4 6 1 3 5 7
  coeff[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask01));
  // Coeffs 4 6 for pixels 0 2 4 6 1 3 5 7
  coeff[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask23));
  // Coeffs 1 3 for pixels 0 2 4 6 1 3 5 7
  coeff[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask45));
  // Coeffs 5 7 for pixels 0 2 4 6 1 3 5 7
  coeff[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_alpha0_mask67));
}

static INLINE void horizontal_filter(__m128i src, __m128i *tmp, int sx,
                                     int alpha, int k,
                                     const int offset_bits_horiz,
                                     const int reduce_bits_horiz) {
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx, coeff);
  filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
}

static INLINE void warp_horizontal_filter(const uint8_t *ref, __m128i *tmp,
                                          int stride, int32_t ix4, int32_t iy4,
                                          int32_t sx4, int alpha, int beta,
                                          int p_height, int height, int i,
                                          const int offset_bits_horiz,
                                          const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                      reduce_bits_horiz);
  }
}

static INLINE void warp_horizontal_filter_alpha0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));

    __m128i coeff[4];
    prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static INLINE void warp_horizontal_filter_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[4];
  prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static INLINE void warp_horizontal_filter_alpha0_beta0(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[4];
  prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    filter_src_pixels(src, tmp, coeff, offset_bits_horiz, reduce_bits_horiz, k);
  }
}

static INLINE void unpack_weights_and_set_round_const(
    ConvolveParams *conv_params, const int round_bits, const int offset_bits,
    __m128i *res_sub_const, __m128i *round_bits_const, __m128i *wt) {
  *res_sub_const =
      _mm_set1_epi16(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  *round_bits_const = _mm_set1_epi16(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi16((int16_t)w0);
  const __m128i wt1 = _mm_set1_epi16((int16_t)w1);
  *wt = _mm_unpacklo_epi16(wt0, wt1);
}

static INLINE void prepare_vertical_filter_coeffs(int gamma, int sy,
                                                  __m128i *coeffs) {
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // even coeffs
  coeffs[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  coeffs[1] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  coeffs[2] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  coeffs[3] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  // odd coeffs
  coeffs[4] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeffs[5] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeffs[6] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeffs[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

static INLINE void prepare_vertical_filter_coeffs_gamma0(int sy,
                                                         __m128i *coeffs) {
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sy >> WARPEDDIFF_PREC_BITS)));

  // even coeffs
  coeffs[0] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask0));
  coeffs[1] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask1));
  coeffs[2] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask2));
  coeffs[3] =
      _mm_shuffle_epi8(tmp_0, _mm_load_si128((__m128i *)shuffle_gamma0_mask3));

  // odd coeffs
  coeffs[4] = coeffs[0];
  coeffs[5] = coeffs[1];
  coeffs[6] = coeffs[2];
  coeffs[7] = coeffs[3];
}

static INLINE void filter_src_pixels_vertical(__m128i *tmp, __m128i *coeffs,
                                              __m128i *res_lo, __m128i *res_hi,
                                              int k) {
  // Load from tmp and rearrange pairs of consecutive rows into the
  // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
  const __m128i *src = tmp + (k + 4);
  const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
  const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
  const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
  const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

  const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
  const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
  const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
  const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);

  const __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6));

  // Filter odd-index pixels
  const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
  const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
  const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
  const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

  const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[4]);
  const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[5]);
  const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[6]);
  const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[7]);

  const __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7));

  // Rearrange pixels back into the order 0 ... 7
  *res_lo = _mm_unpacklo_epi32(res_even, res_odd);
  *res_hi = _mm_unpackhi_epi32(res_even, res_odd);
}

static INLINE void store_vertical_filter_output(
    __m128i *res_lo, __m128i *res_hi, const __m128i *res_add_const,
    const __m128i *wt, const __m128i *res_sub_const, __m128i *round_bits_const,
    uint8_t *pred, ConvolveParams *conv_params, int i, int j, int k,
    const int reduce_bits_vert, int p_stride, int p_width,
    const int round_bits) {
  __m128i res_lo_1 = *res_lo;
  __m128i res_hi_1 = *res_hi;

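  // Compound (two-reference) path: write 16-bit intermediates into
  // conv_params->dst and, when do_average is set, combine them with the
  // values already stored there (either a simple or a distance-weighted
  // average) before rounding down to the 8-bit output in 'pred'.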
  if (conv_params->is_compound) {
    __m128i *const p =
        (__m128i *)&conv_params->dst[(i + k + 4) * conv_params->dst_stride + j];
    res_lo_1 = _mm_srai_epi32(_mm_add_epi32(res_lo_1, *res_add_const),
                              reduce_bits_vert);
    const __m128i temp_lo_16 = _mm_packus_epi32(res_lo_1, res_lo_1);
    __m128i res_lo_16;
    if (conv_params->do_average) {
      __m128i *const dst8 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
      const __m128i p_16 = _mm_loadl_epi64(p);

      if (conv_params->use_dist_wtd_comp_avg) {
        const __m128i p_16_lo = _mm_unpacklo_epi16(p_16, temp_lo_16);
        const __m128i wt_res_lo = _mm_madd_epi16(p_16_lo, *wt);
        const __m128i shifted_32 =
            _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
        res_lo_16 = _mm_packus_epi32(shifted_32, shifted_32);
      } else {
        res_lo_16 = _mm_srai_epi16(_mm_add_epi16(p_16, temp_lo_16), 1);
      }

      res_lo_16 = _mm_add_epi16(res_lo_16, *res_sub_const);

      res_lo_16 = _mm_srai_epi16(_mm_add_epi16(res_lo_16, *round_bits_const),
                                 round_bits);
      __m128i res_8_lo = _mm_packus_epi16(res_lo_16, res_lo_16);
      *(uint32_t *)dst8 = _mm_cvtsi128_si32(res_8_lo);
    } else {
      _mm_storel_epi64(p, temp_lo_16);
    }
    if (p_width > 4) {
      __m128i *const p4 =
          (__m128i *)&conv_params
              ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];
      res_hi_1 = _mm_srai_epi32(_mm_add_epi32(res_hi_1, *res_add_const),
                                reduce_bits_vert);
      const __m128i temp_hi_16 = _mm_packus_epi32(res_hi_1, res_hi_1);
      __m128i res_hi_16;

      if (conv_params->do_average) {
        __m128i *const dst8_4 =
            (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
        const __m128i p4_16 = _mm_loadl_epi64(p4);

        if (conv_params->use_dist_wtd_comp_avg) {
          const __m128i p_16_hi = _mm_unpacklo_epi16(p4_16, temp_hi_16);
          const __m128i wt_res_hi = _mm_madd_epi16(p_16_hi, *wt);
          const __m128i shifted_32 =
              _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
          res_hi_16 = _mm_packus_epi32(shifted_32, shifted_32);
        } else {
          res_hi_16 = _mm_srai_epi16(_mm_add_epi16(p4_16, temp_hi_16), 1);
        }
        res_hi_16 = _mm_add_epi16(res_hi_16, *res_sub_const);

        res_hi_16 = _mm_srai_epi16(_mm_add_epi16(res_hi_16, *round_bits_const),
                                   round_bits);
        __m128i res_8_hi = _mm_packus_epi16(res_hi_16, res_hi_16);
        *(uint32_t *)dst8_4 = _mm_cvtsi128_si32(res_8_hi);

      } else {
        _mm_storel_epi64(p4, temp_hi_16);
      }
    }
  } else {
    const __m128i res_lo_round = _mm_srai_epi32(
        _mm_add_epi32(res_lo_1, *res_add_const), reduce_bits_vert);
    const __m128i res_hi_round = _mm_srai_epi32(
        _mm_add_epi32(res_hi_1, *res_add_const), reduce_bits_vert);

    const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
    __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

    // Store, blending with 'pred' if needed
    __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

    // Note: If we're outputting a 4x4 block, we need to be very careful
    // to only output 4 pixels at this point, to avoid encode/decode
    // mismatches when encoding with multiple threads.
    if (p_width == 4) {
      *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
    } else {
      _mm_storel_epi64(p, res_8bit);
    }
  }
}

static INLINE void warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs(gamma, sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void warp_vertical_filter_gamma0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  int k;
  (void)gamma;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    int sy = sy4 + delta * (k + 4);

    __m128i coeffs[8];
    prepare_vertical_filter_coeffs_gamma0(sy, coeffs);

    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void warp_vertical_filter_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs(gamma, sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void warp_vertical_filter_gamma0_delta0(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  (void)delta;
  (void)gamma;
  int k;
  __m128i res_sub_const, round_bits_const, wt;
  unpack_weights_and_set_round_const(conv_params, round_bits, offset_bits,
                                     &res_sub_const, &round_bits_const, &wt);

  __m128i coeffs[8];
  prepare_vertical_filter_coeffs_gamma0(sy4, coeffs);
  // Vertical filter
  for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
    __m128i res_lo;
    __m128i res_hi;
    filter_src_pixels_vertical(tmp, coeffs, &res_lo, &res_hi, k);

    store_vertical_filter_output(&res_lo, &res_hi, res_add_const, &wt,
                                 &res_sub_const, &round_bits_const, pred,
                                 conv_params, i, j, k, reduce_bits_vert,
                                 p_stride, p_width, round_bits);
  }
}

static INLINE void prepare_warp_vertical_filter(
    uint8_t *pred, __m128i *tmp, ConvolveParams *conv_params, int16_t gamma,
    int16_t delta, int p_height, int p_stride, int p_width, int i, int j,
    int sy4, const int reduce_bits_vert, const __m128i *res_add_const,
    const int round_bits, const int offset_bits) {
  if (gamma == 0 && delta == 0)
    warp_vertical_filter_gamma0_delta0(
        pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i, j,
        sy4, reduce_bits_vert, res_add_const, round_bits, offset_bits);
  else if (gamma == 0 && delta != 0)
    warp_vertical_filter_gamma0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else if (gamma != 0 && delta == 0)
    warp_vertical_filter_delta0(pred, tmp, conv_params, gamma, delta, p_height,
                                p_stride, p_width, i, j, sy4, reduce_bits_vert,
                                res_add_const, round_bits, offset_bits);
  else
    warp_vertical_filter(pred, tmp, conv_params, gamma, delta, p_height,
                         p_stride, p_width, i, j, sy4, reduce_bits_vert,
                         res_add_const, round_bits, offset_bits);
}

static INLINE void prepare_warp_horizontal_filter(
    const uint8_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    warp_horizontal_filter_alpha0_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else if (alpha == 0 && beta != 0)
    warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
  else if (alpha != 0 && beta == 0)
    warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                 p_height, height, i, offset_bits_horiz,
                                 reduce_bits_horiz);
  else
    warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                           p_height, height, i, offset_bits_horiz,
                           reduce_bits_horiz);
}

void av1_warp_affine_sse4_1(const int32_t *mat, const uint8_t *ref, int width,
                            int height, int stride, uint8_t *pred, int p_col,
                            int p_row, int p_width, int p_height, int p_stride,
                            int subsampling_x, int subsampling_y,
                            ConvolveParams *conv_params, int16_t alpha,
                            int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int bd = 8;
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/
  __m128i res_add_const_1;
  if (conv_params->is_compound == 1) {
    res_add_const_1 = _mm_add_epi32(reduce_bits_vert_const, res_add_const);
  } else {
    res_add_const_1 = _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                                     ((1 << reduce_bits_vert) >> 1));
  }

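  // The output is produced in 8x8 blocks. For each block, the horizontal pass
  // fills tmp[] with 15 rows of intermediate values (the 8 output rows plus 7
  // extra rows of support for the 8-tap vertical filter), and the vertical
  // pass then generates the final rows from them. Out-of-frame source columns
  // are handled either by the constant-row shortcuts below or by shuffling in
  // replicated edge pixels via warp_pad_left / warp_pad_right.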
  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int32_t dst_x = mat[2] * src_x + mat[3] * src_y + mat[0];
      const int32_t dst_y = mat[4] * src_x + mat[5] * src_y + mat[1];
      const int32_t x4 = dst_x >> subsampling_x;
      const int32_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src = _mm_shuffle_epi8(src, shuffle_reg_left);
          }
          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src = _mm_shuffle_epi8(src, shuffle_reg_right);
          }
          horizontal_filter(src, tmp, sx, alpha, k, offset_bits_horiz,
                            reduce_bits_horiz);
        }
      } else {
        prepare_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                       beta, p_height, height, i,
                                       offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      prepare_warp_vertical_filter(
          pred, tmp, conv_params, gamma, delta, p_height, p_stride, p_width, i,
          j, sy4, reduce_bits_vert, &res_add_const_1, round_bits, offset_bits);
    }
  }
}
964