// Copyright 2020 The libgav1 Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "src/dsp/warp.h"
#include "src/utils/cpu.h"

#if LIBGAV1_ENABLE_SSE4_1

#include <smmintrin.h>

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <type_traits>

#include "src/dsp/constants.h"
#include "src/dsp/dsp.h"
#include "src/dsp/x86/common_sse4.h"
#include "src/dsp/x86/transpose_sse4.h"
#include "src/utils/common.h"
#include "src/utils/constants.h"

namespace libgav1 {
namespace dsp {
namespace low_bitdepth {
namespace {

// Number of extra bits of precision in warped filtering.
constexpr int kWarpedDiffPrecisionBits = 10;

// This assumes the two filters contain filter[x] and filter[x+2].
inline __m128i AccumulateFilter(const __m128i sum, const __m128i filter_0,
                                const __m128i filter_1,
                                const __m128i& src_window) {
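  // Pair tap x of |filter_0| with tap x + 2 of |filter_1| in each 16-bit
  // lane, and pair the source bytes at the same offsets, so that a single
  // _mm_maddubs_epi16 applies both taps to all eight columns at once.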
  const __m128i filter_taps = _mm_unpacklo_epi8(filter_0, filter_1);
  const __m128i src =
      _mm_unpacklo_epi8(src_window, _mm_srli_si128(src_window, 2));
  return _mm_add_epi16(sum, _mm_maddubs_epi16(src, filter_taps));
}

constexpr int kFirstPassOffset = 1 << 14;
constexpr int kOffsetRemoval =
    (kFirstPassOffset >> kInterRoundBitsHorizontal) * 128;
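// The horizontal pass starts each sum at -|kFirstPassOffset| so that the
// 16-bit accumulation stays in range. Because the warped filter taps sum to
// 1 << kFilterBits (128), the bias that survives the horizontal rounding
// shift adds up to exactly |kOffsetRemoval| in the vertical pass, where
// WriteVerticalFilter adds it back out.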

// Applies the horizontal filter to one source row and stores the result in
// |intermediate_result_row|. |intermediate_result_row| is a row in the 15x8
// |intermediate_result| two-dimensional array.
inline void HorizontalFilter(const int sx4, const int16_t alpha,
                             const __m128i src_row,
                             int16_t intermediate_result_row[8]) {
  int sx = sx4 - MultiplyBy4(alpha);
  __m128i filter[8];
  for (__m128i& f : filter) {
    const int offset = RightShiftWithRounding(sx, kWarpedDiffPrecisionBits) +
                       kWarpedPixelPrecisionShifts;
    f = LoadLo8(kWarpedFilters8[offset]);
    sx += alpha;
  }
  Transpose8x8To4x16_U8(filter, filter);
  // |filter| now contains two filters per register.
  // Staggered combinations allow us to take advantage of _mm_maddubs_epi16
  // without overflowing the sign bit. The sign bit is hit only where two taps
  // paired in a single madd add up to more than 128. This is only possible with
  // two adjacent "inner" taps. Therefore, pairing odd with odd and even with
  // even guarantees safety. |sum| is given a negative offset to allow for large
  // intermediate values.
  // k = 0, 2.
  __m128i src_row_window = src_row;
  __m128i sum = _mm_set1_epi16(-kFirstPassOffset);
  sum = AccumulateFilter(sum, filter[0], filter[1], src_row_window);

  // k = 1, 3.
  src_row_window = _mm_srli_si128(src_row_window, 1);
  sum = AccumulateFilter(sum, _mm_srli_si128(filter[0], 8),
                         _mm_srli_si128(filter[1], 8), src_row_window);
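  // The source window is currently aligned with tap 1; shift by 3 to align
  // it with tap 4 for the next pair of accumulations.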
  // k = 4, 6.
  src_row_window = _mm_srli_si128(src_row_window, 3);
  sum = AccumulateFilter(sum, filter[2], filter[3], src_row_window);

  // k = 5, 7.
  src_row_window = _mm_srli_si128(src_row_window, 1);
  sum = AccumulateFilter(sum, _mm_srli_si128(filter[2], 8),
                         _mm_srli_si128(filter[3], 8), src_row_window);

  sum = RightShiftWithRounding_S16(sum, kInterRoundBitsHorizontal);
  StoreUnaligned16(intermediate_result_row, sum);
}

template <bool is_compound>
inline void WriteVerticalFilter(const __m128i filter[8],
                                const int16_t intermediate_result[15][8], int y,
                                void* dst_row) {
  constexpr int kRoundBitsVertical =
      is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
  __m128i sum_low = _mm_set1_epi32(kOffsetRemoval);
  __m128i sum_high = sum_low;
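  // Process the 8 vertical taps in pairs so that _mm_madd_epi16 can fold two
  // 16-bit tap/source products into each 32-bit lane.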
  for (int k = 0; k < 8; k += 2) {
    const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
    const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
    const __m128i intermediate_0 = LoadUnaligned16(intermediate_result[y + k]);
    const __m128i intermediate_1 =
        LoadUnaligned16(intermediate_result[y + k + 1]);
    const __m128i intermediate_low =
        _mm_unpacklo_epi16(intermediate_0, intermediate_1);
    const __m128i intermediate_high =
        _mm_unpackhi_epi16(intermediate_0, intermediate_1);

    const __m128i product_low = _mm_madd_epi16(filters_low, intermediate_low);
    const __m128i product_high =
        _mm_madd_epi16(filters_high, intermediate_high);
    sum_low = _mm_add_epi32(sum_low, product_low);
    sum_high = _mm_add_epi32(sum_high, product_high);
  }
  sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
  sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
  if (is_compound) {
    const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
    StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
  } else {
    const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
    StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
  }
}

template <bool is_compound>
inline void WriteVerticalFilter(const __m128i filter[8],
                                const int16_t* intermediate_result_column,
                                void* dst_row) {
  constexpr int kRoundBitsVertical =
      is_compound ? kInterRoundBitsCompoundVertical : kInterRoundBitsVertical;
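  // The region 2 intermediate values are generated without the first pass
  // offset, so no offset removal is needed here.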
  __m128i sum_low = _mm_setzero_si128();
  __m128i sum_high = _mm_setzero_si128();
  for (int k = 0; k < 8; k += 2) {
    const __m128i filters_low = _mm_unpacklo_epi16(filter[k], filter[k + 1]);
    const __m128i filters_high = _mm_unpackhi_epi16(filter[k], filter[k + 1]);
    // Equivalent to unpacking two vectors made by duplicating int16_t values.
    const __m128i intermediate =
        _mm_set1_epi32((intermediate_result_column[k + 1] << 16) |
                       intermediate_result_column[k]);
    const __m128i product_low = _mm_madd_epi16(filters_low, intermediate);
    const __m128i product_high = _mm_madd_epi16(filters_high, intermediate);
    sum_low = _mm_add_epi32(sum_low, product_low);
    sum_high = _mm_add_epi32(sum_high, product_high);
  }
  sum_low = RightShiftWithRounding_S32(sum_low, kRoundBitsVertical);
  sum_high = RightShiftWithRounding_S32(sum_high, kRoundBitsVertical);
  if (is_compound) {
    const __m128i sum = _mm_packs_epi32(sum_low, sum_high);
    StoreUnaligned16(static_cast<int16_t*>(dst_row), sum);
  } else {
    const __m128i sum = _mm_packus_epi32(sum_low, sum_high);
    StoreLo8(static_cast<uint8_t*>(dst_row), _mm_packus_epi16(sum, sum));
  }
}

template <bool is_compound, typename DestType>
inline void VerticalFilter(const int16_t source[15][8], int y4, int gamma,
                           int delta, DestType* dest_row,
                           ptrdiff_t dest_stride) {
  int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
  for (int y = 0; y < 8; ++y) {
    int sy = sy4 - MultiplyBy4(gamma);
    __m128i filter[8];
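    // Compute the 8 column filters for this destination row. After the
    // transpose, each register holds one tap for all 8 columns, which is the
    // layout WriteVerticalFilter expects.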
    for (__m128i& f : filter) {
      const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
                         kWarpedPixelPrecisionShifts;
      f = LoadUnaligned16(kWarpedFilters[offset]);
      sy += gamma;
    }
    Transpose8x8_U16(filter, filter);
    WriteVerticalFilter<is_compound>(filter, source, y, dest_row);
    dest_row += dest_stride;
    sy4 += delta;
  }
}

template <bool is_compound, typename DestType>
inline void VerticalFilter(const int16_t* source_cols, int y4, int gamma,
                           int delta, DestType* dest_row,
                           ptrdiff_t dest_stride) {
  int sy4 = (y4 & ((1 << kWarpedModelPrecisionBits) - 1)) - MultiplyBy4(delta);
  for (int y = 0; y < 8; ++y) {
    int sy = sy4 - MultiplyBy4(gamma);
    __m128i filter[8];
    for (__m128i& f : filter) {
      const int offset = RightShiftWithRounding(sy, kWarpedDiffPrecisionBits) +
                         kWarpedPixelPrecisionShifts;
      f = LoadUnaligned16(kWarpedFilters[offset]);
      sy += gamma;
    }
    Transpose8x8_U16(filter, filter);
    WriteVerticalFilter<is_compound>(filter, &source_cols[y], dest_row);
    dest_row += dest_stride;
    sy4 += delta;
  }
}

template <bool is_compound, typename DestType>
inline void WarpRegion1(const uint8_t* src, ptrdiff_t source_stride,
                        int source_width, int source_height, int ix4, int iy4,
                        DestType* dst_row, ptrdiff_t dest_stride) {
  // Region 1
  // Points to the left or right border of the first row of |src|.
  const uint8_t* first_row_border =
      (ix4 + 7 <= 0) ? src : src + source_width - 1;
  // In general, for y in [-7, 8), the row number iy4 + y is clipped:
  //   const int row = Clip3(iy4 + y, 0, source_height - 1);
  // In two special cases, iy4 + y is clipped to either 0 or
  // source_height - 1 for all y. In the rest of the cases, iy4 + y is
  // bounded and we can avoid clipping iy4 + y by relying on a reference
  // frame's boundary extension on the top and bottom.
  // Region 1.
  // Every sample used to calculate the prediction block has the same
  // value. So the whole prediction block has the same value.
  const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
  const uint8_t row_border_pixel = first_row_border[row * source_stride];

  if (is_compound) {
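    // For a constant block the normalized filters reduce to a shift, so the
    // compound output is the border pixel carrying the extra precision bits
    // that the compound rounding keeps relative to the final rounding.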
    const __m128i sum =
        _mm_set1_epi16(row_border_pixel << (kInterRoundBitsVertical -
                                            kInterRoundBitsCompoundVertical));
    StoreUnaligned16(dst_row, sum);
  } else {
    memset(dst_row, row_border_pixel, 8);
  }
  const DestType* const first_dst_row = dst_row;
  dst_row += dest_stride;
  for (int y = 1; y < 8; ++y) {
    memcpy(dst_row, first_dst_row, 8 * sizeof(*dst_row));
    dst_row += dest_stride;
  }
}

template <bool is_compound, typename DestType>
inline void WarpRegion2(const uint8_t* src, ptrdiff_t source_stride,
                        int source_width, int y4, int ix4, int iy4, int gamma,
                        int delta, int16_t intermediate_result_column[15],
                        DestType* dst_row, ptrdiff_t dest_stride) {
  // Region 2.
  // Points to the left or right border of the first row of |src|.
  const uint8_t* first_row_border =
      (ix4 + 7 <= 0) ? src : src + source_width - 1;
  // In general, for y in [-7, 8), the row number iy4 + y is clipped:
  //   const int row = Clip3(iy4 + y, 0, source_height - 1);
  // In two special cases, iy4 + y is clipped to either 0 or
  // source_height - 1 for all y. In the rest of the cases, iy4 + y is
  // bounded and we can avoid clipping iy4 + y by relying on a reference
  // frame's boundary extension on the top and bottom.

  // Region 2.
  // Horizontal filter.
  // The input values in this region are generated by extending the border
  // which makes them identical in the horizontal direction. This
  // computation could be inlined in the vertical pass but most
  // implementations will need a transpose of some sort.
  // It is not necessary to use the offset values here because the
  // horizontal pass is a simple shift and the vertical pass will always
  // require using 32 bits.
  for (int y = -7; y < 8; ++y) {
    // We may over-read up to 13 pixels above the top source row, or up
    // to 13 pixels below the bottom source row. This is proved in
    // warp.cc.
    const int row = iy4 + y;
    int sum = first_row_border[row * source_stride];
    sum <<= (kFilterBits - kInterRoundBitsHorizontal);
    intermediate_result_column[y + 7] = sum;
  }
  // Region 2 vertical filter.
  VerticalFilter<is_compound, DestType>(intermediate_result_column, y4, gamma,
                                        delta, dst_row, dest_stride);
}

template <bool is_compound, typename DestType>
inline void WarpRegion3(const uint8_t* src, ptrdiff_t source_stride,
                        int source_height, int alpha, int beta, int x4, int ix4,
                        int iy4, int16_t intermediate_result[15][8]) {
  // Region 3
  // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.

  // In general, for y in [-7, 8), the row number iy4 + y is clipped:
  //   const int row = Clip3(iy4 + y, 0, source_height - 1);
  // In two special cases, iy4 + y is clipped to either 0 or
  // source_height - 1 for all y. In the rest of the cases, iy4 + y is
  // bounded and we can avoid clipping iy4 + y by relying on a reference
  // frame's boundary extension on the top and bottom.
  // Horizontal filter.
  const int row = (iy4 + 7 <= 0) ? 0 : source_height - 1;
  const uint8_t* const src_row = src + row * source_stride;
  // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
  // read but is ignored.
  //
  // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
  // bytes after src_row[source_width - 1]. We assume the source frame
  // has left and right borders of at least 13 bytes that extend the
  // frame boundary pixels. We also assume there is at least one extra
  // padding byte after the right border of the last source row.
  const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
  int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
  for (int y = -7; y < 8; ++y) {
    HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
    sx4 += beta;
  }
}

template <bool is_compound, typename DestType>
inline void WarpRegion4(const uint8_t* src, ptrdiff_t source_stride, int alpha,
                        int beta, int x4, int ix4, int iy4,
                        int16_t intermediate_result[15][8]) {
  // Region 4.
  // At this point, we know ix4 - 7 < source_width - 1 and ix4 + 7 > 0.

  // In general, for y in [-7, 8), the row number iy4 + y is clipped:
  //   const int row = Clip3(iy4 + y, 0, source_height - 1);
  // In two special cases, iy4 + y is clipped to either 0 or
  // source_height - 1 for all y. In the rest of the cases, iy4 + y is
  // bounded and we can avoid clipping iy4 + y by relying on a reference
  // frame's boundary extension on the top and bottom.
  // Horizontal filter.
  int sx4 = (x4 & ((1 << kWarpedModelPrecisionBits) - 1)) - beta * 7;
  for (int y = -7; y < 8; ++y) {
    // We may over-read up to 13 pixels above the top source row, or up
    // to 13 pixels below the bottom source row. This is proved in
    // warp.cc.
    const int row = iy4 + y;
    const uint8_t* const src_row = src + row * source_stride;
    // Read 15 samples from &src_row[ix4 - 7]. The 16th sample is also
    // read but is ignored.
    //
    // NOTE: This may read up to 13 bytes before src_row[0] or up to 14
    // bytes after src_row[source_width - 1]. We assume the source frame
    // has left and right borders of at least 13 bytes that extend the
    // frame boundary pixels. We also assume there is at least one extra
    // padding byte after the right border of the last source row.
    const __m128i src_row_v = LoadUnaligned16(&src_row[ix4 - 7]);
    HorizontalFilter(sx4, alpha, src_row_v, intermediate_result[y + 7]);
    sx4 += beta;
  }
}

template <bool is_compound, typename DestType>
inline void HandleWarpBlock(const uint8_t* src, ptrdiff_t source_stride,
                            int source_width, int source_height,
                            const int* warp_params, int subsampling_x,
                            int subsampling_y, int src_x, int src_y,
                            int16_t alpha, int16_t beta, int16_t gamma,
                            int16_t delta, DestType* dst_row,
                            ptrdiff_t dest_stride) {
  union {
    // Intermediate_result is the output of the horizontal filtering and
    // rounding. The range is within 13 (= bitdepth + kFilterBits + 1 -
    // kInterRoundBitsHorizontal) bits (unsigned). We use the signed int16_t
    // type so that we can start with a negative offset and restore it on the
    // final filter sum.
    int16_t intermediate_result[15][8];  // 15 rows, 8 columns.
    // In the simple special cases where the samples in each row are all the
    // same, store one sample per row in a column vector.
    int16_t intermediate_result_column[15];
  };

  const int dst_x =
      src_x * warp_params[2] + src_y * warp_params[3] + warp_params[0];
  const int dst_y =
      src_x * warp_params[4] + src_y * warp_params[5] + warp_params[1];
  const int x4 = dst_x >> subsampling_x;
  const int y4 = dst_y >> subsampling_y;
  const int ix4 = x4 >> kWarpedModelPrecisionBits;
  const int iy4 = y4 >> kWarpedModelPrecisionBits;
  // A prediction block may fall outside the frame's boundaries. If a
  // prediction block is calculated using only samples outside the frame's
  // boundary, the filtering can be simplified. We can divide the plane
  // into several regions and handle them differently.
  //
  //                |           |
  //            1   |     3     |   1
  //                |           |
  //         -------+-----------+-------
  //                |***********|
  //            2   |*****4*****|   2
  //                |***********|
  //         -------+-----------+-------
  //                |           |
  //            1   |     3     |   1
  //                |           |
  //
  // At the center, region 4 represents the frame and is the general case.
  //
  // In regions 1 and 2, the prediction block is outside the frame's
  // boundary horizontally. Therefore the horizontal filtering can be
  // simplified. Furthermore, in the region 1 (at the four corners), the
  // prediction is outside the frame's boundary both horizontally and
  // vertically, so we get a constant prediction block.
  //
  // In region 3, the prediction block is outside the frame's boundary
  // vertically. Unfortunately because we apply the horizontal filters
  // first, by the time we apply the vertical filters, they no longer see
  // simple inputs. So the only simplification is that all the rows are
  // the same, but we still need to apply all the horizontal and vertical
  // filters.

  // Check for two simple special cases, where the horizontal filter can
  // be significantly simplified.
  //
  // In general, for each row, the horizontal filter is calculated as
  // follows:
  //   for (int x = -4; x < 4; ++x) {
  //     const int offset = ...;
  //     int sum = first_pass_offset;
  //     for (int k = 0; k < 8; ++k) {
  //       const int column = Clip3(ix4 + x + k - 3, 0, source_width - 1);
  //       sum += kWarpedFilters[offset][k] * src_row[column];
  //     }
  //     ...
  //   }
  // The column index before clipping, ix4 + x + k - 3, varies in the range
  // ix4 - 7 <= ix4 + x + k - 3 <= ix4 + 7. If ix4 - 7 >= source_width - 1
  // or ix4 + 7 <= 0, then all the column indexes are clipped to the same
  // border index (source_width - 1 or 0, respectively). Then for each x,
  // the inner for loop of the horizontal filter is reduced to multiplying
  // the border pixel by the sum of the filter coefficients.
  if (ix4 - 7 >= source_width - 1 || ix4 + 7 <= 0) {
    if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
      // Outside the frame in both directions. One repeated value.
      WarpRegion1<is_compound, DestType>(src, source_stride, source_width,
                                         source_height, ix4, iy4, dst_row,
                                         dest_stride);
      return;
    }
    // Outside the frame horizontally. Rows repeated.
    WarpRegion2<is_compound, DestType>(
        src, source_stride, source_width, y4, ix4, iy4, gamma, delta,
        intermediate_result_column, dst_row, dest_stride);
    return;
  }

  if ((iy4 - 7 >= source_height - 1 || iy4 + 7 <= 0)) {
    // Outside the frame vertically.
    WarpRegion3<is_compound, DestType>(src, source_stride, source_height, alpha,
                                       beta, x4, ix4, iy4, intermediate_result);
  } else {
    // Inside the frame.
    WarpRegion4<is_compound, DestType>(src, source_stride, alpha, beta, x4, ix4,
                                       iy4, intermediate_result);
  }
  // Region 3 and 4 vertical filter.
  VerticalFilter<is_compound, DestType>(intermediate_result, y4, gamma, delta,
                                        dst_row, dest_stride);
}

template <bool is_compound>
void Warp_SSE4_1(const void* source, ptrdiff_t source_stride, int source_width,
                 int source_height, const int* warp_params, int subsampling_x,
                 int subsampling_y, int block_start_x, int block_start_y,
                 int block_width, int block_height, int16_t alpha, int16_t beta,
                 int16_t gamma, int16_t delta, void* dest,
                 ptrdiff_t dest_stride) {
  const auto* const src = static_cast<const uint8_t*>(source);
  using DestType =
      typename std::conditional<is_compound, int16_t, uint8_t>::type;
  auto* dst = static_cast<DestType*>(dest);

  // Warp process applies for each 8x8 block.
  assert(block_width >= 8);
  assert(block_height >= 8);
  const int block_end_x = block_start_x + block_width;
  const int block_end_y = block_start_y + block_height;

  const int start_x = block_start_x;
  const int start_y = block_start_y;
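  // |src_x| and |src_y| track the center of the current 8x8 block, scaled by
  // the subsampling factors to match the units of |warp_params|.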
  int src_x = (start_x + 4) << subsampling_x;
  int src_y = (start_y + 4) << subsampling_y;
  const int end_x = (block_end_x + 4) << subsampling_x;
  const int end_y = (block_end_y + 4) << subsampling_y;
  do {
    DestType* dst_row = dst;
    src_x = (start_x + 4) << subsampling_x;
    do {
      HandleWarpBlock<is_compound, DestType>(
          src, source_stride, source_width, source_height, warp_params,
          subsampling_x, subsampling_y, src_x, src_y, alpha, beta, gamma, delta,
          dst_row, dest_stride);
      src_x += (8 << subsampling_x);
      dst_row += 8;
    } while (src_x < end_x);
    dst += 8 * dest_stride;
    src_y += (8 << subsampling_y);
  } while (src_y < end_y);
}

void Init8bpp() {
  Dsp* const dsp = dsp_internal::GetWritableDspTable(kBitdepth8);
  assert(dsp != nullptr);
  dsp->warp = Warp_SSE4_1</*is_compound=*/false>;
  dsp->warp_compound = Warp_SSE4_1</*is_compound=*/true>;
}

}  // namespace
}  // namespace low_bitdepth

void WarpInit_SSE4_1() { low_bitdepth::Init8bpp(); }

}  // namespace dsp
}  // namespace libgav1
#else  // !LIBGAV1_ENABLE_SSE4_1

namespace libgav1 {
namespace dsp {

void WarpInit_SSE4_1() {}

}  // namespace dsp
}  // namespace libgav1
#endif  // LIBGAV1_ENABLE_SSE4_1