/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */
11 
12 #include <assert.h>
13 #include <emmintrin.h>  // SSE2
14 
15 #include "aom/aom_integer.h"
16 #include "aom_dsp/x86/mem_sse2.h"
17 #include "av1/common/onyxc_int.h"
18 #include "av1/common/txb_common.h"
19 
load_levels_4x4x5_sse2(const uint8_t * const src,const int stride,const ptrdiff_t * const offsets,__m128i * const level)20 static INLINE void load_levels_4x4x5_sse2(const uint8_t *const src,
21                                           const int stride,
22                                           const ptrdiff_t *const offsets,
23                                           __m128i *const level) {
24   level[0] = load_8bit_4x4_to_1_reg_sse2(src + 1, stride);
25   level[1] = load_8bit_4x4_to_1_reg_sse2(src + stride, stride);
26   level[2] = load_8bit_4x4_to_1_reg_sse2(src + offsets[0], stride);
27   level[3] = load_8bit_4x4_to_1_reg_sse2(src + offsets[1], stride);
28   level[4] = load_8bit_4x4_to_1_reg_sse2(src + offsets[2], stride);
29 }
30 
load_levels_8x2x5_sse2(const uint8_t * const src,const int stride,const ptrdiff_t * const offsets,__m128i * const level)31 static INLINE void load_levels_8x2x5_sse2(const uint8_t *const src,
32                                           const int stride,
33                                           const ptrdiff_t *const offsets,
34                                           __m128i *const level) {
35   level[0] = load_8bit_8x2_to_1_reg_sse2(src + 1, stride);
36   level[1] = load_8bit_8x2_to_1_reg_sse2(src + stride, stride);
37   level[2] = load_8bit_8x2_to_1_reg_sse2(src + offsets[0], stride);
38   level[3] = load_8bit_8x2_to_1_reg_sse2(src + offsets[1], stride);
39   level[4] = load_8bit_8x2_to_1_reg_sse2(src + offsets[2], stride);
40 }
41 
load_levels_16x1x5_sse2(const uint8_t * const src,const int stride,const ptrdiff_t * const offsets,__m128i * const level)42 static INLINE void load_levels_16x1x5_sse2(const uint8_t *const src,
43                                            const int stride,
44                                            const ptrdiff_t *const offsets,
45                                            __m128i *const level) {
46   level[0] = _mm_loadu_si128((__m128i *)(src + 1));
47   level[1] = _mm_loadu_si128((__m128i *)(src + stride));
48   level[2] = _mm_loadu_si128((__m128i *)(src + offsets[0]));
49   level[3] = _mm_loadu_si128((__m128i *)(src + offsets[1]));
50   level[4] = _mm_loadu_si128((__m128i *)(src + offsets[2]));
51 }
52 
get_coeff_contexts_kernel_sse2(__m128i * const level)53 static INLINE __m128i get_coeff_contexts_kernel_sse2(__m128i *const level) {
54   const __m128i const_3 = _mm_set1_epi8(3);
55   const __m128i const_4 = _mm_set1_epi8(4);
56   __m128i count;
57 
58   count = _mm_min_epu8(level[0], const_3);
59   level[1] = _mm_min_epu8(level[1], const_3);
60   level[2] = _mm_min_epu8(level[2], const_3);
61   level[3] = _mm_min_epu8(level[3], const_3);
62   level[4] = _mm_min_epu8(level[4], const_3);
63   count = _mm_add_epi8(count, level[1]);
64   count = _mm_add_epi8(count, level[2]);
65   count = _mm_add_epi8(count, level[3]);
66   count = _mm_add_epi8(count, level[4]);
67   count = _mm_avg_epu8(count, _mm_setzero_si128());
68   count = _mm_min_epu8(count, const_4);
69   return count;
70 }
71 
get_4_nz_map_contexts_2d(const uint8_t * levels,const int height,const ptrdiff_t * const offsets,int8_t * const coeff_contexts)72 static INLINE void get_4_nz_map_contexts_2d(const uint8_t *levels,
73                                             const int height,
74                                             const ptrdiff_t *const offsets,
75                                             int8_t *const coeff_contexts) {
76   const int stride = 4 + TX_PAD_HOR;
77   const __m128i pos_to_offset_large = _mm_set1_epi8(21);
78   __m128i pos_to_offset =
79       (height == 4)
80           ? _mm_setr_epi8(0, 1, 6, 6, 1, 6, 6, 21, 6, 6, 21, 21, 6, 21, 21, 21)
81           : _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 6, 6, 21, 21, 6, 21,
82                           21, 21);
83   __m128i count;
84   __m128i level[5];
85   int8_t *cc = coeff_contexts;
86   int row = height;
87 
88   assert(!(height % 4));
89 
90   do {
91     load_levels_4x4x5_sse2(levels, stride, offsets, level);
92     count = get_coeff_contexts_kernel_sse2(level);
93     count = _mm_add_epi8(count, pos_to_offset);
94     _mm_store_si128((__m128i *)cc, count);
95     pos_to_offset = pos_to_offset_large;
96     levels += 4 * stride;
97     cc += 16;
98     row -= 4;
99   } while (row);
100 
101   coeff_contexts[0] = 0;
102 }
103 
get_4_nz_map_contexts_hor(const uint8_t * levels,const int height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)104 static INLINE void get_4_nz_map_contexts_hor(const uint8_t *levels,
105                                              const int height,
106                                              const ptrdiff_t *const offsets,
107                                              int8_t *coeff_contexts) {
108   const int stride = 4 + TX_PAD_HOR;
109   const __m128i pos_to_offset =
110       _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
111                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
112                     SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
113                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
114                     SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
115                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
116                     SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
117                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
118   __m128i count;
119   __m128i level[5];
120   int row = height;
121 
122   assert(!(height % 4));
123 
124   do {
125     load_levels_4x4x5_sse2(levels, stride, offsets, level);
126     count = get_coeff_contexts_kernel_sse2(level);
127     count = _mm_add_epi8(count, pos_to_offset);
128     _mm_store_si128((__m128i *)coeff_contexts, count);
129     levels += 4 * stride;
130     coeff_contexts += 16;
131     row -= 4;
132   } while (row);
133 }
134 
get_4_nz_map_contexts_ver(const uint8_t * levels,const int height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)135 static INLINE void get_4_nz_map_contexts_ver(const uint8_t *levels,
136                                              const int height,
137                                              const ptrdiff_t *const offsets,
138                                              int8_t *coeff_contexts) {
139   const int stride = 4 + TX_PAD_HOR;
140   const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
141   __m128i pos_to_offset =
142       _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
143                     SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
144                     SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
145                     SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
146                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
147                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
148                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
149                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
150   __m128i count;
151   __m128i level[5];
152   int row = height;
153 
154   assert(!(height % 4));
155 
156   do {
157     load_levels_4x4x5_sse2(levels, stride, offsets, level);
158     count = get_coeff_contexts_kernel_sse2(level);
159     count = _mm_add_epi8(count, pos_to_offset);
160     _mm_store_si128((__m128i *)coeff_contexts, count);
161     pos_to_offset = pos_to_offset_large;
162     levels += 4 * stride;
163     coeff_contexts += 16;
164     row -= 4;
165   } while (row);
166 }
167 
get_8_coeff_contexts_2d(const uint8_t * levels,const int height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)168 static INLINE void get_8_coeff_contexts_2d(const uint8_t *levels,
169                                            const int height,
170                                            const ptrdiff_t *const offsets,
171                                            int8_t *coeff_contexts) {
172   const int stride = 8 + TX_PAD_HOR;
173   int8_t *cc = coeff_contexts;
174   int row = height;
175   __m128i count;
176   __m128i level[5];
177   __m128i pos_to_offset[3];
178 
179   assert(!(height % 2));
180 
181   if (height == 8) {
182     pos_to_offset[0] =
183         _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 1, 6, 6, 21, 21, 21, 21, 21);
184     pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
185                                      21, 21, 21, 21, 21);
186   } else if (height < 8) {
187     pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 16, 16, 6, 21,
188                                      21, 21, 21, 21);
189     pos_to_offset[1] = _mm_setr_epi8(16, 16, 21, 21, 21, 21, 21, 21, 16, 16, 21,
190                                      21, 21, 21, 21, 21);
191   } else {
192     pos_to_offset[0] = _mm_setr_epi8(0, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
193                                      11, 11, 11, 11, 11);
194     pos_to_offset[1] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 6, 21, 21,
195                                      21, 21, 21, 21, 21);
196   }
197   pos_to_offset[2] = _mm_set1_epi8(21);
198 
199   do {
200     load_levels_8x2x5_sse2(levels, stride, offsets, level);
201     count = get_coeff_contexts_kernel_sse2(level);
202     count = _mm_add_epi8(count, pos_to_offset[0]);
203     _mm_store_si128((__m128i *)cc, count);
204     pos_to_offset[0] = pos_to_offset[1];
205     pos_to_offset[1] = pos_to_offset[2];
206     levels += 2 * stride;
207     cc += 16;
208     row -= 2;
209   } while (row);
210 
211   coeff_contexts[0] = 0;
212 }
213 
get_8_coeff_contexts_hor(const uint8_t * levels,const int height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)214 static INLINE void get_8_coeff_contexts_hor(const uint8_t *levels,
215                                             const int height,
216                                             const ptrdiff_t *const offsets,
217                                             int8_t *coeff_contexts) {
218   const int stride = 8 + TX_PAD_HOR;
219   const __m128i pos_to_offset =
220       _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
221                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
222                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
223                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
224                     SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
225                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
226                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
227                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
228   int row = height;
229   __m128i count;
230   __m128i level[5];
231 
232   assert(!(height % 2));
233 
234   do {
235     load_levels_8x2x5_sse2(levels, stride, offsets, level);
236     count = get_coeff_contexts_kernel_sse2(level);
237     count = _mm_add_epi8(count, pos_to_offset);
238     _mm_store_si128((__m128i *)coeff_contexts, count);
239     levels += 2 * stride;
240     coeff_contexts += 16;
241     row -= 2;
242   } while (row);
243 }
244 
get_8_coeff_contexts_ver(const uint8_t * levels,const int height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)245 static INLINE void get_8_coeff_contexts_ver(const uint8_t *levels,
246                                             const int height,
247                                             const ptrdiff_t *const offsets,
248                                             int8_t *coeff_contexts) {
249   const int stride = 8 + TX_PAD_HOR;
250   const __m128i pos_to_offset_large = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
251   __m128i pos_to_offset =
252       _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
253                     SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
254                     SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
255                     SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 0,
256                     SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
257                     SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
258                     SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5,
259                     SIG_COEF_CONTEXTS_2D + 5, SIG_COEF_CONTEXTS_2D + 5);
260   int row = height;
261   __m128i count;
262   __m128i level[5];
263 
264   assert(!(height % 2));
265 
266   do {
267     load_levels_8x2x5_sse2(levels, stride, offsets, level);
268     count = get_coeff_contexts_kernel_sse2(level);
269     count = _mm_add_epi8(count, pos_to_offset);
270     _mm_store_si128((__m128i *)coeff_contexts, count);
271     pos_to_offset = pos_to_offset_large;
272     levels += 2 * stride;
273     coeff_contexts += 16;
274     row -= 2;
275   } while (row);
276 }
277 
get_16n_coeff_contexts_2d(const uint8_t * levels,const int real_width,const int real_height,const int width,const int height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)278 static INLINE void get_16n_coeff_contexts_2d(const uint8_t *levels,
279                                              const int real_width,
280                                              const int real_height,
281                                              const int width, const int height,
282                                              const ptrdiff_t *const offsets,
283                                              int8_t *coeff_contexts) {
284   const int stride = width + TX_PAD_HOR;
285   int8_t *cc = coeff_contexts;
286   int row = height;
287   __m128i pos_to_offset[5];
288   __m128i pos_to_offset_large[3];
289   __m128i count;
290   __m128i level[5];
291 
292   assert(!(width % 16));
293 
294   pos_to_offset_large[2] = _mm_set1_epi8(21);
295   if (real_width == real_height) {
296     pos_to_offset[0] = _mm_setr_epi8(0, 1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
297                                      21, 21, 21, 21);
298     pos_to_offset[1] = _mm_setr_epi8(1, 6, 6, 21, 21, 21, 21, 21, 21, 21, 21,
299                                      21, 21, 21, 21, 21);
300     pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
301                                      21, 21, 21, 21, 21);
302     pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
303                                      21, 21, 21, 21, 21);
304     pos_to_offset[4] = pos_to_offset_large[0] = pos_to_offset_large[1] =
305         pos_to_offset_large[2];
306   } else if (real_width > real_height) {
307     pos_to_offset[0] = _mm_setr_epi8(0, 16, 6, 6, 21, 21, 21, 21, 21, 21, 21,
308                                      21, 21, 21, 21, 21);
309     pos_to_offset[1] = _mm_setr_epi8(16, 16, 6, 21, 21, 21, 21, 21, 21, 21, 21,
310                                      21, 21, 21, 21, 21);
311     pos_to_offset[2] = pos_to_offset[3] = pos_to_offset[4] = _mm_setr_epi8(
312         16, 16, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21);
313     pos_to_offset_large[0] = pos_to_offset_large[1] = pos_to_offset_large[2];
314   } else {  // real_width < real_height
315     pos_to_offset[0] = pos_to_offset[1] = _mm_setr_epi8(
316         11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11);
317     pos_to_offset[2] = _mm_setr_epi8(6, 6, 21, 21, 21, 21, 21, 21, 21, 21, 21,
318                                      21, 21, 21, 21, 21);
319     pos_to_offset[3] = _mm_setr_epi8(6, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
320                                      21, 21, 21, 21, 21);
321     pos_to_offset[4] = pos_to_offset_large[2];
322     pos_to_offset_large[0] = pos_to_offset_large[1] = _mm_set1_epi8(11);
323   }
324 
325   do {
326     int w = width;
327 
328     do {
329       load_levels_16x1x5_sse2(levels, stride, offsets, level);
330       count = get_coeff_contexts_kernel_sse2(level);
331       count = _mm_add_epi8(count, pos_to_offset[0]);
332       _mm_store_si128((__m128i *)cc, count);
333       levels += 16;
334       cc += 16;
335       w -= 16;
336       pos_to_offset[0] = pos_to_offset_large[0];
337     } while (w);
338 
339     pos_to_offset[0] = pos_to_offset[1];
340     pos_to_offset[1] = pos_to_offset[2];
341     pos_to_offset[2] = pos_to_offset[3];
342     pos_to_offset[3] = pos_to_offset[4];
343     pos_to_offset_large[0] = pos_to_offset_large[1];
344     pos_to_offset_large[1] = pos_to_offset_large[2];
345     levels += TX_PAD_HOR;
346   } while (--row);
347 
348   coeff_contexts[0] = 0;
349 }
350 
get_16n_coeff_contexts_hor(const uint8_t * levels,const int width,const int height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)351 static INLINE void get_16n_coeff_contexts_hor(const uint8_t *levels,
352                                               const int width, const int height,
353                                               const ptrdiff_t *const offsets,
354                                               int8_t *coeff_contexts) {
355   const int stride = width + TX_PAD_HOR;
356   const __m128i pos_to_offset_large =
357       _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
358                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
359                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
360                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
361                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
362                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
363                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
364                     SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
365   __m128i count;
366   __m128i level[5];
367   int row = height;
368 
369   assert(!(width % 16));
370 
371   do {
372     __m128i pos_to_offset =
373         _mm_setr_epi8(SIG_COEF_CONTEXTS_2D + 0, SIG_COEF_CONTEXTS_2D + 5,
374                       SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
375                       SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
376                       SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
377                       SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
378                       SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
379                       SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10,
380                       SIG_COEF_CONTEXTS_2D + 10, SIG_COEF_CONTEXTS_2D + 10);
381     int w = width;
382 
383     do {
384       load_levels_16x1x5_sse2(levels, stride, offsets, level);
385       count = get_coeff_contexts_kernel_sse2(level);
386       count = _mm_add_epi8(count, pos_to_offset);
387       _mm_store_si128((__m128i *)coeff_contexts, count);
388       pos_to_offset = pos_to_offset_large;
389       levels += 16;
390       coeff_contexts += 16;
391       w -= 16;
392     } while (w);
393 
394     levels += TX_PAD_HOR;
395   } while (--row);
396 }
397 
get_16n_coeff_contexts_ver(const uint8_t * levels,const int width,const int height,const ptrdiff_t * const offsets,int8_t * coeff_contexts)398 static INLINE void get_16n_coeff_contexts_ver(const uint8_t *levels,
399                                               const int width, const int height,
400                                               const ptrdiff_t *const offsets,
401                                               int8_t *coeff_contexts) {
402   const int stride = width + TX_PAD_HOR;
403   __m128i pos_to_offset[3];
404   __m128i count;
405   __m128i level[5];
406   int row = height;
407 
408   assert(!(width % 16));
409 
410   pos_to_offset[0] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 0);
411   pos_to_offset[1] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 5);
412   pos_to_offset[2] = _mm_set1_epi8(SIG_COEF_CONTEXTS_2D + 10);
413 
414   do {
415     int w = width;
416 
417     do {
418       load_levels_16x1x5_sse2(levels, stride, offsets, level);
419       count = get_coeff_contexts_kernel_sse2(level);
420       count = _mm_add_epi8(count, pos_to_offset[0]);
421       _mm_store_si128((__m128i *)coeff_contexts, count);
422       levels += 16;
423       coeff_contexts += 16;
424       w -= 16;
425     } while (w);
426 
427     pos_to_offset[0] = pos_to_offset[1];
428     pos_to_offset[1] = pos_to_offset[2];
429     levels += TX_PAD_HOR;
430   } while (--row);
431 }
432 
433 // Note: levels[] must be in the range [0, 127], inclusive.
av1_get_nz_map_contexts_sse2(const uint8_t * const levels,const int16_t * const scan,const uint16_t eob,const TX_SIZE tx_size,const TX_CLASS tx_class,int8_t * const coeff_contexts)434 void av1_get_nz_map_contexts_sse2(const uint8_t *const levels,
435                                   const int16_t *const scan, const uint16_t eob,
436                                   const TX_SIZE tx_size,
437                                   const TX_CLASS tx_class,
438                                   int8_t *const coeff_contexts) {
439   const int last_idx = eob - 1;
440   if (!last_idx) {
441     coeff_contexts[0] = 0;
442     return;
443   }
444 
445   const int real_width = tx_size_wide[tx_size];
446   const int real_height = tx_size_high[tx_size];
447   const int width = get_txb_wide(tx_size);
448   const int height = get_txb_high(tx_size);
449   const int stride = width + TX_PAD_HOR;
450   ptrdiff_t offsets[3];
451 
452   /* coeff_contexts must be 16 byte aligned. */
453   assert(!((intptr_t)coeff_contexts & 0xf));
454 
455   if (tx_class == TX_CLASS_2D) {
456     offsets[0] = 0 * stride + 2;
457     offsets[1] = 1 * stride + 1;
458     offsets[2] = 2 * stride + 0;
459 
460     if (width == 4) {
461       get_4_nz_map_contexts_2d(levels, height, offsets, coeff_contexts);
462     } else if (width == 8) {
463       get_8_coeff_contexts_2d(levels, height, offsets, coeff_contexts);
464     } else if (width == 16) {
465       get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
466                                 offsets, coeff_contexts);
467     } else {
468       get_16n_coeff_contexts_2d(levels, real_width, real_height, width, height,
469                                 offsets, coeff_contexts);
470     }
471   } else if (tx_class == TX_CLASS_HORIZ) {
472     offsets[0] = 2;
473     offsets[1] = 3;
474     offsets[2] = 4;
475     if (width == 4) {
476       get_4_nz_map_contexts_hor(levels, height, offsets, coeff_contexts);
477     } else if (width == 8) {
478       get_8_coeff_contexts_hor(levels, height, offsets, coeff_contexts);
479     } else {
480       get_16n_coeff_contexts_hor(levels, width, height, offsets,
481                                  coeff_contexts);
482     }
483   } else {  // TX_CLASS_VERT
484     offsets[0] = 2 * stride;
485     offsets[1] = 3 * stride;
486     offsets[2] = 4 * stride;
487     if (width == 4) {
488       get_4_nz_map_contexts_ver(levels, height, offsets, coeff_contexts);
489     } else if (width == 8) {
490       get_8_coeff_contexts_ver(levels, height, offsets, coeff_contexts);
491     } else {
492       get_16n_coeff_contexts_ver(levels, width, height, offsets,
493                                  coeff_contexts);
494     }
495   }
496 
497   const int bwl = get_txb_bwl(tx_size);
498   const int pos = scan[last_idx];
499   if (last_idx <= (height << bwl) / 8)
500     coeff_contexts[pos] = 1;
501   else if (last_idx <= (height << bwl) / 4)
502     coeff_contexts[pos] = 2;
503   else
504     coeff_contexts[pos] = 3;
505 }
506