/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_sao_atom_intr.c
*
* @brief
*  Contains function definitions for Sample Adaptive Offset (SAO) in-loop
*  filtering
*
* @author
*  100592
*
* @par List of Functions:
*   - ihevc_sao_band_offset_luma_ssse3()
*   - ihevc_sao_band_offset_chroma_ssse3()
*   - ihevc_sao_edge_offset_class0_ssse3()
*   - ihevc_sao_edge_offset_class0_chroma_ssse3()
*   - ihevc_sao_edge_offset_class1_ssse3()
*   - ihevc_sao_edge_offset_class1_chroma_ssse3()
*   - ihevc_sao_edge_offset_class2_ssse3()
*   - ihevc_sao_edge_offset_class2_chroma_ssse3()
*   - ihevc_sao_edge_offset_class3_ssse3()
*   - ihevc_sao_edge_offset_class3_chroma_ssse3()
*
* @remarks
*  None
*
*******************************************************************************
*/
/*****************************************************************************/
/* File Includes                                                             */
/*****************************************************************************/
#include <stdio.h>

#include "ihevc_typedefs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_macros.h"
#include "ihevc_func_selector.h"
#include "ihevc_defs.h"
#include "ihevc_tables_x86_intr.h"
#include "ihevc_common_tables.h"
#include "ihevc_sao.h"

#include <immintrin.h>

#define NUM_BAND_TABLE 32
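
/*
 * Band offset in scalar form, for reference (this helper is an illustrative
 * sketch, not part of the library): each 8-bit pixel belongs to one of
 * NUM_BAND_TABLE (32) bands of width 8, i.e. band = pixel >> 3. The four
 * consecutive bands starting at sao_band_pos get offsets pi1_sao_offset[1..4]
 * added; band indices past 31 wrap around modulo 32.
 *
 *  static UWORD8 sao_band_pixel(UWORD8 pixel, WORD32 sao_band_pos,
 *                               WORD8 *pi1_sao_offset)
 *  {
 *      WORD32 rel = ((pixel >> 3) - sao_band_pos) & (NUM_BAND_TABLE - 1);
 *      if(rel < 4)
 *      {
 *          WORD32 v = pixel + pi1_sao_offset[rel + 1];
 *          return (UWORD8)((v < 0) ? 0 : ((v > 255) ? 255 : v));
 *      }
 *      return pixel;
 *  }
 */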
/**
*******************************************************************************
*
* @brief
*  Has two sets of functions: band offset and edge offset, both for luma and
*  chroma. Edge offset has horizontal, vertical, 135 degree and 45 degree
*  variants.
*
* @par Description:
*
*
* @param[in-out] pu1_src
*  Pointer to the source
*
* @param[in] src_strd
*  Source stride
*
* @param[in-out] pu1_src_left
*  Source left boundary
*
* @param[in-out] pu1_src_top
*  Source top boundary
*
* @param[in-out] pu1_src_top_left
*  Source top left boundary
*
* @param[in] pu1_src_top_right
*  Source top right boundary
*
* @param[in] pu1_src_bot_left
*  Source bottom left boundary
*
* @param[in] pu1_avail
*  Boundary availability flags
*
* @param[in] pi1_sao_offset_u
*  Chroma U SAO offset values
*
* @param[in] pi1_sao_offset_v
*  Chroma V SAO offset values
*
* @param[in] pi1_sao_offset
*  Luma SAO offset values
*
* @param[in] wd
*  Width of the source
*
* @param[in] ht
*  Height of the source
*
* @returns
*
* @remarks
*  None
*
*******************************************************************************
*/


void ihevc_sao_band_offset_luma_ssse3(UWORD8 *pu1_src,
                                      WORD32 src_strd,
                                      UWORD8 *pu1_src_left,
                                      UWORD8 *pu1_src_top,
                                      UWORD8 *pu1_src_top_left,
                                      WORD32 sao_band_pos,
                                      WORD8 *pi1_sao_offset,
                                      WORD32 wd,
                                      WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_cpy;
    WORD32 wd_rem;
    WORD8 offset = 0;

    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
    __m128i band_table0_8x16b, band_table1_8x16b, band_table2_8x16b, band_table3_8x16b;
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
    __m128i band_pos_16x8b;
    __m128i sao_offset;
    __m128i cmp_mask, cmp_store;

    /* Updating left, top-left and top */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
    }
    pu1_src_top_left[0] = pu1_src_top[wd - 1];
    for(col = 0; col < wd; col += 8)
    {
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
        offset += 8;
    }

    //replicating sao_band_pos as 8 bit value 16 times
    band_pos_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos << 3));
    //value set for sao_offset extraction
    tmp_set_128i_1 = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
    tmp_set_128i_2 = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
    tmp_set_128i_3 = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
    tmp_set_128i_4 = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);

    //loaded sao offset values
    sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    //loading 32 16-bit values of gu2_table_band_idx consecutively into 4 registers
    band_table0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
    band_table1_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
    band_table2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
    band_table3_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));

    //band_position addition
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, band_pos_16x8b);
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, band_pos_16x8b);
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, band_pos_16x8b);
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, band_pos_16x8b);
    //sao_offset duplication
    tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
    tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
    tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
    tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);
    //setting for comparison
    cmp_mask = _mm_set1_epi16(16);
    cmp_store = _mm_set1_epi16(0x00ff);

    //sao_offset addition
    band_table0_8x16b = _mm_add_epi16(band_table0_8x16b, tmp_set_128i_1);
    band_table1_8x16b = _mm_add_epi16(band_table1_8x16b, tmp_set_128i_2);
    band_table2_8x16b = _mm_add_epi16(band_table2_8x16b, tmp_set_128i_3);
    band_table3_8x16b = _mm_add_epi16(band_table3_8x16b, tmp_set_128i_4);
    //masking upper 8 bits of each 16-bit band table value
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);

    switch(sao_band_pos)
    {
        case 0:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
            band_table0_8x16b = _mm_and_si128(band_table0_8x16b, tmp_set_128i_2);
            break;
        case 28:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
            band_table3_8x16b = _mm_or_si128(band_table3_8x16b, tmp_set_128i_2);
            break;
        case 29:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
            band_table2_8x16b = _mm_or_si128(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table3_8x16b);
            band_table3_8x16b = _mm_and_si128(band_table3_8x16b, tmp_set_128i_2);
            break;
        case 30:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
            band_table1_8x16b = _mm_or_si128(band_table1_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table2_8x16b);
            band_table2_8x16b = _mm_and_si128(band_table2_8x16b, tmp_set_128i_2);
            break;
        case 31:
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table0_8x16b);
            band_table0_8x16b = _mm_or_si128(band_table0_8x16b, tmp_set_128i_2);
            tmp_set_128i_2 = _mm_cmpgt_epi16(cmp_mask, band_table1_8x16b);
            band_table1_8x16b = _mm_and_si128(band_table1_8x16b, tmp_set_128i_2);
            break;
        default:
            break;
    }
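    /*
     * The switch above patches the table entries only for the positions where
     * the four-band window meets the ends of the 32-band range: sao_band_pos 0
     * touches band 0, and 28..31 are the positions where the window wraps past
     * band 31 (HEVC band indices wrap modulo 32). The compare masks force the
     * affected entries so the byte-shuffle lookup in the loops below leaves
     * out-of-window pixels unchanged (interpretation of the original code).
     */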
    //sao_offset is reused for zero cmp mask.
    sao_offset = _mm_setzero_si128();
    tmp_set_128i_1 = _mm_set1_epi8(1);
    //tmp_set_128i_2 = _mm_set_epi8(128, 7, 128, 6, 128, 5, 128, 4, 128, 3, 128, 2, 128, 1, 128, 0);
    cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk = dup16(16);

    //masking upper 8 bits of each 16-bit band table value
    band_table0_8x16b = _mm_and_si128(band_table0_8x16b, cmp_store);
    band_table1_8x16b = _mm_and_si128(band_table1_8x16b, cmp_store);
    band_table2_8x16b = _mm_and_si128(band_table2_8x16b, cmp_store);
    band_table3_8x16b = _mm_and_si128(band_table3_8x16b, cmp_store);

    //the four 8x16 band table registers are packed into two 16x8 registers: band_table0_8x16b and band_table2_8x16b
    band_table0_8x16b = _mm_packus_epi16(band_table0_8x16b, band_table1_8x16b);
    band_table2_8x16b = _mm_packus_epi16(band_table2_8x16b, band_table3_8x16b);

    band_table3_8x16b = _mm_slli_epi16(cmp_mask, 1); //to compare if value is greater than 31
    band_pos_16x8b = _mm_packus_epi16(band_pos_16x8b, band_pos_16x8b); //band_pos is now 8 bit aligned
    band_table3_8x16b = _mm_sub_epi8(band_table3_8x16b, tmp_set_128i_1); //to compare if value is greater than 31

    cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);
    //band_pos_16x8b = _mm_or_si128(band_pos_16x8b, cmp_store);

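    /*
     * What the vector loops below compute per pixel (scalar sketch,
     * illustrative):
     *
     *  rel = (WORD8)(pixel - (sao_band_pos << 3));  //window-relative value
     *  if(rel < 0 || rel > 31)  keep pixel;         //outside the 4 bands
     *  else if(rel < 16)        pixel = lo_table[rel];
     *  else                     pixel = hi_table[rel - 16];
     *
     * Out-of-window lanes are driven to 0xFF: pshufb returns 0 for indices
     * with the top bit set, and the cmpeq/and steps keep the original pixel
     * in those lanes. The 8-bit wrap of the subtraction also realises the
     * modulo-32 band wrap for sao_band_pos >= 28.
     */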
    for(col = wd; col >= 16; col -= 16)
    {
        pu1_src_cpy = pu1_src;
        for(row = ht; row > 0; row -= 2)
        {

            //row = 0: load 16 pixel values from 15:0 pos. relative to cur. pos.
            src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
            // row = 1
            src_temp2_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));


            //8 bit subtract
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
            //if the value is less than 0, put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            //if the value is greater than 31, put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);


            //row 0
            //if the value is greater than 15, put ff; cmp_mask = dup16(15)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
            //values 16 to 31 for row 0, but values <16 == 0
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
            //values 0 to 15 for row 0
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
            //values 16 to 31 for row 0, but values <16 masked to ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
            //row 1
            //if the value is greater than 15, put ff; cmp_mask = dup16(15)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
            //values 16 to 31 for row 1, but values <16 == 0
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
            //values 0 to 15 for row 1
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
            //values 16 to 31 for row 1, but values <16 masked to ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);

            //row 0
            //to preserve pixel values in which no offset needs to be added.
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);

            //row 1
            //to preserve pixel values in which no offset needs to be added.
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);

            //indexing 0 - 15 band table indexes
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
            //combining all offset results
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            //combining results with the pixel values
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);


            //row = 0: store 16 pixel values to 15:0 pos. relative to cur. pos.
            _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
            // row = 1
            _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp2_8x16b);

            pu1_src_cpy += (src_strd << 1);
        }
        pu1_src += 16;
    }
    wd_rem = wd & 0xF;
    if(wd_rem)
    {
        pu1_src_cpy = pu1_src;
        for(row = ht; row > 0; row -= 4)
        {

            //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
            src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
            // row = 1
            src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
            // row = 2
            src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
            // row = 3
            src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
            //row0 and row1 packed, row2 and row3 packed

            src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
            src_temp2_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);

            //8 bit subtract
            tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_16x8b);
            tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_16x8b);
            //if the value is less than 0, put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
            tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            //if the value is greater than 31, put ff
            tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, band_table3_8x16b);
            tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, band_table3_8x16b);
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);



            //row 0 and row 1
            //if the value is greater than 15, put ff; cmp_mask = dup16(15)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
            //values 16 to 31 for row 0 & 1, but values <16 == 0
            tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, cmp_store);
            //values 0 to 15 for row 0 & 1
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, cmp_store);
            //values 16 to 31 for row 0 & 1, but values <16 masked to ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
            tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, cmp_store);
            //row 2 and row 3
            //if the value is greater than 15, put ff; cmp_mask = dup16(15)
            cmp_store = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);
            //values 16 to 31 for row 2 & 3, but values <16 == 0
            tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, cmp_store);
            //values 0 to 15 for row 2 & 3
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, cmp_store);
            //values 16 to 31 for row 2 & 3, but values <16 masked to ff
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
            tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, cmp_store);

            //row 0 and row 1
            //to preserve pixel values in which no offset needs to be added.
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
            src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, cmp_store);

            //row 2 and row 3
            //to preserve pixel values in which no offset needs to be added.
            cmp_store = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
            src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, cmp_store);

            //indexing 0 - 15 band table indexes
            tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_1);
            tmp_set_128i_3 = _mm_shuffle_epi8(band_table0_8x16b, tmp_set_128i_3);
            tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_2);
            tmp_set_128i_4 = _mm_shuffle_epi8(band_table2_8x16b, tmp_set_128i_4);
            //combining all offset results
            tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
            tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
            //combining results with the pixel values
            src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
            src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);

            //getting row 1 separately
            src_temp1_8x16b = _mm_srli_si128(src_temp0_8x16b, 8);
            //getting row 3 separately
            src_temp3_8x16b = _mm_srli_si128(src_temp2_8x16b, 8);

            //row = 0 store 8 pixel values to 7:0 pos. relative to cur. pos.
            _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_8x16b);
            // row = 1
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp1_8x16b);
            // row = 2
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp2_8x16b);
            // row = 3
            _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp3_8x16b);

            pu1_src_cpy += (src_strd << 2);

        }
        pu1_src += 8;
    }


}

void ihevc_sao_band_offset_chroma_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        WORD32 sao_band_pos_u,
                                        WORD32 sao_band_pos_v,
                                        WORD8 *pi1_sao_offset_u,
                                        WORD8 *pi1_sao_offset_v,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    WORD8 offset = 0;


    __m128i src_temp0_8x16b, src_temp1_8x16b, src_temp2_8x16b, src_temp3_8x16b;
    __m128i cmp_msk2;
    __m128i band_table0_16x8b, band_table1_16x8b, band_table2_16x8b, band_table3_16x8b;
    __m128i tmp_set_128i_1, tmp_set_128i_2, tmp_set_128i_3, tmp_set_128i_4;
    __m128i band_pos_u_16x8b, band_pos_v_16x8b;
    __m128i sao_offset;
    __m128i cmp_mask;


    /* Updating left, top and top-left */
    for(row = 0; row < ht; row++)
    {
        pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
        pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
    }
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];
    for(col = 0; col < wd; col += 8)
    {
        tmp_set_128i_1 = _mm_loadl_epi64((__m128i *)(pu1_src + (ht - 1) * src_strd + offset));
        _mm_storel_epi64((__m128i *)(pu1_src_top + offset), tmp_set_128i_1);
        offset += 8;
    }
    { //band table creation
        __m128i temp0_8x16b, temp1_8x16b, temp2_8x16b, temp3_8x16b;
        // Band table for U component : band_table0_16x8b and band_table2_16x8b
        //replicating sao_band_pos as 8 bit value 16 times
        band_pos_u_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_u << 3));
        //value set for sao_offset extraction
        tmp_set_128i_1 = _mm_set_epi8(128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1, 128, 1);
        tmp_set_128i_2 = _mm_set_epi8(128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2, 128, 2);
        tmp_set_128i_3 = _mm_set_epi8(128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3, 128, 3);
        tmp_set_128i_4 = _mm_set_epi8(128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4, 128, 4);

        //loaded sao offset values
        sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);

        //loading 32 16-bit values of gu2_table_band_idx consecutively into 4 registers
        band_table0_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
        band_table2_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));

        //band_position addition
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, band_pos_u_16x8b);
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_u_16x8b);
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, band_pos_u_16x8b);
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_u_16x8b);
        //sao_offset duplication
        temp0_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
        temp1_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
        temp2_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
        temp3_8x16b = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);

        //sao_offset addition
        band_table0_16x8b = _mm_add_epi16(band_table0_16x8b, temp0_8x16b);
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, temp1_8x16b);
        band_table2_16x8b = _mm_add_epi16(band_table2_16x8b, temp2_8x16b);
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, temp3_8x16b);
        //reuse for clipping
        temp1_8x16b = _mm_set1_epi16(0x00ff);
        //setting for comparison
        cmp_mask = _mm_set1_epi16(16);

        //masking upper 8 bits of each 16-bit band table value
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);

        //temp1_8x16b reused for compare storage
        switch(sao_band_pos_u)
        {
            case 0:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
                band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp3_8x16b);
                break;
            case 28:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
                break;
            case 29:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
                band_table2_16x8b = _mm_or_si128(band_table2_16x8b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
                break;
            case 30:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table2_16x8b);
                band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp3_8x16b);
                break;
            case 31:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table0_16x8b);
                band_table0_16x8b = _mm_or_si128(band_table0_16x8b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
                break;
            default:
                break;
        }
        //masking upper 8 bits of each 16-bit band table value
        band_table0_16x8b = _mm_and_si128(band_table0_16x8b, temp1_8x16b);
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
        band_table2_16x8b = _mm_and_si128(band_table2_16x8b, temp1_8x16b);
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
        //the four 8x16 band table registers are packed into two 16x8 registers: band_table0_16x8b and band_table2_16x8b
        band_table0_16x8b = _mm_packus_epi16(band_table0_16x8b, band_table1_16x8b);
        band_table2_16x8b = _mm_packus_epi16(band_table2_16x8b, band_table3_16x8b);
        // Band table for U component over

        // Band table for V component : band_table1_16x8b and band_table3_16x8b
        //replicating sao_band_pos as 8 bit value 16 times
        band_pos_v_16x8b = _mm_set1_epi16((WORD16)(sao_band_pos_v << 3));

        //loaded sao offset values
        sao_offset = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);

        //loading 32 16-bit values of gu2_table_band_idx consecutively into 4 registers
        temp0_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx));
        band_table1_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 8));
        temp2_8x16b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 16));
        band_table3_16x8b = _mm_load_si128((__m128i *)(gu2_table_band_idx + 24));

        //band_position addition
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, band_pos_v_16x8b);
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, band_pos_v_16x8b);
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, band_pos_v_16x8b);
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, band_pos_v_16x8b);
        //sao_offset duplication
        tmp_set_128i_1 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_1);
        tmp_set_128i_2 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_2);
        tmp_set_128i_3 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_3);
        tmp_set_128i_4 = _mm_shuffle_epi8(sao_offset, tmp_set_128i_4);

        //sao_offset addition
        temp0_8x16b = _mm_add_epi16(temp0_8x16b, tmp_set_128i_1);
        band_table1_16x8b = _mm_add_epi16(band_table1_16x8b, tmp_set_128i_2);
        temp2_8x16b = _mm_add_epi16(temp2_8x16b, tmp_set_128i_3);
        band_table3_16x8b = _mm_add_epi16(band_table3_16x8b, tmp_set_128i_4);

        //masking upper 8 bits of each 16-bit band table value
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
        //temp1_8x16b reused for compare storage

        switch(sao_band_pos_v)
        {
            case 0:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
                temp0_8x16b = _mm_and_si128(temp0_8x16b, temp3_8x16b);
                break;
            case 28:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
                band_table3_16x8b = _mm_or_si128(band_table3_16x8b, temp3_8x16b);
                break;
            case 29:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
                temp2_8x16b = _mm_or_si128(temp2_8x16b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table3_16x8b);
                band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp3_8x16b);
                break;
            case 30:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
                band_table1_16x8b = _mm_or_si128(band_table1_16x8b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp2_8x16b);
                temp2_8x16b = _mm_and_si128(temp2_8x16b, temp3_8x16b);
                break;
            case 31:
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, temp0_8x16b);
                temp0_8x16b = _mm_or_si128(temp0_8x16b, temp3_8x16b);
                temp3_8x16b = _mm_cmpgt_epi16(cmp_mask, band_table1_16x8b);
                band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp3_8x16b);
                break;
            default:
                break;
        }
        //masking upper 8 bits of each 16-bit band table value
        temp0_8x16b = _mm_and_si128(temp0_8x16b, temp1_8x16b);
        band_table1_16x8b = _mm_and_si128(band_table1_16x8b, temp1_8x16b);
        temp2_8x16b = _mm_and_si128(temp2_8x16b, temp1_8x16b);
        band_table3_16x8b = _mm_and_si128(band_table3_16x8b, temp1_8x16b);
        //the four 8x16 band table registers are packed into two 16x8 registers: band_table1_16x8b and band_table3_16x8b
        band_table1_16x8b = _mm_packus_epi16(temp0_8x16b, band_table1_16x8b);
        band_table3_16x8b = _mm_packus_epi16(temp2_8x16b, band_table3_16x8b);
        //band tables for U and V created
    }
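    /*
     * Chroma band tables: U and V carry independent band positions and
     * offsets, so two table pairs are built above. U lives in
     * band_table0_16x8b/band_table2_16x8b and V in
     * band_table1_16x8b/band_table3_16x8b (low 16 and high 16 window values
     * respectively).
     */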
    {
        UWORD8 *pu1_src_cpy;
        WORD32 wd_rem;


        //sao_offset is reused for zero cmp mask.
        sao_offset = _mm_setzero_si128();
        tmp_set_128i_1 = _mm_set1_epi8(1);
        //tmp_set_128i_2 = _mm_set_epi8(128, 7, 128, 6, 128, 5, 128, 4, 128, 3, 128, 2, 128, 1, 128, 0);
        cmp_mask = _mm_packus_epi16(cmp_mask, cmp_mask); //cmp_msk = dup16(16);
        //to avoid ffff being saturated to 0; it should saturate to ff

        cmp_msk2 = _mm_slli_epi16(cmp_mask, 1); //to compare if value is greater than 31
        band_pos_u_16x8b = _mm_packus_epi16(band_pos_u_16x8b, band_pos_u_16x8b); //band_pos_u is now 8 bit aligned
        band_pos_v_16x8b = _mm_packus_epi16(band_pos_v_16x8b, band_pos_v_16x8b); //band_pos_v is now 8 bit aligned
        cmp_msk2 = _mm_sub_epi8(cmp_msk2, tmp_set_128i_1); //to compare if value is greater than 31

        cmp_mask = _mm_sub_epi8(cmp_mask, tmp_set_128i_1);

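        /*
         * The rows are semi-planar chroma (U and V bytes interleaved). Each
         * loop below splits even bytes (U) and odd bytes (V) with 16-bit
         * shift/pack operations, classifies each plane against its own band
         * table, and re-interleaves with unpacklo/unpackhi before storing.
         */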
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            for(row = ht; row > 0; row -= 2)
            {
                //row = 0: load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_temp0_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
                // row = 1
                src_temp3_8x16b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));


                //odd values (V)
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
                //even values (U)
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
                //combining odd values (V of rows 0 & 1)
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
                //combining even values (U of rows 0 & 1)
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);

                //8 bit subtract
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
                //if the value is less than 0, put ff
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
                //if the value is greater than 31, put ff
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
                // registers reused to increase performance
                //if the value is greater than 15, put ff; cmp_mask = dup16(15) (U)
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
                //if the value is greater than 15, put ff; cmp_mask = dup16(15) (V)
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);

                //values 16 to 31 for U, but values <16 == 0
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
                //values 0 to 15 for U
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
                //values 16 to 31 for V, but values <16 == 0
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
                //values 0 to 15 for V
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);

                //values 16 to 31 for U, but values <16 masked to ff
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
                //values 16 to 31 for V, but values <16 masked to ff
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);


                //to choose which pixel values to preserve (U)
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
                //to choose which pixel values to preserve (V)
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
                //values to which no offset needs to be added are preserved.
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);

                //indexing 0 - 15 band table indexes
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
                //indexing 16 - 31 band table indexes
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
                //combining all offset results
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
                //combining results with the pixel values
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
                //reorganising even and odd values
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);


                //row = 0: store 16 pixel values to 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp3_8x16b);


                pu1_src_cpy += (src_strd << 1);

            }
            pu1_src += 16;
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {
            pu1_src_cpy = pu1_src;
            for(row = ht; row > 0; row -= 4)
            {
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
                // row = 1
                src_temp1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_temp2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
                // row = 3
                src_temp3_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
                //row0 and row1 packed, row2 and row3 packed

                src_temp0_8x16b = _mm_unpacklo_epi64(src_temp0_8x16b, src_temp1_8x16b);
                src_temp3_8x16b = _mm_unpacklo_epi64(src_temp2_8x16b, src_temp3_8x16b);
                //odd values (V)
                src_temp1_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
                src_temp2_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
                //even values (U)
                src_temp0_8x16b = _mm_slli_epi16(src_temp0_8x16b, 8);
                src_temp3_8x16b = _mm_slli_epi16(src_temp3_8x16b, 8);
                src_temp0_8x16b = _mm_srli_epi16(src_temp0_8x16b, 8);
                src_temp3_8x16b = _mm_srli_epi16(src_temp3_8x16b, 8);
                //combining odd values (V of rows 0-3)
                src_temp2_8x16b = _mm_packus_epi16(src_temp1_8x16b, src_temp2_8x16b);
                //combining even values (U of rows 0-3)
                src_temp0_8x16b = _mm_packus_epi16(src_temp0_8x16b, src_temp3_8x16b);

                //8 bit subtract
                tmp_set_128i_1 = _mm_sub_epi8(src_temp0_8x16b, band_pos_u_16x8b);
                tmp_set_128i_3 = _mm_sub_epi8(src_temp2_8x16b, band_pos_v_16x8b);
                //if the value is less than 0, put ff
                tmp_set_128i_2 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_1);
                tmp_set_128i_4 = _mm_cmpgt_epi8(sao_offset, tmp_set_128i_3);
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
                //if the value is greater than 31, put ff
                tmp_set_128i_2 = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_msk2);
                tmp_set_128i_4 = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_msk2);
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2);
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4);
                // registers reused to increase performance
                //if the value is greater than 15, put ff; cmp_mask = dup16(15) (U)
                src_temp1_8x16b = _mm_cmpgt_epi8(tmp_set_128i_1, cmp_mask);
                //if the value is greater than 15, put ff; cmp_mask = dup16(15) (V)
                src_temp3_8x16b = _mm_cmpgt_epi8(tmp_set_128i_3, cmp_mask);

                //values 16 to 31 for U, but values <16 == 0
                tmp_set_128i_2 = _mm_and_si128(tmp_set_128i_1, src_temp1_8x16b);
                //values 0 to 15 for U
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, src_temp1_8x16b);
                //values 16 to 31 for V, but values <16 == 0
                tmp_set_128i_4 = _mm_and_si128(tmp_set_128i_3, src_temp3_8x16b);
                //values 0 to 15 for V
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, src_temp3_8x16b);

                //values 16 to 31 for U, but values <16 masked to ff
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_2, sao_offset);
                //values 16 to 31 for V, but values <16 masked to ff
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_4, sao_offset);
                tmp_set_128i_2 = _mm_or_si128(tmp_set_128i_2, src_temp1_8x16b);
                tmp_set_128i_4 = _mm_or_si128(tmp_set_128i_4, src_temp3_8x16b);


                //to choose which pixel values to preserve (U)
                src_temp1_8x16b = _mm_cmpeq_epi8(tmp_set_128i_1, tmp_set_128i_2);
                //to choose which pixel values to preserve (V)
                src_temp3_8x16b = _mm_cmpeq_epi8(tmp_set_128i_3, tmp_set_128i_4);
                //values to which no offset needs to be added are preserved.
                src_temp0_8x16b = _mm_and_si128(src_temp0_8x16b, src_temp1_8x16b);
                src_temp2_8x16b = _mm_and_si128(src_temp2_8x16b, src_temp3_8x16b);

                //indexing 0 - 15 band table indexes
                tmp_set_128i_1 = _mm_shuffle_epi8(band_table0_16x8b, tmp_set_128i_1); //U low
                tmp_set_128i_3 = _mm_shuffle_epi8(band_table1_16x8b, tmp_set_128i_3); //V low
                //indexing 16 - 31 band table indexes
                tmp_set_128i_2 = _mm_shuffle_epi8(band_table2_16x8b, tmp_set_128i_2); //U high
                tmp_set_128i_4 = _mm_shuffle_epi8(band_table3_16x8b, tmp_set_128i_4); //V high
                //combining all offset results
                tmp_set_128i_1 = _mm_or_si128(tmp_set_128i_1, tmp_set_128i_2); //U
                tmp_set_128i_3 = _mm_or_si128(tmp_set_128i_3, tmp_set_128i_4); //V
                //combining results with the pixel values
                src_temp0_8x16b = _mm_or_si128(src_temp0_8x16b, tmp_set_128i_1);
                src_temp2_8x16b = _mm_or_si128(src_temp2_8x16b, tmp_set_128i_3);
                //reorganising even and odd values
                src_temp1_8x16b = _mm_unpacklo_epi8(src_temp0_8x16b, src_temp2_8x16b);
                src_temp3_8x16b = _mm_unpackhi_epi8(src_temp0_8x16b, src_temp2_8x16b);
                //getting row 1 separately
                src_temp0_8x16b = _mm_srli_si128(src_temp1_8x16b, 8);
                //getting row 3 separately
                src_temp2_8x16b = _mm_srli_si128(src_temp3_8x16b, 8);

                //row = 0 store 8 pixel values to 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp1_8x16b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), src_temp0_8x16b);
                // row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp3_8x16b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), src_temp2_8x16b);

                pu1_src_cpy += (src_strd << 2);

            }
            pu1_src += 16;
        }


    }
}


void ihevc_sao_edge_offset_class0_ssse3(UWORD8 *pu1_src,
                                        WORD32 src_strd,
                                        UWORD8 *pu1_src_left,
                                        UWORD8 *pu1_src_top,
                                        UWORD8 *pu1_src_top_left,
                                        UWORD8 *pu1_src_top_right,
                                        UWORD8 *pu1_src_bot_left,
                                        UWORD8 *pu1_avail,
                                        WORD8 *pi1_sao_offset,
                                        WORD32 wd,
                                        WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
    UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
    UWORD8 u1_avail0, u1_avail1;
    WORD32 wd_rem;
    WORD32 offset = 0;
    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i left0_16x8b, left1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b, cmp_gt1_16x8b, cmp_lt1_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i left_store_16x8b;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    au1_mask8x16b = _mm_set1_epi8(0xff);

    /* Update top and top-left arrays */

    *pu1_src_top_left = pu1_src_top[wd - 1];

    for(col = wd; col >= 16; col -= 16)
    {
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
        offset += 16;
    }

    //setting availability mask to ff, size MAX_CTB_SIZE
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
    for(row = 0; row < ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);

    //availability mask creation
    u1_avail0 = pu1_avail[0];
    u1_avail1 = pu1_avail[1];
    au1_mask[0] = u1_avail0;
    au1_mask[wd - 1] = u1_avail1;

    const2_16x8b = _mm_set1_epi8(2);
    const0_16x8b = _mm_setzero_si128();
    pu1_src_left_cpy = au1_src_left_tmp;
    pu1_src_left_str = au1_src_left_tmp1;
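    /*
     * Edge offset class 0 (horizontal neighbours). Per pixel, the vector code
     * below computes, in scalar terms (illustrative sketch):
     *
     *  sign_l = (pixel > left)  - (pixel < left);   // -1, 0 or +1
     *  sign_r = (pixel > right) - (pixel < right);
     *  edge_idx = gi1_table_edge_idx[2 + sign_l + sign_r];
     *  if(lane unavailable) edge_idx = 0;           // entry 0 means no change
     *  v = pixel + pi1_sao_offset[edge_idx];
     *  pixel = (v < 0) ? 0 : ((v > 255) ? 255 : v);
     *
     * The per-byte signs are built from two unsigned saturating subtractions
     * compared against zero, avoiding any widening until the final add.
     */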
    {
        au1_mask_cpy = au1_mask;
        for(col = wd; col >= 16; col -= 16)
        {
            pu1_src_cpy = pu1_src;
            au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
            //pu1_src_left_cpy = au1_src_left_tmp;
            for(row = ht; row > 0; row -= 2)
            {

                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 0: load 16 pixel values from 15:0 pos. relative to cur. pos.
                src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
                // row = 1
                src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));

                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 2);
                //row 1 left
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
                //row 0 left
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
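                /*
                 * The alignr pairs above appear to do double duty: aligning a
                 * row with the byte pulled in from left_store_16x8b yields the
                 * "left neighbour" vector, while the companion alignr rolls
                 * the row's last original pixel into left_store_16x8b, so the
                 * pre-SAO right column is saved (via pu1_src_left_str) as the
                 * left column for the next 16-pixel column pass.
                 */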


                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);

                //row = 0 right
                edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
                // row = 1 right
                edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);

                //combining sign-left and sign-right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);

                //shuffle to get sao offset
                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
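                //the offsets are signed 8 bit while the pixels are unsigned,
                //so the block below widens both to 16 bit (pixels
                //zero-extended, offsets sign-extended via the cmpgt-with-zero
                //mask), adds, and repacks with unsigned saturation, clipping
                //the result to [0, 255] in one step.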
                //convert to 16 bit, then add, then saturated pack
                left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);


                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0: store 16 pixel values to 15:0 pos. relative to cur. pos.
                _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

                pu1_src_cpy += (src_strd << 1);
                pu1_src_left_cpy += 2;
                pu1_src_left_str += 2;
            }
            au1_mask_cpy += 16;
            pu1_src += 16;
            pu1_src_left_cpy -= ht;
            pu1_src_left_str -= ht;

            pu1_left_tmp = pu1_src_left_cpy;
            pu1_src_left_cpy = pu1_src_left_str;
            pu1_src_left_str = pu1_left_tmp;
        }

        wd_rem = wd & 0xF;
        if(wd_rem)
        {

            cmp_gt1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
            _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt1_16x8b);

            au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
            pu1_src_cpy = pu1_src;
            au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
            //pu1_src_left_cpy = au1_src_left_tmp;
            for(row = ht; row > 0; row -= 4)
            {
                left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
                //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
                src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
                // row = 1
                cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
                // row = 2
                src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
                // row = 3
                cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));


                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
                //row 3 left
                edge0_16x8b = _mm_slli_si128(cmp_gt1_16x8b, 8);
                cmp_lt1_16x8b = _mm_alignr_epi8(cmp_gt1_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
                //row 2 left
                edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
                left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
                //row 1 left
                edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
                cmp_lt0_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);
                //row 0 left
                edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
                left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 15);
                left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 15);

                //packing rows together for 16 SIMD operations
                src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
                src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_gt1_16x8b);
                //packing rows together for 16 SIMD operations
                left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, cmp_lt0_16x8b);
                left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, cmp_lt1_16x8b);

                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
                cmp_lt1_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
                //combining the appropriate sign change
                left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                left1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);

                //row = 0 right
                edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 1));
                // row = 1 right
                cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 1));
                // row = 2 right
                edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
                // row = 3 right
                cmp_gt1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 1));
                //packing rows together for 16 SIMD operations
                edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
                edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_gt1_16x8b);

                //separating +ve and -ve values.
                cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
                cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
                cmp_gt1_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
                cmp_lt1_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
                //creating mask 00 for +ve and -ve values and FF for zero.
                cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
                cmp_gt1_16x8b = _mm_cmpeq_epi8(cmp_gt1_16x8b, const0_16x8b);
                cmp_lt1_16x8b = _mm_cmpeq_epi8(cmp_lt1_16x8b, const0_16x8b);
                //combining the appropriate sign change
                edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
                edge1_16x8b = _mm_sub_epi8(cmp_gt1_16x8b, cmp_lt1_16x8b);

                //combining sign-left and sign-right
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
                //adding constant 2
                edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
                edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
                //shuffle to get sao index
                edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
                //shuffle to get sao offset
                //using availability mask
                edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
                edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);

                edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
                edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
                //convert to 16 bit, then add, then saturated pack
                left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
                src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
                src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

                left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
                cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
                cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
                cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
                src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
                src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
                //separating row 1 and row 3
                cmp_lt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
                cmp_lt1_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);

                _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
                //row = 0 store 8 pixel values to 7:0 pos. relative to cur. pos.
                _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
                // row = 1
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_lt0_16x8b);
                // row = 2
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
                // row = 3
                _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt1_16x8b);

                pu1_src_cpy += (src_strd << 2);
                pu1_src_left_cpy += 4;
                pu1_src_left_str += 4;
            }
            pu1_src += wd;
            pu1_src_left_cpy -= ht;
            pu1_src_left_str -= ht;

            pu1_left_tmp = pu1_src_left_cpy;
            pu1_src_left_cpy = pu1_src_left_str;
            pu1_src_left_str = pu1_left_tmp;
        }
        for(row = 0; row < ht; row++)
        {
            pu1_src_left[row] = pu1_src_left_cpy[row];
        }
    }
}


void ihevc_sao_edge_offset_class0_chroma_ssse3(UWORD8 *pu1_src,
                                               WORD32 src_strd,
                                               UWORD8 *pu1_src_left,
                                               UWORD8 *pu1_src_top,
                                               UWORD8 *pu1_src_top_left,
                                               UWORD8 *pu1_src_top_right,
                                               UWORD8 *pu1_src_bot_left,
                                               UWORD8 *pu1_avail,
                                               WORD8 *pi1_sao_offset_u,
                                               WORD8 *pi1_sao_offset_v,
                                               WORD32 wd,
                                               WORD32 ht)
{
    WORD32 row, col;
    UWORD8 *pu1_src_cpy, *pu1_src_left_cpy, *pu1_src_left_str, *pu1_left_tmp;
    UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
    UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
    UWORD8 u1_avail0, u1_avail1;
    WORD32 wd_rem;
    WORD32 offset = 0;

    __m128i src_temp0_16x8b, src_temp1_16x8b;
    __m128i left0_16x8b, left1_16x8b;
    __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
    __m128i edge0_16x8b, edge1_16x8b;
    __m128i au1_mask8x16b;
    __m128i edge_idx_8x16b, sao_offset_8x16b;
    __m128i const2_16x8b, const0_16x8b;
    __m128i left_store_16x8b;
    __m128i chroma_offset_8x16b;
    UNUSED(pu1_src_top_right);
    UNUSED(pu1_src_bot_left);

    au1_mask8x16b = _mm_set1_epi8(0xff);

    /* Update top and top-left arrays */
    pu1_src_top_left[0] = pu1_src_top[wd - 2];
    pu1_src_top_left[1] = pu1_src_top[wd - 1];

    for(col = wd; col >= 16; col -= 16)
    {
        const0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + offset + (ht - 1) * src_strd));
        _mm_storeu_si128((__m128i *)(pu1_src_top + offset), const0_16x8b);
        offset += 16;
    }
    for(row = 0; row < 2 * ht; row++)
    {
        au1_src_left_tmp[row] = pu1_src_left[row];
    }
    //setting availability mask to ff, size MAX_CTB_SIZE
    for(col = 0; col < MAX_CTB_SIZE; col += 16)
        _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);

    edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
    sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
    const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
    chroma_offset_8x16b = _mm_set1_epi16(0x0800);
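    /*
     * sao_offset_8x16b ends up with the U offsets in its low 8 bytes and the
     * V offsets in its high 8 bytes (see the unpacklo below);
     * chroma_offset_8x16b (0x0800 per 16-bit lane, i.e. +8 on alternate
     * bytes) appears intended to bias the V lanes' table indices by 8 so a
     * single pshufb lookup can serve both planes.
     */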
1269 //availability mask creation
1270 u1_avail0 = pu1_avail[0];
1271 u1_avail1 = pu1_avail[1];
1272 au1_mask[0] = u1_avail0;
1273 au1_mask[1] = u1_avail0;
1274 au1_mask[wd - 1] = u1_avail1;
1275 au1_mask[wd - 2] = u1_avail1;
1276 sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
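/* Note on the chroma offset lookup: the unpack above packs the U offsets
 * into the low 8 bytes and the V offsets into the high 8 bytes of
 * sao_offset_8x16b. chroma_offset_8x16b (0x0800 per 16-bit lane) adds 8 to
 * every odd byte of the edge index, so the later pshufb reads U offsets for
 * the even (Cb) lanes and V offsets for the odd (Cr) lanes of the
 * interleaved chroma plane.
 * Illustrative scalar equivalent (not part of the build):
 *     offset = (x & 1) ? pi1_sao_offset_v[idx] : pi1_sao_offset_u[idx];
 */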
1277 const2_16x8b = _mm_set1_epi8(2);
1278 const0_16x8b = _mm_setzero_si128();
1279
1280 {
1281 pu1_src_left_cpy = au1_src_left_tmp;
1282 pu1_src_left_str = au1_src_left_tmp1;
1283 au1_mask_cpy = au1_mask;
1284 for(col = wd; col >= 16; col -= 16)
1285 {
1286 pu1_src_cpy = pu1_src;
1287 au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
1288
1289 for(row = ht; row > 0; row -= 2)
1290 {
1291
1292 left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1293 //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1294 src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
1295 // row = 1
1296 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
1297
1298 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 4);
1299 //row 1 left
1300 left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
1301 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
1302 //row 0 left
1303 left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
1304 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
1305
1306
1307 //separating +ve and -ve values, row 0 left
1308 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1309 cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1310 //creating mask 00 for +ve and -ve values and FF for zero.
1311 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1312 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1313 //combining the appropriate sign change
1314 left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
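/* The subs_epu8/cmpeq/sub pattern above is a branchless per-byte SIGN():
 * subs_epu8 yields max(a - b, 0) and max(b - a, 0); comparing each with zero
 * gives 0xFF where that direction did not win; subtracting the two masks
 * leaves 0x01 where a > b, 0xFF (-1) where a < b and 0x00 where equal.
 * Illustrative scalar equivalent (not part of the build):
 *     sign = (a > b) ? 1 : ((a < b) ? -1 : 0);
 */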
1315
1316 //separating +ve and -ve values, row 1 left
1317 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1318 cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1319 //creating mask 00 for +ve and -ve values and FF for zero.
1320 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1321 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1322 //combining the appropriate sign change
1323 left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1324
1325
1326 //row = 0 right
1327 edge0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
1328 // row = 1 right
1329 edge1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
1330 //separating +ve and -ve values, row 0 right
1331 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1332 cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1333 //creating mask 00 for +ve and -ve values and FF for zero.
1334 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1335 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1336 //combining the appropriate sign change
1337 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1338
1339 //separating +ve and -ve values, row 1 right
1340 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1341 cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1342 //creating mask 00 for +ve and -ve values and FF for zero.
1343 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1344 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1345 //combining the appropriate sign change
1346 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1347
1348 //combining sign_left and sign_right
1349 edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1350 edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1351 //adding constant 2
1352 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1353 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1354 //shuffle to get sao index
1355 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1356 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
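/* sign(cur - left) + sign(cur - right) + 2 is an index in [0, 4]; the pshufb
 * above maps it through gi1_table_edge_idx to the SAO edge category for all
 * 16 lanes at once. Illustrative scalar equivalent (not part of the build):
 *     edge_idx = gi1_table_edge_idx[2 + sign_left + sign_right];
 */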
1357 //using availability mask
1358 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1359 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1360 //adding chroma offset to access U and V
1361 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
1362 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
1363
1364 //shuffle to get sao offset
1365 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1366 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1367 //convert to 16 bit, then add, then saturating pack
1368 left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1369 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1370 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1371 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1372 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1373 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1374 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
1375 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1376
1377 left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1378 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1379 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1380 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1381 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1382 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1383 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
1384 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
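/* Offsets are signed while pixels are unsigned, so the two blocks above
 * widen both to 16 bit before adding: pixels are zero-extended against
 * const0_16x8b, offsets are sign-extended by unpacking with their own sign
 * mask (the cmpgt of 0 over the offset bytes), and packus saturates the sums
 * back to [0, 255]. Illustrative scalar equivalent (not part of the build):
 *     out = CLIP3(pixel + offset, 0, 255);
 */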
1385
1386 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1387 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1388 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1389 // row = 1
1390 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
1391
1392 pu1_src_cpy += (src_strd << 1);
1393 pu1_src_left_cpy += 4;
1394 pu1_src_left_str += 4;
1395 }
1396 au1_mask_cpy += 16;
1397 pu1_src += 16;
1398 pu1_src_left_cpy -= 2 * ht;
1399 pu1_src_left_str -= 2 * ht;
1400
1401 pu1_left_tmp = pu1_src_left_cpy;
1402 pu1_src_left_cpy = pu1_src_left_str;
1403 pu1_src_left_str = pu1_left_tmp;
1404 }
1405
1406 wd_rem = wd & 0xF;
1407 if(wd_rem)
1408 {
1409
1410 cmp_gt0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src + (ht - 1) * src_strd));
1411 _mm_storel_epi64((__m128i *)(pu1_src_top + offset), cmp_gt0_16x8b);
1412
1413 au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
1414 pu1_src_cpy = pu1_src;
1415 au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
1416
1417 for(row = ht; row > 0; row -= 4)
1418 {
1419 left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
1420 //row = 0 load 8 pixel values from 7:0 pos. relative to cur. pos.
1421 src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
1422 // row = 1
1423 cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1424 // row = 2
1425 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1426 // row = 3
1427 cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
1428
1429
1430 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, left_store_16x8b, 8);
1431 //row 3 left
1432 edge0_16x8b = _mm_slli_si128(cmp_lt0_16x8b, 8);
1433 left0_16x8b = _mm_alignr_epi8(cmp_lt0_16x8b, left_store_16x8b, 14);
1434 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1435 //row 2 left
1436 edge0_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
1437 left1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, left_store_16x8b, 14);
1438 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1439
1440
1441 // packing rows together for 16 SIMD operations
1442 src_temp1_16x8b = _mm_unpacklo_epi64(src_temp1_16x8b, cmp_lt0_16x8b);
1443 left1_16x8b = _mm_unpacklo_epi64(left1_16x8b, left0_16x8b);
1444
1445 //row 1 left
1446 edge0_16x8b = _mm_slli_si128(cmp_gt0_16x8b, 8);
1447 edge1_16x8b = _mm_alignr_epi8(cmp_gt0_16x8b, left_store_16x8b, 14);
1448 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1449 //row 0 left
1450 edge0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
1451 left0_16x8b = _mm_alignr_epi8(src_temp0_16x8b, left_store_16x8b, 14);
1452 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, edge0_16x8b, 14);
1453 // packing rows together for 16 SIMD operations
1454 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, cmp_gt0_16x8b);
1455 left0_16x8b = _mm_unpacklo_epi64(left0_16x8b, edge1_16x8b);
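/* In this remainder path each row is only 8 bytes wide, so pairs of rows
 * (0/1 and 2/3) and their left-neighbour vectors are packed into single xmm
 * registers with unpacklo_epi64; the sign, classification and offset steps
 * below then handle two rows per 16-lane operation. */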
1456
1457 //separating +ve and -ve values for row 2 and row 3
1458 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, left1_16x8b);
1459 cmp_lt0_16x8b = _mm_subs_epu8(left1_16x8b, src_temp1_16x8b);
1460 //creating mask 00 for +ve and -ve values and FF for zero.
1461 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1462 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1463 //combining the appropriate sign change
1464 left1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1465
1466
1467
1468
1469
1470 //separating +ve and -ve values.
1471 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, left0_16x8b);
1472 cmp_lt0_16x8b = _mm_subs_epu8(left0_16x8b, src_temp0_16x8b);
1473 //creating mask 00 for +ve and -ve values and FF for zero.
1474 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1475 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1476 //combining the appropriate sign change
1477 left0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1478
1479
1480 //row = 0 right
1481 edge0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2));
1482 // row = 1 right
1483 cmp_gt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd + 2));
1484 // row = 2 right
1485 edge1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
1486 // row = 3 right
1487 cmp_lt0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd + 2));
1488 // packing rows together for 16 SIMD operations
1489 edge0_16x8b = _mm_unpacklo_epi64(edge0_16x8b, cmp_gt0_16x8b);
1490 edge1_16x8b = _mm_unpacklo_epi64(edge1_16x8b, cmp_lt0_16x8b);
1491
1492 //separating +ve and -ve values.
1493 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, edge0_16x8b);
1494 cmp_lt0_16x8b = _mm_subs_epu8(edge0_16x8b, src_temp0_16x8b);
1495 //creating mask 00 for +ve and -ve values and FF for zero.
1496 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1497 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1498 //combining the appropriate sign change
1499 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1500
1501 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, edge1_16x8b);
1502 cmp_lt0_16x8b = _mm_subs_epu8(edge1_16x8b, src_temp1_16x8b);
1503 //creating mask 00 for +ve and -ve values and FF for zero.
1504 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1505 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1506 //combining the appropriate sign change
1507 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1508
1509 //combining sign_left and sign_right
1510 edge0_16x8b = _mm_add_epi8(edge0_16x8b, left0_16x8b);
1511 edge1_16x8b = _mm_add_epi8(edge1_16x8b, left1_16x8b);
1512 //adding constant 2
1513 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1514 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1515 //shuffle to get sao index
1516 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1517 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1518 //shuffle to get sao offset
1519 //using availability mask
1520 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
1521 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
1522 //adding chroma offset to access U and V
1523 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
1524 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
1525
1526 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1527 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1528 //convert to 16 bit, then add, then saturating pack
1529 left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1530 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1531 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, left0_16x8b);
1532 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1533 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, left0_16x8b);
1534 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1535 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
1536 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1537
1538 left0_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1539 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1540 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, left0_16x8b);
1541 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1542 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, left0_16x8b);
1543 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1544 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
1545 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1546
1547 //separating row 1 and row 3
1548 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1549 cmp_lt0_16x8b = _mm_srli_si128(src_temp1_16x8b, 8);
1550
1551 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
1552 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1553 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1554 // row = 1
1555 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
1556 // row = 2
1557 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_temp1_16x8b);
1558 // row = 3
1559 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
1560
1561 pu1_src_cpy += (src_strd << 2);
1562 pu1_src_left_cpy += 8;
1563 pu1_src_left_str += 8;
1564 }
1565 pu1_src += wd;
1566 pu1_src_left_cpy -= 2 * ht;
1567 pu1_src_left_str -= 2 * ht;
1568
1569 pu1_left_tmp = pu1_src_left_cpy;
1570 pu1_src_left_cpy = pu1_src_left_str;
1571 pu1_src_left_str = pu1_left_tmp;
1572 }
1573 for(row = 0; row < 2 * ht; row++)
1574 {
1575 pu1_src_left[row] = pu1_src_left_cpy[row];
1576 }
1577 }
1578
1579 }
1580
1581
1582 void ihevc_sao_edge_offset_class1_ssse3(UWORD8 *pu1_src,
1583 WORD32 src_strd,
1584 UWORD8 *pu1_src_left,
1585 UWORD8 *pu1_src_top,
1586 UWORD8 *pu1_src_top_left,
1587 UWORD8 *pu1_src_top_right,
1588 UWORD8 *pu1_src_bot_left,
1589 UWORD8 *pu1_avail,
1590 WORD8 *pi1_sao_offset,
1591 WORD32 wd,
1592 WORD32 ht)
1593 {
1594 WORD32 row, col;
1595 UWORD8 *pu1_src_top_cpy;
1596 UWORD8 *pu1_src_cpy;
1597 WORD32 wd_rem;
1598
1599
1600 __m128i src_top_16x8b, src_bottom_16x8b;
1601 __m128i src_temp0_16x8b, src_temp1_16x8b;
1602 __m128i signup0_16x8b, signdwn1_16x8b;
1603 __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
1604 __m128i edge0_16x8b, edge1_16x8b;
1605 __m128i edge_idx_8x16b, sao_offset_8x16b;
1606 __m128i const2_16x8b, const0_16x8b;
1607
1608 UNUSED(pu1_src_top_right);
1609 UNUSED(pu1_src_bot_left);
1610
1611
1612 /* Updating left and top-left */
1613 for(row = 0; row < ht; row++)
1614 {
1615 pu1_src_left[row] = pu1_src[row * src_strd + (wd - 1)];
1616 }
1617 *pu1_src_top_left = pu1_src_top[wd - 1];
1618
1619
1620
1621 pu1_src_top_cpy = pu1_src_top;
1622 edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
1623 sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
1624
1625 /* Update height and source pointers based on the availability flags */
1626 if(0 == pu1_avail[2])
1627 {
1628 pu1_src_top_cpy = pu1_src;
1629 pu1_src += src_strd;
1630 ht--;
1631 }
1632 if(0 == pu1_avail[3])
1633 {
1634 ht--;
1635 }
1636
1637 const2_16x8b = _mm_set1_epi8(2);
1638 const0_16x8b = _mm_setzero_si128();
1639
1640 {
1641 WORD32 ht_rem;
1642 for(col = wd; col >= 16; col -= 16)
1643 {
1644 pu1_src_cpy = pu1_src;
1645 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
1646 //row = 0
1647 src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
1648 //separating +ve and -ve values.
1649 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
1650 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
1651 //creating mask 00 for +ve and -ve values and FF for zero.
1652 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1653 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1654 //combining the appropriate sign change
1655 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
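/* Class 1 is the vertical pattern: sign(row - top) is computed once here
 * against the row above, and each loop iteration below derives the next
 * sign_up for free as the negation of the current sign_down, since
 * sign(row - row_below) == -sign(row_below - row). */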
1656
1657 for(row = ht; row >= 2; row -= 2)
1658 {
1659
1660 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
1661 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
1662 // row = 2
1663 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
1664
1665
1666 //row 0 -row1
1667 //separating +ve and -ve values.
1668 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
1669 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
1670 //creating mask 00 for +ve and -ve values and FF for zero.
1671 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1672 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1673 //combining the appropriate sign change
1674 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1675 //row1-row0
1676 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
1677
1678 //row1 -bottom
1679 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
1680 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
1681 //creating mask 00 for +ve and -ve values and FF for zero.
1682 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1683 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1684 //combining the appropriate sign change
1685 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1686
1687 //combining sign_up and sign_down
1688 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
1689 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
1690
1691 //for the next iteration signup0_16x8b = -signdwn1_16x8b
1692 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
1693 //adding constant 2
1694 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1695 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1696 //shuffle to get sao index
1697 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1698 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1699 //shuffle to get sao offset
1700 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1701 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1702 //copying the next top
1703 src_top_16x8b = src_temp1_16x8b;
1704 //convert to 16 bit, then add, then saturating pack
1705 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1706 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1707 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
1708 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1709 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1710 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
1711 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1712 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1713
1714 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1715 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1716 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
1717 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
1718 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1719 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
1720 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1721 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
1722
1723 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1724 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1725 // row = 1
1726 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
1727
1728 src_temp0_16x8b = src_bottom_16x8b;
1729 pu1_src_cpy += (src_strd << 1);
1730 }
1731 ht_rem = ht & 0x1;
1732
1733 if(ht_rem)
1734 {
1735 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
1736 //current row -next row
1737 //separating +ve and -ve values.
1738 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
1739 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
1740 //creating mask 00 for +ve and -ve values and FF for zero.
1741 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1742 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1743 //combining the appropriate sign change
1744 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1745 //adding top and bottom signs and constant 2
1746 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
1747 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1748
1749 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1750 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1751 //copying the next top
1752 src_top_16x8b = src_temp0_16x8b;
1753 //convert to 16 bit, then add, then saturating pack
1754 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1755 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1756 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
1757 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1758 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1759 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
1760 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1761 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1762
1763 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1764 }
1765 if(0 == pu1_avail[3])
1766 {
1767 src_top_16x8b = src_bottom_16x8b;
1768 }
1769 //updating the top buffer
1770 _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
1771 pu1_src += 16;
1772 }
1773
1774 wd_rem = wd & 0xF;
1775 if(wd_rem)
1776 {
1777 pu1_src_cpy = pu1_src;
1778 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
1779 //row = 0
1780 src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
1781 //separating +ve and -ve values.
1782 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
1783 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
1784 //creating mask 00 for +ve and -ve values and FF for zero.
1785 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1786 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1787 //combining the appropriate sign change
1788 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1789 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
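/* This 8-wide path processes four rows per iteration; sign vectors for
 * consecutive rows travel in the two 64-bit halves of one register, so
 * sign_up is parked in the high half here and alignr below pairs it with the
 * freshly computed (row1 - row0) sign for a single 16-lane add. */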
1790 for(row = ht; row >= 4; row -= 4)
1791 {
1792 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
1793 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1794 // row = 2
1795 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1796
1797 //row 0 -row1
1798 //separating +ve and -ve values.
1799 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
1800 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
1801 //creating mask 00 for +ve and -ve values and FF for zero.
1802 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1803 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1804 //combining the appropriate sign change
1805 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1806
1807 //row1-row0
1808 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
1809 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
1810 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
1811 //row1 -row2
1812 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
1813 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
1814 //creating mask 00 for +ve and -ve values and FF for zero.
1815 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1816 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1817 //combining the appropriate sign change
1818 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
1819 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
1820 //packing row 0 and row 1
1821 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
1822 //row = 3
1823 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
1824 // row = 4
1825 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
1826
1827 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
1828 signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
1829 //separating +ve and -ve values (2,3)
1830 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
1831 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
1832 //creating mask 00 for +ve and -ve values and FF for zero.
1833 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1834 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1835 //combining the appropriate sign change
1836 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
1837
1838 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with row below)
1839 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
1840 //separating +ve and -ve values (3,4)
1841 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
1842 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
1843 //creating mask 00 for +ve and -ve values and FF for zero.
1844 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1845 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1846 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
1847 //combining sign_down for rows 2 and 3
1848 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
1849
1850 edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
1851
1852 //packing row 2 and row 3
1853 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
1854 //for the next iteration signup0_16x8b = -signdwn1_16x8b
1855 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
1856
1857 //adding constant 2
1858 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1859 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
1860 //shuffle to get sao index
1861 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1862 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
1863 //shuffle to get sao offset
1864 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1865 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
1866 //the next top already in src_top_16x8b
1867 //src_top_16x8b = src_temp1_16x8b;
1868 //convert to 16 bit, then add, then saturating pack
1869 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1870 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1871 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
1872 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
1873 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1874 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
1875 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1876 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
1877
1878 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
1879 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
1880 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
1881 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
1882 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
1883 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
1884 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
1885 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
1886
1887 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1888 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
1889 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1890 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1891 // row = 1
1892 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
1893 //row = 2
1894 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
1895 // row = 3
1896 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
1897
1898 src_temp0_16x8b = src_temp1_16x8b;
1899 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
1900 pu1_src_cpy += (src_strd << 2);
1901
1902 }
1903 ht_rem = ht & 0x2;
1904 if(ht_rem)
1905 {
1906
1907 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
1908 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1909 // row = 2
1910 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
1911
1912 //row 0 -row1
1913 //separating +ve and -ve values.
1914 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
1915 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
1916 //creating mask 00 for +ve and -ve values and FF for zero.
1917 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1918 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1919 //combining the appropriate sign change
1920 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1921 //row1-row0
1922 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
1923 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
1924 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
1925 //row1 -row2
1926 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
1927 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
1928 //creating mask 00 for +ve and -ve values and FF for zero.
1929 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1930 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1931 //combining the appropriate sign change
1932 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
1933 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
1934 //adding the up and down sign differences
1935 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
1936 //for the next iteration signup0_16x8b = -signdwn1_16x8b
1937 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
1938 src_top_16x8b = src_temp1_16x8b;
1939 //adding constant 2
1940 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1941
1942 //shuffle to get sao index
1943 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1944
1945 //shuffle to get sao offset
1946 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1947
1948 //the next top already in src_top_16x8b
1949 //convert to 16 bit, then add, then saturating pack
1950 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1951 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1952 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
1953 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
1954 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
1955 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
1956 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
1957 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
1958
1959 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
1960
1961 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
1962 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
1963 // row = 1
1964 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
1965 src_temp0_16x8b = src_bottom_16x8b;
1966 pu1_src_cpy += (src_strd << 1);
1967
1968 }
1969 ht_rem = ht & 0x1;
1970 if(ht_rem)
1971 {
1972
1973 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
1974 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
1975
1976 //row 0 -row1
1977 //separating +ve and -ve values.
1978 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
1979 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
1980 //creating mask 00 for +ve and -ve values and FF for zero.
1981 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
1982 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
1983 //combining the appropriate sign change
1984 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
1985 //adding the up and down sign differences
1986 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
1987 //adding constant 2
1988 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
1989 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
1990 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
1991 //shuffle to get sao index
1992 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
1993 //shuffle to get sao offset
1994 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
1995 src_top_16x8b = src_temp0_16x8b;
1996 //convert to 16 bit, then add, then saturating pack
1997 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
1998 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
1999 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2000 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2001 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
2002 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2003 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2004 pu1_src_cpy += (src_strd);
2005
2006 }
2007 if(0 == pu1_avail[3])
2008 {
2009 src_top_16x8b = src_bottom_16x8b;
2010 }
2011 _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2012 pu1_src += 8;
2013 }
2014 }
2015 }
2016
2017 void ihevc_sao_edge_offset_class1_chroma_ssse3(UWORD8 *pu1_src,
2018 WORD32 src_strd,
2019 UWORD8 *pu1_src_left,
2020 UWORD8 *pu1_src_top,
2021 UWORD8 *pu1_src_top_left,
2022 UWORD8 *pu1_src_top_right,
2023 UWORD8 *pu1_src_bot_left,
2024 UWORD8 *pu1_avail,
2025 WORD8 *pi1_sao_offset_u,
2026 WORD8 *pi1_sao_offset_v,
2027 WORD32 wd,
2028 WORD32 ht)
2029 {
2030 WORD32 row, col;
2031 UWORD8 *pu1_src_top_cpy;
2032 UWORD8 *pu1_src_cpy;
2033 WORD32 wd_rem;
2034
2035
2036 __m128i src_top_16x8b, src_bottom_16x8b;
2037 __m128i src_temp0_16x8b, src_temp1_16x8b;
2038 __m128i signup0_16x8b, signdwn1_16x8b;
2039 __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
2040 __m128i edge0_16x8b, edge1_16x8b;
2041 __m128i edge_idx_8x16b, sao_offset_8x16b;
2042 __m128i const2_16x8b, const0_16x8b;
2043 __m128i chroma_offset_8x16b;
2044
2045 UNUSED(pu1_src_top_right);
2046 UNUSED(pu1_src_bot_left);
2047
2048 /* Updating left and top-left */
2049 for(row = 0; row < ht; row++)
2050 {
2051 pu1_src_left[2 * row] = pu1_src[row * src_strd + (wd - 2)];
2052 pu1_src_left[2 * row + 1] = pu1_src[row * src_strd + (wd - 1)];
2053 }
2054 pu1_src_top_left[0] = pu1_src_top[wd - 2];
2055 pu1_src_top_left[1] = pu1_src_top[wd - 1];
2056
2057
2058
2059 pu1_src_top_cpy = pu1_src_top;
2060 edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
2061 sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
2062 const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
2063 chroma_offset_8x16b = _mm_set1_epi16(0x0800);
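/* Same interleaved-chroma lookup as in the class 0 chroma function: the U
 * and V offset tables are packed into one register below and this 0x0800
 * bias steers the odd (Cr) lanes into the V half during the pshufb. */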
2064 /* Update height and source pointers based on the availability flags */
2065 if(0 == pu1_avail[2])
2066 {
2067 pu1_src_top_cpy = pu1_src;
2068 pu1_src += src_strd;
2069 ht--;
2070 }
2071 if(0 == pu1_avail[3])
2072 {
2073 ht--;
2074 }
2075 sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
2076 const2_16x8b = _mm_set1_epi8(2);
2077 const0_16x8b = _mm_setzero_si128();
2078
2079
2080 {
2081 WORD32 ht_rem;
2082
2083
2084
2085 for(col = wd; col >= 16; col -= 16)
2086 {
2087 pu1_src_cpy = pu1_src;
2088 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2089 //row = 0
2090 src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
2091 //separating +ve and -ve values.
2092 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2093 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2094 //creating mask 00 for +ve and -ve values and FF for zero.
2095 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2096 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2097 //combining the appropriate sign change
2098 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2099
2100 for(row = ht; row >= 2; row -= 2)
2101 {
2102
2103 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2104 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2105 // row = 2
2106 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2107
2108
2109 //row 0 -row1
2110 //separating +ve and -ve values.
2111 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
2112 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
2113 //creating mask 00 for +ve and -ve values and FF for zero.
2114 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2115 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2116 //combining the appropriate sign change
2117 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2118 //row1-row0
2119 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
2120
2121 //row1 -bottom
2122 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2123 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2124 //creating mask 00 for +ve and -ve values and FF for zero.
2125 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2126 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2127 //combining the appropriate sign change
2128 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2129
2130 //combining sign_up and sign_down
2131 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2132 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
2133
2134 //for the next iteration signup0_16x8b = -signdwn1_16x8b
2135 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
2136 //adding constant 2
2137 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2138 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2139 //copying the next top
2140 src_top_16x8b = src_temp1_16x8b;
2141
2142
2143 //shuffle to get sao index
2144 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2145 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2146 //adding chroma offset to access U and V
2147 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2148 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
2149
2150 //shuffle to get sao offset
2151 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2152 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
2153 //convert to 16 bit, then add, then saturating pack
2154 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2155 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2156 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2157 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2158 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2159 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2160 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
2161 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2162
2163 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
2164 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
2165 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
2166 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
2167 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
2168 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2169 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
2170 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
2171 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2172 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2173 // row = 1
2174 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
2175
2176 src_temp0_16x8b = src_bottom_16x8b;
2177 pu1_src_cpy += (src_strd << 1);
2178 }
2179 ht_rem = ht & 0x1;
2180
2181 if(ht_rem)
2182 {
2183 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2184 //current row -next row
2185 //separating +ve and -ve values.
2186 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2187 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2188 //creating mask 00 for +ve and -ve values and FF for zero.
2189 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2190 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2191 //combining the appropriate sign change
2192 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2193 //adding top and bottom signs and constant 2
2194 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2195 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2196 //copying the next top
2197 src_top_16x8b = src_temp0_16x8b;
2198
2199 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2200 //adding chroma offset to access U and V
2201 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2202 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2203
2204 //convert to 16 bit, then add, then saturating pack
2205 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2206 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2207 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2208 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2209 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2210 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2211 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
2212 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2213
2214 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2215 }
2216 if(0 == pu1_avail[3])
2217 {
2218 src_top_16x8b = src_bottom_16x8b;
2219 }
2220 //updating the top buffer
2221 _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2222 pu1_src += 16;
2223 }
2224
2225 wd_rem = wd & 0xF;
2226 if(wd_rem)
2227 {
2228 pu1_src_cpy = pu1_src;
2229 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
2230 //row = 0
2231 src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
2232 //separating +ve and -ve values.
2233 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2234 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2235 //creating mask 00 for +ve and -ve values and FF for zero.
2236 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2237 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2238 //combining the appropriate sign change
2239 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2240 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
2241 for(row = ht; row >= 4; row -= 4)
2242 {
2243 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2244 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
2245 // row = 2
2246 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
2247
2248 //row 0 -row1
2249 //separating +ve and -ve values.
2250 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
2251 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
2252 //creating mask 00 for +ve and -ve values and FF for zero.
2253 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2254 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2255 //combining the appropriate sign change
2256 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2257
2258 //row1-row0
2259 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
2260 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
2261 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
2262 //row1 -row2
2263 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2264 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2265 //creating mask 00 for +ve and -ve values and FF for zero.
2266 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2267 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2268 //combining the appropriate sign change
2269 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
2270 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
2271 //packing row 0 and row 1
2272 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
2273 //row = 3
2274 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd));
2275 // row = 4
2276 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 4 * src_strd));
2277
2278 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
2279 signdwn1_16x8b = _mm_slli_si128(signdwn1_16x8b, 8); //align left (1-2)
2280 //separating +ve and -ve values (2,3)
2281 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_top_16x8b);
2282 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_bottom_16x8b);
2283 //creating mask 00 for +ve and -ve values and FF for zero.
2284 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2285 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2286 //combining the appropriate sign change
2287 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
2288
2289 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signdwn1_16x8b, 8); //(2-3), (1-2) (subtract with row below)
2290 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8);
2291 //separating +ve and -ve values (3,4)
2292 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp1_16x8b);
2293 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_top_16x8b);
2294 //creating mask 00 for +ve and -ve values and FF for zero.
2295 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2296 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2297 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-4)
2298 //combining sign_down for rows 2 and 3
2299 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-4),(2-3)
2300
2301 edge1_16x8b = _mm_sub_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
2302
2303 //packing row 2 and row 3
2304 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
2305 //for the next iteration signup0_16x8b = -signdwn1_16x8b
2306 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(4-3)
2307 //adding constant 2
2308 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2309 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2310 //shuffle to get sao index
2311 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2312 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2313 //adding chroma offset to access U and V
2314 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2315 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
2316
2317 //shuffle to get sao offset
2318 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2319 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
2320 //the next top already in src_top_16x8b
2321 //convert to 16 bit, then add, then saturating pack
2322 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2323 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2324 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2325 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2326 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2327 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2328 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
2329 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2330
2331 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
2332 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
2333 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
2334 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
2335 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
2336 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2337 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
2338 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
2339
2340 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
2341 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
2342 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2343 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2344 // row = 1
2345 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
2346 //row = 2
2347 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
2348 // row = 3
2349 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
2350
2351 src_temp0_16x8b = src_temp1_16x8b;
2352 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
2353 pu1_src_cpy += (src_strd << 2);
2354
2355 }
2356 ht_rem = ht & 0x2;
2357 if(ht_rem)
2358 {
2359
2360 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2361 src_temp1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
2362 // row = 2
2363 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd));
2364
2365 //row 0 -row1
2366 //separating +ve and -ve values.
2367 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_temp1_16x8b);
2368 cmp_lt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_temp0_16x8b);
2369 //creating mask 00 for +ve and -ve values and FF for zero.
2370 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2371 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2372 //combining the appropriate sign change
2373 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2374 //row1-row0
2375 edge1_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b);
2376 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
2377 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
2378 //row1 -row2
2379 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2380 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2381 //creating mask 00 for +ve and -ve values and FF for zero.
2382 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2383 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2384 //combining the appropriate sign change
2385 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
2386 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
2387 //adding the up and down sign differences
2388 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
2389 //for the next iteration signup0_16x8b = -signdwn1_16x8b
2390 signup0_16x8b = _mm_sub_epi8(cmp_lt0_16x8b, cmp_gt0_16x8b); //(2-1) for next
2391 src_top_16x8b = src_temp1_16x8b;
2392
2393 //adding constant 2
2394 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2395
2396 //shuffle to get sao index
2397 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2398
2399 //adding chroma offset to access U and V
2400 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2401 //shuffle to get sao offset
2402 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2403 //the next top already in src_top_16x8b
2404 //convert to 16 bit, then add, then saturating pack
2405 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2406 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2407 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2408 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
2409 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2410 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2411 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
2412 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
2413
2414 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
2415
2416 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2417 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2418 // row = 1
2419 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
2420 src_temp0_16x8b = src_bottom_16x8b;
2421 pu1_src_cpy += (src_strd << 1);
2422
2423 }
2424 ht_rem = ht & 0x1;
2425 if(ht_rem)
2426 {
2427
2428 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2429 src_bottom_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy + src_strd));
2430
2431 //row 0 -row1
2432 //separating +ve and -ve values.
2433 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2434 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2435 //creating mask 00 for +ve and -ve values and FF for zero.
2436 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2437 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2438 //combining the appropriate sign change
2439 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2440 //adding the up and down sign differences
2441 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2442 //adding constant 2
2443 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2444 src_top_16x8b = src_temp0_16x8b;
2445
2446 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
2447 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
2448 //shuffle to get sao index
2449 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2450 //adding chroma offset to access U and V
2451 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
2452 //shuffle to get sao offset
2453 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2454
2455 //convert to 16 bit, then add, then saturating pack
2456 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2457 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2458 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2459 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2460 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
2461 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2462 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2463 pu1_src_cpy += (src_strd);
2464
2465 }
2466 if(0 == pu1_avail[3])
2467 {
2468 src_top_16x8b = src_bottom_16x8b;
2469 }
2470 _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2471 pu1_src += 8;
2472 }
2473 }
2474 }
2475
2476 /* 135 degree filtering */
2477 void ihevc_sao_edge_offset_class2_ssse3(UWORD8 *pu1_src,
2478 WORD32 src_strd,
2479 UWORD8 *pu1_src_left,
2480 UWORD8 *pu1_src_top,
2481 UWORD8 *pu1_src_top_left,
2482 UWORD8 *pu1_src_top_right,
2483 UWORD8 *pu1_src_bot_left,
2484 UWORD8 *pu1_avail,
2485 WORD8 *pi1_sao_offset,
2486 WORD32 wd,
2487 WORD32 ht)
2488 {
2489 WORD32 row, col;
2490 UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
2491 UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
2492 UWORD8 *pu1_firstleft;
2493 UWORD8 *pu1_src_cpy, *pu1_src_org;
2494 UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
2495 UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
2496 UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
2497 WORD32 wd_rem;
2498 UWORD8 u1_pos_0_0_tmp, u1_pos_wd_ht_tmp;
2499 WORD32 ht_tmp, ht_0;
2500
2501 WORD32 bit_depth;
2502 UWORD8 u1_avail0, u1_avail1;
2503
2504 __m128i src_top_16x8b, src_bottom_16x8b;
2505 __m128i src_temp0_16x8b, src_temp1_16x8b;
2506 __m128i signup0_16x8b, signdwn1_16x8b;
2507 __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
2508 __m128i edge0_16x8b, edge1_16x8b;
2509 __m128i au1_mask8x16b;
2510 __m128i edge_idx_8x16b, sao_offset_8x16b;
2511 __m128i const2_16x8b, const0_16x8b;
2512 __m128i left_store_16x8b;
2513 UNUSED(pu1_src_top_right);
2514 UNUSED(pu1_src_bot_left);
2515
2516 ht_0 = ht; ht_tmp = ht;
2517 au1_mask8x16b = _mm_set1_epi8(0xff);
2518
2519 //setting availability mask to 0xFF for MAX_CTB_SIZE bytes
2520 for(col = 0; col < MAX_CTB_SIZE; col += 16)
2521 _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
2522 for(row = 0; row < ht; row++)
2523 {
2524 au1_src_left_tmp[row] = pu1_src_left[row];
2525 }
2526 bit_depth = BIT_DEPTH_LUMA;
2527 pu1_src_org = pu1_src;
2528 pu1_src_top_cpy = pu1_src_top;
2529 pu1_src_left_cpy2 = au1_src_left_tmp;
2530 pu1_src_left_cpy = au1_src_left_tmp;
2531 pu1_src_left_str2 = au1_src_left_tmp1;
2532 pu1_src_left_str = au1_src_left_tmp1;
2533 edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
2534 sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
2535
2536
2537 /* If top-left is available, process separately */
2538 if(0 != pu1_avail[4])
2539 {
2540 WORD8 edge_idx;
2541
2542 edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
2543 SIGN(pu1_src[0] - pu1_src[1 + src_strd]);
2544
2545 edge_idx = gi1_table_edge_idx[edge_idx];
2546
2547 if(0 != edge_idx)
2548 {
2549 u1_pos_0_0_tmp = CLIP3(pu1_src[0] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2550 }
2551 else
2552 {
2553 u1_pos_0_0_tmp = pu1_src[0];
2554 }
2555 }
2556 else
2557 {
2558 u1_pos_0_0_tmp = pu1_src[0];
2559 }
2560
2561 /* If bottom-right is available, process separately */
2562 if(0 != pu1_avail[7])
2563 {
2564 WORD8 edge_idx;
2565
2566 edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 1 - src_strd]) +
2567 SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 1 + src_strd]);
2568
2569 edge_idx = gi1_table_edge_idx[edge_idx];
2570
2571 if(0 != edge_idx)
2572 {
2573 u1_pos_wd_ht_tmp = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
2574 }
2575 else
2576 {
2577 u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2578 }
2579 }
2580 else
2581 {
2582 u1_pos_wd_ht_tmp = pu1_src[wd - 1 + (ht - 1) * src_strd];
2583 }
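    /* The (0,0) and (wd-1, ht-1) pixels need neighbours from both the top-left
       and bottom-right directions, so they are computed scalar up front and
       patched back into pu1_src_org after the SIMD pass (see the two stores
       near the end of this function). */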
2584 pu1_firstleft = pu1_src_top_left;
2585
2586 /* Update height and source pointers based on the availability flags */
2587 if(0 == pu1_avail[2])
2588 {
2589 pu1_firstleft = pu1_src_left_cpy2;
2590 pu1_src_left_cpy2++;
2591 pu1_src_left_str2++;
2592 pu1_src_top_cpy = pu1_src;
2593 pu1_src += src_strd;
2594 ht--;
2595 }
2596 if(0 == pu1_avail[3])
2597 {
2598 ht--;
2599 ht_0--;
2600 }
2601 //storing top left in an xmm register
2602 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
2603 const2_16x8b = _mm_set1_epi8(2);
2604 const0_16x8b = _mm_setzero_si128();
2605 left_store_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2606 //update top-left
2607 *pu1_src_top_left = pu1_src_top[wd - 1];
2608 //availability mask creation
2609 u1_avail0 = pu1_avail[0];
2610 u1_avail1 = pu1_avail[1];
2611 au1_mask[0] = u1_avail0;
2612 au1_mask[wd - 1] = u1_avail1;
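    /* au1_mask holds 0xFF for interior columns and the availability flag at the
       first/last column; ANDing it with the looked-up edge index forces index 0
       there, whose SAO offset is 0 by construction. This assumes the caller
       passes pu1_avail[] entries as 0x00/0xFF. */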
2613 {
2614 WORD32 ht_rem;
2615
2616
2617 pu1_src_left_cpy = pu1_src_left_cpy2;
2618 pu1_src_left_str = pu1_src_left_str2;
2619 au1_mask_cpy = au1_mask;
2620 for(col = wd; col >= 16; col -= 16)
2621 {
2622 pu1_src_cpy = pu1_src;
2623 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2624 //row = 0
2625 src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
2626 src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
2627 //loading the mask
2628 au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
2629 //separating +ve and -ve values.
2630 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2631 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2632 //creating mask 00 for +ve and -ve values and FF for zero.
2633 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2634 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2635 //combining the appropriate sign change
2636 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
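            /* Branchless per-byte sign: _mm_subs_epu8(a, b) saturates to zero iff
               a <= b, so comparing both saturating differences against zero and
               subtracting the resulting masks yields 0x01 where a > b, 0xFF (-1)
               where a < b and 0x00 where equal, i.e. sign(a - b) for each of the
               16 unsigned bytes. */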
2637
2638
2639 for(row = ht; row >= 2; row -= 2)
2640 {
2641 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2642 //row = 1
2643 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2644 // row = 1 right
2645 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
2646 //to insert left in row 0
2647 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2648 //row 0 - row 1
2649 //separating +ve and -ve values.
2650 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2651 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2652
2653 //creating mask 00 for +ve and -ve values and FF for zero.
2654 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2655 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2656 //manipulation for row 1 - row 0
2657 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
2658 //combining the appropriate sign change
2659 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
2660 //row1-row0
2661 //separating +ve and -ve values.
2662 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2663 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2664 //creating mask 00 for +ve and -ve values and FF for zero.
2665 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2666 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2667 // row = 2 right
2668 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 1));
2669 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
2670
2671
2672 //row1 -bottom
2673 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
2674 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
2675 //creating mask 00 for +ve and -ve values and FF for zero.
2676 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2677 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2678 //combining the appropriate sign change
2679 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2680 // row = 2
2681 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2682
2683 //combining sign-left and sign_right
2684 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2685
2686 //storing the row 1 left for next row.
2687 signup0_16x8b = _mm_slli_si128(left_store_16x8b, 14);
2688
2689 //combining sign-left and sign_right
2690 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
2691 //manipulation for bottom - row 1
2692 signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 15);
2693 //eliminating old left for row 0 and row 1
2694 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
2695 //bottom - row1
2696 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
2697 cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
2698 //creating mask 00 for +ve and -ve values and FF for zero.
2699 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2700 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2701 //for the next iteration bottom -row1
2702 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2703 //row1 getting its right for left of next block
2704 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
2705 //adding constant 2
2706 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2707 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2708 //shuffle to get sao index
2709 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2710 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2711 //using availability mask
2712 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2713 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
2714 //shuffle to get sao offset
2715 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2716 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
2717 //row0 getting its right for left of next block
2718 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2719 //copying the next top
2720 src_top_16x8b = src_temp1_16x8b;
2721 //convert to 16 bit, then add, then saturating pack
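                /* The offsets are signed bytes and the pixels unsigned: edge0 is
                   sign-extended by unpacking against its own computed sign mask,
                   the pixels are zero-extended, the sums are formed in 16 bit and
                   _mm_packus_epi16 saturates back to [0,255], matching the scalar
                   CLIP3(pixel + offset, 0, 255) of the 8-bit path. */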
2722 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2723 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2724 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2725 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2726 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2727 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2728 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2729 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2730
2731 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
2732 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
2733 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
2734 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
2735 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2736 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
2737 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
2738 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
2739
2740 //store left boundary
2741 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
2742 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
2743 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2744 // row = 1
2745 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
2746
2747 src_temp0_16x8b = src_bottom_16x8b;
2748 pu1_src_cpy += (src_strd << 1);
2749 pu1_src_left_cpy += 2;
2750 pu1_src_left_str += 2;
2751 }
2752 ht_rem = ht & 0x1;
2753
2754 if(ht_rem)
2755 {
2756 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2757 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
2758 //current row - next row
2759 //separating +ve and -ve values.
2760 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
2761 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
2762 //creating mask 00 for +ve and -ve values and FF for zero.
2763 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2764 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2765 //combining the appropriate sign change
2766 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2767 //adding top and bottom and constant 2
2768 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
2769 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2770 //eliminating old left for row 0 and row 1
2771 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
2772
2773 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2774 //using availability mask
2775 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2776
2777 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
2778
2779 //row0 getting its right for left of next block
2780 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2781 //copying the next top
2782 src_top_16x8b = src_temp0_16x8b;
2783 //convert to 16 bit, then add, then saturating pack
2784 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
2785 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
2786 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
2787 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
2788 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
2789 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
2790 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
2791 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
2792 //store left boundary
2793 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
2794
2795 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
2796 pu1_src_cpy += (src_strd);
2797 pu1_src_left_cpy += 1;
2798 pu1_src_left_str += 1;
2799 }
2800 if(0 == pu1_avail[3])
2801 {
2802 src_top_16x8b = src_bottom_16x8b;
2803 pu1_src_left_str[0] = pu1_src_cpy[15];
2804 }
2805 if(0 == pu1_avail[2])
2806 {
2807 pu1_src_left_str[-ht_0] = pu1_src[15 - src_strd];
2808 }
2809
2810 //for the top left of next part of the block
2811 left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
2812 //updating the top boundary buffer
2813 _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
2814 pu1_src += 16;
2815 au1_mask_cpy += 16;
2816
2817
2818 pu1_left_tmp = pu1_src_left_cpy2;
2819 pu1_src_left_cpy2 = pu1_src_left_str2;
2820 pu1_src_left_str2 = pu1_left_tmp;
2821
2822 pu1_src_left_cpy = pu1_src_left_cpy2;
2823 pu1_src_left_str = pu1_src_left_str2;
2824 }
2825
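        /* Residual width of 8 pixels: four rows are processed per iteration,
           with two 8-pixel rows packed into one 128-bit register (via
           _mm_unpacklo_epi64) so the same 16-lane byte arithmetic applies. */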
2826 wd_rem = wd & 0xF;
2827 if(wd_rem)
2828 {
2829 pu1_src_left_cpy = pu1_src_left_cpy2;
2830 pu1_src_left_str = pu1_src_left_str2;
2831 pu1_src_cpy = pu1_src;
2832 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
2833 //row = 0
2834 src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
2835 src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 15);
2836 au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //load the 8-byte availability mask for this column group
2837 //separating +ve and -ve values.
2838 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
2839 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
2840 //creating mask 00 for +ve and -ve values and FF for zero.
2841 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2842 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2843 //preparing au1_mask
2844 au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
2845 //combining the appropriate sign change
2846 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2847 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
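            /* Park the (row0 - top) signs in the upper half of signup0_16x8b:
               the 8-wide path keeps the sign_up terms of two successive rows in
               the two halves of one register and realigns them with
               _mm_alignr_epi8 inside the loop. */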
2848
2849 for(row = ht; row >= 4; row -= 4)
2850 {
2851 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
2852 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
2853 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
2854 // row = 2
2855 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
2856 //right row1
2857 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
2858 //row 0 - row 1
2859 //separating +ve and -ve values.
2860 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
2861 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
2862 //manipulation for row 1 -row 0
2863 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
2864 //creating mask 00 for +ve and -ve values and FF for zero.
2865 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2866 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2867 //row 0 left
2868 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
2869 //combining the appropriate sign change
2870 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2871 //row 1 - row 0
2872 //separating +ve and -ve values.
2873 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2874 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2875
2876 //creating mask 00 for +ve and -ve values and FF for zero.
2877 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2878 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2879 //row1-row0
2880 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
2881
2882 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
2883
2884 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
2885 //right row2
2886 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
2887 //packing row 0 and row 1
2888 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
2889 //row1 -row2
2890 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2891 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2892 //creating mask 00 for +ve and -ve values and FF for zero.
2893 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2894 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2895 //combining the appropriate sign change
2896 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
2897 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
2898 //manipulation for row 2 -row 1
2899 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
2900 //row 1 left
2901 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
2902 //row = 3
2903 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
2904
2905 // row = 4
2906 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
2907
2908 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
2909
2910 //separating +ve and -ve values. (2,1)
2911 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
2912 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
2913 //manipulation for row 3 -row 2
2914 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
2915 //creating mask 00 for +ve and -ve values and FF for zero.
2916 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2917 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2918 //row 2 left
2919 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
2920 //combining the appropriate sign change
2921 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
2922
2923 //separating +ve and -ve values. (3,2)
2924 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
2925 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
2926 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
2927 //creating mask 00 for +ve and -ve values and FF for zero.
2928 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2929 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2930 //right row3
2931 signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
2932 //combining the appropriate sign change
2933 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
2934
2935 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
2936
2937 //separating +ve and -ve values. (2,3)
2938 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
2939 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
2940 //right row 4
2941 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
2942 //creating mask 00 for +ve and -ve values and FF for zero.
2943 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2944 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2945 //combining the appropriate sign change
2946 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
2947
2948 //separating +ve and -ve values. (3,bottom)
2949 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
2950 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
2951
2952 //creating mask 00 for +ve and -ve values and FF for zero.
2953 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2954 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2955 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
2956 //combining the appropriate sign change
2957 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
2958 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
2959
2960 //manipulation for bottom -row 3
2961 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
2962 //eliminating old left for row 0,1,2,3
2963 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
2964 //packing row 2 and row 3
2965 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
2966 //row 3 left
2967 signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
2968 //loading row 3 right into left
2969 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
2970 //adding bottom and top values of row 2 and row 3
2971 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
2972 //separating +ve and -ve values. (bottom,3)
2973 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
2974 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
2975 //to store right of row 2
2976 signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
2977 //creating mask 00 for +ve and -ve values and FF for zero.
2978 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
2979 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
2980 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
2981
2982 //storing right of row 2 into left
2983 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
2984 //to store right of row 0
2985 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
2986 //storing right of row 1 into left
2987 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
2988
2989 //adding constant 2
2990 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
2991 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
2992 //shuffle to get sao index
2993 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
2994 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
2995 //using availability mask
2996 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
2997 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
2998 //shuffle to get sao offset
2999 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3000 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3001
3002 //storing right of row 0 into left
3003 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3004 //convert to 16 bit, then add, then saturating pack
3005 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3006 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3007 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3008 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3009 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3010 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3011 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3012 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3013
3014 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3015 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
3016 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3017 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
3018 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3019 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3020 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
3021 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
3022
3023 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3024 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
3025
3026 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3027 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3028 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3029 // row = 1
3030 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3031 //row = 2
3032 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
3033 // row = 3
3034 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
3035
3036 src_temp0_16x8b = src_temp1_16x8b;
3037 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3038 pu1_src_cpy += (src_strd << 2);
3039 pu1_src_left_cpy += 4;
3040 pu1_src_left_str += 4;
3041 }
3042 ht_rem = ht & 0x2;
3043 if(ht_rem)
3044 {
3045 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3046 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3047 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3048 // row = 2
3049 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3050
3051 //row 0 - row 1
3052 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
3053 //separating +ve and -ve values.
3054 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3055 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3056 //manipulation for row 1 -row 0
3057 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 15);
3058 //creating mask 00 for +ve and -ve values and FF for zero.
3059 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3060 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3061 //manipulation for row 1 - row 0
3062 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 15);
3063 //combining the appropriate sign change
3064 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3065
3066 //row1-row0
3067 //separating +ve and -ve values.
3068 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3069 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3070
3071 //creating mask 00 for +ve and -ve values and FF for zero.
3072 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3073 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3074 //combining the appropriate sign change
3075 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3076 //row 1 -bottom
3077 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
3078
3079 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3080 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3081 //row1 -bottom
3082 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3083 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3084
3085 //creating mask 00 for +ve and -ve values and FF for zero.
3086 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3087 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3088 //combining the appropriate sign change
3089 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3090 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3091 //manipulation for bottom -row1
3092 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3093 //manipulation for bottom- row 1
3094 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
3095 //adding top and down subtraction
3096 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3097 //bottom - row 1
3098 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3099 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3100
3101 //eliminating old left for row 0,1
3102 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3103 signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
3104 //creating mask 00 for +ve and -ve values and FF for zero.
3105 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3106 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3107 //for the next iteration signup0_16x8b
3108 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
3109
3110 //storing right of row 1 into left
3111 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3112 //for storing right of row 1
3113 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3114
3115 src_top_16x8b = src_temp1_16x8b;
3116 //storing right of row 0 into left
3117 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3118
3119 //adding constant 2
3120 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3121
3122 //shuffle to get sao index
3123 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3124 //using availability mask
3125 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3126 //shuffle to get sao offset
3127 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3128
3129 //the next top already in src_top_16x8b
3130 //convert to 16 bit, then add, then saturating pack
3131 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3132 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3133 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3134 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3135 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3136 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3137 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
3138 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
3139
3140 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3141
3142 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3143 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3144 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3145 // row = 1
3146 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3147 src_temp0_16x8b = src_bottom_16x8b;
3148 pu1_src_cpy += (src_strd << 1);
3149 pu1_src_left_cpy += 2;
3150 pu1_src_left_str += 2;
3151 }
3152 ht_rem = ht & 0x1;
3153 if(ht_rem)
3154 {
3155 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3156 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3157 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3158 //left store manipulation 1
3159 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
3160 //row 0 - row 1
3161 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
3162 //separating +ve and -ve values.
3163 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3164 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3165 //creating mask 00 for +ve and -ve values and FF for zero.
3166 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3167 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3168 //combining the appropriate sign change
3169 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3170 //adding top and down subtraction
3171 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3172 //for row 0 right to put into left store
3173 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3174 //adding constant 2
3175 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3176 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
3177 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
3178 //filling the left boundary value
3179 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
3180
3181 //shuffle to get sao index
3182 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3183 //using availability mask
3184 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3185 //shuffle to get sao offset
3186 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3187 src_top_16x8b = src_temp0_16x8b;
3188 //convert to 16 bit, then add, then saturating pack
3189 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3190 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3191 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3192 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3193 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
3194
3195 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3196 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3197 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3198 pu1_src_cpy += (src_strd);
3199 pu1_src_left_cpy += 1;
3200 pu1_src_left_str += 1;
3201 }
3202 if(0 == pu1_avail[3])
3203 {
3204 src_top_16x8b = src_bottom_16x8b;
3205 pu1_src_left_str[0] = pu1_src_cpy[7];
3206 }
3207
3208 if(0 == pu1_avail[2])
3209 {
3210 pu1_src_left_str[-ht_0] = pu1_src[7 - src_strd];
3211 }
3212
3213 _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
3214 pu1_src += 8;
3215 au1_mask_cpy += 16;
3216
3217 pu1_left_tmp = pu1_src_left_cpy2;
3218 pu1_src_left_cpy2 = pu1_src_left_str2;
3219 pu1_src_left_str2 = pu1_left_tmp;
3220
3221 pu1_src_left_cpy = pu1_src_left_cpy2;
3222 pu1_src_left_str = pu1_src_left_str2;
3223 }
3224 pu1_src_org[0] = u1_pos_0_0_tmp;
3225 pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp;
3226 pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
3227 for(row = 0; row < ht_tmp; row++)
3228 {
3229 pu1_src_left[row] = pu1_src_left_cpy[row];
3230 }
3231 }
3232
3233 }
3234
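/* The chroma variant below processes interleaved UV samples: diagonal
 * neighbours sit 2 bytes away instead of 1, the left-boundary scratch buffers
 * carry 2 bytes per row, and one offset register serves both planes (see the
 * chroma_offset_8x16b note inside the function). */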
3235 /* 135 degree filtering */
3236 void ihevc_sao_edge_offset_class2_chroma_ssse3(UWORD8 *pu1_src,
3237 WORD32 src_strd,
3238 UWORD8 *pu1_src_left,
3239 UWORD8 *pu1_src_top,
3240 UWORD8 *pu1_src_top_left,
3241 UWORD8 *pu1_src_top_right,
3242 UWORD8 *pu1_src_bot_left,
3243 UWORD8 *pu1_avail,
3244 WORD8 *pi1_sao_offset_u,
3245 WORD8 *pi1_sao_offset_v,
3246 WORD32 wd,
3247 WORD32 ht)
3248 {
3249 WORD32 row, col;
3250 UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
3251 UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
3252 UWORD8 *pu1_firstleft;
3253 UWORD8 *pu1_src_cpy, *pu1_src_org;
3254 UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
3255 UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
3256 UWORD8 au1_src_left_tmp1[2 * (MAX_CTB_SIZE + 8)];
3257 WORD32 wd_rem;
3258 UWORD8 u1_pos_0_0_tmp_u, u1_pos_0_0_tmp_v, u1_pos_wd_ht_tmp_u, u1_pos_wd_ht_tmp_v;
3259 WORD32 ht_tmp;
3260 WORD32 ht_0;
3261
3262 WORD32 bit_depth;
3263 UWORD8 u1_avail0, u1_avail1;
3264
3265 __m128i src_temp0_16x8b, src_temp1_16x8b;
3266 __m128i signup0_16x8b, signdwn1_16x8b;
3267 __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
3268 __m128i edge0_16x8b, edge1_16x8b;
3269 __m128i src_top_16x8b, src_bottom_16x8b;
3270 __m128i au1_mask8x16b;
3271 __m128i edge_idx_8x16b, sao_offset_8x16b;
3272 __m128i const2_16x8b, const0_16x8b;
3273 __m128i left_store_16x8b;
3274 __m128i chroma_offset_8x16b;
3275
3276 UNUSED(pu1_src_top_right);
3277 UNUSED(pu1_src_bot_left);
3278
3279 ht_0 = ht; ht_tmp = ht;
3280 au1_mask8x16b = _mm_set1_epi8(0xff);
3281 /* Updating left and top-left */
3282 for(row = 0; row < 2 * ht; row++)
3283 {
3284 au1_src_left_tmp[row] = pu1_src_left[row];
3285 }
3286 //setting availability mask to 0xFF for MAX_CTB_SIZE bytes
3287 for(col = 0; col < MAX_CTB_SIZE; col += 16)
3288 _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
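    /* Note: BIT_DEPTH_LUMA is also used for the chroma clip range below; in
       8-bit builds the luma and chroma bit depths coincide, so
       (1 << bit_depth) - 1 evaluates to 255 either way. */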
3289 bit_depth = BIT_DEPTH_LUMA;
3290 pu1_src_org = pu1_src;
3291 pu1_src_top_cpy = pu1_src_top;
3292 pu1_src_left_cpy2 = au1_src_left_tmp;
3293 pu1_src_left_cpy = au1_src_left_tmp;
3294 pu1_src_left_str2 = au1_src_left_tmp1;
3295 pu1_src_left_str = au1_src_left_tmp1;
3296 edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
3297 sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
3298 const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
3299 chroma_offset_8x16b = _mm_set1_epi16(0x0800);
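    /* pi1_sao_offset_u is packed into the low 8 bytes of sao_offset_8x16b and
       pi1_sao_offset_v into the high 8 bytes (_mm_unpacklo_epi64 below).
       _mm_set1_epi16(0x0800) is the byte pattern 00,08,00,08,...: adding it to
       the edge indices leaves the even (U) lanes untouched and biases the odd
       (V) lanes by 8, so a single _mm_shuffle_epi8 fetches U and V offsets from
       the correct halves at once. */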
3300
3301 /* If top-left is available, process separately */
3302 if(0 != pu1_avail[4])
3303 {
3304 WORD32 edge_idx;
3305
3306 /* U */
3307 edge_idx = 2 + SIGN(pu1_src[0] - pu1_src_top_left[0]) +
3308 SIGN(pu1_src[0] - pu1_src[2 + src_strd]);
3309
3310 edge_idx = gi1_table_edge_idx[edge_idx];
3311
3312 if(0 != edge_idx)
3313 {
3314 u1_pos_0_0_tmp_u = CLIP3(pu1_src[0] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
3315 }
3316 else
3317 {
3318 u1_pos_0_0_tmp_u = pu1_src[0];
3319 }
3320
3321 /* V */
3322 edge_idx = 2 + SIGN(pu1_src[1] - pu1_src_top_left[1]) +
3323 SIGN(pu1_src[1] - pu1_src[1 + 2 + src_strd]);
3324
3325 edge_idx = gi1_table_edge_idx[edge_idx];
3326
3327 if(0 != edge_idx)
3328 {
3329 u1_pos_0_0_tmp_v = CLIP3(pu1_src[1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
3330 }
3331 else
3332 {
3333 u1_pos_0_0_tmp_v = pu1_src[1];
3334 }
3335 }
3336 else
3337 {
3338 u1_pos_0_0_tmp_u = pu1_src[0];
3339 u1_pos_0_0_tmp_v = pu1_src[1];
3340 }
3341
3342 /* If bottom-right is available, process separately */
3343 if(0 != pu1_avail[7])
3344 {
3345 WORD32 edge_idx;
3346
3347 /* U */
3348 edge_idx = 2 + SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd - 2 - src_strd]) +
3349 SIGN(pu1_src[wd - 2 + (ht - 1) * src_strd] - pu1_src[wd - 2 + (ht - 1) * src_strd + 2 + src_strd]);
3350
3351 edge_idx = gi1_table_edge_idx[edge_idx];
3352
3353 if(0 != edge_idx)
3354 {
3355 u1_pos_wd_ht_tmp_u = CLIP3(pu1_src[wd - 2 + (ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
3356 }
3357 else
3358 {
3359 u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
3360 }
3361
3362 /* V */
3363 edge_idx = 2 + SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd - 2 - src_strd]) +
3364 SIGN(pu1_src[wd - 1 + (ht - 1) * src_strd] - pu1_src[wd - 1 + (ht - 1) * src_strd + 2 + src_strd]);
3365
3366 edge_idx = gi1_table_edge_idx[edge_idx];
3367
3368 if(0 != edge_idx)
3369 {
3370 u1_pos_wd_ht_tmp_v = CLIP3(pu1_src[wd - 1 + (ht - 1) * src_strd] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
3371 }
3372 else
3373 {
3374 u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
3375 }
3376 }
3377 else
3378 {
3379 u1_pos_wd_ht_tmp_u = pu1_src[wd - 2 + (ht - 1) * src_strd];
3380 u1_pos_wd_ht_tmp_v = pu1_src[wd - 1 + (ht - 1) * src_strd];
3381 }
3382 pu1_firstleft = pu1_src_top_left;
3383
3384 /* Update height and source pointers based on the availability flags */
3385 if(0 == pu1_avail[2])
3386 {
3387 pu1_firstleft = pu1_src_left_cpy2;
3388 pu1_src_left_cpy2 += 2;
3389 pu1_src_left_str2 += 2;
3390 pu1_src_top_cpy = pu1_src;
3391 pu1_src += src_strd;
3392 ht--;
3393 }
3394 if(0 == pu1_avail[3])
3395 {
3396 ht--;
3397 ht_0--;
3398 }
3399 //storing top left in an xmm register
3400 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_firstleft);
3401 sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
3402 const2_16x8b = _mm_set1_epi8(2);
3403 const0_16x8b = _mm_setzero_si128();
3404 left_store_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3405
3406 //availability mask creation
3407 u1_avail0 = pu1_avail[0];
3408 u1_avail1 = pu1_avail[1];
3409 au1_mask[0] = u1_avail0;
3410 au1_mask[1] = u1_avail0;
3411 au1_mask[wd - 1] = u1_avail1;
3412 au1_mask[wd - 2] = u1_avail1;
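    /* For interleaved chroma the first and last samples each span two bytes
       (U and V), so both bytes of the pair take the same availability flag. */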
3413
3414 /* top-left arrays */
3415 pu1_src_top_left[0] = pu1_src_top[wd - 2];
3416 pu1_src_top_left[1] = pu1_src_top[wd - 1];
3417 {
3418 WORD32 ht_rem;
3419 au1_mask_cpy = au1_mask;
3420
3421 pu1_src_left_cpy = pu1_src_left_cpy2;
3422 pu1_src_left_str = pu1_src_left_str2;
3423 for(col = wd; col >= 16; col -= 16)
3424 {
3425 pu1_src_cpy = pu1_src;
3426 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
3427 //row = 0
3428 src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
3429 src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
3430 //loading the mask
3431 au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
3432 //separating +ve and -ve values.
3433 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
3434 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
3435 //creating mask 00 for +ve and -ve values and FF for zero.
3436 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3437 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3438 //combining the appropriate sign change
3439 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3440
3441
3442 for(row = ht; row >= 2; row -= 2)
3443 {
3444 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3445 //row = 1
3446 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3447 // row = 1 right
3448 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
3449 //to insert left in row 0
3450 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3451 //row 0 - row 1
3452 //separating +ve and -ve values.
3453 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
3454 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
3455
3456 //creating mask 00 for +ve and -ve values and FF for zero.
3457 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3458 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3459 //manipulation for row 1 - row 0
3460 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3461 //combining the appropriate sign change
3462 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
3463 //row1-row0
3464 //separating +ve and -ve values.
3465 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3466 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3467 //creating mask 00 for +ve and -ve values and FF for zero.
3468 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3469 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3470 // row = 2 right
3471 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd + 2));
3472 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
3473
3474
3475 //row1 -bottom
3476 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
3477 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
3478 //creating mask 00 for +ve and -ve values and FF for zero.
3479 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3480 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3481 //combining the appropriate sign change
3482 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3483 // row = 2
3484 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3485
3486 //combining sign-left and sign_right
3487 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3488
3489 //storing the row 1 left for next row.
3490 signup0_16x8b = _mm_slli_si128(left_store_16x8b, 12);
3491
3492 //combining sign-left and sign_right
3493 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
3494 //manipulation for bottom - row 1
3495 signup0_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signup0_16x8b, 14);
3496 //eliminating old left for row 0 and row 1
3497 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
3498 //bottom - row1
3499 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signup0_16x8b);
3500 cmp_lt0_16x8b = _mm_subs_epu8(signup0_16x8b, src_bottom_16x8b);
3501 //creating mask 00 for +ve and -ve values and FF for zero.
3502 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3503 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3504 //for the next iteration bottom -row1
3505 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3506 //row1 getting its right for left of next iteration
3507 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
3508 //copying the next top
3509 src_top_16x8b = src_temp1_16x8b;
3510 //row0 getting its right for left of next iteration.
3511 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3512
3513
3514 //adding constant 2
3515 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3516 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
3517 //shuffle to get sao index
3518 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3519 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
3520 //using availability mask
3521 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3522 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
3523 //adding chroma offset to access U and V
3524 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3525 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
3526
3527
3528 //shuffle to get sao offset
3529 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3530 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3531 //convert to 16 bit, then add, then saturating pack
3532 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3533 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3534 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3535 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3536 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3537 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3538 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3539 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3540
3541 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3542 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3543 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3544 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
3545 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3546 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3547 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
3548 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
3549
3550 //store left boundary
3551 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3552 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3553 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3554 // row = 1
3555 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
3556
3557 src_temp0_16x8b = src_bottom_16x8b;
3558 pu1_src_cpy += (src_strd << 1);
3559 pu1_src_left_cpy += 4;
3560 pu1_src_left_str += 4;
3561 }
3562 ht_rem = ht & 0x1;
3563
3564 if(ht_rem)
3565 {
3566 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3567 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
3568 //current row - next row
3569 //separating +ve and -ve values.
3570 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_bottom_16x8b);
3571 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp0_16x8b);
3572 //creating mask 00 for +ve and -ve values and FF for zero.
3573 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3574 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3575 //combining the appropriate sign change
3576 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3577 //adding top and bottom and constant 2
3578 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3579 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3580
3581 //eliminating old left for row 0 and row 1
3582 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3583 //copying the next top
3584 src_top_16x8b = src_temp0_16x8b;
3585 //row0 getting its right for left of next block
3586 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3587
3588 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3589 //using availability mask
3590 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3591 //adding chroma offset to access U and V
3592 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3593
3594 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3595
3596 //convert to 16 bit, then add, then saturating pack
3597 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3598 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3599 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3600 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3601 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3602 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3603 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3604 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3605
3606 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3607
3608 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3609 pu1_src_cpy += (src_strd);
3610 pu1_src_left_cpy += 2;
3611 pu1_src_left_str += 2;
3612 }
3613 if(0 == pu1_avail[3])
3614 {
3615 src_top_16x8b = src_bottom_16x8b;
3616 pu1_src_left_str[1] = pu1_src_cpy[15];
3617 pu1_src_left_str[0] = pu1_src_cpy[14];
3618 }
3619 if(0 == pu1_avail[2])
3620 {
3621 pu1_src_left_str[-2 * ht_0] = pu1_src[14 - src_strd];
3622 pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[15 - src_strd];
3623 }
3624
3625 //for the top left of next part of the block
3626 left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
3627 //updating the top boundary buffer
3628 _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
3629 pu1_src += 16;
3630 au1_mask_cpy += 16;
3631
3632 pu1_left_tmp = pu1_src_left_cpy2;
3633 pu1_src_left_cpy2 = pu1_src_left_str2;
3634 pu1_src_left_str2 = pu1_left_tmp;
3635
3636 pu1_src_left_cpy = pu1_src_left_cpy2;
3637 pu1_src_left_str = pu1_src_left_str2;
3638 }
3639 wd_rem = wd & 0xF;
3640 if(wd_rem)
3641 {
3642 pu1_src_left_cpy = pu1_src_left_cpy2;
3643 pu1_src_left_str = pu1_src_left_str2;
3644 pu1_src_cpy = pu1_src;
3645 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col));
3646 //row = 0
3647 src_temp0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_cpy));
3648 src_top_16x8b = _mm_alignr_epi8(src_top_16x8b, left_store_16x8b, 14);
3649 au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //load the 8-byte availability mask for this column group
3650 //separating +ve and -ve values.
3651 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
3652 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
3653 //creating mask 00 for +ve and -ve values and FF for zero.
3654 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3655 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3656 //preparing au1_mask
3657 au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
3658 //combining the appropriate sign change
3659 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3660 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3661
3662 for(row = ht; row >= 4; row -= 4)
3663 {
3664 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3665 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3666 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3667 // row = 2
3668 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3669 //right row1
3670 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3671 //row 0 - row 1
3672 //separating +ve and -ve values.
3673 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3674 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3675 //manipulation for row 1 -row 0
3676 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3677 //creating mask 00 for +ve and -ve values and FF for zero.
3678 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3679 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3680 //row 0 left
3681 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3682 //combining the appropriate sign change
3683 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3684 //row 1 - row 0
3685 //separating +ve and -ve values.
3686 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3687 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3688
3689 //creating mask 00 for +ve and -ve values and FF for zero.
3690 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3691 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3692 //row1-row0
3693 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3694
3695 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3696
3697 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3698 //right row2
3699 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3700 //packing row 0 and row 1
3701 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
3702 //row1 -row2
3703 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3704 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3705 //creating mask 00 for +ve and -ve values and FF for zero.
3706 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3707 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3708 //combining the appropriate sign change
3709 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3710 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3711 //manipulation for row 2 -row 1
3712 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
3713 //row 1 left
3714 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
3715 //row = 3
3716 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
3717
3718 // row = 4
3719 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
3720
3721 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3722
3723 //separating +ve and -ve values. (2,1)
3724 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3725 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3726 //manipulation for row 3 -row 2
3727 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
3728 //creating mask 00 for +ve and -ve values and FF for zero.
3729 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3730 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3731 //row 2 left
3732 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
3733 //combining the appropriate sign change
3734 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
3735
3736 //separating +ve and -ve values. (3,2)
3737 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
3738 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
3739 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
3740 //creating mask 00 for +ve and -ve values and FF for zero.
3741 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3742 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3743 //right row3
3744 signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
3745 //combining the appropriate sign change
3746 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
3747
3748 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
3749
3750 //separating +ve and -ve values. (2,3)
3751 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3752 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3753 //right row 4
3754 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3755 //creating mask 00 for +ve and -ve values and FF for zero.
3756 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3757 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3758 //combining the appropriate sign change
3759 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
3760
3761 //separating +ve and -ve values. (3,bottom)
3762 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
3763 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
3764
3765 //creating mask 00 for +ve and -ve values and FF for zero.
3766 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3767 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3768 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
3769 //combining the appropriate sign change
3770 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
3771 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
3772
3773 //manipulation for bottom -row 3
3774 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 8);
3775 //eliminating old left for row 0,1,2,3
3776 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
3777 //packing row 2 and row 3
3778 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
3779 //row 3 left
3780 signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
3781
3782 //adding bottom and top values of row 2 and row 3
3783 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
3784 //separating +ve and -ve values. (bottom,3)
3785 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3786 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3787
3788 //creating mask 00 for +ve and -ve values and FF for zero.
3789 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3790 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3791 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
3792
3793 //to store right of row 2
3794 signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
3795 //loading row 3 right into left
3796 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
3797 //storing right of row 2 into left
3798 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3799 //to store right of row 0
3800 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3801 //storing right of row 1 into left
3802 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
3803 //storing right of row 0 into left
3804 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
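/* left_store_16x8b now holds the rightmost UV pair of rows 0..3; once the
 * cpy/str buffers are swapped below, these become the left neighbours for
 * the next 8-wide column block. */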
3805
3806 //adding constant 2
3807 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3808 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
3809 //shuffle to get sao index
3810 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3811 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
3812 //using availability mask
3813 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3814 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
3815
3816 //adding chroma offset to access U and V
3817 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3818 edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
3819
3820 //shuffle to get sao offset
3821 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3822 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
3823 //convert to 16 bit, add the offset, then pack with unsigned saturation
3824 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3825 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3826 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3827 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
3828 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3829 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3830 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
3831 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
3832
3833 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
3834 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
3835 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
3836 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
3837 edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
3838 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
3839 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
3840 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
3841
3842 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3843 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
3844
3845
3846 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3847 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3848 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3849 // row = 1
3850 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3851 //row = 2
3852 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
3853 // row = 3
3854 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
3855
3856 src_temp0_16x8b = src_temp1_16x8b;
3857 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
3858 pu1_src_cpy += (src_strd << 2);
3859 pu1_src_left_cpy += 8;
3860 pu1_src_left_str += 8;
3861 }
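/* ht need not be a multiple of 4 (the availability checks can trim the top
 * and bottom rows), so mop up two remaining rows, then one. */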
3862 ht_rem = ht & 0x2;
3863 if(ht_rem)
3864 {
3865 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3866 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3867 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3868 // row = 2
3869 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
3870
3871 //row 0 -row 1
3872 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
3873 //separating +ve and -ve values.
3874 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3875 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3876 //manipulation for row 1 -row 0
3877 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
3878 //creating mask 00 for +ve and -ve values and FF for zero.
3879 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3880 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3881 //manipulation for row 1 - row 0
3882 signdwn1_16x8b = _mm_alignr_epi8(src_temp0_16x8b, signdwn1_16x8b, 14);
3883 //combining the appropriate sign change
3884 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3885
3886 //row1-row0
3887 //separating +ve and -ve values.
3888 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3889 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3890
3891 //creating mask 00 for +ve and -ve values and FF for zero.
3892 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3893 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3894 //combining the appropriate sign change
3895 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3896 //row 1 -bottom
3897 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3898
3899 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
3900 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
3901 //row1 -bottom
3902 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
3903 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
3904
3905 //creating mask 00 for +ve and -ve values and FF for zero.
3906 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3907 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3908 //combining the appropriate sign change
3909 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
3910 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
3911 //manipulation for bottom -row1
3912 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
3913 //eliminating old left for row 0,1
3914 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
3915 //manipulation for bottom- row 1
3916 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
3917 //adding the up and down sign terms
3918 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
3919 //bottom - row 1
3920 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
3921 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
3922
3923 //shifting row 1
3924 signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
3925 //creating mask 00 for +ve and -ve values and FF for zero.
3926 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3927 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3928 //for the next iteration signup0_16x8b
3929 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
3930 //storing right of row 1 into left
3931 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14); //for storing right of row 0
3932 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3933 //the next top in src_top_16x8b
3934 src_top_16x8b = src_temp1_16x8b;
3935 //storing right of row 0 into left
3936 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
3937
3938
3939 //adding constant 2
3940 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
3941
3942 //shuffle to get sao index
3943 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
3944 //using availability mask
3945 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
3946
3947 //adding chroma offset to access U and V
3948 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
3949
3950 //shuffle to get sao offset
3951 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
3952 //the next top already in src_top_16x8b
3953 //convert to 16 bit, add the offset, then pack with unsigned saturation
3954 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
3955 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
3956 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
3957 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
3958 edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
3959 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
3960 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
3961 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
3962
3963 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
3964
3965 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
3966 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
3967 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
3968 // row = 1
3969 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
3970 src_temp0_16x8b = src_bottom_16x8b;
3971 pu1_src_cpy += (src_strd << 1);
3972 pu1_src_left_cpy += 4;
3973 pu1_src_left_str += 4;
3974 }
3975 ht_rem = ht & 0x1;
3976 if(ht_rem)
3977 {
3978 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
3979 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
3980 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
3981
3982 //row 0 -row1
3983 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
3984 //separating +ve and -ve values.
3985 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
3986 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
3987 //creating mask 00 for +ve and -ve values and FF for zero.
3988 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
3989 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
3990 //combining the appropriate sign change
3991 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
3992 //adding the up and down sign terms
3993 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
3994
3995 //for row 0 right to put into left store
3996 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
3997 //left store manipulation 1
3998 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
3999 src_top_16x8b = src_temp0_16x8b;
4000 //filling the left boundary value
4001 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
4002
4003 //adding constant 2
4004 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4005 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
4006 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
4007
4008
4009 //shuffle to get sao index
4010 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4011 //using availability mask
4012 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4013 //adding chroma offset to access U and V
4014 edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
4015
4016 //shuffle to get sao offset
4017 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4018
4019 //convert to 16 bit, add the offset, then pack with unsigned saturation
4020 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4021 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4022 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4023 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4024 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
4025
4026 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4027 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4028 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4029 pu1_src_cpy += (src_strd);
4030 pu1_src_left_cpy += 2;
4031 pu1_src_left_str += 2;
4032 }
4033 if(0 == pu1_avail[3])
4034 {
4035 src_top_16x8b = src_bottom_16x8b;
4036 pu1_src_left_str[1] = pu1_src_cpy[7];
4037 pu1_src_left_str[0] = pu1_src_cpy[6];
4038 }
4039
4040 if(0 == pu1_avail[2])
4041 {
4042 pu1_src_left_str[-2 * ht_0] = pu1_src[6 - src_strd];
4043 pu1_src_left_str[-2 * ht_0 + 1] = pu1_src[7 - src_strd];
4044 }
4045
4046 _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4047 pu1_src += 8;
4048
4049 pu1_left_tmp = pu1_src_left_cpy2;
4050 pu1_src_left_cpy2 = pu1_src_left_str2;
4051 pu1_src_left_str2 = pu1_left_tmp;
4052
4053 pu1_src_left_cpy = pu1_src_left_cpy2;
4054 pu1_src_left_str = pu1_src_left_str2;
4055 }
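/* Restore the corner samples that were evaluated scalar-wise before the
 * SIMD loops (the vector path overwrote those positions), then copy the
 * saved left-column history back for the caller. */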
4056 pu1_src_org[0] = u1_pos_0_0_tmp_u;
4057 pu1_src_org[1] = u1_pos_0_0_tmp_v;
4058 pu1_src_org[wd - 2 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_u;
4059 pu1_src_org[wd - 1 + (ht_tmp - 1) * src_strd] = u1_pos_wd_ht_tmp_v;
4060 pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 2) : pu1_src_left_cpy;
4061 for(row = 0; row < 2 * ht_tmp; row++)
4062 {
4063 pu1_src_left[row] = pu1_src_left_cpy[row];
4064 }
4065 }
4066
4067 }
4068
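/*
 * For reference, a scalar sketch (illustrative only, assuming 8-bit samples)
 * of the per-pixel operation that the class 3 (45 degree) kernels below
 * vectorize; for this class the neighbours are top-right and bottom-left:
 *
 *     WORD32 sign_up  = SIGN(cur - pu1_src[1 - src_strd]);
 *     WORD32 sign_dwn = SIGN(cur - pu1_src[src_strd - 1]);
 *     WORD32 edge_idx = gi1_table_edge_idx[2 + sign_up + sign_dwn];
 *     if(0 != edge_idx)
 *         out = CLIP3(cur + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
 *     else
 *         out = cur;
 */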
4069 void ihevc_sao_edge_offset_class3_ssse3(UWORD8 *pu1_src,
4070 WORD32 src_strd,
4071 UWORD8 *pu1_src_left,
4072 UWORD8 *pu1_src_top,
4073 UWORD8 *pu1_src_top_left,
4074 UWORD8 *pu1_src_top_right,
4075 UWORD8 *pu1_src_bot_left,
4076 UWORD8 *pu1_avail,
4077 WORD8 *pi1_sao_offset,
4078 WORD32 wd,
4079 WORD32 ht)
4080 {
4081 WORD32 row, col;
4082 UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
4083 UWORD8 *pu1_left_tmp, *pu1_src_left_str, *pu1_src_left_str2;
4084 UWORD8 *pu1_src_cpy, *pu1_src_org;
4085 UWORD8 au1_src_left_tmp[MAX_CTB_SIZE + 8];
4086 UWORD8 au1_src_left_tmp1[MAX_CTB_SIZE + 8];
4087 UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
4088 WORD32 wd_rem;
4089 UWORD8 u1_pos_wd_0_tmp, u1_pos_0_ht_tmp;
4090 WORD32 ht_tmp;
4091 WORD32 bit_depth;
4092 UWORD8 u1_avail0, u1_avail1;
4093
4094 __m128i src_top_16x8b, src_bottom_16x8b;
4095 __m128i src_temp0_16x8b, src_temp1_16x8b;
4096 __m128i signup0_16x8b, signdwn1_16x8b;
4097 __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
4098 __m128i edge0_16x8b, edge1_16x8b;
4099 __m128i au1_mask8x16b;
4100 __m128i edge_idx_8x16b, sao_offset_8x16b;
4101 __m128i const2_16x8b, const0_16x8b;
4102 __m128i left_store_16x8b;
4103
4104 ht_tmp = ht;
4105 au1_mask8x16b = _mm_set1_epi8(0xff);
4106
4107 au1_src_left_tmp[0] = pu1_src[(wd - 1)];
4108 //manipulation for bottom left
4109 for(row = 1; row < ht; row++)
4110 {
4111 au1_src_left_tmp[row] = pu1_src_left[row];
4112 }
4113 au1_src_left_tmp[ht] = pu1_src_bot_left[0];
4114
4115 *pu1_src_top_left = pu1_src_top[wd - 1];
4116 //setting availability mask to 0xFF for a size of MAX_CTB_SIZE
4117 for(col = 0; col < MAX_CTB_SIZE; col += 16)
4118 _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
4119 bit_depth = BIT_DEPTH_LUMA;
4120 pu1_src_org = pu1_src;
4121 pu1_src_top_cpy = pu1_src_top;
4122 pu1_src_left_cpy2 = au1_src_left_tmp;
4123 pu1_src_left_cpy = au1_src_left_tmp;
4124 pu1_src_left_str2 = au1_src_left_tmp1;
4125 pu1_src_left_str = au1_src_left_tmp1;
4126 edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
4127 sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset);
4128
4129 /* If top-right is available, process separately */
4130 if(0 != pu1_avail[5])
4131 {
4132 WORD32 edge_idx;
4133
4134 edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[0]) +
4135 SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 1 + src_strd]);
4136
4137 edge_idx = gi1_table_edge_idx[edge_idx];
4138
4139 if(0 != edge_idx)
4140 {
4141 u1_pos_wd_0_tmp = CLIP3(pu1_src[wd - 1] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
4142 }
4143 else
4144 {
4145 u1_pos_wd_0_tmp = pu1_src[wd - 1];
4146 }
4147 }
4148 else
4149 {
4150 u1_pos_wd_0_tmp = pu1_src[wd - 1];
4151 }
4152
4153 /* If bottom-left is available, process separately */
4154 if(0 != pu1_avail[6])
4155 {
4156 WORD32 edge_idx;
4157
4158 edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 1 - src_strd]) +
4159 SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
4160
4161 edge_idx = gi1_table_edge_idx[edge_idx];
4162
4163 if(0 != edge_idx)
4164 {
4165 u1_pos_0_ht_tmp = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset[edge_idx], 0, (1 << bit_depth) - 1);
4166 }
4167 else
4168 {
4169 u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
4170 }
4171 }
4172 else
4173 {
4174 u1_pos_0_ht_tmp = pu1_src[(ht - 1) * src_strd];
4175 }
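/* The corner results are written back only after the main loops (see the
 * stores to pu1_src_org at the end), since the vectorized path below never
 * reads pu1_src_top_right/pu1_src_bot_left and would otherwise clobber
 * these two samples. */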
4176
4177
4178
4179 /* Update height and source pointers based on the availability flags */
4180 if(0 == pu1_avail[2])
4181 {
4182 pu1_src_left_cpy2++;
4183 pu1_src_left_str2++;
4184 pu1_src_top_cpy = pu1_src;
4185 pu1_src += src_strd;
4186 ht--;
4187 }
4188 if(0 == pu1_avail[3])
4189 {
4190 ht--;
4191 }
4192
4193
4194 const2_16x8b = _mm_set1_epi8(2);
4195 const0_16x8b = _mm_setzero_si128();
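/* Per-pixel sign sums fall in [-2, 2]; adding const2 biases them to [0, 4]
 * so they serve directly as byte indices into gi1_table_edge_idx via
 * _mm_shuffle_epi8. */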
4196
4197
4198 //availability mask creation
4199 u1_avail0 = pu1_avail[0];
4200 u1_avail1 = pu1_avail[1];
4201 au1_mask[0] = u1_avail0;
4202 au1_mask[wd - 1] = u1_avail1;
4203 {
4204 WORD32 ht_rem;
4205
4206 pu1_src_left_cpy = pu1_src_left_cpy2;
4207 pu1_src_left_str = pu1_src_left_str2;
4208 au1_mask_cpy = au1_mask;
4209 for(col = wd; col >= 16; col -= 16)
4210 {
4211 pu1_src_cpy = pu1_src;
4212 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 1));
4213 //row = 0
4214 src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
4215
4216 //loading the mask
4217 au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
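/* Branchless per-byte SIGN(a - b): _mm_subs_epu8 saturates the negative
 * direction to zero, _mm_cmpeq_epi8 against zero turns each direction into
 * a 0x00/0xFF mask, and subtracting the masks (0xFF == -1) leaves -1, 0 or
 * +1 in every lane. Here this yields sign(src - top_right) for 16 pixels
 * at once. */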
4218 //separating +ve and -ve values.
4219 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
4220 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
4221 //creating mask 00 for +ve and -ve values and FF for zero.
4222 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4223 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4224 //combining the appropriate sign change
4225 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4226
4227 for(row = ht; row >= 2; row -= 2)
4228 {
4229 left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
4230 //row = 1
4231 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4232 //to insert left in row 1
4233 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4234 // row = 0 right
4235 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 1));
4236
4237 //manipulation for row 1 - row 0
4238 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4239 //row 0 -row1
4240 //separating +ve and -ve values.
4241 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4242 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4243
4244 //creating mask 00 for +ve and -ve values and FF for zero.
4245 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4246 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4247
4248 //combining the appropriate sign change
4249 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
4250 //combining sign-left and sign_right
4251 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4252
4253 //row1-row0
4254 //separating +ve and -ve values.
4255 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
4256 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
4257 //creating mask 00 for +ve and -ve values and FF for zero.
4258 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4259 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4260
4261 // row = 2
4262 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4263 // row = 1 right
4264 signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 1));
4265 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
4266
4267 //bottom - row1
4268 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4269 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4270 //creating mask 00 for +ve and -ve values and FF for zero.
4271 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4272 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4273 //for the next iteration bottom -row1
4274 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4275
4276 //to insert left in row 1
4277 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
4278 //manipulation for row 1 - bottom
4279 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4280
4281 //row1 -bottom
4282 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4283 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4284 //creating mask 00 for +ve and -ve values and FF for zero.
4285 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4286 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4287 //combining the appropriate sign change
4288 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4289
4290 //combining sign-left and sign_right
4291 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
4292
4293 //eliminating old left for row 0 and row 1
4294 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
4295
4296 //row1 getting it right for left of next block
4297 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 15);
4298 //adding constant 2
4299 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4300 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
4301 //shuffle to get sao index
4302 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4303 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
4304 //using availability mask
4305 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4306 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
4307 //shuffle to get sao offset
4308 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4309 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
4310 //row0 getting it right for left of next block
4311 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4312 //copying the next top
4313 src_top_16x8b = src_temp1_16x8b;
4314 //convert to 16 bit, add the offset, then pack with unsigned saturation
4315 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4316 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4317 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4318 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4319 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4320 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4321 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4322 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4323
4324 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
4325 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
4326 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
4327 src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
4328 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4329 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
4330 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
4331 src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
4332 //store left boundary
4333 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4334 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4335 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4336 // row = 1
4337 _mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);
4338
4339 src_temp0_16x8b = src_bottom_16x8b;
4340 pu1_src_cpy += (src_strd << 1);
4341 pu1_src_left_cpy += 2;
4342 pu1_src_left_str += 2;
4343 }
4344 ht_rem = ht & 0x1;
4345
4346 if(ht_rem)
4347 {
4348 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4349 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4350 //to insert left in row 1
4351 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4352 //manipulation for row 1 - row 0
4353 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4354
4355 //current row -next row
4356 //separating +ve and -ve values.
4357 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4358 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4359 //creating mask 00 for +ve and -ve values and FF for zero.
4360 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4361 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4362 //combining the appropriate sign change
4363 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4364 //adding top and bottom and constant 2
4365 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4366 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4367 //eliminating old left for row 0 and row 1
4368 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4369
4370 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4371 //using availability mask
4372 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4373
4374 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4375
4376 //row0 getting it right for left of next block
4377 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4378 //copying the next top
4379 src_top_16x8b = src_temp0_16x8b;
4380 //convert to 16 bit, add the offset, then pack with unsigned saturation
4381 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4382 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4383 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4384 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4385 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4386 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4387 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4388 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4389 //store left boundary
4390 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4391
4392 _mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4393 pu1_src_cpy += (src_strd);
4394 src_temp0_16x8b = src_bottom_16x8b;
4395 pu1_src_left_cpy++;
4396 pu1_src_left_str++;
4397 }
4398 { //for bottom right
4399 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4400 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4401 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4402 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4403 }
4404 if(0 == pu1_avail[3])
4405 {
4406 src_top_16x8b = src_bottom_16x8b;
4407 }
4408 //for the top left of next part of the block
4409 left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
4410 //updating top flag
4411 _mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4412 pu1_src += 16;
4413 au1_mask_cpy += 16;
4414
4415 pu1_left_tmp = pu1_src_left_cpy2;
4416 pu1_src_left_cpy2 = pu1_src_left_str2;
4417 pu1_src_left_str2 = pu1_left_tmp;
4418
4419 pu1_src_left_cpy = pu1_src_left_cpy2;
4420 pu1_src_left_str = pu1_src_left_str2;
4421 }
4422
4423 wd_rem = wd & 0xF;
4424 if(wd_rem)
4425 {
4426 pu1_src_cpy = pu1_src;
4427 pu1_src_left_cpy = pu1_src_left_cpy2;
4428 pu1_src_left_str = pu1_src_left_str2;
4429 src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 1));
4430 //row = 0
4431 src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
4432 au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy);
4433 //separating +ve and -ve values.
4434 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
4435 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
4436 //creating mask 00 for +ve and -ve values and FF for zero.
4437 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4438 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4439 //preparing au1_mask
4440 au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
4441 //combining the appropriate sign change
4442 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4443 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
4444
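/* Remainder path: the block is only 8 pixels wide, so two rows share one
 * 128-bit register (packed with unpacklo_epi64) and four rows are handled
 * per iteration, pairing the per-row sign vectors with palignr before the
 * edge-index lookup. */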
4445 for(row = ht; row >= 4; row -= 4)
4446 {
4447 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4448 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4449 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4450 // row = 2
4451 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4452 //manipulation for row 0 -row 1
4453 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4454 //row 1 left
4455 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4456 //row 0 -row1
4457 //separating +ve and -ve values.
4458 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4459 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4460
4461 //creating mask 00 for +ve and -ve values and FF for zero.
4462 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4463 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4464 //manipulation for row 1 - row 0
4465 signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
4466 //combining the appropriate sign change
4467 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4468 //row 1 -row0
4469 //separating +ve and -ve values.
4470 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4471 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4472
4473 //creating mask 00 for +ve and -ve values and FF for zero.
4474 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4475 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4476 //row1-row0
4477 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4478
4479 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
4480
4481 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
4482 //manipulation for row 1 -row 2
4483 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
4484 //row 2 left
4485 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4486 //packing row 0 and row 1
4487 src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
4488 //row1 -row2
4489 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4490 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4491 //creating mask 00 for +ve and -ve values and FF for zero.
4492 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4493 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4494 //combining the appropriate sign change
4495 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
4496 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
4497
4498 //row 1 right
4499 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
4500 //row = 3
4501 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));
4502
4503 // row = 4
4504 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));
4505
4506 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
4507
4508 //separating +ve and -ve values. (2,1)
4509 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4510 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4511
4512 //creating mask 00 for +ve and -ve values and FF for zero.
4513 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4514 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4515 //row 2 right
4516 signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 1);
4517 //combining the appropriate sign change
4518 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)
4519
4520 //separating +ve and -ve values. (3,2)
4521 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
4522 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
4523 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
4524 //creating mask 00 for +ve and -ve values and FF for zero.
4525 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4526 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4527 //manipulation for row 2 -row 3
4528 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
4529 //row 3 left
4530 signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 15);
4531 //combining the appropriate sign change
4532 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)
4533
4534 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2) ,(2-1)
4535
4536 //separating +ve and -ve values. (2,3)
4537 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4538 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4539
4540 //manipulation for row 3 -bottom
4541 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 11);
4542 //bottom left
4543 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4544
4545 //creating mask 00 for +ve and -ve values and FF for zero.
4546 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4547 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4548 //combining the appropriate sign change
4549 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)
4550
4551 //separating +ve and -ve values. (3,bottom)
4552 cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
4553 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
4554
4555 //creating mask 00 for +ve and -ve values and FF for zero.
4556 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4557 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4558 edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
4559 //combining the appropriate sign change
4560 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
4561 edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)
4562
4563
4564 //eliminating old left for row 0,1,2,3
4565 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
4566 //packing row 2 and row 3
4567 src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
4568 //row 3 right
4569 signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 1);
4570 //loading row 3 right into left
4571 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 15);
4572 //adding bottom and top values of row 2 and row 3
4573 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
4574 //separating +ve and -ve values. (bottom,3)
4575 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4576 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4577 //to store right of row 2
4578 signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
4579 //creating mask 00 for +ve and -ve values and FF for zero.
4580 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4581 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4582 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom -3) for next iteration
4583
4584 //storing right of row 2 into left
4585 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4586 //to store right of row 0
4587 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4588 //storing right of row 1 into left
4589 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4590
4591 //adding constant 2
4592 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4593 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
4594 //shuffle to get sao index
4595 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4596 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
4597 //using availability mask
4598 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4599 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
4600 //shuffle to get sao offset
4601 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4602 edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
4603
4604 //storing right of row 0 into left
4605 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4606 //convert to 16 bit, add the offset, then pack with unsigned saturation
4607 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4608 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4609 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4610 src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
4611 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4612 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4613 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4614 src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);
4615
4616 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
4617 cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
4618 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
4619 src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
4620 cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
4621 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
4622 src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, cmp_lt0_16x8b);
4623 src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);
4624
4625 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
4626 cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
4627
4628 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4629 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4630 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4631 // row = 1
4632 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
4633 //row = 2
4634 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
4635 // row = 3
4636 _mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);
4637
4638 src_temp0_16x8b = src_temp1_16x8b;
4639 signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
4640 pu1_src_cpy += (src_strd << 2);
4641 pu1_src_left_cpy += 4;
4642 pu1_src_left_str += 4;
4643 }
4644 ht_rem = ht & 0x2;
4645 if(ht_rem)
4646 {
4647 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4648 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4649 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4650 // row = 2
4651 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
4652
4653 //manipulation for row 0 -row 1
4654 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4655 //bottom left
4656 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 15);
4657 //separating +ve and and -ve values.
4658 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4659 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4660
4661 //creating mask 00 for +ve and -ve values and FF for zero.
4662 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4663 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4664 //manipulation for row 1 - row 0
4665 signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 1);
4666 //combining the appropriate sign change
4667 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4668
4669 //row1-row0
4670 //separating +ve and -ve values.
4671 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4672 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4673
4674 //creating mask 00 for +ve and -ve values and FF for zero.
4675 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4676 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4677 //combining the appropriate sign change
4678 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4679
4680 //manipulation for row 1 -bottom
4681 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 13);
4682 //bottom left
4683 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4684
4685 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
4686 signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
4687 //row1 -bottom
4688 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
4689 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
4690
4691 //creating mask 00 for +ve and -ve values and FF for zero.
4692 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4693 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4694 //combining the appropriate sign change
4695 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
4696 edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)
4697 //manipulation for bottom- row 1 (row 1 right)
4698 signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 1);
4699 //adding the up and down sign terms
4700 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
4701 //bottom - row 1
4702 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
4703 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
4704
4705 //eliminating old left for row 0,1
4706 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
4707 signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
4708 //creating mask 00 for +ve and -ve values and FF for zero.
4709 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4710 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4711 //for the next iteration signup0_16x8b
4712 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next
4713
4714 //storing right of row 1 into left
4715 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4716 //for storing right of row 0
4717 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4718
4719 src_top_16x8b = src_temp1_16x8b;
4720 //storing right of row 0 into left
4721 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4722
4723 //adding constant 2
4724 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4725
4726 //shuffle to get sao index
4727 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4728 //using availability mask
4729 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4730 //shuffle to get sao offset
4731 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4732
4733 //the next top already in src_top_16x8b
4734 //convert to 16 bit, add the offset, then pack with unsigned saturation
4735 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4736 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4737 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4738 src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
4739 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4740 cmp_lt0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
4741 src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, cmp_lt0_16x8b);
4742 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);
4743
4744 cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
4745
4746 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4747 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4748 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4749 // row = 1
4750 _mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
4751 src_temp0_16x8b = src_bottom_16x8b;
4752 pu1_src_cpy += (src_strd << 1);
4753 pu1_src_left_cpy += 2;
4754 pu1_src_left_str += 2;
4755 }
4756 ht_rem = ht & 0x1;
4757 if(ht_rem)
4758 {
4759 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4760 //row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
4761 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
4762
4763
4764 //manipulation for row 0 -bottom
4765 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 14);
4766 //bottom left
4767 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 15);
4768 //separating +ve and -ve values.
4769 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
4770 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
4771 //creating mask 00 for +ve and -ve values and FF for zero.
4772 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
4773 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
4774 //combining the appropriate sign change
4775 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
4776 //adding the up and down sign terms
4777 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
4778 //for row 0 right to put into left store
4779 signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4780 //adding constant 2
4781 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
4782 edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
4783 edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
4784 //left store manipulation 1
4785 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4786 //filling the left boundary value
4787 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 15);
4788
4789 //shuffle to get sao index
4790 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
4791 //using availability mask
4792 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
4793 //shuffle to get sao offset
4794 edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
4795 src_top_16x8b = src_temp0_16x8b;
4796 //convert to 16 bit, add the offset, then pack with unsigned saturation
4797 signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
4798 src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
4799 cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
4800 src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
4801 src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);
4802
4803 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4804 //row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
4805 _mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
4806 pu1_src_cpy += (src_strd);
4807 src_temp0_16x8b = src_bottom_16x8b;
4808 pu1_src_left_cpy++;
4809 pu1_src_left_str++;
4810 }
4811 { //for bottom right
4812 left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
4813 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 1);
4814 src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
4815 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 15);
4816 _mm_storel_epi64((__m128i *)(pu1_src_left_str), left_store_16x8b);
4817 }
4818 if(0 == pu1_avail[3])
4819 {
4820 src_top_16x8b = src_bottom_16x8b;
4821 }
4822 _mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
4823 pu1_src += 8;
4824
4825 pu1_left_tmp = pu1_src_left_cpy2;
4826 pu1_src_left_cpy2 = pu1_src_left_str2;
4827 pu1_src_left_str2 = pu1_left_tmp;
4828
4829 pu1_src_left_cpy = pu1_src_left_cpy2;
4830 pu1_src_left_str = pu1_src_left_str2;
4831
4832 }
4833 pu1_src_org[wd - 1] = u1_pos_wd_0_tmp;
4834 pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp;
4835 pu1_src_left_cpy = (0 == pu1_avail[2]) ? (pu1_src_left_cpy - 1) : pu1_src_left_cpy;
4836 pu1_src_left[0] = au1_src_left_tmp[0];
4837 for(row = 1; row < ht_tmp; row++)
4838 {
4839 pu1_src_left[row] = pu1_src_left_cpy[row];
4840 }
4841 }
4842
4843 }
4844
4845 void ihevc_sao_edge_offset_class3_chroma_ssse3(UWORD8 *pu1_src,
4846 WORD32 src_strd,
4847 UWORD8 *pu1_src_left,
4848 UWORD8 *pu1_src_top,
4849 UWORD8 *pu1_src_top_left,
4850 UWORD8 *pu1_src_top_right,
4851 UWORD8 *pu1_src_bot_left,
4852 UWORD8 *pu1_avail,
4853 WORD8 *pi1_sao_offset_u,
4854 WORD8 *pi1_sao_offset_v,
4855 WORD32 wd,
4856 WORD32 ht)
4857 {
4858 WORD32 row, col;
4859 UWORD8 *pu1_src_top_cpy, *pu1_src_left_cpy, *pu1_src_left_cpy2;
4860 UWORD8 *pu1_src_cpy, *pu1_src_org;
4861 UWORD8 au1_src_left_tmp[2 * (MAX_CTB_SIZE + 8)];
4862 UWORD8 au1_mask[MAX_CTB_SIZE], *au1_mask_cpy;
4863 WORD32 wd_rem;
4864 UWORD8 u1_pos_wd_0_tmp_u, u1_pos_wd_0_tmp_v, u1_pos_0_ht_tmp_u, u1_pos_0_ht_tmp_v;
4865 WORD32 ht_tmp;
4866 WORD32 bit_depth;
4867 UWORD8 u1_avail0, u1_avail1;
4868
4869 __m128i src_top_16x8b, src_bottom_16x8b;
4870 __m128i src_temp0_16x8b, src_temp1_16x8b;
4871 __m128i signup0_16x8b, signdwn1_16x8b;
4872 __m128i cmp_gt0_16x8b, cmp_lt0_16x8b;
4873 __m128i edge0_16x8b, edge1_16x8b;
4874 __m128i au1_mask8x16b;
4875 __m128i edge_idx_8x16b, sao_offset_8x16b;
4876 __m128i left_store_16x8b;
4877 __m128i const0_16x8b, const2_16x8b;
4878 __m128i chroma_offset_8x16b;
4879
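/* Chroma variant: U and V samples are interleaved, so each "pixel" below is
 * a 2-byte UV pair; horizontal neighbour offsets become +/-2 bytes and the
 * left-column history stores two bytes per row. */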
4880 ht_tmp = ht;
4881 au1_mask8x16b = _mm_set1_epi8(0xff);
4882
4883
4884 au1_src_left_tmp[0] = pu1_src[(wd - 2)];
4885 au1_src_left_tmp[1] = pu1_src[(wd - 1)];
4886 //manipulation for bottom left
4887 for(row = 2; row < 2 * ht; row++)
4888 {
4889 au1_src_left_tmp[row] = pu1_src_left[row];
4890 }
4891 au1_src_left_tmp[2 * ht] = pu1_src_bot_left[0];
4892 au1_src_left_tmp[2 * ht + 1] = pu1_src_bot_left[1];
4893
4894 pu1_src_top_left[0] = pu1_src_top[wd - 2];
4895 pu1_src_top_left[1] = pu1_src_top[wd - 1];
4896 //setting availability mask to 0xFF for a size of MAX_CTB_SIZE
4897 for(col = 0; col < MAX_CTB_SIZE; col += 16)
4898 _mm_storeu_si128((__m128i *)(au1_mask + col), au1_mask8x16b);
4899 bit_depth = BIT_DEPTH_LUMA;
4900 pu1_src_org = pu1_src;
4901 pu1_src_top_cpy = pu1_src_top;
4902 pu1_src_left_cpy2 = au1_src_left_tmp;
4903 pu1_src_left_cpy = au1_src_left_tmp;
4904 edge_idx_8x16b = _mm_loadl_epi64((__m128i *)gi1_table_edge_idx);
4905 sao_offset_8x16b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_u);
4906 const0_16x8b = _mm_loadl_epi64((__m128i *)pi1_sao_offset_v);
4907 chroma_offset_8x16b = _mm_set1_epi16(0x0800);
4908 /* If top-right is available, process separately */
4909 if(0 != pu1_avail[5])
4910 {
4911 WORD32 edge_idx;
4912
4913 /* U */
4914 edge_idx = 2 + SIGN(pu1_src[wd - 2] - pu1_src_top_right[0]) +
4915 SIGN(pu1_src[wd - 2] - pu1_src[wd - 2 - 2 + src_strd]);
4916
4917 edge_idx = gi1_table_edge_idx[edge_idx];
4918
4919 if(0 != edge_idx)
4920 {
4921 u1_pos_wd_0_tmp_u = CLIP3(pu1_src[wd - 2] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
4922 }
4923 else
4924 {
4925 u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
4926 }
4927
4928 /* V */
4929 edge_idx = 2 + SIGN(pu1_src[wd - 1] - pu1_src_top_right[1]) +
4930 SIGN(pu1_src[wd - 1] - pu1_src[wd - 1 - 2 + src_strd]);
4931
4932 edge_idx = gi1_table_edge_idx[edge_idx];
4933
4934 if(0 != edge_idx)
4935 {
4936 u1_pos_wd_0_tmp_v = CLIP3(pu1_src[wd - 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
4937 }
4938 else
4939 {
4940 u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
4941 }
4942 }
4943 else
4944 {
4945 u1_pos_wd_0_tmp_u = pu1_src[wd - 2];
4946 u1_pos_wd_0_tmp_v = pu1_src[wd - 1];
4947 }
4948
4949 /* If bottom-left is available, process separately */
4950 if(0 != pu1_avail[6])
4951 {
4952 WORD32 edge_idx;
4953
4954 /* U */
4955 edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src[(ht - 1) * src_strd + 2 - src_strd]) +
4956 SIGN(pu1_src[(ht - 1) * src_strd] - pu1_src_bot_left[0]);
4957
4958 edge_idx = gi1_table_edge_idx[edge_idx];
4959
4960 if(0 != edge_idx)
4961 {
4962 u1_pos_0_ht_tmp_u = CLIP3(pu1_src[(ht - 1) * src_strd] + pi1_sao_offset_u[edge_idx], 0, (1 << bit_depth) - 1);
4963 }
4964 else
4965 {
4966 u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
4967 }
4968
4969 /* V */
4970 edge_idx = 2 + SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src[(ht - 1) * src_strd + 1 + 2 - src_strd]) +
4971 SIGN(pu1_src[(ht - 1) * src_strd + 1] - pu1_src_bot_left[1]);
4972
4973 edge_idx = gi1_table_edge_idx[edge_idx];
4974
4975 if(0 != edge_idx)
4976 {
4977 u1_pos_0_ht_tmp_v = CLIP3(pu1_src[(ht - 1) * src_strd + 1] + pi1_sao_offset_v[edge_idx], 0, (1 << bit_depth) - 1);
4978 }
4979 else
4980 {
4981 u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
4982 }
4983 }
4984 else
4985 {
4986 u1_pos_0_ht_tmp_u = pu1_src[(ht - 1) * src_strd];
4987 u1_pos_0_ht_tmp_v = pu1_src[(ht - 1) * src_strd + 1];
4988 }
4989
4990
4991
4992 /* Update height and source pointers based on the availability flags */
4993 if(0 == pu1_avail[2])
4994 {
4995 pu1_src_left_cpy2 += 2;
4996 pu1_src_top_cpy = pu1_src;
4997 pu1_src += src_strd;
4998 ht--;
4999 }
5000 if(0 == pu1_avail[3])
5001 {
5002 ht--;
5003 }
5004
5005 sao_offset_8x16b = _mm_unpacklo_epi64(sao_offset_8x16b, const0_16x8b);
5006 const2_16x8b = _mm_set1_epi8(2);
5007 const0_16x8b = _mm_setzero_si128();
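/* sao_offset_8x16b now holds the U offsets in its low 8 bytes and the V
 * offsets in its high 8 bytes. chroma_offset_8x16b (0x0800 per 16-bit lane)
 * later adds 8 to every odd (V) byte of the edge indices, steering the
 * pshufb lookup to the V half of the table. */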
5008
5009
5010 //availability mask creation
5011 u1_avail0 = pu1_avail[0];
5012 u1_avail1 = pu1_avail[1];
5013 au1_mask[0] = u1_avail0;
5014 au1_mask[1] = u1_avail0;
5015 au1_mask[wd - 1] = u1_avail1;
5016 au1_mask[wd - 2] = u1_avail1;
5017 {
5018 WORD32 ht_rem;
5019 au1_mask_cpy = au1_mask;
5020 for(col = wd; col >= 16; col -= 16)
5021 {
5022 pu1_src_cpy = pu1_src;
5023 src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col + 2));
5024 //row = 0
5025 src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
5026
5027 //loading the mask
5028 au1_mask8x16b = _mm_loadu_si128((__m128i *)au1_mask_cpy);
5029 //separating +ve and -ve values.
5030 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
5031 cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
5032 //creating mask 00 for +ve and -ve values and FF for zero.
5033 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5034 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5035 //combining the appropriate sign change
5036 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5037 pu1_src_left_cpy = pu1_src_left_cpy2;
5038
5039 for(row = ht; row >= 2; row -= 2)
5040 {
5041 left_store_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_left_cpy));
5042 //row = 1
5043 src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
5044 //to insert left in row 1
5045 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
5046 // row = 0 right
5047 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2));
5048
5049 //manipulation for row 1 - row 0
5050 signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
5051 //row 0 -row1
5052 //separating +ve and -ve values.
5053 cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
5054 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
5055
5056 //creating mask 00 for +ve and -ve values and FF for zero.
5057 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5058 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5059
5060 //combining the appropriate sign change
5061 edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(0-1)
5062 //combining sign-left and sign_right
5063 edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
5064
5065 //row1-row0
5066 //separating +ve and -ve values.
5067 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, src_bottom_16x8b);
5068 cmp_lt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, src_temp1_16x8b);
5069 //creating mask 00 for +ve and -ve values and FF for zero.
5070 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5071 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5072
5073 // row = 2
5074 src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
5075 // row = 1 right
5076 signdwn1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd + 2));
5077 edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-0)
5078
5079 //bottom - row1
5080 cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
5081 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);
5082 //creating mask 00 for +ve and -ve values and FF for zero.
5083 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5084 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5085 //for the next iteration bottom -row1
5086 signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5087
5088 //to insert left in row 1
5089 signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
5090 //manipulation for row 1 - bottom
5091 signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
5092
5093 //row1 -bottom
5094 cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
5095 cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
5096 //creating mask 00 for +ve and -ve values and FF for zero.
5097 cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
5098 cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
5099 //combining the appropriate sign change
5100 signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
5101
5102 //combining sign-left and sign_right
5103 edge1_16x8b = _mm_add_epi8(edge1_16x8b, signdwn1_16x8b);
5104
5105 //eliminating old left for row 0 and row 1
5106 left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
5107 //row1 getting it right for left of next block
5108 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp1_16x8b, 14);
5109 //row0 getting it right for left of next block
5110 left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
5111 //copying the next top
5112 src_top_16x8b = src_temp1_16x8b;
5113
5114
5115 //adding constant 2
5116 edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
5117 edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
5118 //shuffle to get sao index
5119 edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
5120 edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
5121 //using availability mask
5122 edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
5123 edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
5124
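//U and V carry separate offset tables packed into one 16-byte
//register; chroma_offset_8x16b (presumably 0 in U lanes, 8 in V lanes)
//steers each interleaved sample to its own half of sao_offset_8x16b.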
//adding chroma offset to access U and V
edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);

//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);
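//Pixels and offsets are widened to 16 bit (offsets sign-extended via
//the cmpgt mask) so sums may leave [0, 255]; _mm_packus_epi16 then
//saturates the results back to the 8-bit pixel range.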
//convert to 16 bit then add and then saturated pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
src_temp1_16x8b = _mm_unpackhi_epi8(src_temp1_16x8b, const0_16x8b);
edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge1_16x8b);
src_temp1_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp1_16x8b);
//store left boundary
_mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
//row = 0 store 16 pixel values from 15:0 pos. relative to cur. pos.
_mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storeu_si128((__m128i *)(pu1_src_cpy + src_strd), src_temp1_16x8b);

src_temp0_16x8b = src_bottom_16x8b;
pu1_src_cpy += (src_strd << 1);
pu1_src_left_cpy += 4;
}
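//When ht is odd one row is left over; it is handled below with the
//sign carried in signup0 and one extra row read below the block for
//the downward comparison.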
ht_rem = ht & 0x1;

if(ht_rem)
{
left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
//to insert left in row 1
signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
//manipulation for row 1 - row 0
signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);

//current row - next row
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//adding top and bottom and constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
//eliminating old left for row 0 and row 1
left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
//row 0: saving the rightmost pixels as the left boundary for the next block
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
//copying the next top
src_top_16x8b = src_temp0_16x8b;

edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
//using availability mask
edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);

//adding chroma offset to access U and V
edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);


edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

//convert to 16 bit then add and then saturated pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

//store left boundary
_mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);

_mm_storeu_si128((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
pu1_src_cpy += (src_strd);
src_temp0_16x8b = src_bottom_16x8b;
pu1_src_left_cpy += 2;
}
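//Update the final left-boundary entry with the right edge of the last
//processed row, presumably so the next column block sees correct left
//neighbours.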
{ //for bottom right
left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
_mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
}
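//pu1_avail[3] flags bottom availability; when the rows below are
//unavailable the top buffer takes src_bottom instead, presumably
//matching the reduced block height used for unavailable bottom edges.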
if(0 == pu1_avail[3])
{
src_top_16x8b = src_bottom_16x8b;
}
//for the top left of the next part of the block
left_store_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_top_cpy + wd - col));
//updating the top buffer
_mm_storeu_si128((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
pu1_src += 16;
au1_mask_cpy += 16;
}
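//Residual columns: when wd is not a multiple of 16 the remaining
//8 pixels (4 interleaved U/V pairs) are filtered with 64-bit
//loads/stores, two rows packed per 128-bit register.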
pu1_src_left_cpy = pu1_src_left_cpy2;
wd_rem = wd & 0xF;
if(wd_rem)
{
pu1_src_cpy = pu1_src;
src_top_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src_top_cpy + wd - col + 2));
//row = 0
src_temp0_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy));
au1_mask8x16b = _mm_loadl_epi64((__m128i *)au1_mask_cpy); //loading the mask for the remaining 8 pixels
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, src_top_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(src_top_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//preparing au1_mask
au1_mask8x16b = _mm_unpacklo_epi64(au1_mask8x16b, au1_mask8x16b);
//combining the appropriate sign change
signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
pu1_src_left_cpy = pu1_src_left_cpy2;
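//Four rows per iteration: with only 8 active bytes per row, two rows
//are packed into one 128-bit register (_mm_unpacklo_epi64) so the
//index/offset arithmetic runs on row pairs (0,1) and (2,3).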
for(row = ht; row >= 4; row -= 4)
{
left_store_16x8b = _mm_loadu_si128((__m128i *)pu1_src_left_cpy);
//row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
// row = 2
src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));
//manipulation for row 0 - row 1
signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
//row 1 left
signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
//row 0 - row 1
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//manipulation for row 1 - row 0
signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//row 1 - row 0
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//row 1 - row 0
edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)

signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
//manipulation for row 1 - row 2
signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
//row 2 left
signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
//packing row 0 and row 1
src_temp0_16x8b = _mm_unpacklo_epi64(src_temp0_16x8b, src_temp1_16x8b);
//row 1 - row 2
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)

//row 1 right
signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
//row = 3
src_top_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 3 * src_strd));

// row = 4
src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 4 * src_strd));

edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty

//separating +ve and -ve values.(2,1)
cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//row 2 right
signdwn1_16x8b = _mm_srli_si128(src_bottom_16x8b, 2);
//combining the appropriate sign change
signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1)

//separating +ve and -ve values.(3,2)
cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);
signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8); //aligned left (2-1)
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//manipulation for row 2 - row 3
signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 8);
//row 3 left
signdwn1_16x8b = _mm_alignr_epi8(src_top_16x8b, signdwn1_16x8b, 14);
//combining the appropriate sign change
edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-2)

signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(3-2),(2-1)

//separating +ve and -ve values.(2,3)
cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

//manipulation for row 3 - bottom
signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 6);
//bottom left
signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);

//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-3)

//separating +ve and -ve values.(3,bottom)
cmp_gt0_16x8b = _mm_subs_epu8(src_top_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_top_16x8b);

//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
edge1_16x8b = _mm_slli_si128(edge1_16x8b, 8); //aligned left (2-3)
//combining the appropriate sign change
signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(3-bottom)
edge1_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge1_16x8b, 8); //(3-bottom),(2-3)


//eliminating old left for row 0, 1, 2, 3
left_store_16x8b = _mm_srli_si128(left_store_16x8b, 8);
//packing row 2 and row 3
src_bottom_16x8b = _mm_unpacklo_epi64(src_bottom_16x8b, src_top_16x8b);
//row 3 right
signdwn1_16x8b = _mm_srli_si128(src_top_16x8b, 2);
//loading row 3 right into left
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_bottom_16x8b, 14);
//adding bottom and top values of row 2 and row 3
edge1_16x8b = _mm_add_epi8(edge1_16x8b, signup0_16x8b); //(3,2)
//separating +ve and -ve values.(bottom,3)
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);
//to store right of row 2
signdwn1_16x8b = _mm_slli_si128(src_bottom_16x8b, 8);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(bottom - 3) for next iteration

//storing right of row 2 into left
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
//to store right of row 0
signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
//storing right of row 1 into left
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
//storing right of row 0 into left
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);


//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, const2_16x8b);
//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge1_16x8b);
//using availability mask
edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
edge1_16x8b = _mm_and_si128(edge1_16x8b, au1_mask8x16b);
//adding chroma offset to access U and V
edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
edge1_16x8b = _mm_add_epi8(edge1_16x8b, chroma_offset_8x16b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
edge1_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge1_16x8b);

//convert to 16 bit then add and then saturated pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_unpackhi_epi8(src_temp0_16x8b, const0_16x8b);
edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_temp0_16x8b);

signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge1_16x8b);
cmp_gt0_16x8b = _mm_unpacklo_epi8(src_bottom_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge1_16x8b, signdwn1_16x8b);
src_bottom_16x8b = _mm_unpackhi_epi8(src_bottom_16x8b, const0_16x8b);
edge1_16x8b = _mm_unpackhi_epi8(edge1_16x8b, signdwn1_16x8b);
cmp_gt0_16x8b = _mm_add_epi16(cmp_gt0_16x8b, cmp_lt0_16x8b);
src_bottom_16x8b = _mm_add_epi16(src_bottom_16x8b, edge1_16x8b);
src_bottom_16x8b = _mm_packus_epi16(cmp_gt0_16x8b, src_bottom_16x8b);

cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);
cmp_lt0_16x8b = _mm_srli_si128(src_bottom_16x8b, 8);
_mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
//row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
_mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
//row = 2
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 2 * src_strd), src_bottom_16x8b);
// row = 3
_mm_storel_epi64((__m128i *)(pu1_src_cpy + 3 * src_strd), cmp_lt0_16x8b);

src_temp0_16x8b = src_temp1_16x8b;
signup0_16x8b = _mm_slli_si128(signup0_16x8b, 8);
pu1_src_cpy += (src_strd << 2);
pu1_src_left_cpy += 8;
}
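//ht & 0x2: two leftover rows after the four-row loop, packed into a
//single register and filtered with one pass of the same index logic.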
ht_rem = ht & 0x2;
if(ht_rem)
{
left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
//row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
src_temp1_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));
// row = 2
src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + 2 * src_strd));

//manipulation for row 0 - row 1
signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
//row 1 left
signdwn1_16x8b = _mm_alignr_epi8(src_temp1_16x8b, signdwn1_16x8b, 14);
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);

//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//manipulation for row 1 - row 0
signdwn1_16x8b = _mm_srli_si128(src_temp0_16x8b, 2);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

//row 1 - row 0
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);

//manipulation for row 1 - bottom
signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 10);
//bottom left
signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);

edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8); //aligned left (0-1)
signup0_16x8b = _mm_alignr_epi8(edge1_16x8b, signup0_16x8b, 8); //(1-0),(0-top)
//row 1 - bottom
cmp_gt0_16x8b = _mm_subs_epu8(src_temp1_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp1_16x8b);

//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
signdwn1_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(1-2)
edge0_16x8b = _mm_alignr_epi8(signdwn1_16x8b, edge0_16x8b, 8); //(1-2),(0-1)

//manipulation for bottom - row 1 (row 1 right)
signdwn1_16x8b = _mm_srli_si128(src_temp1_16x8b, 2);
//adding top and down subtraction
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b); //(1,0) sign_up empty
//bottom - row 1
cmp_gt0_16x8b = _mm_subs_epu8(src_bottom_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_bottom_16x8b);

//eliminating old left for row 0, 1
left_store_16x8b = _mm_srli_si128(left_store_16x8b, 4);
signdwn1_16x8b = _mm_slli_si128(src_temp1_16x8b, 8);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//for the next iteration signup0_16x8b
signup0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b); //(2-1) for next

//storing right of row 1 into left
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
//for storing right of row 0
signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);

src_top_16x8b = src_temp1_16x8b;
//storing right of row 0 into left
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);

//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);

//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
//using availability mask
edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
//adding chroma offset to access U and V
edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);
//the next top already in src_top_16x8b
//convert to 16 bit then add and then saturated pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp1_16x8b = _mm_unpacklo_epi8(src_temp1_16x8b, const0_16x8b);
edge0_16x8b = _mm_unpackhi_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
src_temp1_16x8b = _mm_add_epi16(src_temp1_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, src_temp1_16x8b);

cmp_gt0_16x8b = _mm_srli_si128(src_temp0_16x8b, 8);

_mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
//row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
_mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
// row = 1
_mm_storel_epi64((__m128i *)(pu1_src_cpy + src_strd), cmp_gt0_16x8b);
src_temp0_16x8b = src_bottom_16x8b;
pu1_src_cpy += (src_strd << 1);
pu1_src_left_cpy += 4;
}
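//ht & 0x1: final single row; only the low 8 bytes of the result are
//valid and are written back with a 64-bit store.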
ht_rem = ht & 0x1;
if(ht_rem)
{
left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
//row = 1 load 8 pixel values from 7:0 pos. relative to cur. pos.
src_bottom_16x8b = _mm_loadu_si128((__m128i *)(pu1_src_cpy + src_strd));


//manipulation for row 0 - bottom
signdwn1_16x8b = _mm_slli_si128(left_store_16x8b, 12);
//bottom left
signdwn1_16x8b = _mm_alignr_epi8(src_bottom_16x8b, signdwn1_16x8b, 14);
//separating +ve and -ve values.
cmp_gt0_16x8b = _mm_subs_epu8(src_temp0_16x8b, signdwn1_16x8b);
cmp_lt0_16x8b = _mm_subs_epu8(signdwn1_16x8b, src_temp0_16x8b);
//creating mask 00 for +ve and -ve values and FF for zero.
cmp_gt0_16x8b = _mm_cmpeq_epi8(cmp_gt0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_cmpeq_epi8(cmp_lt0_16x8b, const0_16x8b);
//combining the appropriate sign change
edge0_16x8b = _mm_sub_epi8(cmp_gt0_16x8b, cmp_lt0_16x8b);
//adding top and down subtraction
edge0_16x8b = _mm_add_epi8(edge0_16x8b, signup0_16x8b);
//for row 0 right to put into left store
signdwn1_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
//adding constant 2
edge0_16x8b = _mm_add_epi8(edge0_16x8b, const2_16x8b);
edge0_16x8b = _mm_slli_si128(edge0_16x8b, 8);
edge0_16x8b = _mm_srli_si128(edge0_16x8b, 8);
//left store manipulation 1
left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
//filling the left boundary value
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, signdwn1_16x8b, 14);
src_top_16x8b = src_temp0_16x8b;

//shuffle to get sao index
edge0_16x8b = _mm_shuffle_epi8(edge_idx_8x16b, edge0_16x8b);
//using availability mask
edge0_16x8b = _mm_and_si128(edge0_16x8b, au1_mask8x16b);
//adding chroma offset to access U and V
edge0_16x8b = _mm_add_epi8(edge0_16x8b, chroma_offset_8x16b);
//shuffle to get sao offset
edge0_16x8b = _mm_shuffle_epi8(sao_offset_8x16b, edge0_16x8b);

//convert to 16 bit then add and then saturated pack
signdwn1_16x8b = _mm_cmpgt_epi8(const0_16x8b, edge0_16x8b);
src_temp0_16x8b = _mm_unpacklo_epi8(src_temp0_16x8b, const0_16x8b);
cmp_lt0_16x8b = _mm_unpacklo_epi8(edge0_16x8b, signdwn1_16x8b);
src_temp0_16x8b = _mm_add_epi16(src_temp0_16x8b, cmp_lt0_16x8b);
src_temp0_16x8b = _mm_packus_epi16(src_temp0_16x8b, const0_16x8b);

_mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
//row = 0 store 8 pixel values from 7:0 pos. relative to cur. pos.
_mm_storel_epi64((__m128i *)(pu1_src_cpy), src_temp0_16x8b);
pu1_src_cpy += (src_strd);
src_temp0_16x8b = src_bottom_16x8b;
pu1_src_left_cpy += 2;
}
{ //for bottom right
left_store_16x8b = _mm_loadl_epi64((__m128i *)pu1_src_left_cpy);
left_store_16x8b = _mm_srli_si128(left_store_16x8b, 2);
src_temp0_16x8b = _mm_slli_si128(src_temp0_16x8b, 8);
left_store_16x8b = _mm_alignr_epi8(left_store_16x8b, src_temp0_16x8b, 14);
_mm_storel_epi64((__m128i *)(pu1_src_left_cpy), left_store_16x8b);
}
if(0 == pu1_avail[3])
{
src_top_16x8b = src_bottom_16x8b;
}

_mm_storel_epi64((__m128i *)(pu1_src_top + wd - col), src_top_16x8b);
pu1_src += 8;
}
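//Restore the two protected corner chroma pairs (U and V) saved in the
//u1_pos_* temporaries, presumably computed in scalar code before the
//SIMD pass: the last pair of the first row and the first pair of the
//last row.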
pu1_src_org[wd - 2] = u1_pos_wd_0_tmp_u;
pu1_src_org[wd - 1] = u1_pos_wd_0_tmp_v;
pu1_src_org[(ht_tmp - 1) * src_strd] = u1_pos_0_ht_tmp_u;
pu1_src_org[(ht_tmp - 1) * src_strd + 1] = u1_pos_0_ht_tmp_v;
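//the left-boundary array holds one interleaved U/V byte pair per row,
//hence 2 * ht_tmp bytes are copied back from the temporary buffer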
for(row = 0; row < 2 * ht_tmp; row++)
{
pu1_src_left[row] = au1_src_left_tmp[row];
}
}

}