/******************************************************************************
*
* Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
/**
*******************************************************************************
* @file
*  ihevc_deblk_atom_intr.c
*
* @brief
*  Contains function definitions for deblocking filters
*
* @author
*  Rishab
*
* @par List of Functions:
*  - ihevc_deblk_luma_vert_ssse3()
*  - ihevc_deblk_luma_horz_ssse3()
*  - ihevc_deblk_chroma_vert_ssse3()
*  - ihevc_deblk_chroma_horz_ssse3()
*
* @remarks
*  None
*
*******************************************************************************
*/
#include <stdlib.h>
#include <stdio.h>
#include <assert.h>
#include "ihevc_typedefs.h"
#include "ihevc_platform_macros.h"
#include "ihevc_macros.h"
#include "ihevc_deblk.h"
#include "ihevc_deblk_tables.h"
#include "ihevc_debug.h"

#include "ihevc_tables_x86_intr.h"

#include <immintrin.h>
/**
*******************************************************************************
*
* @brief
*  Decision process and filtering for the luma block vertical edge.
*
* @par Description:
*  The decision process for the luma block vertical edge is carried out and
*  an appropriate filter is applied. The boundary filter strength, bs, should
*  be greater than 0. The pcm flags and the transquant bypass flags should
*  be taken care of by the calling function.
*
* @param[in] pu1_src
*  Pointer to the src sample q(0,0)
*
* @param[in] src_strd
*  Source stride
*
* @param[in] bs
*  Boundary filter strength of q(0,0)
*
* @param[in] quant_param_p
*  Quantization parameter of the p block
*
* @param[in] quant_param_q
*  Quantization parameter of the q block
*
* @param[in] beta_offset_div2
*  Beta offset, divided by 2, used to adjust the beta table index
*
* @param[in] tc_offset_div2
*  tc offset, divided by 2, used to adjust the tc table index
*
* @param[in] filter_flag_p
*  Flag indicating whether to filter the p block
*
* @param[in] filter_flag_q
*  Flag indicating whether to filter the q block
*
* @returns
*  None
*
* @remarks
*  None
*
*******************************************************************************
*/
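
/* A scalar sketch (for reference only, not compiled into the decision below)
 * of the metrics the SSSE3 code computes on rows 0 and 3, using hypothetical
 * names p[row][i], q[row][i] for the three pixels on each side of the
 * vertical edge:
 *
 *     dp0 = ABS(p[0][2] - 2 * p[0][1] + p[0][0]);
 *     dq0 = ABS(q[0][2] - 2 * q[0][1] + q[0][0]);
 *     dp3 = ABS(p[3][2] - 2 * p[3][1] + p[3][0]);
 *     dq3 = ABS(q[3][2] - 2 * q[3][1] + q[3][0]);
 *     d0 = dp0 + dq0;
 *     d3 = dp3 + dq3;
 *     d = d0 + d3;    // the edge is filtered only when d < beta
 */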

void ihevc_deblk_luma_vert_ssse3(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 bs,
                                 WORD32 quant_param_p,
                                 WORD32 quant_param_q,
                                 WORD32 beta_offset_div2,
                                 WORD32 tc_offset_div2,
                                 WORD32 filter_flag_p,
                                 WORD32 filter_flag_q)
{
    WORD32 qp_luma, beta_indx, tc_indx;
    WORD32 beta, tc;
    WORD32 d, dp, dq, d_sam0, d_sam3;

    WORD32 d3, d0, de_0, de_1, de_2, de_3;
    WORD32 de, dep, deq;
    __m128i src_row0_8x16b, src_row1_8x16b, src_row2_8x16b, src_row3_8x16b;


    {
        __m128i src_tmp_8x16b, coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;


        ASSERT((bs > 0) && (bs <= 3));
        ASSERT(filter_flag_p || filter_flag_q);

        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);

        /* BS can take the value 3 in this implementation if it is an intra/inter edge */
        /* based on BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
        /* for BS = 1 the adding factor is (0*2); for BS = 2 or 3 it is (1*2) */
        /* the desired functionality is achieved by doing (2 * (bs >> 1)) */

        tc_indx = CLIP3(qp_luma + (2 * (bs >> 1)) + (tc_offset_div2 << 1), 0, 53);
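
        /* Worked example (illustrative values, not from the source): with
         * qp_luma = 32, bs = 2 and tc_offset_div2 = 0, the index is
         * 32 + 2 * (2 >> 1) + 0 = 34; for bs = 1 the 2 * (bs >> 1) term
         * drops to 0 and the index is 32. */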

        beta = gai4_ihevc_beta_table[beta_indx];
        tc = gai4_ihevc_tc_table[tc_indx];
        if(0 == tc)
        {
            return;
        }
        src_row0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
        src_row3_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd));

        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_d));

        src_tmp_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row3_8x16b);
        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_8x16b, mask_16x8b);

        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_8x16b, coef_8x16b);


        //to get all 1's of 8 bit in (1)
        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_8x16b, src_tmp_8x16b);
        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
        //accumulating values for dp3 dq3, dp0 dq0
        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);

        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
        // to get all 1,-1 sets of 16 bits in (0)
        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
        //to get 16 bit 1's
        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);


        // dq3 dp3 dq0 dp0
        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
        // dq dp d3 d0
        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);

        //store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);

        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
        dq = _mm_cvtsi128_si32(mask_16x8b);
        //getting d
        d = d0 + d3;

        //store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);

        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
        de_3 = _mm_cvtsi128_si32(mask_16x8b);

        de = 0;
        dep = 0;
        deq = 0;
        if(d < beta)
        {
            d_sam0 = 0;
            if((2 * d0 < (beta >> 2))
                    && (de_2 < (beta >> 3))
                    && (de_0 < ((5 * tc + 1) >> 1)))
            {
                d_sam0 = 1;
            }

            d_sam3 = 0;
            if((2 * d3 < (beta >> 2))
                    && (de_3 < (beta >> 3))
                    && (de_1 < ((5 * tc + 1) >> 1)))
            {
                d_sam3 = 1;
            }

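            /* de = 2 selects the strong filter for all four rows,
             * de = 1 the normal (weak) filter; dep/deq extend the weak
             * filter to p1/q1 when the side is smooth enough and tc > 1. */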
            de = (d_sam0 & d_sam3) + 1;
            dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
            deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
            if(tc <= 1)
            {
                dep = 0;
                deq = 0;
            }
        }

    }

    if(de != 0)
    {


        src_row1_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + src_strd));
        src_row2_8x16b = _mm_loadl_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd));

        if(de == 2)
        {
            __m128i temp_pq_str0_16x8b;
            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
            __m128i temp_pq2_str0_16x8b;
            __m128i temp_pq_str1_16x8b;
            __m128i temp_str0_16x8b, temp_str1_16x8b, temp_str2_16x8b, temp_str3_16x8b;
            __m128i temp_max0_16x8b, temp_max1_16x8b, temp_min0_16x8b, temp_min1_16x8b;
            __m128i const2_8x16b, const2tc_8x16b;
            LWORD64 mask, tc2;
            tc = tc << 1;
            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
            tc2 = ((LWORD64)tc);
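
            /* The sign bits of the two 32-bit halves of 'mask' carry the
             * q/p filter flags; they are smeared to full-lane masks with an
             * arithmetic shift further down. The strong filter clamps every
             * modified pixel to within +/-2*tc of its original value, hence
             * the doubling of tc above. */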

            const2_8x16b = _mm_cmpeq_epi16(src_row0_8x16b, src_row0_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_unpacklo_epi64(src_row1_8x16b, src_row3_8x16b);

            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
            temp_pq_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 16);
            temp_pq_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 16);
            //arranged x x x x x x x x q31 q30 q11 q10 p30 p31 p10 p11 , x x x x x x x x q21 q20 q01 q00 p20 p21 p00 p01
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
            temp_str1_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);

            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
            //arranged q31 q30 q21 q20 q11 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
            temp_pq_str0_16x8b = _mm_unpacklo_epi32(temp_str0_16x8b, temp_str1_16x8b);

            temp_pq_str0_16x8b = _mm_maddubs_epi16(temp_pq_str0_16x8b, const2_8x16b);

            //q'1-2, p'1-2
            temp_pq1_str0_16x8b = _mm_srli_epi64(src_row0_8x16b, 8);
            temp_pq1_str1_16x8b = _mm_srli_epi64(src_row1_8x16b, 8);

            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            temp_str3_16x8b = _mm_unpackhi_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);

            temp_str2_16x8b = _mm_shuffle_epi32(temp_str2_16x8b, 0x58);
            temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x58);
            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
            temp_pq1_str0_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str3_16x8b);
            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
            temp_pq1_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str3_16x8b);

            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);

            //clipping mask design
            temp_str1_16x8b = _mm_setzero_si128();
            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
            const2tc_8x16b = _mm_loadl_epi64((__m128i *)(&tc2));
            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
            const2tc_8x16b = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);

            //clipping mask design
            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
            //calculating clipping MAX for all pixel values
            temp_max0_16x8b = _mm_adds_epu8(src_row0_8x16b, const2tc_8x16b);
            temp_max1_16x8b = _mm_adds_epu8(src_row1_8x16b, const2tc_8x16b);


            //q'2-q'0-2,p'2-p'0-2
            temp_pq2_str0_16x8b = _mm_unpacklo_epi16(src_row0_8x16b, src_row2_8x16b);
            temp_str3_16x8b = _mm_unpacklo_epi16(src_row1_8x16b, src_row3_8x16b);

            temp_pq2_str0_16x8b = _mm_shuffle_epi32(temp_pq2_str0_16x8b, 0x5c);
            temp_str3_16x8b = _mm_shuffle_epi32(temp_str3_16x8b, 0x5c);

            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p33 p32 p23 p22 p13 p12 p03 p02
            temp_str3_16x8b = _mm_unpacklo_epi16(temp_pq2_str0_16x8b, temp_str3_16x8b);

            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_str3_16x8b, const2_8x16b);

            //calculating clipping MIN for all pixel values
            temp_min0_16x8b = _mm_subs_epu8(src_row0_8x16b, const2tc_8x16b);
            temp_min1_16x8b = _mm_subs_epu8(src_row1_8x16b, const2tc_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_pq_str1_16x8b = _mm_shuffle_epi32(temp_pq_str0_16x8b, 0x4e);
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, temp_pq_str1_16x8b);
            //q'1-2 p'1-2
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            //to get 2 in 16 bit
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);
            //to get q33 q23 q13 q03, p33 p23 p13 p03
            temp_pq1_str1_16x8b = _mm_slli_epi16(temp_str3_16x8b, 8);
            temp_pq_str1_16x8b = _mm_srli_epi16(temp_str3_16x8b, 8);
            temp_pq1_str1_16x8b = _mm_srli_epi16(temp_pq1_str1_16x8b, 8);

            //q'1, p'1 (adding 2)
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            //q'0-q'1,p'0-p'1
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq_str0_16x8b, const2_8x16b);
            //q'2-q'1,p'2-p'1
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
            temp_pq_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);

            //normalisation of all modified pixels
            temp_pq_str0_16x8b = _mm_srai_epi16(temp_pq_str0_16x8b, 3);
            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);

            //getting p0 p1 together and p2 p3 together
            temp_str0_16x8b = _mm_unpacklo_epi16(temp_pq1_str0_16x8b, temp_pq_str0_16x8b);
            temp_str2_16x8b = _mm_unpacklo_epi16(temp_pq1_str1_16x8b, temp_pq2_str0_16x8b);
            //getting q1 q0 together and q3 q2 together
            temp_pq_str0_16x8b = _mm_unpackhi_epi16(temp_pq_str0_16x8b, temp_pq1_str0_16x8b);
            temp_pq2_str0_16x8b = _mm_unpackhi_epi16(temp_pq2_str0_16x8b, temp_pq_str1_16x8b);
            //getting p's of row0 row1 together and of row2 row3 together
            temp_pq_str1_16x8b = _mm_unpacklo_epi32(temp_str2_16x8b, temp_str0_16x8b);
            temp_str2_16x8b = _mm_unpackhi_epi32(temp_str2_16x8b, temp_str0_16x8b);
            //getting q's of row0 row1 together and of row2 row3 together
            temp_str0_16x8b = _mm_unpacklo_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
            temp_pq_str0_16x8b = _mm_unpackhi_epi32(temp_pq_str0_16x8b, temp_pq2_str0_16x8b);
            //getting values for respective rows in 16 bit
            src_row0_8x16b = _mm_unpacklo_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
            src_row1_8x16b = _mm_unpackhi_epi64(temp_pq_str1_16x8b, temp_str0_16x8b);
            src_row2_8x16b = _mm_unpacklo_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
            src_row3_8x16b = _mm_unpackhi_epi64(temp_str2_16x8b, temp_pq_str0_16x8b);
            //packing values to 8 bit
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_packus_epi16(src_row1_8x16b, src_row3_8x16b);
            //clipping MAX
            src_row0_8x16b = _mm_min_epu8(src_row0_8x16b, temp_max0_16x8b);
            src_row1_8x16b = _mm_min_epu8(src_row1_8x16b, temp_max1_16x8b);
            //clipping MIN
            src_row0_8x16b = _mm_max_epu8(src_row0_8x16b, temp_min0_16x8b);
            src_row1_8x16b = _mm_max_epu8(src_row1_8x16b, temp_min1_16x8b);
            //separating row 2 and row 3
            src_row2_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
            src_row3_8x16b = _mm_srli_si128(src_row1_8x16b, 8);

        }

        else
        {

            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b, tmp_delta2_8x16b, tmp_delta3_8x16b;
            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b, tmp3_const_8x16b;
            __m128i coefdelta_0_8x16b, mask_pq_8x16b;
            __m128i const2_8x16b, consttc_8x16b;

            LWORD64 mask1;
            mask1 = (((LWORD64)(filter_flag_q & deq)) << 63) |
                    (((LWORD64)filter_flag_q) << 47) |
                    (((LWORD64)filter_flag_p) << 31) |
                    (((LWORD64)(filter_flag_p & dep)) << 15);

            consttc_8x16b = _mm_set1_epi32(tc);
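
            /* Scalar form of the normal (weak) filter this branch
             * implements (a reference sketch, not compiled):
             *
             *     delta0 = (9 * (q0 - p0) - 3 * (q1 - p1) + 8) >> 4;
             *     // applied only when ABS(delta0) < 10 * tc,
             *     // clipped to [-tc, tc]
             *     dp1 = CLIP3((((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1,
             *                 -(tc >> 1), tc >> 1);
             *     dq1 = CLIP3((((q2 + q0 + 1) >> 1) - q1 - delta0) >> 1,
             *                 -(tc >> 1), tc >> 1);
             */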


            src_row0_8x16b = _mm_unpacklo_epi64(src_row0_8x16b, src_row1_8x16b);
            src_row2_8x16b = _mm_unpacklo_epi64(src_row2_8x16b, src_row3_8x16b);

            tmp_delta2_8x16b = _mm_srli_epi64(src_row0_8x16b, 16);
            tmp_delta3_8x16b = _mm_srli_epi64(src_row2_8x16b, 16);

            tmp_delta2_8x16b = _mm_shuffle_epi32(tmp_delta2_8x16b, 0x08);
            tmp_delta3_8x16b = _mm_shuffle_epi32(tmp_delta3_8x16b, 0x08);
            //arranged q31 q30 p30 p31 q21 q20 p20 p21 q11 q10 p10 p11 q01 q00 p00 p01
            tmp_delta2_8x16b = _mm_unpacklo_epi64(tmp_delta2_8x16b, tmp_delta3_8x16b);

            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
            // (-3q1+9q0),(-9p0+3p1)
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
            //converting to 16 bit
            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
            //getting -tc store
            tmp1_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);
            //calc 10*tc = 2*tc + 8*tc ; 2*tc
            tmp2_const_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
            //calc 10*tc = 2*tc + 8*tc ; 8*tc
            tmp0_const_8x16b = _mm_slli_epi16(consttc_8x16b, 3);
            //getting -tc store
            tmp3_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
            //calc 10*tc
            tmp2_const_8x16b = _mm_add_epi16(tmp2_const_8x16b, tmp0_const_8x16b);
            //const 1
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta3_8x16b, const2_8x16b);
            const2_8x16b = _mm_srli_epi32(tmp1_const_8x16b, 31);
            //getting the mask values
            mask_pq_8x16b = _mm_loadl_epi64((__m128i *)(&mask1));
            //loaded coef for delta1 calculation
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
            //(-2q1+q0),(p0-2p1)
            tmp_delta3_8x16b = _mm_maddubs_epi16(tmp_delta2_8x16b, coefdelta_0_8x16b);
            //const 8
            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);
            //rearranging the mask values
            mask_pq_8x16b = _mm_unpacklo_epi64(mask_pq_8x16b, mask_pq_8x16b);
            //normalisation of the filter
            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);

            //getting deltaq0
            tmp_delta2_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp1_const_8x16b);
            //packing d3q d2q d1q d0q d3p d2p d1p d0p
            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_delta2_8x16b);
            //absolute delta
            tmp_delta2_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);
            //clipping of delta0
            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
            //mask for |delta| < 10*tc
            tmp0_const_8x16b = _mm_cmpgt_epi16(tmp2_const_8x16b, tmp_delta2_8x16b);
            //clipping of delta0
            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp3_const_8x16b);


            //delta 1 calc starts

            //getting q32 q22 q12 q02 p32 p12 p22 p02
            tmp2_const_8x16b = _mm_loadl_epi64((__m128i *)(shuffle0));
            tmp_delta2_8x16b = _mm_shuffle_epi8(src_row0_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_shuffle_epi8(src_row2_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_unpacklo_epi32(tmp_delta2_8x16b, tmp_delta1_8x16b);
            //constant 1
            const2_8x16b = _mm_srli_epi16(tmp1_const_8x16b, 15);
            //tc>>1 16 bit
            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);

            //getting -tc>>1 store 16 bit
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp1_const_8x16b);
            //2*delta0
            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);

            //getting all respective q's and p's together
            tmp3_const_8x16b = _mm_load_si128((__m128i *)(shuffle1));
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta3_8x16b, tmp3_const_8x16b);
            //final adds for deltap1 and deltaq1
            tmp_delta3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, const2_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, tmp_delta3_8x16b);
            tmp2_const_8x16b = _mm_setzero_si128();
            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);

            //clipping delta1
            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
            //clipping delta1
            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);

            //getting the mask ready
            mask_pq_8x16b = _mm_srai_epi16(mask_pq_8x16b, 15);
            //masking of the delta values |delta|<10*tc
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp0_const_8x16b);
            //packing dq1 dq0 dp0 dp1
            tmp1_const_8x16b = _mm_unpacklo_epi16(tmp_delta1_8x16b, tmp_delta0_8x16b);
            tmp_delta0_8x16b = _mm_unpackhi_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);
            tmp_delta1_8x16b = _mm_unpackhi_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);
            tmp_delta0_8x16b = _mm_unpacklo_epi32(tmp1_const_8x16b, tmp_delta0_8x16b);

            //masking of the delta values with dep, deq, filter_p, filter_q
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, mask_pq_8x16b);
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, mask_pq_8x16b);
            //converting 8 bit to 16 bit
            src_row0_8x16b = _mm_unpacklo_epi8(src_row0_8x16b, tmp2_const_8x16b);
            src_row1_8x16b = _mm_unpacklo_epi8(src_row1_8x16b, tmp2_const_8x16b);
            src_row2_8x16b = _mm_unpacklo_epi8(src_row2_8x16b, tmp2_const_8x16b);
            src_row3_8x16b = _mm_unpacklo_epi8(src_row3_8x16b, tmp2_const_8x16b);
            //shuffle values loaded
            tmp0_const_8x16b = _mm_load_si128((__m128i *)shuffle2);
            tmp1_const_8x16b = _mm_load_si128((__m128i *)shuffle3);
            //arranging each row's delta in a different register
            tmp_delta3_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp1_const_8x16b);
            tmp_delta2_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp1_const_8x16b);
            tmp_delta0_8x16b = _mm_shuffle_epi8(tmp_delta0_8x16b, tmp0_const_8x16b);

            //adding the respective delta
            src_row3_8x16b = _mm_add_epi16(tmp_delta3_8x16b, src_row3_8x16b);
            src_row2_8x16b = _mm_add_epi16(tmp_delta2_8x16b, src_row2_8x16b);
            src_row1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_row1_8x16b);
            src_row0_8x16b = _mm_add_epi16(tmp_delta0_8x16b, src_row0_8x16b);
            //saturating to 8 bit
            src_row2_8x16b = _mm_packus_epi16(src_row2_8x16b, src_row3_8x16b);
            src_row0_8x16b = _mm_packus_epi16(src_row0_8x16b, src_row1_8x16b);
            //separating different rows
            src_row1_8x16b = _mm_srli_si128(src_row0_8x16b, 8);
            src_row3_8x16b = _mm_srli_si128(src_row2_8x16b, 8);
        }

        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row0_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), src_row1_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row2_8x16b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), src_row3_8x16b);
    }
}

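/**
*******************************************************************************
*
* @brief
*  Decision process and filtering for the luma block horizontal edge.
*
* @par Description:
*  The decision process for the luma block horizontal edge is carried out
*  and an appropriate filter is applied. The boundary filter strength, bs,
*  should be greater than 0. The pcm flags and the transquant bypass flags
*  should be taken care of by the calling function. The parameters match
*  those of ihevc_deblk_luma_vert_ssse3() above.
*
*******************************************************************************
*/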
void ihevc_deblk_luma_horz_ssse3(UWORD8 *pu1_src,
                                 WORD32 src_strd,
                                 WORD32 bs,
                                 WORD32 quant_param_p,
                                 WORD32 quant_param_q,
                                 WORD32 beta_offset_div2,
                                 WORD32 tc_offset_div2,
                                 WORD32 filter_flag_p,
                                 WORD32 filter_flag_q)
{
    WORD32 qp_luma, beta_indx, tc_indx;
    WORD32 beta, tc;

    WORD32 d0, d3, dp, dq, d;
    WORD32 de_0, de_1, de_2, de_3;
    WORD32 d_sam0, d_sam3;
    WORD32 de, dep, deq;

    __m128i src_q0_8x16b, src_q1_8x16b, src_p0_8x16b, src_p1_8x16b, src_q2_8x16b;
    __m128i tmp_pq_str1_8x16b, src_p2_8x16b, tmp_pq_str0_8x16b;


    {
        __m128i src_tmp_p_0_8x16b, src_tmp_p_1_8x16b, src_tmp_q_0_8x16b, src_tmp_q_1_8x16b;
        __m128i coef_8x16b, mask_d_result_4x32b, mask_de_result_8x16b;
        __m128i mask_16x8b, temp_coef0_8x16b, temp_coef1_8x16b;

        ASSERT((bs > 0));
        ASSERT(filter_flag_p || filter_flag_q);

        qp_luma = (quant_param_p + quant_param_q + 1) >> 1;
        beta_indx = CLIP3(qp_luma + (beta_offset_div2 << 1), 0, 51);

        /* BS can take the value 3 in this implementation if it is an intra/inter edge */
        /* based on BS, the tc index is calculated by adding 2 * (bs - 1) to QP and tc_offset */
        /* for BS = 1 the adding factor is (0*2); for BS = 2 or 3 it is (1*2) */
        /* the desired functionality is achieved by doing (2 * (bs >> 1)) */

        tc_indx = CLIP3(qp_luma + 2 * (bs >> 1) + (tc_offset_div2 << 1), 0, 53);

        beta = gai4_ihevc_beta_table[beta_indx];
        tc = gai4_ihevc_tc_table[tc_indx];
        if(0 == tc)
        {
            return;
        }
        src_q0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src));
        src_q1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));
        src_p0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
        src_p1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
        src_q2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd));
        tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd));
        src_p2_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 3 * src_strd));
        tmp_pq_str0_8x16b = _mm_loadl_epi64((__m128i *)(pu1_src - 4 * src_strd));


        src_tmp_p_0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
        src_tmp_p_1_8x16b = _mm_unpacklo_epi8(tmp_pq_str0_8x16b, src_p2_8x16b);

        src_tmp_q_0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
        src_tmp_q_1_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);

        src_tmp_p_0_8x16b = _mm_unpacklo_epi16(src_tmp_p_1_8x16b, src_tmp_p_0_8x16b);
        src_tmp_q_0_8x16b = _mm_unpacklo_epi16(src_tmp_q_0_8x16b, src_tmp_q_1_8x16b);

        src_tmp_p_0_8x16b = _mm_shuffle_epi32(src_tmp_p_0_8x16b, 0x6c);
        src_tmp_q_0_8x16b = _mm_shuffle_epi32(src_tmp_q_0_8x16b, 0x6c);

        coef_8x16b = _mm_load_si128((__m128i *)(coef_d));
        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_d));

        src_tmp_p_0_8x16b = _mm_unpacklo_epi32(src_tmp_p_0_8x16b, src_tmp_q_0_8x16b);
        //WORD32 shuffle_d[4] = {0x80800403, 0x80800c0b, 0x03000704, 0x0b080f0c};
        mask_de_result_8x16b = _mm_shuffle_epi8(src_tmp_p_0_8x16b, mask_16x8b);

        mask_d_result_4x32b = _mm_maddubs_epi16(src_tmp_p_0_8x16b, coef_8x16b);


        //to get all 1's of 8 bit in (1)
        temp_coef0_8x16b = _mm_cmpeq_epi16(src_tmp_p_0_8x16b, src_tmp_p_0_8x16b);
        temp_coef1_8x16b = _mm_srli_epi16(temp_coef0_8x16b, 15);
        //accumulating values for dp3 dq3, dp0 dq0
        mask_d_result_4x32b = _mm_madd_epi16(mask_d_result_4x32b, temp_coef1_8x16b);

        temp_coef1_8x16b = _mm_packus_epi16(temp_coef1_8x16b, temp_coef1_8x16b);
        // to get all 1,-1 sets of 16 bits in (0)
        temp_coef0_8x16b = _mm_unpacklo_epi8(temp_coef0_8x16b, temp_coef1_8x16b);
        //q33-q30,p33-p30,q03-q00,p03-p00,0,q30-p30,0,q00-p00
        mask_de_result_8x16b = _mm_maddubs_epi16(mask_de_result_8x16b, temp_coef0_8x16b);
        //to get 16 bit 1's
        temp_coef0_8x16b = _mm_srli_epi16(temp_coef1_8x16b, 8);


        // dq3 dp3 dq0 dp0
        mask_d_result_4x32b = _mm_abs_epi32(mask_d_result_4x32b);
        mask_16x8b = _mm_shuffle_epi32(mask_d_result_4x32b, 0xec);
        mask_d_result_4x32b = _mm_shuffle_epi32(mask_d_result_4x32b, 0x49);
        // dq dp d3 d0
        mask_d_result_4x32b = _mm_add_epi32(mask_d_result_4x32b, mask_16x8b);
        //|q33-q30|,|p33-p30|,|q03-q00|,|p03-p00|,0,|q30-p30|,0,|q00-p00|
        mask_de_result_8x16b = _mm_abs_epi16(mask_de_result_8x16b);
        //|q33-q30|+|p33-p30|,|q03-q00|+|p03-p00|,0+|q30-p30|,0+|q00-p00|
        mask_de_result_8x16b = _mm_madd_epi16(mask_de_result_8x16b, temp_coef0_8x16b);

        //store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_d_result_4x32b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_d_result_4x32b, 8);
        mask_16x8b = _mm_srli_si128(mask_d_result_4x32b, 12);

        d0 = _mm_cvtsi128_si32(mask_d_result_4x32b);
        d3 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        dp = _mm_cvtsi128_si32(temp_coef1_8x16b);
        dq = _mm_cvtsi128_si32(mask_16x8b);
        //getting d
        d = d0 + d3;

        //store back in a single variable
        temp_coef0_8x16b = _mm_srli_si128(mask_de_result_8x16b, 4);
        temp_coef1_8x16b = _mm_srli_si128(mask_de_result_8x16b, 8);
        mask_16x8b = _mm_srli_si128(mask_de_result_8x16b, 12);

        de_0 = _mm_cvtsi128_si32(mask_de_result_8x16b);
        de_1 = _mm_cvtsi128_si32(temp_coef0_8x16b);
        de_2 = _mm_cvtsi128_si32(temp_coef1_8x16b);
        de_3 = _mm_cvtsi128_si32(mask_16x8b);

        de = 0;
        dep = 0;
        deq = 0;
        if(d < beta)
        {
            d_sam0 = 0;
            if((2 * d0 < (beta >> 2))
                    && (de_2 < (beta >> 3))
                    && (de_0 < ((5 * tc + 1) >> 1)))
            {
                d_sam0 = 1;
            }

            d_sam3 = 0;
            if((2 * d3 < (beta >> 2))
                    && (de_3 < (beta >> 3))
                    && (de_1 < ((5 * tc + 1) >> 1)))
            {
                d_sam3 = 1;
            }

            de = (d_sam0 & d_sam3) + 1;
            dep = (dp < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
            deq = (dq < ((beta + (beta >> 1)) >> 3)) ? 1 : 0;
            if(tc <= 1)
            {
                dep = 0;
                deq = 0;
            }
        }

    }

    if(de != 0)
    {

        if(2 == de)
        {

            __m128i temp_pq0_str0_16x8b;
            __m128i temp_pq1_str0_16x8b, temp_pq1_str1_16x8b;
            __m128i temp_pq2_str0_16x8b;
            __m128i temp_str0_16x8b, temp_str1_16x8b;
            __m128i const2_8x16b, const2tc_8x16b;

            LWORD64 mask, tc2;
            tc = tc << 1;
            mask = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
            tc2 = ((LWORD64)tc);

            const2_8x16b = _mm_cmpeq_epi16(src_p1_8x16b, src_p1_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_pq0_str0_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
            temp_str0_16x8b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 15);
            //arranged q31 q30 q21 q20 q11 q10 q01 q00 p30 p31 p20 p21 p10 p11 p00 p01
            temp_pq0_str0_16x8b = _mm_unpacklo_epi64(temp_pq0_str0_16x8b, temp_str0_16x8b);

            const2_8x16b = _mm_packus_epi16(const2_8x16b, const2_8x16b);
            temp_pq0_str0_16x8b = _mm_maddubs_epi16(temp_pq0_str0_16x8b, const2_8x16b);

            //q'1-2, p'1-2
            temp_pq1_str0_16x8b = _mm_unpacklo_epi8(src_p0_8x16b, src_q0_8x16b);
            temp_pq1_str1_16x8b = _mm_unpacklo_epi8(src_q1_8x16b, src_q2_8x16b);
            temp_str1_16x8b = _mm_unpacklo_epi8(src_p1_8x16b, src_p2_8x16b);
            // q30 p30 q20 p20 q10 p10 q01 q00 p30 q20 p20 q10 p10 q01 q00 p00
            temp_pq1_str0_16x8b = _mm_unpacklo_epi64(temp_pq1_str0_16x8b, temp_pq1_str0_16x8b);
            // q32 q31 q22 q21 q12 q11 q02 q01 p32 p31 p22 p21 p12 p11 p02 p01
            temp_pq1_str1_16x8b = _mm_unpacklo_epi64(temp_str1_16x8b, temp_pq1_str1_16x8b);

            temp_pq1_str0_16x8b = _mm_maddubs_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            temp_pq1_str1_16x8b = _mm_maddubs_epi16(temp_pq1_str1_16x8b, const2_8x16b);

            //clipping mask design
            temp_str1_16x8b = _mm_setzero_si128();
            temp_str0_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
            const2tc_8x16b = _mm_loadl_epi64((__m128i *)(&tc2));
            temp_str0_16x8b = _mm_shuffle_epi32(temp_str0_16x8b, 0x44);
            const2tc_8x16b = _mm_shuffle_epi8(const2tc_8x16b, temp_str1_16x8b);

            //clipping mask design
            temp_str0_16x8b = _mm_srai_epi32(temp_str0_16x8b, 31);
            const2tc_8x16b = _mm_and_si128(const2tc_8x16b, temp_str0_16x8b);
            //calculating clipping MAX for all pixel values
            src_p0_8x16b = _mm_unpacklo_epi32(src_p0_8x16b, src_q0_8x16b);
            src_q0_8x16b = _mm_unpacklo_epi32(src_p1_8x16b, src_q1_8x16b);
            //for clipping calc
            src_p1_8x16b = _mm_unpacklo_epi64(src_p0_8x16b, src_q0_8x16b);
            //saving the unmodified data of q1 p1 q0 p0
            src_q1_8x16b = _mm_unpackhi_epi64(src_p0_8x16b, src_q0_8x16b);
            //clipping MAX and MIN for q1 p1 q0 p0
            src_p0_8x16b = _mm_adds_epu8(src_p1_8x16b, const2tc_8x16b);
            src_p1_8x16b = _mm_subs_epu8(src_p1_8x16b, const2tc_8x16b);


            //q'2-q'0-2,p'2-p'0-2
            tmp_pq_str0_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp_pq_str0_8x16b);
            temp_pq2_str0_16x8b = _mm_unpacklo_epi8(src_q2_8x16b, tmp_pq_str1_8x16b);
            const2_8x16b = _mm_slli_epi16(const2_8x16b, 1);
            //arranged q33 q32 q23 q22 q13 q12 q03 q02 p32 p33 p22 p23 p12 p13 p02 p03
            temp_pq2_str0_16x8b = _mm_unpacklo_epi64(tmp_pq_str0_8x16b, temp_pq2_str0_16x8b);
            src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, src_q2_8x16b);
            temp_pq2_str0_16x8b = _mm_maddubs_epi16(temp_pq2_str0_16x8b, const2_8x16b);

            //calculating clipping MAX and MIN for p2 and q2
            tmp_pq_str0_8x16b = _mm_adds_epu8(src_p2_8x16b, const2tc_8x16b);
            tmp_pq_str1_8x16b = _mm_subs_epu8(src_p2_8x16b, const2tc_8x16b);
            //q'0-q'1-2 ,p'0-p'1-2
            temp_str0_16x8b = _mm_shuffle_epi32(temp_pq0_str0_16x8b, 0x4e);
            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, temp_str0_16x8b);
            //q'1-2 p'1-2
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq1_str1_16x8b);
            //to get 2 in 16 bit
            const2_8x16b = _mm_srli_epi16(const2_8x16b, 8);


            //q'1, p'1 (adding 2)
            temp_pq1_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, const2_8x16b);
            //q'0-q'1,p'0-p'1
            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq0_str0_16x8b, const2_8x16b);
            //q'2-q'1,p'2-p'1
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq2_str0_16x8b, const2_8x16b);
            //q'0 = (q'0-q'1)+q'1 ,p'0 = (p'0-p'1)+p'1;
            temp_pq0_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq0_str0_16x8b);
            //q'2 = (q'2-q'1)+q'1 ,p'2 = (p'2-p'1)+p'1;
            temp_pq2_str0_16x8b = _mm_add_epi16(temp_pq1_str0_16x8b, temp_pq2_str0_16x8b);

            //normalisation of all modified pixels
            temp_pq0_str0_16x8b = _mm_srai_epi16(temp_pq0_str0_16x8b, 3);
            temp_pq1_str0_16x8b = _mm_srai_epi16(temp_pq1_str0_16x8b, 2);
            temp_pq2_str0_16x8b = _mm_srai_epi16(temp_pq2_str0_16x8b, 3);
            //q'1 p'1 q'0 p'0
            temp_pq0_str0_16x8b = _mm_packus_epi16(temp_pq0_str0_16x8b, temp_pq1_str0_16x8b);
            temp_pq2_str0_16x8b = _mm_packus_epi16(temp_pq2_str0_16x8b, temp_pq2_str0_16x8b);
            //pack with the unmodified data of q2 and p2
            src_p2_8x16b = _mm_unpackhi_epi64(temp_pq2_str0_16x8b, src_p2_8x16b);
            //clipping MAX and MIN for q'1 p'1 q'0 p'0 and q'2 p'2
            temp_pq0_str0_16x8b = _mm_min_epu8(temp_pq0_str0_16x8b, src_p0_8x16b);
            src_p2_8x16b = _mm_min_epu8(src_p2_8x16b, tmp_pq_str0_8x16b);
            temp_pq0_str0_16x8b = _mm_max_epu8(temp_pq0_str0_16x8b, src_p1_8x16b);
            src_p2_8x16b = _mm_max_epu8(src_p2_8x16b, tmp_pq_str1_8x16b);
            //reshuffling q'1 p'1 q'0 p'0 along with unmodified data
            src_p0_8x16b = _mm_unpacklo_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
            src_p1_8x16b = _mm_unpackhi_epi32(temp_pq0_str0_16x8b, src_q1_8x16b);
            src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0xd8);
            src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);
            src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
            src_q2_8x16b = _mm_srli_si128(src_p2_8x16b, 8);

            _mm_storel_epi64((__m128i *)(pu1_src - 3 * src_strd), src_p2_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src + 2 * src_strd), src_q2_8x16b);


        }

        else
        {

            __m128i tmp_delta0_8x16b, tmp_delta1_8x16b;
            __m128i tmp0_const_8x16b, tmp1_const_8x16b, tmp2_const_8x16b;
            __m128i coefdelta_0_8x16b;
            __m128i const2_8x16b, consttc_8x16b;

            LWORD64 maskp0, maskp1, maskq0, maskq1;
            maskp0 = (LWORD64)filter_flag_p;
            maskq0 = (LWORD64)filter_flag_q;
            maskp1 = (LWORD64)dep;
            maskq1 = (LWORD64)deq;
            consttc_8x16b = _mm_set1_epi32(tc);

            tmp_delta0_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, src_p0_8x16b);
            tmp_delta1_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, src_q1_8x16b);
            //arranged q31 q30 p30 p31 q21 q20 p20 p21 q11 q10 p10 p11 q01 q00 p00 p01
            tmp_delta1_8x16b = _mm_unpacklo_epi16(tmp_delta0_8x16b, tmp_delta1_8x16b);

            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_de1);
            // (-3q1+9q0),(-9p0+3p1)
            tmp_delta0_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);

            //getting -tc store
            tmp2_const_8x16b = _mm_cmpeq_epi32(consttc_8x16b, consttc_8x16b);

            //getting tc in 16 bit
            consttc_8x16b = _mm_packs_epi32(consttc_8x16b, consttc_8x16b);
            //calc 10*tc = 2*tc + 8*tc ; 2*tc
            tmp_pq_str0_8x16b = _mm_slli_epi16(consttc_8x16b, 1);
            //calc 10*tc = 2*tc + 8*tc ; 8*tc
            tmp_pq_str1_8x16b = _mm_slli_epi16(consttc_8x16b, 3);

            //const 1
            const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
            //calc 10*tc
            tmp_pq_str0_8x16b = _mm_add_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
            //delta0 without normalisation and clipping
            tmp_delta0_8x16b = _mm_madd_epi16(tmp_delta0_8x16b, const2_8x16b);

            const2_8x16b = _mm_srli_epi32(tmp2_const_8x16b, 31);

            //loaded coef for delta1 calculation
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)coef_dep1);
            //(-2q1+q0),(p0-2p1)
            tmp_delta1_8x16b = _mm_maddubs_epi16(tmp_delta1_8x16b, coefdelta_0_8x16b);
            //const 8
            const2_8x16b = _mm_slli_epi32(const2_8x16b, 3);

            //normalisation of the filter
            tmp_delta0_8x16b = _mm_add_epi32(tmp_delta0_8x16b, const2_8x16b);
            tmp_delta0_8x16b = _mm_srai_epi32(tmp_delta0_8x16b, 4);

            //getting deltaq0
            tmp_pq_str1_8x16b = _mm_sign_epi32(tmp_delta0_8x16b, tmp2_const_8x16b);
            //getting -tc
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
            //packing d03q d02q d01q d00q d03p d02p d01p d00p
            tmp_delta0_8x16b = _mm_packs_epi32(tmp_delta0_8x16b, tmp_pq_str1_8x16b);
            //absolute delta
            tmp_pq_str1_8x16b = _mm_abs_epi16(tmp_delta0_8x16b);

            //clipping of delta0
            tmp_delta0_8x16b = _mm_min_epi16(tmp_delta0_8x16b, consttc_8x16b);
            //tc>>1 16 bit
            consttc_8x16b = _mm_srai_epi16(consttc_8x16b, 1);
            //clipping of delta0
            tmp_delta0_8x16b = _mm_max_epi16(tmp_delta0_8x16b, tmp1_const_8x16b);

            //(-tc)>>1 16 bit
            tmp1_const_8x16b = _mm_sign_epi16(consttc_8x16b, tmp2_const_8x16b);
            //mask for |delta| < 10*tc
            tmp_pq_str0_8x16b = _mm_cmpgt_epi16(tmp_pq_str0_8x16b, tmp_pq_str1_8x16b);
            //delta 1 calc starts

            //getting q32 q22 q12 q02 p32 p12 p22 p02
            tmp0_const_8x16b = _mm_setzero_si128();
            src_q2_8x16b = _mm_unpacklo_epi8(src_q2_8x16b, tmp0_const_8x16b);
            src_p2_8x16b = _mm_unpacklo_epi8(src_p2_8x16b, tmp0_const_8x16b);
            src_p2_8x16b = _mm_unpacklo_epi64(src_p2_8x16b, src_q2_8x16b);
            //constant 1
            const2_8x16b = _mm_srli_epi16(tmp2_const_8x16b, 15);
            //2*delta0
            tmp2_const_8x16b = _mm_add_epi16(tmp_delta0_8x16b, tmp_delta0_8x16b);
            //getting all respective q's and p's together
            coefdelta_0_8x16b = _mm_load_si128((__m128i *)(shuffle1));
            tmp_delta1_8x16b = _mm_shuffle_epi8(tmp_delta1_8x16b, coefdelta_0_8x16b);
            //final adds for deltap1 and deltaq1
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, const2_8x16b);
            src_p2_8x16b = _mm_add_epi16(src_p2_8x16b, tmp2_const_8x16b);
            tmp_delta1_8x16b = _mm_add_epi16(tmp_delta1_8x16b, src_p2_8x16b);
            tmp_delta1_8x16b = _mm_srai_epi16(tmp_delta1_8x16b, 2);

            //mask0 = (((LWORD64)filter_flag_q) << 63) | (((LWORD64)filter_flag_p) << 31);
            tmp_pq_str1_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq0)));
            src_p2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp0)));

            // src_p2_8x16b = _mm_set_epi32(filter_flag_q, filter_flag_p, filter_flag_q, filter_flag_p);
            //mask1 = (((LWORD64)(filter_flag_q & deq)) << 63) | (((LWORD64)(filter_flag_p & dep)) << 31);
            src_q2_8x16b = _mm_loadl_epi64((__m128i *)(&(maskq1)));
            coefdelta_0_8x16b = _mm_loadl_epi64((__m128i *)(&(maskp1)));

            src_p2_8x16b = _mm_unpacklo_epi32(src_p2_8x16b, tmp_pq_str1_8x16b);
            src_q2_8x16b = _mm_unpacklo_epi32(coefdelta_0_8x16b, src_q2_8x16b);
            //src_q2_8x16b = _mm_set_epi32(deq, dep, deq, dep);
            src_q2_8x16b = _mm_and_si128(src_q2_8x16b, src_p2_8x16b);

            //rearranging the mask values
            src_q2_8x16b = _mm_shuffle_epi32(src_q2_8x16b, 0x50);
            src_p2_8x16b = _mm_shuffle_epi32(src_p2_8x16b, 0x50);

            src_q2_8x16b = _mm_slli_epi32(src_q2_8x16b, 31);
            src_p2_8x16b = _mm_slli_epi32(src_p2_8x16b, 31);
            src_q2_8x16b = _mm_srai_epi32(src_q2_8x16b, 31);
            src_p2_8x16b = _mm_srai_epi32(src_p2_8x16b, 31);

            //combining mask delta1
            tmp_pq_str1_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_q2_8x16b);
            //clipping delta1
            tmp_delta1_8x16b = _mm_min_epi16(tmp_delta1_8x16b, consttc_8x16b);
            //combining mask delta0
            tmp_pq_str0_8x16b = _mm_and_si128(tmp_pq_str0_8x16b, src_p2_8x16b);
            //clipping delta1
            tmp_delta1_8x16b = _mm_max_epi16(tmp_delta1_8x16b, tmp1_const_8x16b);


            //masking of the delta values |delta|<10*tc
            tmp_delta1_8x16b = _mm_and_si128(tmp_delta1_8x16b, tmp_pq_str1_8x16b);
            tmp_delta0_8x16b = _mm_and_si128(tmp_delta0_8x16b, tmp_pq_str0_8x16b);
            //separating p and q delta0 and adding p0 and q0
            tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
            tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta0_8x16b, tmp0_const_8x16b);
            src_p0_8x16b = _mm_unpacklo_epi8(src_p0_8x16b, tmp0_const_8x16b);
            src_q0_8x16b = _mm_unpacklo_epi8(src_q0_8x16b, tmp0_const_8x16b);
            src_p0_8x16b = _mm_add_epi16(src_p0_8x16b, tmp_pq_str0_8x16b);
            src_q0_8x16b = _mm_add_epi16(src_q0_8x16b, tmp_pq_str1_8x16b);
            //separating p and q delta1 and adding p1 and q1
            tmp_pq_str0_8x16b = _mm_unpacklo_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
            tmp_pq_str1_8x16b = _mm_unpackhi_epi64(tmp_delta1_8x16b, tmp0_const_8x16b);
            src_p1_8x16b = _mm_unpacklo_epi8(src_p1_8x16b, tmp0_const_8x16b);
            src_q1_8x16b = _mm_unpacklo_epi8(src_q1_8x16b, tmp0_const_8x16b);
            src_p1_8x16b = _mm_add_epi16(src_p1_8x16b, tmp_pq_str0_8x16b);
            src_q1_8x16b = _mm_add_epi16(src_q1_8x16b, tmp_pq_str1_8x16b);
            //packing p1 q1 and p0 q0 to 8 bit
            src_p1_8x16b = _mm_packus_epi16(src_p1_8x16b, src_q1_8x16b);
            src_p0_8x16b = _mm_packus_epi16(src_p0_8x16b, src_q0_8x16b);

            src_q1_8x16b = _mm_srli_si128(src_p1_8x16b, 8);
            src_q0_8x16b = _mm_srli_si128(src_p0_8x16b, 8);

            _mm_storel_epi64((__m128i *)(pu1_src - 2 * src_strd), src_p1_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src), src_q0_8x16b);
            _mm_storel_epi64((__m128i *)(pu1_src + src_strd), src_q1_8x16b);


        }

    }

}

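/**
*******************************************************************************
*
* @brief
*  Decision process and filtering for the chroma block vertical edge.
*
* @par Description:
*  Filtering for the chroma block vertical edge is carried out for both the
*  U and V components, which are assumed to be interleaved in pu1_src. This
*  function is expected to be called only when the boundary strength is 2;
*  the pcm flags and the transquant bypass flags should be taken care of by
*  the calling function.
*
*******************************************************************************
*/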
void ihevc_deblk_chroma_vert_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;

    __m128i src_row_0_16x8b, tmp_pxl_0_16x8b, src_row_2_16x8b, tmp_pxl_1_16x8b;
    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2 */
    /* this function is assumed to be called only if BS is 2 */
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);
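
    /* gai4_ihevc_qp_table maps the average luma QP to the chroma QP
     * (identity at low QP, compressed at high QP); beyond the end of the
     * table the code falls back to QpC = Qp - 6. */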

    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    src_row_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 4));
    tmp_pxl_0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd - 4));
    src_row_2_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 2 * src_strd - 4));
    tmp_pxl_1_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + 3 * src_strd - 4));

    {
        LWORD64 mask_tc, mask_flag, mask;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
        mask = 0xffff00000000ffffLL;

        src_row_0_16x8b = _mm_unpacklo_epi64(src_row_0_16x8b, tmp_pxl_0_16x8b);
        src_row_2_16x8b = _mm_unpacklo_epi64(src_row_2_16x8b, tmp_pxl_1_16x8b);

        mask_16x8b = _mm_load_si128((__m128i *)(shuffle_uv));
        // qv11 qu11 qv10 qu10 qv01 qu01 qv00 qu00 pv10 pu10 pv11 pu11 pv00 pu00 pv01 pu01
        // qv31 qu31 qv30 qu30 qv21 qu21 qv20 qu20 pv30 pu30 pv31 pu31 pv20 pu20 pv21 pu21
        delta_vu0_16x8b = _mm_shuffle_epi8(src_row_0_16x8b, mask_16x8b);
        delta_vu1_16x8b = _mm_shuffle_epi8(src_row_2_16x8b, mask_16x8b);

        tmp_pxl_0_16x8b = _mm_unpacklo_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        tmp_pxl_1_16x8b = _mm_unpackhi_epi64(delta_vu0_16x8b, delta_vu1_16x8b);
        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
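
        /* The two madd results combine to the scalar chroma delta
         * (a reference sketch, not compiled):
         *     delta = CLIP3(((((q0 - p0) << 2) + p1 - q1 + 4) >> 3), -tc, tc);
         *     p0' = p0 + delta;  q0' = q0 - delta;  // per component
         */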

        //generating offset 4
        const_16x8b = _mm_cmpeq_epi16(tmp_pxl_0_16x8b, tmp_pxl_0_16x8b);
        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting to const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp and filterq flags
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);

        //modified delta with the filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);
        //converting to const 4
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);

        mask_16x8b = _mm_loadl_epi64((__m128i *)(&mask));
        //offset addition
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        //eliminating q1
        tmp_pxl_1_16x8b = _mm_slli_epi16(tmp_pxl_1_16x8b, 8);

        const_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);
        mask_16x8b = _mm_shuffle_epi32(mask_16x8b, 0x44);

        //clipping MAX
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //getting p0 and eliminating p1
        tmp_pxl_0_16x8b = _mm_srli_epi16(tmp_pxl_0_16x8b, 8);
        //clipping MIN
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);
        //getting q0
        tmp_pxl_1_16x8b = _mm_srli_epi16(tmp_pxl_1_16x8b, 8);
        //masking filter flag
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q-delta, p+delta
        tmp_pxl_1_16x8b = _mm_sub_epi16(tmp_pxl_1_16x8b, delta_vu1_16x8b);
        tmp_pxl_0_16x8b = _mm_add_epi16(tmp_pxl_0_16x8b, delta_vu0_16x8b);
        //merging q0 and p0 of respective rows
        delta_vu1_16x8b = _mm_unpackhi_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        delta_vu0_16x8b = _mm_unpacklo_epi32(tmp_pxl_0_16x8b, tmp_pxl_1_16x8b);
        // row 0 and row 1 packed, row 2 and row 3 packed
        delta_vu0_16x8b = _mm_packus_epi16(delta_vu0_16x8b, const_16x8b);
        delta_vu1_16x8b = _mm_packus_epi16(delta_vu1_16x8b, const_16x8b);
        //removing older pixel values
        src_row_0_16x8b = _mm_and_si128(src_row_0_16x8b, mask_16x8b);
        src_row_2_16x8b = _mm_and_si128(src_row_2_16x8b, mask_16x8b);
        //arranging modified pixels
        delta_vu0_16x8b = _mm_shuffle_epi32(delta_vu0_16x8b, 0xd8);
        delta_vu1_16x8b = _mm_shuffle_epi32(delta_vu1_16x8b, 0xd8);
        delta_vu0_16x8b = _mm_slli_epi64(delta_vu0_16x8b, 16);
        delta_vu1_16x8b = _mm_slli_epi64(delta_vu1_16x8b, 16);
        //plugging in the modified values
        src_row_0_16x8b = _mm_or_si128(src_row_0_16x8b, delta_vu0_16x8b);
        src_row_2_16x8b = _mm_or_si128(src_row_2_16x8b, delta_vu1_16x8b);


        //getting values for row 1 and row 3
        tmp_pxl_0_16x8b = _mm_srli_si128(src_row_0_16x8b, 8);
        tmp_pxl_1_16x8b = _mm_srli_si128(src_row_2_16x8b, 8);

        _mm_storel_epi64((__m128i *)(pu1_src - 4), src_row_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + src_strd), tmp_pxl_0_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 2 * src_strd), src_row_2_16x8b);
        _mm_storel_epi64((__m128i *)((pu1_src - 4) + 3 * src_strd), tmp_pxl_1_16x8b);
    }

}

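/**
*******************************************************************************
*
* @brief
*  Decision process and filtering for the chroma block horizontal edge.
*
* @par Description:
*  Filtering for the chroma block horizontal edge is carried out for both
*  the U and V components, which are assumed to be interleaved in pu1_src.
*  This function is expected to be called only when the boundary strength
*  is 2; the pcm flags and the transquant bypass flags should be taken care
*  of by the calling function.
*
*******************************************************************************
*/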
void ihevc_deblk_chroma_horz_ssse3(UWORD8 *pu1_src,
                                   WORD32 src_strd,
                                   WORD32 quant_param_p,
                                   WORD32 quant_param_q,
                                   WORD32 qp_offset_u,
                                   WORD32 qp_offset_v,
                                   WORD32 tc_offset_div2,
                                   WORD32 filter_flag_p,
                                   WORD32 filter_flag_q)
{
    WORD32 qp_indx_u, qp_chroma_u;
    WORD32 qp_indx_v, qp_chroma_v;
    WORD32 tc_indx_u, tc_u;
    WORD32 tc_indx_v, tc_v;


    __m128i tmp_p0_16x8b, src_p0_16x8b, src_q0_16x8b, tmp_q0_16x8b;

    ASSERT(filter_flag_p || filter_flag_q);

    /* chroma processing is done only if BS is 2 */
    /* this function is assumed to be called only if BS is 2 */
    qp_indx_u = qp_offset_u + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_u = qp_indx_u < 0 ? qp_indx_u : (qp_indx_u > 57 ? qp_indx_u - 6 : gai4_ihevc_qp_table[qp_indx_u]);

    qp_indx_v = qp_offset_v + ((quant_param_p + quant_param_q + 1) >> 1);
    qp_chroma_v = qp_indx_v < 0 ? qp_indx_v : (qp_indx_v > 57 ? qp_indx_v - 6 : gai4_ihevc_qp_table[qp_indx_v]);

    tc_indx_u = CLIP3(qp_chroma_u + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_u = gai4_ihevc_tc_table[tc_indx_u];

    tc_indx_v = CLIP3(qp_chroma_v + 2 + (tc_offset_div2 << 1), 0, 53);
    tc_v = gai4_ihevc_tc_table[tc_indx_v];

    if(0 == tc_u && 0 == tc_v)
    {
        return;
    }
    tmp_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - 2 * src_strd));
    src_p0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src - src_strd));
    src_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
    tmp_q0_16x8b = _mm_loadl_epi64((__m128i *)(pu1_src + src_strd));

    {
        LWORD64 mask_tc, mask_flag;
        __m128i delta_vu0_16x8b, delta_vu1_16x8b;
        __m128i mask_tc_16x8, mask_16x8b, mask_flag_p_16x8b, mask_flag_q_16x8b;
        __m128i min_0_16x8b;
        __m128i const_16x8b;
        mask_flag = (((LWORD64)filter_flag_p) << 31) | (((LWORD64)filter_flag_q) << 63);
        mask_tc = (((LWORD64)tc_v) << 16) | ((LWORD64)tc_u);
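
        /* tc_u sits in the low 16 bits of mask_tc and tc_v in the next 16;
         * the _mm_shuffle_epi32 below replicates the pair across the
         * register so the interleaved U/V deltas are each clamped with
         * their own tc. */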

        tmp_p0_16x8b = _mm_unpacklo_epi8(tmp_p0_16x8b, src_p0_16x8b);
        tmp_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, tmp_q0_16x8b);

        // pv30 pv31 pu30 pu31 pv20 pv21 pu20 pu21 pv10 pv11 pu10 pu11 pv00 pv01 pu00 pu01
        // qv31 qv30 qu31 qu30 qv21 qv20 qu21 qu20 qv11 qv10 qu11 qu10 qv01 qv00 qu01 qu00
        delta_vu0_16x8b = _mm_load_si128((__m128i *)delta0);
        delta_vu1_16x8b = _mm_load_si128((__m128i *)delta1);

        delta_vu0_16x8b = _mm_maddubs_epi16(tmp_p0_16x8b, delta_vu0_16x8b);
        delta_vu1_16x8b = _mm_maddubs_epi16(tmp_q0_16x8b, delta_vu1_16x8b);


        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_loadl_epi64((__m128i *)(&mask_tc));
        mask_flag_q_16x8b = _mm_loadl_epi64((__m128i *)(&mask_flag));

        //generating offset 4
        const_16x8b = _mm_cmpeq_epi16(tmp_p0_16x8b, tmp_p0_16x8b);
        // filter flag mask and tc mask
        mask_tc_16x8 = _mm_shuffle_epi32(mask_tc_16x8, 0x00);
        mask_flag_q_16x8b = _mm_srai_epi32(mask_flag_q_16x8b, 31);
        //-tc
        min_0_16x8b = _mm_sign_epi16(mask_tc_16x8, const_16x8b);
        //converting to const 1
        const_16x8b = _mm_srli_epi16(const_16x8b, 15);

        //filterp flag
        mask_flag_p_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x00);


        //converting to const 4
        const_16x8b = _mm_slli_epi16(const_16x8b, 2);
        //modified delta with the filter (1 -4 4 -1) available in 16 bit
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, delta_vu1_16x8b);

        //filterq flag
        mask_flag_q_16x8b = _mm_shuffle_epi32(mask_flag_q_16x8b, 0x55);
        //offset addition
        delta_vu0_16x8b = _mm_add_epi16(delta_vu0_16x8b, const_16x8b);
        mask_16x8b = _mm_setzero_si128();
        //filter after normalisation
        delta_vu0_16x8b = _mm_srai_epi16(delta_vu0_16x8b, 3);

        //converting p0 to 16 bit
        src_p0_16x8b = _mm_unpacklo_epi8(src_p0_16x8b, mask_16x8b);
        //clipping MAX
        delta_vu0_16x8b = _mm_min_epi16(delta_vu0_16x8b, mask_tc_16x8);
        //converting q0 to 16 bit
        src_q0_16x8b = _mm_unpacklo_epi8(src_q0_16x8b, mask_16x8b);
        //clipping MIN
        delta_vu0_16x8b = _mm_max_epi16(delta_vu0_16x8b, min_0_16x8b);

        //masking filter flag
        delta_vu1_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_q_16x8b);
        delta_vu0_16x8b = _mm_and_si128(delta_vu0_16x8b, mask_flag_p_16x8b);

        // q-delta, p+delta
        src_q0_16x8b = _mm_sub_epi16(src_q0_16x8b, delta_vu1_16x8b);
        src_p0_16x8b = _mm_add_epi16(src_p0_16x8b, delta_vu0_16x8b);

        // p0 and q0 packed
        src_q0_16x8b = _mm_packus_epi16(src_q0_16x8b, mask_16x8b);
        src_p0_16x8b = _mm_packus_epi16(src_p0_16x8b, mask_16x8b);


        _mm_storel_epi64((__m128i *)(pu1_src - src_strd), src_p0_16x8b);
        _mm_storel_epi64((__m128i *)(pu1_src), src_q0_16x8b);

    }

}