//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///*****************************************************************************/
///*                                                                           */
///*  File Name         : ih264_deblk_luma_av8.s                               */
///*                                                                           */
///*  Description       : Contains function definitions for deblocking luma    */
///*                      edge. Functions are coded in NEON assembly and can   */
///*                      be compiled using ARM RVDS.                          */
///*                                                                           */
///*  List of Functions : ih264_deblk_luma_vert_bs4_av8()                      */
///*                      ih264_deblk_luma_vert_bslt4_av8()                    */
///*                      ih264_deblk_luma_horz_bs4_av8()                      */
///*                      ih264_deblk_luma_horz_bslt4_av8()                    */
///*                                                                           */
///*  Issues / Problems : None                                                 */
///*                                                                           */
///*  Revision History  :                                                      */
///*                                                                           */
///*         DD MM YYYY   Author(s)       Changes (Describe the changes made)  */
///*         28 11 2013   Ittiam          Draft                                */
///*                                                                           */
///*****************************************************************************/

.text
.p2align 2
.include "ih264_neon_macros.s"



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a luma block horizontal edge for cases where the
//*     boundary strength is less than 4
//*
//* @par Description:
//*     This operation is described in Sec. 8.7.2.3 under the title
//*     "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//*     Pointer to the src sample q0
//*
//* @param[in] w1 - src_strd
//*     Source stride
//*
//* @param[in] w2 - alpha
//*     Alpha Value for the boundary
//*
//* @param[in] w3 - beta
//*     Beta Value for the boundary
//*
//* @param[in] w4 - u4_bs
//*     Packed Boundary strength array (4 bytes, one per 4-pixel quarter)
//*
//* @param[in] x5 - pu1_cliptab
//*     tc0_table (C0 clip values indexed by boundary strength)
//*
//* @returns
//*     None
//*
//* @remarks
//*     Processes one 16-pixel-wide horizontal edge in-place. Clobbers
//*     x0, x6, x7 and NEON scratch registers (saved/restored by
//*     push_v_regs/pop_v_regs); x19/x20 are preserved explicitly.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_luma_horz_bslt4_av8

ih264_deblk_luma_horz_bslt4_av8:

    // Original ARMv7 prologue: STMFD sp!,{x4-x7,x14}
    push_v_regs
    sxtw    x1, w1                      // sign-extend src_strd for 64-bit addressing
    stp     x19, x20, [sp, #-16]!       // save callee-saved pair

    //LDRD x4,x5,[SP,#0x14]             // ARMv7 note: x4 = ui_Bs, x5 = *puc_ClpTab
    sub     x0, x0, x1, lsl #1          // x0 = q0 - 2*stride
    sub     x0, x0, x1                  // x0 -> p2 row (q0 - 3*stride)
    rev     w4, w4                      // reverse packed bs so lanes match pixel order
    ld1     {v10.8b, v11.8b}, [x0], x1  // load p2 row
    mov     v12.s[0], w4                // v12.s[0] = u4_bs
    mov     x6, x0                      // backup: pointer to p1 row
    ld1     {v8.8b, v9.8b}, [x0], x1    // load p1 row
    mov     x7, x0                      // backup: pointer to p0 row
    ld1     {v6.8b, v7.8b}, [x0], x1    // load p0 row
    uxtl    v12.8h, v12.8b              // widen bs bytes to 16-bit lanes
    ld1     {v0.8b, v1.8b}, [x0], x1    // load q0 row
    mov     v10.d[1], v11.d[0]          // pack p2 into one 128-bit register
    mov     v8.d[1], v9.d[0]            // pack p1
    mov     v6.d[1], v7.d[0]            // pack p0
    uabd    v26.16b, v8.16b, v6.16b     // ABS(p1 - p0)
    ld1     {v2.8b, v3.8b}, [x0], x1    // load q1 row
    mov     v0.d[1], v1.d[0]            // pack q0
    mov     v2.d[1], v3.d[0]            // pack q1
    uabd    v22.16b, v6.16b, v0.16b     // ABS(p0 - q0)
    ld1     {v16.s}[0], [x5]            // v16.s[0] = 4 clip-table (tc0) entries
    uabd    v24.16b, v2.16b, v0.16b     // ABS(q1 - q0)
    ld1     {v4.8b, v5.8b}, [x0], x1    // load q2 row
    tbl     v14.8b, {v16.16b}, v12.8b   // C0 = puc_cliptab[bs] per lane
    mov     v4.d[1], v5.d[0]            // pack q2
    dup     v20.16b, w2                 // v20 = alpha replicated
    dup     v16.16b, w3                 // v16 = beta replicated
    uxtl    v12.4s, v12.4h              // spread bs to one 32-bit lane per quarter
    uxtl    v14.4s, v14.4h              // spread C0 likewise
    uabd    v28.16b, v10.16b, v6.16b    // Ap = ABS(p2 - p0)
    uabd    v30.16b, v4.16b, v0.16b     // Aq = ABS(q2 - q0)
    cmgt    v12.4s, v12.4s, #0          // mask: bs != 0
    sli     v14.4s, v14.4s, #8          // replicate C0 byte across ...
    cmhs    v18.16b, v22.16b, v20.16b   // ABS(p0 - q0) >= alpha
    cmhs    v24.16b, v24.16b, v16.16b   // ABS(q1 - q0) >= beta
    cmhs    v26.16b, v26.16b, v16.16b   // ABS(p1 - p0) >= beta
    cmhi    v20.16b, v16.16b, v28.16b   // (Ap < beta)
    cmhi    v22.16b, v16.16b, v30.16b   // (Aq < beta)
    sli     v14.4s, v14.4s, #16         // ... all 4 bytes of each 32-bit lane
    orr     v18.16b, v18.16b, v24.16b   // no-filter: >=alpha || ABS(q1-q0)>=beta
    usubl   v30.8h, v1.8b, v7.8b        // (q0 - p0) high half
    usubl   v24.8h, v0.8b, v6.8b        // (q0 - p0) low half
    orr     v18.16b, v18.16b, v26.16b   // ... || ABS(p1-p0) >= beta
    usubl   v28.8h, v8.8b, v2.8b        // (p1 - q1) low half
    shl     v26.8h, v30.8h, #2          // (q0 - p0) << 2, high
    shl     v24.8h, v24.8h, #2          // (q0 - p0) << 2, low
    usubl   v30.8h, v9.8b, v3.8b        // (p1 - q1) high half
    bic     v12.16b, v12.16b, v18.16b   // final filter mask: (bs != 0) && !(no-filter)
    add     v24.8h, v24.8h, v28.8h      // low:  4*(q0-p0) + (p1-q1)
    add     v26.8h, v26.8h, v30.8h      // high: 4*(q0-p0) + (p1-q1)
    sub     v18.16b, v14.16b, v20.16b   // C = C0 + (Ap < beta); mask is all-ones, so subtract
    urhadd  v16.16b, v6.16b, v0.16b     // (p0 + q0 + 1) >> 1
    mov     v17.d[0], v16.d[1]          // high half of the average as a D register
    sqrshrn v24.8b, v24.8h, #3          // i_macro low  = (4*(q0-p0)+(p1-q1)+4)>>3
    sqrshrn v25.8b, v26.8h, #3          // i_macro high
    mov     v24.d[1], v25.d[0]          // pack i_macro
    sub     v18.16b, v18.16b, v22.16b   // C = C0 + (Ap < beta) + (Aq < beta)
    and     v20.16b, v20.16b, v12.16b   // (Ap < beta) gated by filter mask
    and     v22.16b, v22.16b, v12.16b   // (Aq < beta) gated by filter mask
    abs     v26.16b, v24.16b            // ABS(i_macro)
    uaddl   v28.8h, v17.8b, v11.8b      // high: p2 + ((p0+q0+1)>>1)
    uaddl   v10.8h, v16.8b, v10.8b      // low:  p2 + ((p0+q0+1)>>1)
    uaddl   v30.8h, v17.8b, v5.8b       // high: q2 + ((p0+q0+1)>>1)
    umin    v18.16b, v26.16b, v18.16b   // delta = min(ABS(i_macro), C)
    ushll   v26.8h, v9.8b, #1           // p1 << 1, high
    uaddl   v4.8h, v16.8b, v4.8b        // low:  q2 + ((p0+q0+1)>>1)
    ushll   v16.8h, v8.8b, #1           // p1 << 1, low
    and     v18.16b, v18.16b, v12.16b   // zero delta where filtering is disabled
    sub     v28.8h, v28.8h, v26.8h      // high: [p2 + avg] - 2*p1
    sub     v10.8h, v10.8h, v16.8h      // low:  [p2 + avg] - 2*p1
    ushll   v16.8h, v2.8b, #1           // q1 << 1, low
    ushll   v26.8h, v3.8b, #1           // q1 << 1, high
    sqshrn  v29.8b, v28.8h, #1          // i_macro_p1 high
    sqshrn  v28.8b, v10.8h, #1          // i_macro_p1 low
    mov     v28.d[1], v29.d[0]          // pack i_macro_p1
    sub     v4.8h, v4.8h, v16.8h        // low:  [q2 + avg] - 2*q1
    sub     v30.8h, v30.8h, v26.8h      // high: [q2 + avg] - 2*q1
    neg     v26.16b, v14.16b            // -C0
    smin    v28.16b, v28.16b, v14.16b   // min(C0, i_macro_p1)
    cmge    v24.16b, v24.16b, #0        // sign mask: i_macro >= 0
    sqshrn  v31.8b, v30.8h, #1          // i_macro_q1 high
    sqshrn  v30.8b, v4.8h, #1           // i_macro_q1 low
    mov     v30.d[1], v31.d[0]          // pack i_macro_q1
    smax    v28.16b, v28.16b, v26.16b   // clamp: max(-C0, min(C0, i_macro_p1))
    uqadd   v16.16b, v6.16b, v18.16b    // p0 + delta (saturating)
    uqsub   v6.16b, v6.16b, v18.16b     // p0 - delta (saturating)
    smin    v30.16b, v30.16b, v14.16b   // min(C0, i_macro_q1)
    and     v28.16b, v20.16b, v28.16b   // apply p1 delta only where Ap < beta
    uqadd   v14.16b, v0.16b, v18.16b    // q0 + delta (saturating)
    uqsub   v0.16b, v0.16b, v18.16b     // q0 - delta (saturating)
    smax    v30.16b, v30.16b, v26.16b   // clamp: max(-C0, min(C0, i_macro_q1))
    bif     v16.16b, v6.16b, v24.16b    // p0' = (i_macro >= 0) ? p0+delta : p0-delta
    bif     v0.16b, v14.16b, v24.16b    // q0' = (i_macro >= 0) ? q0-delta : q0+delta
    add     v28.16b, v28.16b, v8.16b    // p1' = p1 + clamped p1-delta
    and     v30.16b, v22.16b, v30.16b   // apply q1 delta only where Aq < beta
    st1     {v16.16b}, [x7], x1         // write back filtered p0
    add     v30.16b, v30.16b, v2.16b    // q1' = q1 + clamped q1-delta
    st1     {v0.16b}, [x7], x1          // write back filtered q0
    st1     {v28.16b}, [x6]             // write back filtered p1
    st1     {v30.16b}, [x7], x1         // write back filtered q1

    // Original ARMv7 epilogue: LDMFD sp!,{x4-x7,pc}
    ldp     x19, x20, [sp], #16         // restore callee-saved pair
    pop_v_regs
    ret



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a luma block horizontal edge when the
//*     boundary strength is set to 4
//*
//* @par Description:
//*     This operation is described in Sec. 8.7.2.4 under the title
//*     "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//*     Pointer to the src sample q0
//*
//* @param[in] w1 - src_strd
//*     Source stride
//*
//* @param[in] w2 - alpha
//*     Alpha Value for the boundary
//*
//* @param[in] w3 - beta
//*     Beta Value for the boundary
//*
//* @returns
//*     None
//*
//* @remarks
//*     Processes one 16-pixel-wide horizontal edge in-place (strong filter,
//*     bS == 4: rewrites up to p2..q2). Clobbers x0, x2, x3, x12, x14 and
//*     NEON scratch registers; x19/x20 are preserved explicitly.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_luma_horz_bs4_av8

ih264_deblk_luma_horz_bs4_av8:

    // Original ARMv7 prologue: STMFD sp!,{x12,x14}
    push_v_regs
    stp     x19, x20, [sp, #-16]!       // save callee-saved pair
    sxtw    x1, w1                      // sign-extend src_strd for 64-bit addressing

    // Init
    dup     v0.16b, w2                  // v0 = alpha replicated
    sub     x12, x0, x1                 // x12 -> p0 (q0 - stride)
    dup     v2.16b, w3                  // v2 = beta replicated
    sub     x14, x0, x1, lsl#1          // x14 -> p1 (q0 - 2*stride)
    sub     x2, x0, x1, lsl#2           // x2  -> p3 (q0 - 4*stride)
    sub     x3, x14, x1                 // x3  -> p2 (p1 - stride)

    // Load data
    ld1     {v4.8b, v5.8b}, [x0], x1    // q0; x0 advances to q1
    ld1     {v6.8b, v7.8b}, [x12]       // p0
    ld1     {v8.8b, v9.8b}, [x0], x1    // q1; x0 advances to q2
    ld1     {v10.8b, v11.8b}, [x14]     // p1
    mov     v4.d[1], v5.d[0]            // pack q0 into one 128-bit register
    mov     v6.d[1], v7.d[0]            // pack p0
    mov     v8.d[1], v9.d[0]            // pack q1
    mov     v10.d[1], v11.d[0]          // pack p1

    // Filter decision
    uabd    v12.16b, v4.16b, v6.16b     // ABS(q0 - p0)
    uabd    v14.16b, v8.16b, v4.16b     // ABS(q1 - q0)
    uabd    v16.16b, v10.16b, v6.16b    // ABS(p1 - p0)
    cmhs    v18.16b, v12.16b, v0.16b    // ABS(p0 - q0) >= alpha
    cmhs    v14.16b, v14.16b, v2.16b    // ABS(q1 - q0) >= beta
    cmhs    v16.16b, v16.16b, v2.16b    // ABS(p1 - p0) >= beta
    movi    v20.16b, #2
    orr     v18.16b, v18.16b, v14.16b   // >=alpha || ABS(q1-q0)>=beta
    ld1     {v14.8b, v15.8b}, [x0], x1  // q2; x0 advances to q3
    mov     v14.d[1], v15.d[0]          // pack q2
    orr     v18.16b, v18.16b, v16.16b   // ... || ABS(p1-p0)>=beta  => no-filter mask
    usra    v20.16b, v0.16b, #2         // (alpha >> 2) + 2
    uabd    v22.16b, v14.16b, v4.16b    // Aq = ABS(q2 - q0)
    uaddl   v24.8h, v4.8b, v6.8b        // p0+q0 low
    uaddl   v26.8h, v5.8b, v7.8b        // p0+q0 high
    cmhi    v22.16b, v2.16b, v22.16b    // Aq < beta
    cmhi    v20.16b, v20.16b, v12.16b   // ABS(p0 - q0) < ((alpha >> 2) + 2)
    // Deblock filtering on the q side: q0', q1', q2'
    uaddw   v28.8h, v24.8h, v8.8b       // p0+q0+q1 low
    uaddw   v30.8h, v26.8h, v9.8b       // p0+q0+q1 high
    and     v22.16b, v22.16b, v20.16b   // strong-q: Aq<beta && ABS(p0-q0)<(alpha>>2)+2
    // q0' (strong case): (2*(p0+q0+q1) + q2 + p1 + 4) >> 3
    add     v16.8h, v28.8h, v28.8h      // 2*(p0+q0+q1) low
    add     v0.8h, v30.8h, v30.8h       // 2*(p0+q0+q1) high
    uaddw   v16.8h, v16.8h, v14.8b      // + q2 low
    uaddw   v0.8h, v0.8h, v15.8b        // + q2 high
    uaddw   v16.8h, v16.8h, v10.8b      // + p1 low
    uaddw   v0.8h, v0.8h, v11.8b        // + p1 high
    rshrn   v12.8b, v16.8h, #3          // q0' low
    rshrn   v13.8b, v0.8h, #3           // q0' high
    mov     v12.d[1], v13.d[0]          // pack q0'
    // q0" (weak case): (2*q1 + q0 + p1 + 2) >> 2
    uaddl   v16.8h, v8.8b, v8.8b        // 2*q1 low
    uaddl   v0.8h, v9.8b, v9.8b         // 2*q1 high
    uaddw   v16.8h, v16.8h, v4.8b       // + q0 low
    uaddw   v0.8h, v0.8h, v5.8b         // + q0 high
    uaddw   v16.8h, v16.8h, v10.8b      // + p1 low
    uaddw   v0.8h, v0.8h, v11.8b        // + p1 high
    rshrn   v16.8b, v16.8h, #2          // q0" low
    rshrn   v17.8b, v0.8h, #2           // q0" high
    mov     v16.d[1], v17.d[0]          // pack q0"
    uaddw   v28.8h, v28.8h, v14.8b      // p0+q0+q1+q2 low
    uaddw   v30.8h, v30.8h, v15.8b      // p0+q0+q1+q2 high
    ld1     {v0.8b, v1.8b}, [x0], x1    // q3
    mov     v0.d[1], v1.d[0]            // pack q3
    bit     v16.16b, v12.16b, v22.16b   // select q0' (strong) vs q0" per lane
    sub     x0, x0, x1, lsl #2          // rewind x0 back to q0
    bic     v22.16b, v22.16b, v18.16b   // strong-q mask gated by overall filter mask
    rshrn   v12.8b, v28.8h, #2          // q1' low = (p0+q0+q1+q2+2)>>2
    rshrn   v13.8b, v30.8h, #2          // q1' high
    mov     v12.d[1], v13.d[0]          // pack q1'
    bif     v4.16b, v16.16b, v18.16b    // q0 := filtered value unless no-filter
    mov     v5.d[0], v4.d[1]
    uaddl   v16.8h, v14.8b, v0.8b       // q2+q3 low
    uaddl   v0.8h, v15.8b, v1.8b        // q2+q3 high
    add     v28.8h, v28.8h, v16.8h      // p0+q0+q1+2*q2+q3 low
    st1     {v4.8b, v5.8b}, [x0], x1    // store q0
    add     v30.8h, v30.8h, v0.8h       // p0+q0+q1+2*q2+q3 high
    add     v28.8h, v28.8h, v16.8h      // p0+q0+q1+3*q2+2*q3 low
    add     v30.8h, v30.8h, v0.8h       // p0+q0+q1+3*q2+2*q3 high
    rshrn   v0.8b, v28.8h, #3           // q2' low = (...+4)>>3
    rshrn   v1.8b, v30.8h, #3           // q2' high
    mov     v0.d[1], v1.d[0]            // pack q2'
    ld1     {v30.8b, v31.8b}, [x3]      // p2
    mov     v30.d[1], v31.d[0]          // pack p2
    bif     v12.16b, v8.16b, v22.16b    // q1 := q1' only where strong-q
    mov     v13.d[0], v12.d[1]
    uabd    v16.16b, v30.16b, v6.16b    // Ap = ABS(p2 - p0)
    uaddw   v24.8h, v24.8h, v10.8b      // p0+q0+p1 low
    bif     v0.16b, v14.16b, v22.16b    // q2 := q2' only where strong-q
    mov     v1.d[0], v0.d[1]
    uaddw   v26.8h, v26.8h, v11.8b      // p0+q0+p1 high
    st1     {v12.8b, v13.8b}, [x0], x1  // store q1
    cmhi    v16.16b, v2.16b, v16.16b    // Ap < beta
    add     v28.8h, v24.8h, v24.8h      // 2*(p0+q0+p1) low
    add     v4.8h, v26.8h, v26.8h       // 2*(p0+q0+p1) high
    st1     {v0.8b, v1.8b}, [x0], x1    // store q2
    and     v20.16b, v20.16b, v16.16b   // strong-p: Ap<beta && ABS(p0-q0)<(alpha>>2)+2
    uaddw   v28.8h, v28.8h, v30.8b      // + p2 low
    uaddw   v4.8h, v4.8h, v31.8b        // + p2 high
    uaddw   v28.8h, v28.8h, v8.8b       // + q1 low
    uaddw   v4.8h, v4.8h, v9.8b         // + q1 high
    rshrn   v28.8b, v28.8h, #3          // p0' low = (2*(p0+q0+p1)+p2+q1+4)>>3
    rshrn   v29.8b, v4.8h, #3           // p0' high
    mov     v28.d[1], v29.d[0]          // pack p0'
    movi    v0.8b, #2
    movi    v1.4h, #2
    uaddl   v2.8h, v6.8b, v8.8b         // p0+q1 low
    umlal   v2.8h, v10.8b, v0.8b        // 2*p1+p0+q1 low
    uaddl   v16.8h, v7.8b, v9.8b        // p0+q1 high
    umlal   v16.8h, v11.8b, v0.8b       // 2*p1+p0+q1 high
    uaddw   v12.8h, v24.8h, v30.8b      // (p0+q0+p1) + p2 low
    ld1     {v24.8b, v25.8b}, [x2]      // p3
    mov     v24.d[1], v25.d[0]          // pack p3
    uaddw   v4.8h, v26.8h, v31.8b       // (p0+q0+p1) + p2 high
    uaddl   v8.8h, v30.8b, v24.8b       // p2+p3 low
    rshrn   v26.8b, v12.8h, #2          // p1' low = ((p0+q0+p1)+p2+2)>>2
    rshrn   v2.8b, v2.8h, #2            // p0" low = (2*p1+p0+q1+2)>>2
    rshrn   v27.8b, v4.8h, #2           // p1' high
    rshrn   v3.8b, v16.8h, #2           // p0" high
    mov     v26.d[1], v27.d[0]          // pack p1'
    mov     v2.d[1], v3.d[0]            // pack p0"
    uaddl   v16.8h, v31.8b, v25.8b      // p2+p3 high
    mla     v12.8h, v8.8h, v1.h[0]      // (p0+q0+p1)+3*p2+2*p3 low
    mla     v4.8h, v16.8h, v1.h[0]      // (p0+q0+p1)+3*p2+2*p3 high
    bic     v16.16b, v20.16b, v18.16b   // strong-p mask gated by overall filter mask
    mov     v17.d[0], v16.d[1]
    bit     v2.16b, v28.16b, v20.16b    // select p0' (strong) vs p0" per lane
    mov     v3.d[0], v2.d[1]
    rshrn   v12.8b, v12.8h, #3          // p2' low = ((p0+q0+p1)+3*p2+2*p3+4)>>3
    rshrn   v13.8b, v4.8h, #3           // p2' high
    mov     v12.d[1], v13.d[0]          // pack p2'
    bif     v6.16b, v2.16b, v18.16b     // p0 := filtered value unless no-filter
    bit     v10.16b, v26.16b, v16.16b   // p1 := p1' only where strong-p
    bit     v30.16b, v12.16b, v16.16b   // p2 := p2' only where strong-p
    st1     {v6.16b}, [x12]             // store p0
    st1     {v10.16b}, [x14]            // store p1
    st1     {v30.16b}, [x3]             // store p2

    // Original ARMv7 epilogue: LDMFD sp!,{x12,pc}
    ldp     x19, x20, [sp], #16         // restore callee-saved pair
    pop_v_regs
    ret



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a luma block vertical edge for cases where the
//*     boundary strength is less than 4
//*
//* @par Description:
//*     This operation is described in Sec. 8.7.2.3 under the title
//*     "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//*     Pointer to the src sample q0
//*
//* @param[in] w1 - src_strd
//*     Source stride
//*
//* @param[in] w2 - alpha
//*     Alpha Value for the boundary
//*
//* @param[in] w3 - beta
//*     Beta Value for the boundary
//*
//* @param[in] w4 - u4_bs
//*     Packed Boundary strength array (4 bytes, one per 4-pixel quarter)
//*
//* @param[in] x5 - pu1_cliptab
//*     tc0_table (C0 clip values indexed by boundary strength)
//*
//* @returns
//*     None
//*
//* @remarks
//*     Vertical edge: 16 rows of p3..q3 are loaded, transposed so each
//*     pixel column becomes a vector, filtered as in the horizontal case,
//*     transposed back and stored. Clobbers x0, x12, x14, x17 and NEON
//*     scratch registers; x19/x20 are preserved explicitly.
//*
//*******************************************************************************
//*/

    .global ih264_deblk_luma_vert_bslt4_av8

ih264_deblk_luma_vert_bslt4_av8:

    // Original ARMv7 prologue: STMFD sp!,{x12,x14}
    push_v_regs
    stp     x19, x20, [sp, #-16]!       // save callee-saved pair
    sxtw    x1, w1                      // sign-extend src_strd for 64-bit addressing

    sub     x0, x0, #4                  // x0 -> edge pixel - 4 (start of p3..q3 in each row)
    mov     x12, x4                     // x12 = u4_bs
    mov     x14, x5                     // x14 = pu1_cliptab
    mov     x17, x0                     // backup of row pointer (NOTE(review): looks unused below — confirm)
    // Load p3:p2:p1:p0:q0:q1:q2:q3 for every row
    ld1     {v0.8b}, [x0], x1           // row1
    ld1     {v2.8b}, [x0], x1           // row2
    ld1     {v4.8b}, [x0], x1           // row3
    rev     w12, w12                    // reverse packed bs so lanes match pixel order
    ld1     {v6.8b}, [x0], x1           // row4
    mov     v18.s[0], w12               // v18.s[0] = u4_bs
    ld1     {v16.s}[0], [x14]           // v16.s[0] = 4 clip-table (tc0) entries
    ld1     {v8.8b}, [x0], x1           // row5
    uxtl    v18.8h, v18.8b              // widen bs bytes to 16-bit lanes
    ld1     {v10.8b}, [x0], x1          // row6
    ld1     {v12.8b}, [x0], x1          // row7
    tbl     v16.8b, {v16.16b}, v18.8b   // C0 = puc_cliptab[bs] per lane
    ld1     {v14.8b}, [x0], x1          // row8
    ld1     {v1.8b}, [x0], x1           // row9
    uxtl    v16.4s, v16.4h              // spread C0 to one 32-bit lane per quarter
    ld1     {v3.8b}, [x0], x1           // row10
    ld1     {v5.8b}, [x0], x1           // row11
    ld1     {v7.8b}, [x0], x1           // row12
    sli     v16.4s, v16.4s, #8          // replicate C0 byte across ...
    ld1     {v9.8b}, [x0], x1           // row13
    ld1     {v11.8b}, [x0], x1          // row14
    ld1     {v13.8b}, [x0], x1          // row15
    sli     v16.4s, v16.4s, #16         // ... all 4 bytes of each 32-bit lane
    ld1     {v15.8b}, [x0], x1          // row16

    // Two 8x8 transposes, built from 2x2, 4x4 and 8x8 element swaps
    // 2x2 transposes (8-bit elements)
    trn1    v21.8b, v0.8b, v2.8b
    trn2    v2.8b, v0.8b, v2.8b         // rows 1 & 2
    mov     v0.8b, v21.8b
    trn1    v21.8b, v4.8b, v6.8b
    trn2    v6.8b, v4.8b, v6.8b         // rows 3 & 4
    mov     v4.8b, v21.8b
    trn1    v21.8b, v8.8b, v10.8b
    trn2    v10.8b, v8.8b, v10.8b       // rows 5 & 6
    mov     v8.8b, v21.8b
    trn1    v21.8b, v12.8b, v14.8b
    trn2    v14.8b, v12.8b, v14.8b      // rows 7 & 8
    mov     v12.8b, v21.8b
    trn1    v21.8b, v1.8b, v3.8b
    trn2    v3.8b, v1.8b, v3.8b         // rows 9 & 10
    mov     v1.8b, v21.8b
    trn1    v21.8b, v5.8b, v7.8b
    trn2    v7.8b, v5.8b, v7.8b         // rows 11 & 12
    mov     v5.8b, v21.8b
    trn1    v21.8b, v9.8b, v11.8b
    trn2    v11.8b, v9.8b, v11.8b       // rows 13 & 14
    mov     v9.8b, v21.8b
    trn1    v21.8b, v13.8b, v15.8b
    trn2    v15.8b, v13.8b, v15.8b      // rows 15 & 16
    mov     v13.8b, v21.8b
    // 4x4 transposes (16-bit elements)
    trn1    v21.4h, v2.4h, v6.4h
    trn2    v6.4h, v2.4h, v6.4h         // rows 2 & 4
    mov     v2.8b, v21.8b
    trn1    v21.4h, v10.4h, v14.4h
    trn2    v14.4h, v10.4h, v14.4h      // rows 6 & 8
    mov     v10.8b, v21.8b
    trn1    v21.4h, v3.4h, v7.4h
    trn2    v7.4h, v3.4h, v7.4h         // rows 10 & 12
    mov     v3.8b, v21.8b
    trn1    v21.4h, v11.4h, v15.4h
    trn2    v15.4h, v11.4h, v15.4h      // rows 14 & 16
    mov     v11.8b, v21.8b
    trn1    v21.2s, v6.2s, v14.2s
    trn2    v14.2s, v6.2s, v14.2s       // rows 4 & 8
    mov     v6.8b, v21.8b
    trn1    v21.2s, v7.2s, v15.2s
    trn2    v15.2s, v7.2s, v15.2s       // rows 12 & 16
    mov     v7.8b, v21.8b
    // now v6/v7 -> p0 and v14/v15 -> q3
    trn1    v21.4h, v0.4h, v4.4h
    trn2    v4.4h, v0.4h, v4.4h         // rows 1 & 3
    mov     v0.8b, v21.8b
    trn1    v21.4h, v8.4h, v12.4h
    trn2    v12.4h, v8.4h, v12.4h       // rows 5 & 7
    mov     v8.8b, v21.8b
    trn1    v21.4h, v1.4h, v5.4h
    trn2    v5.4h, v1.4h, v5.4h         // rows 9 & 11
    mov     v1.8b, v21.8b
    trn1    v21.4h, v9.4h, v13.4h
    trn2    v13.4h, v9.4h, v13.4h       // rows 13 & 15
    mov     v9.8b, v21.8b
    trn1    v21.2s, v0.2s, v8.2s
    trn2    v8.2s, v0.2s, v8.2s         // rows 1 & 5
    mov     v0.8b, v21.8b
    trn1    v21.2s, v1.2s, v9.2s
    trn2    v9.2s, v1.2s, v9.2s         // rows 9 & 13
    mov     v1.8b, v21.8b
    // now v0/v1 -> p3 and v8/v9 -> q0
    // start processing: p0 and q0 are ready
    trn1    v21.2s, v2.2s, v10.2s
    trn2    v10.2s, v2.2s, v10.2s       // rows 2 & 6
    mov     v2.8b, v21.8b
    mov     v6.d[1], v7.d[0]            // pack p0
    mov     v8.d[1], v9.d[0]            // pack q0
    urhadd  v20.16b, v6.16b, v8.16b     // (p0 + q0 + 1) >> 1
    mov     v21.d[0], v20.d[1]          // high half of the average as a D register
    trn1    v31.2s, v3.2s, v11.2s
    trn2    v11.2s, v3.2s, v11.2s       // rows 10 & 14
    mov     v3.8b, v31.8b
    movi    v19.8b, #2                  // constant 2 (multiplier for umlsl)
    mov     v18.d[1], v19.d[0]
    // now v2/v3 -> p2 and v10/v11 -> q1
    trn1    v31.2s, v4.2s, v12.2s
    trn2    v12.2s, v4.2s, v12.2s       // rows 3 & 7
    mov     v4.8b, v31.8b
    uabd    v22.16b, v6.16b, v8.16b     // ABS(p0 - q0)
    trn1    v31.2s, v5.2s, v13.2s
    trn2    v13.2s, v5.2s, v13.2s       // rows 11 & 15
    mov     v5.8b, v31.8b
    mov     v0.d[1], v1.d[0]            // pack p3
    mov     v2.d[1], v3.d[0]            // pack p2
    mov     v4.d[1], v5.d[0]            // pack p1
    mov     v10.d[1], v11.d[0]          // pack q1
    mov     v12.d[1], v13.d[0]          // pack q2
    mov     v14.d[1], v15.d[0]          // pack q3
    uaddl   v24.8h, v20.8b, v2.8b       // p2 + ((p0+q0+1)>>1) low
    // now v4/v5 -> p1 and v12/v13 -> q2
    uaddl   v26.8h, v21.8b, v3.8b       // p2 + ((p0+q0+1)>>1) high
    umlsl   v24.8h, v4.8b, v19.8b       // [p2 + avg] - 2*p1 low
    umlsl   v26.8h, v5.8b, v19.8b       // [p2 + avg] - 2*p1 high
    dup     v28.16b, w2                 // alpha
    cmhs    v22.16b, v22.16b, v28.16b   // ABS(p0 - q0) >= alpha
    dup     v28.16b, w3                 // beta
    uabd    v30.16b, v10.16b, v8.16b    // ABS(q1 - q0)
    sqshrn  v24.8b, v24.8h, #1          // i_macro_p1 low  = ([p2+avg]-2*p1)>>1
    sqshrn  v25.8b, v26.8h, #1          // i_macro_p1 high
    mov     v24.d[1], v25.d[0]          // pack i_macro_p1
    cmhs    v30.16b, v30.16b, v28.16b   // ABS(q1 - q0) >= beta
    uabd    v26.16b, v4.16b, v6.16b     // ABS(p1 - p0)

    smin    v24.16b, v24.16b, v16.16b   // min(C0, i_macro_p1)
    orr     v22.16b, v22.16b, v30.16b   // >=alpha || ABS(q1-q0)>=beta
    neg     v30.16b, v16.16b            // -C0
    cmhs    v26.16b, v26.16b, v28.16b   // ABS(p1 - p0) >= beta
    smax    v24.16b, v24.16b, v30.16b   // clamp: max(-C0, min(C0, i_macro_p1))
    orr     v22.16b, v22.16b, v26.16b   // ... || ABS(p1-p0)>=beta  => no-filter mask
    uxtl    v26.4s, v18.4h              // bs per quarter
    uaddl   v18.8h, v20.8b, v12.8b      // q2 + ((p0+q0+1)>>1) low
    cmeq    v26.4s, v26.4s, #0          // bs == 0 mask
    usubw   v18.8h, v18.8h, v10.8b      // [q2 + avg] - q1 low
    uaddl   v20.8h, v21.8b, v13.8b      // q2 + ((p0+q0+1)>>1) high
    usubw   v18.8h, v18.8h, v10.8b      // [q2 + avg] - 2*q1 low
    usubw   v20.8h, v20.8h, v11.8b      // [q2 + avg] - q1 high
    orr     v26.16b, v26.16b, v22.16b   // no-filter || (bs == 0)
    usubw   v20.8h, v20.8h, v11.8b      // [q2 + avg] - 2*q1 high
    sqshrn  v18.8b, v18.8h, #1          // i_macro_q1 low = ([q2+avg]-2*q1)>>1
    uabd    v22.16b, v2.16b, v6.16b     // Ap = ABS(p2 - p0)
    sqshrn  v19.8b, v20.8h, #1          // i_macro_q1 high
    mov     v18.d[1], v19.d[0]          // pack i_macro_q1
    uabd    v20.16b, v12.16b, v8.16b    // Aq = ABS(q2 - q0)
    cmhi    v22.16b, v28.16b, v22.16b   // Ap < beta
    smin    v18.16b, v18.16b, v16.16b   // min(C0, i_macro_q1)
    cmhi    v20.16b, v28.16b, v20.16b   // Aq < beta
    usubl   v28.8h, v8.8b, v6.8b        // (q0 - p0) low
    smax    v18.16b, v18.16b, v30.16b   // clamp: max(-C0, min(C0, i_macro_q1))
    usubl   v30.8h, v9.8b, v7.8b        // (q0 - p0) high
    shl     v28.8h, v28.8h, #2          // (q0 - p0) << 2, low
    sub     v16.16b, v16.16b, v22.16b   // C = C0 + (Ap < beta); mask is all-ones, so subtract
    shl     v30.8h, v30.8h, #2          // (q0 - p0) << 2, high
    uaddw   v28.8h, v28.8h, v4.8b       // 4*(q0-p0) + p1 low
    uaddw   v30.8h, v30.8h, v5.8b       // 4*(q0-p0) + p1 high
    usubw   v28.8h, v28.8h, v10.8b      // 4*(q0-p0) + (p1 - q1) low
    usubw   v30.8h, v30.8h, v11.8b      // 4*(q0-p0) + (p1 - q1) high
    bic     v22.16b, v22.16b, v26.16b   // final condition for p1
    rshrn   v28.8b, v28.8h, #3          // delta low  = (4*(q0-p0)+(p1-q1)+4)>>3
    rshrn   v29.8b, v30.8h, #3          // delta high
    mov     v28.d[1], v29.d[0]          // pack delta
    sub     v16.16b, v16.16b, v20.16b   // C = C0 + (Ap < beta) + (Aq < beta)
    bic     v20.16b, v20.16b, v26.16b   // final condition for q1
    abs     v30.16b, v28.16b            // ABS(delta)
    and     v24.16b, v24.16b, v22.16b   // p1 delta, gated
    and     v18.16b, v18.16b, v20.16b   // q1 delta, gated
    umin    v30.16b, v30.16b, v16.16b   // min(ABS(delta), C)
    add     v4.16b, v4.16b, v24.16b     // p1 + delta_p1
    add     v10.16b, v10.16b, v18.16b   // q1 + delta_q1
    mov     v5.d[0], v4.d[1]
    mov     v11.d[0], v10.d[1]
    bic     v30.16b, v30.16b, v26.16b   // zero delta where filtering is disabled
    // Original ARMv7: VCGE.S8 Q14, Q14,#0   //sign(delta)
    cmge    v28.16b, v28.16b, #0        // sign mask: delta >= 0

    uqsub   v22.16b, v6.16b, v30.16b    // clip(p0 - delta)
    trn1    v21.8b, v0.8b, v2.8b        // begin transposing back (interleaved with clips)
    trn2    v2.8b, v0.8b, v2.8b         // rows 1 & 2
    mov     v0.8b, v21.8b
    uqadd   v6.16b, v6.16b, v30.16b     // clip(p0 + delta)

    trn1    v21.8b, v1.8b, v3.8b
    trn2    v3.8b, v1.8b, v3.8b         // rows 9 & 10
    mov     v1.8b, v21.8b
    uqadd   v24.16b, v8.16b, v30.16b    // clip(q0 + delta)
    trn1    v21.8b, v12.8b, v14.8b
    trn2    v14.8b, v12.8b, v14.8b      // rows 7 & 8
    mov     v12.8b, v21.8b
    uqsub   v8.16b, v8.16b, v30.16b     // clip(q0 - delta)
    trn1    v21.8b, v13.8b, v15.8b
    trn2    v15.8b, v13.8b, v15.8b      // rows 15 & 16
    mov     v13.8b, v21.8b
    bif     v6.16b, v22.16b, v28.16b    // p0' = (delta >= 0) ? p0+delta : p0-delta
    bif     v8.16b, v24.16b, v28.16b    // q0' = (delta >= 0) ? q0-delta : q0+delta
    mov     v7.d[0], v6.d[1]
    mov     v9.d[0], v8.d[1]
    trn1    v21.8b, v4.8b, v6.8b
    trn2    v6.8b, v4.8b, v6.8b         // rows 3 & 4
    mov     v4.8b, v21.8b
    trn1    v21.8b, v8.8b, v10.8b
    trn2    v10.8b, v8.8b, v10.8b       // rows 5 & 6
    mov     v8.8b, v21.8b
    trn1    v21.8b, v5.8b, v7.8b
    trn2    v7.8b, v5.8b, v7.8b         // rows 11 & 12
    mov     v5.8b, v21.8b
    trn1    v21.8b, v9.8b, v11.8b
    trn2    v11.8b, v9.8b, v11.8b       // rows 13 & 14
    mov     v9.8b, v21.8b
    trn1    v21.4h, v2.4h, v6.4h
    trn2    v6.4h, v2.4h, v6.4h         // rows 2 & 4
    mov     v2.8b, v21.8b
    trn1    v21.4h, v10.4h, v14.4h
    trn2    v14.4h, v10.4h, v14.4h      // rows 6 & 8
    mov     v10.8b, v21.8b
    trn1    v21.4h, v3.4h, v7.4h
    trn2    v7.4h, v3.4h, v7.4h         // rows 10 & 12
    mov     v3.8b, v21.8b
    trn1    v21.4h, v11.4h, v15.4h
    trn2    v15.4h, v11.4h, v15.4h      // rows 14 & 16
    mov     v11.8b, v21.8b
    trn1    v21.2s, v6.2s, v14.2s
    trn2    v14.2s, v6.2s, v14.2s       // rows 4 & 8
    mov     v6.8b, v21.8b
    trn1    v21.2s, v7.2s, v15.2s
    trn2    v15.2s, v7.2s, v15.2s       // rows 12 & 16
    mov     v7.8b, v21.8b
    trn1    v21.4h, v0.4h, v4.4h
    trn2    v4.4h, v0.4h, v4.4h         // rows 1 & 3
    mov     v0.8b, v21.8b
    trn1    v21.4h, v8.4h, v12.4h
    trn2    v12.4h, v8.4h, v12.4h       // rows 5 & 7
    mov     v8.8b, v21.8b
    trn1    v21.4h, v1.4h, v5.4h
    trn2    v5.4h, v1.4h, v5.4h         // rows 9 & 11
    mov     v1.8b, v21.8b
    trn1    v21.4h, v9.4h, v13.4h
    trn2    v13.4h, v9.4h, v13.4h       // rows 13 & 15
    mov     v9.8b, v21.8b
    sub     x0, x0, x1, lsl#4           // rewind x0 back 16 rows to the first row
    trn1    v21.2s, v0.2s, v8.2s
    trn2    v8.2s, v0.2s, v8.2s         // rows 1 & 5
    mov     v0.8b, v21.8b
    trn1    v21.2s, v1.2s, v9.2s
    trn2    v9.2s, v1.2s, v9.2s         // rows 9 & 13
    mov     v1.8b, v21.8b
    trn1    v21.2s, v2.2s, v10.2s
    trn2    v10.2s, v2.2s, v10.2s       // rows 2 & 6
    mov     v2.8b, v21.8b
    trn1    v21.2s, v3.2s, v11.2s
    trn2    v11.2s, v3.2s, v11.2s       // rows 10 & 14
    mov     v3.8b, v21.8b
    trn1    v21.2s, v4.2s, v12.2s
    trn2    v12.2s, v4.2s, v12.2s       // rows 3 & 7
    mov     v4.8b, v21.8b
    trn1    v21.2s, v5.2s, v13.2s
    trn2    v13.2s, v5.2s, v13.2s       // rows 11 & 15
    mov     v5.8b, v21.8b
    // Store the 16 filtered rows back
    st1     {v0.8b}, [x0], x1           // row1
    st1     {v2.8b}, [x0], x1           // row2
    st1     {v4.8b}, [x0], x1           // row3
    st1     {v6.8b}, [x0], x1           // row4
    st1     {v8.8b}, [x0], x1           // row5
    st1     {v10.8b}, [x0], x1          // row6
    st1     {v12.8b}, [x0], x1          // row7
    st1     {v14.8b}, [x0], x1          // row8
    st1     {v1.8b}, [x0], x1           // row9
    st1     {v3.8b}, [x0], x1           // row10
    st1     {v5.8b}, [x0], x1           // row11
    st1     {v7.8b}, [x0], x1           // row12
    st1     {v9.8b}, [x0], x1           // row13
    st1     {v11.8b}, [x0], x1          // row14
    st1     {v13.8b}, [x0], x1          // row15
    st1     {v15.8b}, [x0], x1          // row16

    // Original ARMv7 epilogue: LDMFD sp!,{x12,pc}
    ldp     x19, x20, [sp], #16         // restore callee-saved pair
    pop_v_regs
    ret



///**
//*******************************************************************************
//*
//* @brief
//*     Performs filtering of a luma block vertical edge when the
//*     boundary strength is set to 4
//*
//* @par
Description: 743//* This operation is described in Sec. 8.7.2.4 under the title 744//* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264. 745//* 746//* @param[in] x0 - pu1_src 747//* Pointer to the src sample q0 748//* 749//* @param[in] w1 - src_strd 750//* Source stride 751//* 752//* @param[in] w2 - alpha 753//* Alpha Value for the boundary 754//* 755//* @param[in] w3 - beta 756//* Beta Value for the boundary 757//* 758//* @returns 759//* None 760//* 761//* @remarks 762//* None 763//* 764//******************************************************************************* 765//*/ 766 767 .global ih264_deblk_luma_vert_bs4_av8 768 769ih264_deblk_luma_vert_bs4_av8: 770 771 // STMFD sp!,{x12,x14} 772 push_v_regs 773 stp x19, x20, [sp, #-16]! 774 775 sub x0, x0, #4 //pointer uc_edgePixel-4 776 mov x17, x0 777 //loading p3:p2:p1:p0:q0:q1:q2:q3 for every row 778 ld1 {v0.8b}, [x0], x1 //row1 779 ld1 {v2.8b}, [x0], x1 //row2 780 ld1 {v4.8b}, [x0], x1 //row3 781 ld1 {v6.8b}, [x0], x1 //row4 782 ld1 {v8.8b}, [x0], x1 //row5 783 ld1 {v10.8b}, [x0], x1 //row6 784 ld1 {v12.8b}, [x0], x1 //row7 785 ld1 {v14.8b}, [x0], x1 //row8 786 ld1 {v1.8b}, [x0], x1 //row9 787 ld1 {v3.8b}, [x0], x1 //row10 788 ld1 {v5.8b}, [x0], x1 //row11 789 ld1 {v7.8b}, [x0], x1 //row12 790 ld1 {v9.8b}, [x0], x1 //row13 791 ld1 {v11.8b}, [x0], x1 //row14 792 ld1 {v13.8b}, [x0], x1 //row15 793 ld1 {v15.8b}, [x0], x1 //row16 794 795 //taking two 8x8 transposes 796 //2X2 transposes 797 trn1 v21.8b, v0.8b, v2.8b 798 trn2 v2.8b, v0.8b, v2.8b //row1 &2 799 mov v0.8b, v21.8b 800 trn1 v21.8b, v4.8b, v6.8b 801 trn2 v6.8b, v4.8b, v6.8b //row3&row4 802 mov v4.8b, v21.8b 803 trn1 v21.8b, v8.8b, v10.8b 804 trn2 v10.8b, v8.8b, v10.8b //row5&6 805 mov v8.8b, v21.8b 806 trn1 v21.8b, v12.8b, v14.8b 807 trn2 v14.8b, v12.8b, v14.8b //row7 & 8 808 mov v12.8b, v21.8b 809 trn1 v21.8b, v1.8b, v3.8b 810 trn2 v3.8b, v1.8b, v3.8b //row9 &10 811 mov v1.8b , v21.8b 812 trn1 v21.8b, v5.8b, v7.8b 813 trn2 v7.8b, v5.8b, 
v7.8b //row11 & 12 814 mov v5.8b , v21.8b 815 trn1 v21.8b, v9.8b, v11.8b 816 trn2 v11.8b, v9.8b, v11.8b //row13 &14 817 mov v9.8b , v21.8b 818 trn1 v21.8b, v13.8b, v15.8b 819 trn2 v15.8b, v13.8b, v15.8b //row15 & 16 820 mov v13.8b , v21.8b 821 //4x4 transposes 822 trn1 v21.4h, v2.4h, v6.4h 823 trn2 v6.4h, v2.4h, v6.4h //row2 & row4 824 mov v2.8b, v21.8b 825 trn1 v21.4h, v10.4h, v14.4h 826 trn2 v14.4h, v10.4h, v14.4h //row6 & row8 827 mov v10.8b , v21.8b 828 trn1 v21.4h, v3.4h, v7.4h 829 trn2 v7.4h, v3.4h, v7.4h //row10 & 12 830 mov v3.8b, v21.8b 831 trn1 v21.4h, v11.4h, v15.4h 832 trn2 v15.4h, v11.4h, v15.4h //row14 & row16 833 mov v11.8b, v21.8b 834 trn1 v21.2s, v6.2s, v14.2s 835 trn2 v14.2s, v6.2s, v14.2s //row4 & 8 836 mov v6.8b, v21.8b 837 trn1 v21.2s, v7.2s, v15.2s 838 trn2 v15.2s, v7.2s, v15.2s //row 12 & 16 839 mov v7.8b, v21.8b 840 //now Q3 ->p0 and Q7->q3 841 trn1 v21.4h, v0.4h, v4.4h 842 trn2 v4.4h, v0.4h, v4.4h //row1 & 3 843 mov v0.8b , v21.8b 844 trn1 v21.4h, v8.4h, v12.4h 845 trn2 v12.4h, v8.4h, v12.4h //row 5 & 7 846 mov v8.8b, v21.8b 847 trn1 v21.4h, v1.4h, v5.4h 848 trn2 v5.4h, v1.4h, v5.4h //row9 & row11 849 mov v1.8b, v21.8b 850 trn1 v21.4h, v9.4h, v13.4h 851 trn2 v13.4h, v9.4h, v13.4h //row13 & row15 852 mov v9.8b , v21.8b 853 trn1 v21.2s, v0.2s, v8.2s 854 trn2 v8.2s, v0.2s, v8.2s //row1 & row5 855 mov v0.8b, v21.8b 856 trn1 v21.2s, v1.2s, v9.2s 857 trn2 v9.2s, v1.2s, v9.2s //row9 & 13 858 mov v1.8b, v21.8b 859 //now Q0->p3 & Q4->q0 860 //starting processing as p0 and q0 are now ready 861 //now Q1->p2 & Q5->q1 862 mov v31.d[0], v14.d[0] 863 mov v31.d[1], v15.d[0] 864 trn1 v21.2s, v4.2s, v12.2s 865 trn2 v12.2s, v4.2s, v12.2s //row3 & 7 866 mov v4.8b, v21.8b 867 movi v28.8h, #2 868 trn1 v21.2s, v5.2s, v13.2s 869 trn2 v13.2s, v5.2s, v13.2s //row11 & row15 870 mov v5.8b, v21.8b 871 uaddl v16.8h, v6.8b, v8.8b //p0+q0 L 872 trn1 v21.2s, v2.2s, v10.2s 873 trn2 v10.2s, v2.2s, v10.2s //row2 &6 874 mov v2.8b, v21.8b 875 uaddl v18.8h, v7.8b, v9.8b //p0+q0 
H 876 trn1 v21.2s, v3.2s, v11.2s 877 trn2 v11.2s, v3.2s, v11.2s //row10&row14 878 mov v3.8b, v21.8b 879 uaddw v20.8h, v16.8h , v4.8b //p0+q0+p1 L 880 uaddw v22.8h, v18.8h , v5.8b //p0+q0+p1 H 881 uaddl v24.8h, v2.8b, v10.8b //p2+q1 L 882 uaddl v26.8h, v3.8b, v11.8b //p2+q1 H 883 mla v24.8h, v20.8h , v28.8h //p2 + X2(p1) + X2(p0) + X2(q0) + q1 L 884 mla v26.8h, v22.8h , v28.8h //p2 + X2(p1) + X2(p0) + X2(q0) + q1 H 885 movi v28.16b, #2 886 uaddw v16.8h, v20.8h , v2.8b //p0+q0+p1+p2 L 887 uaddw v18.8h, v22.8h , v3.8b //p0+q0+p1+p2 H 888 dup v30.16b, w2 //duplicate alpha 889 rshrn v20.8b, v16.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)L p1' 890 rshrn v21.8b, v18.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)H p1' 891 mov v20.d[1] , v21.d[0] 892 mov v0.d[1] , v1.d[0] 893 mov v2.d[1] , v3.d[0] 894 mov v4.d[1] , v5.d[0] 895 mov v6.d[1] , v7.d[0] 896 mov v8.d[1] , v9.d[0] 897 mov v10.d[1] , v11.d[0] 898 mov v12.d[1] , v13.d[0] 899 mov v14.d[1] , v15.d[0] 900 uabd v22.16b , v6.16b, v8.16b 901 usra v28.16b, v30.16b, #2 //alpha >>2 +2 902 uabd v30.16b , v2.16b, v6.16b 903 rshrn v24.8b, v24.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0' 904 rshrn v25.8b, v26.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0' 905 mov v24.d[1] , v25.d[0] 906 dup v26.16b, w3 //beta 907 cmhi v28.16b, v28.16b , v22.16b //ABS(p0 - q0) <((Alpha >>2) + 2) 908 uaddl v22.8h, v6.8b, v10.8b //p0+q1 L 909 cmhi v14.16b, v26.16b , v30.16b //beta>Ap 910 uaddl v30.8h, v7.8b, v11.8b //p0+q1 H 911 uaddw v22.8h, v22.8h , v4.8b //p0+q1+p1 L 912 uaddw v30.8h, v30.8h , v5.8b //p0+q1+p1 H 913 uaddw v22.8h, v22.8h , v4.8b //p0+q1+2*p1 L 914 uaddw v30.8h, v30.8h , v5.8b //p0+q1+2*p1 H 915 and v14.16b, v14.16b , v28.16b //(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2) 916 rshrn v22.8b, v22.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) L p0" 917 rshrn v23.8b, v30.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) H p0" 918 mov v22.d[1] , v23.d[0] 919 uaddl v30.8h, v2.8b, v0.8b //p2+p3 L 920 bif v24.16b, v22.16b , v14.16b 
//p0' or p0 " 921 uaddl v22.8h, v3.8b, v1.8b //p2+p3 H 922 add v30.8h, v30.8h , v30.8h //2*(p2+p3) L 923 add v22.8h, v22.8h , v22.8h //2*(p2+p3)H 924 add v16.8h, v16.8h , v30.8h //(X2(p3) + X3(p2) + p1 + p0 + q0) L 925 add v18.8h, v18.8h , v22.8h //(X2(p3) + X3(p2) + p1 + p0 + q0) H 926 uabd v30.16b , v12.16b, v8.16b 927 uabd v22.16b , v10.16b, v8.16b 928 rshrn v16.8b, v16.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2' 929 rshrn v17.8b, v18.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2' 930 mov v16.d[1] , v17.d[0] 931 uabd v18.16b , v4.16b, v6.16b 932 cmhi v30.16b, v26.16b , v30.16b //Aq < Beta 933 cmhs v22.16b, v22.16b, v26.16b 934 cmhs v18.16b, v18.16b, v26.16b 935 dup v26.16b, w2 //duplicate alpha 936 and v30.16b, v30.16b , v28.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) 937 uabd v28.16b , v6.16b, v8.16b 938 orr v22.16b, v22.16b , v18.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta 939 uaddl v18.8h, v6.8b, v8.8b //p0+q0 L 940 cmhs v28.16b, v28.16b, v26.16b 941 uaddl v26.8h, v7.8b, v9.8b //p0+q0 H 942 uaddw v18.8h, v18.8h , v10.8b //p0+q0+q1 L 943 orr v22.16b, v22.16b , v28.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha 944 uaddw v26.8h, v26.8h , v11.8b //p0+q0+q1 H 945 bic v14.16b, v14.16b , v22.16b //final condn for p's 946 movi v28.16b, #2 947 bif v6.16b, v24.16b , v22.16b //final p0 948 bit v2.16b, v16.16b , v14.16b //final p2 949 bif v20.16b, v4.16b , v14.16b //final p1 950 mov v7.d[0] , v6.d[1] 951 mov v3.d[0] , v2.d[1] 952 mov v21.d[0] , v20.d[1] 953 uaddl v24.8h, v8.8b, v4.8b //q0+p1 L 954 umlal v24.8h, v10.8b, v28.8b //X2(q1) + q0 + p1 L 955 uaddl v16.8h, v9.8b, v5.8b //q0+p1 H 956 umlal v16.8h, v11.8b, v28.8b //X2(q1) + q0 + p1 H 957 movi v28.8h, #2 958 uaddl v14.8h, v4.8b, v12.8b //p1+q2 L 959 mla v14.8h, v18.8h , v28.8h //p1 + X2(p0) + X2(q0) + X2(q1) + q2L 960 uaddl v4.8h, v5.8b, v13.8b //p1+q2H 961 mla v4.8h, v26.8h , v28.8h //p1 + X2(p0) + X2(q0) + X2(q1) + q2H 962 rshrn v24.8b, 
v24.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; L q0' 963 rshrn v25.8b, v16.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; H q0' 964 mov v24.d[1] , v25.d[0] 965 uaddw v18.8h, v18.8h , v12.8b //p0 + q0 + q1 + q2 L 966 uaddw v26.8h, v26.8h , v13.8b //p0 + q0 + q1 + q2 H 967 rshrn v16.8b, v14.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L qo" 968 mov v14.16b, v31.16b 969 rshrn v17.8b, v4.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H qo" 970 mov v16.d[1] , v17.d[0] 971 rshrn v4.8b, v18.8h, #2 //p0 + q0 + q1 + q2 + 2)>>2 L q1' 972 rshrn v5.8b, v26.8h, #2 //p0 + q0 + q1 + q2 + 2)>>2 H q1' 973 mov v4.d[1] , v5.d[0] 974 bit v24.16b, v16.16b , v30.16b //q0' or q0" 975 bic v30.16b, v30.16b , v22.16b //final condn for q's 976 trn1 v31.8b, v0.8b, v2.8b 977 trn2 v2.8b, v0.8b, v2.8b //row1 &2 978 mov v0.8b, v31.8b 979 bit v10.16b, v4.16b , v30.16b 980 mov v11.d[0] , v10.d[1] 981 mov v25.d[0] , v24.d[1] 982 mov v31.d[0] , v30.d[1] 983 trn1 v31.8b, v1.8b, v3.8b 984 trn2 v3.8b, v1.8b, v3.8b //row9 &10 985 mov v1.8b, v31.8b 986 uaddl v16.8h, v12.8b, v14.8b //q2+q3 L 987 trn1 v31.8b, v20.8b, v6.8b 988 trn2 v6.8b, v20.8b, v6.8b //row3&row4 989 mov v20.8b , v31.8b 990 uaddl v4.8h, v13.8b, v15.8b //q2+q3 H 991 trn1 v31.8b, v21.8b, v7.8b 992 trn2 v7.8b, v21.8b, v7.8b //row11 & 12 993 mov v21.8b , v31.8b 994 mla v18.8h, v16.8h , v28.8h //X2(q3) + X3(q2) + q1 + q0 + p0 L 995 trn1 v31.4h, v2.4h, v6.4h 996 trn2 v6.4h, v2.4h, v6.4h //row2 & row4 997 mov v2.8b, v31.8b 998 mla v26.8h, v4.8h , v28.8h //X2(q3) + X3(q2) + q1 + q0 + p0 H 999 trn1 v31.4h, v3.4h, v7.4h 1000 trn2 v7.4h, v3.4h, v7.4h //row10 & 12 1001 mov v3.8b , v31.8b 1002 bif v8.16b, v24.16b , v22.16b //final q0 1003 mov v9.d[0] , v8.d[1] 1004 trn1 v31.4h, v0.4h, v20.4h 1005 trn2 v20.4h, v0.4h, v20.4h //row1 & 3 1006 mov v0.8b , v31.8b 1007 rshrn v18.8b, v18.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L 1008 trn1 v31.4h, v1.4h, v21.4h 1009 trn2 v21.4h, v1.4h, v21.4h //row9 & row11 1010 mov v1.8b, v31.8b 1011 rshrn 
v19.8b, v26.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H 1012 mov v18.d[1] , v19.d[0] 1013 trn1 v31.8b, v8.8b, v10.8b 1014 trn2 v10.8b, v8.8b, v10.8b //row5&6 1015 mov v8.8b, v31.8b 1016 bit v12.16b, v18.16b , v30.16b //final q2 1017 mov v13.d[0] , v12.d[1] 1018 trn1 v31.8b, v9.8b, v11.8b 1019 trn2 v11.8b, v9.8b, v11.8b //row13 &14 1020 mov v9.8b, v31.8b 1021 trn1 v31.8b, v12.8b, v14.8b 1022 trn2 v14.8b, v12.8b, v14.8b //row7 & 8 1023 mov v12.8b, v31.8b 1024 trn1 v31.8b, v13.8b, v15.8b 1025 trn2 v15.8b, v13.8b, v15.8b //row15 & 16 1026 mov v13.8b , v31.8b 1027 trn1 v31.4h, v10.4h, v14.4h 1028 trn2 v14.4h, v10.4h, v14.4h //row6 & row8 1029 mov v10.8b, v31.8b 1030 trn1 v31.4h, v11.4h, v15.4h 1031 trn2 v15.4h, v11.4h, v15.4h //row14 & row16 1032 mov v11.8b, v31.8b 1033 //now Q3 ->p0 and Q7->q3 1034 trn1 v31.4h, v8.4h, v12.4h 1035 trn2 v12.4h, v8.4h, v12.4h //row 5 & 7 1036 mov v8.8b, v31.8b 1037 trn1 v31.4h, v9.4h, v13.4h 1038 trn2 v13.4h, v9.4h, v13.4h //row13 & row15 1039 mov v9.8b, v31.8b 1040 sub x0, x0, x1, lsl#4 //restore pointer 1041 trn1 v31.2s, v6.2s, v14.2s 1042 trn2 v14.2s, v6.2s, v14.2s //row4 & 8 1043 mov v6.8b , v31.8b 1044 trn1 v31.2s, v7.2s, v15.2s 1045 trn2 v15.2s, v7.2s, v15.2s //row 12 & 16 1046 mov v7.8b, v31.8b 1047 trn1 v31.2s, v0.2s, v8.2s 1048 trn2 v8.2s, v0.2s, v8.2s //row1 & row5 1049 mov v0.8b , v31.8b 1050 trn1 v31.2s, v1.2s, v9.2s 1051 trn2 v9.2s, v1.2s, v9.2s //row9 & 13 1052 mov v1.8b , v31.8b 1053 trn1 v31.2s, v2.2s, v10.2s 1054 trn2 v10.2s, v2.2s, v10.2s //row2 &6 1055 mov v2.8b , v31.8b 1056 trn1 v31.2s, v3.2s, v11.2s 1057 trn2 v11.2s, v3.2s, v11.2s //row10&row14 1058 mov v3.8b , v31.8b 1059 trn1 v31.2s, v20.2s, v12.2s 1060 trn2 v12.2s, v20.2s, v12.2s //row3 & 7 1061 mov v20.8b , v31.8b 1062 trn1 v31.2s, v21.2s, v13.2s 1063 trn2 v13.2s, v21.2s, v13.2s //row11 & row15 1064 mov v21.8b, v31.8b 1065 st1 {v0.8b}, [x0], x1 //row1 1066 st1 {v2.8b}, [x0], x1 //row2 1067 st1 {v20.8b}, [x0], x1 //row3 1068 st1 {v6.8b}, [x0], x1 //row4 
1069 st1 {v8.8b}, [x0], x1 //row5 1070 st1 {v10.8b}, [x0], x1 //row6 1071 st1 {v12.8b}, [x0], x1 //row7 1072 st1 {v14.8b}, [x0], x1 //row8 1073 st1 {v1.8b}, [x0], x1 //row9 1074 st1 {v3.8b}, [x0], x1 //row10 1075 st1 {v21.8b}, [x0], x1 //row11 1076 st1 {v7.8b}, [x0], x1 //row12 1077 st1 {v9.8b}, [x0], x1 //row13 1078 st1 {v11.8b}, [x0], x1 //row14 1079 st1 {v13.8b}, [x0], x1 //row15 1080 st1 {v15.8b}, [x0], x1 //row16 1081 1082 // LDMFD sp!,{x12,pc} 1083 ldp x19, x20, [sp], #16 1084 pop_v_regs 1085 ret 1086 1087 1088