• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21//******************************************************************************
22//* @file
23//*  ih264_inter_pred_luma_horz_qpel_av8.s
24//*
25//* @brief
26//*  Contains function definitions for inter prediction horizontal quarter pel interpolation.
27//*
28//* @author
29//*  Mohit
30//*
31//* @par List of Functions:
32//*
33//*  - ih264_inter_pred_luma_horz_qpel_av8()
34//*
35//* @remarks
36//*  None
37//*
38//*******************************************************************************
39//*/
40
41///* All the functions here are replicated from ih264_inter_pred_filters.c
42//
43
44///**
45///**
46//*******************************************************************************
47//*
48//* @brief
49//*     Quarter pel interprediction luma filter for horizontal input
50//*
51//* @par Description:
52//* Applies a 6 tap horizontal filter. The output is clipped to 8 bits and then averaged with the source pixel at pu1_src + (x_offset >> 1).
53//* See sec 8.4.2.2.1 titled "Luma sample interpolation process"
54//*
55//* @param[in] pu1_src
56//*  UWORD8 pointer to the source
57//*
58//* @param[out] pu1_dst
59//*  UWORD8 pointer to the destination
60//*
61//* @param[in] src_strd
62//*  integer source stride
63//*
64//* @param[in] dst_strd
65//*  integer destination stride
66//*
67//* @param[in] ht
68//*  integer height of the array
69//*
70//* @param[in] wd
71//*  integer width of the array
72//*
73//* @param[in] pu1_tmp: temporary buffer; UNUSED in this function
74//*
75//* @param[in] dydx: x and y reference offset for qpel calculations; only the x offset (dydx & 3) is used by this function.
76//* @returns
77//*
78// @remarks
79//*  None
80//*
81//*******************************************************************************
82//*/
83
84//void ih264_inter_pred_luma_horz_qpel_av8 (
85//                            UWORD8 *pu1_src,
86//                            UWORD8 *pu1_dst,
87//                            WORD32 src_strd,
88//                            WORD32 dst_strd,
89//                            WORD32 ht,
90//                            WORD32 wd,
91//                              UWORD8* pu1_tmp,
92//                             UWORD32 dydx)
93
94//**************Variables Vs Registers*****************************************
95//    x0 => *pu1_src
96//    x1 => *pu1_dst
97//    w2 =>  src_strd
98//    w3 =>  dst_strd
99//    w4 =>  ht
100//    w5 =>  wd
101//    w7 =>  dydx
102
103.text
104.p2align 2
105.include "ih264_neon_macros.s"
106
107
108
109
//------------------------------------------------------------------------------
// ih264_inter_pred_luma_horz_qpel_av8
//
// For every output row the code below:
//   1. forms, per pixel, a0 + a5 + 20*a2 + 20*a3 - 5*a1 - 5*a4, where a0..a5
//      are six consecutive source bytes beginning at pu1_src - 2;
//   2. narrows with sqrshrun #5, i.e. (sum + 16) >> 5 saturated to 0..255;
//   3. rounding-averages (urhadd) that result with the source bytes at
//      pu1_src + ((dydx & 3) >> 1) and stores it to pu1_dst.
// Source rows advance by src_strd (x2), destination rows by dst_strd (x3).
//
// Dispatch on wd (x5): 8 -> loop_8, 4 -> loop_4, anything else -> loop_16.
// Rows produced:
//   loop_16 : 8 rows per pass, a second pass only when ht == 16
//   loop_8  : 4 rows when ht == 4, otherwise 8 rows per pass with a second
//             pass only when ht == 16
//   loop_4  : 4 rows per pass, a second pass only when ht == 8
//
// Register roles: x0 = filter source (pu1_src - 2), x7 = averaging source,
// x1 = destination, x14 = ht - 16 plus the pass count, v0 = 5s, v1 = 20s.
// x9 and x12 only receive the discarded result of flag-setting subs.
// x6 (presumably pu1_tmp, the 7th argument -- TODO confirm) is never read.
//------------------------------------------------------------------------------
110    .global ih264_inter_pred_luma_horz_qpel_av8
111
112ih264_inter_pred_luma_horz_qpel_av8:
113
114
115    push_v_regs                         //macro from ih264_neon_macros.s (not visible here); presumably saves the callee-saved v8-v15 used below - TODO confirm
116    stp       x19, x20, [sp, #-16]!     //NOTE(review): x19/x20 are saved here and restored at end_func but are not referenced anywhere else in this function
117    sxtw      x2, w2                    //sign-extend src_strd to 64 bits
118    sxtw      x3, w3                    //sign-extend dst_strd to 64 bits
119    sxtw      x4, w4                    //sign-extend ht to 64 bits
120    sxtw      x5, w5                    //sign-extend wd to 64 bits
121
122
123    and       x7, x7, #3                //Finds x-offset (only the low two bits of dydx are used)
124    add       x7, x0, x7, lsr #1        //pu1_src + (x_offset>>1): source of the pixels averaged with the filter output
125    sub       x0, x0, #2                //pu1_src-2: first tap (a0) of the 6 tap filter
126    sub       x14, x4, #16              //x14 = ht - 16; every pass adds 1 before the loop test
127    movi      v0.16b, #5                //filter coeff
128    subs      x12, x5, #8               //if wd=8 branch to loop_8 (x12 is only a scratch destination)
129    movi      v1.16b, #20               //filter coeff
130
131    beq       loop_8
132
133    subs      x12, x5, #4               //if wd=4 branch to loop_4 (x12 is only a scratch destination)
134    beq       loop_4
135
136loop_16:                                //when  wd=16 (reached for any wd other than 8 or 4); 24 source bytes are loaded per row
137    //// Processing row0 and row1
138    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row0
139    add       x14, x14, #1              //for checking loop
140    ext       v31.8b, v2.8b , v3.8b , #5
141    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row1
142    ext       v30.8b, v3.8b , v4.8b , #5
143    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row0)
144    ext       v28.8b, v5.8b , v6.8b , #5
145    uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row0)
146    ext       v27.8b, v6.8b , v7.8b , #5
147    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row1)
148    ext       v31.8b, v2.8b , v3.8b , #2
149    uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row1)
150    ext       v30.8b, v3.8b , v4.8b , #2
151    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row0)
152    ext       v28.8b, v5.8b , v6.8b , #2
153    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row0)
154    ext       v27.8b, v6.8b , v7.8b , #2
155    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row1)
156    ext       v31.8b, v2.8b , v3.8b , #3
157    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row1)
158    ext       v30.8b, v3.8b , v4.8b , #3
159    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
160    ext       v28.8b, v5.8b , v6.8b , #3
161    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row0)
162    ext       v27.8b, v6.8b , v7.8b , #3
163    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row1)
164    ext       v31.8b, v2.8b , v3.8b , #1
165    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row1)
166    ext       v30.8b, v3.8b , v4.8b , #1
167    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
168    ext       v28.8b, v5.8b , v6.8b , #1
169    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row0)
170    ext       v27.8b, v6.8b , v7.8b , #1
171    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row1)
172    ext       v31.8b, v2.8b , v3.8b , #4
173    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row1)
174    ext       v30.8b, v3.8b , v4.8b , #4
175    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
176    ext       v28.8b, v5.8b , v6.8b , #4
177    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row0)
178    ext       v27.8b, v6.8b , v7.8b , #4
179    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row1)
180    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row2 (v2-v4 are free: every ext of row0 has been consumed)
181    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row1)
182
183    ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (both columns,row0)
184    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
185    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row3
186    sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row0)
187    ext       v31.8b, v2.8b , v3.8b , #5
188    urhadd    v20.16b, v12.16b , v20.16b //Interpolation step for qpel calculation (only the low 8 bytes are stored)
189    urhadd    v21.16b, v13.16b , v21.16b //Interpolation step for qpel calculation (only the low 8 bytes are stored)
190
191    sqrshrun  v18.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row1)
192    st1       {v20.8b, v21.8b}, [x1], x3 ////Store dest row0
193    ext       v30.8b, v3.8b , v4.8b , #5
194    sqrshrun  v19.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row1)
195
196
197
198//// Processing row2 and row3
199    ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (both columns,row1)
200    ext       v28.8b, v5.8b , v6.8b , #5
201    urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
202    urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
203
204    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row2)
205    st1       {v18.8b, v19.8b}, [x1], x3 ////Store dest row1
206    uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row2)
207    ext       v27.8b, v6.8b , v7.8b , #5
208    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row3)
209    ext       v31.8b, v2.8b , v3.8b , #2
210    uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row3)
211    ext       v30.8b, v3.8b , v4.8b , #2
212    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row2)
213    ext       v27.8b, v6.8b , v7.8b , #2
214    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row2)
215    ext       v28.8b, v5.8b , v6.8b , #2
216    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row3)
217    ext       v31.8b, v2.8b , v3.8b , #3
218    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row3)
219    ext       v30.8b, v3.8b , v4.8b , #3
220    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row2)
221    ext       v28.8b, v5.8b , v6.8b , #3
222    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row2)
223    ext       v27.8b, v6.8b , v7.8b , #3
224    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row3)
225    ext       v31.8b, v2.8b , v3.8b , #1
226    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row3)
227    ext       v30.8b, v3.8b , v4.8b , #1
228    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row2)
229    ext       v28.8b, v5.8b , v6.8b , #1
230    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row2)
231    ext       v27.8b, v6.8b , v7.8b , #1
232    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row3)
233    ext       v31.8b, v2.8b , v3.8b , #4
234    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row3)
235    ext       v30.8b, v3.8b , v4.8b , #4
236    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row2)
237    ext       v28.8b, v5.8b , v6.8b , #4
238    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row2)
239    ext       v27.8b, v6.8b , v7.8b , #4
240    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row3)
241    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row4
242    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row3)
243
244    ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (both columns,row2)
245    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row2)
246    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row5
247    sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row2)
248    ext       v31.8b, v2.8b , v3.8b , #5
249    urhadd    v20.16b, v12.16b , v20.16b //Interpolation step for qpel calculation
250    urhadd    v21.16b, v13.16b , v21.16b //Interpolation step for qpel calculation
251
252    sqrshrun  v18.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row3)
253    ext       v30.8b, v3.8b , v4.8b , #5
254    st1       {v20.8b, v21.8b}, [x1], x3 ////Store dest row2
255    sqrshrun  v19.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row3)
256    ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (both columns,row3)
257
258//// Processing row4 and row5
259    ext       v28.8b, v5.8b , v6.8b , #5
260    urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
261    urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
262
263    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row4)
264    st1       {v18.8b, v19.8b}, [x1], x3 ////Store dest row3
265    uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row4)
266    ext       v27.8b, v6.8b , v7.8b , #5
267    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row5)
268    ext       v31.8b, v2.8b , v3.8b , #2
269    uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row5)
270    ext       v30.8b, v3.8b , v4.8b , #2
271    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row4)
272    ext       v27.8b, v6.8b , v7.8b , #2
273    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row4)
274    ext       v28.8b, v5.8b , v6.8b , #2
275    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row5)
276    ext       v31.8b, v2.8b , v3.8b , #3
277    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row5)
278    ext       v30.8b, v3.8b , v4.8b , #3
279    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row4)
280    ext       v28.8b, v5.8b , v6.8b , #3
281    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row4)
282    ext       v27.8b, v6.8b , v7.8b , #3
283    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row5)
284    ext       v31.8b, v2.8b , v3.8b , #1
285    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row5)
286    ext       v30.8b, v3.8b , v4.8b , #1
287    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row4)
288    ext       v28.8b, v5.8b , v6.8b , #1
289    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row4)
290    ext       v27.8b, v6.8b , v7.8b , #1
291    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row5)
292    ext       v31.8b, v2.8b , v3.8b , #4
293    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row5)
294    ext       v30.8b, v3.8b , v4.8b , #4
295    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row4)
296    ext       v28.8b, v5.8b , v6.8b , #4
297    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row4)
298    ext       v27.8b, v6.8b , v7.8b , #4
299    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row5)
300    ld1       {v2.8b, v3.8b, v4.8b}, [x0], x2 //// Load row6
301    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row5)
302    ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (both columns,row4)
303    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row4)
304    ld1       {v5.8b, v6.8b, v7.8b}, [x0], x2 //// Load row7
305    sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row4)
306    ext       v31.8b, v2.8b , v3.8b , #5
307    urhadd    v20.16b, v12.16b , v20.16b //Interpolation step for qpel calculation
308    urhadd    v21.16b, v13.16b , v21.16b //Interpolation step for qpel calculation
309
310    sqrshrun  v18.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row5)
311    st1       {v20.8b, v21.8b}, [x1], x3 ////Store dest row4
312    ext       v30.8b, v3.8b , v4.8b , #5
313    sqrshrun  v19.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row5)
314    ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (both columns,row5)
315
316
317    //// Processing row6 and row7
318
319    ext       v28.8b, v5.8b , v6.8b , #5
320    urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
321    urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
322
323    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row6)
324    st1       {v18.8b, v19.8b}, [x1], x3 ////Store dest row5
325    uaddl     v10.8h, v30.8b, v3.8b     //// a0 + a5                             (column2,row6)
326    ext       v27.8b, v6.8b , v7.8b , #5
327    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row7)
328    ext       v31.8b, v2.8b , v3.8b , #2
329    uaddl     v16.8h, v27.8b, v6.8b     //// a0 + a5                             (column2,row7)
330    ext       v30.8b, v3.8b , v4.8b , #2
331    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2                         (column1,row6)
332    ext       v27.8b, v6.8b , v7.8b , #2
333    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row6)
334    ext       v28.8b, v5.8b , v6.8b , #2
335    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row7)
336    ext       v31.8b, v2.8b , v3.8b , #3
337    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2                         (column2,row7)
338    ext       v30.8b, v3.8b , v4.8b , #3
339    umlal     v8.8h, v31.8b, v1.8b      //// a0 + a5 + 20a2 + 20a3                  (column1,row6)
340    ext       v28.8b, v5.8b , v6.8b , #3
341    umlal     v10.8h, v30.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row6)
342    ext       v27.8b, v6.8b , v7.8b , #3
343    umlal     v14.8h, v28.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row7)
344    ext       v31.8b, v2.8b , v3.8b , #1
345    umlal     v16.8h, v27.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column2,row7)
346    ext       v30.8b, v3.8b , v4.8b , #1
347    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row6)
348    ext       v28.8b, v5.8b , v6.8b , #1
349    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row6)
350    ext       v27.8b, v6.8b , v7.8b , #1
351    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row7)
352    ext       v31.8b, v2.8b , v3.8b , #4
353    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column2,row7)
354    ext       v30.8b, v3.8b , v4.8b , #4
355    umlsl     v8.8h, v31.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row6)
356    ext       v28.8b, v5.8b , v6.8b , #4
357    umlsl     v10.8h, v30.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row6)
358    ext       v27.8b, v6.8b , v7.8b , #4
359    ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (both columns,row6)
360    sqrshrun  v20.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row6)
361    umlsl     v14.8h, v28.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row7)
362    sqrshrun  v21.8b, v10.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row6)
363    umlsl     v16.8h, v27.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column2,row7)
364    urhadd    v20.16b, v12.16b , v20.16b //Interpolation step for qpel calculation
365    urhadd    v21.16b, v13.16b , v21.16b //Interpolation step for qpel calculation
366
367    ld1       {v12.2s, v13.2s}, [x7], x2 //Load value for interpolation            (both columns,row7)
368    sqrshrun  v18.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row7)
369    st1       {v20.8b, v21.8b}, [x1], x3 ////Store dest row6
370    sqrshrun  v19.8b, v16.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column2,row7)
371    urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
372    urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
373
374    subs      x12, x14, #1              // if height==16  - looping: x14-1 == 0 only after the first pass with ht==16
375    st1       {v18.8b, v19.8b}, [x1], x3 ////Store dest row7
376
377
378
379    beq       loop_16                   //second 8-row pass for ht==16; flags come from the subs above (st1 does not alter them)
380    b         end_func
381
382loop_8:                                 //wd=8: 16 source bytes are loaded per row into a register pair
383//// Processing row0 and row1
384
385    ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row0 (first row read from x0)
386    add       x14, x14, #1              //for checking loop
387    ext       v28.8b, v5.8b , v6.8b , #5
388    ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row1
389    ext       v25.8b, v5.8b , v6.8b , #2
390    ext       v31.8b, v2.8b , v3.8b , #5
391    ext       v24.8b, v5.8b , v6.8b , #3
392    ext       v23.8b, v5.8b , v6.8b , #1
393    ext       v22.8b, v5.8b , v6.8b , #4
394    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row0)
395    ext       v29.8b, v2.8b , v3.8b , #3
396    umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
397    umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
398    umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
399    umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
400    ext       v30.8b, v2.8b , v3.8b , #2
401    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row1)
402    ext       v27.8b, v2.8b , v3.8b , #1
403    ext       v26.8b, v2.8b , v3.8b , #4
404    ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row2
405    umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a3                         (column1,row1)
406    umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a3 + 20a2                  (column1,row1)
407    umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row1)
408    umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row1)
409    ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row3
410    sqrshrun  v18.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
411
412    //// Processing row2 and row3
413    ext       v28.8b, v5.8b , v6.8b , #5
414    ext       v25.8b, v5.8b , v6.8b , #2
415    ext       v31.8b, v2.8b , v3.8b , #5
416    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row3)
417    ld1       {v12.2s}, [x7], x2        //Load value for interpolation            (column1,row0)
418    ld1       {v13.2s}, [x7], x2        //Load value for interpolation            (column1,row1)
419    ext       v24.8b, v5.8b , v6.8b , #3
420    ext       v23.8b, v5.8b , v6.8b , #1
421    sqrshrun  v19.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row1)
422    ext       v22.8b, v5.8b , v6.8b , #4
423    ext       v29.8b, v2.8b , v3.8b , #3
424    umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row3)
425    umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row3)
426    umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row3)
427    umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row3)
428    urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
429    urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
430
431    st1       {v18.8b}, [x1], x3        ////Store dest row0
432    st1       {v19.8b}, [x1], x3        ////Store dest row1
433    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row2)
434    ext       v30.8b, v2.8b , v3.8b , #2
435    ext       v27.8b, v2.8b , v3.8b , #1
436    ext       v26.8b, v2.8b , v3.8b , #4
437    ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row4
438    umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a3                         (column1,row2)
439    umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a3 + 20a2                  (column1,row2)
440    umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row2)
441    umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row2)
442    ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row5
443    subs      x9, x4, #4                //Z set when ht==4; consumed by "beq end_func" below, nothing in between alters the flags (x9 itself is never read)
444    sqrshrun  v19.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row3)
445    ld1       {v12.2s}, [x7], x2        //Load value for interpolation            (column1,row2)
446    ld1       {v13.2s}, [x7], x2        //Load value for interpolation            (column1,row3)
447    ext       v28.8b, v5.8b , v6.8b , #5
448    ext       v25.8b, v5.8b , v6.8b , #2
449    ext       v31.8b, v2.8b , v3.8b , #5
450    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row5)
451    ext       v24.8b, v5.8b , v6.8b , #3
452    sqrshrun  v18.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row2)
453    ext       v22.8b, v5.8b , v6.8b , #4
454    ext       v29.8b, v2.8b , v3.8b , #3
455    urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
456    urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
457
458    st1       {v18.8b}, [x1], x3        ////Store dest row2
459    ext       v30.8b, v2.8b , v3.8b , #2
460    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row4)
461    st1       {v19.8b}, [x1], x3        ////Store dest row3
462    beq       end_func                  // Branch if height==4. NOTE(review): source rows 4 and 5 were already loaded from x0 above even on this path - presumably the caller guarantees they are readable; confirm
463
464//// Processing row4 and row5
465    ext       v23.8b, v5.8b , v6.8b , #1
466    umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row5)
467    umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row5)
468    umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row5)
469    umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row5)
470    ext       v27.8b, v2.8b , v3.8b , #1
471    ext       v26.8b, v2.8b , v3.8b , #4
472    ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row6
473    umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a3                         (column1,row4)
474    umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a3 + 20a2                  (column1,row4)
475    umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row4)
476    umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row4)
477    sqrshrun  v19.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row5)
478    ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row7
479    ext       v31.8b, v2.8b , v3.8b , #5
480    ext       v28.8b, v5.8b , v6.8b , #5
481    ld1       {v12.2s}, [x7], x2        //Load value for interpolation            (column1,row4)
482    ld1       {v13.2s}, [x7], x2        //Load value for interpolation            (column1,row5)
483    ext       v25.8b, v5.8b , v6.8b , #2
484    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row7)
485    ext       v24.8b, v5.8b , v6.8b , #3
486    ext       v22.8b, v5.8b , v6.8b , #4
487    sqrshrun  v18.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row4)
488    ext       v29.8b, v2.8b , v3.8b , #3
489    ext       v30.8b, v2.8b , v3.8b , #2
490    urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
491    urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
492
493    st1       {v18.8b}, [x1], x3        ////Store dest row4
494    ext       v27.8b, v2.8b , v3.8b , #1
495    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row6)
496    ext       v26.8b, v2.8b , v3.8b , #4
497    umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a3                         (column1,row6)
498    umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a3 + 20a2                  (column1,row6)
499    umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row6)
500    umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row6)
501    //// Processing row6 and row7
502    st1       {v19.8b}, [x1], x3        ////Store dest row5
503    ext       v23.8b, v5.8b , v6.8b , #1
504    umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row7)
505    umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row7)
506    umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row7)
507    umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row7)
508    ld1       {v12.2s}, [x7], x2        //Load value for interpolation            (column1,row6)
509    ld1       {v13.2s}, [x7], x2        //Load value for interpolation            (column1,row7)
510    sqrshrun  v18.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row6)
511    subs      x12, x14, #1              //x14-1 == 0 only after the first pass with ht==16
512    sqrshrun  v19.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row7)
513    urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
514    urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
515
516    st1       {v18.8b}, [x1], x3        ////Store dest row6
517    st1       {v19.8b}, [x1], x3        ////Store dest row7
518
519    beq       loop_8                    //looping if height ==16
520
521    b         end_func
522
523loop_4:                                 //wd=4: 8-byte results are computed but only the low 4 bytes of each row are stored
524    ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row0 (first row read from x0)
525    ext       v28.8b, v5.8b , v6.8b , #5
526    ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row1
527    ext       v25.8b, v5.8b , v6.8b , #2
528    ext       v31.8b, v2.8b , v3.8b , #5
529    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row0)
530    ext       v24.8b, v5.8b , v6.8b , #3
531    ext       v23.8b, v5.8b , v6.8b , #1
532    ext       v22.8b, v5.8b , v6.8b , #4
533    ext       v29.8b, v2.8b , v3.8b , #3
534    umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row0)
535    umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row0)
536    umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row0)
537    umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row0)
538    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row1)
539    ext       v30.8b, v2.8b , v3.8b , #2
540    ld1       {v12.2s}, [x7], x2        //Load value for interpolation            (column1,row0)
541    ld1       {v13.2s}, [x7], x2        //Load value for interpolation            (column1,row1)
542    ext       v27.8b, v2.8b , v3.8b , #1
543    ext       v26.8b, v2.8b , v3.8b , #4
544    ld1       {v2.8b, v3.8b}, [x0], x2  //// Load row2
545    umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a3                         (column1,row1)
546    umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a3 + 20a2                  (column1,row1)
547    umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row1)
548    umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row1)
549    ld1       {v5.8b, v6.8b}, [x0], x2  //// Load row3
550    ext       v28.8b, v5.8b , v6.8b , #5
551    ext       v25.8b, v5.8b , v6.8b , #2
552    sqrshrun  v18.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row0)
553    ext       v31.8b, v2.8b , v3.8b , #5
554    ext       v24.8b, v5.8b , v6.8b , #3
555
556    ext       v23.8b, v5.8b , v6.8b , #1
557    ext       v22.8b, v5.8b , v6.8b , #4
558    ext       v29.8b, v2.8b , v3.8b , #3
559    sqrshrun  v19.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row1)
560    ext       v30.8b, v2.8b , v3.8b , #2
561    ext       v27.8b, v2.8b , v3.8b , #1
562
563    //// Processing row2 and row3
564    urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
565    urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
566
567    st1       {v18.s}[0], [x1], x3      ////Store dest row0
568    st1       {v19.s}[0], [x1], x3      ////Store dest row1
569    uaddl     v14.8h, v28.8b, v5.8b     //// a0 + a5                             (column1,row3)
570    ext       v26.8b, v2.8b , v3.8b , #4
571    ld1       {v12.2s}, [x7], x2        //Load value for interpolation            (column1,row2)
572    ld1       {v13.2s}, [x7], x2        //Load value for interpolation            (column1,row3)
573
574    umlal     v14.8h, v25.8b, v1.8b     //// a0 + a5 + 20a2                         (column1,row3)
575    umlal     v14.8h, v24.8b, v1.8b     //// a0 + a5 + 20a2 + 20a3                  (column1,row3)
576    umlsl     v14.8h, v23.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row3)
577    umlsl     v14.8h, v22.8b, v0.8b     //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row3)
578    uaddl     v8.8h, v31.8b, v2.8b      //// a0 + a5                             (column1,row2)
579    umlal     v8.8h, v29.8b, v1.8b      //// a0 + a5 + 20a3                         (column1,row2)
580    umlal     v8.8h, v30.8b, v1.8b      //// a0 + a5 + 20a3 + 20a2                  (column1,row2)
581    umlsl     v8.8h, v27.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1           (column1,row2)
582    umlsl     v8.8h, v26.8b, v0.8b      //// a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4     (column1,row2)
583    sqrshrun  v19.8b, v14.8h, #5        //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row3)
584    sqrshrun  v18.8b, v8.8h, #5         //// (a0 + a5 + 20a2 + 20a3 - 5a1 - 5a4 + 16) >> 5    (column1,row2)
585    urhadd    v18.16b, v12.16b , v18.16b //Interpolation step for qpel calculation
586    urhadd    v19.16b, v13.16b , v19.16b //Interpolation step for qpel calculation
587
588    st1       {v18.s}[0], [x1], x3      ////Store dest row2
589    subs      x4, x4, #8                // Loop if height =8 (x4 becomes 0 only on the first pass with ht==8)
590    st1       {v19.s}[0], [x1], x3      ////Store dest row3
591
592    beq       loop_4                    //second 4-row pass for ht==8; otherwise falls through to end_func
593
594end_func:                               //common exit: undo the prologue in reverse order
595
596    ldp       x19, x20, [sp], #16       //restore the pair pushed in the prologue
597    pop_v_regs                          //macro from ih264_neon_macros.s; presumably the counterpart of push_v_regs - TODO confirm
598    ret
599
600
601
602