• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1//******************************************************************************
2//*
3//* Copyright (C) 2015 The Android Open Source Project
4//*
5//* Licensed under the Apache License, Version 2.0 (the "License");
6//* you may not use this file except in compliance with the License.
7//* You may obtain a copy of the License at:
8//*
9//* http://www.apache.org/licenses/LICENSE-2.0
10//*
11//* Unless required by applicable law or agreed to in writing, software
12//* distributed under the License is distributed on an "AS IS" BASIS,
13//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14//* See the License for the specific language governing permissions and
15//* limitations under the License.
16//*
17//*****************************************************************************
18//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19//*/
20///**
21///*******************************************************************************
22// * //file
23// *  ih264_iquant_itrans_recon_a9.s
24// *
25// * //brief
26// *  Contains function definitions for single stage  inverse transform
27// *
28// * //author
29// *  Parthiban V
30// *     Mohit
31// *  Harinarayanaan
32// *
33// * //par List of Functions:
34// *  - ih264_iquant_itrans_recon_4x4_av8()
35// *     - ih264_iquant_itrans_recon_8x8_av8()
36// *     - ih264_iquant_itrans_recon_chroma_4x4_av8()
37// *
38// * //remarks
39// *  None
40// *
41// *******************************************************************************
42
43.text
44.p2align 2
45.include "ih264_neon_macros.s"
46
///**
// *******************************************************************************
// *
// * @brief
// *  Inverse quantisation, 4x4 inverse transform and reconstruction of one
// *  4x4 block (AArch64 / NEON).
// *
// * @par Description:
// *  - Every coefficient is multiplied by (pu2_iscal_mat[i] * pu2_weigh_mat[i]),
// *    shifted left by u4_qp_div_6 and narrowed back to 16 bit with a
// *    saturating, rounding right shift by 4.
// *  - When iq_start_idx == 1, coefficient 0 is afterwards overwritten with the
// *    halfword stored at pi2_dc_ld_addr.
// *  - Two 1-D butterfly passes with a 4x4 transpose in between form the
// *    inverse transform.
// *  - The residue is rounded with a right shift by 6, added to the prediction,
// *    saturated to 8 bit and written to pu1_out.
// *
// * @param[in] pi2_src
// *  16 input coefficients (16 bit each, read as one contiguous 4x4 block)
// *
// * @param[in] pu1_pred
// *  Prediction block; 4 bytes are read from each of 4 rows
// *
// * @param[out] pu1_out
// *  Output block; 4 bytes are written to each of 4 rows
// *
// * @param[in] pred_strd
// *  Byte distance between two prediction rows
// *
// * @param[in] out_strd
// *  Byte distance between two output rows
// *
// * @param[in] pu2_iscal_mat
// *  16 entries of the inverse quantisation matrix
// *
// * @param[in] pu2_weigh_mat
// *  16 entries of the weight matrix
// *
// * @param[in] u4_qp_div_6
// *  Left-shift count applied to the scaled coefficients
// *
// * @param[in] pi4_tmp
// *  Scratch buffer from the C prototype; never touched by this routine
// *
// * @param[in] iq_start_idx
// *  1 => take coefficient 0 from pi2_dc_ld_addr, anything else => keep the
// *  dequantised value
// *
// * @param[in] pi2_dc_ld_addr
// *  Address of the replacement value for coefficient 0
// *
// * @returns  Void
// *
// * @remarks
// *  Original author's note: only a single shift is coded here because a
// *  left shift by u4_qp_div_6 followed by the rounding right shift by 4
// *  covers both cases - a net right shift with rounding when u4_qp_div_6 is
// *  below 4, and a net left shift (rounding term 0) when it is above 4.
// *
// *******************************************************************************
// */
//void ih264_iquant_itrans_recon_4x4(WORD16 *pi2_src,
//                                    UWORD8 *pu1_pred,
//                                    UWORD8 *pu1_out,
//                                    WORD32 pred_strd,
//                                    WORD32 out_strd,
//                                    const UWORD16 *pu2_iscal_mat,
//                                    const UWORD16 *pu2_weigh_mat,
//                                    UWORD32 u4_qp_div_6,
//                                    WORD32 *pi4_tmp,
//                                    WORD32 iq_start_idx
//                                    WORD16 *pi2_dc_ld_addr)
//**************Variables Vs Registers*****************************************
//x0          => *pi2_src
//x1          => *pu1_pred
//x2          => *pu1_out
//x3          =>  pred_strd
//x4          =>  out_strd
//x5          => *pu2_iscal_mat
//x6          => *pu2_weigh_mat
//x7          =>  u4_qp_div_6
//[sp]        =>  pi4_tmp          (entry sp; not read)
//[sp, #8]    =>  iq_start_idx     (entry sp; read at #72 after push_v_regs)
//[sp, #16]   => *pi2_dc_ld_addr   (entry sp; read at #80 after push_v_regs)

    .global ih264_iquant_itrans_recon_4x4_av8
ih264_iquant_itrans_recon_4x4_av8:

    //macro from ih264_neon_macros.s: lowers sp by 64, presumably to spill the
    //d8-d15 registers that are overwritten below
    push_v_regs

    ldr       w8, [sp, #72]             // w8  = iq_start_idx
    ldr       x10, [sp, #80]            // x10 = pi2_dc_ld_addr
    dup       v30.4s, w7                // all lanes = u4_qp_div_6 (shift count)

//----------------------------- dequantisation --------------------------------
    //ld4 de-interleaves with a stride of 4 elements: register k of each group
    //receives elements k, k+4, k+8 and k+12 of the array
    ld4       {v16.4h - v19.4h}, [x0]   // pi2_src[0..15]
    ld4       {v20.4h - v23.4h}, [x5]   // pu2_iscal_mat[0..15]
    ld4       {v26.4h - v29.4h}, [x6]   // pu2_weigh_mat[0..15]

    mul       v20.4h, v20.4h, v26.4h    // s = iscal * weigh (low 16 bits), elements 0,4,8,12
    mul       v21.4h, v21.4h, v27.4h    // elements 1,5,9,13
    mul       v22.4h, v22.4h, v28.4h    // elements 2,6,10,14
    mul       v23.4h, v23.4h, v29.4h    // elements 3,7,11,15

    smull     v0.4s, v16.4h, v20.4h     // p = src * s, widened to 32 bit
    smull     v2.4s, v17.4h, v21.4h
    smull     v4.4s, v18.4h, v22.4h
    smull     v6.4s, v19.4h, v23.4h

    sshl      v0.4s, v0.4s, v30.4s      // q = p << u4_qp_div_6
    sshl      v2.4s, v2.4s, v30.4s
    sshl      v4.4s, v4.4s, v30.4s
    sshl      v6.4s, v6.4s, v30.4s

    //v30 has served its purpose as shift count and is recycled for the
    //prediction: v30 = pred rows 0|1, v31 = pred rows 2|3 (4 bytes per row)
    ld1       {v30.s}[0], [x1], x3
    ld1       {v30.s}[1], [x1], x3
    ld1       {v31.s}[0], [x1], x3
    ld1       {v31.s}[1], [x1], x3

    sqrshrn   v0.4h, v0.4s, #4          // d0 = sat16((q + 8) >> 4), elements 0,4,8,12
    sqrshrn   v1.4h, v2.4s, #4          // d1, elements 1,5,9,13
    sqrshrn   v2.4h, v4.4s, #4          // d2, elements 2,6,10,14
    sqrshrn   v3.4h, v6.4s, #4          // d3, elements 3,7,11,15

    cmp       w8, #1                    // iq_start_idx == 1 ?
    b.ne      skip_loading_luma_dc_src
    ld1       {v0.h}[0], [x10]          // yes: coefficient 0 = *pi2_dc_ld_addr
skip_loading_luma_dc_src:

//----------------------------- first 1-D pass --------------------------------
    sshr      v8.4h, v1.4h, #1          // d1 >> 1
    sshr      v9.4h, v3.4h, #1          // d3 >> 1

    add       v4.4h, v0.4h, v2.4h       // x0 = d0 + d2
    sub       v5.4h, v0.4h, v2.4h       // x1 = d0 - d2
    sub       v6.4h, v8.4h, v3.4h       // x2 = (d1 >> 1) - d3
    add       v7.4h, v1.4h, v9.4h       // x3 = d1 + (d3 >> 1)

    add       v10.4h, v4.4h, v7.4h      // x0 + x3
    add       v11.4h, v5.4h, v6.4h      // x1 + x2
    sub       v12.4h, v5.4h, v6.4h      // x1 - x2
    sub       v13.4h, v4.4h, v7.4h      // x0 - x3

//----------------------------- 4x4 transpose ---------------------------------
    trn1      v4.4h, v10.4h, v11.4h
    trn2      v5.4h, v10.4h, v11.4h
    trn1      v6.4h, v12.4h, v13.4h
    trn2      v7.4h, v12.4h, v13.4h

    trn1      v10.2s, v4.2s, v6.2s      // q0: lane 0 of the four pass-1 results
    trn1      v11.2s, v5.2s, v7.2s      // q1: lane 1
    trn2      v12.2s, v4.2s, v6.2s      // q2: lane 2
    trn2      v13.2s, v5.2s, v7.2s      // q3: lane 3

//----------------------------- second 1-D pass -------------------------------
    sshr      v18.4h, v11.4h, #1        // q1 >> 1
    sshr      v19.4h, v13.4h, #1        // q3 >> 1

    add       v14.4h, v10.4h, v12.4h    // x0 = q0 + q2
    sub       v15.4h, v10.4h, v12.4h    // x1 = q0 - q2
    sub       v16.4h, v18.4h, v13.4h    // x2 = (q1 >> 1) - q3
    add       v17.4h, v11.4h, v19.4h    // x3 = q1 + (q3 >> 1)

    add       v20.4h, v14.4h, v17.4h    // row 0 = x0 + x3
    add       v21.4h, v15.4h, v16.4h    // row 1 = x1 + x2
    sub       v22.4h, v15.4h, v16.4h    // row 2 = x1 - x2
    sub       v23.4h, v14.4h, v17.4h    // row 3 = x0 - x3

//----------------------------- reconstruction --------------------------------
    mov       v20.d[1], v21.d[0]        // v20 = rows 0|1
    mov       v22.d[1], v23.d[0]        // v22 = rows 2|3

    srshr     v20.8h, v20.8h, #6        // residue = (x + 32) >> 6
    srshr     v22.8h, v22.8h, #6

    uaddw     v20.8h, v20.8h, v30.8b    // + prediction rows 0|1
    uaddw     v22.8h, v22.8h, v31.8b    // + prediction rows 2|3

    sqxtun    v0.8b, v20.8h             // saturate to 0..255
    sqxtun    v1.8b, v22.8h

    st1       {v0.s}[0], [x2], x4       // output row 0
    st1       {v0.s}[1], [x2], x4       // output row 1
    st1       {v1.s}[0], [x2], x4       // output row 2
    st1       {v1.s}[1], [x2]           // output row 3

    pop_v_regs                          // counterpart of push_v_regs
    ret
235
236
///**
// *******************************************************************************
// *
// * @brief
// *  Inverse quantisation, 4x4 inverse transform and reconstruction of one
// *  4x4 chroma block held in a byte-interleaved buffer (AArch64 / NEON).
// *
// * @par Description:
// *  - Every coefficient is multiplied by (pu2_iscal_mat[i] * pu2_weigh_mat[i]),
// *    shifted left by u4_qp_div_6 and narrowed back to 16 bit with a
// *    saturating, rounding right shift by 4.
// *  - Coefficient 0 is then always replaced by the halfword at pi2_dc_src.
// *  - Two 1-D butterfly passes with a 4x4 transpose in between form the
// *    inverse transform; the residue is rounded with a right shift by 6.
// *  - Of the 8 bytes read per prediction row only bytes 0, 2, 4 and 6 are
// *    used. Likewise only bytes 0, 2, 4 and 6 of each 8-byte output row are
// *    replaced; bytes 1, 3, 5 and 7 are read from pu1_out and written back
// *    unchanged.
// *
// * @param[in] pi2_src
// *  16 input coefficients (16 bit each, read as one contiguous 4x4 block)
// *
// * @param[in] pu1_pred
// *  Prediction buffer; 8 bytes are read from each of 4 rows
// *
// * @param[in,out] pu1_out
// *  Output buffer; 8 bytes are read from and written to each of 4 rows
// *
// * @param[in] pred_strd
// *  Byte distance between two prediction rows
// *
// * @param[in] out_strd
// *  Byte distance between two output rows
// *
// * @param[in] pu2_iscal_mat
// *  16 entries of the inverse quantisation matrix
// *
// * @param[in] pu2_weigh_mat
// *  16 entries of the weight matrix
// *
// * @param[in] u4_qp_div_6
// *  Left-shift count applied to the scaled coefficients
// *
// * @param[in] pi4_tmp
// *  Scratch buffer from the C prototype; never touched by this routine
// *
// * @param[in] pi2_dc_src
// *  Address of the value used for coefficient 0
// *
// * @returns  Void
// *
// * @remarks
// *  Original author's notes:
// *  - The 4-way de-interleaving load matches a source pointer that advances
// *    by SUB_BLK_WIDTH_4x4; if that macro changes, the load has to change too.
// *  - Only a single shift is coded because a left shift by u4_qp_div_6
// *    followed by the rounding right shift by 4 covers both a net right shift
// *    with rounding (u4_qp_div_6 below 4) and a net left shift with rounding
// *    term 0 (u4_qp_div_6 above 4).
// *  - The tail could pack two result rows per 128-bit register, but that was
// *    judged to save only about one instruction, so each row keeps its own
// *    register.
// *
// *******************************************************************************
// */
//void ih264_iquant_itrans_recon_chroma_4x4(WORD16 *pi2_src,
//                                          UWORD8 *pu1_pred,
//                                          UWORD8 *pu1_out,
//                                          WORD32 pred_strd,
//                                          WORD32 out_strd,
//                                          const UWORD16 *pu2_iscal_mat,
//                                          const UWORD16 *pu2_weigh_mat,
//                                          UWORD32 u4_qp_div_6,
//                                          WORD32 *pi4_tmp
//                                          WORD16 *pi2_dc_src)
//**************Variables Vs Registers*****************************************
//x0          => *pi2_src
//x1          => *pu1_pred
//x2          => *pu1_out
//x3          =>  pred_strd
//x4          =>  out_strd
//x5          => *pu2_iscal_mat
//x6          => *pu2_weigh_mat
//x7          =>  u4_qp_div_6
//[sp]        =>  pi4_tmp       (entry sp; not read)
//[sp, #8]    => *pi2_dc_src    (entry sp; read at #72 after push_v_regs)

    .global ih264_iquant_itrans_recon_chroma_4x4_av8
ih264_iquant_itrans_recon_chroma_4x4_av8:

    //macro from ih264_neon_macros.s: lowers sp by 64, presumably to spill the
    //d8-d15 registers that are overwritten below
    push_v_regs

    ldr       x10, [sp, #72]            // x10 = pi2_dc_src (entry sp + 8)
    dup       v30.4s, w7                // all lanes = u4_qp_div_6 (shift count)
    movi      v31.4h, #0x00ff           // byte mask ff,00,ff,00,... used when merging the output

//----------------------------- dequantisation --------------------------------
    //ld4 de-interleaves with a stride of 4 elements: register k of each group
    //receives elements k, k+4, k+8 and k+12 of the array
    ld4       {v16.4h - v19.4h}, [x0]   // pi2_src[0..15]
    ld4       {v20.4h - v23.4h}, [x5]   // pu2_iscal_mat[0..15]
    ld4       {v26.4h - v29.4h}, [x6]   // pu2_weigh_mat[0..15]

    mul       v20.4h, v20.4h, v26.4h    // s = iscal * weigh (low 16 bits), elements 0,4,8,12
    mul       v21.4h, v21.4h, v27.4h    // elements 1,5,9,13
    mul       v22.4h, v22.4h, v28.4h    // elements 2,6,10,14
    mul       v23.4h, v23.4h, v29.4h    // elements 3,7,11,15

    smull     v0.4s, v16.4h, v20.4h     // p = src * s, widened to 32 bit
    smull     v2.4s, v17.4h, v21.4h
    smull     v4.4s, v18.4h, v22.4h
    smull     v6.4s, v19.4h, v23.4h

    sshl      v0.4s, v0.4s, v30.4s      // q = p << u4_qp_div_6
    sshl      v2.4s, v2.4s, v30.4s
    sshl      v4.4s, v4.4s, v30.4s
    sshl      v6.4s, v6.4s, v30.4s

    //the weight registers are free again and take the four prediction rows
    ld1       {v26.8b}, [x1], x3        // pred row 0 (8 interleaved bytes)
    ld1       {v27.8b}, [x1], x3        // pred row 1
    ld1       {v28.8b}, [x1], x3        // pred row 2
    ld1       {v29.8b}, [x1], x3        // pred row 3

    sqrshrn   v0.4h, v0.4s, #4          // d0 = sat16((q + 8) >> 4), elements 0,4,8,12
    sqrshrn   v1.4h, v2.4s, #4          // d1, elements 1,5,9,13
    sqrshrn   v2.4h, v4.4s, #4          // d2, elements 2,6,10,14
    sqrshrn   v3.4h, v6.4s, #4          // d3, elements 3,7,11,15

    ld1       {v0.h}[0], [x10]          // coefficient 0 = *pi2_dc_src (unconditional)

//----------------------------- first 1-D pass --------------------------------
    sshr      v8.4h, v1.4h, #1          // d1 >> 1
    sshr      v9.4h, v3.4h, #1          // d3 >> 1

    add       v4.4h, v0.4h, v2.4h       // x0 = d0 + d2
    sub       v5.4h, v0.4h, v2.4h       // x1 = d0 - d2
    sub       v6.4h, v8.4h, v3.4h       // x2 = (d1 >> 1) - d3
    add       v7.4h, v1.4h, v9.4h       // x3 = d1 + (d3 >> 1)

    add       v10.4h, v4.4h, v7.4h      // x0 + x3
    add       v11.4h, v5.4h, v6.4h      // x1 + x2
    sub       v12.4h, v5.4h, v6.4h      // x1 - x2
    sub       v13.4h, v4.4h, v7.4h      // x0 - x3

//----------------------------- 4x4 transpose ---------------------------------
    trn1      v4.4h, v10.4h, v11.4h
    trn2      v5.4h, v10.4h, v11.4h
    trn1      v6.4h, v12.4h, v13.4h
    trn2      v7.4h, v12.4h, v13.4h

    trn1      v10.2s, v4.2s, v6.2s      // q0: lane 0 of the four pass-1 results
    trn1      v11.2s, v5.2s, v7.2s      // q1: lane 1
    trn2      v12.2s, v4.2s, v6.2s      // q2: lane 2
    trn2      v13.2s, v5.2s, v7.2s      // q3: lane 3

//----------------------------- second 1-D pass -------------------------------
    sshr      v18.4h, v11.4h, #1        // q1 >> 1
    sshr      v19.4h, v13.4h, #1        // q3 >> 1

    add       v14.4h, v10.4h, v12.4h    // x0 = q0 + q2
    sub       v15.4h, v10.4h, v12.4h    // x1 = q0 - q2
    sub       v16.4h, v18.4h, v13.4h    // x2 = (q1 >> 1) - q3
    add       v17.4h, v11.4h, v19.4h    // x3 = q1 + (q3 >> 1)

    //q0..q3 are consumed; v10-v13 now take the current output rows so the
    //bytes that are not replaced can be written back untouched
    mov       x0, x2                    // x0 keeps the output base for the stores
    ld1       {v10.8b}, [x2], x4        // output row 0
    ld1       {v11.8b}, [x2], x4        // output row 1
    ld1       {v12.8b}, [x2], x4        // output row 2
    ld1       {v13.8b}, [x2]            // output row 3

    add       v20.4h, v14.4h, v17.4h    // row 0 = x0 + x3
    add       v21.4h, v15.4h, v16.4h    // row 1 = x1 + x2
    sub       v22.4h, v15.4h, v16.4h    // row 2 = x1 - x2
    sub       v23.4h, v14.4h, v17.4h    // row 3 = x0 - x3

    srshr     v20.4h, v20.4h, #6        // residue = (x + 32) >> 6
    srshr     v21.4h, v21.4h, #6
    srshr     v22.4h, v22.4h, #6
    srshr     v23.4h, v23.4h, #6

//----------------------------- reconstruction --------------------------------
    //keep bytes 0,2,4,6 of every prediction row in result bytes 0..3; v30 is
    //only a filler operand - the bytes it supplies land in lanes 4..7, which
    //never reach the stored result
    uzp1      v26.8b, v26.8b, v30.8b
    uzp1      v27.8b, v27.8b, v30.8b
    uzp1      v28.8b, v28.8b, v30.8b
    uzp1      v29.8b, v29.8b, v30.8b

    uaddw     v20.8h, v20.8h, v26.8b    // residue + prediction (lanes 0..3 are meaningful)
    uaddw     v21.8h, v21.8h, v27.8b
    uaddw     v22.8h, v22.8h, v28.8b
    uaddw     v23.8h, v23.8h, v29.8b

    sqxtun    v0.8b, v20.8h             // saturate to 0..255
    sqxtun    v1.8b, v21.8h
    sqxtun    v2.8b, v22.8h
    sqxtun    v3.8b, v23.8h

    //widen back to 16 bit: each sample sits in the low byte, high byte is 0
    uxtl      v6.8h, v0.8b
    uxtl      v7.8h, v1.8b
    uxtl      v8.8h, v2.8b
    uxtl      v9.8h, v3.8b

    //where the mask is ff (bytes 0,2,4,6) take the new sample, elsewhere keep
    //the byte that was loaded from pu1_out
    bit       v10.8b, v6.8b, v31.8b
    bit       v11.8b, v7.8b, v31.8b
    bit       v12.8b, v8.8b, v31.8b
    bit       v13.8b, v9.8b, v31.8b

    st1       {v10.8b}, [x0], x4        // output row 0
    st1       {v11.8b}, [x0], x4        // output row 1
    st1       {v12.8b}, [x0], x4        // output row 2
    st1       {v13.8b}, [x0]            // output row 3

    pop_v_regs                          // counterpart of push_v_regs
    ret
455
///**
// *******************************************************************************
// *
// * @brief
// *  Inverse quantisation, 8x8 inverse transform and reconstruction of one
// *  8x8 block (AArch64 / NEON).
// *
// * @par Description:
// *  - Every coefficient is multiplied by (pu2_iscal_mat[i] * pu2_weigh_mat[i]),
// *    shifted left by u4_qp_div_6 and narrowed back to 16 bit with a
// *    saturating, rounding right shift by 6.
// *  - The block is then transposed and run through an 8-point butterfly,
// *    twice in a row, which covers both dimensions.
// *  - The residue is rounded with a right shift by 6, added to the prediction,
// *    saturated to 8 bit and written to pu1_out.
// *
// * @param[in] pi2_src
// *  64 input coefficients (16 bit each, read as one contiguous 8x8 block)
// *
// * @param[in] pu1_pred
// *  Prediction block; 8 bytes are read from each of 8 rows
// *
// * @param[out] pu1_out
// *  Output block; 8 bytes are written to each of 8 rows
// *
// * @param[in] pred_strd
// *  Byte distance between two prediction rows
// *
// * @param[in] out_strd
// *  Byte distance between two output rows
// *
// * @param[in] pu2_iscal_mat
// *  64 entries of the inverse quantisation matrix
// *
// * @param[in] pu2_weigh_mat
// *  64 entries of the weight matrix
// *
// * @param[in] u4_qp_div_6
// *  Left-shift count applied to the scaled coefficients
// *
// * @param[in] pi4_tmp
// *  Not used by this routine
// *
// * @param[in] iq_start_idx
// *  Not used by this routine
// *
// * @param[in] pi2_dc_ld_addr
// *  Not used by this routine
// *
// * @returns  Void
// *
// * @remarks
// *  None
// *
// *******************************************************************************
// */
//void ih264_iquant_itrans_recon_8x8(WORD16 *pi2_src,
//                                   UWORD8 *pu1_pred,
//                                   UWORD8 *pu1_out,
//                                   WORD32 pred_strd,
//                                   WORD32 out_strd,
//                                   const UWORD16 *pu2_iscal_mat,
//                                   const UWORD16 *pu2_weigh_mat,
//                                   UWORD32 u4_qp_div_6,
//                                   WORD32 *pi4_tmp,
//                                   WORD32 iq_start_idx
//                                   WORD16 *pi2_dc_ld_addr)
//**************Variables Vs Registers*****************************************
//x0       => *pi2_src
//x1       => *pu1_pred
//x2       => *pu1_out
//x3       =>  pred_strd
//x4       =>  out_strd
//x5       => *pu2_iscal_mat
//x6       => *pu2_weigh_mat
//x7       =>  u4_qp_div_6
//x8       =>  pass counter of the transform loop
//NOT USED =>  pi4_tmp
//NOT USED =>  iq_start_idx
//NOT USED =>  pi2_dc_ld_addr

    .global ih264_iquant_itrans_recon_8x8_av8
ih264_iquant_itrans_recon_8x8_av8:

    //macro from ih264_neon_macros.s, presumably spilling the d8-d15 registers
    //that are overwritten below
    push_v_regs

//----------------------------- dequantisation --------------------------------
    //one register per 8 consecutive 16-bit elements
    ld1       {v0.8h-v3.8h}, [x0], #64  // pi2_src[0..31]
    ld1       {v4.8h-v7.8h}, [x0]       // pi2_src[32..63]

    ld1       {v8.8h-v11.8h}, [x5], #64 // pu2_iscal_mat[0..31]
    ld1       {v12.8h-v15.8h}, [x5]     // pu2_iscal_mat[32..63]

    ld1       {v16.8h-v19.8h}, [x6], #64 // pu2_weigh_mat[0..31]
    ld1       {v20.8h-v23.8h}, [x6]     // pu2_weigh_mat[32..63]

    //s = iscal * weigh, low 16 bits kept
    mul       v8.8h, v8.8h, v16.8h
    mul       v9.8h, v9.8h, v17.8h
    mul       v10.8h, v10.8h, v18.8h
    mul       v11.8h, v11.8h, v19.8h
    mul       v12.8h, v12.8h, v20.8h
    mul       v13.8h, v13.8h, v21.8h
    mul       v14.8h, v14.8h, v22.8h
    mul       v15.8h, v15.8h, v23.8h

    //p = src * s, widened to 32 bit (low half / high half of each register)
    smull     v16.4s, v0.4h, v8.4h
    smull2    v17.4s, v0.8h, v8.8h
    smull     v18.4s, v1.4h, v9.4h
    smull2    v19.4s, v1.8h, v9.8h
    smull     v20.4s, v2.4h, v10.4h
    smull2    v21.4s, v2.8h, v10.8h
    smull     v22.4s, v3.4h, v11.4h
    smull2    v23.4s, v3.8h, v11.8h
    smull     v24.4s, v4.4h, v12.4h
    smull2    v25.4s, v4.8h, v12.8h
    smull     v26.4s, v5.4h, v13.4h
    smull2    v27.4s, v5.8h, v13.8h
    smull     v28.4s, v6.4h, v14.4h
    smull2    v29.4s, v6.8h, v14.8h
    smull     v30.4s, v7.4h, v15.4h
    smull2    v31.4s, v7.8h, v15.8h

    dup       v0.4s, w7                 // all lanes = u4_qp_div_6 (v0 is free again)

    //q = p << u4_qp_div_6
    sshl      v16.4s, v16.4s, v0.4s
    sshl      v17.4s, v17.4s, v0.4s
    sshl      v18.4s, v18.4s, v0.4s
    sshl      v19.4s, v19.4s, v0.4s
    sshl      v20.4s, v20.4s, v0.4s
    sshl      v21.4s, v21.4s, v0.4s
    sshl      v22.4s, v22.4s, v0.4s
    sshl      v23.4s, v23.4s, v0.4s
    sshl      v24.4s, v24.4s, v0.4s
    sshl      v25.4s, v25.4s, v0.4s
    sshl      v26.4s, v26.4s, v0.4s
    sshl      v27.4s, v27.4s, v0.4s
    sshl      v28.4s, v28.4s, v0.4s
    sshl      v29.4s, v29.4s, v0.4s
    sshl      v30.4s, v30.4s, v0.4s
    sshl      v31.4s, v31.4s, v0.4s

    //c = sat16((q + 32) >> 6); v0..v7 again hold elements 0..63 in order
    sqrshrn   v0.4h, v16.4s, #6
    sqrshrn2  v0.8h, v17.4s, #6
    sqrshrn   v1.4h, v18.4s, #6
    sqrshrn2  v1.8h, v19.4s, #6
    sqrshrn   v2.4h, v20.4s, #6
    sqrshrn2  v2.8h, v21.4s, #6
    sqrshrn   v3.4h, v22.4s, #6
    sqrshrn2  v3.8h, v23.4s, #6
    sqrshrn   v4.4h, v24.4s, #6
    sqrshrn2  v4.8h, v25.4s, #6
    sqrshrn   v5.4h, v26.4s, #6
    sqrshrn2  v5.8h, v27.4s, #6
    sqrshrn   v6.4h, v28.4s, #6
    sqrshrn2  v6.8h, v29.4s, #6
    sqrshrn   v7.4h, v30.4s, #6
    sqrshrn2  v7.8h, v31.4s, #6

//----------------------------- inverse transform -----------------------------
    mov       x8, #2                    // two passes: transpose + 8-point butterfly each
trans_1x8_1d:

    //8x8 transpose of v0..v7 in three trn stages (16, 32 and 64 bit)
    trn1      v8.8h, v0.8h, v1.8h
    trn2      v9.8h, v0.8h, v1.8h
    trn1      v10.8h, v2.8h, v3.8h
    trn2      v11.8h, v2.8h, v3.8h
    trn1      v12.8h, v4.8h, v5.8h
    trn2      v13.8h, v4.8h, v5.8h
    trn1      v14.8h, v6.8h, v7.8h
    trn2      v15.8h, v6.8h, v7.8h

    trn1      v0.4s, v8.4s, v10.4s
    trn2      v2.4s, v8.4s, v10.4s
    trn1      v1.4s, v9.4s, v11.4s
    trn2      v3.4s, v9.4s, v11.4s
    trn1      v4.4s, v12.4s, v14.4s
    trn2      v6.4s, v12.4s, v14.4s
    trn1      v5.4s, v13.4s, v15.4s
    trn2      v7.4s, v13.4s, v15.4s

    //wN below = transposed line N = lane N of the eight input registers
    trn1      v8.2d, v0.2d, v4.2d       // w0
    trn2      v12.2d, v0.2d, v4.2d      // w4
    trn1      v9.2d, v1.2d, v5.2d       // w1
    trn2      v13.2d, v1.2d, v5.2d      // w5
    trn1      v10.2d, v2.2d, v6.2d      // w2
    trn2      v14.2d, v2.2d, v6.2d      // w6
    trn1      v11.2d, v3.2d, v7.2d      // w3
    trn2      v15.2d, v3.2d, v7.2d      // w7

    sshr      v16.8h, v9.8h, #1         // w1 >> 1
    sshr      v17.8h, v10.8h, #1        // w2 >> 1
    sshr      v18.8h, v11.8h, #1        // w3 >> 1
    sshr      v19.8h, v13.8h, #1        // w5 >> 1
    sshr      v20.8h, v14.8h, #1        // w6 >> 1
    sshr      v21.8h, v15.8h, #1        // w7 >> 1

    //even part, 16-bit arithmetic
    add       v0.8h, v8.8h, v12.8h      // y0 = w0 + w4
    sub       v2.8h, v8.8h, v12.8h      // y2 = w0 - w4
    sub       v4.8h, v17.8h, v14.8h     // y4 = (w2 >> 1) - w6
    add       v6.8h, v10.8h, v20.8h     // y6 = w2 + (w6 >> 1)

    //odd part, accumulated in 32 bit and truncated to 16 bit afterwards
    ssubl     v22.4s, v13.4h, v11.4h    // w5 - w3
    ssubl2    v23.4s, v13.8h, v11.8h
    saddl     v24.4s, v13.4h, v11.4h    // w5 + w3
    saddl2    v25.4s, v13.8h, v11.8h
    ssubl     v26.4s, v15.4h, v9.4h     // w7 - w1
    ssubl2    v27.4s, v15.8h, v9.8h
    saddl     v28.4s, v15.4h, v9.4h     // w7 + w1
    saddl2    v29.4s, v15.8h, v9.8h

    ssubw     v22.4s, v22.4s, v15.4h    // w5 - w3 - w7
    ssubw2    v23.4s, v23.4s, v15.8h
    saddw     v24.4s, v24.4s, v9.4h     // w5 + w3 + w1
    saddw2    v25.4s, v25.4s, v9.8h
    saddw     v26.4s, v26.4s, v13.4h    // w7 - w1 + w5
    saddw2    v27.4s, v27.4s, v13.8h
    ssubw     v28.4s, v28.4s, v11.4h    // w7 + w1 - w3
    ssubw2    v29.4s, v29.4s, v11.8h

    ssubw     v22.4s, v22.4s, v21.4h    // y1 = -w3 + w5 - w7 - (w7 >> 1)
    ssubw2    v23.4s, v23.4s, v21.8h
    saddw     v24.4s, v24.4s, v16.4h    // y7 =  w3 + w5 + w1 + (w1 >> 1)
    saddw2    v25.4s, v25.4s, v16.8h
    saddw     v26.4s, v26.4s, v19.4h    // y5 = -w1 + w7 + w5 + (w5 >> 1)
    saddw2    v27.4s, v27.4s, v19.8h
    ssubw     v28.4s, v28.4s, v18.4h    // y3 =  w1 + w7 - w3 - (w3 >> 1)
    ssubw2    v29.4s, v29.4s, v18.8h

    xtn       v1.4h, v22.4s             // v1 = y1
    xtn2      v1.8h, v23.4s
    xtn       v3.4h, v28.4s             // v3 = y3
    xtn2      v3.8h, v29.4s
    xtn       v5.4h, v26.4s             // v5 = y5
    xtn2      v5.8h, v27.4s
    xtn       v7.4h, v24.4s             // v7 = y7
    xtn2      v7.8h, v25.4s

    sshr      v16.8h, v1.8h, #2         // y1 >> 2
    sshr      v17.8h, v3.8h, #2         // y3 >> 2
    sshr      v18.8h, v5.8h, #2         // y5 >> 2
    sshr      v19.8h, v7.8h, #2         // y7 >> 2

    add       v8.8h, v0.8h, v6.8h       // z0 = y0 + y6
    add       v9.8h, v1.8h, v19.8h      // z1 = y1 + (y7 >> 2)
    add       v10.8h, v2.8h, v4.8h      // z2 = y2 + y4
    add       v11.8h, v3.8h, v18.8h     // z3 = y3 + (y5 >> 2)
    sub       v12.8h, v2.8h, v4.8h      // z4 = y2 - y4
    sub       v13.8h, v17.8h, v5.8h     // z5 = (y3 >> 2) - y5
    sub       v14.8h, v0.8h, v6.8h      // z6 = y0 - y6
    sub       v15.8h, v7.8h, v16.8h     // z7 = y7 - (y1 >> 2)

    add       v0.8h, v8.8h, v15.8h      // out0 = z0 + z7
    add       v1.8h, v10.8h, v13.8h     // out1 = z2 + z5
    add       v2.8h, v12.8h, v11.8h     // out2 = z4 + z3
    add       v3.8h, v14.8h, v9.8h      // out3 = z6 + z1
    sub       v4.8h, v14.8h, v9.8h      // out4 = z6 - z1
    sub       v5.8h, v12.8h, v11.8h     // out5 = z4 - z3
    sub       v6.8h, v10.8h, v13.8h     // out6 = z2 - z5
    sub       v7.8h, v8.8h, v15.8h      // out7 = z0 - z7

    subs      x8, x8, #1
    b.ne      trans_1x8_1d              // second pass handles the other dimension

//----------------------------- reconstruction --------------------------------
    //v22-v29 were loop temporaries, so the prediction is only fetched now
    ld1       {v22.8b}, [x1], x3        // pred row 0
    ld1       {v23.8b}, [x1], x3        // pred row 1
    ld1       {v24.8b}, [x1], x3        // pred row 2
    ld1       {v25.8b}, [x1], x3        // pred row 3
    ld1       {v26.8b}, [x1], x3        // pred row 4
    ld1       {v27.8b}, [x1], x3        // pred row 5
    ld1       {v28.8b}, [x1], x3        // pred row 6
    ld1       {v29.8b}, [x1]            // pred row 7

    //residue = (x + 32) >> 6
    srshr     v0.8h, v0.8h, #6
    srshr     v1.8h, v1.8h, #6
    srshr     v2.8h, v2.8h, #6
    srshr     v3.8h, v3.8h, #6
    srshr     v4.8h, v4.8h, #6
    srshr     v5.8h, v5.8h, #6
    srshr     v6.8h, v6.8h, #6
    srshr     v7.8h, v7.8h, #6

    //add the zero-extended prediction bytes
    uaddw     v0.8h, v0.8h, v22.8b
    uaddw     v1.8h, v1.8h, v23.8b
    uaddw     v2.8h, v2.8h, v24.8b
    uaddw     v3.8h, v3.8h, v25.8b
    uaddw     v4.8h, v4.8h, v26.8b
    uaddw     v5.8h, v5.8h, v27.8b
    uaddw     v6.8h, v6.8h, v28.8b
    uaddw     v7.8h, v7.8h, v29.8b

    //saturate to 0..255
    sqxtun    v0.8b, v0.8h
    sqxtun    v1.8b, v1.8h
    sqxtun    v2.8b, v2.8h
    sqxtun    v3.8b, v3.8h
    sqxtun    v4.8b, v4.8h
    sqxtun    v5.8b, v5.8h
    sqxtun    v6.8b, v6.8h
    sqxtun    v7.8b, v7.8h

    st1       {v0.8b}, [x2], x4         // output row 0
    st1       {v1.8b}, [x2], x4         // output row 1
    st1       {v2.8b}, [x2], x4         // output row 2
    st1       {v3.8b}, [x2], x4         // output row 3
    st1       {v4.8b}, [x2], x4         // output row 4
    st1       {v5.8b}, [x2], x4         // output row 5
    st1       {v6.8b}, [x2], x4         // output row 6
    st1       {v7.8b}, [x2]             // output row 7

    pop_v_regs                          // counterpart of push_v_regs
    ret
775
776
777
778
779