// NOTE(review): removed HTML code-viewer navigation chrome (Home / Line# /
// Scopes / Navigate / Raw / Download) that was captured when this source
// was scraped from a web viewer; it is not part of the assembly file.
/*!
 * \copy
 *     Copyright (c)  2013, Cisco Systems
 *     All rights reserved.
 *
 *     Redistribution and use in source and binary forms, with or without
 *     modification, are permitted provided that the following conditions
 *     are met:
 *
 *        * Redistributions of source code must retain the above copyright
 *          notice, this list of conditions and the following disclaimer.
 *
 *        * Redistributions in binary form must reproduce the above copyright
 *          notice, this list of conditions and the following disclaimer in
 *          the documentation and/or other materials provided with the
 *          distribution.
 *
 *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 *     POSSIBILITY OF SUCH DAMAGE.
 *
 */

#ifdef HAVE_NEON
#include "arm_arch_common_macro.S"

36.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
37//  {   //  input: \arg0~\arg3, src1*, src1_stride, src2*, src2_stride
38    vld2.16 {\arg0[0],\arg1[0]}, [\arg4], \arg5
39    vld2.16 {\arg2[0],\arg3[0]}, [\arg6], \arg7
40    vld2.16 {\arg0[1],\arg1[1]}, [\arg4], \arg5
41    vld2.16 {\arg2[1],\arg3[1]}, [\arg6], \arg7
42
43    vld2.16 {\arg0[2],\arg1[2]}, [\arg4], \arg5
44    vld2.16 {\arg2[2],\arg3[2]}, [\arg6], \arg7
45    vld2.16 {\arg0[3],\arg1[3]}, [\arg4], \arg5
46    vld2.16 {\arg2[3],\arg3[3]}, [\arg6], \arg7
47//  }
48.endm
49
50.macro LOAD_8x8_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
51//  {   //  input: \arg0~\arg3, src1*, src2*; untouched r2:src1_stride &r4:src2_stride
52    vld1.64 {\arg0}, [\arg8], r2
53    vld1.64 {\arg4}, [\arg9], r4
54    vld1.64 {\arg1}, [\arg8], r2
55    vld1.64 {\arg5}, [\arg9], r4
56
57    vld1.64 {\arg2}, [\arg8], r2
58    vld1.64 {\arg6}, [\arg9], r4
59    vld1.64 {\arg3}, [\arg8], r2
60    vld1.64 {\arg7}, [\arg9], r4
61//  }
62.endm
63
64.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
65//  {   //  input: src_d[0]~[3], working: [4]~[7]
66    vadd.s16        \arg4, \arg0, \arg3         //int16 s[0] = data[i] + data[i3];
67    vsub.s16        \arg7, \arg0, \arg3         //int16 s[3] = data[i] - data[i3];
68    vadd.s16        \arg5, \arg1, \arg2         //int16 s[1] = data[i1] + data[i2];
69    vsub.s16        \arg6, \arg1, \arg2         //int16 s[2] = data[i1] - data[i2];
70
71    vadd.s16        \arg0, \arg4, \arg5         //int16 dct[i ] = s[0] + s[1];
72    vsub.s16        \arg2, \arg4, \arg5         //int16 dct[i2] = s[0] - s[1];
73    vshl.s16        \arg1, \arg7, #1
74    vshl.s16        \arg3, \arg6, #1
75    vadd.s16        \arg1, \arg1, \arg6         //int16 dct[i1] = (s[3] << 1) + s[2];
76    vsub.s16        \arg3, \arg7, \arg3         //int16 dct[i3] = s[3] - (s[2] << 1);
77//  }
78.endm
79
80.macro MATRIX_TRANSFORM_EACH_16BITS arg0, arg1, arg2, arg3
81//  {   //  input & output: src_d[0]~[3];[0 1 2 3]+[4 5 6 7]+[8 9 10 11]+[12 13 14 15]
82    vtrn.s16        \arg0, \arg1                //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
83    vtrn.s16        \arg2, \arg3                //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
84    vtrn.32     \arg0, \arg2                //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
85    vtrn.32     \arg1, \arg3                //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
86//  }
87.endm
88
89.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8
90//  {   //  input:  coef, ff (dst), ff_d0, ff_d1, mf_d0, md_d1
91    veor.s16        \arg6, \arg6            // init 0 , and keep 0;
92    vaba.s16        \arg1, \arg0, \arg6     // f + abs(coef - 0)
93    vmull.s16       \arg7, \arg2, \arg4
94    vmull.s16       \arg8, \arg3, \arg5
95    vshr.s32        \arg7, #16
96    vshr.s32        \arg8, #16
97    vmovn.s32       \arg2, \arg7
98    vmovn.s32       \arg3, \arg8
99
100    vcgt.s16        \arg7, \arg0, #0        // if true, location of coef == 11111111
101    vbif.s16        \arg6, \arg1, \arg7     // if (x<0) reserved part; else keep 0 untouched
102    vshl.s16        \arg6, #1
103    vsub.s16        \arg1, \arg1, \arg6     // if x > 0, -= 0; else x-= 2x
104//  }
105.endm
106
107.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
108//  {   //  input:  coef, ff (dst), ff_d0, ff_d1, mf_d0(max), md_d1
109    veor.s16        \arg6, \arg6            // init 0 , and keep 0;
110    vaba.s16        \arg1, \arg0, \arg6     // f + abs(coef - 0)
111    vmull.s16       \arg7, \arg2, \arg4
112    vmull.s16       \arg8, \arg3, \arg5
113    vshr.s32        \arg7, #16
114    vshr.s32        \arg8, #16
115    vmovn.s32       \arg2, \arg7
116    vmovn.s32       \arg3, \arg8
117
118    vcgt.s16        \arg7, \arg0, #0        // if true, location of coef == 11111111
119    vbif.s16        \arg6, \arg1, \arg7     // if (x<0) reserved part; else keep 0 untouched
120    vshl.s16        \arg6, #1
121    vmax.s16        \arg9, \arg2, \arg3
122    vsub.s16        \arg1, \arg1, \arg6     // if x > 0, -= 0; else x-= 2x
123//  }
124.endm
125
126.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4
127//  {   //  input:  coef, ff (dst), mf , working_d (all 0), working_q
128    vaba.s16        \arg1, \arg0, \arg3     // f + abs(coef - 0)
129    vmull.s16       \arg4, \arg1, \arg2     // *= mf
130    vshr.s32        \arg4, #16
131    vmovn.s32       \arg1, \arg4            // >> 16
132
133    vcgt.s16        \arg2, \arg0, #0        // if true, location of coef == 11111111
134    vbif.s16        \arg3, \arg1, \arg2     // if (x<0) reserved part; else keep 0 untouched
135    vshl.s16        \arg3, #1
136    vsub.s16        \arg1, \arg1, \arg3     // if x > 0, -= 0; else x-= 2x
137//  }
138.endm
139
140.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2
141//  {   //  input:  coef, dst_d, working_d (all 0x01)
142    vceq.s16    \arg1, \arg0, #0
143    vand.s16    \arg1, \arg2
144    vpadd.s16   \arg1, \arg1, \arg1
145    vpadd.s16   \arg1, \arg1, \arg1
146//  }
147.endm
148
149.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4
150//  {   //  input:  coef_0, coef_1, max_q (identy to follow two), output: max_d0, max_d1
151    vmax.s16        \arg2, \arg0, \arg1     // max 1st in \arg3 & max 2nd in \arg4
152    vpmax.s16       \arg3, \arg3, \arg4     // max 1st in \arg3[0][1] & max 2nd in \arg3[2][3]
153    vpmax.s16       \arg3, \arg3, \arg4     // max 1st in \arg3[0][1]
154//  }
155.endm
156
157.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2, arg3, arg4, arg5, arg6
158//  {   //  input:  coef_0 (identy to \arg3 \arg4), coef_1(identy to \arg5 \arg6), mask_q
159    vceq.s16    \arg0, #0
160    vceq.s16    \arg1, #0
161    vand.s16    \arg0, \arg2
162    vand.s16    \arg1, \arg2
163
164    vpadd.s16   \arg3, \arg3, \arg5
165    vpadd.s16   \arg4, \arg4, \arg6
166    vpadd.s16   \arg3, \arg3, \arg4     // 8-->4
167    vpadd.s16   \arg3, \arg3, \arg3
168    vpadd.s16   \arg3, \arg3, \arg3
169//  }
170.endm
171
172.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2
173//  {   //  input: src_d[0]~[3], working_d, dst_d
174    vshr.s64    \arg1, \arg0, #32
175    vadd.s16    \arg2, \arg0, \arg1     // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];
176    vsub.s16    \arg1, \arg0, \arg1     // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];
177    vtrn.s16    \arg2, \arg1
178    vtrn.s32    \arg2, \arg1
179//  }
180.endm
181
182.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2
183//  {   //  input: each src_d[0]~[3](dst), working_q0, working_q1, working_q2
184    vshr.s64    \arg1, \arg0, #32
185    vadd.s16    \arg2, \arg0, \arg1     // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];
186    vsub.s16    \arg1, \arg0, \arg1     // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];
187    vtrn.s16    \arg2, \arg1
188    vrev32.16   \arg1, \arg1
189    vtrn.s32    \arg2, \arg1            // [0] = rs[0] + rs[2];[1] = rs[0] - rs[2];[2] = rs[1] - rs[3];[3] = rs[1] + rs[3];
190
191    vrev64.16   \arg1, \arg2
192    vadd.s16    \arg0, \arg2, \arg1     // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];
193    vsub.s16    \arg1, \arg2, \arg1
194    vrev32.16   \arg1, \arg1            // [0] = rs[1] - rs[2];[1] = rs[0] - rs[3];
195    vtrn.s32    \arg0, \arg1            // [0] = rs[0] + rs[3];[1] = rs[1] + rs[2];[2] = rs[1] - rs[2];[3] = rs[0] - rs[3];
196//  }
197.endm
198
199.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4, arg5
200//  {   //  input: pred_d[0]/[1](output), dct_q0/1, working_q0/1;
201    vmovl.u8        \arg4,\arg0
202    vmovl.u8        \arg5,\arg1
203    vadd.s16        \arg4,\arg2
204    vadd.s16        \arg5,\arg3
205    vqmovun.s16 \arg0,\arg4
206    vqmovun.s16 \arg1,\arg5
207//  }
208.endm
209
210.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
211//  {   //  input: src_d[0]~[3], output: e_d[0]~[3];
212    vadd.s16        \arg4, \arg0, \arg2         //int16 e[i][0] = src[0] + src[2];
213    vsub.s16        \arg5, \arg0, \arg2         //int16 e[i][1] = src[0] - src[2];
214    vshr.s16        \arg6, \arg1, #1
215    vshr.s16        \arg7, \arg3, #1
216    vsub.s16        \arg6, \arg6, \arg3         //int16 e[i][2] = (src[1]>>1)-src[3];
217    vadd.s16        \arg7, \arg1, \arg7         //int16 e[i][3] = src[1] + (src[3]>>1);
218//  }
219.endm
220
221.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7    // both row & col transform used
222//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
223    vadd.s16        \arg0, \arg4, \arg7         //int16 f[i][0] = e[i][0] + e[i][3];
224    vadd.s16        \arg1, \arg5, \arg6         //int16 f[i][1] = e[i][1] + e[i][2];
225    vsub.s16        \arg2, \arg5, \arg6         //int16 f[i][2] = e[i][1] - e[i][2];
226    vsub.s16        \arg3, \arg4, \arg7         //int16 f[i][3] = e[i][0] - e[i][3];
227//  }
228.endm
229
230
231.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
232//  {   //  input: src_d[0]~[3], output: e_q[0]~[3];
233    vaddl.s16       \arg4, \arg0, \arg2         //int32 e[i][0] = src[0] + src[2];
234    vsubl.s16       \arg5, \arg0, \arg2         //int32 e[i][1] = src[0] - src[2];
235    vsubl.s16       \arg6, \arg1, \arg3         //int32 e[i][2] = src[1] - src[3];
236    vaddl.s16       \arg7, \arg1, \arg3         //int32 e[i][3] = src[1] + src[3];
237//  }
238.endm
239
240.macro ROW_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9
241//  {   //  input: src_d[0]~[3], output: e_q[0]~[3]; working: \arg8 \arg9
242    vaddl.s16       \arg4, \arg0, \arg2         //int32 e[i][0] = src[0] + src[2];
243    vsubl.s16       \arg5, \arg0, \arg2         //int32 e[i][1] = src[0] - src[2];
244    vshr.s16        \arg8, \arg1, #1
245    vshr.s16        \arg9, \arg3, #1
246    vsubl.s16       \arg6, \arg8, \arg3         //int32 e[i][2] = (src[1]>>1)-src[3];
247    vaddl.s16       \arg7, \arg1, \arg9         //int32 e[i][3] = src[1] + (src[3]>>1);
248//  }
249.endm
250
251.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7  // both row & col transform used
252//  {   //  output: f_q[0]~[3], input: e_q[0]~[3];
253    vadd.s32        \arg0, \arg4, \arg7         //int16 f[i][0] = e[i][0] + e[i][3];
254    vadd.s32        \arg1, \arg5, \arg6         //int16 f[i][1] = e[i][1] + e[i][2];
255    vsub.s32        \arg2, \arg5, \arg6         //int16 f[i][2] = e[i][1] - e[i][2];
256    vsub.s32        \arg3, \arg4, \arg7         //int16 f[i][3] = e[i][0] - e[i][3];
257//  }
258.endm
259
260.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
261//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
262    vadd.s32        \arg4, \arg0, \arg2         //int32 e[0][j] = f[0][j] + f[2][j];
263    vsub.s32        \arg5, \arg0, \arg2         //int32 e[1][j] = f[0][j] - f[2][j];
264    vsub.s32        \arg6, \arg1, \arg3         //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
265    vadd.s32        \arg7, \arg1, \arg3         //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
266//  }
267.endm
268
269.macro COL_TRANSFORM_1_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7
270//  {   //  input: src_q[0]~[3], output: e_q[0]~[3];
271    vadd.s32        \arg4, \arg0, \arg2         //int32 e[0][j] = f[0][j] + f[2][j];
272    vsub.s32        \arg5, \arg0, \arg2         //int32 e[1][j] = f[0][j] - f[2][j];
273    vshr.s32        \arg6, \arg1, #1
274    vshr.s32        \arg7, \arg3, #1
275    vsub.s32        \arg6, \arg6, \arg3         //int32 e[2][j] = (f[1][j]>>1) - f[3][j];
276    vadd.s32        \arg7, \arg1, \arg7         //int32 e[3][j] = f[1][j] + (f[3][j]>>1);
277//  }
278.endm
279
280
281WELS_ASM_FUNC_BEGIN WelsDctT4_neon
282    push        {r4}
283    ldr         r4, [sp, #4]
284
285    LOAD_4x4_DATA_FOR_DCT   d4, d5, d6, d7, r1, r2, r3, r4
286
287    vsubl.u8    q0, d4, d6
288    vsubl.u8    q1, d5, d7
289    vtrn.s32    q0, q1
290    vswp        d1, d2
291
292    // horizontal transform
293    DCT_ROW_TRANSFORM_TOTAL_16BITS      d0, d1, d2, d3, d4, d5, d6, d7
294
295    // transform element
296    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
297
298    //  vertical transform
299    DCT_ROW_TRANSFORM_TOTAL_16BITS      d0, d1, d2, d3, d4, d5, d6, d7
300
301    // transform element
302    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
303
304    vst1.s16        {q0, q1}, [r0]!
305
306    pop     {r4}
307WELS_ASM_FUNC_END
308
309
310WELS_ASM_FUNC_BEGIN WelsDctFourT4_neon
311    push        {r4}
312    ldr         r4, [sp, #4]
313
314    LOAD_8x8_DATA_FOR_DCT   d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
315
316    vsubl.u8    q0, d16, d20
317    vsubl.u8    q1, d17, d21
318    vsubl.u8    q2, d18, d22
319    vsubl.u8    q3, d19, d23
320    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
321
322    // horizontal transform
323    DCT_ROW_TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
324
325    // transform element
326    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
327
328    //  vertical transform
329    DCT_ROW_TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
330
331    vswp        d1, d2
332    vswp        d5, d6
333    vswp        q1, q2
334    vst1.s16        {q0, q1}, [r0]!
335    vst1.s16        {q2, q3}, [r0]!
336
337    ////////////////
338    LOAD_8x8_DATA_FOR_DCT   d16, d17, d18, d19, d20, d21, d22, d23, r1, r3
339
340    vsubl.u8    q0, d16, d20
341    vsubl.u8    q1, d17, d21
342    vsubl.u8    q2, d18, d22
343    vsubl.u8    q3, d19, d23
344    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
345
346    // horizontal transform
347    DCT_ROW_TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
348
349    // transform element
350    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
351
352    //  vertical transform
353    DCT_ROW_TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
354
355    vswp        d1, d2
356    vswp        d5, d6
357    vswp        q1, q2
358    vst1.s16        {q0, q1}, [r0]!
359    vst1.s16        {q2, q3}, [r0]!
360
361    pop     {r4}
362WELS_ASM_FUNC_END
363
364
365WELS_ASM_FUNC_BEGIN WelsQuant4x4_neon
366    vld1.s16        {q2}, [r1]
367    vld1.s16        {q0, q1}, [r0]
368    vld1.s16        {q3}, [r2]
369
370    vmov            q8, q2
371
372    NEWQUANT_COEF_EACH_16BITS   q0, q2, d4, d5, d6, d7, q9, q10, q11
373    vst1.s16        {q2}, [r0]!
374
375    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
376    vst1.s16        {q8}, [r0]!
377
378WELS_ASM_FUNC_END
379
380
381WELS_ASM_FUNC_BEGIN WelsQuant4x4Dc_neon
382
383    vld1.s16        {q0, q1}, [r0]
384    vdup.s16        q2, r1      // even ff range [0, 768]
385    vdup.s16        q3, r2
386
387    vmov            q8, q2
388
389    NEWQUANT_COEF_EACH_16BITS   q0, q2, d4, d5, d6, d7, q9, q10, q11
390    vst1.s16        {q2}, [r0]!
391
392    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
393    vst1.s16        {q8}, [r0]!
394
395WELS_ASM_FUNC_END
396
397
398WELS_ASM_FUNC_BEGIN WelsQuantFour4x4_neon
399    vld1.s16        {q2}, [r1]
400    vld1.s16        {q3}, [r2]
401    mov             r1, r0
402
403    vld1.s16        {q0, q1}, [r0]!
404    vmov            q8, q2
405    NEWQUANT_COEF_EACH_16BITS   q0, q8, d16, d17, d6, d7, q9, q10, q11
406    vst1.s16        {q8}, [r1]!
407    vmov            q8, q2
408    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
409    vst1.s16        {q8}, [r1]!
410
411    vld1.s16        {q0, q1}, [r0]!
412    vmov            q8, q2
413    NEWQUANT_COEF_EACH_16BITS   q0, q8, d16, d17, d6, d7, q9, q10, q11
414    vst1.s16        {q8}, [r1]!
415    vmov            q8, q2
416    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
417    vst1.s16        {q8}, [r1]!
418
419    vld1.s16        {q0, q1}, [r0]!
420    vmov            q8, q2
421    NEWQUANT_COEF_EACH_16BITS   q0, q8, d16, d17, d6, d7, q9, q10, q11
422    vst1.s16        {q8}, [r1]!
423    vmov            q8, q2
424    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
425    vst1.s16        {q8}, [r1]!
426
427    vld1.s16        {q0, q1}, [r0]!
428    vmov            q8, q2
429    NEWQUANT_COEF_EACH_16BITS   q0, q8, d16, d17, d6, d7, q9, q10, q11
430    vst1.s16        {q8}, [r1]!
431    vmov            q8, q2
432    NEWQUANT_COEF_EACH_16BITS   q1, q8, d16, d17, d6, d7, q9, q10, q11
433    vst1.s16        {q8}, [r1]!
434
435WELS_ASM_FUNC_END
436
437
438WELS_ASM_FUNC_BEGIN WelsQuantFour4x4Max_neon
439    vld1.s16        {q2}, [r1]
440    vld1.s16        {q3}, [r2]
441    mov             r1, r0
442
443    vld1.s16        {q0, q1}, [r0]!
444    vmov            q8, q2
445    NEWQUANT_COEF_EACH_16BITS_MAX   q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
446    vst1.s16        {q8}, [r1]!
447    vmov            q12, q2
448    NEWQUANT_COEF_EACH_16BITS_MAX   q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
449    vst1.s16        {q12}, [r1]!        // then 1st 16 elem in d26 & d28
450
451    vld1.s16        {q0, q1}, [r0]!
452    vmov            q8, q2
453    NEWQUANT_COEF_EACH_16BITS_MAX   q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
454    vst1.s16        {q8}, [r1]!
455    vmov            q12, q2
456    NEWQUANT_COEF_EACH_16BITS_MAX   q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
457    vst1.s16        {q12}, [r1]!    // then 2nd 16 elem in d27 & d29
458
459    SELECT_MAX_IN_ABS_COEF  q13, q14, q0, d0, d1
460    vst1.s32        {d0[0]}, [r3]!
461
462    ///////////
463    vld1.s16        {q0, q1}, [r0]!
464    vmov            q8, q2
465    NEWQUANT_COEF_EACH_16BITS_MAX   q0, q8, d16, d17, d6, d7, q9, q10, q11, d26
466    vst1.s16        {q8}, [r1]!
467    vmov            q12, q2
468    NEWQUANT_COEF_EACH_16BITS_MAX   q1, q12, d24, d25, d6, d7, q9, q10, q11, d28
469    vst1.s16        {q12}, [r1]!        // then 3rd 16 elem in d26 & d28
470
471    vld1.s16        {q0, q1}, [r0]!
472    vmov            q8, q2
473    NEWQUANT_COEF_EACH_16BITS_MAX   q0, q8, d16, d17, d6, d7, q9, q10, q11, d27
474    vst1.s16        {q8}, [r1]!
475    vmov            q12, q2
476    NEWQUANT_COEF_EACH_16BITS_MAX   q1, q12, d24, d25, d6, d7, q9, q10, q11, d29
477    vst1.s16        {q12}, [r1]!    // then 4th 16 elem in d27 & d29
478
479    SELECT_MAX_IN_ABS_COEF  q13, q14, q0, d0, d1
480    vst1.s32        {d0[0]}, [r3]!
481
482WELS_ASM_FUNC_END
483
484
485WELS_ASM_FUNC_BEGIN WelsHadamardT4Dc_neon
486    push    {r2,r3}
487    mov     r2, #64 // 2*16*sizeof(int16_t)
488    add     r3, r1, #32
489
490    vld1.s16        {d0}, [r1], r2
491    vld1.s16        {d1}, [r3], r2
492    vld1.s16        {d4}, [r1], r2
493    vld1.s16        {d5}, [r3], r2
494    vld1.s16        {d2}, [r1], r2
495    vld1.s16        {d3}, [r3], r2
496    vld1.s16        {d6}, [r1], r2
497    vld1.s16        {d7}, [r3], r2
498    vtrn.16     q0, q2      // d0[0 4], d1[1 5]
499    vtrn.16     q1, q3      // d2[2 6], d3[3 7]
500
501    vld1.s16        {d16}, [r1], r2
502    vld1.s16        {d17}, [r3], r2
503    vld1.s16        {d20}, [r1], r2
504    vld1.s16        {d21}, [r3], r2
505    vld1.s16        {d18}, [r1], r2
506    vld1.s16        {d19}, [r3], r2
507    vld1.s16        {d22}, [r1], r2
508    vld1.s16        {d23}, [r3], r2
509    vtrn.16     q8, q10     //d16[08 12],d17[09 13]
510    vtrn.16     q9, q11     //d18[10 14],d19[11 15]
511
512    vtrn.32     q0, q8      // d0 [0 4 08 12] = dct[idx],       d1[1 5 09 13] = dct[idx+16]
513    vtrn.32     q1, q9      // d2 [2 6 10 14] = dct[idx+64],    d3[3 7 11 15] = dct[idx+80]
514
515    ROW_TRANSFORM_0_STEP    d0, d1, d3, d2, q8, q11, q10, q9
516
517    TRANSFORM_4BYTES        q0, q1, q3, q2, q8, q11, q10, q9
518
519    // transform element 32bits
520    vtrn.s32        q0, q1              //[0 1 2 3]+[4 5 6 7]-->[0 4 2 6]+[1 5 3 7]
521    vtrn.s32        q2, q3              //[8 9 10 11]+[12 13 14 15]-->[8 12 10 14]+[9 13 11 15]
522    vswp            d1, d4              //[0 4 2 6]+[8 12 10 14]-->[0 4 8 12]+[2 6 10 14]
523    vswp            d3, d6              //[1 5 3 7]+[9 13 11 15]-->[1 5 9 13]+[3 7 11 15]
524
525    COL_TRANSFORM_0_STEP    q0, q1, q3, q2, q8, q11, q10, q9
526
527    TRANSFORM_4BYTES        q0, q1, q3, q2, q8, q11, q10, q9
528
529    vrshrn.s32      d16, q0, #1
530    vrshrn.s32      d17, q1, #1
531    vrshrn.s32      d18, q2, #1
532    vrshrn.s32      d19, q3, #1
533    vst1.16 {q8, q9}, [r0]  //store
534
535    pop     {r2,r3}
536WELS_ASM_FUNC_END
537
538
539WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2_neon
540
541    vdup.s16    d1, r1              //ff
542    vdup.s16    d2, r2              //mf
543    veor        d3, d3
544
545    mov         r1, #32
546    mov         r2, r0
547
548    vld1.s16    {d0[0]}, [r0], r1       //rs[00]
549    vst1.s16    {d3[0]}, [r2], r1       //rs[00]=0
550    vld1.s16    {d0[1]}, [r0], r1       //rs[16]
551    vst1.s16    {d3[0]}, [r2], r1       //rs[16]=0
552    vld1.s16    {d0[2]}, [r0], r1       //rs[32]
553    vst1.s16    {d3[0]}, [r2], r1       //rs[32]=0
554    vld1.s16    {d0[3]}, [r0], r1       //rs[48]
555    vst1.s16    {d3[0]}, [r2], r1       //rs[48]=0
556
557    HDM_QUANT_2x2_TOTAL_16BITS  d0, d4, d5      // output d5
558
559    HDM_QUANT_2x2_TOTAL_16BITS  d5, d4, d0      // output d0
560
561    QUANT_DUALWORD_COEF_EACH_16BITS d0, d1, d2, d3, q2
562
563    vst1.s16    d1, [r3]        // store to dct
564    ldr         r2, [sp, #0]
565    vst1.s16    d1, [r2]        // store to block
566
567    mov         r1, #1
568    vdup.s16    d3, r1
569    DC_ZERO_COUNT_IN_DUALWORD   d1, d0, d3
570
571    vmov    r0, r1, d0
572    and     r0, #0x07       // range [0~4]
573    rsb     r0, #4
574WELS_ASM_FUNC_END
575
576
577WELS_ASM_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_neon
578
579    vdup.s16    d3, r1
580    mov         r1, #32
581    vld1.s16    {d0[0]}, [r0], r1       //rs[00]
582    vld1.s16    {d0[1]}, [r0], r1       //rs[16]
583    vld1.s16    {d0[2]}, [r0], r1       //rs[32]
584    vld1.s16    {d0[3]}, [r0], r1       //rs[48]
585
586    HDM_QUANT_2x2_TOTAL_16BITS  d0, d1, d2      // output d2
587
588    HDM_QUANT_2x2_TOTAL_16BITS  d2, d1, d0      // output d0
589
590    vabs.s16    d1, d0
591    vcgt.s16    d1, d1, d3      // abs(dct[i])>threshold;
592    vmov    r0, r1, d1
593    orr     r0, r1
594WELS_ASM_FUNC_END
595
596
597WELS_ASM_FUNC_BEGIN WelsGetNoneZeroCount_neon
598    push    {r1}
599    vld1.s16    {q0, q1}, [r0]
600    vmov.s16    q8, #1
601
602    ZERO_COUNT_IN_2_QUARWORD    q0, q1, q8, d0, d1, d2, d3
603    vmov    r0, r1, d0
604    and     r0, #0x1F   // range [0~16]
605    rsb     r0, #16
606    pop     {r1}
607WELS_ASM_FUNC_END
608
609
610WELS_ASM_FUNC_BEGIN WelsDequant4x4_neon
611    vld1.s16    {q0, q1}, [r0]
612    vld1.u16    {q2}, [r1]
613
614    vmul.s16    q8, q0, q2
615    vmul.s16    q9, q1, q2
616
617    vst1.s16    {q8, q9}, [r0]
618WELS_ASM_FUNC_END
619
620
621WELS_ASM_FUNC_BEGIN WelsDequantFour4x4_neon
622    vld1.u16    {q12}, [r1]
623    mov     r1, r0
624    vld1.s16    {q0, q1}, [r0]!
625    vld1.s16    {q2, q3}, [r0]!
626    vmul.s16    q0, q0, q12
627    vld1.s16    {q8, q9}, [r0]!
628    vmul.s16    q1, q1, q12
629    vld1.s16    {q10, q11}, [r0]!
630
631    vst1.s16    {q0, q1}, [r1]!
632
633    vmul.s16    q2, q2, q12
634    vmul.s16    q3, q3, q12
635    vmul.s16    q8, q8, q12
636    vst1.s16    {q2, q3}, [r1]!
637
638    vmul.s16    q9, q9, q12
639    vmul.s16    q10, q10, q12
640    vmul.s16    q11, q11, q12
641    vst1.s16    {q8, q9}, [r1]!
642    vst1.s16    {q10, q11}, [r1]!
643
644WELS_ASM_FUNC_END
645
646
647WELS_ASM_FUNC_BEGIN WelsDequantIHadamard4x4_neon
648
649    vld1.s16    {q0, q1}, [r0]
650    vdup.s16    q8, r1
651
652    IHDM_4x4_TOTAL_16BITS   q0, q2, q3
653    IHDM_4x4_TOTAL_16BITS   q1, q2, q3
654
655    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
656
657    IHDM_4x4_TOTAL_16BITS   q0, q2, q3
658    vmul.s16    q0, q8
659
660    IHDM_4x4_TOTAL_16BITS   q1, q2, q3
661    vmul.s16    q1, q8
662
663    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
664    vst1.s16    {q0, q1}, [r0]
665WELS_ASM_FUNC_END
666
667
668WELS_ASM_FUNC_BEGIN WelsIDctT4Rec_neon
669    vld1.u32        {d16[0]}, [r2], r3
670    push            {r4}
671    ldr             r4, [sp, #4]
672    vld1.u32        {d16[1]}, [r2], r3
673
674    vld4.s16        {d0, d1, d2, d3}, [r4]      // cost 3 cycles!
675    vld1.u32        {d17[0]}, [r2], r3
676    vld1.u32        {d17[1]}, [r2], r3          // q7 is pred
677
678    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       d0, d1, d2, d3, d4, d5, d6, d7
679
680    TRANSFORM_TOTAL_16BITS      d0, d1, d2, d3, d4, d5, d6, d7
681
682    MATRIX_TRANSFORM_EACH_16BITS    d0, d1, d2, d3
683
684    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       d0, d1, d2, d3, d4, d5, d6, d7
685
686    TRANSFORM_TOTAL_16BITS      d0, d1, d2, d3, d4, d5, d6, d7
687    vrshr.s16       d0, d0, #6
688    vrshr.s16       d1, d1, #6
689    vrshr.s16       d2, d2, #6
690    vrshr.s16       d3, d3, #6
691
692    //after rounding 6, clip into [0, 255]
693    vmovl.u8        q2,d16
694    vadd.s16        q0,q2
695    vqmovun.s16 d16,q0
696    vst1.32     {d16[0]},[r0],r1
697    vst1.32     {d16[1]},[r0],r1
698
699    vmovl.u8        q2,d17
700    vadd.s16        q1,q2
701    vqmovun.s16 d17,q1
702    vst1.32     {d17[0]},[r0],r1
703    vst1.32     {d17[1]},[r0]
704
705    pop         {r4}
706WELS_ASM_FUNC_END
707
708
709WELS_ASM_FUNC_BEGIN WelsIDctFourT4Rec_neon
710
711    vld1.u64        {d24}, [r2], r3
712    push            {r4}
713    ldr             r4, [sp, #4]
714    vld1.u64        {d25}, [r2], r3
715
716    vld4.s16        {d0, d1, d2, d3}, [r4]!     // cost 3 cycles!
717    vld1.u64        {d26}, [r2], r3
718    vld1.u64        {d27}, [r2], r3
719    vld4.s16        {d4, d5, d6, d7}, [r4]!     // cost 3 cycles!
720    vswp            d1, d4
721    vswp            d3, d6
722    vswp            q1, q2                      // q0~q3
723
724    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       q0, q1, q2, q3, q8, q9, q10, q11
725
726    TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
727
728    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
729
730    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       q0, q1, q2, q3, q8, q9, q10, q11
731
732    TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
733    vrshr.s16       q0, q0, #6
734    vrshr.s16       q1, q1, #6
735    vrshr.s16       q2, q2, #6
736    vrshr.s16       q3, q3, #6
737
738    //after rounding 6, clip into [0, 255]
739    vmovl.u8        q8,d24
740    vadd.s16        q0,q8
741    vqmovun.s16 d24,q0
742    vst1.u8     {d24},[r0],r1
743
744    vmovl.u8        q8,d25
745    vadd.s16        q1,q8
746    vqmovun.s16 d25,q1
747    vst1.u8     {d25},[r0],r1
748
749    vmovl.u8        q8,d26
750    vadd.s16        q2,q8
751    vqmovun.s16 d26,q2
752    vst1.u8     {d26},[r0],r1
753
754    vmovl.u8        q8,d27
755    vadd.s16        q3,q8
756    vqmovun.s16 d27,q3
757    vst1.u8     {d27},[r0],r1
758
759    vld1.u64        {d24}, [r2], r3
760    vld1.u64        {d25}, [r2], r3
761
762    vld4.s16        {d0, d1, d2, d3}, [r4]!     // cost 3 cycles!
763    vld1.u64        {d26}, [r2], r3
764    vld1.u64        {d27}, [r2], r3
765    vld4.s16        {d4, d5, d6, d7}, [r4]!     // cost 3 cycles!
766    vswp            d1, d4
767    vswp            d3, d6
768    vswp            q1, q2                      // q0~q3
769
770    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       q0, q1, q2, q3, q8, q9, q10, q11
771
772    TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
773
774    MATRIX_TRANSFORM_EACH_16BITS    q0, q1, q2, q3
775
776    ROW_TRANSFORM_1_STEP_TOTAL_16BITS       q0, q1, q2, q3, q8, q9, q10, q11
777
778    TRANSFORM_TOTAL_16BITS      q0, q1, q2, q3, q8, q9, q10, q11
779    vrshr.s16       q0, q0, #6
780    vrshr.s16       q1, q1, #6
781    vrshr.s16       q2, q2, #6
782    vrshr.s16       q3, q3, #6
783
784    //after rounding 6, clip into [0, 255]
785    vmovl.u8        q8,d24
786    vadd.s16        q0,q8
787    vqmovun.s16 d24,q0
788    vst1.u8     {d24},[r0],r1
789
790    vmovl.u8        q8,d25
791    vadd.s16        q1,q8
792    vqmovun.s16 d25,q1
793    vst1.u8     {d25},[r0],r1
794
795    vmovl.u8        q8,d26
796    vadd.s16        q2,q8
797    vqmovun.s16 d26,q2
798    vst1.u8     {d26},[r0],r1
799
800    vmovl.u8        q8,d27
801    vadd.s16        q3,q8
802    vqmovun.s16 d27,q3
803    vst1.u8     {d27},[r0],r1
804
805    pop         {r4}
806WELS_ASM_FUNC_END
807
808
809WELS_ASM_FUNC_BEGIN WelsIDctRecI16x16Dc_neon
810    push        {r4}
811    ldr         r4, [sp, #4]
812
813    vld1.s16    {q8,q9}, [r4]
814    vrshr.s16       q8, q8, #6
815    vrshr.s16       q9, q9, #6
816
817    vdup.s16    d20, d16[0]
818    vdup.s16    d21, d16[1]
819    vdup.s16    d22, d16[2]
820    vdup.s16    d23, d16[3]
821
822    vld1.u8 {q0}, [r2], r3
823    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
824    vst1.u8 {q0}, [r0], r1
825
826    vld1.u8 {q0}, [r2], r3
827    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
828    vst1.u8 {q0}, [r0], r1
829
830    vld1.u8 {q0}, [r2], r3
831    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
832    vst1.u8 {q0}, [r0], r1
833
834    vld1.u8 {q0}, [r2], r3
835    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
836    vst1.u8 {q0}, [r0], r1
837
838    vdup.s16    d20, d17[0]
839    vdup.s16    d21, d17[1]
840    vdup.s16    d22, d17[2]
841    vdup.s16    d23, d17[3]
842
843    vld1.u8 {q0}, [r2], r3
844    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
845    vst1.u8 {q0}, [r0], r1
846
847    vld1.u8 {q0}, [r2], r3
848    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
849    vst1.u8 {q0}, [r0], r1
850
851    vld1.u8 {q0}, [r2], r3
852    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
853    vst1.u8 {q0}, [r0], r1
854
855    vld1.u8 {q0}, [r2], r3
856    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
857    vst1.u8 {q0}, [r0], r1
858
859    vdup.s16    d20, d18[0]
860    vdup.s16    d21, d18[1]
861    vdup.s16    d22, d18[2]
862    vdup.s16    d23, d18[3]
863
864    vld1.u8 {q0}, [r2], r3
865    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
866    vst1.u8 {q0}, [r0], r1
867
868    vld1.u8 {q0}, [r2], r3
869    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
870    vst1.u8 {q0}, [r0], r1
871
872    vld1.u8 {q0}, [r2], r3
873    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
874    vst1.u8 {q0}, [r0], r1
875
876    vld1.u8 {q0}, [r2], r3
877    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
878    vst1.u8 {q0}, [r0], r1
879
880    vdup.s16    d20, d19[0]
881    vdup.s16    d21, d19[1]
882    vdup.s16    d22, d19[2]
883    vdup.s16    d23, d19[3]
884
885    vld1.u8 {q0}, [r2], r3
886    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
887    vst1.u8 {q0}, [r0], r1
888
889    vld1.u8 {q0}, [r2], r3
890    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
891    vst1.u8 {q0}, [r0], r1
892
893    vld1.u8 {q0}, [r2], r3
894    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
895    vst1.u8 {q0}, [r0], r1
896
897    vld1.u8 {q0}, [r2], r3
898    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   d0, d1, q10, q11, q12, q13
899    vst1.u8 {q0}, [r0], r1
900
901    pop         {r4}
902WELS_ASM_FUNC_END
#endif
