• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /******************************************************************************
2  *
3  * Copyright (C) 2018 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 *  ihevce_scale_by_2_neon.c
24 *
25 * @brief
26 *  Contains definitions of functions for scale by 2
27 *
28 * @author
29 *  Ittiam
30 *
31 * @par List of Functions:
32 *
33 * @remarks
34 *  None
35 *
36 ********************************************************************************
37 */
38 /*****************************************************************************/
39 /* File Includes                                                             */
40 /*****************************************************************************/
41 /* System include files */
42 #include <stdio.h>
43 #include <string.h>
44 #include <assert.h>
45 #include <arm_neon.h>
46 
47 /* User include files */
48 #include "ihevc_typedefs.h"
49 #include "ihevc_macros.h"
50 #include "itt_video_api.h"
51 #include "ihevce_ipe_instr_set_router.h"
52 
53 /*****************************************************************************/
54 /* Constant Macros                                                           */
55 /*****************************************************************************/
56 #define FILT_TAP_Q 7
57 
58 /*****************************************************************************/
59 /* Function Definitions                                                      */
60 /*****************************************************************************/
61 
/**
 * Horizontal 2:1 down-scaling pass.
 *
 * For every row, each output pixel k is produced from the source pixels
 * centred on a[2k] with the symmetric kernel
 *
 *     (66 * a[0] + 40 * (a[-1] + a[1]) - 9 * (a[-3] + a[3]) + 64) >> 7
 *
 * saturated to 8 bits. Sixteen source pixels (eight outputs) are consumed per
 * inner iteration; each iteration loads 32 bytes starting 3 bytes to the left
 * of the current source position.
 *
 * @param pu1_src   first source pixel of the first row
 * @param src_strd  source stride in bytes
 * @param pu1_dst   destination; wd / 2 pixels are written per row
 * @param dst_strd  destination stride in bytes
 * @param wd        source width to process, consumed in steps of 16
 * @param ht        number of rows to process
 */
static void ihevce_horz_scale_neon_w16(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 wd, WORD32 ht)
{
    /* The worst-case positive sum (255 * 66 + 510 * 40) does not fit in a   */
    /* signed 16-bit lane, so 8192 (= 64 << FILT_TAP_Q) is subtracted before */
    /* the rounding shift and 64 is added back afterwards.                   */
    const int16x8_t bias = vdupq_n_s16(8192);
    const int16x8_t bias_restore = vdupq_n_s16(64);
    const uint8x8_t coef_centre = vdup_n_u8(66);
    const int8_t coef_near = 40;
    const int8_t coef_far = 9;
    WORD32 row, col;

    for(row = 0; row < ht; row++)
    {
        UWORD8 *pu1_in = pu1_src + row * src_strd - 3;
        UWORD8 *pu1_out = pu1_dst + row * dst_strd;

        for(col = 0; col < wd; col += 16, pu1_in += 16, pu1_out += 8)
        {
            /* De-interleaving load: val[0][k] = a[2k - 3], val[1][k] = a[2k - 2] */
            const uint8x16x2_t pix = vld2q_u8(pu1_in);
            const uint8x8_t even_lo = vget_low_u8(pix.val[0]);
            const uint8x8_t even_hi = vget_high_u8(pix.val[0]);
            const uint8x8_t odd_lo = vget_low_u8(pix.val[1]);
            const uint8x8_t odd_hi = vget_high_u8(pix.val[1]);

            const uint8x8_t centre = vext_u8(odd_lo, odd_hi, 1);  /* a[0]  */
            const uint8x8_t left1 = vext_u8(even_lo, even_hi, 1); /* a[-1] */
            const uint8x8_t right1 = vext_u8(even_lo, even_hi, 2); /* a[1]  */
            const uint8x8_t right3 = vext_u8(even_lo, even_hi, 3); /* a[3]  */
            /* even_lo itself holds a[-3] */

            int16x8_t term_c, term_near, term_far, acc;

            /* a[0] * 66 */
            term_c = vreinterpretq_s16_u16(vmull_u8(centre, coef_centre));
            /* (a[-1] + a[1]) * 40 */
            term_near = vmulq_n_s16(vreinterpretq_s16_u16(vaddl_u8(left1, right1)), coef_near);
            /* (a[-3] + a[3]) * 9 */
            term_far = vmulq_n_s16(vreinterpretq_s16_u16(vaddl_u8(right3, even_lo)), coef_far);

            /* a[0] * 66 + (a[-1] + a[1]) * 40 - (a[-3] + a[3]) * 9, biased */
            acc = vaddq_s16(vsubq_s16(term_near, term_far), vsubq_s16(term_c, bias));
            acc = vrshrq_n_s16(acc, FILT_TAP_Q);
            acc = vaddq_s16(acc, bias_restore);

            /* Saturate to unsigned 8 bits and emit eight output pixels */
            vst1_u8(pu1_out, vqmovun_s16(acc));
        }
    }
}
112 
ihevce_vert_scale_neon_w16(UWORD8 * pu1_src,WORD32 src_strd,UWORD8 * pu1_dst,WORD32 dst_strd,WORD32 wd,WORD32 ht)113 static void ihevce_vert_scale_neon_w16(
114     UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 wd, WORD32 ht)
115 {
116     const int16x8_t prec = vdupq_n_s16(8192);
117     const int16x8_t inv_prec = vdupq_n_s16(64);
118     const uint8x8_t wt_0 = vdup_n_u8(66);
119     const int8_t wt_1 = 40;
120     const int8_t wt_2 = 9;
121     WORD32 i, j;
122 
123 #define LOAD_ROW()                                                                                 \
124     {                                                                                              \
125         src[mod8] = vld1q_u8(pu1_src_tmp);                                                         \
126         pu1_src_tmp += src_strd;                                                                   \
127         mod8++;                                                                                    \
128         mod8 &= 7;                                                                                 \
129     }
130 
131     for(i = 0; i < wd; i += 16)
132     {
133         UWORD8 *pu1_src_tmp = pu1_src - 3 * src_strd + i;
134         WORD32 lut_id = 0;
135         UWORD8 mod8 = 0;
136         uint8x16_t src[8];
137 
138         LOAD_ROW()  // r[-3]
139         LOAD_ROW()  // r[-2]
140         LOAD_ROW()  // r[-1]
141         LOAD_ROW()  // r[0]
142         LOAD_ROW()  // r[1]
143 
144         for(j = 0; j < ht; j += 2)
145         {
146             UWORD8 *pu1_dst_tmp = pu1_dst + (j >> 1) * dst_strd + i;
147             UWORD8 c, t1, b1, t2, b2;
148             int16x8_t p, q, r;
149             int16x8_t sum;
150 
151             LOAD_ROW()  // r[2]
152             LOAD_ROW()  // r[3]
153 
154             t2 = (lut_id & 7);
155             t1 = (lut_id + 2) & 7;
156             c = (lut_id + 3) & 7;
157             b1 = (lut_id + 4) & 7;
158             b2 = (lut_id + 6) & 7;
159             lut_id += 2;
160 
161             // a[0] * 66
162             p = vreinterpretq_s16_u16(vmull_u8(vget_low_u8(src[c]), wt_0));
163             // (a[-1] + a[1]) * 40
164             q = vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(src[t1]), vget_low_u8(src[b1])));
165             q = vmulq_n_s16(q, wt_1);
166             // (a[-3] + a[3]) * 9
167             r = vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(src[t2]), vget_low_u8(src[b2])));
168             r = vmulq_n_s16(r, wt_2);
169 
170             // a[0] * 66 + (a[-1] + a[1]) * 40 - (a[-3] + a[3]) * 9
171             sum = vsubq_s16(p, prec);
172             q = vsubq_s16(q, r);
173             sum = vaddq_s16(q, sum);
174             sum = vrshrq_n_s16(sum, FILT_TAP_Q);
175             sum = vaddq_s16(sum, inv_prec);
176 
177             vst1_u8(pu1_dst_tmp, vqmovun_s16(sum));
178 
179             // a[0] * 66
180             p = vreinterpretq_s16_u16(vmull_u8(vget_high_u8(src[c]), wt_0));
181             // (a[-1] + a[1]) * 40
182             q = vreinterpretq_s16_u16(vaddl_u8(vget_high_u8(src[t1]), vget_high_u8(src[b1])));
183             q = vmulq_n_s16(q, wt_1);
184             // (a[-3] + a[3]) * 9
185             r = vreinterpretq_s16_u16(vaddl_u8(vget_high_u8(src[t2]), vget_high_u8(src[b2])));
186             r = vmulq_n_s16(r, wt_2);
187 
188             // a[0] * 66 + (a[-1] + a[1]) * 40 - (a[-3] + a[3]) * 9
189             sum = vsubq_s16(p, prec);
190             q = vsubq_s16(q, r);
191             sum = vaddq_s16(q, sum);
192             sum = vrshrq_n_s16(sum, FILT_TAP_Q);
193             sum = vaddq_s16(sum, inv_prec);
194 
195             vst1_u8(pu1_dst_tmp + 8, vqmovun_s16(sum));
196 
197             pu1_dst_tmp += 16;
198         }
199     }
200 }
201 
/**
 * Scales a wd x ht picture region down by 2 in both directions.
 *
 * The region is walked in strips of up to 64 rows and, within a strip, in
 * tiles of 64, 32 or 16 columns. Each tile is first filtered horizontally
 * into the scratch buffer, covering 3 rows above and 2 rows below the strip
 * (so up to 69 scratch rows are written), and is then filtered vertically
 * from scratch row 3 onward into the destination.
 *
 * @param pu1_src      top-left source pixel; rows above and below the region
 *                     and pixels to its left and right are also read
 * @param src_strd     source stride in bytes
 * @param pu1_scrtch   scratch buffer for the horizontally filtered tile
 * @param scrtch_strd  scratch stride in bytes
 * @param pu1_dst      destination for the (wd / 2) x (ht / 2) result
 * @param dst_strd     destination stride in bytes
 * @param ht           source height; asserted to be even
 * @param wd           source width; asserted to be a non-zero multiple of 16
 */
void ihevce_scaling_filter_mxn_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_scrtch,
    WORD32 scrtch_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht,
    WORD32 wd)
{
    WORD32 row = 0;

    assert(wd >= 16 && wd % 16 == 0);
    assert(ht % 2 == 0);

    while(row < ht)
    {
        const WORD32 strip_ht = MIN(64, (ht - row));
        UWORD8 *pu1_strip_src = pu1_src + row * src_strd;
        UWORD8 *pu1_strip_dst = pu1_dst + (row >> 1) * dst_strd;
        WORD32 col = 0;

        while(col < wd)
        {
            /* Largest of 64 / 32 / 16 that fits in the remaining width */
            const WORD32 remaining = wd - col;
            const WORD32 tile_wd = (remaining >= 64) ? 64 : ((remaining >= 32) ? 32 : 16);

            /* Horizontal pass, including the rows the vertical taps need */
            ihevce_horz_scale_neon_w16(
                pu1_strip_src - 3 * src_strd + col,
                src_strd,
                pu1_scrtch,
                scrtch_strd,
                tile_wd,
                (3 + strip_ht + 2));

            /* Vertical pass; scratch row 3 corresponds to the strip's row 0 */
            ihevce_vert_scale_neon_w16(
                pu1_scrtch + 3 * scrtch_strd,
                scrtch_strd,
                pu1_strip_dst + (col >> 1),
                dst_strd,
                (tile_wd >> 1),
                strip_ht);

            col += tile_wd;
        }

        row += strip_ht;
    }
}
252