1 /******************************************************************************
2 *
3 * Copyright (C) 2018 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 *******************************************************************************
22 * @file
23 * ihevce_scale_by_2_neon.c
24 *
25 * @brief
26 * Contains definitions of functions for scale by 2
27 *
28 * @author
29 * Ittiam
30 *
31 * @par List of Functions:
32 *
33 * @remarks
34 * None
35 *
36 ********************************************************************************
37 */
38 /*****************************************************************************/
39 /* File Includes */
40 /*****************************************************************************/
41 /* System include files */
42 #include <stdio.h>
43 #include <string.h>
44 #include <assert.h>
45 #include <arm_neon.h>
46
47 /* System user files */
48 #include "ihevc_typedefs.h"
49 #include "ihevc_macros.h"
50 #include "itt_video_api.h"
51 #include "ihevce_ipe_instr_set_router.h"
52
53 /*****************************************************************************/
54 /* Constant Macros */
55 /*****************************************************************************/
56 #define FILT_TAP_Q 7
57
58 /*****************************************************************************/
59 /* Function Definitions */
60 /*****************************************************************************/
61
/**
 * Horizontal scale-by-2 pass.
 *
 * For every row, each group of 16 source pixels yields 8 output pixels.
 * The output for even source position a[0] is
 *
 *     (a[0] * 66 + (a[-1] + a[1]) * 40 - (a[-3] + a[3]) * 9 + 64) >> 7
 *
 * saturated to 8 bits.
 *
 * @param pu1_src   first source pixel of the first row; pixels starting
 *                  3 bytes to the left of it are read, and every iteration
 *                  loads 32 bytes starting from that position
 * @param src_strd  source stride in bytes
 * @param pu1_dst   destination; 8 bytes are written per 16 source pixels
 * @param dst_strd  destination stride in bytes
 * @param wd        source width in pixels, consumed in steps of 16
 * @param ht        number of rows to process
 */
static void ihevce_horz_scale_neon_w16(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 wd, WORD32 ht)
{
    const uint8x8_t centre_wt = vdup_n_u8(66);
    const int8_t inner_wt = 40;
    const int8_t outer_wt = 9;
    /* 8192 is removed before accumulating so the sum stays inside int16 */
    /* (255 * 66 + 510 * 40 alone would not fit); 64 == 8192 >> 7 is put */
    /* back after the shift.                                             */
    const int16x8_t bias = vdupq_n_s16(8192);
    const int16x8_t unbias = vdupq_n_s16(64);
    WORD32 row, col;

    for(row = 0; row < ht; row++)
    {
        /* Start 3 pixels to the left so that a[-3] is lane 0 of the even half */
        UWORD8 *pu1_in = pu1_src + row * src_strd - 3;
        UWORD8 *pu1_out = pu1_dst + row * dst_strd;

        for(col = 0; col < wd; col += 16, pu1_in += 16, pu1_out += 8)
        {
            /* De-interleaving load: val[0] holds a[-3], a[-1], a[1], ...  */
            /*                       val[1] holds a[-2], a[0],  a[2], ...  */
            const uint8x16x2_t pix = vld2q_u8(pu1_in);
            const uint8x8_t even_lo = vget_low_u8(pix.val[0]);
            const uint8x8_t even_hi = vget_high_u8(pix.val[0]);
            const uint8x8_t odd_lo = vget_low_u8(pix.val[1]);
            const uint8x8_t odd_hi = vget_high_u8(pix.val[1]);

            /* Line every tap up with the 8 output lanes; a[-3] is even_lo */
            const uint8x8_t a_0 = vext_u8(odd_lo, odd_hi, 1);
            const uint8x8_t a_m1 = vext_u8(even_lo, even_hi, 1);
            const uint8x8_t a_p1 = vext_u8(even_lo, even_hi, 2);
            const uint8x8_t a_p3 = vext_u8(even_lo, even_hi, 3);

            /* a[0] * 66 */
            const int16x8_t centre = vreinterpretq_s16_u16(vmull_u8(a_0, centre_wt));
            /* (a[-1] + a[1]) * 40 */
            const int16x8_t inner =
                vmulq_n_s16(vreinterpretq_s16_u16(vaddl_u8(a_m1, a_p1)), inner_wt);
            /* (a[-3] + a[3]) * 9 */
            const int16x8_t outer =
                vmulq_n_s16(vreinterpretq_s16_u16(vaddl_u8(a_p3, even_lo)), outer_wt);
            int16x8_t acc;

            /* a[0] * 66 + (a[-1] + a[1]) * 40 - (a[-3] + a[3]) * 9, biased */
            acc = vaddq_s16(vsubq_s16(inner, outer), vsubq_s16(centre, bias));
            acc = vrshrq_n_s16(acc, FILT_TAP_Q);
            acc = vaddq_s16(acc, unbias);

            /* Saturate to 8 bits and store the 8 results */
            vst1_u8(pu1_out, vqmovun_s16(acc));
        }
    }
}
112
/**
 * Applies the scale-by-2 filter to 8 lanes at once:
 *
 *     (c * 66 + (t1 + b1) * 40 - (t2 + b2) * 9 + 64) >> 7
 *
 * saturated to 8 bits.
 *
 * @param c   centre tap samples, a[0]
 * @param t1  samples at a[-1]
 * @param b1  samples at a[1]
 * @param t2  samples at a[-3]
 * @param b2  samples at a[3]
 * @return    the 8 filtered samples
 */
static inline uint8x8_t ihevce_vert_scale_filt_neon(
    uint8x8_t c, uint8x8_t t1, uint8x8_t b1, uint8x8_t t2, uint8x8_t b2)
{
    /* 8192 is removed before accumulating so the sum stays inside int16 */
    /* (255 * 66 + 510 * 40 alone would not fit); 64 == 8192 >> 7 is put */
    /* back after the shift.                                             */
    const int16x8_t prec = vdupq_n_s16(8192);
    const int16x8_t inv_prec = vdupq_n_s16(64);
    const uint8x8_t wt_0 = vdup_n_u8(66);
    const int16_t wt_1 = 40;
    const int16_t wt_2 = 9;
    int16x8_t p, q, r;
    int16x8_t sum;

    // a[0] * 66
    p = vreinterpretq_s16_u16(vmull_u8(c, wt_0));
    // (a[-1] + a[1]) * 40
    q = vreinterpretq_s16_u16(vaddl_u8(t1, b1));
    q = vmulq_n_s16(q, wt_1);
    // (a[-3] + a[3]) * 9
    r = vreinterpretq_s16_u16(vaddl_u8(t2, b2));
    r = vmulq_n_s16(r, wt_2);

    // a[0] * 66 + (a[-1] + a[1]) * 40 - (a[-3] + a[3]) * 9
    sum = vsubq_s16(p, prec);
    q = vsubq_s16(q, r);
    sum = vaddq_s16(q, sum);
    sum = vrshrq_n_s16(sum, FILT_TAP_Q);
    sum = vaddq_s16(sum, inv_prec);

    return vqmovun_s16(sum);
}

/**
 * Vertical scale-by-2 pass.
 *
 * The picture is processed in strips of 16 columns. Within a strip one
 * output row is produced for every two source rows; output row k is the
 * filtered value of source rows 2k-3, 2k-1, 2k, 2k+1 and 2k+3.
 *
 * Only columns below wd are written: when the last strip has 8 or fewer
 * columns left, its upper 8 lanes are computed from the loaded data but
 * not stored.
 *
 * NOTE: row loads are always 16 bytes wide, so the source must be readable
 * for 16 columns per strip even if fewer than 16 columns remain.
 *
 * @param pu1_src   source pixel of row 0; rows -3 .. are read
 * @param src_strd  source stride in bytes
 * @param pu1_dst   destination
 * @param dst_strd  destination stride in bytes
 * @param wd        width in pixels (same for source and destination)
 * @param ht        source height in rows, consumed two rows at a time
 */
static void ihevce_vert_scale_neon_w16(
    UWORD8 *pu1_src, WORD32 src_strd, UWORD8 *pu1_dst, WORD32 dst_strd, WORD32 wd, WORD32 ht)
{
    WORD32 i, j;

/* Load the next source row into the 8-entry ring buffer and advance */
#define LOAD_ROW()                         \
    do                                     \
    {                                      \
        src[mod8] = vld1q_u8(pu1_src_tmp); \
        pu1_src_tmp += src_strd;           \
        mod8++;                            \
        mod8 &= 7;                         \
    } while(0)

    for(i = 0; i < wd; i += 16)
    {
        UWORD8 *pu1_src_tmp = pu1_src - 3 * src_strd + i;
        /* Upper 8 columns of this strip belong to the output only if */
        /* they lie inside wd                                         */
        const WORD32 store_hi = ((i + 8) < wd);
        WORD32 lut_id = 0;
        UWORD8 mod8 = 0;
        uint8x16_t src[8];

        /* Prime the ring buffer with the rows above and around row 0 */
        LOAD_ROW(); // r[-3]
        LOAD_ROW(); // r[-2]
        LOAD_ROW(); // r[-1]
        LOAD_ROW(); // r[0]
        LOAD_ROW(); // r[1]

        for(j = 0; j < ht; j += 2)
        {
            UWORD8 *pu1_dst_tmp = pu1_dst + (j >> 1) * dst_strd + i;
            UWORD8 c, t1, b1, t2, b2;

            /* Two more rows per output row: r[j + 2] and r[j + 3] */
            LOAD_ROW();
            LOAD_ROW();

            /* Ring-buffer slots of the taps; slot lut_id holds r[j - 3] */
            t2 = (lut_id & 7);
            t1 = (lut_id + 2) & 7;
            c = (lut_id + 3) & 7;
            b1 = (lut_id + 4) & 7;
            b2 = (lut_id + 6) & 7;
            lut_id += 2;

            /* Columns i .. i + 7 */
            vst1_u8(
                pu1_dst_tmp,
                ihevce_vert_scale_filt_neon(
                    vget_low_u8(src[c]),
                    vget_low_u8(src[t1]),
                    vget_low_u8(src[b1]),
                    vget_low_u8(src[t2]),
                    vget_low_u8(src[b2])));

            /* Columns i + 8 .. i + 15, skipped when they fall outside wd */
            if(store_hi)
            {
                vst1_u8(
                    pu1_dst_tmp + 8,
                    ihevce_vert_scale_filt_neon(
                        vget_high_u8(src[c]),
                        vget_high_u8(src[t1]),
                        vget_high_u8(src[b1]),
                        vget_high_u8(src[t2]),
                        vget_high_u8(src[b2])));
            }
        }
    }

#undef LOAD_ROW
}
201
/**
 * Scales a wd x ht block down by 2 in both directions.
 *
 * The block is walked in tiles of at most 64 rows and 64, 32 or 16
 * columns. For each tile the horizontal pass is run first on the tile
 * rows plus 3 rows above and 2 rows below, writing into the scratch
 * buffer; the vertical pass then reads the scratch buffer starting at its
 * row 3 and writes the result into the destination. The scratch buffer is
 * reused for every tile.
 *
 * @param pu1_src      source block
 * @param src_strd     source stride in bytes
 * @param pu1_scrtch   scratch buffer for the horizontally scaled tile
 * @param scrtch_strd  scratch stride in bytes
 * @param pu1_dst      destination block
 * @param dst_strd     destination stride in bytes
 * @param ht           source height; asserted to be even
 * @param wd           source width; asserted to be a non-zero multiple of 16
 */
void ihevce_scaling_filter_mxn_neon(
    UWORD8 *pu1_src,
    WORD32 src_strd,
    UWORD8 *pu1_scrtch,
    WORD32 scrtch_strd,
    UWORD8 *pu1_dst,
    WORD32 dst_strd,
    WORD32 ht,
    WORD32 wd)
{
    WORD32 row = 0;

    assert(wd >= 16 && wd % 16 == 0);
    assert(ht % 2 == 0);

    while(row < ht)
    {
        const WORD32 rows_left = ht - row;
        const WORD32 tile_ht = (rows_left > 64) ? 64 : rows_left;
        UWORD8 *pu1_src_row = pu1_src + row * src_strd;
        UWORD8 *pu1_dst_row = pu1_dst + (row >> 1) * dst_strd;
        WORD32 col = 0;

        while(col < wd)
        {
            const WORD32 cols_left = wd - col;
            /* Widest of 64 / 32 / 16 that still fits */
            const WORD32 tile_wd = (cols_left >= 64) ? 64 : ((cols_left >= 32) ? 32 : 16);

            /* Horizontal pass: 3 rows above + tile rows + 2 rows below */
            ihevce_horz_scale_neon_w16(
                pu1_src_row - 3 * src_strd + col,
                src_strd,
                pu1_scrtch,
                scrtch_strd,
                tile_wd,
                (3 + tile_ht + 2));

            /* Vertical pass on the half-width intermediate; scratch row 3 */
            /* corresponds to the first row of the tile                    */
            ihevce_vert_scale_neon_w16(
                pu1_scrtch + 3 * scrtch_strd,
                scrtch_strd,
                pu1_dst_row + (col >> 1),
                dst_strd,
                (tile_wd >> 1),
                tile_ht);

            col += tile_wd;
        }

        row += tile_ht;
    }
}
252