1 /******************************************************************************
2 *
3 * Copyright (C) 2022 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20
21 /**
22 *******************************************************************************
23 * @file
24 * isvce_downscaler.c
25 *
26 * @brief
27 * Contains downscaler functions required by the SVC encoder
28 *
29 * @author
30 * ittiam
31 *
* @par List of Functions:
*  - isvce_get_downscaler_data_size()
*  - isvce_get_downscaler_padding_dims()
*  - isvce_process_downscaler()
*  - isvce_get_downscaler_normalized_filtered_pixel()
*  - isvce_horizontal_downscale_and_transpose()
*  - isvce_downscaler_function_selector()
*  - isvce_initialize_downscaler()
39 *
40 * @remarks
41 * None
42 *
43 *******************************************************************************
44 */
45
46 /*****************************************************************************/
47 /* File Includes */
48 /*****************************************************************************/
49
50 /* system include files */
51 #include <stdio.h>
52 #include <stdlib.h>
53
54 #include "ih264_typedefs.h"
55 #include "ih264_macros.h"
56 #include "isvc_macros.h"
57 #include "ih264_platform_macros.h"
58 #include "iv2.h"
59 #include "isvc_defs.h"
60 #include "isvce_defs.h"
61 #include "isvc_structs.h"
62 #include "isvc_structs.h"
63 #include "isvce_downscaler.h"
64 #include "isvce_downscaler_private_defs.h"
65
66 /**
67 ******************************************************************************
68 * @brief lanczos filter coefficients for 2x downscaling
69 * @remarks Though the length of the filter is 8, the
70 * same coefficients
71 * are replicated so that 2 rows can be processed at one
72 * go in SIMD
73 ******************************************************************************
74 */
75 static WORD8 gai1_lanczos_coefficients_2x[NUM_SCALER_FILTER_PHASES][NUM_SCALER_FILTER_TAPS * 2] = {
76 {-7, 0, 39, 64, 39, 0, -7, 0, -7, 0, 39, 64, 39, 0, -7, 0},
77 {-6, 0, 33, 62, 41, 4, -6, 0, -6, 0, 33, 62, 41, 4, -6, 0},
78 {-5, -1, 29, 57, 45, 9, -5, -1, -5, -1, 29, 57, 45, 9, -5, -1},
79 {-4, -2, 23, 55, 48, 14, -4, -2, -4, -2, 23, 55, 48, 14, -4, -2},
80 {-3, -3, 18, 52, 52, 18, -3, -3, -3, -3, 18, 52, 52, 18, -3, -3},
81 {-2, -4, 13, 49, 54, 24, -2, -4, -2, -4, 13, 49, 54, 24, -2, -4},
82 {-1, -5, 9, 44, 58, 29, -1, -5, -1, -5, 9, 44, 58, 29, -1, -5},
83 {0, -6, 3, 42, 61, 34, 0, -6, 0, -6, 3, 42, 61, 34, 0, -6}};
84
85 /**
86 ******************************************************************************
87 * @brief lanczos filter coefficients for 1.5x downscaling
88 * @remarks Though the length of the filter is 8, the same coefficients
89 * are replicated so that 2 rows can be processed at one go in SIMD.
90 ******************************************************************************
91 */
92 static WORD8 gai1_lanczos_coefficients_3by2x[NUM_SCALER_FILTER_PHASES][NUM_SCALER_FILTER_TAPS * 2] =
93 {{0, -11, 32, 86, 32, -11, 0, 0, 0, -11, 32, 86, 32, -11, 0, 0},
94 {0, -10, 26, 79, 39, -5, 0, 0, 0, -10, 26, 79, 39, -5, 0, 0},
95 {0, -8, 21, 72, 46, 0, -2, 0, 0, -8, 21, 72, 46, 0, -2, 0},
96 {0, -6, 15, 66, 52, 3, -3, 0, 0, -6, 15, 66, 52, 3, -3, 0},
97 {0, -6, 10, 60, 60, 10, -6, 0, 0, -6, 10, 60, 60, 10, -6, 0},
98 {0, -3, 3, 52, 66, 15, -6, 0, 0, -3, 3, 52, 66, 15, -6, 0},
99 {0, -2, 0, 46, 72, 21, -8, 0, 0, -2, 0, 46, 72, 21, -8, 0},
100 {0, 0, -5, 39, 79, 26, -10, 0, 0, 0, -5, 39, 79, 26, -10, 0}};
101
102 /**
103 *******************************************************************************
104 *
105 * @brief
106 * gets the memory size required for downscaler
107 *
108 * @par Description:
109 * returns the memory required by the downscaler context and state structs
110 * for allocation.
111 *
112 * @returns
113 *
114 * @remarks
115 *
116 *
117 *******************************************************************************
118 */
119
isvce_get_downscaler_data_size(UWORD8 u1_num_spatial_layers,DOUBLE d_scaling_factor,UWORD32 u4_width,UWORD32 u4_height)120 UWORD32 isvce_get_downscaler_data_size(UWORD8 u1_num_spatial_layers, DOUBLE d_scaling_factor,
121 UWORD32 u4_width, UWORD32 u4_height)
122 {
123 UWORD32 u4_size = 0;
124
125 if(u1_num_spatial_layers > 1)
126 {
127 u4_size += sizeof(downscaler_state_t);
128
129 u4_size +=
130 (u4_height + NUM_SCALER_FILTER_TAPS * 2) * ((UWORD32) (u4_width / d_scaling_factor));
131 }
132
133 return u4_size;
134 }
135
136 /**
137 *******************************************************************************
138 *
139 * @brief
140 * gets the padding size required for filtering
141 *
142 * @par Description:
143 * gets the padding size required for filtering
144 *
145 * @returns
146 *
147 * @remarks
148 *
149 *
150 *******************************************************************************
151 */
152
isvce_get_downscaler_padding_dims(padding_dims_t * ps_pad_dims)153 void isvce_get_downscaler_padding_dims(padding_dims_t *ps_pad_dims)
154 {
155 ps_pad_dims->u1_left_pad_size = ALIGN8(NUM_SCALER_FILTER_TAPS / 2);
156 ps_pad_dims->u1_right_pad_size = ALIGN8(NUM_SCALER_FILTER_TAPS / 2);
157 ps_pad_dims->u1_top_pad_size = NUM_SCALER_FILTER_TAPS / 2;
158 ps_pad_dims->u1_bottom_pad_size = NUM_SCALER_FILTER_TAPS / 2;
159 }
160
161 /**
162 *******************************************************************************
163 *
164 * @brief
165 * processes downscaler
166 *
167 * @par Description:
168 * calls the function for padding and scaling
169 *
170 * @param[in] ps_scaler
171 * pointer to downdownscaler context
172 *
173 * @param[in] ps_src_buf_props
174 * pointer to source buffer props struct
175 *
176 * @param[in] u4_blk_wd
177 * width of the block to be processed
178 *
179 * @param[in] u4_blk_ht
180 * height of the block to be processed
181 *
182 * @returns
183 *
184 * @remarks
185 *
186 *
187 *******************************************************************************
188 */
189
isvce_process_downscaler(downscaler_ctxt_t * ps_scaler,yuv_buf_props_t * ps_src_buf_props,yuv_buf_props_t * ps_dst_buf_props,UWORD32 u4_blk_wd,UWORD32 u4_blk_ht)190 void isvce_process_downscaler(downscaler_ctxt_t *ps_scaler, yuv_buf_props_t *ps_src_buf_props,
191 yuv_buf_props_t *ps_dst_buf_props, UWORD32 u4_blk_wd,
192 UWORD32 u4_blk_ht)
193 {
194 buffer_container_t s_src_buf;
195 buffer_container_t s_dst_buf;
196
197 UWORD32 u4_scaled_block_size_x, u4_scaled_block_size_y;
198
199 downscaler_state_t *ps_scaler_state = (downscaler_state_t *) ps_scaler->pv_scaler_state;
200
201 ASSERT(ps_src_buf_props->e_color_format == IV_YUV_420SP_UV);
202
203 u4_scaled_block_size_x = (UWORD32) (u4_blk_wd / ps_scaler->d_scaling_factor);
204 u4_scaled_block_size_y = (UWORD32) (u4_blk_ht / ps_scaler->d_scaling_factor);
205
206 /* luma */
207 s_src_buf = ps_src_buf_props->as_component_bufs[Y];
208 s_src_buf.pv_data = ((UWORD8 *) s_src_buf.pv_data) - (NUM_SCALER_FILTER_TAPS / 2) -
209 (NUM_SCALER_FILTER_TAPS / 2) * s_src_buf.i4_data_stride;
210
211 s_dst_buf.pv_data = ps_scaler_state->pv_scratch_buf;
212 s_dst_buf.i4_data_stride = u4_blk_ht + NUM_SCALER_FILTER_TAPS;
213
214 ps_scaler_state->pf_downscaler(ps_scaler, &s_src_buf, &s_dst_buf, ps_scaler_state->pai1_filters,
215 u4_scaled_block_size_x, u4_blk_ht + NUM_SCALER_FILTER_TAPS, 0);
216
217 s_src_buf = s_dst_buf;
218 s_dst_buf = ps_dst_buf_props->as_component_bufs[Y];
219
220 ps_scaler_state->pf_downscaler(ps_scaler, &s_src_buf, &s_dst_buf, ps_scaler_state->pai1_filters,
221 u4_scaled_block_size_y, u4_scaled_block_size_x, 0);
222
223 /* chroma */
224 u4_blk_ht /= 2;
225 u4_scaled_block_size_y /= 2;
226
227 s_src_buf = ps_src_buf_props->as_component_bufs[U];
228 s_src_buf.pv_data = ((UWORD8 *) s_src_buf.pv_data) - NUM_SCALER_FILTER_TAPS -
229 (NUM_SCALER_FILTER_TAPS / 2) * s_src_buf.i4_data_stride;
230
231 s_dst_buf.pv_data = ps_scaler_state->pv_scratch_buf;
232 s_dst_buf.i4_data_stride = u4_blk_ht + NUM_SCALER_FILTER_TAPS;
233
234 ps_scaler_state->pf_downscaler(ps_scaler, &s_src_buf, &s_dst_buf, ps_scaler_state->pai1_filters,
235 u4_scaled_block_size_x, u4_blk_ht + NUM_SCALER_FILTER_TAPS, 1);
236
237 s_src_buf = s_dst_buf;
238 s_dst_buf = ps_dst_buf_props->as_component_bufs[U];
239
240 ps_scaler_state->pf_downscaler(ps_scaler, &s_src_buf, &s_dst_buf, ps_scaler_state->pai1_filters,
241 u4_scaled_block_size_y, u4_scaled_block_size_x, 0);
242 }
243
244 /**
245 *******************************************************************************
246 *
247 * @brief
248 * normalized dot product computer for downscaler
249 *
250 * @par Description:
251 * Given the downscaler filter coefficients, source buffer, the function
252 * calculates the dot product between them, adds an offset and normalizes it
253 *
254 * @param[in] ps_scaler
255 * pointer to src buf
256 *
257 * @param[in] pi1_filter
258 * pointer to filter coefficients
259 *
260 * @returns
261 *
262 * @remarks
263 *
264 *******************************************************************************
265 */
266
isvce_get_downscaler_normalized_filtered_pixel(UWORD8 * pu1_src,WORD8 * pi1_filter)267 static UWORD8 isvce_get_downscaler_normalized_filtered_pixel(UWORD8 *pu1_src, WORD8 *pi1_filter)
268 {
269 WORD32 i;
270 WORD32 i4_norm_dot_product;
271 UWORD8 u1_out_pixel;
272 WORD32 i4_dot_product_sum = 0;
273 WORD32 i4_rounding_offset = 1 << (FILTER_COEFF_Q - 1);
274 WORD32 i4_normalizing_factor = 1 << FILTER_COEFF_Q;
275
276 for(i = 0; i < NUM_SCALER_FILTER_TAPS; i++)
277 {
278 i4_dot_product_sum += (pu1_src[i] * pi1_filter[i]);
279 }
280
281 i4_norm_dot_product = ((i4_dot_product_sum + i4_rounding_offset) / i4_normalizing_factor);
282 u1_out_pixel = (UWORD8) CLIP_U8(i4_norm_dot_product);
283
284 return u1_out_pixel;
285 }
286
287 /**
288 *******************************************************************************
289 *
290 * @brief
291 * horizontal scaler function
292 *
293 * @par Description:
294 * Does horizontal scaling for the given block
295 *
296 * @param[in] ps_scaler
297 * pointer to downscaler context
298 *
299 * @param[in] ps_src
300 * pointer to source buffer container
301 *
302 * @param[in] ps_dst
303 * pointer to destination buffer container
304 *
305 * @param[in] pai1_filters
306 * pointer to array of downscaler filters
307 *
308 * @param[in] u4_blk_wd
309 * width of the block after horizontal scaling (output block width)
310 *
311 * @param[in] u4_blk_ht
312 * height of the current block (input block height)
313 *
314 * @param[in] u1_is_chroma
315 * flag suggesting whether the buffer is luma or chroma
316 *
317 *
318 * @returns
319 *
320 * @remarks
321 * The same function is used for vertical scaling too as
322 * the horizontally scaled input in stored in transpose fashion.
323 *
324 *******************************************************************************
325 */
326
isvce_horizontal_downscale_and_transpose(downscaler_ctxt_t * ps_scaler,buffer_container_t * ps_src,buffer_container_t * ps_dst,FILTER_COEFF_ARRAY pai1_filters,UWORD32 u4_blk_wd,UWORD32 u4_blk_ht,UWORD8 u1_is_chroma)327 static void isvce_horizontal_downscale_and_transpose(
328 downscaler_ctxt_t *ps_scaler, buffer_container_t *ps_src, buffer_container_t *ps_dst,
329 FILTER_COEFF_ARRAY pai1_filters, UWORD32 u4_blk_wd, UWORD32 u4_blk_ht, UWORD8 u1_is_chroma)
330 {
331 WORD32 i, j, k;
332 UWORD8 u1_phase;
333 UWORD8 u1_filtered_out_pixel;
334 UWORD8 *pu1_src_j, *pu1_dst_j;
335 UWORD8 u1_filtered_out_u_pixel, u1_filtered_out_v_pixel;
336 UWORD8 *pu1_in_pixel;
337 UWORD8 *pu1_out_pixel;
338 WORD8 *pi1_filter_grid;
339 UWORD16 u2_full_pixel_inc;
340 UWORD8 au1_temp_u_buff[NUM_SCALER_FILTER_TAPS];
341 UWORD8 au1_temp_v_buff[NUM_SCALER_FILTER_TAPS];
342
343 downscaler_state_t *ps_scaler_state = (downscaler_state_t *) ps_scaler->pv_scaler_state;
344
345 UWORD32 u4_center_pixel_pos = ps_scaler_state->i4_init_offset;
346 UWORD32 u4_src_horz_increments = ps_scaler_state->u4_horz_increment;
347 UWORD8 *pu1_src = ps_src->pv_data;
348 UWORD32 u4_in_stride = ps_src->i4_data_stride;
349 UWORD8 *pu1_dst = ps_dst->pv_data;
350 UWORD32 u4_out_stride = ps_dst->i4_data_stride;
351 UWORD32 u4_center_pixel_pos_src = u4_center_pixel_pos;
352
353 /* Offset the input so that the input pixel to be processed
354 co-incides with the centre of filter (4th coefficient)*/
355 pu1_src += (1 + u1_is_chroma);
356
357 ASSERT((1 << DOWNSCALER_Q) == ps_scaler_state->u4_vert_increment);
358
359 if(!u1_is_chroma)
360 {
361 for(j = 0; j < (WORD32) u4_blk_ht; j++)
362 {
363 pu1_src_j = pu1_src + (j * u4_in_stride);
364 pu1_dst_j = pu1_dst + j;
365
366 u4_center_pixel_pos = u4_center_pixel_pos_src;
367
368 for(i = 0; i < (WORD32) u4_blk_wd; i++)
369 {
370 u1_phase = get_filter_phase(u4_center_pixel_pos);
371 pi1_filter_grid = pai1_filters[u1_phase];
372
373 /* Doing the Calculation for current Loop Count */
374 u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
375 pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
376 pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
377
378 u1_filtered_out_pixel =
379 isvce_get_downscaler_normalized_filtered_pixel(pu1_in_pixel, pi1_filter_grid);
380 *pu1_out_pixel = u1_filtered_out_pixel;
381
382 /* Update the context for next Loop Count */
383 u4_center_pixel_pos += u4_src_horz_increments;
384 }
385 }
386 }
387 else
388 {
389 for(j = 0; j < (WORD32) u4_blk_ht; j++)
390 {
391 pu1_src_j = pu1_src + (j * u4_in_stride);
392 pu1_dst_j = pu1_dst + j;
393
394 u4_center_pixel_pos = u4_center_pixel_pos_src;
395
396 for(i = 0; i < (WORD32) u4_blk_wd; i++)
397 {
398 u1_phase = get_filter_phase(u4_center_pixel_pos);
399 pi1_filter_grid = pai1_filters[u1_phase];
400
401 /*Doing the Calculation for current Loop Count */
402 u2_full_pixel_inc = u4_center_pixel_pos >> DOWNSCALER_Q;
403 pu1_in_pixel = pu1_src_j + (u2_full_pixel_inc << u1_is_chroma);
404 pu1_out_pixel = pu1_dst_j + ((i << u1_is_chroma) * u4_out_stride);
405
406 for(k = 0; k < NUM_SCALER_FILTER_TAPS; k++)
407 {
408 au1_temp_u_buff[k] = *(pu1_in_pixel + (2 * k));
409 au1_temp_v_buff[k] = *(pu1_in_pixel + ((2 * k) + 1));
410 }
411
412 u1_filtered_out_u_pixel = isvce_get_downscaler_normalized_filtered_pixel(
413 au1_temp_u_buff, pi1_filter_grid);
414 u1_filtered_out_v_pixel = isvce_get_downscaler_normalized_filtered_pixel(
415 au1_temp_v_buff, pi1_filter_grid);
416 *pu1_out_pixel = u1_filtered_out_u_pixel;
417 *(pu1_out_pixel + u4_out_stride) = u1_filtered_out_v_pixel;
418
419 /* Update the context for next Loop Count */
420 u4_center_pixel_pos += u4_src_horz_increments;
421 }
422 }
423 }
424 }
425
/**
*******************************************************************************
*
* @brief
*  selects the downscaling kernel for the given architecture
*
* @par Description:
*  Sets ps_scaler_state->pf_downscaler. Depending on the build (X86, ARMV8,
*  or ARM without DISABLE_NEON) the SSE4.2 or NEON kernel is chosen for the
*  architectures listed below; every other case gets the C implementation
*  isvce_horizontal_downscale_and_transpose().
*
* @param[out] ps_scaler_state
*  pointer to downscaler state whose function pointer is set
*
* @param[in] e_arch
*  architecture type
*
* @returns
*  none
*
*******************************************************************************
*/

void isvce_downscaler_function_selector(downscaler_state_t *ps_scaler_state, IV_ARCH_T e_arch)
{
    switch(e_arch)
    {
#if defined(X86)
        case ARCH_X86_SSE42:
        {
            ps_scaler_state->pf_downscaler = isvce_horizontal_downscale_and_transpose_sse42;

            return;
        }
#elif defined(ARMV8)
        case ARCH_ARM_A53:
        case ARCH_ARM_A57:
        case ARCH_ARM_V8_NEON:
        {
            ps_scaler_state->pf_downscaler = isvce_horizontal_downscale_and_transpose_neon;

            return;
        }
#elif defined(ARM) && !defined(DISABLE_NEON)
        case ARCH_ARM_A9Q:
        case ARCH_ARM_A9A:
        case ARCH_ARM_A9:
        case ARCH_ARM_A7:
        case ARCH_ARM_A5:
        case ARCH_ARM_A15:
        {
            ps_scaler_state->pf_downscaler = isvce_horizontal_downscale_and_transpose_neon;

            return;
        }
#endif
        default:
        {
            break;
        }
    }

    /* no SIMD kernel selected above */
    ps_scaler_state->pf_downscaler = isvce_horizontal_downscale_and_transpose;
}
467
468 /**
469 *******************************************************************************
470 *
471 * @brief
472 * initializes the downscaler context
473 *
474 * @par Description:
475 * initializes the downscaler context for the given scaling factor
476 * with padding size, filter size, etc.
477 *
478 * @param[in] ps_scaler
479 * pointer downscaler context
480 *
481 * @param[in] ps_mem_rec
482 * pointer to memory allocated to downscaler process
483 *
484 * @param[in] d_scaling_factor
485 * scaling reatio of width/ height between two consecutive SVC layers
486 *
487 * @param[in] u1_num_spatial_layers
488 * scaling reatio of width/ height between two consecutive SVC layers
489 *
490 * @param[in] u4_wd
491 * width of the input
492 *
493 * @param[in] u4_ht
494 * height of the input
495 *
496 * @param[in] e_arch
497 * architecure type
498 *
499 * @returns
500 *
501 * @remarks
502 * when ARM intrinsics are added, update should be done here
503 *
504 *******************************************************************************
505 */
506
isvce_initialize_downscaler(downscaler_ctxt_t * ps_scaler,iv_mem_rec_t * ps_mem_rec,DOUBLE d_scaling_factor,UWORD8 u1_num_spatial_layers,UWORD32 u4_in_width,UWORD32 u4_in_height,IV_ARCH_T e_arch)507 void isvce_initialize_downscaler(downscaler_ctxt_t *ps_scaler, iv_mem_rec_t *ps_mem_rec,
508 DOUBLE d_scaling_factor, UWORD8 u1_num_spatial_layers,
509 UWORD32 u4_in_width, UWORD32 u4_in_height, IV_ARCH_T e_arch)
510 {
511 if(u1_num_spatial_layers > 1)
512 {
513 downscaler_state_t *ps_scaler_state;
514
515 UWORD8 *pu1_buf = (UWORD8 *) ps_mem_rec->pv_base;
516
517 ps_scaler_state = (downscaler_state_t *) pu1_buf;
518 pu1_buf += sizeof(ps_scaler_state[0]);
519
520 ps_scaler_state->pv_scratch_buf = pu1_buf;
521 ps_scaler_state->u4_in_wd = u4_in_width;
522 ps_scaler_state->u4_in_ht = u4_in_height;
523
524 ps_scaler->pv_scaler_state = ps_scaler_state;
525 ps_scaler->d_scaling_factor = d_scaling_factor;
526 ps_scaler->u1_num_spatial_layers = u1_num_spatial_layers;
527
528 isvce_downscaler_function_selector(ps_scaler_state, e_arch);
529
530 ps_scaler_state->u4_horz_increment = (UWORD32) (d_scaling_factor * (1 << DOWNSCALER_Q));
531
532 ps_scaler_state->u4_vert_increment = (1 << DOWNSCALER_Q);
533 ps_scaler_state->i4_init_offset = 0;
534 ps_scaler_state->pai1_filters = (d_scaling_factor == 2.0) ? gai1_lanczos_coefficients_2x
535 : gai1_lanczos_coefficients_3by2x;
536 }
537 }
538