1 /******************************************************************************
2 *
3 * Copyright (C) 2022 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 *****************************************************************************
18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 /**
21 * *******************************************************************************
22 * * @file
23 * isvc_mem_fns_av8.c
24 *
25 * @brief
26 * armv8 variants of
27 * functions used for memory operations
28 *
29 * *******************************************************************************
30 */
31 #include <arm_neon.h>
32 #include <string.h>
33
34 #include "ih264_typedefs.h"
35 #include "isvc_mem_fns.h"
36
/**
 *******************************************************************************
 *
 * @brief
 *  Fills a 2D block of bytes with a constant value (NEON variant)
 *
 * @par Description:
 *  Stores u1_val into i4_blk_wd bytes on each of i4_blk_ht rows, where the
 *  start of each row is i4_dst_stride bytes after the start of the previous
 *  one. Widths of 4, 8 and any multiple of 16 are written with NEON stores;
 *  every other width is written with one memset() call per row.
 *
 * @param[out] pu1_dst
 *  Pointer to the first byte of the block to fill
 *
 * @param[in] i4_dst_stride
 *  Offset in bytes between the starts of consecutive rows
 *
 * @param[in] u1_val
 *  Byte value to store
 *
 * @param[in] i4_blk_wd
 *  Block width in bytes
 *
 * @param[in] i4_blk_ht
 *  Block height in rows
 *
 * @returns
 *  None
 *
 * @remarks
 *  Nothing is written when either dimension is zero or negative.
 *  Every path writes exactly i4_blk_ht rows, so the 4-wide and 8-wide paths
 *  are not limited to 4x4 and 8x8 blocks.
 *
 *******************************************************************************
 */
void isvc_memset_2d_neon(UWORD8 *pu1_dst, WORD32 i4_dst_stride, UWORD8 u1_val, WORD32 i4_blk_wd,
                         WORD32 i4_blk_ht)
{
    WORD32 i4_row;

    /* Guard against empty blocks and against a negative width reaching */
    /* memset() as a very large size_t                                   */
    if((i4_blk_wd <= 0) || (i4_blk_ht <= 0))
    {
        return;
    }

    if(i4_blk_wd == 4)
    {
        /* Lane 0 of the 32-bit view holds four copies of u1_val */
        const uint32x2_t v_fill_4 = vreinterpret_u32_u8(vdup_n_u8(u1_val));

        for(i4_row = 0; i4_row < i4_blk_ht; i4_row++)
        {
            /* NOTE(review): the store goes through a UWORD32 pointer cast of */
            /* pu1_dst; confirm callers' alignment if that matters on target  */
            vst1_lane_u32((UWORD32 *) pu1_dst, v_fill_4, 0);
            pu1_dst += i4_dst_stride;
        }
    }
    else if(i4_blk_wd == 8)
    {
        const uint8x8_t v_fill_8 = vdup_n_u8(u1_val);

        for(i4_row = 0; i4_row < i4_blk_ht; i4_row++)
        {
            vst1_u8(pu1_dst, v_fill_8);
            pu1_dst += i4_dst_stride;
        }
    }
    else if((i4_blk_wd % 16) == 0)
    {
        const uint8x16_t v_fill_16 = vdupq_n_u8(u1_val);
        WORD32 i4_col;

        for(i4_row = 0; i4_row < i4_blk_ht; i4_row++)
        {
            /* One 16-byte store per 16-column group of the current row */
            for(i4_col = 0; i4_col < i4_blk_wd; i4_col += 16)
            {
                vst1q_u8(pu1_dst + i4_col, v_fill_16);
            }

            pu1_dst += i4_dst_stride;
        }
    }
    else
    {
        /* Remaining widths: plain byte fill, one row at a time */
        for(i4_row = 0; i4_row < i4_blk_ht; i4_row++)
        {
            memset(pu1_dst, u1_val, i4_blk_wd);
            pu1_dst += i4_dst_stride;
        }
    }
}
152