Lines Matching full:vec
14 #include <executorch/kernels/optimized/vec/vec.h>
17 namespace vec {
23 vec::Vectorized<scalar_t> acc_vec, in vec_reduce_all()
25 using Vec = vec::Vectorized<scalar_t>; in vec_reduce_all() local
26 scalar_t acc_arr[Vec::size()]; in vec_reduce_all()
29 std::array<scalar_t, Vec::size()> acc_arr_next = {0}; in vec_reduce_all()
31 Vec acc_vec_next = Vec::loadu(acc_arr_next.data()); in vec_reduce_all()
50 using Vec = Vectorized<float>;
51 Vec v = acc_vec;
53 Vec v1 = _mm256_permute2f128_ps(v, v, 0x1);
69 using Vec = Vectorized<float>;
70 Vec v = acc_vec;
72 Vec v1 = _mm512_shuffle_f32x4(v, v, 0x4E);
96 using Vec = vec::Vectorized<scalar_t>;
97 if (size < Vec::size())
98 return vec_reduce_all(vec_fun, Vec::loadu(data, size), size);
99 int64_t d = Vec::size();
100 Vec acc_vec = Vec::loadu(data);
101 for (; d < size - (size % Vec::size()); d += Vec::size()) {
102 Vec data_vec = Vec::loadu(data + d);
106 Vec data_vec = Vec::loadu(data + d, size - d);
107 acc_vec = Vec::set(acc_vec, vec_fun(acc_vec, data_vec), size - d);
116 using Vec = vec::Vectorized<scalar_t>;
117 if (size < Vec::size()) {
118 auto loaded_data = Vec::loadu(data, size);
123 int64_t d = Vec::size();
124 Vec acc_vec1 = Vec::loadu(data);
125 Vec acc_vec2 = Vec::loadu(data);
126 for (; d < size - (size % Vec::size()); d += Vec::size()) {
127 Vec data_vec = Vec::loadu(data + d);
132 Vec data_vec = Vec::loadu(data + d, size - d);
133 acc_vec1 = Vec::set(acc_vec1, vec_fun1(acc_vec1, data_vec), size - d);
134 acc_vec2 = Vec::set(acc_vec2, vec_fun2(acc_vec2, data_vec), size - d);
147 using Vec = vec::Vectorized<scalar_t>;
148 if (size < Vec::size())
149 return vec_reduce_all(red_fun, map_fun(Vec::loadu(data, size)), size);
150 int64_t d = Vec::size();
151 Vec acc_vec = map_fun(Vec::loadu(data));
152 for (; d < size - (size % Vec::size()); d += Vec::size()) {
153 Vec data_vec = Vec::loadu(data + d);
158 Vec data_vec = Vec::loadu(data + d, size - d);
160 acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
172 using Vec = vec::Vectorized<scalar_t>;
173 if (size < Vec::size()) {
174 Vec data_vec = Vec::loadu(data, size);
175 Vec data2_vec = Vec::loadu(data2, size);
179 int64_t d = Vec::size();
180 Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2));
181 for (; d < size - (size % Vec::size()); d += Vec::size()) {
182 Vec data_vec = Vec::loadu(data + d);
183 Vec data2_vec = Vec::loadu(data2 + d);
188 Vec data_vec = Vec::loadu(data + d, size - d);
189 Vec data2_vec = Vec::loadu(data2 + d, size - d);
191 acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
204 using Vec = vec::Vectorized<scalar_t>;
205 if (size < Vec::size()) {
206 Vec data_vec = Vec::loadu(data, size);
207 Vec data2_vec = Vec::loadu(data2, size);
208 Vec data3_vec = Vec::loadu(data3, size);
213 int64_t d = Vec::size();
214 Vec acc_vec = map_fun(Vec::loadu(data), Vec::loadu(data2), Vec::loadu(data3));
215 for (; d < size - (size % Vec::size()); d += Vec::size()) {
216 Vec data_vec = Vec::loadu(data + d);
217 Vec data2_vec = Vec::loadu(data2 + d);
218 Vec data3_vec = Vec::loadu(data3 + d);
223 Vec data_vec = Vec::loadu(data + d, size - d);
224 Vec data2_vec = Vec::loadu(data2 + d, size - d);
225 Vec data3_vec = Vec::loadu(data3 + d, size - d);
227 acc_vec = Vec::set(acc_vec, red_fun(acc_vec, data_vec), size - d);
238 using Vec = vec::Vectorized<scalar_t>;
240 for (; d < size - (size % Vec::size()); d += Vec::size()) {
241 Vec output_vec = vec_fun(Vec::loadu(input_data + d));
245 Vec output_vec = vec_fun(Vec::loadu(input_data + d, size - d));
257 using Vec = vec::Vectorized<scalar_t>;
259 for (; d < size - (size % Vec::size()); d += Vec::size()) {
260 Vec data_vec = Vec::loadu(input_data + d);
261 Vec data_vec2 = Vec::loadu(input_data2 + d);
262 Vec output_vec = vec_fun(data_vec, data_vec2);
266 Vec data_vec = Vec::loadu(input_data + d, size - d);
267 Vec data_vec2 = Vec::loadu(input_data2 + d, size - d);
268 Vec output_vec = vec_fun(data_vec, data_vec2);
281 using Vec = vec::Vectorized<scalar_t>;
283 for (; d < size - (size % Vec::size()); d += Vec::size()) {
284 Vec data_vec1 = Vec::loadu(input_data1 + d);
285 Vec data_vec2 = Vec::loadu(input_data2 + d);
286 Vec data_vec3 = Vec::loadu(input_data3 + d);
287 Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3);
291 Vec data_vec1 = Vec::loadu(input_data1 + d, size - d);
292 Vec data_vec2 = Vec::loadu(input_data2 + d, size - d);
293 Vec data_vec3 = Vec::loadu(input_data3 + d, size - d);
294 Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3);
308 using Vec = vec::Vectorized<scalar_t>;
310 for (; d < size - (size % Vec::size()); d += Vec::size()) {
311 Vec data_vec1 = Vec::loadu(input_data1 + d);
312 Vec data_vec2 = Vec::loadu(input_data2 + d);
313 Vec data_vec3 = Vec::loadu(input_data3 + d);
314 Vec data_vec4 = Vec::loadu(input_data4 + d);
315 Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3, data_vec4);
319 Vec data_vec1 = Vec::loadu(input_data1 + d, size - d);
320 Vec data_vec2 = Vec::loadu(input_data2 + d, size - d);
321 Vec data_vec3 = Vec::loadu(input_data3 + d, size - d);
322 Vec data_vec4 = Vec::loadu(input_data4 + d, size - d);
323 Vec output_vec = vec_fun(data_vec1, data_vec2, data_vec3, data_vec4);
344 using Vec = vec::Vectorized<scalar_t>;
356 for (; inner_idx < inner_size - (inner_size % Vec::size()); inner_idx += Vec::size()) {
357 Vec data_vec = Vec::loadu(lhs_outer_2 + inner_idx);
358 Vec data_vec2 = Vec::loadu(rhs_outer + inner_idx);
359 Vec output_vec = vec_fun(data_vec, data_vec2);
363 Vec data_vec = Vec::loadu(lhs_outer_2 + inner_idx, inner_size - inner_idx);
364 Vec data_vec2 = Vec::loadu(rhs_outer + inner_idx, inner_size - inner_idx);
365 Vec output_vec = vec_fun(data_vec, data_vec2);
401 using Vec = vec::Vectorized<scalar_t>;
407 Vec data_vec2 = Vec(rhs[outer_idx]);
408 for (; inner_idx < broadcast_size - (broadcast_size % Vec::size()); inner_idx += Vec::size()) {
409 Vec data_vec = Vec::loadu(lhs_outer + inner_idx);
410 Vec output_vec = vec_fun(data_vec, data_vec2);
414 Vec data_vec = Vec::loadu(lhs_outer + inner_idx, broadcast_size - inner_idx);
415 Vec output_vec = vec_fun(data_vec, data_vec2);
421 } // namespace vec