Lines Matching refs:sum
34 __m128i sum, t; in inner_product_gint16_full_1_sse2() local
36 sum = _mm_setzero_si128 (); in inner_product_gint16_full_1_sse2()
40 sum = in inner_product_gint16_full_1_sse2()
41 _mm_add_epi32 (sum, _mm_madd_epi16 (t, in inner_product_gint16_full_1_sse2()
45 sum = in inner_product_gint16_full_1_sse2()
46 _mm_add_epi32 (sum, _mm_madd_epi16 (t, in inner_product_gint16_full_1_sse2()
49 sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3))); in inner_product_gint16_full_1_sse2()
50 sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1))); in inner_product_gint16_full_1_sse2()
52 sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); in inner_product_gint16_full_1_sse2()
53 sum = _mm_srai_epi32 (sum, PRECISION_S16); in inner_product_gint16_full_1_sse2()
54 sum = _mm_packs_epi32 (sum, sum); in inner_product_gint16_full_1_sse2()
55 *o = _mm_extract_epi16 (sum, 0); in inner_product_gint16_full_1_sse2()
63 __m128i sum[2], t; in inner_product_gint16_linear_1_sse2() local
69 sum[0] = sum[1] = _mm_setzero_si128 (); in inner_product_gint16_linear_1_sse2()
70 f = _mm_unpacklo_epi16 (f, sum[0]); in inner_product_gint16_linear_1_sse2()
74 sum[0] = in inner_product_gint16_linear_1_sse2()
75 _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, in inner_product_gint16_linear_1_sse2()
77 sum[1] = in inner_product_gint16_linear_1_sse2()
78 _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, in inner_product_gint16_linear_1_sse2()
82 sum[0] = in inner_product_gint16_linear_1_sse2()
83 _mm_add_epi32 (sum[0], _mm_madd_epi16 (t, in inner_product_gint16_linear_1_sse2()
85 sum[1] = in inner_product_gint16_linear_1_sse2()
86 _mm_add_epi32 (sum[1], _mm_madd_epi16 (t, in inner_product_gint16_linear_1_sse2()
89 sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); in inner_product_gint16_linear_1_sse2()
90 sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16); in inner_product_gint16_linear_1_sse2()
92 sum[0] = in inner_product_gint16_linear_1_sse2()
93 _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0))); in inner_product_gint16_linear_1_sse2()
94 sum[1] = in inner_product_gint16_linear_1_sse2()
95 _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1))); in inner_product_gint16_linear_1_sse2()
96 sum[0] = _mm_add_epi32 (sum[0], sum[1]); in inner_product_gint16_linear_1_sse2()
98 sum[0] = in inner_product_gint16_linear_1_sse2()
99 _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, in inner_product_gint16_linear_1_sse2()
101 sum[0] = in inner_product_gint16_linear_1_sse2()
102 _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, in inner_product_gint16_linear_1_sse2()
105 sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); in inner_product_gint16_linear_1_sse2()
106 sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); in inner_product_gint16_linear_1_sse2()
107 sum[0] = _mm_packs_epi32 (sum[0], sum[0]); in inner_product_gint16_linear_1_sse2()
108 *o = _mm_extract_epi16 (sum[0], 0); in inner_product_gint16_linear_1_sse2()
116 __m128i sum[4], t[4]; in inner_product_gint16_cubic_1_sse2() local
124 sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 (); in inner_product_gint16_cubic_1_sse2()
125 f = _mm_unpacklo_epi16 (f, sum[0]); in inner_product_gint16_cubic_1_sse2()
129 sum[0] = in inner_product_gint16_cubic_1_sse2()
130 _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0], in inner_product_gint16_cubic_1_sse2()
132 sum[1] = in inner_product_gint16_cubic_1_sse2()
133 _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0], in inner_product_gint16_cubic_1_sse2()
135 sum[2] = in inner_product_gint16_cubic_1_sse2()
136 _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0], in inner_product_gint16_cubic_1_sse2()
138 sum[3] = in inner_product_gint16_cubic_1_sse2()
139 _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0], in inner_product_gint16_cubic_1_sse2()
142 t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]); in inner_product_gint16_cubic_1_sse2()
143 t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]); in inner_product_gint16_cubic_1_sse2()
144 t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]); in inner_product_gint16_cubic_1_sse2()
145 t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]); in inner_product_gint16_cubic_1_sse2()
147 sum[0] = in inner_product_gint16_cubic_1_sse2()
150 sum[2] = in inner_product_gint16_cubic_1_sse2()
153 sum[0] = _mm_add_epi32 (sum[0], sum[2]); in inner_product_gint16_cubic_1_sse2()
155 sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); in inner_product_gint16_cubic_1_sse2()
156 sum[0] = _mm_madd_epi16 (sum[0], f); in inner_product_gint16_cubic_1_sse2()
158 sum[0] = in inner_product_gint16_cubic_1_sse2()
159 _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, in inner_product_gint16_cubic_1_sse2()
161 sum[0] = in inner_product_gint16_cubic_1_sse2()
162 _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, in inner_product_gint16_cubic_1_sse2()
165 sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1))); in inner_product_gint16_cubic_1_sse2()
166 sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16); in inner_product_gint16_cubic_1_sse2()
167 sum[0] = _mm_packs_epi32 (sum[0], sum[0]); in inner_product_gint16_cubic_1_sse2()
168 *o = _mm_extract_epi16 (sum[0], 0); in inner_product_gint16_cubic_1_sse2()
176 __m128d sum = _mm_setzero_pd (); in inner_product_gdouble_full_1_sse2() local
179 sum = in inner_product_gdouble_full_1_sse2()
180 _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 0), in inner_product_gdouble_full_1_sse2()
182 sum = in inner_product_gdouble_full_1_sse2()
183 _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 2), in inner_product_gdouble_full_1_sse2()
185 sum = in inner_product_gdouble_full_1_sse2()
186 _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 4), in inner_product_gdouble_full_1_sse2()
188 sum = in inner_product_gdouble_full_1_sse2()
189 _mm_add_pd (sum, _mm_mul_pd (_mm_loadu_pd (a + i + 6), in inner_product_gdouble_full_1_sse2()
192 sum = _mm_add_sd (sum, _mm_unpackhi_pd (sum, sum)); in inner_product_gdouble_full_1_sse2()
193 _mm_store_sd (o, sum); in inner_product_gdouble_full_1_sse2()
201 __m128d sum[2], t; in inner_product_gdouble_linear_1_sse2() local
206 sum[0] = sum[1] = _mm_setzero_pd (); in inner_product_gdouble_linear_1_sse2()
210 sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 0))); in inner_product_gdouble_linear_1_sse2()
211 sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 0))); in inner_product_gdouble_linear_1_sse2()
213 sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i + 2))); in inner_product_gdouble_linear_1_sse2()
214 sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i + 2))); in inner_product_gdouble_linear_1_sse2()
216 sum[0] = _mm_mul_pd (_mm_sub_pd (sum[0], sum[1]), _mm_load1_pd (icoeff)); in inner_product_gdouble_linear_1_sse2()
217 sum[0] = _mm_add_pd (sum[0], sum[1]); in inner_product_gdouble_linear_1_sse2()
218 sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0])); in inner_product_gdouble_linear_1_sse2()
219 _mm_store_sd (o, sum[0]); in inner_product_gdouble_linear_1_sse2()
227 __m128d f[2], sum[4], t; in inner_product_gdouble_cubic_1_sse2() local
236 sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd (); in inner_product_gdouble_cubic_1_sse2()
240 sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i))); in inner_product_gdouble_cubic_1_sse2()
241 sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i))); in inner_product_gdouble_cubic_1_sse2()
242 sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i))); in inner_product_gdouble_cubic_1_sse2()
243 sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i))); in inner_product_gdouble_cubic_1_sse2()
245 sum[0] = in inner_product_gdouble_cubic_1_sse2()
246 _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0))); in inner_product_gdouble_cubic_1_sse2()
247 sum[1] = in inner_product_gdouble_cubic_1_sse2()
248 _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1))); in inner_product_gdouble_cubic_1_sse2()
249 sum[2] = in inner_product_gdouble_cubic_1_sse2()
250 _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0))); in inner_product_gdouble_cubic_1_sse2()
251 sum[3] = in inner_product_gdouble_cubic_1_sse2()
252 _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1))); in inner_product_gdouble_cubic_1_sse2()
253 sum[0] = _mm_add_pd (sum[0], sum[1]); in inner_product_gdouble_cubic_1_sse2()
254 sum[2] = _mm_add_pd (sum[2], sum[3]); in inner_product_gdouble_cubic_1_sse2()
255 sum[0] = _mm_add_pd (sum[0], sum[2]); in inner_product_gdouble_cubic_1_sse2()
256 sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0])); in inner_product_gdouble_cubic_1_sse2()
257 _mm_store_sd (o, sum[0]); in inner_product_gdouble_cubic_1_sse2()