// Copyright 2020 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <ios>
#include <limits>
#include <vector>

#include <gtest/gtest.h>

#include <fp16.h>

#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/math-stubs.h>

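// Number of single-precision elements processed per call to the kernel under
// test; each call receives kBlockSize * sizeof(float) bytes of input.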
constexpr int kBlockSize = 1024;

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
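  // Each kernel below is validated against std::nearbyint (round to nearest,
  // ties to even, in the default rounding mode):
  //  - *_normal tests sweep every bit pattern with |x| < 2^24 (0x4B800000),
  //    where rounding to an integer is non-trivial;
  //  - *_integral tests sweep every finite bit pattern with |x| >= 2^24,
  //    which is already integral;
  //  - *_qnan / *_snan tests sweep the quiet and signaling NaN encodings.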
  TEST(ROUNDNE__SSE_ADDSUB, positive_normal) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE_ADDSUB, negative_normal) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE_ADDSUB, positive_integral) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE_ADDSUB, negative_integral) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE_ADDSUB, positive_infinity) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
    xnn_math_f32_roundne__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(ROUNDNE__SSE_ADDSUB, negative_infinity) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
    xnn_math_f32_roundne__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(ROUNDNE__SSE_ADDSUB, positive_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE_ADDSUB, negative_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
      }
      xnn_math_f32_roundne__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

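  // The *_snan tests compare NaN results with bit 22 (the quiet bit) masked
  // off via 0xFFBFFFFF, so a kernel may return either the original signaling
  // NaN or its quieted form.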
  TEST(ROUNDNE__SSE_ADDSUB, positive_snan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE_ADDSUB, negative_snan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE_ADDSUB, positive_snan_to_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE_ADDSUB, negative_snan_to_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(ROUNDNE__SSE2_CVT, positive_normal) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE2_CVT, negative_normal) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE2_CVT, positive_integral) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE2_CVT, negative_integral) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE2_CVT, positive_infinity) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
    xnn_math_f32_roundne__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(ROUNDNE__SSE2_CVT, negative_infinity) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
    xnn_math_f32_roundne__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(ROUNDNE__SSE2_CVT, positive_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE2_CVT, negative_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
      }
      xnn_math_f32_roundne__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE2_CVT, positive_snan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE2_CVT, negative_snan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

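  // The snan_to_qnan tests are disabled for this variant, presumably because
  // the CVT-based implementation does not guarantee that signaling NaNs are
  // quieted on the way through.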
  TEST(ROUNDNE__SSE2_CVT, DISABLED_positive_snan_to_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE2_CVT, DISABLED_negative_snan_to_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

#if XNN_ARCH_X86 || XNN_ARCH_X86_64
  TEST(ROUNDNE__SSE41, positive_normal) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE41, negative_normal) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE41, positive_integral) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE41, negative_integral) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE41, positive_infinity) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
    xnn_math_f32_roundne__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(ROUNDNE__SSE41, negative_infinity) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
    xnn_math_f32_roundne__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(ROUNDNE__SSE41, positive_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE41, negative_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
      }
      xnn_math_f32_roundne__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE41, positive_snan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE41, negative_snan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE41, positive_snan_to_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__SSE41, negative_snan_to_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
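  // ARM NEON variants, exercised with the same exhaustive bit-pattern sweeps
  // as the x86 kernels above.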
  TEST(ROUNDNE__NEON_ADDSUB, positive_normal) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__NEON_ADDSUB, negative_normal) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__NEON_ADDSUB, positive_integral) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__NEON_ADDSUB, negative_integral) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__NEON_ADDSUB, positive_infinity) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
    xnn_math_f32_roundne__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(ROUNDNE__NEON_ADDSUB, negative_infinity) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
    xnn_math_f32_roundne__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
    ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
      << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
      << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
      << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
  }

  TEST(ROUNDNE__NEON_ADDSUB, positive_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(n + i);
      }
      xnn_math_f32_roundne__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__NEON_ADDSUB, negative_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
      }
      xnn_math_f32_roundne__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__NEON_ADDSUB, positive_snan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__NEON_ADDSUB, negative_snan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__NEON_ADDSUB, positive_snan_to_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }

  TEST(ROUNDNE__NEON_ADDSUB, negative_snan_to_qnan) {
    std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
    std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
    for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
      for (uint32_t i = 0; i < kBlockSize; i++) {
        inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
      }
      xnn_math_f32_roundne__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
      for (uint32_t i = 0; i < kBlockSize; i++) {
        const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
        ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
          << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
          << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
          << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
      }
    }
  }
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64

#if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(ROUNDNE__NEONV8,positive_normal)851   TEST(ROUNDNE__NEONV8, positive_normal) {
852     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
853     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
854     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
855       for (uint32_t i = 0; i < kBlockSize; i++) {
856         inputs[i] = fp32_from_bits(n + i);
857       }
858       xnn_math_f32_roundne__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
859       for (uint32_t i = 0; i < kBlockSize; i++) {
860         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
861         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
862           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
863           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
864           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
865       }
866     }
867   }
868 
TEST(ROUNDNE__NEONV8,negative_normal)869   TEST(ROUNDNE__NEONV8, negative_normal) {
870     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
871     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
872     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
873       for (uint32_t i = 0; i < kBlockSize; i++) {
874         inputs[i] = fp32_from_bits(n + i);
875       }
876       xnn_math_f32_roundne__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
877       for (uint32_t i = 0; i < kBlockSize; i++) {
878         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
879         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
880           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
881           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
882           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
883       }
884     }
885   }
886 
TEST(ROUNDNE__NEONV8,positive_integral)887   TEST(ROUNDNE__NEONV8, positive_integral) {
888     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
889     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
890     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
891       for (uint32_t i = 0; i < kBlockSize; i++) {
892         inputs[i] = fp32_from_bits(n + i);
893       }
894       xnn_math_f32_roundne__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
895       for (uint32_t i = 0; i < kBlockSize; i++) {
896         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
897         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
898           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
899           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
900           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
901       }
902     }
903   }
904 
905   TEST(ROUNDNE__NEONV8, negative_integral) {
906     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
907     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
908     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
909       for (uint32_t i = 0; i < kBlockSize; i++) {
910         inputs[i] = fp32_from_bits(n + i);
911       }
912       xnn_math_f32_roundne__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
913       for (uint32_t i = 0; i < kBlockSize; i++) {
914         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
915         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
916           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
917           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
918           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
919       }
920     }
921   }
922 
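  // Infinities are expected to pass through the rounding kernel unmodified.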
923   TEST(ROUNDNE__NEONV8, positive_infinity) {
924     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
925     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
926     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
927     xnn_math_f32_roundne__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
928     const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
929     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
930       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
931       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
932       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
933   }
934 
935   TEST(ROUNDNE__NEONV8, negative_infinity) {
936     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
937     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
938     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
939     xnn_math_f32_roundne__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
940     const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
941     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
942       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
943       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
944       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
945   }
946 
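  // Bit patterns 0x7FC00000..0x7FFFFFFF encode quiet NaNs; the assertion requires
  // the kernel to produce exactly the same bits as std::nearbyint on these inputs.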
947   TEST(ROUNDNE__NEONV8, positive_qnan) {
948     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
949     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
950     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
951       for (uint32_t i = 0; i < kBlockSize; i++) {
952         inputs[i] = fp32_from_bits(n + i);
953       }
954       xnn_math_f32_roundne__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
955       for (uint32_t i = 0; i < kBlockSize; i++) {
956         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
957         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
958           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
959           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
960           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
961       }
962     }
963   }
964 
965   TEST(ROUNDNE__NEONV8, negative_qnan) {
966     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
967     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
968     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
969       for (uint32_t i = 0; i < kBlockSize; i++) {
970         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
971       }
972       xnn_math_f32_roundne__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
973       for (uint32_t i = 0; i < kBlockSize; i++) {
974         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
975         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
976           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
977           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
978           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
979       }
980     }
981   }
982 
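  // Bit patterns 0x7F800001..0x7FBFFFFF encode signaling NaNs. Masking both sides
  // with 0xFFBFFFFF clears bit 22 (the quiet bit), so the test tolerates either the
  // kernel or the reference quietening the NaN, while the sign and the remaining
  // payload bits must still match.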
983   TEST(ROUNDNE__NEONV8, positive_snan) {
984     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
985     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
986     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
987       for (uint32_t i = 0; i < kBlockSize; i++) {
988         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
989       }
990       xnn_math_f32_roundne__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
991       for (uint32_t i = 0; i < kBlockSize; i++) {
992         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
993         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
994           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
995           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
996           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
997       }
998     }
999   }
1000 
1001   TEST(ROUNDNE__NEONV8, negative_snan) {
1002     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1003     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1004     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1005       for (uint32_t i = 0; i < kBlockSize; i++) {
1006         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1007       }
1008       xnn_math_f32_roundne__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1009       for (uint32_t i = 0; i < kBlockSize; i++) {
1010         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1011         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1012           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1013           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1014           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1015       }
1016     }
1017   }
1018 
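  // Unlike the *_snan tests above, the *_snan_to_qnan tests require bit-exact
  // agreement with std::nearbyint, i.e. the kernel must quieten signaling NaNs in
  // exactly the same way as the reference does.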
1019   TEST(ROUNDNE__NEONV8, positive_snan_to_qnan) {
1020     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1021     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1022     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1023       for (uint32_t i = 0; i < kBlockSize; i++) {
1024         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1025       }
1026       xnn_math_f32_roundne__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1027       for (uint32_t i = 0; i < kBlockSize; i++) {
1028         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1029         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1030           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1031           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1032           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1033       }
1034     }
1035   }
1036 
1037   TEST(ROUNDNE__NEONV8, negative_snan_to_qnan) {
1038     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1039     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1040     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1041       for (uint32_t i = 0; i < kBlockSize; i++) {
1042         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1043       }
1044       xnn_math_f32_roundne__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1045       for (uint32_t i = 0; i < kBlockSize; i++) {
1046         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1047         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1048           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1049           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1050           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1051       }
1052     }
1053   }
1054 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1055 
1056 #if XNN_ARCH_WASMSIMD
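  // The ROUNDNE__WASMSIMD_ADDSUB tests cover xnn_math_f32_roundne__wasmsimd_addsub.
  // Judging by the "addsub" suffix, the kernel presumably rounds by adding and then
  // subtracting 2^23 instead of using a dedicated rounding instruction; the test
  // cases themselves mirror the NEON variants above.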
1057   TEST(ROUNDNE__WASMSIMD_ADDSUB, positive_normal) {
1058     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1059     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1060     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
1061       for (uint32_t i = 0; i < kBlockSize; i++) {
1062         inputs[i] = fp32_from_bits(n + i);
1063       }
1064       xnn_math_f32_roundne__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1065       for (uint32_t i = 0; i < kBlockSize; i++) {
1066         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1067         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1068           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1069           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1070           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1071       }
1072     }
1073   }
1074 
1075   TEST(ROUNDNE__WASMSIMD_ADDSUB, negative_normal) {
1076     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1077     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1078     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
1079       for (uint32_t i = 0; i < kBlockSize; i++) {
1080         inputs[i] = fp32_from_bits(n + i);
1081       }
1082       xnn_math_f32_roundne__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1083       for (uint32_t i = 0; i < kBlockSize; i++) {
1084         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1085         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1086           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1087           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1088           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1089       }
1090     }
1091   }
1092 
1093   TEST(ROUNDNE__WASMSIMD_ADDSUB, positive_integral) {
1094     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1095     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1096     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
1097       for (uint32_t i = 0; i < kBlockSize; i++) {
1098         inputs[i] = fp32_from_bits(n + i);
1099       }
1100       xnn_math_f32_roundne__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1101       for (uint32_t i = 0; i < kBlockSize; i++) {
1102         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1103         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1104           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1105           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1106           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1107       }
1108     }
1109   }
1110 
1111   TEST(ROUNDNE__WASMSIMD_ADDSUB, negative_integral) {
1112     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1113     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1114     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
1115       for (uint32_t i = 0; i < kBlockSize; i++) {
1116         inputs[i] = fp32_from_bits(n + i);
1117       }
1118       xnn_math_f32_roundne__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1119       for (uint32_t i = 0; i < kBlockSize; i++) {
1120         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1121         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1122           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1123           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1124           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1125       }
1126     }
1127   }
1128 
1129   TEST(ROUNDNE__WASMSIMD_ADDSUB, positive_infinity) {
1130     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1131     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1132     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
1133     xnn_math_f32_roundne__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1134     const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
1135     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1136       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1137       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1138       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1139   }
1140 
1141   TEST(ROUNDNE__WASMSIMD_ADDSUB, negative_infinity) {
1142     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1143     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1144     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
1145     xnn_math_f32_roundne__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1146     const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
1147     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1148       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1149       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1150       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1151   }
1152 
1153   TEST(ROUNDNE__WASMSIMD_ADDSUB, positive_qnan) {
1154     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1155     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1156     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1157       for (uint32_t i = 0; i < kBlockSize; i++) {
1158         inputs[i] = fp32_from_bits(n + i);
1159       }
1160       xnn_math_f32_roundne__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1161       for (uint32_t i = 0; i < kBlockSize; i++) {
1162         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1163         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1164           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1165           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1166           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1167       }
1168     }
1169   }
1170 
1171   TEST(ROUNDNE__WASMSIMD_ADDSUB, negative_qnan) {
1172     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1173     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1174     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1175       for (uint32_t i = 0; i < kBlockSize; i++) {
1176         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
1177       }
1178       xnn_math_f32_roundne__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1179       for (uint32_t i = 0; i < kBlockSize; i++) {
1180         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1181         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1182           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1183           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1184           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1185       }
1186     }
1187   }
1188 
1189   TEST(ROUNDNE__WASMSIMD_ADDSUB, positive_snan) {
1190     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1191     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1192     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1193       for (uint32_t i = 0; i < kBlockSize; i++) {
1194         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1195       }
1196       xnn_math_f32_roundne__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1197       for (uint32_t i = 0; i < kBlockSize; i++) {
1198         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1199         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1200           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1201           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1202           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1203       }
1204     }
1205   }
1206 
1207   TEST(ROUNDNE__WASMSIMD_ADDSUB, negative_snan) {
1208     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1209     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1210     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1211       for (uint32_t i = 0; i < kBlockSize; i++) {
1212         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1213       }
1214       xnn_math_f32_roundne__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1215       for (uint32_t i = 0; i < kBlockSize; i++) {
1216         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1217         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1218           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1219           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1220           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1221       }
1222     }
1223   }
1224 
1225   TEST(ROUNDNE__WASMSIMD_ADDSUB, positive_snan_to_qnan) {
1226     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1227     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1228     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1229       for (uint32_t i = 0; i < kBlockSize; i++) {
1230         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1231       }
1232       xnn_math_f32_roundne__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1233       for (uint32_t i = 0; i < kBlockSize; i++) {
1234         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1235         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1236           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1237           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1238           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1239       }
1240     }
1241   }
1242 
1243   TEST(ROUNDNE__WASMSIMD_ADDSUB, negative_snan_to_qnan) {
1244     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1245     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1246     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1247       for (uint32_t i = 0; i < kBlockSize; i++) {
1248         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1249       }
1250       xnn_math_f32_roundne__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1251       for (uint32_t i = 0; i < kBlockSize; i++) {
1252         const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1253         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1254           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1255           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1256           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1257       }
1258     }
1259   }
1260 #endif  // XNN_ARCH_WASMSIMD
1261 
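// The ROUNDNE__SCALAR_ADDSUB tests cover the portable scalar kernel. As a rough,
// assumption-based sketch (not the actual XNNPACK implementation), an "addsub"
// style round-to-nearest-even can be written as below: adding and then subtracting
// 2^23 makes the FPU discard the fractional bits under the default
// round-to-nearest-even mode. The helper below is illustrative only and is not
// referenced by the tests.
static inline float roundne_addsub_sketch(float x) {
  const float magic = 8388608.0f;  // 2^23: at this magnitude the float spacing is exactly 1.0
  if (std::fabs(x) >= magic) {
    return x;  // already integral, or infinite/NaN: pass through
  }
  const float rounded = (std::fabs(x) + magic) - magic;  // rounds to nearest, ties to even
  return std::copysign(rounded, x);  // restore the sign, preserving -0.0f
}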
1262 TEST(ROUNDNE__SCALAR_ADDSUB, positive_normal) {
1263   std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1264   std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1265   for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
1266     for (uint32_t i = 0; i < kBlockSize; i++) {
1267       inputs[i] = fp32_from_bits(n + i);
1268     }
1269     xnn_math_f32_roundne__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1270     for (uint32_t i = 0; i < kBlockSize; i++) {
1271       const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1272       ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1273         << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1274         << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1275         << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1276     }
1277   }
1278 }
1279 
1280 TEST(ROUNDNE__SCALAR_ADDSUB, negative_normal) {
1281   std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1282   std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1283   for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
1284     for (uint32_t i = 0; i < kBlockSize; i++) {
1285       inputs[i] = fp32_from_bits(n + i);
1286     }
1287     xnn_math_f32_roundne__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1288     for (uint32_t i = 0; i < kBlockSize; i++) {
1289       const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1290       ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1291         << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1292         << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1293         << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1294     }
1295   }
1296 }
1297 
1298 TEST(ROUNDNE__SCALAR_ADDSUB, positive_integral) {
1299   std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1300   std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1301   for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
1302     for (uint32_t i = 0; i < kBlockSize; i++) {
1303       inputs[i] = fp32_from_bits(n + i);
1304     }
1305     xnn_math_f32_roundne__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1306     for (uint32_t i = 0; i < kBlockSize; i++) {
1307       const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1308       ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1309         << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1310         << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1311         << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1312     }
1313   }
1314 }
1315 
1316 TEST(ROUNDNE__SCALAR_ADDSUB, negative_integral) {
1317   std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1318   std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1319   for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
1320     for (uint32_t i = 0; i < kBlockSize; i++) {
1321       inputs[i] = fp32_from_bits(n + i);
1322     }
1323     xnn_math_f32_roundne__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1324     for (uint32_t i = 0; i < kBlockSize; i++) {
1325       const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1326       ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1327         << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1328         << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1329         << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1330     }
1331   }
1332 }
1333 
1334 TEST(ROUNDNE__SCALAR_ADDSUB, positive_infinity) {
1335   std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1336   std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1337   std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
1338   xnn_math_f32_roundne__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1339   const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
1340   ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1341     << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1342     << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1343     << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1344 }
1345 
1346 TEST(ROUNDNE__SCALAR_ADDSUB, negative_infinity) {
1347   std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1348   std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1349   std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
1350   xnn_math_f32_roundne__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1351   const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[0]));
1352   ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1353     << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1354     << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1355     << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1356 }
1357 
1358 TEST(ROUNDNE__SCALAR_ADDSUB, positive_qnan) {
1359   std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1360   std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1361   for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1362     for (uint32_t i = 0; i < kBlockSize; i++) {
1363       inputs[i] = fp32_from_bits(n + i);
1364     }
1365     xnn_math_f32_roundne__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1366     for (uint32_t i = 0; i < kBlockSize; i++) {
1367       const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1368       ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1369         << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1370         << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1371         << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1372     }
1373   }
1374 }
1375 
1376 TEST(ROUNDNE__SCALAR_ADDSUB, negative_qnan) {
1377   std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1378   std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1379   for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1380     for (uint32_t i = 0; i < kBlockSize; i++) {
1381       inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
1382     }
1383     xnn_math_f32_roundne__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1384     for (uint32_t i = 0; i < kBlockSize; i++) {
1385       const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1386       ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1387         << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1388         << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1389         << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1390     }
1391   }
1392 }
1393 
1394 TEST(ROUNDNE__SCALAR_ADDSUB, positive_snan) {
1395   std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1396   std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1397   for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1398     for (uint32_t i = 0; i < kBlockSize; i++) {
1399       inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1400     }
1401     xnn_math_f32_roundne__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1402     for (uint32_t i = 0; i < kBlockSize; i++) {
1403       const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1404       ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1405         << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1406         << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1407         << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1408     }
1409   }
1410 }
1411 
1412 TEST(ROUNDNE__SCALAR_ADDSUB, negative_snan) {
1413   std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1414   std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1415   for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1416     for (uint32_t i = 0; i < kBlockSize; i++) {
1417       inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1418     }
1419     xnn_math_f32_roundne__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1420     for (uint32_t i = 0; i < kBlockSize; i++) {
1421       const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1422       ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1423         << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1424         << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1425         << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1426     }
1427   }
1428 }
1429 
1430 TEST(ROUNDNE__SCALAR_ADDSUB, positive_snan_to_qnan) {
1431   std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1432   std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1433   for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1434     for (uint32_t i = 0; i < kBlockSize; i++) {
1435       inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1436     }
1437     xnn_math_f32_roundne__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1438     for (uint32_t i = 0; i < kBlockSize; i++) {
1439       const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1440       ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1441         << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1442         << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1443         << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1444     }
1445   }
1446 }
1447 
1448 TEST(ROUNDNE__SCALAR_ADDSUB, negative_snan_to_qnan) {
1449   std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1450   std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1451   for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1452     for (uint32_t i = 0; i < kBlockSize; i++) {
1453       inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1454     }
1455     xnn_math_f32_roundne__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1456     for (uint32_t i = 0; i < kBlockSize; i++) {
1457       const uint32_t reference_output = fp32_to_bits(std::nearbyint(inputs[i]));
1458       ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1459         << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1460         << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1461         << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1462     }
1463   }
1464 }
1465