• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2020 Google LLC
2 //
3 // This source code is licensed under the BSD-style license found in the
4 // LICENSE file in the root directory of this source tree.
5 
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iomanip>
#include <ios>
#include <limits>
#include <vector>

#include <gtest/gtest.h>

#include <fp16.h>

#include <xnnpack/AlignedAllocator.h>
#include <xnnpack/common.h>
#include <xnnpack/math-stubs.h>
23 
// Number of float elements handed to the kernel under test per call; also the
// stride used when sweeping ranges of 32-bit input patterns.
constexpr int kBlockSize = 1024;
25 
26 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(ROUNDZ__SSE_ADDSUB,positive_normal)27   TEST(ROUNDZ__SSE_ADDSUB, positive_normal) {
28     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
29     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
30     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
31       for (uint32_t i = 0; i < kBlockSize; i++) {
32         inputs[i] = fp32_from_bits(n + i);
33       }
34       xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
35       for (uint32_t i = 0; i < kBlockSize; i++) {
36         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
37         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
38           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
39           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
40           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
41       }
42     }
43   }
44 
TEST(ROUNDZ__SSE_ADDSUB,negative_normal)45   TEST(ROUNDZ__SSE_ADDSUB, negative_normal) {
46     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
47     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
48     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
49       for (uint32_t i = 0; i < kBlockSize; i++) {
50         inputs[i] = fp32_from_bits(n + i);
51       }
52       xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
53       for (uint32_t i = 0; i < kBlockSize; i++) {
54         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
55         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
56           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
57           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
58           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
59       }
60     }
61   }
62 
TEST(ROUNDZ__SSE_ADDSUB,positive_integral)63   TEST(ROUNDZ__SSE_ADDSUB, positive_integral) {
64     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
65     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
66     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
67       for (uint32_t i = 0; i < kBlockSize; i++) {
68         inputs[i] = fp32_from_bits(n + i);
69       }
70       xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
71       for (uint32_t i = 0; i < kBlockSize; i++) {
72         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
73         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
74           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
75           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
76           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
77       }
78     }
79   }
80 
TEST(ROUNDZ__SSE_ADDSUB,negative_integral)81   TEST(ROUNDZ__SSE_ADDSUB, negative_integral) {
82     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
83     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
84     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
85       for (uint32_t i = 0; i < kBlockSize; i++) {
86         inputs[i] = fp32_from_bits(n + i);
87       }
88       xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
89       for (uint32_t i = 0; i < kBlockSize; i++) {
90         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
91         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
92           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
93           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
94           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
95       }
96     }
97   }
98 
TEST(ROUNDZ__SSE_ADDSUB,positive_infinity)99   TEST(ROUNDZ__SSE_ADDSUB, positive_infinity) {
100     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
101     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
102     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
103     xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
104     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
105     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
106       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
107       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
108       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
109   }
110 
TEST(ROUNDZ__SSE_ADDSUB,negative_infinity)111   TEST(ROUNDZ__SSE_ADDSUB, negative_infinity) {
112     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
113     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
114     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
115     xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
116     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
117     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
118       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
119       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
120       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
121   }
122 
TEST(ROUNDZ__SSE_ADDSUB,positive_qnan)123   TEST(ROUNDZ__SSE_ADDSUB, positive_qnan) {
124     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
125     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
126     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
127       for (uint32_t i = 0; i < kBlockSize; i++) {
128         inputs[i] = fp32_from_bits(n + i);
129       }
130       xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
131       for (uint32_t i = 0; i < kBlockSize; i++) {
132         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
133         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
134           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
135           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
136           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
137       }
138     }
139   }
140 
TEST(ROUNDZ__SSE_ADDSUB,negative_qnan)141   TEST(ROUNDZ__SSE_ADDSUB, negative_qnan) {
142     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
143     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
144     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
145       for (uint32_t i = 0; i < kBlockSize; i++) {
146         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
147       }
148       xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
149       for (uint32_t i = 0; i < kBlockSize; i++) {
150         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
151         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
152           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
153           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
154           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
155       }
156     }
157   }
158 
TEST(ROUNDZ__SSE_ADDSUB,positive_snan)159   TEST(ROUNDZ__SSE_ADDSUB, positive_snan) {
160     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
161     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
162     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
163       for (uint32_t i = 0; i < kBlockSize; i++) {
164         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
165       }
166       xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
167       for (uint32_t i = 0; i < kBlockSize; i++) {
168         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
169         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
170           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
171           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
172           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
173       }
174     }
175   }
176 
TEST(ROUNDZ__SSE_ADDSUB,negative_snan)177   TEST(ROUNDZ__SSE_ADDSUB, negative_snan) {
178     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
179     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
180     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
181       for (uint32_t i = 0; i < kBlockSize; i++) {
182         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
183       }
184       xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
185       for (uint32_t i = 0; i < kBlockSize; i++) {
186         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
187         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
188           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
189           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
190           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
191       }
192     }
193   }
194 
TEST(ROUNDZ__SSE_ADDSUB,positive_snan_to_qnan)195   TEST(ROUNDZ__SSE_ADDSUB, positive_snan_to_qnan) {
196     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
197     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
198     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
199       for (uint32_t i = 0; i < kBlockSize; i++) {
200         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
201       }
202       xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
203       for (uint32_t i = 0; i < kBlockSize; i++) {
204         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
205         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
206           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
207           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
208           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
209       }
210     }
211   }
212 
TEST(ROUNDZ__SSE_ADDSUB,negative_snan_to_qnan)213   TEST(ROUNDZ__SSE_ADDSUB, negative_snan_to_qnan) {
214     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
215     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
216     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
217       for (uint32_t i = 0; i < kBlockSize; i++) {
218         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
219       }
220       xnn_math_f32_roundz__sse_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
221       for (uint32_t i = 0; i < kBlockSize; i++) {
222         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
223         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
224           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
225           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
226           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
227       }
228     }
229   }
230 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
231 
232 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(ROUNDZ__SSE2_CVT,positive_normal)233   TEST(ROUNDZ__SSE2_CVT, positive_normal) {
234     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
235     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
236     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
237       for (uint32_t i = 0; i < kBlockSize; i++) {
238         inputs[i] = fp32_from_bits(n + i);
239       }
240       xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
241       for (uint32_t i = 0; i < kBlockSize; i++) {
242         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
243         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
244           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
245           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
246           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
247       }
248     }
249   }
250 
TEST(ROUNDZ__SSE2_CVT,negative_normal)251   TEST(ROUNDZ__SSE2_CVT, negative_normal) {
252     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
253     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
254     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
255       for (uint32_t i = 0; i < kBlockSize; i++) {
256         inputs[i] = fp32_from_bits(n + i);
257       }
258       xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
259       for (uint32_t i = 0; i < kBlockSize; i++) {
260         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
261         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
262           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
263           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
264           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
265       }
266     }
267   }
268 
TEST(ROUNDZ__SSE2_CVT,positive_integral)269   TEST(ROUNDZ__SSE2_CVT, positive_integral) {
270     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
271     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
272     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
273       for (uint32_t i = 0; i < kBlockSize; i++) {
274         inputs[i] = fp32_from_bits(n + i);
275       }
276       xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
277       for (uint32_t i = 0; i < kBlockSize; i++) {
278         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
279         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
280           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
281           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
282           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
283       }
284     }
285   }
286 
TEST(ROUNDZ__SSE2_CVT,negative_integral)287   TEST(ROUNDZ__SSE2_CVT, negative_integral) {
288     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
289     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
290     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
291       for (uint32_t i = 0; i < kBlockSize; i++) {
292         inputs[i] = fp32_from_bits(n + i);
293       }
294       xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
295       for (uint32_t i = 0; i < kBlockSize; i++) {
296         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
297         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
298           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
299           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
300           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
301       }
302     }
303   }
304 
TEST(ROUNDZ__SSE2_CVT,positive_infinity)305   TEST(ROUNDZ__SSE2_CVT, positive_infinity) {
306     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
307     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
308     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
309     xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
310     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
311     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
312       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
313       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
314       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
315   }
316 
TEST(ROUNDZ__SSE2_CVT,negative_infinity)317   TEST(ROUNDZ__SSE2_CVT, negative_infinity) {
318     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
319     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
320     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
321     xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
322     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
323     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
324       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
325       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
326       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
327   }
328 
TEST(ROUNDZ__SSE2_CVT,positive_qnan)329   TEST(ROUNDZ__SSE2_CVT, positive_qnan) {
330     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
331     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
332     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
333       for (uint32_t i = 0; i < kBlockSize; i++) {
334         inputs[i] = fp32_from_bits(n + i);
335       }
336       xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
337       for (uint32_t i = 0; i < kBlockSize; i++) {
338         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
339         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
340           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
341           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
342           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
343       }
344     }
345   }
346 
TEST(ROUNDZ__SSE2_CVT,negative_qnan)347   TEST(ROUNDZ__SSE2_CVT, negative_qnan) {
348     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
349     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
350     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
351       for (uint32_t i = 0; i < kBlockSize; i++) {
352         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
353       }
354       xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
355       for (uint32_t i = 0; i < kBlockSize; i++) {
356         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
357         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
358           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
359           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
360           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
361       }
362     }
363   }
364 
TEST(ROUNDZ__SSE2_CVT,positive_snan)365   TEST(ROUNDZ__SSE2_CVT, positive_snan) {
366     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
367     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
368     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
369       for (uint32_t i = 0; i < kBlockSize; i++) {
370         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
371       }
372       xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
373       for (uint32_t i = 0; i < kBlockSize; i++) {
374         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
375         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
376           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
377           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
378           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
379       }
380     }
381   }
382 
TEST(ROUNDZ__SSE2_CVT,negative_snan)383   TEST(ROUNDZ__SSE2_CVT, negative_snan) {
384     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
385     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
386     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
387       for (uint32_t i = 0; i < kBlockSize; i++) {
388         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
389       }
390       xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
391       for (uint32_t i = 0; i < kBlockSize; i++) {
392         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
393         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
394           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
395           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
396           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
397       }
398     }
399   }
400 
TEST(ROUNDZ__SSE2_CVT,DISABLED_positive_snan_to_qnan)401   TEST(ROUNDZ__SSE2_CVT, DISABLED_positive_snan_to_qnan) {
402     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
403     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
404     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
405       for (uint32_t i = 0; i < kBlockSize; i++) {
406         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
407       }
408       xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
409       for (uint32_t i = 0; i < kBlockSize; i++) {
410         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
411         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
412           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
413           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
414           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
415       }
416     }
417   }
418 
TEST(ROUNDZ__SSE2_CVT,DISABLED_negative_snan_to_qnan)419   TEST(ROUNDZ__SSE2_CVT, DISABLED_negative_snan_to_qnan) {
420     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
421     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
422     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
423       for (uint32_t i = 0; i < kBlockSize; i++) {
424         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
425       }
426       xnn_math_f32_roundz__sse2_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
427       for (uint32_t i = 0; i < kBlockSize; i++) {
428         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
429         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
430           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
431           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
432           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
433       }
434     }
435   }
436 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
437 
438 #if XNN_ARCH_X86 || XNN_ARCH_X86_64
TEST(ROUNDZ__SSE41,positive_normal)439   TEST(ROUNDZ__SSE41, positive_normal) {
440     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
441     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
442     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
443       for (uint32_t i = 0; i < kBlockSize; i++) {
444         inputs[i] = fp32_from_bits(n + i);
445       }
446       xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
447       for (uint32_t i = 0; i < kBlockSize; i++) {
448         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
449         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
450           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
451           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
452           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
453       }
454     }
455   }
456 
TEST(ROUNDZ__SSE41,negative_normal)457   TEST(ROUNDZ__SSE41, negative_normal) {
458     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
459     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
460     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
461       for (uint32_t i = 0; i < kBlockSize; i++) {
462         inputs[i] = fp32_from_bits(n + i);
463       }
464       xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
465       for (uint32_t i = 0; i < kBlockSize; i++) {
466         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
467         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
468           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
469           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
470           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
471       }
472     }
473   }
474 
TEST(ROUNDZ__SSE41,positive_integral)475   TEST(ROUNDZ__SSE41, positive_integral) {
476     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
477     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
478     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
479       for (uint32_t i = 0; i < kBlockSize; i++) {
480         inputs[i] = fp32_from_bits(n + i);
481       }
482       xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
483       for (uint32_t i = 0; i < kBlockSize; i++) {
484         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
485         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
486           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
487           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
488           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
489       }
490     }
491   }
492 
TEST(ROUNDZ__SSE41,negative_integral)493   TEST(ROUNDZ__SSE41, negative_integral) {
494     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
495     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
496     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
497       for (uint32_t i = 0; i < kBlockSize; i++) {
498         inputs[i] = fp32_from_bits(n + i);
499       }
500       xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
501       for (uint32_t i = 0; i < kBlockSize; i++) {
502         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
503         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
504           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
505           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
506           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
507       }
508     }
509   }
510 
TEST(ROUNDZ__SSE41,positive_infinity)511   TEST(ROUNDZ__SSE41, positive_infinity) {
512     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
513     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
514     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
515     xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
516     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
517     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
518       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
519       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
520       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
521   }
522 
TEST(ROUNDZ__SSE41,negative_infinity)523   TEST(ROUNDZ__SSE41, negative_infinity) {
524     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
525     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
526     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
527     xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
528     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
529     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
530       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
531       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
532       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
533   }
534 
TEST(ROUNDZ__SSE41,positive_qnan)535   TEST(ROUNDZ__SSE41, positive_qnan) {
536     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
537     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
538     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
539       for (uint32_t i = 0; i < kBlockSize; i++) {
540         inputs[i] = fp32_from_bits(n + i);
541       }
542       xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
543       for (uint32_t i = 0; i < kBlockSize; i++) {
544         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
545         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
546           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
547           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
548           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
549       }
550     }
551   }
552 
TEST(ROUNDZ__SSE41,negative_qnan)553   TEST(ROUNDZ__SSE41, negative_qnan) {
554     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
555     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
556     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
557       for (uint32_t i = 0; i < kBlockSize; i++) {
558         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
559       }
560       xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
561       for (uint32_t i = 0; i < kBlockSize; i++) {
562         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
563         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
564           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
565           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
566           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
567       }
568     }
569   }
570 
TEST(ROUNDZ__SSE41,positive_snan)571   TEST(ROUNDZ__SSE41, positive_snan) {
572     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
573     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
574     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
575       for (uint32_t i = 0; i < kBlockSize; i++) {
576         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
577       }
578       xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
579       for (uint32_t i = 0; i < kBlockSize; i++) {
580         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
581         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
582           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
583           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
584           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
585       }
586     }
587   }
588 
TEST(ROUNDZ__SSE41,negative_snan)589   TEST(ROUNDZ__SSE41, negative_snan) {
590     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
591     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
592     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
593       for (uint32_t i = 0; i < kBlockSize; i++) {
594         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
595       }
596       xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
597       for (uint32_t i = 0; i < kBlockSize; i++) {
598         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
599         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
600           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
601           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
602           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
603       }
604     }
605   }
606 
TEST(ROUNDZ__SSE41,positive_snan_to_qnan)607   TEST(ROUNDZ__SSE41, positive_snan_to_qnan) {
608     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
609     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
610     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
611       for (uint32_t i = 0; i < kBlockSize; i++) {
612         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
613       }
614       xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
615       for (uint32_t i = 0; i < kBlockSize; i++) {
616         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
617         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
618           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
619           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
620           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
621       }
622     }
623   }
624 
TEST(ROUNDZ__SSE41,negative_snan_to_qnan)625   TEST(ROUNDZ__SSE41, negative_snan_to_qnan) {
626     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
627     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
628     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
629       for (uint32_t i = 0; i < kBlockSize; i++) {
630         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
631       }
632       xnn_math_f32_roundz__sse41(kBlockSize * sizeof(float), inputs.data(), outputs.data());
633       for (uint32_t i = 0; i < kBlockSize; i++) {
634         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
635         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
636           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
637           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
638           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
639       }
640     }
641   }
642 #endif  // XNN_ARCH_X86 || XNN_ARCH_X86_64
643 
644 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(ROUNDZ__NEON_ADDSUB,positive_normal)645   TEST(ROUNDZ__NEON_ADDSUB, positive_normal) {
646     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
647     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
648     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
649       for (uint32_t i = 0; i < kBlockSize; i++) {
650         inputs[i] = fp32_from_bits(n + i);
651       }
652       xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
653       for (uint32_t i = 0; i < kBlockSize; i++) {
654         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
655         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
656           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
657           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
658           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
659       }
660     }
661   }
662 
TEST(ROUNDZ__NEON_ADDSUB,negative_normal)663   TEST(ROUNDZ__NEON_ADDSUB, negative_normal) {
664     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
665     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
666     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
667       for (uint32_t i = 0; i < kBlockSize; i++) {
668         inputs[i] = fp32_from_bits(n + i);
669       }
670       xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
671       for (uint32_t i = 0; i < kBlockSize; i++) {
672         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
673         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
674           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
675           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
676           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
677       }
678     }
679   }
680 
TEST(ROUNDZ__NEON_ADDSUB,positive_integral)681   TEST(ROUNDZ__NEON_ADDSUB, positive_integral) {
682     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
683     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
684     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
685       for (uint32_t i = 0; i < kBlockSize; i++) {
686         inputs[i] = fp32_from_bits(n + i);
687       }
688       xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
689       for (uint32_t i = 0; i < kBlockSize; i++) {
690         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
691         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
692           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
693           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
694           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
695       }
696     }
697   }
698 
TEST(ROUNDZ__NEON_ADDSUB,negative_integral)699   TEST(ROUNDZ__NEON_ADDSUB, negative_integral) {
700     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
701     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
702     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
703       for (uint32_t i = 0; i < kBlockSize; i++) {
704         inputs[i] = fp32_from_bits(n + i);
705       }
706       xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
707       for (uint32_t i = 0; i < kBlockSize; i++) {
708         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
709         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
710           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
711           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
712           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
713       }
714     }
715   }
716 
TEST(ROUNDZ__NEON_ADDSUB,positive_infinity)717   TEST(ROUNDZ__NEON_ADDSUB, positive_infinity) {
718     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
719     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
720     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
721     xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
722     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
723     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
724       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
725       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
726       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
727   }
728 
TEST(ROUNDZ__NEON_ADDSUB,negative_infinity)729   TEST(ROUNDZ__NEON_ADDSUB, negative_infinity) {
730     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
731     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
732     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
733     xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
734     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
735     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
736       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
737       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
738       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
739   }
740 
TEST(ROUNDZ__NEON_ADDSUB,positive_qnan)741   TEST(ROUNDZ__NEON_ADDSUB, positive_qnan) {
742     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
743     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
744     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
745       for (uint32_t i = 0; i < kBlockSize; i++) {
746         inputs[i] = fp32_from_bits(n + i);
747       }
748       xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
749       for (uint32_t i = 0; i < kBlockSize; i++) {
750         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
751         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
752           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
753           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
754           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
755       }
756     }
757   }
758 
TEST(ROUNDZ__NEON_ADDSUB,negative_qnan)759   TEST(ROUNDZ__NEON_ADDSUB, negative_qnan) {
760     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
761     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
762     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
763       for (uint32_t i = 0; i < kBlockSize; i++) {
764         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
765       }
766       xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
767       for (uint32_t i = 0; i < kBlockSize; i++) {
768         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
769         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
770           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
771           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
772           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
773       }
774     }
775   }
776 
TEST(ROUNDZ__NEON_ADDSUB,positive_snan)777   TEST(ROUNDZ__NEON_ADDSUB, positive_snan) {
778     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
779     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
780     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
781       for (uint32_t i = 0; i < kBlockSize; i++) {
782         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
783       }
784       xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
785       for (uint32_t i = 0; i < kBlockSize; i++) {
786         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
787         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
788           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
789           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
790           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
791       }
792     }
793   }
794 
TEST(ROUNDZ__NEON_ADDSUB,negative_snan)795   TEST(ROUNDZ__NEON_ADDSUB, negative_snan) {
796     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
797     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
798     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
799       for (uint32_t i = 0; i < kBlockSize; i++) {
800         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
801       }
802       xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
803       for (uint32_t i = 0; i < kBlockSize; i++) {
804         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
805         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
806           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
807           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
808           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
809       }
810     }
811   }
812 
TEST(ROUNDZ__NEON_ADDSUB,positive_snan_to_qnan)813   TEST(ROUNDZ__NEON_ADDSUB, positive_snan_to_qnan) {
814     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
815     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
816     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
817       for (uint32_t i = 0; i < kBlockSize; i++) {
818         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
819       }
820       xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
821       for (uint32_t i = 0; i < kBlockSize; i++) {
822         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
823         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
824           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
825           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
826           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
827       }
828     }
829   }
830 
TEST(ROUNDZ__NEON_ADDSUB,negative_snan_to_qnan)831   TEST(ROUNDZ__NEON_ADDSUB, negative_snan_to_qnan) {
832     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
833     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
834     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
835       for (uint32_t i = 0; i < kBlockSize; i++) {
836         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
837       }
838       xnn_math_f32_roundz__neon_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
839       for (uint32_t i = 0; i < kBlockSize; i++) {
840         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
841         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
842           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
843           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
844           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
845       }
846     }
847   }
848 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
849 
850 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(ROUNDZ__NEON_CVT,positive_normal)851   TEST(ROUNDZ__NEON_CVT, positive_normal) {
852     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
853     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
854     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
855       for (uint32_t i = 0; i < kBlockSize; i++) {
856         inputs[i] = fp32_from_bits(n + i);
857       }
858       xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
859       for (uint32_t i = 0; i < kBlockSize; i++) {
860         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
861         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
862           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
863           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
864           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
865       }
866     }
867   }
868 
TEST(ROUNDZ__NEON_CVT,negative_normal)869   TEST(ROUNDZ__NEON_CVT, negative_normal) {
870     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
871     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
872     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
873       for (uint32_t i = 0; i < kBlockSize; i++) {
874         inputs[i] = fp32_from_bits(n + i);
875       }
876       xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
877       for (uint32_t i = 0; i < kBlockSize; i++) {
878         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
879         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
880           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
881           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
882           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
883       }
884     }
885   }
886 
TEST(ROUNDZ__NEON_CVT,positive_integral)887   TEST(ROUNDZ__NEON_CVT, positive_integral) {
888     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
889     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
890     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
891       for (uint32_t i = 0; i < kBlockSize; i++) {
892         inputs[i] = fp32_from_bits(n + i);
893       }
894       xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
895       for (uint32_t i = 0; i < kBlockSize; i++) {
896         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
897         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
898           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
899           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
900           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
901       }
902     }
903   }
904 
TEST(ROUNDZ__NEON_CVT,negative_integral)905   TEST(ROUNDZ__NEON_CVT, negative_integral) {
906     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
907     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
908     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
909       for (uint32_t i = 0; i < kBlockSize; i++) {
910         inputs[i] = fp32_from_bits(n + i);
911       }
912       xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
913       for (uint32_t i = 0; i < kBlockSize; i++) {
914         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
915         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
916           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
917           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
918           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
919       }
920     }
921   }
922 
TEST(ROUNDZ__NEON_CVT,positive_infinity)923   TEST(ROUNDZ__NEON_CVT, positive_infinity) {
924     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
925     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
926     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
927     xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
928     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
929     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
930       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
931       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
932       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
933   }
934 
TEST(ROUNDZ__NEON_CVT,negative_infinity)935   TEST(ROUNDZ__NEON_CVT, negative_infinity) {
936     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
937     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
938     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
939     xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
940     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
941     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
942       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
943       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
944       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
945   }
946 
TEST(ROUNDZ__NEON_CVT,positive_qnan)947   TEST(ROUNDZ__NEON_CVT, positive_qnan) {
948     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
949     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
950     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
951       for (uint32_t i = 0; i < kBlockSize; i++) {
952         inputs[i] = fp32_from_bits(n + i);
953       }
954       xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
955       for (uint32_t i = 0; i < kBlockSize; i++) {
956         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
957         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
958           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
959           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
960           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
961       }
962     }
963   }
964 
TEST(ROUNDZ__NEON_CVT,negative_qnan)965   TEST(ROUNDZ__NEON_CVT, negative_qnan) {
966     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
967     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
968     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
969       for (uint32_t i = 0; i < kBlockSize; i++) {
970         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
971       }
972       xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
973       for (uint32_t i = 0; i < kBlockSize; i++) {
974         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
975         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
976           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
977           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
978           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
979       }
980     }
981   }
982 
TEST(ROUNDZ__NEON_CVT,positive_snan)983   TEST(ROUNDZ__NEON_CVT, positive_snan) {
984     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
985     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
986     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
987       for (uint32_t i = 0; i < kBlockSize; i++) {
988         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
989       }
990       xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
991       for (uint32_t i = 0; i < kBlockSize; i++) {
992         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
993         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
994           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
995           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
996           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
997       }
998     }
999   }
1000 
TEST(ROUNDZ__NEON_CVT,negative_snan)1001   TEST(ROUNDZ__NEON_CVT, negative_snan) {
1002     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1003     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1004     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1005       for (uint32_t i = 0; i < kBlockSize; i++) {
1006         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1007       }
1008       xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1009       for (uint32_t i = 0; i < kBlockSize; i++) {
1010         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1011         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1012           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1013           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1014           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1015       }
1016     }
1017   }
1018 
TEST(ROUNDZ__NEON_CVT,DISABLED_positive_snan_to_qnan)1019   TEST(ROUNDZ__NEON_CVT, DISABLED_positive_snan_to_qnan) {
1020     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1021     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1022     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1023       for (uint32_t i = 0; i < kBlockSize; i++) {
1024         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1025       }
1026       xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1027       for (uint32_t i = 0; i < kBlockSize; i++) {
1028         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1029         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1030           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1031           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1032           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1033       }
1034     }
1035   }
1036 
TEST(ROUNDZ__NEON_CVT,DISABLED_negative_snan_to_qnan)1037   TEST(ROUNDZ__NEON_CVT, DISABLED_negative_snan_to_qnan) {
1038     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1039     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1040     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1041       for (uint32_t i = 0; i < kBlockSize; i++) {
1042         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1043       }
1044       xnn_math_f32_roundz__neon_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1045       for (uint32_t i = 0; i < kBlockSize; i++) {
1046         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1047         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1048           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1049           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1050           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1051       }
1052     }
1053   }
1054 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1055 
1056 #if XNN_ARCH_ARM || XNN_ARCH_ARM64
TEST(ROUNDZ__NEONV8,positive_normal)1057   TEST(ROUNDZ__NEONV8, positive_normal) {
1058     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1059     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1060     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
1061       for (uint32_t i = 0; i < kBlockSize; i++) {
1062         inputs[i] = fp32_from_bits(n + i);
1063       }
1064       xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1065       for (uint32_t i = 0; i < kBlockSize; i++) {
1066         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1067         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1068           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1069           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1070           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1071       }
1072     }
1073   }
1074 
TEST(ROUNDZ__NEONV8,negative_normal)1075   TEST(ROUNDZ__NEONV8, negative_normal) {
1076     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1077     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1078     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
1079       for (uint32_t i = 0; i < kBlockSize; i++) {
1080         inputs[i] = fp32_from_bits(n + i);
1081       }
1082       xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1083       for (uint32_t i = 0; i < kBlockSize; i++) {
1084         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1085         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1086           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1087           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1088           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1089       }
1090     }
1091   }
1092 
TEST(ROUNDZ__NEONV8,positive_integral)1093   TEST(ROUNDZ__NEONV8, positive_integral) {
1094     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1095     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1096     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
1097       for (uint32_t i = 0; i < kBlockSize; i++) {
1098         inputs[i] = fp32_from_bits(n + i);
1099       }
1100       xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1101       for (uint32_t i = 0; i < kBlockSize; i++) {
1102         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1103         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1104           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1105           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1106           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1107       }
1108     }
1109   }
1110 
TEST(ROUNDZ__NEONV8,negative_integral)1111   TEST(ROUNDZ__NEONV8, negative_integral) {
1112     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1113     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1114     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
1115       for (uint32_t i = 0; i < kBlockSize; i++) {
1116         inputs[i] = fp32_from_bits(n + i);
1117       }
1118       xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1119       for (uint32_t i = 0; i < kBlockSize; i++) {
1120         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1121         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1122           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1123           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1124           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1125       }
1126     }
1127   }
1128 
TEST(ROUNDZ__NEONV8,positive_infinity)1129   TEST(ROUNDZ__NEONV8, positive_infinity) {
1130     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1131     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1132     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
1133     xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1134     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
1135     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1136       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1137       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1138       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1139   }
1140 
TEST(ROUNDZ__NEONV8,negative_infinity)1141   TEST(ROUNDZ__NEONV8, negative_infinity) {
1142     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1143     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1144     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
1145     xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1146     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
1147     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1148       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1149       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1150       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1151   }
1152 
TEST(ROUNDZ__NEONV8,positive_qnan)1153   TEST(ROUNDZ__NEONV8, positive_qnan) {
1154     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1155     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1156     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1157       for (uint32_t i = 0; i < kBlockSize; i++) {
1158         inputs[i] = fp32_from_bits(n + i);
1159       }
1160       xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1161       for (uint32_t i = 0; i < kBlockSize; i++) {
1162         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1163         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1164           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1165           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1166           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1167       }
1168     }
1169   }
1170 
TEST(ROUNDZ__NEONV8,negative_qnan)1171   TEST(ROUNDZ__NEONV8, negative_qnan) {
1172     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1173     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1174     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1175       for (uint32_t i = 0; i < kBlockSize; i++) {
1176         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
1177       }
1178       xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1179       for (uint32_t i = 0; i < kBlockSize; i++) {
1180         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1181         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1182           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1183           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1184           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1185       }
1186     }
1187   }
1188 
TEST(ROUNDZ__NEONV8,positive_snan)1189   TEST(ROUNDZ__NEONV8, positive_snan) {
1190     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1191     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1192     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1193       for (uint32_t i = 0; i < kBlockSize; i++) {
1194         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1195       }
1196       xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1197       for (uint32_t i = 0; i < kBlockSize; i++) {
1198         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1199         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1200           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1201           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1202           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1203       }
1204     }
1205   }
1206 
TEST(ROUNDZ__NEONV8,negative_snan)1207   TEST(ROUNDZ__NEONV8, negative_snan) {
1208     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1209     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1210     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1211       for (uint32_t i = 0; i < kBlockSize; i++) {
1212         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1213       }
1214       xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1215       for (uint32_t i = 0; i < kBlockSize; i++) {
1216         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1217         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1218           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1219           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1220           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1221       }
1222     }
1223   }
1224 
TEST(ROUNDZ__NEONV8,positive_snan_to_qnan)1225   TEST(ROUNDZ__NEONV8, positive_snan_to_qnan) {
1226     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1227     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1228     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1229       for (uint32_t i = 0; i < kBlockSize; i++) {
1230         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1231       }
1232       xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1233       for (uint32_t i = 0; i < kBlockSize; i++) {
1234         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1235         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1236           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1237           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1238           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1239       }
1240     }
1241   }
1242 
TEST(ROUNDZ__NEONV8,negative_snan_to_qnan)1243   TEST(ROUNDZ__NEONV8, negative_snan_to_qnan) {
1244     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1245     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1246     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1247       for (uint32_t i = 0; i < kBlockSize; i++) {
1248         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1249       }
1250       xnn_math_f32_roundz__neonv8(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1251       for (uint32_t i = 0; i < kBlockSize; i++) {
1252         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1253         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1254           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1255           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1256           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1257       }
1258     }
1259   }
1260 #endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
1261 
1262 #if XNN_ARCH_WASMSIMD
TEST(ROUNDZ__WASMSIMD_ADDSUB,positive_normal)1263   TEST(ROUNDZ__WASMSIMD_ADDSUB, positive_normal) {
1264     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1265     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1266     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
1267       for (uint32_t i = 0; i < kBlockSize; i++) {
1268         inputs[i] = fp32_from_bits(n + i);
1269       }
1270       xnn_math_f32_roundz__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1271       for (uint32_t i = 0; i < kBlockSize; i++) {
1272         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1273         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1274           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1275           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1276           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1277       }
1278     }
1279   }
1280 
TEST(ROUNDZ__WASMSIMD_ADDSUB,negative_normal)1281   TEST(ROUNDZ__WASMSIMD_ADDSUB, negative_normal) {
1282     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1283     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1284     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
1285       for (uint32_t i = 0; i < kBlockSize; i++) {
1286         inputs[i] = fp32_from_bits(n + i);
1287       }
1288       xnn_math_f32_roundz__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1289       for (uint32_t i = 0; i < kBlockSize; i++) {
1290         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1291         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1292           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1293           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1294           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1295       }
1296     }
1297   }
1298 
TEST(ROUNDZ__WASMSIMD_ADDSUB,positive_integral)1299   TEST(ROUNDZ__WASMSIMD_ADDSUB, positive_integral) {
1300     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1301     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1302     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
1303       for (uint32_t i = 0; i < kBlockSize; i++) {
1304         inputs[i] = fp32_from_bits(n + i);
1305       }
1306       xnn_math_f32_roundz__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1307       for (uint32_t i = 0; i < kBlockSize; i++) {
1308         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1309         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1310           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1311           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1312           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1313       }
1314     }
1315   }
1316 
TEST(ROUNDZ__WASMSIMD_ADDSUB,negative_integral)1317   TEST(ROUNDZ__WASMSIMD_ADDSUB, negative_integral) {
1318     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1319     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1320     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
1321       for (uint32_t i = 0; i < kBlockSize; i++) {
1322         inputs[i] = fp32_from_bits(n + i);
1323       }
1324       xnn_math_f32_roundz__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1325       for (uint32_t i = 0; i < kBlockSize; i++) {
1326         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1327         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1328           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1329           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1330           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1331       }
1332     }
1333   }
1334 
TEST(ROUNDZ__WASMSIMD_ADDSUB,positive_infinity)1335   TEST(ROUNDZ__WASMSIMD_ADDSUB, positive_infinity) {
1336     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1337     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1338     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
1339     xnn_math_f32_roundz__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1340     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
1341     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1342       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1343       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1344       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1345   }
1346 
TEST(ROUNDZ__WASMSIMD_ADDSUB,negative_infinity)1347   TEST(ROUNDZ__WASMSIMD_ADDSUB, negative_infinity) {
1348     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1349     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1350     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
1351     xnn_math_f32_roundz__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1352     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
1353     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1354       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1355       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1356       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1357   }
1358 
TEST(ROUNDZ__WASMSIMD_ADDSUB,positive_qnan)1359   TEST(ROUNDZ__WASMSIMD_ADDSUB, positive_qnan) {
1360     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1361     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1362     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1363       for (uint32_t i = 0; i < kBlockSize; i++) {
1364         inputs[i] = fp32_from_bits(n + i);
1365       }
1366       xnn_math_f32_roundz__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1367       for (uint32_t i = 0; i < kBlockSize; i++) {
1368         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1369         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1370           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1371           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1372           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1373       }
1374     }
1375   }
1376 
TEST(ROUNDZ__WASMSIMD_ADDSUB,negative_qnan)1377   TEST(ROUNDZ__WASMSIMD_ADDSUB, negative_qnan) {
1378     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1379     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1380     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1381       for (uint32_t i = 0; i < kBlockSize; i++) {
1382         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
1383       }
1384       xnn_math_f32_roundz__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1385       for (uint32_t i = 0; i < kBlockSize; i++) {
1386         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1387         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1388           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1389           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1390           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1391       }
1392     }
1393   }
1394 
TEST(ROUNDZ__WASMSIMD_ADDSUB,positive_snan)1395   TEST(ROUNDZ__WASMSIMD_ADDSUB, positive_snan) {
1396     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1397     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1398     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1399       for (uint32_t i = 0; i < kBlockSize; i++) {
1400         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1401       }
1402       xnn_math_f32_roundz__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1403       for (uint32_t i = 0; i < kBlockSize; i++) {
1404         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1405         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1406           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1407           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1408           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1409       }
1410     }
1411   }
1412 
TEST(ROUNDZ__WASMSIMD_ADDSUB,negative_snan)1413   TEST(ROUNDZ__WASMSIMD_ADDSUB, negative_snan) {
1414     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1415     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1416     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1417       for (uint32_t i = 0; i < kBlockSize; i++) {
1418         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1419       }
1420       xnn_math_f32_roundz__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1421       for (uint32_t i = 0; i < kBlockSize; i++) {
1422         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1423         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1424           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1425           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1426           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1427       }
1428     }
1429   }
1430 
TEST(ROUNDZ__WASMSIMD_ADDSUB,positive_snan_to_qnan)1431   TEST(ROUNDZ__WASMSIMD_ADDSUB, positive_snan_to_qnan) {
1432     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1433     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1434     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1435       for (uint32_t i = 0; i < kBlockSize; i++) {
1436         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1437       }
1438       xnn_math_f32_roundz__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1439       for (uint32_t i = 0; i < kBlockSize; i++) {
1440         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1441         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1442           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1443           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1444           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1445       }
1446     }
1447   }
1448 
TEST(ROUNDZ__WASMSIMD_ADDSUB,negative_snan_to_qnan)1449   TEST(ROUNDZ__WASMSIMD_ADDSUB, negative_snan_to_qnan) {
1450     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1451     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1452     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1453       for (uint32_t i = 0; i < kBlockSize; i++) {
1454         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1455       }
1456       xnn_math_f32_roundz__wasmsimd_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1457       for (uint32_t i = 0; i < kBlockSize; i++) {
1458         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1459         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1460           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1461           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1462           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1463       }
1464     }
1465   }
1466 #endif  // XNN_ARCH_WASMSIMD
1467 
1468 #if XNN_ARCH_WASMSIMD
TEST(ROUNDZ__WASMSIMD_CVT,positive_normal)1469   TEST(ROUNDZ__WASMSIMD_CVT, positive_normal) {
1470     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1471     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1472     for (uint32_t n = UINT32_C(0x00000000); n < UINT32_C(0x4B800000); n += kBlockSize) {
1473       for (uint32_t i = 0; i < kBlockSize; i++) {
1474         inputs[i] = fp32_from_bits(n + i);
1475       }
1476       xnn_math_f32_roundz__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1477       for (uint32_t i = 0; i < kBlockSize; i++) {
1478         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1479         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1480           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1481           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1482           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1483       }
1484     }
1485   }
1486 
TEST(ROUNDZ__WASMSIMD_CVT,negative_normal)1487   TEST(ROUNDZ__WASMSIMD_CVT, negative_normal) {
1488     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1489     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1490     for (uint32_t n = UINT32_C(0x80000000); n < UINT32_C(0xCB800000); n += kBlockSize) {
1491       for (uint32_t i = 0; i < kBlockSize; i++) {
1492         inputs[i] = fp32_from_bits(n + i);
1493       }
1494       xnn_math_f32_roundz__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1495       for (uint32_t i = 0; i < kBlockSize; i++) {
1496         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1497         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1498           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1499           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1500           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1501       }
1502     }
1503   }
1504 
TEST(ROUNDZ__WASMSIMD_CVT,positive_integral)1505   TEST(ROUNDZ__WASMSIMD_CVT, positive_integral) {
1506     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1507     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1508     for (uint32_t n = UINT32_C(0x4B800000); n < UINT32_C(0x7F800000); n += kBlockSize) {
1509       for (uint32_t i = 0; i < kBlockSize; i++) {
1510         inputs[i] = fp32_from_bits(n + i);
1511       }
1512       xnn_math_f32_roundz__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1513       for (uint32_t i = 0; i < kBlockSize; i++) {
1514         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1515         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1516           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1517           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1518           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1519       }
1520     }
1521   }
1522 
TEST(ROUNDZ__WASMSIMD_CVT,negative_integral)1523   TEST(ROUNDZ__WASMSIMD_CVT, negative_integral) {
1524     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1525     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1526     for (uint32_t n = UINT32_C(0xCB800000); n < UINT32_C(0xFF800000); n += kBlockSize) {
1527       for (uint32_t i = 0; i < kBlockSize; i++) {
1528         inputs[i] = fp32_from_bits(n + i);
1529       }
1530       xnn_math_f32_roundz__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1531       for (uint32_t i = 0; i < kBlockSize; i++) {
1532         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1533         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1534           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1535           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1536           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1537       }
1538     }
1539   }
1540 
TEST(ROUNDZ__WASMSIMD_CVT,positive_infinity)1541   TEST(ROUNDZ__WASMSIMD_CVT, positive_infinity) {
1542     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1543     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1544     std::fill(inputs.begin(), inputs.end(), +std::numeric_limits<float>::infinity());
1545     xnn_math_f32_roundz__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1546     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
1547     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1548       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1549       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1550       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1551   }
1552 
TEST(ROUNDZ__WASMSIMD_CVT,negative_infinity)1553   TEST(ROUNDZ__WASMSIMD_CVT, negative_infinity) {
1554     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1555     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1556     std::fill(inputs.begin(), inputs.end(), -std::numeric_limits<float>::infinity());
1557     xnn_math_f32_roundz__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1558     const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[0]));
1559     ASSERT_EQ(reference_output, fp32_to_bits(outputs[0]))
1560       << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
1561       << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1562       << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[0]);
1563   }
1564 
TEST(ROUNDZ__WASMSIMD_CVT,positive_qnan)1565   TEST(ROUNDZ__WASMSIMD_CVT, positive_qnan) {
1566     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1567     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1568     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1569       for (uint32_t i = 0; i < kBlockSize; i++) {
1570         inputs[i] = fp32_from_bits(n + i);
1571       }
1572       xnn_math_f32_roundz__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1573       for (uint32_t i = 0; i < kBlockSize; i++) {
1574         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1575         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1576           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1577           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1578           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1579       }
1580     }
1581   }
1582 
TEST(ROUNDZ__WASMSIMD_CVT,negative_qnan)1583   TEST(ROUNDZ__WASMSIMD_CVT, negative_qnan) {
1584     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1585     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1586     for (uint32_t n = UINT32_C(0x7FC00000); n < UINT32_C(0x80000000); n += kBlockSize) {
1587       for (uint32_t i = 0; i < kBlockSize; i++) {
1588         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | (n + i));
1589       }
1590       xnn_math_f32_roundz__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1591       for (uint32_t i = 0; i < kBlockSize; i++) {
1592         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1593         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1594           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1595           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1596           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1597       }
1598     }
1599   }
1600 
TEST(ROUNDZ__WASMSIMD_CVT,positive_snan)1601   TEST(ROUNDZ__WASMSIMD_CVT, positive_snan) {
1602     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1603     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1604     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1605       for (uint32_t i = 0; i < kBlockSize; i++) {
1606         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1607       }
1608       xnn_math_f32_roundz__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1609       for (uint32_t i = 0; i < kBlockSize; i++) {
1610         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1611         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1612           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1613           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1614           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1615       }
1616     }
1617   }
1618 
TEST(ROUNDZ__WASMSIMD_CVT,negative_snan)1619   TEST(ROUNDZ__WASMSIMD_CVT, negative_snan) {
1620     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1621     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1622     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1623       for (uint32_t i = 0; i < kBlockSize; i++) {
1624         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1625       }
1626       xnn_math_f32_roundz__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1627       for (uint32_t i = 0; i < kBlockSize; i++) {
1628         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1629         ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[i]) & UINT32_C(0xFFBFFFFF))
1630           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1631           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1632           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1633       }
1634     }
1635   }
1636 
TEST(ROUNDZ__WASMSIMD_CVT,DISABLED_positive_snan_to_qnan)1637   TEST(ROUNDZ__WASMSIMD_CVT, DISABLED_positive_snan_to_qnan) {
1638     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1639     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1640     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1641       for (uint32_t i = 0; i < kBlockSize; i++) {
1642         inputs[i] = fp32_from_bits(std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1643       }
1644       xnn_math_f32_roundz__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1645       for (uint32_t i = 0; i < kBlockSize; i++) {
1646         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1647         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1648           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1649           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1650           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1651       }
1652     }
1653   }
1654 
TEST(ROUNDZ__WASMSIMD_CVT,DISABLED_negative_snan_to_qnan)1655   TEST(ROUNDZ__WASMSIMD_CVT, DISABLED_negative_snan_to_qnan) {
1656     std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
1657     std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
1658     for (uint32_t n = UINT32_C(0x7F800000); n < UINT32_C(0x7FC00000); n += kBlockSize) {
1659       for (uint32_t i = 0; i < kBlockSize; i++) {
1660         inputs[i] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(n + i, UINT32_C(0x7F800001)));
1661       }
1662       xnn_math_f32_roundz__wasmsimd_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
1663       for (uint32_t i = 0; i < kBlockSize; i++) {
1664         const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[i]));
1665         ASSERT_EQ(reference_output, fp32_to_bits(outputs[i]))
1666           << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[i])
1667           << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
1668           << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[i]);
1669       }
1670     }
1671   }
1672 #endif  // XNN_ARCH_WASMSIMD
1673 
// Truncation of every positive float below 2**24 (bit patterns
// 0x00000000..0x4B7FFFFF) must match std::trunc bit-for-bit.
TEST(ROUNDZ__SCALAR_ADDSUB, positive_normal) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0x00000000); base < UINT32_C(0x4B800000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(bits++); });
    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected, observed)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1691 
// Truncation of every negative float with magnitude below 2**24
// (bit patterns 0x80000000..0xCB7FFFFF) must match std::trunc bit-for-bit.
TEST(ROUNDZ__SCALAR_ADDSUB, negative_normal) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0x80000000); base < UINT32_C(0xCB800000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(bits++); });
    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected, observed)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1709 
// Positive floats at or above 2**24 (0x4B800000..0x7F7FFFFF) are already
// integral; truncation must leave them unchanged, matching std::trunc.
TEST(ROUNDZ__SCALAR_ADDSUB, positive_integral) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0x4B800000); base < UINT32_C(0x7F800000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(bits++); });
    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected, observed)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1727 
// Negative floats with magnitude at or above 2**24 (0xCB800000..0xFF7FFFFF)
// are already integral; truncation must leave them unchanged.
TEST(ROUNDZ__SCALAR_ADDSUB, negative_integral) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0xCB800000); base < UINT32_C(0xFF800000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(bits++); });
    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected, observed)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1745 
// +infinity must pass through truncation unchanged.
TEST(ROUNDZ__SCALAR_ADDSUB, positive_infinity) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  const float pos_inf = +std::numeric_limits<float>::infinity();
  std::fill(inputs.begin(), inputs.end(), pos_inf);
  xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
  const uint32_t expected = fp32_to_bits(std::trunc(inputs[0]));
  const uint32_t observed = fp32_to_bits(outputs[0]);
  ASSERT_EQ(expected, observed)
    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
}
1757 
// -infinity must pass through truncation unchanged.
TEST(ROUNDZ__SCALAR_ADDSUB, negative_infinity) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  const float neg_inf = -std::numeric_limits<float>::infinity();
  std::fill(inputs.begin(), inputs.end(), neg_inf);
  xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
  const uint32_t expected = fp32_to_bits(std::trunc(inputs[0]));
  const uint32_t observed = fp32_to_bits(outputs[0]);
  ASSERT_EQ(expected, observed)
    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
}
1769 
// Positive quiet NaNs (0x7FC00000..0x7FFFFFFF) must propagate bit-exactly,
// matching std::trunc.
TEST(ROUNDZ__SCALAR_ADDSUB, positive_qnan) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0x7FC00000); base < UINT32_C(0x80000000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(bits++); });
    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected, observed)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1787 
// Negative quiet NaNs (sign bit ORed onto 0x7FC00000..0x7FFFFFFF) must
// propagate bit-exactly, matching std::trunc.
TEST(ROUNDZ__SCALAR_ADDSUB, negative_qnan) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0x7FC00000); base < UINT32_C(0x80000000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(UINT32_C(0x80000000) | bits++); });
    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected, observed)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1805 
// Positive signaling NaNs (0x7F800001..0x7FBFFFFF; the clamp skips the +inf
// pattern at 0x7F800000). The quiet bit (bit 22) is masked out of the
// comparison, so an implementation is allowed to quiet the NaN.
TEST(ROUNDZ__SCALAR_ADDSUB, positive_snan) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0x7F800000); base < UINT32_C(0x7FC00000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(std::max<uint32_t>(bits++, UINT32_C(0x7F800001))); });
    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected & UINT32_C(0xFFBFFFFF), observed & UINT32_C(0xFFBFFFFF))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1823 
// Negative signaling NaNs (sign bit ORed onto 0x7F800001..0x7FBFFFFF).
// The quiet bit (bit 22) is masked out of the comparison, so an
// implementation is allowed to quiet the NaN.
TEST(ROUNDZ__SCALAR_ADDSUB, negative_snan) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0x7F800000); base < UINT32_C(0x7FC00000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(bits++, UINT32_C(0x7F800001))); });
    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected & UINT32_C(0xFFBFFFFF), observed & UINT32_C(0xFFBFFFFF))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1841 
// Stricter variant of positive_snan: requires bit-exact agreement with
// std::trunc (i.e. identical sNaN-to-qNaN quieting).
TEST(ROUNDZ__SCALAR_ADDSUB, positive_snan_to_qnan) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0x7F800000); base < UINT32_C(0x7FC00000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(std::max<uint32_t>(bits++, UINT32_C(0x7F800001))); });
    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected, observed)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1859 
// Stricter variant of negative_snan: requires bit-exact agreement with
// std::trunc (i.e. identical sNaN-to-qNaN quieting).
TEST(ROUNDZ__SCALAR_ADDSUB, negative_snan_to_qnan) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0x7F800000); base < UINT32_C(0x7FC00000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(bits++, UINT32_C(0x7F800001))); });
    xnn_math_f32_roundz__scalar_addsub(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected, observed)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1877 
// Truncation of every positive float below 2**24 (bit patterns
// 0x00000000..0x4B7FFFFF) must match std::trunc bit-for-bit.
TEST(ROUNDZ__SCALAR_CVT, positive_normal) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0x00000000); base < UINT32_C(0x4B800000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(bits++); });
    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected, observed)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1895 
// Truncation of every negative float with magnitude below 2**24
// (bit patterns 0x80000000..0xCB7FFFFF) must match std::trunc bit-for-bit.
TEST(ROUNDZ__SCALAR_CVT, negative_normal) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0x80000000); base < UINT32_C(0xCB800000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(bits++); });
    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected, observed)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1913 
// Positive floats at or above 2**24 (0x4B800000..0x7F7FFFFF) are already
// integral; truncation must leave them unchanged, matching std::trunc.
TEST(ROUNDZ__SCALAR_CVT, positive_integral) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0x4B800000); base < UINT32_C(0x7F800000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(bits++); });
    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected, observed)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1931 
// Negative floats with magnitude at or above 2**24 (0xCB800000..0xFF7FFFFF)
// are already integral; truncation must leave them unchanged.
TEST(ROUNDZ__SCALAR_CVT, negative_integral) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0xCB800000); base < UINT32_C(0xFF800000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(bits++); });
    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected, observed)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1949 
// +infinity must pass through truncation unchanged.
TEST(ROUNDZ__SCALAR_CVT, positive_infinity) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  const float pos_inf = +std::numeric_limits<float>::infinity();
  std::fill(inputs.begin(), inputs.end(), pos_inf);
  xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
  const uint32_t expected = fp32_to_bits(std::trunc(inputs[0]));
  const uint32_t observed = fp32_to_bits(outputs[0]);
  ASSERT_EQ(expected, observed)
    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
}
1961 
// -infinity must pass through truncation unchanged.
TEST(ROUNDZ__SCALAR_CVT, negative_infinity) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  const float neg_inf = -std::numeric_limits<float>::infinity();
  std::fill(inputs.begin(), inputs.end(), neg_inf);
  xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
  const uint32_t expected = fp32_to_bits(std::trunc(inputs[0]));
  const uint32_t observed = fp32_to_bits(outputs[0]);
  ASSERT_EQ(expected, observed)
    << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[0])
    << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
    << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
}
1973 
// Positive quiet NaNs (0x7FC00000..0x7FFFFFFF) must propagate bit-exactly,
// matching std::trunc.
TEST(ROUNDZ__SCALAR_CVT, positive_qnan) {
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t base = UINT32_C(0x7FC00000); base < UINT32_C(0x80000000); base += kBlockSize) {
    uint32_t bits = base;
    std::generate(inputs.begin(), inputs.end(),
      [&bits] { return fp32_from_bits(bits++); });
    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t expected = fp32_to_bits(std::trunc(inputs[idx]));
      const uint32_t observed = fp32_to_bits(outputs[idx]);
      ASSERT_EQ(expected, observed)
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << expected
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << observed;
    }
  }
}
1991 
TEST(ROUNDZ__SCALAR_CVT, negative_qnan) {
  // Same sweep as positive_qnan, but with the sign bit (0x80000000) set on
  // every input, covering all negative quiet-NaN encodings.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block_start = UINT32_C(0x7FC00000); block_start < UINT32_C(0x80000000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = fp32_from_bits(UINT32_C(0x80000000) | (block_start + idx));
    }
    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[idx]));
      ASSERT_EQ(reference_output, fp32_to_bits(outputs[idx]))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2009 
TEST(ROUNDZ__SCALAR_CVT, positive_snan) {
  // Sweep positive signaling-NaN patterns in [0x7F800000, 0x7FC00000).
  // std::max clamps the very first pattern (0x7F800000 == +inf) up to
  // 0x7F800001 so only NaNs are fed to the kernel. The 0xFFBFFFFF mask
  // clears bit 22, so an implementation that quiets the sNaN still passes.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block_start = UINT32_C(0x7F800000); block_start < UINT32_C(0x7FC00000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = fp32_from_bits(std::max<uint32_t>(block_start + idx, UINT32_C(0x7F800001)));
    }
    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[idx]));
      ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[idx]) & UINT32_C(0xFFBFFFFF))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2027 
TEST(ROUNDZ__SCALAR_CVT, negative_snan) {
  // Negative counterpart of positive_snan: same clamped sweep with the sign
  // bit set. Comparison masks out bit 22 (0xFFBFFFFF) so sNaN-quieting
  // implementations are accepted.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block_start = UINT32_C(0x7F800000); block_start < UINT32_C(0x7FC00000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      inputs[idx] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(block_start + idx, UINT32_C(0x7F800001)));
    }
    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[idx]));
      ASSERT_EQ(reference_output & UINT32_C(0xFFBFFFFF), fp32_to_bits(outputs[idx]) & UINT32_C(0xFFBFFFFF))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2045 
TEST(ROUNDZ__SCALAR_CVT, DISABLED_positive_snan_to_qnan) {
  // Strict (disabled) variant of positive_snan: requires full bit-exact
  // agreement with std::trunc on sNaN inputs, including bit 22 — i.e. the
  // kernel must quieten signaling NaNs exactly as the reference does.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block_start = UINT32_C(0x7F800000); block_start < UINT32_C(0x7FC00000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Clamp away 0x7F800000 (+inf) so only NaN encodings are tested.
      inputs[idx] = fp32_from_bits(std::max<uint32_t>(block_start + idx, UINT32_C(0x7F800001)));
    }
    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[idx]));
      ASSERT_EQ(reference_output, fp32_to_bits(outputs[idx]))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2063 
TEST(ROUNDZ__SCALAR_CVT, DISABLED_negative_snan_to_qnan) {
  // Strict (disabled) negative-sNaN variant: bit-exact comparison against
  // std::trunc, with the sign bit set on every clamped sNaN input.
  std::vector<float, AlignedAllocator<float, 64>> inputs(kBlockSize);
  std::vector<float, AlignedAllocator<float, 64>> outputs(kBlockSize);
  for (uint32_t block_start = UINT32_C(0x7F800000); block_start < UINT32_C(0x7FC00000); block_start += kBlockSize) {
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      // Clamp away 0x7F800000 (infinity) before applying the sign bit.
      inputs[idx] = fp32_from_bits(UINT32_C(0x80000000) | std::max<uint32_t>(block_start + idx, UINT32_C(0x7F800001)));
    }
    xnn_math_f32_roundz__scalar_cvt(kBlockSize * sizeof(float), inputs.data(), outputs.data());
    for (uint32_t idx = 0; idx < kBlockSize; idx++) {
      const uint32_t reference_output = fp32_to_bits(std::trunc(inputs[idx]));
      ASSERT_EQ(reference_output, fp32_to_bits(outputs[idx]))
        << "input = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(inputs[idx])
        << ", reference = 0x" << std::hex << std::setw(8) << std::setfill('0') << reference_output
        << ", optimized = 0x" << std::hex << std::setw(8) << std::setfill('0') << fp32_to_bits(outputs[idx]);
    }
  }
}
2081