#include <gtest/gtest.h>

#include <torch/types.h>
#include <torch/utils.h>

#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/Engine.h>
#include <ATen/native/xnnpack/OpContext.h>
#include <ATen/native/xnnpack/Pooling.h>
#include <c10/core/CPUAllocator.h>
#include <c10/core/MemoryFormat.h>

#include <atomic>
#include <cmath>
#include <condition_variable>
#include <thread>
16
17 #if defined(C10_MOBILE) && defined(USE_XNNPACK)
18
checkRtol(const at::Tensor & diff,const std::vector<at::Tensor> inputs)19 bool checkRtol(const at::Tensor& diff, const std::vector<at::Tensor> inputs) {
20 double maxValue = 0.0;
21 for (auto& tensor : inputs) {
22 maxValue = fmax(tensor.abs().max().item<float>(), maxValue);
23 }
24 return diff.abs().max().item<float>() < (0.01 + 2e-2 * maxValue);
25 }
almostEqual(const at::Tensor & a,const at::Tensor & b)26 bool almostEqual(const at::Tensor& a, const at::Tensor& b) {
27 return checkRtol(a - b, {a, b});
28 }
29
exactlyEqual(const at::Tensor & a,const at::Tensor & b)30 bool exactlyEqual(const at::Tensor& a, const at::Tensor& b) {
31 return (a - b).abs().max().item<float>() == 0.f;
32 }
33
test_hardswish(const at::Tensor & input,const at::Tensor & expected)34 void test_hardswish(const at::Tensor& input, const at::Tensor& expected) {
35 ASSERT_TRUE(at::native::xnnpack::use_hardswish(input));
36 auto result = at::native::xnnpack::hardswish(input);
37 auto check = almostEqual(expected, result);
38 ASSERT_TRUE(check);
39 ASSERT_TRUE(
40 expected.suggest_memory_format() == input.suggest_memory_format());
41 }
42
test_hardswish_(at::Tensor input,const at::Tensor & expected)43 void test_hardswish_(at::Tensor input, const at::Tensor& expected) {
44 ASSERT_TRUE(at::native::xnnpack::use_hardswish(input));
45 at::native::xnnpack::hardswish_(input);
46 auto check = almostEqual(expected, input);
47 ASSERT_TRUE(check);
48 ASSERT_TRUE(
49 expected.suggest_memory_format() == input.suggest_memory_format());
50 }
51
test_global_average_pool(at::Tensor input,const at::Tensor & expected)52 void test_global_average_pool(at::Tensor input, const at::Tensor& expected) {
53 ASSERT_TRUE(at::native::xnnpack::use_global_average_pool(input));
54 auto result = at::native::xnnpack::global_average_pool(input);
55 auto check = almostEqual(expected, result);
56 ASSERT_TRUE(check);
57 }
58
// Since the XNNPACK path is only taken #if defined(C10_MOBILE) &&
// defined(USE_XNNPACK), we can't compare the regular CPU path with the
// XNNPACK path in the same test binary. Instead, we precompute the regular
// results and compare against the XNNPACK path here.
// Compares the XNNPACK linear (fully-connected) path against a reference
// result precomputed with at::linear on the regular CPU path.
TEST(TestXNNPackOps, TestLinear) {
  constexpr std::array<int64_t, 2u> kInputShape{1, 37};
  constexpr std::array<int64_t, 2u> kWeightShape{41, 37};
  constexpr std::array<int64_t, 2u> kBiasShape{1, 41};

  const auto options = at::device(at::kCPU).dtype(at::kFloat);
  const auto input_cpu = at::rand(kInputShape, options);
  const auto weight = at::rand(kWeightShape, options);
  const auto bias = at::rand(kBiasShape, options);

  // Reference result on the regular CPU path.
  const auto out_cpu = at::linear(input_cpu, weight, bias);

  // The XNNPACK path is fed a 1-D view of the bias.
  const auto xnnpack_bias = bias.view({41});
  ASSERT_TRUE(at::native::xnnpack::use_linear(input_cpu, weight, xnnpack_bias));
  const auto result =
      at::native::xnnpack::linear(input_cpu, weight, xnnpack_bias);

  ASSERT_TRUE(almostEqual(out_cpu, result));
}
84
// Compares XNNPACK max_pool2d against the regular CPU at::max_pool2d result.
// NOTE(review): the literal argument lists differ in position between the
// two calls — at::max_pool2d is called as (kernel {3,4}, {2,1}, {1,1},
// {1,1}); the xnnpack helpers as (kernel {3,4}, {1,1}, {2,1}, {1,1}).
// Presumably the xnnpack signature puts padding before stride so both
// describe stride {2,1} / padding {1,1} — verify against
// ATen/native/xnnpack/Engine.h.
TEST(TestXNNPackOps, TestMaxPool2d) {
  const auto in_cpu =
      at::rand({5, 13, 55, 68}, at::TensorOptions(at::kCPU).dtype(at::kFloat));
  const auto out_cpu =
      at::max_pool2d(in_cpu, {3, 4}, {2, 1}, {1, 1}, {1, 1}, false);
  ASSERT_TRUE(at::native::xnnpack::use_max_pool2d(
      in_cpu, {3, 4}, {1, 1}, {2, 1}, {1, 1}, false));
  const auto result = at::native::xnnpack::max_pool2d(
      in_cpu, {3, 4}, {1, 1}, {2, 1}, {1, 1}, false);

  auto check = almostEqual(out_cpu, result);
  ASSERT_TRUE(check);
}
98
// Compares XNNPACK 2-D convolution against a reference at::conv2d result for
// a 1x3x8x8 input and a single 3x3 filter.
TEST(TestXNNPackOps, TestConvolution2d) {
  constexpr int64_t groups = 1;
  constexpr std::array<int64_t, 2u> stride{2, 2};
  constexpr std::array<int64_t, 2u> padding{1, 1};
  constexpr std::array<int64_t, 2u> dilation{1, 1};

  // Input dimensions; size() lists them as {batches, channels, width,
  // height}. Width and height are both 8 here, so the width/height member
  // order cannot affect the result.
  constexpr struct {
    uint32_t batches;
    uint32_t channels;
    uint32_t width;
    uint32_t height;

    std::array<int64_t, 4u> size() const {
      return {
          batches,
          channels,
          width,
          height,
      };
    }
  } input{1, 3, 8, 8};

  // Weight dimensions; the 3x3 kernel is likewise symmetric in
  // width/height.
  constexpr struct {
    uint32_t output_channels;
    uint32_t input_channels;
    uint32_t width;
    uint32_t height;

    std::array<int64_t, 4u> size() const {
      return {
          output_channels,
          input_channels,
          width,
          height,
      };
    }
  } weights{1, input.channels, 3, 3};

  const auto input_cpu =
      at::randn(input.size(), at::device(at::kCPU).dtype(at::kFloat));
  const auto weights_cpu =
      at::randn(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
  const auto bias_cpu = at::randn(
      {weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));

  // Reference result on the regular CPU path.
  const auto output_cpu = at::conv2d(
      input_cpu, weights_cpu, bias_cpu, stride, padding, dilation, groups);

  // Note: the xnnpack calls below pass `padding` before `stride` — the
  // reverse of at::conv2d's ordering above.
  ASSERT_TRUE(at::native::xnnpack::use_convolution2d(
      input_cpu,
      weights_cpu,
      weights.output_channels,
      padding,
      stride,
      dilation,
      groups,
      false));
  const auto result = at::native::xnnpack::convolution2d(
      input_cpu, weights_cpu, bias_cpu, padding, stride, dilation, groups);
  auto check = almostEqual(output_cpu, result);
  ASSERT_TRUE(check);
}
161
// Runs both the out-of-place and in-place XNNPACK hardswish over a list of
// (input, precomputed expected) pairs covering: a 1-D tensor, a one-element
// tensor, a 2-D tensor, a column slice (non-contiguous input), and a 4-D
// channels-last tensor.
TEST(TestXNNPackOps, TestHardSwish) {
  // input, expected_result pair
  auto in = torch::tensor({{1, 1}, {1, 1}}, {torch::kFloat32});
  // Column slice of `in` — exercises a non-contiguous input.
  auto in_slice = in.index({"...", 0});

  std::vector<std::pair<at::Tensor, at::Tensor>> input_result_pairs = {
      {torch::tensor({1, 2, 3, 4, 5}, {torch::kFloat32}),
       torch::tensor(
           {0.6667, 1.6667, 3.0000, 4.0000, 5.0000}, {torch::kFloat32})},
      {torch::tensor({0.3330}, {torch::kFloat32}),
       torch::tensor({0.1850}, {torch::kFloat32})},
      {torch::tensor({{0.4523, 0.8131, 0.9829}, {0.0782, 0.7395, 0.0787}}),
       torch::tensor({{0.2602, 0.5167, 0.6525}, {0.0401, 0.4609, 0.0404}})},
      {in_slice, torch::tensor({0.6667, 0.6667}, {torch::kFloat32})},
      // Channels-last pair: both input and expected are converted so the
      // memory-format assertions in the helpers are exercised.
      {torch::tensor({{{{0.4993, 0.3835}, {0.3163, 0.2348}},
                       {{0.4705, 0.4129}, {0.9314, 0.0631}}},
                      {{{0.0030, 0.5656}, {0.1413, 0.1943}},
                       {{0.1380, 0.1985}, {0.2746, 0.8109}}}})
           .contiguous(at::MemoryFormat::ChannelsLast),
       torch::tensor({{{{0.2912, 0.2163}, {0.1748, 0.1266}},
                       {{0.2722, 0.2349}, {0.6103, 0.0322}}},
                      {{{0.0015, 0.3361}, {0.0740, 0.1034}},
                       {{0.0722, 0.1058}, {0.1499, 0.5150}}}})
           .contiguous(at::MemoryFormat::ChannelsLast)}};

  for (const auto& input_result : input_result_pairs) {
    test_hardswish(input_result.first, input_result.second);
    test_hardswish_(input_result.first, input_result.second);
  }
}
192
// Runs a single shared XNNPackConv2dOpContext from five threads at once,
// each with a different input size, to exercise concurrent use of one op
// context. The test passes if it neither crashes nor deadlocks — no output
// values are checked.
TEST(TestXNNPackOps, TestConvolution2dMultiThreaded) {
  constexpr int64_t groups = 1;

  // Input dimensions; size() lists {batches, channels, width, height}.
  constexpr struct {
    uint32_t batches;
    uint32_t channels;
    uint32_t width;
    uint32_t height;

    std::array<int64_t, 4u> size() const {
      return {
          batches,
          channels,
          width,
          height,
      };
    }
  } input{1, 3, 8, 8};

  // Weight dimensions; size() lists {output_channels, input_channels,
  // width, height}.
  constexpr struct {
    uint32_t output_channels;
    uint32_t input_channels;
    uint32_t width;
    uint32_t height;

    std::array<int64_t, 4u> size() const {
      return {
          output_channels,
          input_channels,
          width,
          height,
      };
    }
  } weights{1, input.channels, 3, 3};

  const auto input_cpu =
      at::randn(input.size(), at::device(at::kCPU).dtype(at::kFloat));
  auto weights_cpu =
      at::randn(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
  auto bias_cpu = at::randn(
      {weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));

  // One op context shared by all workers; created with {1, 1} / {2, 2} /
  // {1, 1} (presumably padding, stride, dilation — matching the other
  // conv tests' ordering; confirm against OpContext.h) and no output
  // clamping.
  auto context = at::native::xnnpack::XNNPackConv2dOpContext::create_context(
      std::move(weights_cpu), std::move(bias_cpu), {1, 1}, {2, 2}, {1, 1}, groups, std::nullopt, std::nullopt);
  std::atomic<int64_t> count{0};
  int64_t num_workers = 5;
  std::mutex lock;
  std::condition_variable cond;
  // Barrier-then-run: each worker increments `count`, waits until all
  // num_workers threads have arrived, then hammers the shared context so
  // the conv runs overlap rather than execute serially.
  auto sync_and_run_conv = [&](int64_t h, int64_t w) -> at::Tensor
  {
    auto input_tensor = at::randn({1, 3, h, w}, at::device(at::kCPU).dtype(at::kFloat));
    int64_t count_val = ++count;
    if (count_val < num_workers) {
      // Not the last arrival: block until the count reaches num_workers.
      std::unique_lock<std::mutex> g(lock);
      while ((count_val = count.load()) < num_workers) {
        cond.wait(g, [&]() {
          auto new_val = count.load();
          return new_val >= num_workers;});
      }
    } else {
      // Last arrival: release everyone waiting on the condition variable.
      std::unique_lock<std::mutex> g(lock);
      cond.notify_all();
    }
    // Repeatedly run the shared context with this thread's input size.
    for (int64_t i = 0; i < 30; i++) {
      context->run(input_tensor);
    }
    return context->run(input_tensor);
  };

  auto conv = [sync_and_run_conv](int64_t h, int64_t w) -> at::Tensor
  {
    return sync_and_run_conv(h, w);
  };

  // Five workers, each with a distinct spatial size (results are discarded;
  // only crash/deadlock-freedom is being tested).
  std::thread t1(conv, 16, 16);
  std::thread t2(conv, 12, 12);
  std::thread t3(conv, 20, 20);
  std::thread t4(conv, 22, 22);
  std::thread t5(conv, 8, 8);
  t1.join();
  t2.join();
  t3.join();
  t4.join();
  t5.join();
}
278
// Checks XNNPACK global average pooling against precomputed reference
// outputs: one single-batch 1x2x3x4 input and one two-batch 2x2x2x2 input.
TEST(TestXNNPackOps, TestGlobal) {
  // input, expected_result pair
  std::vector<std::pair<at::Tensor, at::Tensor>> input_result_pairs = {
      {torch::tensor(
           {{{{0.0852, 0.7312, 0.9943, 0.7105},
              {0.0956, 0.9072, 0.3124, 0.9362},
              {0.5878, 0.8883, 0.5086, 0.9494}},
             {{0.1056, 0.4968, 0.7740, 0.7593},
              {0.8519, 0.3543, 0.8078, 0.5517},
              {0.1413, 0.4608, 0.1706, 0.0314}}}},
           {torch::kFloat32}),
       torch::tensor({{{{0.6422}}, {{0.4588}}}}, {torch::kFloat32})},
      {torch::tensor(
           {{{{0.0280, 0.9073}, {0.2103, 0.5298}},
             {{0.5335, 0.9901}, {0.2902, 0.2955}}},
            {{{0.2363, 0.7024}, {0.7903, 0.8260}},
             {{0.3802, 0.5959}, {0.5749, 0.8855}}}},
           {torch::kFloat32}),
       torch::tensor(
           {{{{0.4188}}, {{0.5273}}}, {{{0.6388}}, {{0.6091}}}},
           {torch::kFloat32})}};

  for (const auto& input_result : input_result_pairs) {
    test_global_average_pool(input_result.first, input_result.second);
  }
}
305
// Custom gtest main: installs the mobile CPU allocator (high priority)
// before running the tests so both copy and no-copy XNNPACK code paths get
// exercised.
int main(int argc, char* argv[]) {
  // Setting default allocator as mobile to test copy / no copy cases
  c10::SetCPUAllocator(c10::GetDefaultMobileCPUAllocator(), /*priority*/ 100);
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
312 #endif
313