• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <gtest/gtest.h>
2 
3 #include <torch/types.h>
4 #include <torch/utils.h>
5 
6 #include <ATen/native/xnnpack/Common.h>
7 #include <ATen/native/xnnpack/Engine.h>
8 #include <ATen/native/xnnpack/OpContext.h>
9 #include <ATen/native/xnnpack/Pooling.h>
10 #include <c10/core/CPUAllocator.h>
11 #include <c10/core/MemoryFormat.h>
12 
13 #include <atomic>
14 #include <condition_variable>
15 #include <thread>
16 
17 #if defined(C10_MOBILE) && defined(USE_XNNPACK)
18 
checkRtol(const at::Tensor & diff,const std::vector<at::Tensor> inputs)19 bool checkRtol(const at::Tensor& diff, const std::vector<at::Tensor> inputs) {
20   double maxValue = 0.0;
21   for (auto& tensor : inputs) {
22     maxValue = fmax(tensor.abs().max().item<float>(), maxValue);
23   }
24   return diff.abs().max().item<float>() < (0.01 + 2e-2 * maxValue);
25 }
almostEqual(const at::Tensor & a,const at::Tensor & b)26 bool almostEqual(const at::Tensor& a, const at::Tensor& b) {
27   return checkRtol(a - b, {a, b});
28 }
29 
exactlyEqual(const at::Tensor & a,const at::Tensor & b)30 bool exactlyEqual(const at::Tensor& a, const at::Tensor& b) {
31   return (a - b).abs().max().item<float>() == 0.f;
32 }
33 
test_hardswish(const at::Tensor & input,const at::Tensor & expected)34 void test_hardswish(const at::Tensor& input, const at::Tensor& expected) {
35   ASSERT_TRUE(at::native::xnnpack::use_hardswish(input));
36   auto result = at::native::xnnpack::hardswish(input);
37   auto check = almostEqual(expected, result);
38   ASSERT_TRUE(check);
39   ASSERT_TRUE(
40       expected.suggest_memory_format() == input.suggest_memory_format());
41 }
42 
test_hardswish_(at::Tensor input,const at::Tensor & expected)43 void test_hardswish_(at::Tensor input, const at::Tensor& expected) {
44   ASSERT_TRUE(at::native::xnnpack::use_hardswish(input));
45   at::native::xnnpack::hardswish_(input);
46   auto check = almostEqual(expected, input);
47   ASSERT_TRUE(check);
48   ASSERT_TRUE(
49       expected.suggest_memory_format() == input.suggest_memory_format());
50 }
51 
test_global_average_pool(at::Tensor input,const at::Tensor & expected)52 void test_global_average_pool(at::Tensor input, const at::Tensor& expected) {
53   ASSERT_TRUE(at::native::xnnpack::use_global_average_pool(input));
54   auto result = at::native::xnnpack::global_average_pool(input);
55   auto check = almostEqual(expected, result);
56   ASSERT_TRUE(check);
57 }
58 
59 // Since XNNPACK path is only taken #if defined(C10_MOBILE) &&
60 // defined(USE_XNNPACK) We can't compare regular CPU path with XNNPACK path in
61 // the same test binary Instead we precompute regular results and compare with
62 // XNNPACK path here
TEST(TestXNNPackOps,TestLinear)63 TEST(TestXNNPackOps, TestLinear) {
64   constexpr std::array<int64_t, 2u> input_shape{1, 37};
65   constexpr std::array<int64_t, 2u> weight_shape{41, 37};
66   constexpr std::array<int64_t, 2u> bias_shape{1, 41};
67   const auto input_cpu =
68       at::rand(input_shape, at::device(at::kCPU).dtype(at::kFloat));
69   const auto weight =
70       at::rand(weight_shape, at::device(at::kCPU).dtype(at::kFloat));
71   const auto bias =
72       at::rand(bias_shape, at::device(at::kCPU).dtype(at::kFloat));
73 
74   const auto out_cpu = at::linear(input_cpu, weight, bias);
75 
76   const auto xnnpack_bias = bias.view({41});
77   ASSERT_TRUE(at::native::xnnpack::use_linear(input_cpu, weight, xnnpack_bias));
78   const auto result =
79       at::native::xnnpack::linear(input_cpu, weight, xnnpack_bias);
80 
81   auto check = almostEqual(out_cpu, result);
82   ASSERT_TRUE(check);
83 }
84 
// Compares XNNPACK max_pool2d against the regular ATen CPU reference.
TEST(TestXNNPackOps, TestMaxPool2d) {
  const auto in_cpu =
      at::rand({5, 13, 55, 68}, at::TensorOptions(at::kCPU).dtype(at::kFloat));

  // Reference: at::max_pool2d(input, kernel, stride, padding, dilation, ceil).
  const auto out_cpu =
      at::max_pool2d(in_cpu, {3, 4}, {2, 1}, {1, 1}, {1, 1}, false);

  // NOTE(review): the XNNPACK entry points appear to take padding before
  // stride (opposite of at::max_pool2d) — the argument sets below are the
  // same values with {1,1} and {2,1} swapped relative to the call above.
  ASSERT_TRUE(at::native::xnnpack::use_max_pool2d(
      in_cpu, {3, 4}, {1, 1}, {2, 1}, {1, 1}, false));
  const auto result = at::native::xnnpack::max_pool2d(
      in_cpu, {3, 4}, {1, 1}, {2, 1}, {1, 1}, false);

  ASSERT_TRUE(almostEqual(out_cpu, result));
}
98 
// Compares XNNPACK convolution2d against the regular ATen conv2d reference.
TEST(TestXNNPackOps, TestConvolution2d) {
  constexpr int64_t groups = 1;
  constexpr std::array<int64_t, 2u> stride{2, 2};
  constexpr std::array<int64_t, 2u> padding{1, 1};
  constexpr std::array<int64_t, 2u> dilation{1, 1};

  // Input-shape descriptor; size() widens the fields into the int64_t
  // shape array ATen expects.
  struct InputSpec {
    uint32_t batches;
    uint32_t channels;
    uint32_t width;
    uint32_t height;

    constexpr std::array<int64_t, 4u> size() const {
      return {batches, channels, width, height};
    }
  };

  // Weight-shape descriptor (output channels first).
  struct WeightSpec {
    uint32_t output_channels;
    uint32_t input_channels;
    uint32_t width;
    uint32_t height;

    constexpr std::array<int64_t, 4u> size() const {
      return {output_channels, input_channels, width, height};
    }
  };

  constexpr InputSpec input{1, 3, 8, 8};
  constexpr WeightSpec weights{1, input.channels, 3, 3};

  const auto input_cpu =
      at::randn(input.size(), at::device(at::kCPU).dtype(at::kFloat));
  const auto weights_cpu =
      at::randn(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
  const auto bias_cpu = at::randn(
      {weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));

  // Reference result from the regular ATen CPU path.
  const auto output_cpu = at::conv2d(
      input_cpu, weights_cpu, bias_cpu, stride, padding, dilation, groups);

  ASSERT_TRUE(at::native::xnnpack::use_convolution2d(
      input_cpu,
      weights_cpu,
      weights.output_channels,
      padding,
      stride,
      dilation,
      groups,
      false));
  const auto result = at::native::xnnpack::convolution2d(
      input_cpu, weights_cpu, bias_cpu, padding, stride, dilation, groups);

  ASSERT_TRUE(almostEqual(output_cpu, result));
}
161 
// Exercises both out-of-place and in-place hardswish over a mix of
// contiguous, sliced (non-contiguous), and channels-last inputs.
TEST(TestXNNPackOps, TestHardSwish) {
  auto in = torch::tensor({{1, 1}, {1, 1}}, {torch::kFloat32});
  // Non-contiguous slice to cover the strided-input path.
  auto in_slice = in.index({"...", 0});

  // Each entry pairs an input with its precomputed hardswish result.
  const std::vector<std::pair<at::Tensor, at::Tensor>> cases = {
      {torch::tensor({1, 2, 3, 4, 5}, {torch::kFloat32}),
       torch::tensor(
           {0.6667, 1.6667, 3.0000, 4.0000, 5.0000}, {torch::kFloat32})},
      {torch::tensor({0.3330}, {torch::kFloat32}),
       torch::tensor({0.1850}, {torch::kFloat32})},
      {torch::tensor({{0.4523, 0.8131, 0.9829}, {0.0782, 0.7395, 0.0787}}),
       torch::tensor({{0.2602, 0.5167, 0.6525}, {0.0401, 0.4609, 0.0404}})},
      {in_slice, torch::tensor({0.6667, 0.6667}, {torch::kFloat32})},
      {torch::tensor({{{{0.4993, 0.3835}, {0.3163, 0.2348}},
                       {{0.4705, 0.4129}, {0.9314, 0.0631}}},
                      {{{0.0030, 0.5656}, {0.1413, 0.1943}},
                       {{0.1380, 0.1985}, {0.2746, 0.8109}}}})
           .contiguous(at::MemoryFormat::ChannelsLast),
       torch::tensor({{{{0.2912, 0.2163}, {0.1748, 0.1266}},
                       {{0.2722, 0.2349}, {0.6103, 0.0322}}},
                      {{{0.0015, 0.3361}, {0.0740, 0.1034}},
                       {{0.0722, 0.1058}, {0.1499, 0.5150}}}})
           .contiguous(at::MemoryFormat::ChannelsLast)}};

  for (const auto& [input, expected] : cases) {
    test_hardswish(input, expected);
    test_hardswish_(input, expected);
  }
}
192 
// Stress-tests a single shared XNNPackConv2dOpContext from five threads at
// once. The threads rendezvous on a condition variable so all of them start
// hammering context->run() at (roughly) the same time, each with a different
// input spatial size, to shake out races in the op context.
TEST(TestXNNPackOps, TestConvolution2dMultiThreaded) {
  constexpr int64_t groups = 1;

  // Input-shape descriptor; size() widens the uint32_t fields into the
  // int64_t shape array ATen expects.
  constexpr struct {
    uint32_t batches;
    uint32_t channels;
    uint32_t width;
    uint32_t height;

    std::array<int64_t, 4u> size() const {
      return {
          batches,
          channels,
          width,
          height,
      };
    }
  } input{1, 3, 8, 8};

  // Weight-shape descriptor (output channels first).
  constexpr struct {
    uint32_t output_channels;
    uint32_t input_channels;
    uint32_t width;
    uint32_t height;

    std::array<int64_t, 4u> size() const {
      return {
          output_channels,
          input_channels,
          width,
          height,
      };
    }
  } weights{1, input.channels, 3, 3};

  // NOTE(review): input_cpu appears unused below — each worker generates its
  // own randn input. Kept as-is; confirm before removing.
  const auto input_cpu =
      at::randn(input.size(), at::device(at::kCPU).dtype(at::kFloat));
  auto weights_cpu =
      at::randn(weights.size(), at::device(at::kCPU).dtype(at::kFloat));
  auto bias_cpu = at::randn(
      {weights.output_channels}, at::device(at::kCPU).dtype(at::kFloat));

  // Single op context shared (without external locking) by all workers.
  auto context = at::native::xnnpack::XNNPackConv2dOpContext::create_context(
      std::move(weights_cpu), std::move(bias_cpu), {1, 1}, {2, 2}, {1, 1}, groups, std::nullopt, std::nullopt);
  std::atomic<int64_t> count{0};
  int64_t num_workers = 5;
  std::mutex lock;
  std::condition_variable cond;
  // Blocks until all num_workers threads have arrived, then runs the shared
  // conv context repeatedly on a (1, 3, h, w) input.
  auto sync_and_run_conv = [&](int64_t h, int64_t w) -> at::Tensor
  {
    auto input_tensor = at::randn({1, 3, h, w}, at::device(at::kCPU).dtype(at::kFloat));
    int64_t count_val = ++count;
    if (count_val < num_workers) {
      // Not the last arrival: wait until every worker has incremented count.
      std::unique_lock<std::mutex> g(lock);
      while ((count_val = count.load()) < num_workers) {
        cond.wait(g, [&]() {
            auto new_val = count.load();
            return new_val >= num_workers;});
      }
    } else {
      // Last arrival: release everyone waiting on the barrier.
      std::unique_lock<std::mutex> g(lock);
      cond.notify_all();
    }
    // Warm-up / stress iterations on the shared context.
    for (int64_t i = 0; i < 30; i++) {
      context->run(input_tensor);
    }
    return context->run(input_tensor);
  };

  // Thin wrapper copied into each thread; the captured lambda itself refers
  // to the locals above by reference, which is safe because all threads are
  // joined before this scope exits.
  auto conv = [sync_and_run_conv](int64_t h, int64_t w) -> at::Tensor
  {
    return sync_and_run_conv(h, w);
  };

  // Five workers, each with a distinct spatial size.
  std::thread t1(conv, 16, 16);
  std::thread t2(conv, 12, 12);
  std::thread t3(conv, 20, 20);
  std::thread t4(conv, 22, 22);
  std::thread t5(conv, 8, 8);
  t1.join();
  t2.join();
  t3.join();
  t4.join();
  t5.join();
}
278 
// Compares XNNPACK global average pooling against precomputed CPU results.
TEST(TestXNNPackOps, TestGlobal) {
  // Each entry pairs an input with its precomputed global-average-pool
  // result.
  const std::vector<std::pair<at::Tensor, at::Tensor>> cases = {
      {torch::tensor(
           {{{{0.0852, 0.7312, 0.9943, 0.7105},
              {0.0956, 0.9072, 0.3124, 0.9362},
              {0.5878, 0.8883, 0.5086, 0.9494}},
             {{0.1056, 0.4968, 0.7740, 0.7593},
              {0.8519, 0.3543, 0.8078, 0.5517},
              {0.1413, 0.4608, 0.1706, 0.0314}}}},
           {torch::kFloat32}),
       torch::tensor({{{{0.6422}}, {{0.4588}}}}, {torch::kFloat32})},
      {torch::tensor(
           {{{{0.0280, 0.9073}, {0.2103, 0.5298}},
             {{0.5335, 0.9901}, {0.2902, 0.2955}}},
            {{{0.2363, 0.7024}, {0.7903, 0.8260}},
             {{0.3802, 0.5959}, {0.5749, 0.8855}}}},
           {torch::kFloat32}),
       torch::tensor(
           {{{{0.4188}}, {{0.5273}}}, {{{0.6388}}, {{0.6091}}}},
           {torch::kFloat32})}};

  for (const auto& [input, expected] : cases) {
    test_global_average_pool(input, expected);
  }
}
305 
// Custom main (instead of linking gtest_main) so the mobile CPU allocator
// can be installed before any test allocates tensors.
int main(int argc, char* argv[]) {
  // Setting default allocator as mobile to test copy / no copy cases
  c10::SetCPUAllocator(c10::GetDefaultMobileCPUAllocator(), /*priority*/ 100);
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
312 #endif
313