#include <gtest/gtest.h>

#include <filesystem>
#include <string>
#include <vector>

#include <torch/csrc/inductor/aoti_runner/model_container_runner_cpu.h>
#if defined(USE_CUDA) || defined(USE_ROCM)
#include <torch/csrc/inductor/aoti_runner/model_container_runner_cuda.h>
#endif
#include <torch/script.h>
#include <torch/torch.h>

#define STR_VALUE(x) #x
#define STRINGIZE(x) STR_VALUE(x)

namespace {

// Loads the AOTInductor-compiled model for `device`, runs it on the recorded
// inputs, and checks the outputs against the recorded reference outputs.
void test_aoti(const std::string& device, bool use_runtime_constant_folding) {
  torch::NoGradGuard no_grad;

  std::string data_path =
      (std::filesystem::path(STRINGIZE(CMAKE_CURRENT_BINARY_DIR)) / "data.pt")
          .string();
  torch::jit::script::Module data_loader = torch::jit::load(data_path);
  std::string suffix = use_runtime_constant_folding
      ? device + "_use_runtime_constant_folding"
      : device;
  std::string path_attr = "model_so_path_" + suffix;
  std::string inputs_attr = "inputs_" + suffix;
  std::string outputs_attr = "outputs_" + suffix;
  const auto& model_so_path = data_loader.attr(path_attr.c_str()).toStringRef();
  auto input_tensors =
      data_loader.attr(inputs_attr.c_str()).toTensorList().vec();
  const auto& ref_output_tensors =
      data_loader.attr(outputs_attr.c_str()).toTensorList().vec();

  std::unique_ptr<torch::inductor::AOTIModelContainerRunner> runner;
  if (device == "cpu") {
    runner = std::make_unique<torch::inductor::AOTIModelContainerRunnerCpu>(
        model_so_path);
#if defined(USE_CUDA) || defined(USE_ROCM)
  } else if (device == "cuda") {
    runner = std::make_unique<torch::inductor::AOTIModelContainerRunnerCuda>(
        model_so_path);
#endif
  } else {
    testing::AssertionFailure() << "unsupported device: " << device;
  }
  auto actual_output_tensors = runner->run(input_tensors);
  ASSERT_TRUE(
      torch::allclose(ref_output_tensors[0], actual_output_tensors[0]));
}

// Loads a TorchScript wrapper around the AOTInductor model and compares its
// forward() outputs against the recorded reference outputs.
void test_aoti_script(const std::string& device) {
  torch::NoGradGuard no_grad;

  std::string script_model = "script_model_" + device + ".pt";
  std::string model_path =
      (std::filesystem::path(STRINGIZE(CMAKE_CURRENT_BINARY_DIR)) /
       script_model.c_str())
          .string();
  torch::jit::script::Module model = torch::jit::load(model_path);

  std::string sample_data_path =
      (std::filesystem::path(STRINGIZE(CMAKE_CURRENT_BINARY_DIR)) /
       "script_data.pt")
          .string();
  torch::jit::script::Module sample_data = torch::jit::load(sample_data_path);
  std::string inputs_attr = "inputs_" + device;
  std::string outputs_attr = "outputs_" + device;
  const auto& inputs = sample_data.attr(inputs_attr.c_str()).toList().vec();
  const auto& ref_output_tensors =
      sample_data.attr(outputs_attr.c_str()).toTensorVector();

  auto outputs = model.forward(inputs).toTuple()->elements();
  ASSERT_EQ(outputs.size(), ref_output_tensors.size());
  for (size_t i = 0; i < ref_output_tensors.size(); i++) {
    ASSERT_TRUE(torch::allclose(outputs[i].toTensor(), ref_output_tensors[i]));
  }
}

// Exercises update_constant_buffer() on the active buffer: a map with missing
// entries must throw, random weights must change the outputs, and restoring
// the real weights must reproduce the reference outputs.
void test_aoti_constants_update(
    const std::string& device,
    bool use_runtime_constant_folding) {
  torch::NoGradGuard no_grad;

  std::string data_path =
      (std::filesystem::path(STRINGIZE(CMAKE_CURRENT_BINARY_DIR)) / "data.pt")
          .string();
  torch::jit::script::Module data_loader = torch::jit::load(data_path);
  std::string suffix = use_runtime_constant_folding
device + "_use_runtime_constant_folding" : device; std::string path_attr = "model_so_path_" + suffix; std::string inputs_attr = "inputs_" + suffix; std::string outputs_attr = "outputs_" + suffix; std::string weights_attr = "w_pre_" + suffix; std::string add_attr = "w_add_" + suffix; const auto& model_so_path = data_loader.attr(path_attr.c_str()).toStringRef(); auto input_tensors = data_loader.attr(inputs_attr.c_str()).toTensorList().vec(); const auto& ref_output_tensors = data_loader.attr(outputs_attr.c_str()).toTensorList().vec(); const auto& weight_tensors = data_loader.attr(weights_attr.c_str()).toTensor(); const auto& add_tensors = data_loader.attr(add_attr.c_str()).toTensor(); torch::inductor::TensorConstantMap missing_map, rand_map, real_map; missing_map.emplace("L__self___w_pre", new at::Tensor(at::randn({4, 4}))); rand_map.emplace("L__self___w_pre", new at::Tensor(at::randn({10}))); rand_map.emplace("L__self___w_add", new at::Tensor(at::randn({10}))); real_map.emplace("L__self___w_pre", new at::Tensor(weight_tensors)); real_map.emplace("L__self___w_add", new at::Tensor(add_tensors)); std::unique_ptr runner; if (device == "cpu") { runner = std::make_unique( model_so_path); #if defined(USE_CUDA) || defined(USE_ROCM) } else if (device == "cuda") { runner = std::make_unique( model_so_path); #endif } else { testing::AssertionFailure() << "unsupported device: " << device; } // By default, buffer #1 get loaded with burned in weights. Correct results. auto actual_output_tensors = runner->run(input_tensors); ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0])); // Update with missing map which should throw. EXPECT_THROW( runner->update_constant_buffer(missing_map, false, true), std::runtime_error); // Update random weight to buffer #1. runner->update_constant_buffer(missing_map, false, false); actual_output_tensors = runner->run(input_tensors); if (use_runtime_constant_folding) { // At this moment, this update is applied on the original weight. // The weight being consumed is "folded", so will have no affect. ASSERT_TRUE( torch::allclose(ref_output_tensors[0], actual_output_tensors[0])); runner->run_const_fold(/* use_inactive = */ false); actual_output_tensors = runner->run(input_tensors); } ASSERT_FALSE( torch::allclose(ref_output_tensors[0], actual_output_tensors[0])); // Update with real map. runner->update_constant_buffer(real_map, false, false); actual_output_tensors = runner->run(input_tensors); if (use_runtime_constant_folding) { runner->run_const_fold(/* use_inactive = */ false); } actual_output_tensors = runner->run(input_tensors); ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0])); // Update with full random map. runner->update_constant_buffer(rand_map, false, false); if (use_runtime_constant_folding) { runner->run_const_fold(/* use_inactive = */ false); } actual_output_tensors = runner->run(input_tensors); ASSERT_FALSE( torch::allclose(ref_output_tensors[0], actual_output_tensors[0])); } void test_aoti_double_buffering( const std::string& device, bool use_runtime_constant_folding) { torch::NoGradGuard no_grad; std::string data_path = (std::filesystem::path(STRINGIZE(CMAKE_CURRENT_BINARY_DIR)) / "data.pt") .string(); torch::jit::script::Module data_loader = torch::jit::load(data_path); std::string suffix = use_runtime_constant_folding ? 
device + "_use_runtime_constant_folding" : device; std::string path_attr = "model_so_path_" + suffix; std::string inputs_attr = "inputs_" + suffix; std::string outputs_attr = "outputs_" + suffix; std::string weights_attr = "w_pre_" + suffix; std::string add_attr = "w_add_" + suffix; const auto& model_so_path = data_loader.attr(path_attr.c_str()).toStringRef(); auto input_tensors = data_loader.attr(inputs_attr.c_str()).toTensorList().vec(); const auto& ref_output_tensors = data_loader.attr(outputs_attr.c_str()).toTensorList().vec(); const auto& weight_tensors = data_loader.attr(weights_attr.c_str()).toTensor(); const auto& add_tensors = data_loader.attr(add_attr.c_str()).toTensor(); torch::inductor::TensorConstantMap rand_map, real_map; rand_map.emplace("L__self___w_pre", new at::Tensor(at::randn({4, 4}))); rand_map.emplace("L__self___w_add", new at::Tensor(at::randn({4, 4}))); real_map.emplace("L__self___w_pre", new at::Tensor(weight_tensors)); real_map.emplace("L__self___w_add", new at::Tensor(add_tensors)); std::unique_ptr runner; if (device == "cpu") { runner = std::make_unique( model_so_path); #if defined(USE_CUDA) || defined(USE_ROCM) } else if (device == "cuda") { runner = std::make_unique( model_so_path); #endif } else { testing::AssertionFailure() << "unsupported device: " << device; } // By default, buffer #1 get loaded with burned in weights. Correct results. auto actual_output_tensors = runner->run(input_tensors); ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0])); // We update the weights to buffer #2 and activate it. This should still // produce correct result, as it's the real constant map. runner->update_inactive_constant_buffer(real_map); if (use_runtime_constant_folding) { runner->run_const_fold(/* use_inactive = */ true); } runner->swap_constant_buffer(); actual_output_tensors = runner->run(input_tensors); ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0])); // We update random weights to buffer #1. But do not swap in the weight yet. runner->update_inactive_constant_buffer(rand_map); if (use_runtime_constant_folding) { runner->run_const_fold(/* use_inactive = */ true); } actual_output_tensors = runner->run(input_tensors); ASSERT_TRUE(torch::allclose(ref_output_tensors[0], actual_output_tensors[0])); // We swap and activate the weight to buffer #1. This is random weight and // should produce incorrect results. runner->swap_constant_buffer(); actual_output_tensors = runner->run(input_tensors); ASSERT_FALSE( torch::allclose(ref_output_tensors[0], actual_output_tensors[0])); // Swap back to buffer #2 which is the real constants. 
  runner->swap_constant_buffer();
  actual_output_tensors = runner->run(input_tensors);
  ASSERT_TRUE(
      torch::allclose(ref_output_tensors[0], actual_output_tensors[0]));
}

#if defined(USE_CUDA) || defined(USE_ROCM)
// Double buffering should also carry over non-parameter tensor constants, so
// a map containing only the weight still produces correct results after the
// swap.
void test_aoti_double_buffering_with_tensor_constants() {
  torch::NoGradGuard no_grad;

  std::string data_path =
      (std::filesystem::path(STRINGIZE(CMAKE_CURRENT_BINARY_DIR)) /
       "data_with_tensor_constants.pt")
          .string();
  torch::jit::script::Module data_loader = torch::jit::load(data_path);
  std::string path_attr = "model_so_path";
  std::string inputs_attr = "inputs";
  std::string w_attr = "w";
  std::string outputs_attr = "outputs";
  const auto& model_so_path = data_loader.attr(path_attr.c_str()).toStringRef();
  auto input_tensors =
      data_loader.attr(inputs_attr.c_str()).toTensorList().vec();
  const auto& w_tensors = data_loader.attr(w_attr.c_str()).toTensor();
  const auto& ref_output_tensors =
      data_loader.attr(outputs_attr.c_str()).toTensorList().vec();

  torch::inductor::TensorConstantMap real_map;
  real_map.emplace("L__self___w", new at::Tensor(w_tensors));

  std::unique_ptr<torch::inductor::AOTIModelContainerRunner> runner;
  runner = std::make_unique<torch::inductor::AOTIModelContainerRunnerCuda>(
      model_so_path.c_str());

  // By default, buffer #1 is loaded with the burned-in weights and produces
  // correct results.
  auto actual_output_tensors = runner->run(input_tensors);
  ASSERT_TRUE(
      torch::allclose(ref_output_tensors[0], actual_output_tensors[0]));

  // Update the weights in buffer #2 and activate it. This should still
  // produce correct results, since the tensor constants are copied over as
  // well.
  runner->update_inactive_constant_buffer(real_map);
  runner->swap_constant_buffer();
  actual_output_tensors = runner->run(input_tensors);
  ASSERT_TRUE(
      torch::allclose(ref_output_tensors[0], actual_output_tensors[0]));
}
#endif

} // namespace

namespace torch {
namespace aot_inductor {

TEST(AotInductorTest, BasicTestCpu) {
  test_aoti("cpu", false);
}

TEST(AotInductorTest, BasicScriptTestCpu) {
  test_aoti_script("cpu");
}

#ifdef USE_CUDA
TEST(AotInductorTest, BasicTestCuda) {
  test_aoti("cuda", true);
  test_aoti("cuda", false);
}

TEST(AotInductorTest, BasicScriptTestCuda) {
  test_aoti_script("cuda");
}

TEST(AotInductorTest, RuntimeUpdateConstantsCuda) {
  test_aoti_constants_update("cuda", true);
}

TEST(AotInductorTest, UpdateConstantsCuda) {
  test_aoti_constants_update("cuda", false);
}

TEST(AotInductorTest, RuntimeUpdateInactiveConstantsCuda) {
  test_aoti_double_buffering("cuda", true);
}

TEST(AotInductorTest, UpdateInactiveConstantsCuda) {
  test_aoti_double_buffering("cuda", false);
}

TEST(AotInductorTest, UpdateInactiveConstantsWithTensorConstantsCuda) {
  test_aoti_double_buffering_with_tensor_constants();
}
#endif

} // namespace aot_inductor
} // namespace torch