# Owner(s): ["module: inductor"]
import copy
import sys
import unittest

import torch
from torch._inductor import config
from torch._inductor.package import load_package
from torch._inductor.test_case import TestCase
from torch.testing._internal import common_utils
from torch.testing._internal.common_utils import IS_FBCODE
from torch.testing._internal.triton_utils import HAS_CUDA


try:
    try:
        from .test_torchinductor import copy_tests
    except ImportError:
        from test_torchinductor import copy_tests
except (unittest.SkipTest, ImportError):
    if __name__ == "__main__":
        sys.exit(0)
    raise


def compile(model, example_inputs, dynamic_shapes, options, device):
    # Export the model, AOT-compile it into a package, then load the
    # package back as a runnable compiled model.
    ep = torch.export.export(
        model,
        example_inputs,
        dynamic_shapes=dynamic_shapes,
        strict=False,
    )
    gm = ep.module()
    package_path = torch._inductor.aot_compile(gm, example_inputs, options=options)  # type: ignore[arg-type]
    compiled_model = load_package(package_path, device)
    return compiled_model


def check_model(
    self: TestCase,
    model,
    example_inputs,
    options=None,
    dynamic_shapes=None,
    disable_constraint_solver=False,
    atol=None,
    rtol=None,
):
    with torch.no_grad(), config.patch(
        {
            "aot_inductor.package": True,
            # TODO: "aot_inductor.force_mmap_weights": True,
        }
    ):
        # Run the eager model on deep copies to get the reference output.
        torch.manual_seed(0)
        model = model.to(self.device)
        ref_model = copy.deepcopy(model)
        ref_inputs = copy.deepcopy(example_inputs)
        expected = ref_model(*ref_inputs)

        # Re-seed so the compiled run sees identical randomness, then
        # compare its output against the eager reference.
        torch.manual_seed(0)
        compiled_model = compile(
            model,
            example_inputs,
            dynamic_shapes,
            options,
            self.device,
        )
        actual = compiled_model(*example_inputs)

        self.assertEqual(actual, expected, atol=atol, rtol=rtol)


class AOTInductorTestsTemplate:
    def test_add(self):
        class Model(torch.nn.Module):
            def forward(self, x, y):
                return x + y

        example_inputs = (
            torch.randn(10, 10, device=self.device),
            torch.randn(10, 10, device=self.device),
        )
        self.check_model(Model(), example_inputs)

    def test_linear(self):
        class Model(torch.nn.Module):
            def __init__(self) -> None:
                super().__init__()
                self.linear = torch.nn.Linear(10, 10)

            def forward(self, x, y):
                return x + self.linear(y)

        example_inputs = (
            torch.randn(10, 10, device=self.device),
            torch.randn(10, 10, device=self.device),
        )
        self.check_model(Model(), example_inputs)


common_utils.instantiate_parametrized_tests(AOTInductorTestsTemplate)


@unittest.skipIf(sys.platform == "darwin" or IS_FBCODE, "No CUDA on MacOS")
class AOTInductorTestPackagedABICompatibleCuda(TestCase):
    device = "cuda"
    check_model = check_model


copy_tests(
    AOTInductorTestsTemplate,
    AOTInductorTestPackagedABICompatibleCuda,
    "packaged_abi_compatible_cuda",
)


@unittest.skipIf(IS_FBCODE, "This is for OSS only")
class AOTInductorTestPackagedABICompatibleCpu(TestCase):
    device = "cpu"
    check_model = check_model


copy_tests(
    AOTInductorTestsTemplate,
    AOTInductorTestPackagedABICompatibleCpu,
    "packaged_abi_compatible_cpu",
)


if __name__ == "__main__":
    from torch._inductor.test_case import run_tests

    # cpp_extension N/A in fbcode
    if HAS_CUDA or sys.platform == "darwin":
        run_tests(needs="filelock")