@@ -41,133 +41,6 @@ def _raise_timeout(signum, frame):
     raise TimeoutError()
 
 
-@create_backend
-def fx2trt(subgraph, **kwargs):
-    if subgraph.will_tensorrt_barf():
-        # TensorRT fails violently with an abort() on this
-        return None
-
-    from torch_tensorrt.fx.fx2trt import (  # type: ignore[import]
-        InputTensorSpec,
-        TRTInterpreter,
-    )
-    from torch_tensorrt.fx.passes.lower_basic_pass import (  # type: ignore[import]
-        transform_setitem,
-    )
-    from torch_tensorrt.fx.tools.trt_splitter import (  # type: ignore[import]
-        TRTSplitter,
-        TRTSplitterSetting,
-    )
-    from torch_tensorrt.fx.tracer.acc_tracer import acc_tracer  # type: ignore[import]
-    from torch_tensorrt.fx.trt_module import TRTModule  # type: ignore[import]
-    from torch_tensorrt.fx.utils import LowerPrecision  # type: ignore[import]
-
-    try:
-        model = subgraph.model
-        inputs = subgraph.example_inputs
-        # pass rewrite
-        model = transform_setitem(model, inputs)
-        acc_model = acc_tracer.trace(model, inputs)
-        # Split out unsupported ops
-        splitter_setting = TRTSplitterSetting()
-        splitter_setting.use_implicit_batch_dim = False
-        splitter = TRTSplitter(acc_model, inputs, settings=splitter_setting)
-        splitter.node_support_preview()
-        split_mod = splitter()
-        num_piece = 0
-        for name, _ in split_mod.named_children():
-            print(f"graph is split into {name}")
-            num_piece += 1
-
-        # if the graph module is split into pieces larger than 8, we consider its perf
-        # is not good and fall back to non-TRT
-        if num_piece > 8:
-            print(
-                f"The graph module is split into {num_piece} which is larger than the \
-threshold=8. Fall back to non-TRT module."
-            )
-            return None
-
-        if "fp16_mode" in kwargs and kwargs["fp16_mode"]:
-            precision = LowerPrecision.FP16
-        else:
-            precision = LowerPrecision.FP32
-
-        def get_submod_inputs(mod, submod, inputs):
-            acc_inputs = None
-
-            def get_input(self, inputs):
-                nonlocal acc_inputs
-                acc_inputs = inputs
-
-            handle = submod.register_forward_pre_hook(get_input)
-            mod(*inputs)
-            handle.remove()
-            return acc_inputs
-
-        for name, _ in split_mod.named_children():
-            if "_run_on_acc" in name:
-                submod = getattr(split_mod, name)
-                # print("acc=", submod.code)
-                # Get submodule inputs for fx2trt
-                acc_inputs = get_submod_inputs(split_mod, submod, inputs)
-
-                # fx2trt replacement
-                interp = TRTInterpreter(
-                    submod,
-                    InputTensorSpec.from_tensors(acc_inputs),
-                    explicit_batch_dimension=True,
-                )
-                r = interp.run(
-                    max_workspace_size=20 << 30,
-                    lower_precision=precision,
-                    # profiling_verbosity=trt.ProfilingVerbosity.DETAILED,  # For profile
-                )
-                # For profile
-                # from fx2trt_oss.fx.tools.trt_profiler_sorted import profile_trt_module
-                # profile_trt_module("", trt_mod, acc_inputs)
-                trt_mod = TRTModule(*r)
-
-                setattr(split_mod, name, trt_mod)
-            else:
-                submod = getattr(split_mod, name)
-                # print("gpu=", submod.code)
-        return subgraph.wrap_returns(split_mod)
-    except Exception:
-        log.exception("FX2TRT conversion error")
-        return None
-
-
-@create_backend
-def torch2trt(subgraph):
-    if subgraph.will_tensorrt_barf():
-        # TensorRT fails violently with an abort() on this
-        return None
-
-    from torch2trt import torch2trt  # type: ignore[import]
-
-    inputs = subgraph.example_inputs
-    trt_mod = torch2trt(
-        subgraph.model,
-        inputs,
-        max_batch_size=len(inputs[0]),
-        strict_type_constraints=True,
-    )
-    return subgraph.wrap_returns(trt_mod)
-
-
-@create_backend
-def tensorrt(subgraph):
-    if subgraph.will_tensorrt_barf():
-        # TensorRT fails violently with an abort() on this
-        return None
-
-    model = fx2trt(subgraph)
-    if model is None:
-        model = torch2trt(subgraph)
-    return model
-
-
 def tvm_compile(jit_mod, example_inputs, log_file=None, **kwargs):
     if jit_mod is None:
         return None
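The removed fx2trt backend's get_submod_inputs helper captures the concrete inputs of a split-out submodule by registering a forward pre-hook on it, running the parent module once, and recording the arguments the hook observes. Below is a minimal, self-contained sketch of that technique using only stock torch.nn APIs; the capture_submod_inputs name and the toy nn.Sequential model are illustrative assumptions, not part of this patch:

import torch
import torch.nn as nn

def capture_submod_inputs(parent, submod, inputs):
    # Run `parent` once and record the positional args `submod` receives.
    captured = None

    def pre_hook(module, args):
        # Forward pre-hooks are invoked as hook(module, positional_args).
        nonlocal captured
        captured = args

    handle = submod.register_forward_pre_hook(pre_hook)
    try:
        parent(*inputs)  # one forward pass; the hook fires when submod runs
    finally:
        handle.remove()  # detach the hook even if the forward pass raises
    return captured

# Hypothetical usage: capture what the last layer sees during a forward pass.
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
args = capture_submod_inputs(net, net[2], (torch.randn(1, 4),))
print([t.shape for t in args])  # [torch.Size([1, 8])]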
@@ -403,27 +276,3 @@ def ipex(subgraph):
     except Exception:
         log.warning("JIT trace failed during the 'ipex' optimize process.")
     return model
-
-
-def fx2trt_compiler_fp16(gm: torch.fx.GraphModule, example_inputs):
-    kwargs_fx2trt = {"fp16_mode": True}
-    trt_compiled = fx2trt(gm, example_inputs, **kwargs_fx2trt)
-    if trt_compiled is not None:
-        return trt_compiled
-    else:
-        print(
-            "FX2TRT conversion failed on the subgraph. Return GraphModule forward instead"
-        )
-        return gm.forward
-
-
-def fx2trt_compiler(gm: torch.fx.GraphModule, example_inputs):
-    kwargs_fx2trt = {"fp16_mode": False}
-    trt_compiled = fx2trt(gm, example_inputs, **kwargs_fx2trt)
-    if trt_compiled is not None:
-        return trt_compiled
-    else:
-        print(
-            "FX2TRT conversion failed on the subgraph. Return GraphModule forward instead"
-        )
-        return gm.forward
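Both removed wrappers share one defensive shape: attempt the TensorRT compile, and if it yields nothing, hand back the unmodified gm.forward so a conversion failure never breaks the model. A hedged sketch of that fallback pattern for a dynamo-style backend follows; with_eager_fallback and compile_fn are hypothetical names introduced for illustration, not identifiers from this patch:

import logging
from typing import Callable, Optional

import torch

log = logging.getLogger(__name__)

def with_eager_fallback(compile_fn):
    # Wrap a fallible compiler so failures degrade to eager execution.
    def backend(gm: torch.fx.GraphModule, example_inputs):
        try:
            compiled: Optional[Callable] = compile_fn(gm, example_inputs)
        except Exception:
            log.exception("compile failed; falling back to eager")
            compiled = None
        # None mirrors the removed fx2trt contract: "could not convert".
        return compiled if compiled is not None else gm.forward

    return backend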