Python环境下进行benchmark

# from /home/oldpan/code/convert/pytorch/torch/_inductor/utils.py
def timed(
    model: Callable[..., Any], example_inputs, times: int = 1, device: str = "cuda"
) -> float:
    synchronize(device)
    torch.manual_seed(1337)
    t0 = time.perf_counter()
    for _ in range(times):
        result = model(*example_inputs)
        synchronize(device)
    t1 = time.perf_counter()
    # GC the result after timing
    assert result is not None
    return t1 - t0


def print_performance(
    fn, args=(), times=10, repeat=10, baseline=1.0, device: str = "cuda"
):
    timings = torch.tensor([timed(fn, args, times, device) for _ in range(repeat)])
    took = torch.median(timings) / times
    print(f"{took/baseline:.6f}")
    return took

# usage
def benchmark_compiled_module(times=10, repeat=10):
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance
    global _tensor_constant0
    _tensor_constant0 = rand_strided((19, ), (1, ), device='cpu', dtype=torch.int64)
    arg0_1 = rand_strided((1, ), (1, ), device='cpu', dtype=torch.float32)
    return print_performance(lambda: call([arg0_1]), times=times, repeat=repeat)

参考