# from /home/oldpan/code/convert/pytorch/torch/_inductor/utils.py
def timed(
    model: Callable[..., Any], example_inputs, times: int = 1, device: str = "cuda"
) -> float:
    """Return the wall-clock seconds taken by ``times`` calls of ``model``.

    Args:
        model: Callable invoked as ``model(*example_inputs)``.
        example_inputs: Positional arguments unpacked into every call.
        times: Number of back-to-back calls to time; must be at least 1.
        device: Forwarded to ``synchronize`` before timing starts and after
            each call (presumably so pending device work is included in the
            measurement -- ``synchronize`` is defined outside this block).

    Returns:
        Elapsed ``time.perf_counter()`` seconds across all ``times`` calls.
        This is the total, not a per-call average.

    Raises:
        ValueError: If ``times`` is less than 1.
        AssertionError: If the last call of ``model`` returned ``None``.
    """
    if times < 1:
        # With zero iterations ``result`` would never be bound, and the check
        # after the loop would fail with an opaque UnboundLocalError.
        raise ValueError(f"times must be >= 1, got {times}")

    synchronize(device)
    # Fixed seed so every timed run starts from the same RNG state.
    torch.manual_seed(1337)
    t0 = time.perf_counter()
    for _ in range(times):
        result = model(*example_inputs)
        # NOTE(review): placed inside the loop so each call is synchronized
        # before the next one starts -- confirm against the original layout.
        synchronize(device)
    t1 = time.perf_counter()
    # GC the result after timing
    assert result is not None
    return t1 - t0
def print_performance(
    fn, args=(), times=10, repeat=10, baseline=1.0, device: str = "cuda"
):
    """Benchmark ``fn`` and print its median per-call time relative to ``baseline``.

    ``timed`` is invoked ``repeat`` times, each measurement covering ``times``
    calls of ``fn(*args)``. The median measurement is divided by ``times`` to
    get a per-call figure, which is printed (divided by ``baseline``, six
    decimal places) and returned.

    Args:
        fn: Callable to benchmark.
        args: Positional arguments forwarded to ``fn`` through ``timed``.
        times: Calls per measurement.
        repeat: Number of measurements to take the median over.
        baseline: Divisor applied to the printed value only.
        device: Forwarded to ``timed``.

    Returns:
        The median per-call time as a 0-dim tensor (not divided by
        ``baseline``).
    """
    measurements = [timed(fn, args, times, device) for _ in range(repeat)]
    median_total = torch.tensor(measurements).median()
    took = median_total / times
    print(f"{took/baseline:.6f}")
    return took
# usage
def benchmark_compiled_module(times=10, repeat=10):
    """Benchmark the module-level ``call`` entry point on random CPU inputs.

    Rebinds the global ``_tensor_constant0`` to a freshly generated int64
    tensor, builds one float32 input tensor, and hands a zero-argument wrapper
    around ``call`` to ``print_performance``.

    Args:
        times: Forwarded to ``print_performance``.
        repeat: Forwarded to ``print_performance``.

    Returns:
        Whatever ``print_performance`` returns.
    """
    from torch._dynamo.testing import rand_strided
    from torch._inductor.utils import print_performance

    global _tensor_constant0
    _tensor_constant0 = rand_strided(
        (19,), (1,), device='cpu', dtype=torch.int64
    )
    example_input = rand_strided((1,), (1,), device='cpu', dtype=torch.float32)

    def run_once():
        # A new list is built on every invocation, exactly as the argument
        # expression would be re-evaluated on each call.
        return call([example_input])

    return print_performance(run_once, times=times, repeat=repeat)
参考