- # -*- coding: utf-8 -*-
- from tqdm import tqdm
- import pandas as pd
- import time
- import torch
- import torchvision
- import numpy as np
- import onnxruntime as ort
-
-
def save_onnx(path, device):
    """Export a randomly initialised AlexNet to ONNX and return the model.

    Args:
        path: Destination file for the exported ONNX graph.
        device: Device the model and the tracing input are moved to.

    Returns:
        The torchvision AlexNet instance that was exported, on ``device`` and
        in evaluation mode.
    """
    # Batch of 3 dummy images used only to trace the graph; the batch axis is
    # declared dynamic below so other batch sizes can be fed at inference time.
    dummy_input = torch.randn(3, 3, 224, 224).to(device)
    # No weights are requested (the library default), which avoids the
    # deprecated ``pretrained=`` keyword. eval() puts the returned model in
    # inference mode so that later forward passes do not apply dropout.
    model = torchvision.models.alexnet().to(device).eval()
    # First name is the image input; the remaining 16 names are presumably for
    # AlexNet's weight/bias tensors -- TODO confirm against the exporter used.
    input_names = ["actual_input_1"] + ["learned_%d" % i for i in range(16)]
    output_names = ["output1"]
    torch.onnx.export(
        model,
        dummy_input,
        path,
        verbose=False,
        input_names=input_names,
        output_names=output_names,
        # Dict form names the dynamic axis explicitly instead of relying on
        # auto-generated axis names.
        dynamic_axes={
            'actual_input_1': {0: 'batch_size'},
            'output1': {0: 'batch_size'},
        },
    )
    return model
-
-
def torch_t(model, batch_size, device, n_iter=None):
    """Time repeated PyTorch forward passes on a random image batch.

    Args:
        model: Module called with a ``(batch_size, 3, 224, 224)`` float tensor.
            It is not moved by this function, so it must already be on
            ``device``.
        batch_size: Number of samples in the random input batch.
        device: Device the input is moved to (``'cpu'`` or a ``torch.device``).
        n_iter: Number of forward passes to time. When omitted, the
            module-level ``epoch`` value is used.

    Returns:
        Elapsed wall-clock seconds (input creation plus all forward passes),
        rounded to two decimals.
    """
    if n_iter is None:
        n_iter = epoch  # module-level setting assigned by the script entry point
    target = torch.device(device)
    on_cuda = target.type == 'cuda'
    # Benchmark inference: disable dropout for the duration of the measurement
    # and restore the caller's training flag afterwards.
    was_training = model.training
    model.eval()
    try:
        if on_cuda:
            # Drain previously queued GPU work so it is not billed to this run.
            torch.cuda.synchronize(target)
        start = time.perf_counter()
        input_ = torch.randn(batch_size, 3, 224, 224).to(device)
        # Gradients are not needed for timing inference.
        with torch.no_grad():
            for _ in range(n_iter):
                model(input_)
        if on_cuda:
            # CUDA kernels are launched asynchronously; wait for them to
            # finish, otherwise only the launch overhead would be measured.
            torch.cuda.synchronize(target)
        cost = time.perf_counter() - start
    finally:
        model.train(was_training)
    return round(cost, 2)
-
-
def onnx_t_fun(path, batch_size, device, n_iter=None):
    """Time repeated ONNX Runtime inference on a random image batch.

    Args:
        path: Path of the ONNX model to load.
        batch_size: Number of samples in the random input batch.
        device: ``'cpu'`` (string or ``torch.device``) selects the CPU
            execution provider; anything else selects the CUDA provider.
        n_iter: Number of ``run`` calls to time. When omitted, the
            module-level ``epoch`` value is used.

    Returns:
        Elapsed wall-clock seconds (input creation plus all runs), rounded to
        two decimals. Session creation is not included.
    """
    if n_iter is None:
        n_iter = epoch  # module-level setting assigned by the script entry point
    # Compare via str() so a torch.device('cpu') is treated like 'cpu'.
    if str(device) == 'cpu':
        providers = ['CPUExecutionProvider']
    else:
        # Run on the same GPU index as the given device when it carries one;
        # fall back to GPU 0 otherwise.
        gpu_index = getattr(device, 'index', None) or 0
        providers = [('CUDAExecutionProvider', {'device_id': gpu_index})]
    ort_session = ort.InferenceSession(path, providers=providers)
    start = time.perf_counter()
    # NOTE(review): the input is a host NumPy array, so with the CUDA provider
    # each run presumably includes host/device transfers -- confirm before
    # comparing against a PyTorch timing whose input already lives on the GPU.
    input_ = np.random.randn(batch_size, 3, 224, 224).astype(np.float32)
    feed = {"actual_input_1": input_}
    for _ in range(n_iter):
        ort_session.run(None, feed)
    cost = time.perf_counter() - start
    return round(cost, 2)
-
-
if __name__ == '__main__':
    # Benchmark grid: batch sizes 1, 2, 4, ..., 128 on every usable device.
    batch_size_list = [2 ** i for i in range(8)]
    device_list = ['cpu']
    if torch.cuda.is_available():
        # Only benchmark the first GPU when one is present; moving the model
        # to it would raise on a CPU-only machine.
        device_list.append(torch.device(0))
    tuples = [(device, batch_size) for device in device_list for batch_size in batch_size_list]
    index = pd.MultiIndex.from_tuples(tuples)
    # Number of timed iterations; read as a module-level global by the
    # timing functions.
    epoch = 100
    path = "alexnet.onnx"
    df = pd.DataFrame(columns=['torch', 'onnx'], index=index)
    with tqdm(total=len(tuples)) as progress:
        for device in device_list:
            # The export depends only on the device, so do it once per device
            # rather than once per (batch_size, device) pair.
            model = save_onnx(path, device)
            for batch_size in batch_size_list:
                cost_torch = torch_t(model, batch_size, device)
                cost_onnx = onnx_t_fun(path, batch_size, device)
                df.loc[(device, batch_size), :] = [cost_torch, cost_onnx]
                progress.update()
    print(df)
torch onnx
cpu 1 1.2 0.89
2 2.47 1.15
4 2.86 1.68
8 3.67 2.7
16 5.93 4.72
32 9.47 8.85
64 17.33 17.26
128 32.77 34.27
cuda:0 1 0.07 0.42
2 0.08 0.65
4 0.12 0.6
8 0.2 0.27
16 0.24 0.4
32 0.38 0.89
64 0.69 1.72
128 1.26 3.12
可以发现，在cpu上，batch_size较小时onnx比torch快一些；随着batch_size变大，onnx的优势逐渐缩小，到batch_size=128时反而比torch慢
gpu上onnx反而比torch慢，不知道是否合理（注意：上述结果测量时，torch一侧没有在计时前后调用torch.cuda.synchronize()，而CUDA调用是异步的，因此torch的gpu耗时可能被低估）
cpu查询命令为cat /proc/cpuinfo | grep 'model name' |uniq
结果为 Intel(R) Core(TM) i7-9700K CPU @ 3.60GHz