AlexNet：PyTorch 模型与 ONNX 模型推理速度对比


# -*- coding: utf-8 -*-
from tqdm import tqdm
import pandas as pd
import time
import torch
import torchvision
import numpy as np
import onnxruntime as ort
 
 
def save_onnx(path, device):
    """Export an untrained AlexNet to an ONNX file and return the PyTorch model.

    Args:
        path: Destination file for the exported ONNX graph.
        device: Device on which the model and the example input are placed.

    Returns:
        The AlexNet module, located on ``device`` and switched to evaluation
        mode so that it can be benchmarked as an inference model.
    """
    # Calling alexnet() without arguments builds the randomly initialised
    # network on both old (``pretrained=``) and new (``weights=``) torchvision
    # releases, avoiding the deprecated ``pretrained`` keyword.
    model = torchvision.models.alexnet().to(device)
    # AlexNet's classifier contains Dropout layers; without eval() the returned
    # model would be timed in training mode, which is not an inference
    # benchmark and is not comparable with the exported graph.
    model.eval()

    dummy_input = torch.randn(3, 3, 224, 224).to(device)
    input_names = ["actual_input_1"] + ["learned_%d" % i for i in range(16)]
    output_names = ["output1"]
    # Axis 0 (batch) is declared dynamic so the exported graph accepts any
    # batch size, not only the size of the example input.
    torch.onnx.export(model, dummy_input, path, verbose=False, input_names=input_names, output_names=output_names,
                      dynamic_axes={'actual_input_1': [0], 'output1': [0]})
    return model
 
 
def torch_t(model, batch_size, device, epochs=None):
    """Time repeated forward passes of a PyTorch model on random input.

    Args:
        model: Callable module that accepts a ``(batch_size, 3, 224, 224)`` tensor.
        batch_size: Number of samples in the random input batch.
        device: Device the input tensor is moved to.
        epochs: Number of timed forward passes. When ``None`` the module-level
            ``epoch`` value is used, which keeps existing three-argument
            callers working.

    Returns:
        Elapsed seconds for ``epochs`` forward passes, rounded to two decimals.
    """
    if epochs is None:
        epochs = epoch  # iteration count defined at module level by the script

    # Build the input before starting the clock so that random number
    # generation and the transfer to ``device`` are not counted as inference.
    input_ = torch.randn(batch_size, 3, 224, 224).to(device)

    def _wait_for_device():
        # CUDA kernels are launched asynchronously; without a synchronize the
        # timer would stop before the queued work has actually finished.
        if input_.is_cuda:
            torch.cuda.synchronize(input_.device)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        # One untimed pass so one-off initialisation does not skew the result.
        model(input_)
        _wait_for_device()

        start = time.perf_counter()
        for _ in range(epochs):
            model(input_)
        _wait_for_device()
        cost = time.perf_counter() - start
    return round(cost, 2)
 
 
def onnx_t_fun(path, batch_size, device, epochs=None):
    """Time repeated ONNX Runtime inference runs on random input.

    Args:
        path: Path of the ONNX model file to load.
        batch_size: Number of samples in the random input batch.
        device: ``'cpu'`` (string or torch device) selects the CPU execution
            provider; any other value selects the CUDA execution provider.
        epochs: Number of timed runs. When ``None`` the module-level ``epoch``
            value is used, which keeps existing three-argument callers working.

    Returns:
        Elapsed seconds for ``epochs`` runs, rounded to two decimals.
    """
    if epochs is None:
        epochs = epoch  # iteration count defined at module level by the script

    # Compare by string form so a torch.device('cpu') object is treated the
    # same as the plain string 'cpu'.
    if str(device) == 'cpu':
        providers = ['CPUExecutionProvider']
    else:
        providers = ['CUDAExecutionProvider']
    ort_session = ort.InferenceSession(path, providers=providers)

    # Build the input before starting the clock so that random number
    # generation and the float32 conversion are not counted as inference.
    input_ = np.random.randn(batch_size, 3, 224, 224).astype(np.float32)
    feed = {"actual_input_1": input_}
    # NOTE(review): the feed is a host numpy array, so with the CUDA provider
    # each run presumably includes a host-to-device copy; ONNX Runtime's
    # IOBinding may be needed for a like-for-like GPU comparison - confirm.

    # One untimed run so one-off session initialisation does not skew the result.
    ort_session.run(None, feed)

    start = time.perf_counter()
    for _ in range(epochs):
        ort_session.run(None, feed)
    cost = time.perf_counter() - start
    return round(cost, 2)
 
 
if __name__ == '__main__':
    # Benchmark AlexNet inference with PyTorch and ONNX Runtime for every
    # combination of device and batch size, then print the timings as a table.

    batch_size_list = [2 ** i for i in range(8)]
    # Only benchmark the GPU when one is present; referencing CUDA device 0
    # unconditionally fails on CPU-only machines.
    device_list = ['cpu']
    if torch.cuda.is_available():
        device_list.append(torch.device(0))
    tuples = [(device, batch_size) for device in device_list for batch_size in batch_size_list]
    index = pd.MultiIndex.from_tuples(tuples)
    # Number of timed iterations; read as a module-level global by the
    # benchmark functions.
    epoch = 100
    path = "alexnet.onnx"
    df = pd.DataFrame(columns=['torch', 'onnx'], index=index)
    for device in device_list:
        # The exported graph has a dynamic batch axis, so one export per device
        # is enough; it does not need to be repeated for every batch size.
        model = save_onnx(path, device)
        for batch_size in tqdm(batch_size_list, desc=str(device)):
            cost_torch = torch_t(model, batch_size, device)
            cost_onnx = onnx_t_fun(path, batch_size, device)
            df.loc[(device, batch_size), :] = [cost_torch, cost_onnx]
    print(df)

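运行结果（单位：秒，每项为 100 次前向推理的总耗时）：

device  batch_size  torch   onnx
cpu     1           1.2     0.89
cpu     2           2.47    1.15
cpu     4           2.86    1.68
cpu     8           3.67    2.7
cpu     16          5.93    4.72
cpu     32          9.47    8.85
cpu     64          17.33   17.26
cpu     128         32.77   34.27
cuda:0  1           0.07    0.42
cuda:0  2           0.08    0.65
cuda:0  4           0.12    0.6
cuda:0  8           0.2     0.27
cuda:0  16          0.24    0.4
cuda:0  32          0.38    0.89
cuda:0  64          0.69    1.72
cuda:0  128         1.26    3.12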

可以发现，在 CPU 上，batch_size 较小时 ONNX 比 PyTorch 快一些；随着 batch_size 增大，ONNX 的优势逐渐缩小，batch_size=64 时两者基本持平，batch_size=128 时 ONNX 反而略慢于 PyTorch。

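在 GPU 上，ONNX 反而比 PyTorch 慢，不确定这一结果是否合理。需要注意的是，得到上表数据时的测量方式可能对 ONNX 不利：PyTorch 一侧计时结束前没有调用 torch.cuda.synchronize()，而 CUDA 运算是异步执行的，GPU 耗时可能被低估；ONNX Runtime 一侧的输入是 numpy 数组，每次 run 可能都包含一次主机到 GPU 的数据拷贝，并且计时还包含了生成随机输入和首次运行的初始化开销。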

CPU 型号的查询命令为：cat /proc/cpuinfo | grep 'model name' | uniq

结果为 Intel(R) Core(TM) i7-9700K CPU @ 3.60GHz

相关阅读:
【数据分析】NumPy
产品周报第35期｜APP端学习门户上线、每日一练新增编程题型、专栏显示订阅来源……
专利申请需要多少钱
linux进阶56——systemd实现程序日志保存成文件
Go踩坑记录分享
单源最短路径问题（Java）
JVM基础08_强软弱虚引用
HCIA笔记-1 网络基础
揭秘编码器与解码器语言模型
Ruby教程_编程入门自学教程_菜鸟教程-免费教程分享

原文地址：https://blog.csdn.net/qq_22526061/article/details/126976847