• Ai项目十四:基于 LeNet5 的手写数字识别及训练


    若该文为原创文章,转载请注明原文出处。

    一、介绍

    pytorch复现lenet5模型,并检测自己手写的数字图片。

    利用torch框架搭建模型相对比较简单,但是也会遇到很多问题。网上资料很多,搭建模型的方法大同小异。在我尝试自己搭建模型时,无论是训练还是检测都遇到了很多问题,这类自己遇到的问题,请教别人也没有用。原本使用的是GitHub上的一份代码来复现,环境搭建完成后才发现需要GPU,而我搭建时使用的是CPU,以失败告终。为了复现,租用了AutoDL平台再次搭建,这里记录GPU下的操作;CPU版本需要自行修改源码。我的目的是训练自己的模型并在RK3568上部署,所以先训练并测试好,为后续部署打基础。

    二、环境

    三、搭建

    1、创建虚拟环境

     conda create -n LeNet5_env python==3.8

    2、安装pytorch

    Previous PyTorch Versions | PyTorch

    根据PyTorch官方说明安装pytorch,这里使用的是GPU(CUDA 11.0)版本,其他版本自行安装,安装命令:

    1. pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html -i https://pypi.tuna.tsinghua.edu.cn/simple

    还需要安装一些其他的库

    1. pip install matplotlib -i https://pypi.tuna.tsinghua.edu.cn/simple
    2. pip install opencv-python -i https://pypi.tuna.tsinghua.edu.cn/simple

    3、数据集下载

    1. http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
    2. http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
    3. http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
    4. http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz

    直接把上面地址复制到浏览器地址栏,就可以下载。

    下载后保存到data/MNIST/raw目录下

    四、训练代码

    训练模型有四个文件分别为:LeNet5.py;myDatast.py;readMnist.py;train.py

    文件LeNet5.py是网络层模型

    train.py

    1. import torch
    2. from torch.autograd import Variable
    3. import torch.nn as nn
    4. from torch.utils.data import DataLoader
    5. from readMnist import *
    6. from myDatast import Mnist
    7. from LeNet5 import LeNet5
    8. train_images = load_train_images()
    9. train_labels = load_train_labels()
    10. trainData = Mnist(train_images, train_labels)
    11. train_data = DataLoader(dataset=trainData, batch_size=1, shuffle=True)
    12. lenet5 = LeNet5()
    13. lenet5.cuda()
    14. lossFun = nn.CrossEntropyLoss()
    15. optimizer = torch.optim.Adam(params=lenet5.parameters(), lr=1e-4)
    16. Epochs = 100
    17. L = len(train_data)
    18. for epoch in range(Epochs):
    19. for i, (img, id) in enumerate(train_data):
    20. img = img.float()
    21. id = id.float()
    22. img = img.cuda()
    23. id = id.cuda()
    24. img = Variable(img, requires_grad=True)
    25. id = Variable(id, requires_grad=True)
    26. Output = lenet5.forward(img)
    27. loss = lossFun(Output, id.long())
    28. optimizer.zero_grad()
    29. loss.backward()
    30. optimizer.step()
    31. iter = epoch * L + i + 1
    32. if iter % 100 == 0:
    33. print('epoch:{},iter:{},loss:{:.6f}'.format(epoch + 1, iter, loss))
    34. torch.save(lenet5.state_dict(), 'lenet5.pth')

     LeNet5.py

    1. import torch.nn as nn
    2. class LeNet5(nn.Module):
    3. def __init__(self):
    4. super(LeNet5, self).__init__()
    5. self.conv1 = nn.Sequential(
    6. nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    7. nn.Sigmoid(),
    8. nn.MaxPool2d(kernel_size=2, stride=2)
    9. )
    10. self.conv2 = nn.Sequential(
    11. nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    12. nn.Sigmoid(),
    13. nn.MaxPool2d(kernel_size=2, stride=2)
    14. )
    15. self.fc1 = nn.Sequential(
    16. nn.Linear(in_features=16 * 4 * 4, out_features=120),
    17. nn.Sigmoid()
    18. )
    19. self.fc2 = nn.Sequential(
    20. nn.Linear(in_features=120, out_features=84),
    21. nn.Sigmoid()
    22. )
    23. self.fc3 = nn.Linear(in_features=84, out_features=10)
    24. def forward(self, img):
    25. img = self.conv1.forward(img)
    26. img = self.conv2.forward(img)
    27. img = img.view(img.size()[0], -1)
    28. img = self.fc1.forward(img)
    29. img = self.fc2.forward(img)
    30. img = self.fc3.forward(img)
    31. return img

     myDatast.py

    1. from torch.utils.data import Dataset
    2. from torchvision import transforms
    3. import numpy as np
    4. class Mnist(Dataset):
    5. def __init__(self, dataset, label):
    6. self.dataset = dataset
    7. self.label = label
    8. self.len = len(self.label)
    9. self.transforms = transforms.Compose([transforms.ToTensor() , transforms.Normalize(mean=[0.5], std=[0.5])])
    10. def __len__(self):
    11. return self.len
    12. def __getitem__(self, item):
    13. img = self.dataset[item]
    14. img_id = self.label[item]
    15. img = np.transpose(img,(1,2,0))
    16. img = self.transforms(img)
    17. return img, img_id

    readMnist.py

    1. import numpy as np
    2. import struct
    3. import matplotlib.pyplot as plt
    4. import cv2
    5. fpath = 'G:/enpei_Project_Code/21_LeNet5/LeNet5-master/myLeNet5/data/MNIST/raw/'
    6. # 训练集文件
    7. train_images_idx3_ubyte_file = fpath + 'train-images-idx3-ubyte'
    8. # 训练集标签文件
    9. train_labels_idx1_ubyte_file = fpath + 'train-labels-idx1-ubyte'
    10. # 测试集文件
    11. test_images_idx3_ubyte_file = fpath + 't10k-images-idx3-ubyte'
    12. # 测试集标签文件
    13. test_labels_idx1_ubyte_file = fpath + 't10k-labels-idx1-ubyte'
    14. def decode_idx3_ubyte(idx3_ubyte_file):
    15. """
    16. 解析idx3文件的通用函数
    17. :param idx3_ubyte_file: idx3文件路径
    18. :return: 数据集
    19. """
    20. # 读取二进制数据
    21. bin_data = open(idx3_ubyte_file, 'rb').read()
    22. # 解析文件头信息,依次为魔数、图片数量、每张图片高、每张图片宽
    23. offset = 0
    24. fmt_header = '>iiii' # 因为数据结构中前4行的数据类型都是32位整型,所以采用i格式,但我们需要读取前4行数据,所以需要4个i。我们后面会看到标签集中,只使用2个ii。
    25. magic_number, num_images, num_rows, num_cols = struct.unpack_from(fmt_header, bin_data, offset)
    26. print('魔数:%d, 图片数量: %d张, 图片大小: %d*%d' % (magic_number, num_images, num_rows, num_cols))
    27. # 解析数据集
    28. image_size = num_rows * num_cols
    29. offset += struct.calcsize(fmt_header) # 获得数据在缓存中的指针位置,从前面介绍的数据结构可以看出,读取了前4行之后,指针位置(即偏移位置offset)指向0016
    30. print(offset)
    31. fmt_image = '>' + str(
    32. image_size) + 'B' # 图像数据像素值的类型为unsigned char型,对应的format格式为B。这里还有加上图像大小784,是为了读取784个B格式数据,如果没有则只会读取一个值(即一副图像中的一个像素值)
    33. print(fmt_image, offset, struct.calcsize(fmt_image))
    34. images = np.empty((num_images, 1, num_rows, num_cols))
    35. # plt.figure()
    36. for i in range(num_images):
    37. if (i + 1) % 10000 == 0:
    38. print('已解析 %d' % (i + 1) + '张')
    39. print(offset)
    40. images[i] = np.array(struct.unpack_from(fmt_image, bin_data, offset)).reshape((1, num_rows, num_cols))
    41. # print(images[i])
    42. offset += struct.calcsize(fmt_image)
    43. # plt.imshow(images[i],'gray')
    44. # plt.pause(0.00001)
    45. # plt.show()
    46. # plt.show()
    47. return images
    48. def decode_idx1_ubyte(idx1_ubyte_file):
    49. """
    50. 解析idx1文件的通用函数
    51. :param idx1_ubyte_file: idx1文件路径
    52. :return: 数据集
    53. """
    54. # 读取二进制数据
    55. bin_data = open(idx1_ubyte_file, 'rb').read()
    56. # 解析文件头信息,依次为魔数和标签数
    57. offset = 0
    58. fmt_header = '>ii'
    59. magic_number, num_images = struct.unpack_from(fmt_header, bin_data, offset)
    60. print('魔数:%d, 图片数量: %d张' % (magic_number, num_images))
    61. # 解析数据集
    62. offset += struct.calcsize(fmt_header)
    63. fmt_image = '>B'
    64. labels = np.empty(num_images)
    65. for i in range(num_images):
    66. if (i + 1) % 10000 == 0:
    67. print('已解析 %d' % (i + 1) + '张')
    68. labels[i] = struct.unpack_from(fmt_image, bin_data, offset)[0]
    69. offset += struct.calcsize(fmt_image)
    70. return labels
    71. def load_train_images(idx_ubyte_file=train_images_idx3_ubyte_file):
    72. """
    73. TRAINING SET IMAGE FILE (train-images-idx3-ubyte):
    74. [offset] [type] [value] [description]
    75. 0000 32 bit integer 0x00000803(2051) magic number
    76. 0004 32 bit integer 60000 number of images
    77. 0008 32 bit integer 28 number of rows
    78. 0012 32 bit integer 28 number of columns
    79. 0016 unsigned byte ?? pixel
    80. 0017 unsigned byte ?? pixel
    81. ........
    82. xxxx unsigned byte ?? pixel
    83. Pixels are organized row-wise. Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).
    84. :param idx_ubyte_file: idx文件路径
    85. :return: n*row*col维np.array对象,n为图片数量
    86. """
    87. return decode_idx3_ubyte(idx_ubyte_file)
    88. def load_train_labels(idx_ubyte_file=train_labels_idx1_ubyte_file):
    89. """
    90. TRAINING SET LABEL FILE (train-labels-idx1-ubyte):
    91. [offset] [type] [value] [description]
    92. 0000 32 bit integer 0x00000801(2049) magic number (MSB first)
    93. 0004 32 bit integer 60000 number of items
    94. 0008 unsigned byte ?? label
    95. 0009 unsigned byte ?? label
    96. ........
    97. xxxx unsigned byte ?? label
    98. The labels values are 0 to 9.
    99. :param idx_ubyte_file: idx文件路径
    100. :return: n*1维np.array对象,n为图片数量
    101. """
    102. return decode_idx1_ubyte(idx_ubyte_file)
    103. def load_test_images(idx_ubyte_file=test_images_idx3_ubyte_file):
    104. """
    105. TEST SET IMAGE FILE (t10k-images-idx3-ubyte):
    106. [offset] [type] [value] [description]
    107. 0000 32 bit integer 0x00000803(2051) magic number
    108. 0004 32 bit integer 10000 number of images
    109. 0008 32 bit integer 28 number of rows
    110. 0012 32 bit integer 28 number of columns
    111. 0016 unsigned byte ?? pixel
    112. 0017 unsigned byte ?? pixel
    113. ........
    114. xxxx unsigned byte ?? pixel
    115. Pixels are organized row-wise. Pixel values are 0 to 255. 0 means background (white), 255 means foreground (black).
    116. :param idx_ubyte_file: idx文件路径
    117. :return: n*row*col维np.array对象,n为图片数量
    118. """
    119. return decode_idx3_ubyte(idx_ubyte_file)
    120. def load_test_labels(idx_ubyte_file=test_labels_idx1_ubyte_file):
    121. """
    122. TEST SET LABEL FILE (t10k-labels-idx1-ubyte):
    123. [offset] [type] [value] [description]
    124. 0000 32 bit integer 0x00000801(2049) magic number (MSB first)
    125. 0004 32 bit integer 10000 number of items
    126. 0008 unsigned byte ?? label
    127. 0009 unsigned byte ?? label
    128. ........
    129. xxxx unsigned byte ?? label
    130. The labels values are 0 to 9.
    131. :param idx_ubyte_file: idx文件路径
    132. :return: n*1维np.array对象,n为图片数量
    133. """
    134. return decode_idx1_ubyte(idx_ubyte_file)
    135. if __name__ == '__main__':
    136. train_images = load_train_images()
    137. train_labels = load_train_labels()
    138. test_images = load_test_images()
    139. test_labels = load_test_labels()
    140. pass
    141. # 查看前十个数据及其标签以读取是否正确
    142. for i in range(10):
    143. print(train_labels[i])
    144. img = train_images[i]
    145. img = np.transpose(img, (1, 2, 0))
    146. cv2.namedWindow('img')
    147. cv2.imshow('img', img)
    148. cv2.waitKey(100)
    149. print('done')

    上面代码需要注意的是数据集的路径,需要修改成对应的路径。

    运行python train.py

    训练大概5小时

    五、测试

    1. from LeNet5 import LeNet5
    2. import torch
    3. from readMnist import *
    4. from myDatast import Mnist
    5. from torch.utils.data import DataLoader
    6. import numpy as np
    7. import cv2
    8. test_images = load_test_images()
    9. test_labels = load_test_labels()
    10. testData = Mnist(test_images, test_labels)
    11. test_data = DataLoader(dataset=testData, batch_size=1, shuffle=True)
    12. lenet5 = LeNet5()
    13. lenet5.load_state_dict(torch.load('lenet5.pth'))
    14. lenet5.eval()
    15. showimg = True
    16. js = 0
    17. for i, (img, id) in enumerate(test_data):
    18. img = img.float()
    19. outid = lenet5(img)
    20. oid = torch.argmax(outid)
    21. if oid == id:
    22. js = js + 1
    23. if showimg == True:
    24. img = img.numpy()
    25. img = np.squeeze(img)
    26. id = id.numpy()
    27. id = np.squeeze(id)
    28. id = np.int32(id)
    29. oid = oid.numpy()
    30. oid = np.squeeze(oid)
    31. maxv = np.max(img)
    32. minv = np.min(img)
    33. img = (img - minv) / (maxv - minv)
    34. cv2.namedWindow("img", 0)
    35. cv2.imshow("img", img)
    36. title = "img, predicted value:{},truth value:{}".format(oid, id)
    37. cv2.setWindowTitle("img",title)
    38. cv2.waitKey(1)
    39. print('准确率:{:.6f}'.format(js / (i + 1)))

    测试结果准确率达到0.986,基本达到要求。

    如有侵权,或需要完整代码,请及时联系博主。

  • 相关阅读:
    如何获取第三方maven依赖信息?
    云计算基础知识
    [vue] 嵌套iframe,$router.go(-1)后退bug
    iNFTnews | 元宇宙浪潮下,企业正通过AR和VR技术改善客户体验
    The Log-Structured Merge-Tree (LSM-Tree) 论文阅读笔记
    flutter 初视回味
    在c++项目中你必须真正使用的15个c++特性
    IDEA运行main方法,为什么要编译整个工程?
    Spring Data Jpa 多数据源配置(mysql 和 orcle)过程
    Linux 手动卸载jdk
  • 原文地址:https://blog.csdn.net/weixin_38807927/article/details/133514669