What I did this time is knowledge distillation, but I noticed that the datasets other people used were not CIFAR10, which made my hands itch, so I ran my own experiment on the CIFAR10 dataset. The result is... okay, and you can genuinely see that the teacher network does have a guiding effect.
Here I recommend heading over to Bilibili and watching 同济子豪兄's explanation of knowledge distillation; it is quite detailed.
Without further ado, let's get straight to it:
1. First, define the teacher network. I set its hidden layer to 1200 units, mainly to highlight the huge gap in parameter count between the teacher network and the student network.
Note: when running my code you may hit an error related to the if __name__ == "__main__": guard; you can look that up online, so I won't go into detail here. Because I needed to import the teacher network into the distillation script, I commented out the if __name__ == "__main__" statement in the teacher file and removed the corresponding indentation, so running the teacher file directly may possibly throw an error...
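For reference, the cleaner pattern is to keep the guard and hide only the run-it code behind it, so that importing the file pulls in definitions without starting a training run. A minimal self-contained sketch (not the exact layout my files use):

# sketch: the __main__ guard separates importable definitions from run-on-execute code
def train():
    print("training...")  # stands in for a real training loop

if __name__ == "__main__":
    train()  # runs when this file is executed directly, but not on `import`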
# -*- coding:utf-8 -*-
# @Time : 2022-06-27 13:43
# @Author : DaFuChen
# @File : teacher_moudle.py
# @software: PyCharm


# if __name__ == "__main__":

import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torchvision.transforms import ToTensor
from tqdm import tqdm  # tqdm is a common Python module that renders a progress bar in the terminal, making training progress visible

file_name_path = "test_teacher" + ".txt"
open(file_name_path, "w").close()  # truncate the log file at the start of a run


"""
Part 1
Load the datasets
"""

# set a random seed so the experiment can be reproduced
torch.manual_seed(9)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# let cuDNN benchmark and pick the fastest convolution algorithms
torch.backends.cudnn.benchmark = True

# load the training set
trainset = torchvision.datasets.CIFAR10(
    root='./datac10',
    train=True,
    transform=ToTensor(),  # convert PIL Images into tensors PyTorch can process
    download=True
)

# load the test set
testset = torchvision.datasets.CIFAR10(
    root='./datac10',
    train=False,
    transform=transforms.ToTensor(),
    download=True
)

trainloader = torch.utils.data.DataLoader(
    dataset=trainset,
    batch_size=32,
    shuffle=True
)

testloader = torch.utils.data.DataLoader(
    dataset=testset,
    batch_size=32,
    shuffle=False
)

- """
- 第二部分
- 定义teacher模型
- """
-
-
- class teacherMoudle(nn.Module):
- def __init__(self, in_channels=1, num_class=10):
- super(teacherMoudle, self).__init__()
-
- # 定义卷积
- self.conv1 = nn.Conv2d(3, 6, 5)
- self.conv2 = nn.Conv2d(6, 16, 5)
-
- # 定义池化层
- self.pool = nn.MaxPool2d(2, 2)
-
- # 定义激活函数选择relu()
- self.relu = nn.ReLU()
-
- # 定义droput层
- self.droput = nn.Dropout(p=0.5)
-
- # 定义全连接层
- self.fc1 = nn.Linear(16 * 5 * 5, 1200)
- # 这里需要注意的是教师的模型的隐藏层的参数很对,有1200个
- self.fc2 = nn.Linear(1200, 600)
- self.fc3 = nn.Linear(600, num_class)
-
- def forward(self, x):
- x = self.pool(self.relu(self.conv1(x)))
- x = self.pool(self.relu(self.conv2(x)))
-
- x = x.view(-1, 16 * 5 * 5)
- x = self.fc1(x)
- x = self.droput(x)
- x = self.relu(x)
-
- x = self.fc2(x)
- x = self.droput(x)
- x = self.relu(x)
-
- x = self.fc3(x)
-
- return x


"""
Part 3
Train the teacher model
"""
model = teacherMoudle()
model = model.to(device)

# define the loss function and the optimizer
crossLoss = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

epochs = 6
for epoch in range(epochs):
    model.train()
    # model.train() enables Batch Normalization and Dropout: with BN layers it
    # makes them use each batch's own mean and variance during training, and
    # with Dropout it randomly drops a subset of connections while the
    # parameters are updated

    for image, labels in tqdm(trainloader):
        image = image.to(device)
        labels = labels.to(device)
        running_loss = 0.0  # reset every batch, so the value logged below is the last batch's loss

        # forward pass
        pred = model(image)
        loss = crossLoss(pred, labels)
        running_loss += loss.item()

        # steps:
        # 1. zero the optimizer's gradients
        # 2. backpropagate the loss
        # 3. let the optimizer update the parameters

        # backward pass
        optimizer.zero_grad()  # gradients must be cleared before every backward pass

        # important: backpropagate through loss to update the parameters;
        # without this the model is never actually optimized
        loss.backward()

        optimizer.step()  # apply the parameter update

    """
    Part 4
    Evaluate the model
    """
    # in eval mode, layers added to help training, such as BatchNorm and
    # Dropout, are switched off so the evaluation is not distorted
    model.eval()
    num_correct = 0
    num_samples = 0

    with torch.no_grad():
        for image, labels in testloader:
            image = image.to(device)
            labels = labels.to(device)

            pred = model(image)

            predictions = pred.max(1).indices  # the index of the largest logit is the predicted class
            num_correct += (predictions == labels).sum()
            num_samples += predictions.size(0)
    acc = (num_correct / num_samples).item()

    model.train()

    print('Epoch:{}\t Accuracy:{:.5f}'.format(epoch + 1, acc))
    print('loss : ' + str(running_loss))

    with open("./test_teacher.txt", "a") as f:
        f.write(str(epoch + 1) + " " + str(acc) + " " + str(running_loss) + '\n')

The training results are (each line: epoch, test accuracy, last-batch loss):
1 0.49399998784065247 1.1463124752044678
2 0.5374000072479248 1.0659717321395874
3 0.5343999862670898 1.388565182685852
4 0.5910999774932861 1.176950454711914
5 0.6060000061988831 1.2053321599960327
6 0.6089999675750732 1.2049661874771118
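Each line appended to test_teacher.txt holds exactly those three fields, so the log can be read back later. A minimal sketch, assuming a finished run has produced the file:

# sketch: read back the epoch / accuracy / loss log written above
with open("test_teacher.txt") as f:
    for line in f:
        epoch, acc, loss = line.split()
        print(f"epoch {epoch}: acc={float(acc):.4f}, last-batch loss={float(loss):.4f}")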
Next comes the student network. Its hidden layer has far fewer neurons than the teacher network's, and sure enough its learning results are indeed worse than the teacher's...
# -*- coding:utf-8 -*-
# @Time : 2022-06-27 13:44
# @Author : DaFuChen
# @File : student_moudle.py
# @software: PyCharm
import torchvision.datasets

if __name__ == "__main__":

    import torch
    import torch.nn as nn
    from tqdm import tqdm
    from torchvision.transforms import ToTensor


    file_name_path = "test_student" + ".txt"
    open(file_name_path, "w").close()  # truncate the log file at the start of a run

    torch.manual_seed(8)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.backends.cudnn.benchmark = True

    train_set = torchvision.datasets.CIFAR10(
        root="./datac10",
        train=True,
        transform=ToTensor(),
        download=True
    )

    test_set = torchvision.datasets.CIFAR10(
        root="./datac10",
        train=False,
        transform=ToTensor(),
        download=True
    )

    train_loader = torch.utils.data.DataLoader(
        dataset=train_set,
        batch_size=32,
        shuffle=True
    )

    test_loader = torch.utils.data.DataLoader(
        dataset=test_set,
        batch_size=32,
        shuffle=True
    )


    class student_moudle(nn.Module):
        def __init__(self, in_channels=3, num_class=10):
            super(student_moudle, self).__init__()

            self.conv1 = nn.Conv2d(in_channels, 6, 5)
            self.conv2 = nn.Conv2d(6, 16, 5)

            self.pool = nn.MaxPool2d(2, 2)
            self.relu = nn.ReLU()
            self.dropout = nn.Dropout(p=0.5)

            # fully connected layers
            self.fc1 = nn.Linear(16 * 5 * 5, 64)
            # the student's hidden layer has only 64 units, far fewer than the teacher's 1200
            self.fc2 = nn.Linear(64, 32)
            self.fc3 = nn.Linear(32, num_class)

        def forward(self, x):
            x = self.pool(self.relu(self.conv1(x)))
            x = self.pool(self.relu(self.conv2(x)))

            x = x.view(-1, 16 * 5 * 5)
            x = self.fc1(x)
            x = self.dropout(x)
            x = self.relu(x)

            x = self.fc2(x)
            x = self.dropout(x)
            x = self.relu(x)

            x = self.fc3(x)

            return x

    model = student_moudle()
    model = model.to(device)

    crossLoss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

    epochs = 6
    for epoch in range(epochs):
        model.train()

        for image, labels in tqdm(train_loader):
            image = image.to(device)
            labels = labels.to(device)
            running_loss = 0.0  # reset every batch, so the logged value is the last batch's loss

            pred = model(image)
            loss = crossLoss(pred, labels)
            running_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        model.eval()
        num_correct = 0
        num_samples = 0

        with torch.no_grad():
            for image, labels in test_loader:
                image = image.to(device)
                labels = labels.to(device)

                pred = model(image)

                predictions = pred.max(1).indices
                num_correct += (predictions == labels).sum()
                num_samples += predictions.size(0)

        acc = (num_correct / num_samples).item()
        model.train()

        print('Epoch:{}\t Accuracy:{:.5f}'.format(epoch + 1, acc))
        print('loss : ' + str(running_loss))

        with open("./test_student.txt", "a") as f:
            f.write(str(epoch + 1) + " " + str(acc) + " " + str(running_loss) + '\n')

The training results are as follows:
1 0.3483999967575073 1.81593656539917
2 0.4001999795436859 1.9031776189804077
3 0.43629997968673706 1.633713960647583
4 0.4610999822616577 1.5931921005249023
5 0.4650999903678894 1.601379632949829
6 0.47529998421669006 1.547680377960205
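To make the size gap concrete: by my count, the teacher above has about 1.21 million trainable parameters (481,200 of them just in fc1's weights and biases), while the student has only about 31 thousand. A minimal sketch to check this, assuming the teacherMoudle and student_moudle classes defined above are in scope:

# sketch: compare the trainable-parameter counts of the two networks
def count_params(m):
    return sum(p.numel() for p in m.parameters() if p.requires_grad)

print("teacher:", count_params(teacherMoudle()))   # ~1,210,682 parameters
print("student:", count_params(student_moudle()))  # ~30,946 parameters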
Next comes the distillation network:
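Before the full script, here is a minimal sketch of the loss the distillation loop below computes: a weighted sum of the ordinary hard-label cross-entropy and a soft loss that compares the student's and the teacher's temperature-softened output distributions. Two things to note: nn.KLDivLoss expects log-probabilities as its input and plain probabilities as its target, and Hinton's original formulation additionally scales the soft term by T² (which my code below does not do):

import torch.nn as nn
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=7.0, alpha=0.3):
    # hard loss: plain cross-entropy against the ground-truth labels
    hard = nn.CrossEntropyLoss()(student_logits, labels)
    # soft loss: KL divergence between the temperature-softened distributions
    soft = nn.KLDivLoss(reduction="batchmean")(
        F.log_softmax(student_logits / T, dim=1),  # input must be log-probabilities
        F.softmax(teacher_logits / T, dim=1),      # target is plain probabilities
    )
    return alpha * hard + (1 - alpha) * soft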
# -*- coding:utf-8 -*-
# @Time : 2022-06-30 18:49
# @Author : DaFuChen
# @File : distillation.py
# @software: PyCharm

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision.transforms import ToTensor
from tqdm import tqdm


# the reason the student cannot be imported here: student_moudle.py keeps its
# definitions inside an if __name__ == '__main__' guard, so the indented code
# is not importable
# import student_moudle
import teacher_moudle


if __name__ == "__main__":

    file_name_path = "distillation" + ".txt"
    open(file_name_path, "w").close()  # truncate the log file at the start of a run


    torch.manual_seed(8)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.backends.cudnn.benchmark = True

    train_set = torchvision.datasets.CIFAR10(
        root="./datac10",
        train=True,
        transform=ToTensor(),
        download=True
    )

    test_set = torchvision.datasets.CIFAR10(
        root="./datac10",
        train=False,
        transform=ToTensor(),
        download=True
    )

    train_loader = torch.utils.data.DataLoader(
        dataset=train_set,
        batch_size=32,
        shuffle=True
    )

    test_loader = torch.utils.data.DataLoader(
        dataset=test_set,
        batch_size=32,
        shuffle=True
    )

    # define the student model here: if it lived in another .py file, merely
    # constructing student_model would trigger that file's training run on
    # import (which is exactly what happens when the teacher is imported above)
    class student_moudle(nn.Module):
        def __init__(self, in_channels=3, num_class=10):
            super(student_moudle, self).__init__()

            self.conv1 = nn.Conv2d(in_channels, 6, 5)
            self.conv2 = nn.Conv2d(6, 16, 5)

            self.pool = nn.MaxPool2d(2, 2)
            self.relu = nn.ReLU()
            self.dropout = nn.Dropout(p=0.5)

            # fully connected layers
            self.fc1 = nn.Linear(16 * 5 * 5, 64)
            # the student's hidden layer has only 64 units, far fewer than the teacher's 1200
            self.fc2 = nn.Linear(64, 32)
            self.fc3 = nn.Linear(32, num_class)

        def forward(self, x):
            x = self.pool(self.relu(self.conv1(x)))
            x = self.pool(self.relu(self.conv2(x)))

            x = x.view(-1, 16 * 5 * 5)
            x = self.fc1(x)
            x = self.dropout(x)
            x = self.relu(x)

            x = self.fc2(x)
            x = self.dropout(x)
            x = self.relu(x)

            x = self.fc3(x)

            return x


    # the teacher architecture again, mirroring the definition in teacher_moudle.py
    class teacherMoudle(nn.Module):
        def __init__(self, in_channels=3, num_class=10):
            super(teacherMoudle, self).__init__()

            # convolution layers
            self.conv1 = nn.Conv2d(in_channels, 6, 5)
            self.conv2 = nn.Conv2d(6, 16, 5)

            # pooling layer
            self.pool = nn.MaxPool2d(2, 2)

            # activation function: ReLU
            self.relu = nn.ReLU()

            # dropout layer
            self.dropout = nn.Dropout(p=0.5)

            # fully connected layers
            self.fc1 = nn.Linear(16 * 5 * 5, 1200)
            # note that the teacher's hidden layer is large: 1200 units
            self.fc2 = nn.Linear(1200, 600)
            self.fc3 = nn.Linear(600, num_class)

        def forward(self, x):
            x = self.pool(self.relu(self.conv1(x)))
            x = self.pool(self.relu(self.conv2(x)))

            x = x.view(-1, 16 * 5 * 5)
            x = self.fc1(x)
            x = self.dropout(x)
            x = self.relu(x)

            x = self.fc2(x)
            x = self.dropout(x)
            x = self.relu(x)

            x = self.fc3(x)

            return x

    # prepare the teacher model that has already been trained: importing
    # teacher_moudle above ran its training loop, so teacher_moudle.model
    # already holds trained weights
    teacher_moudle.model.eval()

    # prepare a fresh student model
    # model = student_moudle.student_moudle()
    model = student_moudle()
    model = model.to(device)

    # the images below are moved to the GPU, so the trained teacher must be
    # placed on the same device before it can be called to predict them
    teachermoudle = teacher_moudle.model.to(device)

    model.train()

    # set the distillation temperature; a higher temperature flattens the
    # teacher's softmax and exposes the relative probabilities it assigns to
    # the wrong classes (T = 1 recovers the ordinary softmax)
    temp = 7

    # hard-label loss: still the ordinary cross-entropy
    hard_loss = nn.CrossEntropyLoss()

    # weight of the hard loss (the soft loss gets 1 - alpha)
    alpha = 0.3

    # soft loss between the softened student and teacher distributions
    soft_loss = nn.KLDivLoss(reduction='batchmean')

    # optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    epochs = 6
    for epoch in range(epochs):
        # train the student's weights on the training set
        for image, labels in tqdm(train_loader):
            running_loss = 0  # reset every batch, so the logged value is the last batch's loss
            image = image.to(device)
            labels = labels.to(device)

            # the teacher only supplies targets, so no gradients are needed for it
            with torch.no_grad():
                teacher_preds = teachermoudle(image)

            # the student model's predictions
            student_preds = model(image)

            # compute the hard loss
            student_loss = hard_loss(student_preds, labels)

            # compute the soft loss on the temperature-softened predictions
            # (KLDivLoss expects log-probabilities as input and probabilities as target)
            distillation_loss = soft_loss(
                F.log_softmax(student_preds / temp, dim=1),
                F.softmax(teacher_preds / temp, dim=1)
            )

            # weighted sum of hard loss and soft loss
            loss = alpha * student_loss + (1 - alpha) * distillation_loss
            running_loss += loss.item()

            # backward pass, update the weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # evaluate the model on the test set
        model.eval()
        num_correct = 0
        num_samples = 0

        with torch.no_grad():
            for image2, labels2 in test_loader:
                image2 = image2.to(device)
                labels2 = labels2.to(device)

                preds = model(image2)
                predictions = preds.max(1).indices
                num_correct += (predictions == labels2).sum()
                num_samples += predictions.size(0)

        acc = (num_correct / num_samples).item()
        model.train()

        print('Epoch:{}\t Accuracy:{:.5f}'.format(epoch + 1, acc))
        print('loss : ' + str(running_loss))

        with open("./distillation.txt", "a") as f:
            f.write(str(epoch + 1) + " " + str(acc) + " " + str(running_loss) + '\n')
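As an aside, since the temperature is the one distillation-specific knob here, this tiny sketch (with made-up logits, unrelated to either network) shows what dividing by T = 7 does to a softmax distribution:

import torch
import torch.nn.functional as F

logits = torch.tensor([4.0, 1.0, 0.2])  # made-up example logits
print(F.softmax(logits, dim=0))         # sharp:        ~[0.93, 0.05, 0.02]
print(F.softmax(logits / 7, dim=0))     # T=7 flattens: ~[0.45, 0.29, 0.26]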
The training results are as follows:
1 0.3822000026702881 -1.1664808988571167
2 0.41040000319480896 -1.1804652214050293
3 0.46219998598098755 -1.2277178764343262
4 0.4693000018596649 -1.302511215209961
5 0.4846999943256378 -1.2454404830932617
6 0.4932999908924103 -1.27120840549469
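Comparing the three runs at their final epoch: the plain student reached 0.4753, the distilled student reached 0.4933, and the teacher reached 0.6090, so the teacher's soft targets really do pull the student up a bit. One last structural note: the import headaches above disappear if the teacher's weights are exchanged through a checkpoint file instead of importing the training module. A minimal sketch, assuming you add a save at the end of the teacher's training (the filename teacher.pth is my own placeholder, not from the code above):

# at the end of teacher_moudle.py: save the trained weights (hypothetical addition)
torch.save(model.state_dict(), "teacher.pth")

# in distillation.py: rebuild the architecture and load the weights,
# instead of letting `import teacher_moudle` run a whole training session
teacher = teacherMoudle().to(device)
teacher.load_state_dict(torch.load("teacher.pth", map_location=device))
teacher.eval()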
I'm still only a sophomore and don't understand deep learning very deeply yet. If anything above is wrong, corrections are welcome; please don't flame me. I posted this blog purely so we can all learn something new together.