In the last post, where we built neural networks with BN layers, we noticed a phenomenon: adding BN did not make training iterations any smoother; if anything, the instability became more pronounced. This kind of instability is usually tied to the learning rate, so one way to address it is to tune the learning rate. Let's first look at the relationship between the learning rate and training instability by comparing how the models' error evolves at learning rates of 0.1, 0.03, 0.01 and 0.005:
- import torch
- from torch import nn, optim
- import matplotlib.pyplot as plt
-
- # MyTorchUtils, net_class3 and net_class4 are the helper class and the 3-/4-hidden-layer
- # network definitions built in the previous posts of this series.
- def test_lr():
-     utils = MyTorchUtils()
-     torch.manual_seed(929)
-
-     # create data
-     feature, labels = utils.tensorDataGenRe(bag=2, w=[2, -1, 3, 1, 2], bias=False)
-
-     # split data
-     train_loader, test_loader = utils.split_loader(feature, labels, batch_size=50)
-
-     # four models: 3- and 4-hidden-layer tanh networks, with and without BN
-     tanh_model3 = net_class3(act_fun=torch.tanh, in_features=5)
-     tanh_model3_norml = net_class3(act_fun=torch.tanh, in_features=5, BN_model='pre')
-     tanh_model4 = net_class4(act_fun=torch.tanh, in_features=5)
-     tanh_model4_norml = net_class4(act_fun=torch.tanh, in_features=5, BN_model='pre')
-
-     model_1 = [tanh_model3, tanh_model3_norml, tanh_model4, tanh_model4_norml]
-     name_1 = ['tanh_model3', 'tanh_model3_norml', 'tanh_model4', 'tanh_model4_norml']
-
-     num_epochs = 40
-
-     # run 1: lr = 0.1
-     train_01, test_01 = utils.model_comparison(model_1=model_1, name_1=name_1,
-                                                train_data=train_loader,
-                                                test_data=test_loader,
-                                                num_epochs=num_epochs,
-                                                optimizer=optim.SGD,
-                                                criterion=nn.MSELoss(),
-                                                lr=0.1, cla=False,
-                                                eva=utils.mse_cla)
-
-     # run 2: lr = 0.03 (re-create brand-new models before each run)
-     tanh_model3 = net_class3(act_fun=torch.tanh, in_features=5)
-     tanh_model3_norml = net_class3(act_fun=torch.tanh, in_features=5, BN_model='pre')
-     tanh_model4 = net_class4(act_fun=torch.tanh, in_features=5)
-     tanh_model4_norml = net_class4(act_fun=torch.tanh, in_features=5, BN_model='pre')
-
-     model_1 = [tanh_model3, tanh_model3_norml, tanh_model4, tanh_model4_norml]
-     name_1 = ['tanh_model3', 'tanh_model3_norml', 'tanh_model4', 'tanh_model4_norml']
-
-     train_003, test_003 = utils.model_comparison(model_1=model_1, name_1=name_1,
-                                                  train_data=train_loader,
-                                                  test_data=test_loader,
-                                                  num_epochs=num_epochs,
-                                                  optimizer=optim.SGD,
-                                                  criterion=nn.MSELoss(),
-                                                  lr=0.03, cla=False,
-                                                  eva=utils.mse_cla)
-
-     # run 3: lr = 0.01
-     tanh_model3 = net_class3(act_fun=torch.tanh, in_features=5)
-     tanh_model3_norml = net_class3(act_fun=torch.tanh, in_features=5, BN_model='pre')
-     tanh_model4 = net_class4(act_fun=torch.tanh, in_features=5)
-     tanh_model4_norml = net_class4(act_fun=torch.tanh, in_features=5, BN_model='pre')
-
-     model_1 = [tanh_model3, tanh_model3_norml, tanh_model4, tanh_model4_norml]
-     name_1 = ['tanh_model3', 'tanh_model3_norml', 'tanh_model4', 'tanh_model4_norml']
-
-     train_001, test_001 = utils.model_comparison(model_1=model_1, name_1=name_1,
-                                                  train_data=train_loader,
-                                                  test_data=test_loader,
-                                                  num_epochs=num_epochs,
-                                                  optimizer=optim.SGD,
-                                                  criterion=nn.MSELoss(),
-                                                  lr=0.01, cla=False,
-                                                  eva=utils.mse_cla)
-
-     # run 4: lr = 0.005
-     tanh_model3 = net_class3(act_fun=torch.tanh, in_features=5)
-     tanh_model3_norml = net_class3(act_fun=torch.tanh, in_features=5, BN_model='pre')
-     tanh_model4 = net_class4(act_fun=torch.tanh, in_features=5)
-     tanh_model4_norml = net_class4(act_fun=torch.tanh, in_features=5, BN_model='pre')
-
-     model_1 = [tanh_model3, tanh_model3_norml, tanh_model4, tanh_model4_norml]
-     name_1 = ['tanh_model3', 'tanh_model3_norml', 'tanh_model4', 'tanh_model4_norml']
-
-     train_0005, test_0005 = utils.model_comparison(model_1=model_1, name_1=name_1,
-                                                    train_data=train_loader,
-                                                    test_data=test_loader,
-                                                    num_epochs=num_epochs,
-                                                    optimizer=optim.SGD,
-                                                    criterion=nn.MSELoss(),
-                                                    lr=0.005, cla=False,
-                                                    eva=utils.mse_cla)
-
-     # plot the training curves, one subplot per learning rate
-     plt.subplot(221)  # lr = 0.1
-     for i, name in enumerate(name_1):
-         plt.plot(list(range(num_epochs)), train_01[i])
-
-     plt.subplot(222)  # lr = 0.03
-     for i, name in enumerate(name_1):
-         plt.plot(list(range(num_epochs)), train_003[i])
-
-     plt.subplot(223)  # lr = 0.01
-     for i, name in enumerate(name_1):
-         plt.plot(list(range(num_epochs)), train_001[i])
-
-     plt.subplot(224)  # lr = 0.005
-     for i, name in enumerate(name_1):
-         plt.plot(list(range(num_epochs)), train_0005[i], label=name)
-     plt.legend(loc=1)
-     plt.show()
One thing to note about this code: before every training run we re-instantiate brand-new networks. If we reused the same model objects, each run would start from the weights learned in the previous run, and the comparison across learning rates would no longer be fair.
Results:
(Figure: training-error curves of the four models, one subplot per learning rate: 0.1, 0.03, 0.01, 0.005.)
From the figure above we can basically conclude that changing the learning rate causes larger swings in the models that contain BN layers; in other words, models with BN are more sensitive to the learning rate, and for such models tuning the learning rate tends to yield better results.
The results also show that the smaller the learning rate, the more stable the model's behaviour. What is likely happening is this: with a large learning rate the model oscillates back and forth around the optimum, because the step size is so large that it keeps jumping over the optimal point. A smaller learning rate avoids this, but if it is too small the steps become so short that the model cannot reach the optimum within the given number of iterations. Plotted against the learning rate, the loss therefore looks roughly U-shaped: as the learning rate decreases the loss first keeps falling, and once past the minimum, making the learning rate even smaller makes the loss grow again. You can verify this with the code above.
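To make the overshoot/undershoot intuition concrete, here is a minimal self-contained toy sketch (not part of the utilities above, and not from the original post): plain gradient descent on a one-dimensional quadratic for a fixed number of steps, swept over several learning rates. A moderate learning rate reaches the minimum, a very large one oscillates or diverges, and a very small one barely moves.
- import torch
-
- def final_loss(lr, steps=40):
-     # minimize f(x) = (x - 3)^2 starting from x = 0 with plain gradient descent
-     x = torch.zeros(1, requires_grad=True)
-     for _ in range(steps):
-         loss = (x - 3) ** 2
-         loss.backward()
-         with torch.no_grad():
-             x -= lr * x.grad
-             x.grad.zero_()
-     return float((x - 3) ** 2)
-
- for lr in [1.5, 1.0, 0.5, 0.1, 0.01, 0.001]:
-     # too large -> diverges/oscillates, moderate -> converges, too small -> barely moves
-     print(f'lr={lr:<6} final loss={final_loss(lr):.4g}')
Printing the final loss for each learning rate reproduces the U-shape in miniature: the loss is huge at both ends of the sweep and smallest somewhere in the middle.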
Some common learning-rate scheduling strategies (a sketch of the roughly matching built-in PyTorch schedulers follows this list):
Power scheduling: lr → lr/2 → lr/3 → ...
Exponential scheduling: lr → lr/10^1 → lr/10^2 → ...
Piecewise constant scheduling: lower the learning rate at fixed intervals, e.g. lr for epochs 1 to 100, then lr/100 for epochs 100 to 200.
Performance scheduling: check the error every so often; once it stops improving, reduce the learning rate and keep iterating.
Cyclical scheduling: within each cycle the learning rate first increases and then decreases.
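As a hedged illustration (a minimal sketch, assuming a throw-away nn.Linear model and plain SGD; none of this appears in the original post), several of these strategies map directly onto built-in classes in torch.optim.lr_scheduler:
- from torch import nn, optim
- from torch.optim import lr_scheduler
-
- def make_opt():
-     # a fresh throw-away model/optimizer for each scheduler in this sketch
-     return optim.SGD(nn.Linear(5, 1).parameters(), lr=0.05)
-
- # exponential scheduling: lr, lr/10, lr/100, ... (one decay per scheduler.step())
- sched_exp = lr_scheduler.ExponentialLR(make_opt(), gamma=0.1)
-
- # piecewise constant scheduling: divide lr by 100 every 100 epochs
- sched_step = lr_scheduler.StepLR(make_opt(), step_size=100, gamma=0.01)
-
- # performance scheduling: shrink lr when the monitored loss stops improving
- # (its step() is called with the metric, e.g. sched_plateau.step(val_loss))
- sched_plateau = lr_scheduler.ReduceLROnPlateau(make_opt(), mode='min',
-                                                factor=0.1, patience=5)
-
- # cyclical scheduling: lr climbs from base_lr to max_lr and falls back within each cycle
- sched_cyclic = lr_scheduler.CyclicLR(make_opt(), base_lr=0.001, max_lr=0.05,
-                                      step_size_up=20, cycle_momentum=False)
Power scheduling has no dedicated class but can be expressed with LambdaLR, which is introduced next. In practice exactly one scheduler is attached to a given optimizer, and its step() is usually called once per epoch after optimizer.step().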
The classes that let an optimizer adjust its learning rate dynamically during training are called learning-rate schedulers. Of all the schedulers, LambdaLR is the most general one.
- from torch.optim import lr_scheduler
- import torch.nn as nn
- import torch
-
- # lambda function: the multiplicative factor applied to the initial lr at each epoch
- lr_lambda = lambda epoch: 0.5 ** epoch
-
- # two-hidden-layer network with optional BN, placed before ('pre') or after ('post') the activation
- class net_class2(nn.Module):
-     def __init__(self, act_fun=torch.relu, in_features=2, n_hidden1=4, n_hidden2=4,
-                  out_features=1, bias=True, BN_model=None, momentum=0.1):
-         super(net_class2, self).__init__()
-         self.linear1 = nn.Linear(in_features, n_hidden1, bias=bias)
-         self.bn1 = nn.BatchNorm1d(n_hidden1, momentum=momentum)
-         self.linear2 = nn.Linear(n_hidden1, n_hidden2, bias=bias)
-         self.bn2 = nn.BatchNorm1d(n_hidden2, momentum=momentum)
-         self.linear3 = nn.Linear(n_hidden2, out_features, bias=bias)
-         self.act_fun = act_fun
-         self.BN_model = BN_model
-
-     def forward(self, x):
-         if self.BN_model == 'pre':
-             # BN on the linear output, before the activation
-             z1 = self.bn1(self.linear1(x))
-             f1 = self.act_fun(z1)
-             z2 = self.bn2(self.linear2(f1))
-             out = self.linear3(self.act_fun(z2))
-         elif self.BN_model == 'post':
-             # BN after the activation, before the next linear layer
-             z1 = self.linear1(x)
-             f1 = self.act_fun(z1)
-             z2 = self.linear2(self.bn1(f1))
-             f2 = self.act_fun(z2)
-             out = self.linear3(self.bn2(f2))
-         else:
-             # no BN
-             z1 = self.linear1(x)
-             f1 = self.act_fun(z1)
-             z2 = self.linear2(f1)
-             out = self.linear3(self.act_fun(z2))
-         return out
-
- torch.manual_seed(422)
-
- tahn_model1 = net_class2(act_fun=torch.tanh, in_features=5, BN_model='pre')
-
- optimizer = torch.optim.SGD(tahn_model1.parameters(), lr=0.05)
-
- # print(optimizer.state_dict())
-
- # create the learning-rate scheduler
- scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda)
-
- print(optimizer.state_dict())
The output is as follows:
{'state': {}, 'param_groups': [{'lr': 0.05, 'momentum': 0, 'dampening': 0, 'weight_decay': 0, 'nesterov': False, 'initial_lr': 0.05, 'params': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]}]}
Here initial_lr is the original learning rate and lr is the learning rate that will be used in the next round of training. According to the scheduler we set up, the update rule is new_lr = lr_lambda(epoch) * initial_lr.
Updating the learning rate:
- # train_loader is the loader built earlier; use the same MSE loss as before
- criterion = nn.MSELoss()
-
- for X, y in train_loader:
-     yhat = tahn_model1.forward(X)
-     loss = criterion(yhat, y)
-     optimizer.zero_grad()
-     loss.backward()
-     optimizer.step()
- scheduler.step()  # called once per epoch, after the full pass over the data
At this point the data has been trained on once, so epoch = 1 and the learning rate is updated to 0.5^1 * 0.05 = 0.025.
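To see the decay pattern over several epochs without running any data, here is a minimal self-contained sketch (re-creating the optimizer and scheduler with the same settings as above; the training pass is only indicated by a placeholder comment) that calls scheduler.step() once per simulated epoch and prints the current learning rate:
- model = net_class2(act_fun=torch.tanh, in_features=5, BN_model='pre')
- optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
- scheduler = lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.5 ** epoch)
-
- for epoch in range(1, 6):
-     # ...one full pass over train_loader (optimizer.step() etc.) would go here...
-     scheduler.step()
-     print(epoch, optimizer.param_groups[0]['lr'])
- # lr after each step is 0.05 * 0.5 ** epoch: 0.025, 0.0125, 0.00625, ...
The learning rate is halved at every epoch, exactly as the update rule new_lr = lr_lambda(epoch) * initial_lr prescribes for this lambda.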