• NNDL Assignment 11: Comparison of Optimization Algorithms


     

    Contents

    1. Implement Figure 6-1 in code and observe its features

    2. Observe the gradient directions

    3. Implement the algorithms in code and visualize their trajectories

    4. Analyze the figure above and explain the principles (optional)

    5. Summarize the pros and cons of SGD, Momentum, AdaGrad, and Adam (optional)

    6. If Adam is this good, is SGD no longer needed? (optional)

    7. Add the RMSprop and Nesterov algorithms. (optional)

    8. Comparison of update methods on the MNIST dataset (optional)


    Reference: Deep Learning from Scratch (深度学习入门:基于Python的理论与实现, ituring.com.cn)

    1. Implement Figure 6-1 in code and observe its features

     

     

     Reference code:

import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # only needed on very old matplotlib versions

# https://blog.csdn.net/weixin_39228381/article/details/108511882
def func(x, y):
    return x * x / 20 + y * y

def paint_loss_func():
    x = np.linspace(-50, 50, 100)  # plot x over [-50, 50], 100 evenly spaced points
    y = np.linspace(-50, 50, 100)  # plot y over [-50, 50], 100 evenly spaced points
    X, Y = np.meshgrid(x, y)
    Z = func(X, Y)

    fig = plt.figure()  # figsize=(10, 10)
    ax = fig.add_subplot(projection='3d')  # Axes3D(fig) no longer attaches itself to the figure on recent matplotlib
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap='rainbow')
    plt.show()

paint_loss_func()
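
    The feature to observe is that f(x, y) = x*x/20 + y*y rises 20 times more slowly along x than along y. As a small supplementary sketch (not part of the original post), the same function can also be viewed as a contour plot, where the stretched ellipses make this anisotropy obvious:

import numpy as np
import matplotlib.pyplot as plt

# Contour view of f(x, y) = x^2/20 + y^2: the level curves are ellipses
# elongated along x, which is the shape the optimizer trajectories react to.
x = np.linspace(-10, 10, 200)
y = np.linspace(-10, 10, 200)
X, Y = np.meshgrid(x, y)
Z = X ** 2 / 20 + Y ** 2

plt.contour(X, Y, Z, levels=20)
plt.xlabel('x')
plt.ylabel('y')
plt.gca().set_aspect('equal')
plt.show()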

     2. Observe the gradient directions
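
    This section of the post contains no code. As a minimal sketch (an addition, assuming the same f(x, y) = x**2/20 + y**2 as above), one way to visualize the gradient directions is a quiver plot of the negative gradient:

import numpy as np
import matplotlib.pyplot as plt

# The gradient of f(x, y) = x^2/20 + y^2 is (x/10, 2y).
x = np.arange(-10, 10.5, 1.0)
y = np.arange(-5, 5.5, 1.0)
X, Y = np.meshgrid(x, y)
U, V = X / 10.0, 2.0 * Y  # gradient components

# Plot the negative gradient, i.e. the direction a descent step would take.
# Most arrows point toward the line y = 0 rather than toward the minimum (0, 0).
plt.quiver(X, Y, -U, -V, angles='xy', color='#666666')
plt.xlim(-10, 10)
plt.ylim(-5, 5)
plt.xlabel('x')
plt.ylabel('y')
plt.grid()
plt.show()

    Because the y component of the gradient is much larger than the x component at comparable distances from the minimum, the arrows point mostly up or down; this is what later makes plain SGD zigzag.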

    3. Implement the algorithms in code and visualize their trajectories

    SGD, Momentum, AdaGrad, Adam

    Reference code:

# coding: utf-8
import numpy as np
import matplotlib.pyplot as plt
from collections import OrderedDict


class SGD:
    """Stochastic Gradient Descent"""
    def __init__(self, lr=0.01):
        self.lr = lr

    def update(self, params, grads):
        for key in params.keys():
            params[key] -= self.lr * grads[key]


class Momentum:
    """Momentum SGD"""
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def update(self, params, grads):
        if self.v is None:
            self.v = {}
            for key, val in params.items():
                self.v[key] = np.zeros_like(val)

        for key in params.keys():
            self.v[key] = self.momentum * self.v[key] - self.lr * grads[key]
            params[key] += self.v[key]


class Nesterov:
    """Nesterov's Accelerated Gradient (http://arxiv.org/abs/1212.0901)"""
    def __init__(self, lr=0.01, momentum=0.9):
        self.lr = lr
        self.momentum = momentum
        self.v = None

    def update(self, params, grads):
        if self.v is None:
            self.v = {}
            for key, val in params.items():
                self.v[key] = np.zeros_like(val)

        for key in params.keys():
            self.v[key] *= self.momentum
            self.v[key] -= self.lr * grads[key]
            params[key] += self.momentum * self.momentum * self.v[key]
            params[key] -= (1 + self.momentum) * self.lr * grads[key]


class AdaGrad:
    """AdaGrad"""
    def __init__(self, lr=0.01):
        self.lr = lr
        self.h = None

    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)

        for key in params.keys():
            self.h[key] += grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)


class RMSprop:
    """RMSprop"""
    def __init__(self, lr=0.01, decay_rate=0.99):
        self.lr = lr
        self.decay_rate = decay_rate
        self.h = None

    def update(self, params, grads):
        if self.h is None:
            self.h = {}
            for key, val in params.items():
                self.h[key] = np.zeros_like(val)

        for key in params.keys():
            self.h[key] *= self.decay_rate
            self.h[key] += (1 - self.decay_rate) * grads[key] * grads[key]
            params[key] -= self.lr * grads[key] / (np.sqrt(self.h[key]) + 1e-7)


class Adam:
    """Adam (http://arxiv.org/abs/1412.6980v8)"""
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None

    def update(self, params, grads):
        if self.m is None:
            self.m, self.v = {}, {}
            for key, val in params.items():
                self.m[key] = np.zeros_like(val)
                self.v[key] = np.zeros_like(val)

        self.iter += 1
        lr_t = self.lr * np.sqrt(1.0 - self.beta2 ** self.iter) / (1.0 - self.beta1 ** self.iter)

        for key in params.keys():
            self.m[key] += (1 - self.beta1) * (grads[key] - self.m[key])
            self.v[key] += (1 - self.beta2) * (grads[key] ** 2 - self.v[key])
            params[key] -= lr_t * self.m[key] / (np.sqrt(self.v[key]) + 1e-7)


def f(x, y):
    return x ** 2 / 20.0 + y ** 2


def df(x, y):
    return x / 10.0, 2.0 * y


init_pos = (-7.0, 2.0)
params = {}
params['x'], params['y'] = init_pos[0], init_pos[1]
grads = {}
grads['x'], grads['y'] = 0, 0

optimizers = OrderedDict()
optimizers["SGD"] = SGD(lr=0.95)
optimizers["Momentum"] = Momentum(lr=0.1)
optimizers["AdaGrad"] = AdaGrad(lr=1.5)
optimizers["Adam"] = Adam(lr=0.3)

idx = 1
for key in optimizers:
    optimizer = optimizers[key]
    x_history = []
    y_history = []
    params['x'], params['y'] = init_pos[0], init_pos[1]

    for i in range(30):
        x_history.append(params['x'])
        y_history.append(params['y'])
        grads['x'], grads['y'] = df(params['x'], params['y'])
        optimizer.update(params, grads)

    x = np.arange(-10, 10, 0.01)
    y = np.arange(-5, 5, 0.01)
    X, Y = np.meshgrid(x, y)
    Z = f(X, Y)

    # clip the high region so the contour plot stays simple
    mask = Z > 7
    Z[mask] = 0

    # plot
    plt.subplot(2, 2, idx)
    idx += 1
    plt.plot(x_history, y_history, 'o-', color="red")
    plt.contour(X, Y, Z)  # draw the contour lines
    plt.ylim(-10, 10)
    plt.xlim(-10, 10)
    plt.plot(0, 0, '+')
    plt.title(key)
    plt.xlabel("x")
    plt.ylabel("y")

plt.subplots_adjust(wspace=0, hspace=0)  # adjust spacing between subplots
plt.show()

    4. Analyze the figure above and explain the principles (optional)

    1. Why does SGD follow a zigzag path, while the other algorithms produce much smoother trajectories? (See the short sketch after this list.)
    2. Where do Momentum and AdaGrad improve on SGD: in the step size, in the direction, or both? How does this show up in the figure?
    3. Judging only from the trajectories, Adam seems to do worse than AdaGrad. Is that really the case?
    4. How long did each of the four methods take? Does that match your expectations?
    5. How do the trajectories change when you adjust the learning rate, momentum, and other hyperparameters?
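
    As a starting point for question 1, here is a minimal self-contained sketch (an addition, not part of the original assignment) that prints the first few plain gradient-descent steps on f(x, y) = x**2/20 + y**2 with the same starting point and learning rate as the SGD panel. The sign of y flips at almost every step while x shrinks slowly, which is exactly the zigzag visible in the plot:

# Plain gradient descent on f(x, y) = x**2/20 + y**2, starting from (-7, 2), lr = 0.95
# (the same setup as the SGD panel in the figure above).
x, y = -7.0, 2.0
lr = 0.95
for step in range(8):
    dx, dy = x / 10.0, 2.0 * y          # gradient of f
    x, y = x - lr * dx, y - lr * dy     # SGD update
    # y is multiplied by (1 - 2*lr) = -0.9 each step, so its sign flips (the zigzag),
    # while x is multiplied by (1 - lr/10) = 0.905 and only shrinks slowly.
    print(f"step {step + 1}: x = {x:+.3f}, y = {y:+.3f}")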

    5. Summarize the pros and cons of SGD, Momentum, AdaGrad, and Adam (optional)
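
    For reference when writing the summary, these are the update rules that the classes above implement, written out in standard notation (learning rate η, momentum α, decay rates β1 and β2, and a small ε for numerical stability):

\begin{aligned}
\text{SGD:}\quad & W \leftarrow W - \eta \frac{\partial L}{\partial W} \\
\text{Momentum:}\quad & v \leftarrow \alpha v - \eta \frac{\partial L}{\partial W}, \qquad W \leftarrow W + v \\
\text{AdaGrad:}\quad & h \leftarrow h + \frac{\partial L}{\partial W} \odot \frac{\partial L}{\partial W}, \qquad W \leftarrow W - \frac{\eta}{\sqrt{h} + \varepsilon} \frac{\partial L}{\partial W} \\
\text{Adam:}\quad & m \leftarrow \beta_1 m + (1 - \beta_1) \frac{\partial L}{\partial W}, \qquad v \leftarrow \beta_2 v + (1 - \beta_2) \left(\frac{\partial L}{\partial W}\right)^{2}, \\
& \hat{m} = \frac{m}{1 - \beta_1^{t}}, \qquad \hat{v} = \frac{v}{1 - \beta_2^{t}}, \qquad W \leftarrow W - \frac{\eta}{\sqrt{\hat{v}} + \varepsilon} \hat{m}
\end{aligned}

    The Adam class above folds the bias correction into the step size lr_t, which is equivalent to the hat-corrected form up to where ε is applied.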

    6. If Adam is this good, is SGD no longer needed? (optional)

    7. Add the RMSprop and Nesterov algorithms. (optional)

    Compare Momentum with Nesterov, and AdaGrad with RMSprop. A sketch of how to extend the trajectory comparison is given below.
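
    The Nesterov and RMSprop classes are already defined in the trajectory-comparison script of section 3, so one way to include them (a sketch; the subplot layout and learning rates here are illustrative choices, not values from the original post) is to extend the optimizers dictionary and widen the subplot grid:

# Add the two extra optimizers to the comparison in section 3.
optimizers = OrderedDict()
optimizers["SGD"] = SGD(lr=0.95)
optimizers["Momentum"] = Momentum(lr=0.1)
optimizers["Nesterov"] = Nesterov(lr=0.1)
optimizers["AdaGrad"] = AdaGrad(lr=1.5)
optimizers["RMSprop"] = RMSprop(lr=0.3)
optimizers["Adam"] = Adam(lr=0.3)

# With six optimizers, change the subplot call inside the plotting loop to a 2 x 3 grid:
#     plt.subplot(2, 3, idx)

    Placing Momentum next to Nesterov and AdaGrad next to RMSprop in the grid makes the pairwise comparison easier to read.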

    8. Comparison of update methods on the MNIST dataset (optional)

    Add the RMSprop and Nesterov algorithms on top of the original figure.

    Implement this in code and discuss your own observations.

     The optimizer code can reuse what was shown above.

     Comparison of update methods on the MNIST dataset

# coding: utf-8
import os
import sys
sys.path.append(os.pardir)  # settings for importing files from the parent directory
import numpy as np
import matplotlib.pyplot as plt
from dataset.mnist import load_mnist
from common.util import smooth_curve
from common.multi_layer_net import MultiLayerNet
from common.optimizer import *


# 0: read in the MNIST data ==========
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)

train_size = x_train.shape[0]
batch_size = 128
max_iterations = 2000


# 1: set up the experiment ==========
optimizers = {}
optimizers['SGD'] = SGD()
optimizers['Momentum'] = Momentum()
optimizers['AdaGrad'] = AdaGrad()
optimizers['Adam'] = Adam()
#optimizers['RMSprop'] = RMSprop()

networks = {}
train_loss = {}
for key in optimizers.keys():
    networks[key] = MultiLayerNet(
        input_size=784, hidden_size_list=[100, 100, 100, 100],
        output_size=10)
    train_loss[key] = []


# 2: start training ==========
for i in range(max_iterations):
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    for key in optimizers.keys():
        grads = networks[key].gradient(x_batch, t_batch)
        optimizers[key].update(networks[key].params, grads)

        loss = networks[key].loss(x_batch, t_batch)
        train_loss[key].append(loss)

    if i % 100 == 0:
        print("===========" + "iteration:" + str(i) + "===========")
        for key in optimizers.keys():
            loss = networks[key].loss(x_batch, t_batch)
            print(key + ":" + str(loss))


# 3: draw the graph ==========
markers = {"SGD": "o", "Momentum": "x", "AdaGrad": "s", "Adam": "D"}
x = np.arange(max_iterations)
for key in optimizers.keys():
    plt.plot(x, smooth_curve(train_loss[key]), marker=markers[key], markevery=100, label=key)
plt.xlabel("iterations")
plt.ylabel("loss")
plt.ylim(0, 1)
plt.legend()
plt.show()
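
    To include RMSprop and Nesterov in this comparison as well (assuming both classes are provided by the book's common/optimizer.py, which the wildcard import above pulls in), it should be enough to register them in the experiment setup and give them plot markers; a sketch:

# Sketch: extend the experiment above with two more optimizers.
optimizers['RMSprop'] = RMSprop()
optimizers['Nesterov'] = Nesterov()

# The plotting step indexes `markers` by optimizer name, so add markers for the new keys:
markers = {"SGD": "o", "Momentum": "x", "AdaGrad": "s", "Adam": "D",
           "RMSprop": "v", "Nesterov": "^"}

    Everything else (network construction, the training loop, and the plotting loop) already iterates over optimizers.keys(), so no other change is needed.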

  • Original post: https://blog.csdn.net/qq_38975453/article/details/128025693