Monte Carlo Policy Gradient (REINFORCE) in PyTorch Deep Reinforcement Learning: Hands-On with the Short-Corridor and CartPole Environments (Detailed, with Source Code)


If you need the full source code, please like, follow, and bookmark, then leave your QQ in the comments~~~

I. Policy Gradient Methods

Policy gradient (PG) methods use a policy function to select actions directly, while a value function is used to assist in updating the policy parameters. Depending on the type of policy, they can be divided into stochastic policy gradient and deterministic policy gradient methods.
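
For reference (a standard statement added here, not spelled out in the original post), the gradient of the performance objective J(θ) takes the following well-known forms for a stochastic policy π_θ and a deterministic policy μ_θ, respectively:

    \nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}\big[\nabla_\theta \ln \pi_\theta(a \mid s)\, Q^{\pi_\theta}(s,a)\big]

    \nabla_\theta J(\theta) = \mathbb{E}\big[\nabla_\theta \mu_\theta(s)\, \nabla_a Q^{\mu_\theta}(s,a)\big|_{a=\mu_\theta(s)}\big]

The REINFORCE algorithm discussed below is the Monte Carlo instantiation of the stochastic form.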

Compared with value-function approximation methods, PG methods have the following advantages:

1: Smooth convergence

During learning, every PG update moves the policy's weight parameters toward the optimum by only a small amount, which gives the method strong convergence properties. Value-function approximation methods improve the policy greedily, and in the later stages some value functions keep oscillating slightly around the optimal value function without ever converging, i.e., policy degradation occurs.

2: Handling tasks with continuous action spaces

A PG method yields a policy directly, whereas a value-function approximation method has to compare the values of every action available in state S.

3: Learning stochastic policies

A PG method can output a stochastic policy, whereas a value-function approximation method acts greedily and always outputs a deterministic action (see the sketch after this list).

The disadvantages of PG methods are:

1: PG methods usually converge only to a local optimum.

2: The same properties that make PG methods easy to converge and smooth to train also make the agent spend many steps on ineffective exploration. This lowers learning efficiency, increases the variance of the overall policy (gradient estimate), and introduces over-estimation caused by accumulated errors.
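
As a minimal sketch of advantages 2 and 3 (an illustration added here, not code from the original post), the snippet below contrasts a parameterized softmax policy, which samples its action from learned preferences, with a value-based method, which must evaluate every action's Q-value and take the argmax; the arrays `theta` and `q_values` are purely illustrative.

    import numpy as np

    rng = np.random.default_rng(0)

    # Policy-gradient style: a parameterized stochastic policy.
    theta = np.array([0.2, -0.1, 0.4])           # illustrative action preferences
    probs = np.exp(theta - theta.max())
    probs /= probs.sum()                          # softmax turns preferences into probabilities
    pg_action = rng.choice(len(probs), p=probs)   # sample an action: the policy stays stochastic

    # Value-function style: greedy selection over estimated action values.
    q_values = np.array([1.3, 0.7, 1.1])          # illustrative Q(s, a) estimates
    greedy_action = int(np.argmax(q_values))      # must compare all actions; always deterministic

    print(pg_action, greedy_action)

Sampling also extends naturally to continuous action spaces (e.g., sampling from a parameterized Gaussian), whereas the argmax requires enumerating or optimizing over all actions.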

II. Monte Carlo Policy Gradient (REINFORCE)

The Monte Carlo policy gradient method is a PG method for episodic tasks that is based on the MC algorithm.

REINFORCE uses Monte Carlo returns to estimate the action value and considers only the action the agent actually took in state S; in theory, this guarantees the convergence of the policy parameters.
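
Concretely, once an episode has finished, every time step t contributes the parameter update below; this is exactly what the `episode_end` method of the corridor agent in Section III implements:

    \theta_{t+1} = \theta_t + \alpha\, \gamma^{t}\, G_t\, \nabla_\theta \ln \pi(A_t \mid S_t, \theta_t)

where G_t is the return accumulated from step t to the end of the episode, α is the step size, and γ is the discount factor. (The CartPole listing additionally standardizes the returns of each episode before the update to reduce variance.)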

1: Experimental setup

The experiments use the short-corridor grid-world environment (Example 13.1 in Sutton and Barto).

Every step yields a reward of -1. Each of the three non-terminal states offers two actions, left and right. The twist is that in the first state moving left leaves the agent where it is, while in the second state the actions are switched, so each action moves the agent in the opposite direction.
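
Because the dynamics are known, the value of the start state under a policy that chooses 'right' with probability p can be obtained by solving the Bellman equations by hand; this is what the `true_value` helper in the corridor code below computes:

    v_p(S_{\text{start}}) = \frac{2p - 4}{p\,(1 - p)}

The expression is maximized at p = 2 - \sqrt{2} \approx 0.59, where it equals roughly -11.6, which is the dashed reference line drawn by `figure_13_1` and `figure_13_2` in the code.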

     

Some of the hyperparameters can be read off the code in Section III: for CartPole, discount factor γ = 0.99, Adam learning rate 1e-2, and random seed 543; for the short corridor, γ = 1 with step sizes α ∈ {2e-3, 2e-4, 2e-5}.

2: Analysis of experimental results

Each short-corridor trial is trained for 1000 episodes, which is enough for the learning curves to converge; the CartPole script runs longer (up to 10000 episodes, plotting every 1000). As the figures below show, REINFORCE exhibits the same overall trend in both environments: the return rises steadily and then levels off.

The curve after the first 1000 episodes is shown below (the script produces a visualization every 1000 episodes).

The result after 2000 episodes:

     

After 8000 episodes the curve has essentially converged at a high reward level.

     

III. Code

Part of the source code is shown below.

    # Code 38: experimental procedure of the REINFORCE algorithm
    # CartPole environment
    import argparse
    import gym
    import numpy as np
    from itertools import count
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.optim as optim
    from torch.distributions import Categorical
    import matplotlib.pyplot as plt
    import os

    os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

    parser = argparse.ArgumentParser(description='PyTorch REINFORCE example')
    parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                        help='discount factor (default: 0.99)')
    parser.add_argument('--seed', type=int, default=543, metavar='N',
                        help='random seed (default: 543)')
    parser.add_argument('--render', action='store_true',
                        help='render the environment')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='interval between training status logs (default: 10)')
    args = parser.parse_args()

    # note: this listing uses the classic Gym API (env.seed and 4-tuple step returns)
    env = gym.make('CartPole-v1')
    env.seed(args.seed)
    torch.manual_seed(args.seed)


    class Policy(nn.Module):
        def __init__(self):
            super(Policy, self).__init__()
            self.affine1 = nn.Linear(4, 128)
            self.dropout = nn.Dropout(p=0.6)
            self.affine2 = nn.Linear(128, 2)
            self.saved_log_probs = []
            self.rewards = []

        def forward(self, x):
            x = self.affine1(x)
            x = self.dropout(x)
            x = F.relu(x)
            action_scores = self.affine2(x)
            return F.softmax(action_scores, dim=1)


    policy = Policy()
    optimizer = optim.Adam(policy.parameters(), lr=1e-2)
    eps = np.finfo(np.float32).eps.item()


    def select_action(state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = policy(state)
        m = Categorical(probs)
        action = m.sample()
        policy.saved_log_probs.append(m.log_prob(action))
        return action.item()


    def finish_episode():
        R = 0
        policy_loss = []
        rewards = []
        # compute discounted returns, working backwards through the episode
        for r in policy.rewards[::-1]:
            R = r + args.gamma * R
            rewards.insert(0, R)
        rewards = torch.tensor(rewards)
        # standardize the returns to reduce the variance of the gradient estimate
        rewards = (rewards - rewards.mean()) / (rewards.std() + eps)
        for log_prob, reward in zip(policy.saved_log_probs, rewards):
            policy_loss.append(-log_prob * reward)
        optimizer.zero_grad()
        policy_loss = torch.cat(policy_loss).sum()
        policy_loss.backward()
        optimizer.step()
        del policy.rewards[:]
        del policy.saved_log_probs[:]


    def plot(epi, run_time):
        plt.title('Training')
        plt.xlabel('Episode')
        plt.ylabel('Run Time')
        plt.plot(epi, run_time, color='red')
        plt.show()


    def main():
        running_reward = 10
        running_rewards = []
        i_episodes = []
        for i_episode in range(10000):
            state, ep_reward = env.reset(), 0
            for t in range(1, 10000):  # don't infinite loop while learning
                action = select_action(state)
                state, reward, done, _ = env.step(action)
                if args.render:
                    env.render()
                policy.rewards.append(reward)
                ep_reward += reward
                if done:
                    break
            # exponential moving average of the episode reward
            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
            finish_episode()
            running_rewards.append(running_reward)
            i_episodes.append(i_episode)
            if i_episode % args.log_interval == 0:
                print('Episode {}\tLast length: {:.2f}\tAverage length: {:.2f}'.format(
                    i_episode, ep_reward, running_reward))
            if i_episode % 1000 == 0:
                plot(i_episodes, running_rewards)
                np.save("putu", running_rewards)


    if __name__ == '__main__':
        main()

    # Short-corridor environment
    import numpy as np
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from tqdm import tqdm


    def true_value(p):
        """True value of the first state.

        Args:
            p (float): probability of the action 'right'.
        Returns:
            True value of the first state.
        The expression is obtained by manually solving the easy linear system
        of Bellman equations using known dynamics.
        """
        return (2 * p - 4) / (p * (1 - p))


    class ShortCorridor:
        """
        Short corridor environment, see Example 13.1
        """
        def __init__(self):
            self.reset()

        def reset(self):
            self.state = 0

        def step(self, go_right):
            """
            Args:
                go_right (bool): chosen action
            Returns:
                tuple of (reward, episode terminated?)
            """
            if self.state == 0 or self.state == 2:
                if go_right:
                    self.state += 1
                else:
                    self.state = max(0, self.state - 1)
            else:
                # in the middle state the actions are switched
                if go_right:
                    self.state -= 1
                else:
                    self.state += 1
            if self.state == 3:
                # terminal state
                return 0, True
            else:
                return -1, False


    def softmax(x):
        t = np.exp(x - np.max(x))
        return t / np.sum(t)


    class ReinforceAgent:
        """
        ReinforceAgent that follows the algorithm
        'REINFORCE Monte-Carlo Policy-Gradient Control (episodic)'
        """
        def __init__(self, alpha, gamma):
            # set values such that initial conditions correspond to left-epsilon greedy
            self.theta = np.array([-1.47, 1.47])
            self.alpha = alpha
            self.gamma = gamma
            # first column - left, second - right
            self.x = np.array([[0, 1],
                               [1, 0]])
            self.rewards = []
            self.actions = []

        def get_pi(self):
            h = np.dot(self.theta, self.x)
            t = np.exp(h - np.max(h))
            pmf = t / np.sum(t)
            # never become deterministic,
            # guarantees episode finish
            imin = np.argmin(pmf)
            epsilon = 0.05
            if pmf[imin] < epsilon:
                pmf[:] = 1 - epsilon
                pmf[imin] = epsilon
            return pmf

        def get_p_right(self):
            return self.get_pi()[1]

        def choose_action(self, reward):
            if reward is not None:
                self.rewards.append(reward)
            pmf = self.get_pi()
            go_right = np.random.uniform() <= pmf[1]
            self.actions.append(go_right)
            return go_right

        def episode_end(self, last_reward):
            self.rewards.append(last_reward)
            # learn theta: compute the return G for every step of the episode
            G = np.zeros(len(self.rewards))
            G[-1] = self.rewards[-1]
            for i in range(2, len(G) + 1):
                G[-i] = self.gamma * G[-i + 1] + self.rewards[-i]
            gamma_pow = 1
            for i in range(len(G)):
                j = 1 if self.actions[i] else 0
                pmf = self.get_pi()
                grad_ln_pi = self.x[:, j] - np.dot(self.x, pmf)
                update = self.alpha * gamma_pow * G[i] * grad_ln_pi
                self.theta += update
                gamma_pow *= self.gamma
            self.rewards = []
            self.actions = []


    class ReinforceBaselineAgent(ReinforceAgent):
        def __init__(self, alpha, gamma, alpha_w):
            super(ReinforceBaselineAgent, self).__init__(alpha, gamma)
            self.alpha_w = alpha_w
            self.w = 0

        def episode_end(self, last_reward):
            self.rewards.append(last_reward)
            # learn theta
            G = np.zeros(len(self.rewards))
            G[-1] = self.rewards[-1]
            for i in range(2, len(G) + 1):
                G[-i] = self.gamma * G[-i + 1] + self.rewards[-i]
            gamma_pow = 1
            for i in range(len(G)):
                # update the state-value baseline w, then use G - w in the policy update
                self.w += self.alpha_w * gamma_pow * (G[i] - self.w)
                j = 1 if self.actions[i] else 0
                pmf = self.get_pi()
                grad_ln_pi = self.x[:, j] - np.dot(self.x, pmf)
                update = self.alpha * gamma_pow * (G[i] - self.w) * grad_ln_pi
                self.theta += update
                gamma_pow *= self.gamma
            self.rewards = []
            self.actions = []


    def trial(num_episodes, agent_generator):
        env = ShortCorridor()
        agent = agent_generator()
        rewards = np.zeros(num_episodes)
        for episode_idx in range(num_episodes):
            rewards_sum = 0
            reward = None
            env.reset()
            while True:
                go_right = agent.choose_action(reward)
                reward, episode_end = env.step(go_right)
                rewards_sum += reward
                if episode_end:
                    agent.episode_end(reward)
                    break
            rewards[episode_idx] = rewards_sum
        return rewards


    def example_13_1():
        epsilon = 0.05
        fig, ax = plt.subplots(1, 1)
        # Plot a graph
        p = np.linspace(0.01, 0.99, 100)
        y = true_value(p)
        ax.plot(p, y, color='red')
        # Find a maximum point, can also be done analytically by taking a derivative
        imax = np.argmax(y)
        pmax = p[imax]
        ymax = y[imax]
        ax.plot(pmax, ymax, color='green', marker="*",
                label="optimal point: f({0:.2f}) = {1:.2f}".format(pmax, ymax))
        # Plot points of two epsilon-greedy policies
        ax.plot(epsilon, true_value(epsilon), color='magenta', marker="o", label="epsilon-greedy left")
        ax.plot(1 - epsilon, true_value(1 - epsilon), color='blue', marker="o", label="epsilon-greedy right")
        ax.set_ylabel("Value of the first state")
        ax.set_xlabel("Probability of the action 'right'")
        ax.set_title("Short corridor with switched actions")
        ax.set_ylim(ymin=-105.0, ymax=5)
        ax.legend()
        plt.savefig('../images/example_13_1.png')
        plt.close()


    def figure_13_1():
        num_trials = 100
        num_episodes = 1000
        gamma = 1
        agent_generators = [lambda: ReinforceAgent(alpha=2e-4, gamma=gamma),
                            lambda: ReinforceAgent(alpha=2e-5, gamma=gamma),
                            lambda: ReinforceAgent(alpha=2e-3, gamma=gamma)]
        labels = ['alpha = 2e-4',
                  'alpha = 2e-5',
                  'alpha = 2e-3']
        rewards = np.zeros((len(agent_generators), num_trials, num_episodes))
        for agent_index, agent_generator in enumerate(agent_generators):
            for i in tqdm(range(num_trials)):
                reward = trial(num_episodes, agent_generator)
                rewards[agent_index, i, :] = reward
        plt.plot(np.arange(num_episodes) + 1, -11.6 * np.ones(num_episodes),
                 ls='dashed', color='red', label='-11.6')
        for i, label in enumerate(labels):
            plt.plot(np.arange(num_episodes) + 1, rewards[i].mean(axis=0), label=label)
        plt.ylabel('total reward on episode')
        plt.xlabel('episode')
        plt.legend(loc='lower right')
        plt.savefig('../images/figure_13_1.png')
        plt.close()


    def figure_13_2():
        num_trials = 100
        num_episodes = 1000
        alpha = 2e-4
        gamma = 1
        # (the source listing is truncated here; reconstructed to compare
        # REINFORCE with and without the learned baseline defined above)
        agent_generators = [lambda: ReinforceAgent(alpha=alpha, gamma=gamma),
                            lambda: ReinforceBaselineAgent(alpha=alpha * 10, gamma=gamma,
                                                           alpha_w=alpha * 100)]
        labels = ['Reinforce without baseline',
                  'Reinforce with baseline']
        rewards = np.zeros((len(agent_generators), num_trials, num_episodes))
        for agent_index, agent_generator in enumerate(agent_generators):
            for i in tqdm(range(num_trials)):
                reward = trial(num_episodes, agent_generator)
                rewards[agent_index, i, :] = reward
        plt.plot(np.arange(num_episodes) + 1, -11.6 * np.ones(num_episodes),
                 ls='dashed', color='red', label='-11.6')
        for i, label in enumerate(labels):
            plt.plot(np.arange(num_episodes) + 1, rewards[i].mean(axis=0), label=label)
        plt.ylabel('total reward on episode')
        plt.xlabel('episode')
        plt.legend(loc='lower right')
        plt.savefig('../images/figure_13_2.png')
        plt.close()


    if __name__ == '__main__':
        example_13_1()
        figure_13_1()
        figure_13_2()
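
One practical note on the corridor script: all three plotting functions save their figures to a relative ../images/ directory, which must already exist. If your project is laid out differently, adjust the paths or create the folder first, for example:

    import os

    # create the output folder that example_13_1 / figure_13_1 / figure_13_2 expect
    os.makedirs('../images', exist_ok=True)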

Writing this up took real effort; if you found it helpful, please like, follow, and bookmark~~~

Original article: https://blog.csdn.net/jiebaoshayebuhui/article/details/128055746