• Deep Reinforcement Learning: The Double DQN Algorithm (Q-Learning + CNN) Explained, with Hands-On Practice on the Asterix Game (Super Detailed, with Source Code)


    If you need the source code and help setting up the environment, please like, follow, and bookmark, then leave your QQ in the comment section~~~

    1. Core Idea

    To address the overestimation problem that arises in DQN, the Deep Double Q-Network (DDQN) algorithm was proposed; it applies double Q-learning from reinforcement learning to DQN. In reinforcement learning, double Q-learning was introduced to alleviate, to some extent, the overestimation caused by Q-learning.

    The main idea of DDQN is to decouple action selection from action evaluation when computing the target value. During the update, two networks learn two sets of weights: the weights W of the prediction (online) network and the weights W' of the target network. In DQN, both action selection and action evaluation are carried out by the target network. In DDQN, when computing the target Q-value, the prediction network is used to select the best action, and the target network is then used to estimate the Q-value of that action. In this way, selecting the best action and estimating its value are separated and performed with different sets of weights, which helps keep the two estimates independent.
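
    Written out (this is the standard DDQN target formulation, added here for reference; W and W' are the prediction-network and target-network weights mentioned above), the only difference from DQN lies in how the next-state value is formed:

        y_t^{DQN}  = r_t + \gamma \max_a Q(s_{t+1}, a; W')
        y_t^{DDQN} = r_t + \gamma \, Q(s_{t+1}, \arg\max_a Q(s_{t+1}, a; W); W')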

    2. Experimental Results and Analysis

    The experiments in this section are run on the Asterix game. By controlling the parameter settings, the performance of DQN and DDQN is compared, which verifies that DDQN can alleviate the overestimation problem of DQN to a certain extent. DDQN requires two networks with different parameters; every 1000 steps the parameters of the prediction network are copied to the target network (see the sketch right after this paragraph). The experiment uses a replay buffer that can hold up to 1,000,000 transitions, and on each Atari game the DDQN algorithm is trained for 1,000,000 time steps.
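
    A minimal sketch of that synchronization schedule (assuming the agent and update_tar_interval names used in the code of Section 3):

        if i % update_tar_interval == 0:    # every 1000 time steps
            agent.DQN_target.load_state_dict(agent.DQN.state_dict())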

    The results are shown in the results figure: the final converged return of DDQN is clearly higher than that of DQN. During the experiments it can also be observed that DQN easily gets stuck in local optima. The problem lies mainly in the maximization operator of Q-Learning: when choosing an action, the agent always takes the action with the largest Q-value, whereas a real policy does not always pick the highest-valued action in a given state, since real policies are generally stochastic. Because the target directly takes the maximum over noisy Q-estimates, the target value therefore tends to be higher than the true value.
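
    As a tiny self-contained illustration of this bias (not part of the article's experiments; the numbers are made up), note that even when every individual Q-estimate is unbiased, the maximum over several noisy estimates is biased upward:

        import numpy as np

        rng = np.random.default_rng(0)
        true_q = np.zeros(5)                                        # five equally good actions, true value 0
        noisy_q = true_q + rng.normal(scale=1.0, size=(10000, 5))   # unbiased but noisy Q-estimates

        print(true_q.max())                 # 0.0: the true maximum
        print(noisy_q.max(axis=1).mean())   # roughly 1.16: the max over noisy estimates overestimates

    DDQN reduces exactly this kind of bias by letting one network pick the action and the other network evaluate it.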

    To address this overestimation of the value function, DDQN implements action selection and action evaluation with two different value functions. The results show that DDQN estimates Q-values more accurately and obtains more stable and effective policies on several Atari 2600 games.

    3. Code

    Part of the source code is shown below:

    import gym, random, pickle, os.path, math, glob
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import torch
    import torch.optim as optim
    import torch.nn as nn
    import torch.nn.functional as F
    import torch.autograd as autograd
    import pdb
    from IPython.display import clear_output                # used by plot_training
    from torch.utils.tensorboard import SummaryWriter        # used for TensorBoard logging
    from atari_wrappers import make_atari, wrap_deepmind, LazyFrames

    # device used when USE_CUDA is enabled
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    class DQN(nn.Module):
        def __init__(self, in_channels=4, num_actions=5):
            """
            in_channels: number of channels of the input,
                i.e. the number of most recent frames stacked together, as described in the paper
            num_actions: number of action-values to output, one-to-one correspondence to the actions in the game
            """
            super(DQN, self).__init__()
            self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
            self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
            self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
            self.fc4 = nn.Linear(7 * 7 * 64, 512)    # 7x7x64 assumes 84x84 input frames
            self.fc5 = nn.Linear(512, num_actions)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            x = F.relu(self.conv2(x))
            x = F.relu(self.conv3(x))
            x = F.relu(self.fc4(x.view(x.size(0), -1)))    # flatten the conv features
            return self.fc5(x)

    class Memory_Buffer(object):
        def __init__(self, memory_size=1000):
            self.buffer = []
            self.memory_size = memory_size
            self.next_idx = 0

        def push(self, state, action, reward, next_state, done):
            data = (state, action, reward, next_state, done)
            if len(self.buffer) < self.memory_size:    # buffer not full yet
                self.buffer.append(data)
            else:                                      # buffer is full: overwrite the oldest entry
                self.buffer[self.next_idx] = data
            self.next_idx = (self.next_idx + 1) % self.memory_size

        def sample(self, batch_size):
            states, actions, rewards, next_states, dones = [], [], [], [], []
            for i in range(batch_size):
                idx = random.randint(0, self.size() - 1)    # uniform sampling with replacement
                state, action, reward, next_state, done = self.buffer[idx]
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                next_states.append(next_state)
                dones.append(done)
            return np.concatenate(states), actions, rewards, np.concatenate(next_states), dones

        def size(self):
            return len(self.buffer)

    class DDQNAgent:
        def __init__(self, in_channels=1, action_space=[], USE_CUDA=False, memory_size=10000, epsilon=1, lr=1e-4):
            self.epsilon = epsilon
            self.action_space = action_space
            self.memory_buffer = Memory_Buffer(memory_size)
            # prediction (online) network and target network share the same architecture
            self.DQN = DQN(in_channels=in_channels, num_actions=action_space.n)
            self.DQN_target = DQN(in_channels=in_channels, num_actions=action_space.n)
            self.DQN_target.load_state_dict(self.DQN.state_dict())
            self.USE_CUDA = USE_CUDA
            if USE_CUDA:
                self.DQN = self.DQN.to(device)
                self.DQN_target = self.DQN_target.to(device)
            self.optimizer = optim.RMSprop(self.DQN.parameters(), lr=lr, eps=0.001, alpha=0.95)

        def observe(self, lazyframe):
            # convert a LazyFrames object to a normalized NCHW tensor
            state = torch.from_numpy(lazyframe._force().transpose(2, 0, 1)[None] / 255).float()
            if self.USE_CUDA:
                state = state.to(device)
            return state

        def value(self, state):
            q_values = self.DQN(state)
            return q_values

        def act(self, state, epsilon=None):
            """
            sample actions with epsilon-greedy policy
            recap: with p = epsilon pick random action, else pick action with highest Q(s,a)
            """
            if epsilon is None: epsilon = self.epsilon
            q_values = self.value(state).cpu().detach().numpy()
            if random.random() < epsilon:                       # explore
                action = random.randrange(self.action_space.n)
            else:                                               # exploit
                action = q_values.argmax(1)[0]
            return action

        def compute_td_loss(self, states, actions, rewards, next_states, is_done, gamma=0.99):
            """ Compute the TD loss using torch operations only. """
            actions = torch.tensor(actions).long()                  # shape: [batch_size]
            rewards = torch.tensor(rewards, dtype=torch.float)      # shape: [batch_size]
            is_done = torch.tensor(is_done, dtype=torch.bool)       # shape: [batch_size]
            if self.USE_CUDA:
                actions = actions.to(device)
                rewards = rewards.to(device)
                is_done = is_done.to(device)

            # Q-values of all actions in the current states (prediction network)
            predicted_qvalues = self.DQN(states)
            # Q-values of the actions that were actually taken
            predicted_qvalues_for_actions = predicted_qvalues[
                range(states.shape[0]), actions
            ]

            # Q-values for all actions in the next states
            ## This is where DDQN differs from DQN
            predicted_next_qvalues_current = self.DQN(next_states)         # prediction network selects the action
            predicted_next_qvalues_target = self.DQN_target(next_states)   # target network evaluates it
            # V*(next_states): target-network value of the action chosen by the prediction network
            next_state_values = predicted_next_qvalues_target.gather(
                1, torch.max(predicted_next_qvalues_current, 1)[1].unsqueeze(1)).squeeze(1)

            # "target q-values" used in the loss
            target_qvalues_for_actions = rewards + gamma * next_state_values
            # at terminal states use the simplified target Q(s,a) = r(s,a), since s' does not exist
            target_qvalues_for_actions = torch.where(
                is_done, rewards, target_qvalues_for_actions)

            # Huber (smooth L1) loss between predicted and target Q-values
            # loss = torch.mean((predicted_qvalues_for_actions - target_qvalues_for_actions.detach()) ** 2)
            loss = F.smooth_l1_loss(predicted_qvalues_for_actions, target_qvalues_for_actions.detach())
            return loss

        def sample_from_buffer(self, batch_size):
            states, actions, rewards, next_states, dones = [], [], [], [], []
            for i in range(batch_size):
                idx = random.randint(0, self.memory_buffer.size() - 1)
                frame, action, reward, next_frame, done = self.memory_buffer.buffer[idx]
                states.append(self.observe(frame))
                actions.append(action)
                rewards.append(reward)
                next_states.append(self.observe(next_frame))
                dones.append(done)
            return torch.cat(states), actions, rewards, torch.cat(next_states), dones

        def learn_from_experience(self, batch_size):
            if self.memory_buffer.size() > batch_size:
                states, actions, rewards, next_states, dones = self.sample_from_buffer(batch_size)
                td_loss = self.compute_td_loss(states, actions, rewards, next_states, dones)
                self.optimizer.zero_grad()
                td_loss.backward()
                for param in self.DQN.parameters():
                    param.grad.data.clamp_(-1, 1)    # gradient clipping
                self.optimizer.step()
                return td_loss.item()
            else:
                return 0

    def moving_average(a, n=3):
        ret = np.cumsum(a, dtype=float)
        ret[n:] = ret[n:] - ret[:-n]
        return ret[n - 1:] / n

    def plot_training(frame_idx, rewards, losses):
        clear_output(True)
        plt.figure(figsize=(20, 5))
        plt.subplot(131)
        plt.title('frame %s. reward: %s' % (frame_idx, np.mean(rewards[-100:])))
        plt.plot(moving_average(rewards, 20))
        plt.subplot(132)
        plt.title('loss, average over 100 steps')
        plt.plot(moving_average(losses, 100), linewidth=0.2)
        plt.show()

    # if __name__ == '__main__':
    # Train the DDQN agent on PongNoFrameskip-v4
    env = make_atari('PongNoFrameskip-v4')
    env = wrap_deepmind(env, scale=False, frame_stack=True)

    gamma = 0.99
    epsilon_max = 1
    epsilon_min = 0.01
    eps_decay = 30000
    frames = 1000000
    USE_CUDA = True
    learning_rate = 2e-4
    max_buff = 100000
    update_tar_interval = 1000     # sync the target network every 1000 steps
    batch_size = 32
    print_interval = 1000
    log_interval = 1000
    learning_start = 10000         # start learning after this many transitions have been collected
    win_reward = 18                # Pong-v4
    win_break = True

    action_space = env.action_space
    action_dim = env.action_space.n
    state_dim = env.observation_space.shape[0]
    state_channel = env.observation_space.shape[2]
    agent = DDQNAgent(in_channels=state_channel, action_space=action_space, USE_CUDA=USE_CUDA, lr=learning_rate)

    frame = env.reset()
    episode_reward = 0
    all_rewards = []
    losses = []
    episode_num = 0
    is_win = False
    # tensorboard
    summary_writer = SummaryWriter(log_dir="DDQN", comment="good_makeatari")

    # epsilon-greedy decay schedule
    epsilon_by_frame = lambda frame_idx: epsilon_min + (epsilon_max - epsilon_min) * math.exp(
        -1. * frame_idx / eps_decay)
    # plt.plot([epsilon_by_frame(i) for i in range(10000)])

    for i in range(frames):
        epsilon = epsilon_by_frame(i)
        # interact with the environment and store the transition
        state_tensor = agent.observe(frame)
        action = agent.act(state_tensor, epsilon)
        next_frame, reward, done, _ = env.step(action)
        episode_reward += reward
        agent.memory_buffer.push(frame, action, reward, next_frame, done)
        frame = next_frame

        loss = 0
        if agent.memory_buffer.size() >= learning_start:
            loss = agent.learn_from_experience(batch_size)
        losses.append(loss)

        if i % print_interval == 0:
            print("frames: %5d, reward: %5f, loss: %4f, epsilon: %5f, episode: %4d" % (i, np.mean(all_rewards[-10:]), loss, epsilon, episode_num))
            summary_writer.add_scalar("Temporal Difference Loss", loss, i)
            summary_writer.add_scalar("Mean Reward", np.mean(all_rewards[-10:]), i)
            summary_writer.add_scalar("Epsilon", epsilon, i)

        if i % update_tar_interval == 0:
            # copy the prediction-network weights into the target network and save a checkpoint
            agent.DQN_target.load_state_dict(agent.DQN.state_dict())
            torch.save(agent.DQN.state_dict(), "DDQN_dict.pth.tar")
            plot_training(i, all_rewards, losses)

        if done:
            # episode finished: record the return and reset the environment
            frame = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0
            episode_num += 1

    Creating content is not easy. If you found this helpful, please like, follow, and bookmark~~~

• Original article: https://blog.csdn.net/jiebaoshayebuhui/article/details/128049271