    1. import gym, random, pickle, os.path, math, glob
    2. import numpy as np
    3. import pandas as pd
    4. import matplotlib.pyplot as plt
    5. import torch
    6. import torch.optim as optim
    7. import torch.nn as nn
    8. import torch.nn.functional as F
    9. import torch.autograd as autograd
    10. import pdb
    11. from atari_wrappers import make_atari, wrap_deepmind,LazyFrames
    class DQN(nn.Module):
        """Convolutional Q-network mapping stacked frames to per-action values.

        Three ReLU-activated convolutions feed two linear layers. ``fc4`` takes
        7 * 7 * 64 input features, which is what the three convolutions produce
        for 84 x 84 inputs (84 -> 20 -> 9 -> 7 per spatial side).
        """

        def __init__(self, in_channels=4, num_actions=5):
            """Build the network layers.

            Arguments:
                in_channels: number of channels of the input, i.e. the number
                    of most recent frames stacked together as described in
                    the paper.
                num_actions: number of action-values to output, in one-to-one
                    correspondence with the actions in the game.
            """
            super().__init__()
            self.conv1 = nn.Conv2d(in_channels, 32, kernel_size=8, stride=4)
            self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
            self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)
            self.fc4 = nn.Linear(7 * 7 * 64, 512)
            self.fc5 = nn.Linear(512, num_actions)

        def forward(self, x):
            """Return one value per action for every sample in the batch ``x``.

            ``x`` is passed straight to ``conv1``, so it must be a 4-D batch
            with ``in_channels`` channels. The result has one row per sample
            and ``num_actions`` columns; no activation is applied to it.
            """
            x = F.relu(self.conv1(x))
            x = F.relu(self.conv2(x))
            x = F.relu(self.conv3(x))
            # Collapse (channels, height, width) into one feature axis per sample.
            features = x.view(x.size(0), -1)
            hidden = F.relu(self.fc4(features))
            return self.fc5(hidden)
    class Memory_Buffer(object):
        """Fixed-capacity replay buffer that overwrites its oldest entries.

        Transitions are stored as ``(state, action, reward, next_state, done)``
        tuples in ``self.buffer``. Once ``memory_size`` entries are held, each
        new push replaces the entry at ``self.next_idx``.
        """

        def __init__(self, memory_size=1000):
            """Create an empty buffer that holds at most ``memory_size`` items."""
            self.buffer = []
            self.memory_size = memory_size
            self.next_idx = 0  # slot overwritten by the next push once full

        def push(self, state, action, reward, next_state, done):
            """Store one transition, evicting the oldest one when full."""
            data = (state, action, reward, next_state, done)
            # Strict '<': with '<=' the list grew to memory_size + 1 entries and
            # the extra slot was never revisited by next_idx.
            if len(self.buffer) < self.memory_size:  # buffer not full
                self.buffer.append(data)
            else:  # buffer is full
                self.buffer[self.next_idx] = data
            self.next_idx = (self.next_idx + 1) % self.memory_size

        def sample(self, batch_size):
            """Draw ``batch_size`` transitions uniformly, with replacement.

            Returns:
                ``(states, actions, rewards, next_states, dones)`` where states
                and next_states are joined with ``np.concatenate`` and the
                other three are plain lists.

            Raises:
                ValueError: if the buffer is empty.
            """
            if not self.buffer:
                raise ValueError("cannot sample from an empty buffer")
            states, actions, rewards, next_states, dones = [], [], [], [], []
            for _ in range(batch_size):
                state, action, reward, next_state, done = random.choice(self.buffer)
                states.append(state)
                actions.append(action)
                rewards.append(reward)
                next_states.append(next_state)
                dones.append(done)
            return np.concatenate(states), actions, rewards, np.concatenate(next_states), dones

        def size(self):
            """Return the number of transitions currently stored."""
            return len(self.buffer)
    class DDQNAgent:
        """Double-DQN agent: online and target networks, replay buffer, TD update.

        The online network (``self.DQN``) picks the best next action and the
        target network (``self.DQN_target``) evaluates it; only the online
        network is optimised here. Copying weights into the target network is
        left to the caller.
        """

        def __init__(self, in_channels=1, action_space=[], USE_CUDA=False, memory_size=10000, epsilon=1, lr=1e-4):
            """Create the networks, replay buffer and optimiser.

            Arguments:
                in_channels: channel count passed to both DQN networks.
                action_space: object exposing ``n``, the number of actions.
                    It must be supplied; the default placeholder has no ``n``.
                USE_CUDA: place networks and tensors on the CUDA device.
                memory_size: capacity of the replay buffer.
                epsilon: default exploration rate used by ``act``.
                lr: learning rate of the RMSprop optimiser.
            """
            self.epsilon = epsilon
            self.action_space = action_space
            self.memory_buffer = Memory_Buffer(memory_size)
            self.DQN = DQN(in_channels=in_channels, num_actions=action_space.n)
            self.DQN_target = DQN(in_channels=in_channels, num_actions=action_space.n)
            self.DQN_target.load_state_dict(self.DQN.state_dict())

            self.USE_CUDA = USE_CUDA
            # Keep the device on the instance so no module-level `device`
            # global is needed.
            self.device = torch.device("cuda" if USE_CUDA else "cpu")
            self.DQN = self.DQN.to(self.device)
            self.DQN_target = self.DQN_target.to(self.device)
            self.optimizer = optim.RMSprop(self.DQN.parameters(), lr=lr, eps=0.001, alpha=0.95)

        def observe(self, lazyframe):
            """Convert a lazy frame into a float tensor on the agent's device.

            The forced array has its axes reordered with ``transpose(2, 0, 1)``,
            gains a leading axis of size 1 and is divided by 255.
            NOTE(review): assumes frames are height x width x channels with
            values in 0..255 - confirm against the environment wrappers.
            """
            frame = lazyframe._force().transpose(2, 0, 1)[None] / 255
            return torch.from_numpy(frame).float().to(self.device)

        def value(self, state):
            """Return the online network's Q-values for ``state``."""
            q_values = self.DQN(state)
            return q_values

        def act(self, state, epsilon=None):
            """Sample an action with an epsilon-greedy policy.

            With probability ``epsilon`` a uniformly random action is returned,
            otherwise the action with the highest Q-value for the first sample
            in ``state``. ``epsilon`` defaults to ``self.epsilon``.
            """
            if epsilon is None:
                epsilon = self.epsilon
            if random.random() < epsilon:
                return random.randrange(self.action_space.n)
            # The forward pass is only needed on the greedy branch, and never
            # needs gradients.
            with torch.no_grad():
                q_values = self.value(state).cpu().numpy()
            return int(q_values.argmax(1)[0])

        def compute_td_loss(self, states, actions, rewards, next_states, is_done, gamma=0.99):
            """Compute the Double-DQN TD loss using torch operations only.

            Arguments:
                states, next_states: batched tensors accepted by the networks.
                actions, rewards, is_done: one entry per sample in the batch.
                gamma: discount factor applied to the next-state value.

            Returns:
                Scalar smooth-L1 loss between Q(s, a) and the TD target.
            """
            actions = torch.tensor(actions, dtype=torch.long, device=self.device)
            rewards = torch.tensor(rewards, dtype=torch.float, device=self.device)
            # torch.where needs a boolean condition; a uint8 mask is rejected
            # by current PyTorch releases.
            is_done = torch.tensor(is_done, dtype=torch.bool, device=self.device)

            # Q-values of the actions that were actually taken.
            predicted_qvalues = self.DQN(states)
            predicted_qvalues_for_actions = predicted_qvalues.gather(
                1, actions.unsqueeze(1)).squeeze(1)

            # The target is a constant with respect to the optimised parameters.
            with torch.no_grad():
                # Where DDQN differs from DQN: the online network selects the
                # next action and the target network evaluates it.
                best_next_actions = self.DQN(next_states).argmax(dim=1, keepdim=True)
                next_state_values = self.DQN_target(next_states).gather(
                    1, best_next_actions).squeeze(1)
                target_qvalues_for_actions = rewards + gamma * next_state_values
                # At a terminal state s' does not exist, so Q(s, a) = r(s, a).
                target_qvalues_for_actions = torch.where(
                    is_done, rewards, target_qvalues_for_actions)

            return F.smooth_l1_loss(predicted_qvalues_for_actions, target_qvalues_for_actions)

        def sample_from_buffer(self, batch_size):
            """Draw ``batch_size`` stored transitions uniformly, with replacement.

            Stored frames are converted with ``observe`` and concatenated into
            batched tensors; actions, rewards and dones are returned as lists.
            """
            stored = self.memory_buffer.buffer
            states, actions, rewards, next_states, dones = [], [], [], [], []
            for _ in range(batch_size):
                frame, action, reward, next_frame, done = random.choice(stored)
                states.append(self.observe(frame))
                actions.append(action)
                rewards.append(reward)
                next_states.append(self.observe(next_frame))
                dones.append(done)
            return torch.cat(states), actions, rewards, torch.cat(next_states), dones

        def learn_from_experience(self, batch_size):
            """Run one optimisation step on a sampled batch.

            Returns the TD loss as a Python float, or 0 when the buffer does
            not yet hold more than ``batch_size`` transitions.
            """
            if self.memory_buffer.size() <= batch_size:
                return 0
            states, actions, rewards, next_states, dones = self.sample_from_buffer(batch_size)
            td_loss = self.compute_td_loss(states, actions, rewards, next_states, dones)
            self.optimizer.zero_grad()
            td_loss.backward()
            # Clamp every gradient element to [-1, 1] before the update.
            nn.utils.clip_grad_value_(self.DQN.parameters(), 1)
            self.optimizer.step()
            return td_loss.item()
    def moving_average(a, n=3):
        """Return the simple moving average of ``a`` over windows of ``n`` items.

        Only complete windows are reported, so the result holds
        ``len(a) - n + 1`` values (empty when ``a`` is shorter than ``n``).
        """
        running = np.cumsum(a, dtype=float)
        window_sums = running.copy()
        # A window sum is the running total minus the total n positions earlier;
        # the first n entries already equal their own prefix sums.
        window_sums[n:] = running[n:] - running[:-n]
        return window_sums[n - 1:] / n
    def plot_training(frame_idx, rewards, losses):
        """Redraw the training curves: smoothed rewards and smoothed loss.

        The left panel plots a 20-point moving average of ``rewards`` and is
        titled with ``frame_idx`` and the mean of the last 100 rewards. The
        middle panel plots a 100-point moving average of ``losses``.
        """
        # NOTE(review): clear_output has no import in the reviewed part of the
        # file - presumably IPython.display.clear_output; confirm.
        clear_output(True)
        plt.figure(figsize=(20, 5))

        # Rewards panel.
        recent_mean_reward = np.mean(rewards[-100:])
        plt.subplot(131)
        plt.title('frame %s. reward: %s' % (frame_idx, recent_mean_reward))
        smoothed_rewards = moving_average(rewards, 20)
        plt.plot(smoothed_rewards)

        # Loss panel.
        plt.subplot(132)
        plt.title('loss, average on 100 stpes')
        smoothed_losses = moving_average(losses, 100)
        plt.plot(smoothed_losses, linewidth=0.2)

        plt.show()
    # Train a Double-DQN agent on PongNoFrameskip-v4.
    #
    # NOTE(review): parts of this script were unrecoverable in the reviewed
    # revision: a truncated `if i...` statement was fused with the tail of a
    # "...QN_dict.pth.tar" path, and the environment-interaction lines were
    # commented out, so the replay buffer could never fill and no learning step
    # could run. The loop below re-enables those lines and rebuilds the missing
    # tail from names this script already defines (update_tar_interval,
    # all_rewards, episode_num). Confirm against the original before relying
    # on it.

    # SummaryWriter was used without a visible import. torch.utils.tensorboard
    # provides a SummaryWriter that accepts log_dir= and comment=.
    from torch.utils.tensorboard import SummaryWriter

    env = make_atari('PongNoFrameskip-v4')
    env = wrap_deepmind(env, scale = False, frame_stack=True)

    gamma = 0.99
    epsilon_max = 1
    epsilon_min = 0.01
    eps_decay = 30000
    frames = 1000000
    # Use the GPU only when one is present, so the script also runs on CPU.
    USE_CUDA = torch.cuda.is_available()
    # Module-level device for any code that reads a global `device`.
    device = torch.device("cuda" if USE_CUDA else "cpu")
    learning_rate = 2e-4
    # NOTE(review): max_buff is never handed to the agent, which therefore uses
    # its own default buffer size - confirm whether memory_size=max_buff was
    # intended.
    max_buff = 100000
    update_tar_interval = 1000
    batch_size = 32
    print_interval = 1000
    log_interval = 1000
    learning_start = 10000
    win_reward = 18 # Pong-v4
    win_break = True

    action_space = env.action_space
    action_dim = env.action_space.n
    state_dim = env.observation_space.shape[0]
    state_channel = env.observation_space.shape[2]
    agent = DDQNAgent(in_channels = state_channel, action_space= action_space, USE_CUDA = USE_CUDA, lr = learning_rate)

    frame = env.reset()

    episode_reward = 0
    all_rewards = []
    losses = []
    episode_num = 0
    is_win = False
    # tensorboard
    summary_writer = SummaryWriter(log_dir = "DDQN", comment= "good_makeatari")


    # e-greedy decay
    def epsilon_by_frame(frame_idx):
        """Exponentially decay epsilon from epsilon_max towards epsilon_min."""
        return epsilon_min + (epsilon_max - epsilon_min) * math.exp(
            -1. * frame_idx / eps_decay)


    for i in range(frames):
        epsilon = epsilon_by_frame(i)

        # Act in the environment and remember the transition.
        state_tensor = agent.observe(frame)
        action = agent.act(state_tensor, epsilon)
        next_frame, reward, done, _ = env.step(action)
        episode_reward += reward
        agent.memory_buffer.push(frame, action, reward, next_frame, done)
        frame = next_frame

        loss = 0
        if agent.memory_buffer.size() >= learning_start:
            loss = agent.learn_from_experience(batch_size)
            losses.append(loss)

        if i % print_interval == 0:
            # np.mean of an empty list warns and yields nan; report nan
            # directly until the first episode has finished.
            mean_reward = np.mean(all_rewards[-10:]) if all_rewards else float("nan")
            print("frames: %5d, reward: %5f, loss: %4f, epsilon: %5f, episode: %4d" % (i, mean_reward, loss, epsilon, episode_num))
            summary_writer.add_scalar("Temporal Difference Loss", loss, i)
            summary_writer.add_scalar("Mean Reward", mean_reward, i)
            summary_writer.add_scalar("Epsilon", epsilon, i)

        # Reconstructed: refresh the target network every update_tar_interval
        # frames.
        if i % update_tar_interval == 0:
            agent.DQN_target.load_state_dict(agent.DQN.state_dict())

        # Reconstructed: close out the episode and start a new one.
        if done:
            frame = env.reset()
            all_rewards.append(episode_reward)
            episode_reward = 0
            episode_num += 1

    summary_writer.close()
    # NOTE(review): only the "...QN_dict.pth.tar" tail of the save path
    # survived; any directory prefix the original used is unknown.
    torch.save(agent.DQN.state_dict(), "DDQN_dict.pth.tar")
    plot_training(i, all_rewards, losses)

