      在动作离散的强化学习任务中,通常可以遍历所有的动作来计算动作值函数 $q(s,a)$,从而得到最优动作值函数 $q_*(s,a)$。但在大规模连续动作空间中,遍历所有动作是不现实的,且计算代价过大。针对连续动作空间问题,2016年 T. P. Lillicrap 等人提出了深度确定性策略梯度(Deep Deterministic Policy Gradient,DDPG)算法。该算法基于深度神经网络表达确定性策略 $\mu(s)$,采用确定性策略梯度来更新网络参数,能够有效应用于大规模或连续动作空间的强化学习任务中。

       (1) 行为策略 $\beta$:一种探索性策略,通过引入随机噪声影响动作的选择;
       (2) 状态分布 $\rho^{\beta}$:Agent根据行为策略 $\beta$ 产生的状态分布;
       (3) 策略网络:或行动者网络,DDPG使用深度网络对确定性策略函数 $\mu(s,\theta)$ 进行逼近,$\theta$ 为网络参数,输入为当前的状态 $s$,输出为确定性的动作值 $a$。有时 $\theta$ 也表示为 $\theta^{\mu}$;
       (4) 价值网络:或评论家网络,DDPG使用深度网络对近似动作值函数 $Q(s,a,w)$ 进行逼近,$w$ 为网络参数。有时 $w$ 也表示为 $\theta^{Q}$。
       (1) 采用深度神经网络:构建策略网络和价值网络,分别用来学习近似性策略函数 $\mu(s,\theta)$ 和近似动作值函数 $Q(s,a,w)$,并使用Adam训练网络模型;
       (2) 引入经验回放机制:Agent与环境进行交互时产生的经验转移样本具有时序相关性,通过引入经验回放机制,减少值函数估计所产生的偏差,解决数据间相关性及非静态分布问题,使算法更加容易收敛;
       (3) 使用双网络架构:策略函数和价值函数均使用双网络架构,即分别设置预测网络和目标网络,使算法的学习过程更加稳定,收敛更快。





    实验环境:OpenAI Gym工具包中的MuJoCo环境,用了其中四个连续控制任务,包括Ant,HalfCheetah,Walker2d,Hopper。

    每次训练均运行1000000步,并每取5000步作为一个训练阶段;每个训练阶段结束后,对所学策略进行测试评估:与环境交互十个情节并取平均返回值。

    结果可视化如下:横轴为训练时间步数,纵轴为训练不同阶段评估所得到的平均回报。




    # 深度强化学习——原理、算法与PyTorch实战,代码名称:代40-DDPG算法的实验过程.py
    import copy
    import os

    import gym
    import numpy as np
    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    # Device shared by the networks below: the GPU when CUDA is available,
    # otherwise the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    class ReplayBuffer(object):
        """Fixed-capacity ring buffer of transitions used for experience replay.

        Transitions live in pre-allocated NumPy arrays. Once ``max_size``
        transitions have been written, new ones overwrite the oldest slots.
        """

        def __init__(self, state_dim, action_dim, max_size=int(1e6)):
            """Allocate the storage arrays.

            Args:
                state_dim: Length of one state vector.
                action_dim: Length of one action vector.
                max_size: Maximum number of transitions kept at once.
            """
            self.max_size = max_size
            self.ptr = 0  # Slot that the next call to add() writes to
            self.size = 0  # Number of valid transitions currently stored

            self.state = np.zeros((max_size, state_dim))
            self.action = np.zeros((max_size, action_dim))
            self.next_state = np.zeros((max_size, state_dim))
            self.reward = np.zeros((max_size, 1))
            self.not_done = np.zeros((max_size, 1))

            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        def add(self, state, action, next_state, reward, done):
            """Store one transition, overwriting the oldest one when full.

            ``done`` is stored as ``1. - done`` so it can later be used
            directly as a multiplicative mask.
            """
            slot = self.ptr
            self.state[slot] = state
            self.action[slot] = action
            self.next_state[slot] = next_state
            self.reward[slot] = reward
            self.not_done[slot] = 1. - done

            # Advance the write cursor, wrapping around at capacity.
            self.ptr = (slot + 1) % self.max_size
            self.size = min(self.size + 1, self.max_size)

        def sample(self, batch_size):
            """Draw ``batch_size`` stored transitions uniformly, with replacement.

            Returns:
                A tuple ``(state, action, next_state, reward, not_done)`` of
                float32 tensors on ``self.device``, each with ``batch_size``
                rows.
            """
            ind = np.random.randint(0, self.size, size=batch_size)
            columns = (
                self.state,
                self.action,
                self.next_state,
                self.reward,
                self.not_done,
            )
            return tuple(
                torch.FloatTensor(column[ind]).to(self.device) for column in columns
            )
    class Actor(nn.Module):
        """Policy network: maps a state to a deterministic, bounded action."""

        def __init__(self, state_dim, action_dim, max_action):
            """Build a three-layer MLP (state_dim -> 400 -> 300 -> action_dim).

            Args:
                state_dim: Length of one state vector.
                action_dim: Length of one action vector.
                max_action: Scale applied to the tanh output, so every action
                    component lies in ``[-max_action, max_action]``.
            """
            super().__init__()

            self.l1 = nn.Linear(state_dim, 400)
            self.l2 = nn.Linear(400, 300)
            self.l3 = nn.Linear(300, action_dim)

            self.max_action = max_action

        def forward(self, state):
            """Return the action for ``state`` (one row per state)."""
            hidden = F.relu(self.l1(state))
            hidden = F.relu(self.l2(hidden))
            # tanh squashes to [-1, 1]; scaling gives the action range.
            return self.max_action * torch.tanh(self.l3(hidden))
    class Critic(nn.Module):
        """Value network: maps a (state, action) pair to a scalar Q estimate."""

        def __init__(self, state_dim, action_dim):
            """Build the network.

            The state goes through the first layer alone; the action is
            concatenated to that layer's output before the second layer.

            Args:
                state_dim: Length of one state vector.
                action_dim: Length of one action vector.
            """
            super().__init__()

            self.l1 = nn.Linear(state_dim, 400)
            self.l2 = nn.Linear(400 + action_dim, 300)
            self.l3 = nn.Linear(300, 1)

        def forward(self, state, action):
            """Return Q(state, action) with one output column per row."""
            state_features = F.relu(self.l1(state))
            # Join along the feature dimension (dim 1).
            joint = torch.cat([state_features, action], 1)
            hidden = F.relu(self.l2(joint))
            return self.l3(hidden)
    # Demo: build one actor and one critic with fixed sizes (17 state
    # dimensions, 6 action dimensions) and print their layers.
    actor1 = Actor(17, 6, 1.0)
    for ch in actor1.children():
        print(ch)
    print("*********************")
    critic1 = Critic(17, 6)
    for ch in critic1.children():
        print(ch)
    class DDPG(object):
        """DDPG agent: an actor, a critic, and a slowly-updated target copy of each."""

        def __init__(self, state_dim, action_dim, max_action, discount=0.99, tau=0.001):
            """Create the networks, their target copies and the optimizers.

            Args:
                state_dim: Length of one state vector.
                action_dim: Length of one action vector.
                max_action: Bound on each action component (passed to Actor).
                discount: Discount factor used in the critic target.
                tau: Interpolation weight of the soft target update.
            """
            self.actor = Actor(state_dim, action_dim, max_action).to(device)
            self.actor_target = copy.deepcopy(self.actor)
            self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-4)

            self.critic = Critic(state_dim, action_dim).to(device)
            self.critic_target = copy.deepcopy(self.critic)
            self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), weight_decay=1e-2)

            self.discount = discount
            self.tau = tau

        def select_action(self, state):
            """Return the actor's action for one state as a flat NumPy array."""
            state = torch.FloatTensor(state.reshape(1, -1)).to(device)
            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                return self.actor(state).cpu().numpy().flatten()

        def train(self, replay_buffer, batch_size=64):
            """Run one update of the critic, the actor and both target networks.

            Args:
                replay_buffer: Object whose ``sample(batch_size)`` returns
                    ``(state, action, next_state, reward, not_done)`` tensors.
                batch_size: Number of transitions sampled for this update.
            """
            # Sample replay buffer
            state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)

            # Compute the target Q value. It is a constant with respect to the
            # loss, so it is built without tracking gradients.
            with torch.no_grad():
                next_action = self.actor_target(next_state)
                target_Q = self.critic_target(next_state, next_action)
                target_Q = reward + not_done * self.discount * target_Q

            # Get current Q estimate
            current_Q = self.critic(state, action)

            # Compute critic loss
            critic_loss = F.mse_loss(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Compute actor loss: maximise the critic's value of the actor's action
            actor_loss = -self.critic(state, self.actor(state)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            self._soft_update(self.critic, self.critic_target)
            self._soft_update(self.actor, self.actor_target)

        def _soft_update(self, net, target_net):
            """Move each target parameter a fraction ``tau`` towards ``net``'s."""
            for param, target_param in zip(net.parameters(), target_net.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

        def save(self, filename):
            """Write network and optimizer state dicts to ``filename + suffix`` files."""
            torch.save(self.critic.state_dict(), filename + "_critic")
            torch.save(self.critic_optimizer.state_dict(), filename + "_critic_optimizer")

            torch.save(self.actor.state_dict(), filename + "_actor")
            torch.save(self.actor_optimizer.state_dict(), filename + "_actor_optimizer")

        def load(self, filename):
            """Restore the state written by ``save`` and rebuild the target networks.

            ``map_location=device`` lets a checkpoint saved on a GPU be loaded
            on a CPU-only machine (and the other way round).
            """
            self.critic.load_state_dict(torch.load(filename + "_critic", map_location=device))
            self.critic_optimizer.load_state_dict(
                torch.load(filename + "_critic_optimizer", map_location=device)
            )
            self.critic_target = copy.deepcopy(self.critic)

            self.actor.load_state_dict(torch.load(filename + "_actor", map_location=device))
            self.actor_optimizer.load_state_dict(
                torch.load(filename + "_actor_optimizer", map_location=device)
            )
            self.actor_target = copy.deepcopy(self.actor)
    def eval_policy(policy, env_name, seed, eval_episodes=10):
        """Run ``policy`` for ``eval_episodes`` episodes and return the average reward.

        A separate environment is created for evaluation and seeded with
        ``seed + 100``. It is closed before returning, including when an
        episode raises.

        Args:
            policy: Object with a ``select_action(state)`` method.
            env_name: Environment id passed to ``gym.make``.
            seed: Base seed; the evaluation environment uses ``seed + 100``.
            eval_episodes: Number of episodes to average over.

        Returns:
            The summed reward of all episodes divided by ``eval_episodes``.
        """
        # NOTE(review): seed(), the single-value reset() and the 4-tuple step()
        # look like the older Gym API - confirm against the installed version.
        eval_env = gym.make(env_name)
        try:
            eval_env.seed(seed + 100)

            avg_reward = 0.
            for _ in range(eval_episodes):
                state, done = eval_env.reset(), False
                while not done:
                    action = policy.select_action(np.array(state))
                    state, reward, done, _ = eval_env.step(action)
                    avg_reward += reward
        finally:
            # Release the environment's resources on every exit path.
            eval_env.close()

        avg_reward /= eval_episodes

        print("---------------------------------------")
        print(f"Evaluation over {eval_episodes} episodes: {avg_reward:.3f}")
        print("---------------------------------------")
        return avg_reward
    # ---------------------------------------------------------------------
    # Experiment configuration
    # ---------------------------------------------------------------------
    policy = "DDPG"  # Algorithm label; rebound to the DDPG agent further down
    env_name = "Walker2d-v4"  # OpenAI gym environment name
    seed = 0  # Sets Gym, PyTorch and Numpy seeds
    start_timesteps = 25e3  # Time steps initial random policy is used
    eval_freq = 5e3  # How often (time steps) we evaluate
    max_timesteps = 1e6  # Max time steps to run environment
    expl_noise = 0.1  # Std of Gaussian exploration noise
    batch_size = 256  # Batch size for both actor and critic
    discount = 0.99  # Discount factor
    tau = 0.005  # Target network update rate
    # The next three settings are not referenced anywhere in this script;
    # they are kept so the set of module-level names stays unchanged.
    policy_noise = 0.2  # Noise added to target policy during critic update
    noise_clip = 0.5  # Range to clip target policy noise
    policy_freq = 2  # Frequency of delayed policy updates
    save_model = True  # Save model and optimizer parameters
    load_model = ""  # Model load file name, "" doesn't load, "default" uses file_name

    file_name = f"{policy}_{env_name}_{seed}"
    print("---------------------------------------")
    print(f"Policy: {policy}, Env: {env_name}, Seed: {seed}")
    print("---------------------------------------")

    # Output directories; exist_ok makes re-runs safe.
    os.makedirs("./results", exist_ok=True)
    if save_model:
        os.makedirs("./models", exist_ok=True)

    # NOTE(review): env.seed(), the single-value reset() and the 4-tuple step()
    # look like the older Gym API - confirm against the installed version.
    env = gym.make(env_name)

    # Set seeds
    env.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    kwargs = {
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": discount,
        "tau": tau,
    }
    policy = DDPG(**kwargs)

    if load_model != "":
        policy_file = file_name if load_model == "default" else load_model
        policy.load(f"./models/{policy_file}")

    replay_buffer = ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = [eval_policy(policy, env_name, seed)]

    state, done = env.reset(), False
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(max_timesteps)):
        episode_timesteps += 1

        # Select action randomly or according to policy
        if t < start_timesteps:
            action = env.action_space.sample()
        else:
            action = (
                policy.select_action(np.array(state))
                + np.random.normal(0, max_action * expl_noise, size=action_dim)
            ).clip(-max_action, max_action)

        # Perform action
        next_state, reward, done, _ = env.step(action)
        # An episode that stops only because it reached the step limit is
        # stored as not-done.
        done_bool = float(done) if episode_timesteps < env._max_episode_steps else 0

        # Store data in replay buffer
        replay_buffer.add(state, action, next_state, reward, done_bool)

        state = next_state
        episode_reward += reward

        # Train agent once the initial random-action phase is over
        if t >= start_timesteps:
            policy.train(replay_buffer, batch_size)

        if done:
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will increment +1 even if done=True
            print(
                f"Total T: {t + 1} Episode Num: {episode_num + 1} Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            # Reset environment
            state, done = env.reset(), False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % eval_freq == 0:
            evaluations.append(eval_policy(policy, env_name, seed))
            np.save(f"./results/{file_name}", evaluations)
            if save_model:
                policy.save(f"./models/{file_name}")

