[Reinforcement Learning] Advantage Actor-Critic (A2C) for the CartPole Problem + Hands-on PyTorch Code



    1. Introduction to the CartPole (Inverted Pendulum) Problem

    The agent must choose between two actions, pushing the cart to the left or to the right, so that the pole attached to the cart stays upright.
    (Figure: the CartPole environment)
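    To get a feel for the environment, here is a minimal sketch (not from the original code) that runs one episode of CartPole-v0 with random actions, assuming the classic gym API used throughout this post (env.step returning four values):

    import gym

    env = gym.make('CartPole-v0')
    state = env.reset()
    done, total_reward = False, 0.0
    while not done:
        action = env.action_space.sample()          # 0 = push left, 1 = push right
        state, reward, done, _ = env.step(action)   # +1 reward for every step the pole stays up
        total_reward += reward
    env.close()
    print(f"Random policy episode reward: {total_reward:.0f}")  # usually only around 20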

    2. The Advantage Actor-Critic Algorithm in Brief

    The workflow of the advantage actor-critic algorithm is shown in the figure below. We start with a policy π: an initial actor interacts with the environment and collects data first. In a plain policy-gradient method, once the data has been collected we would update the policy directly. In actor-critic algorithms, however, we do not use that data to update the policy directly; we first use it to estimate the value function, which can be done with temporal-difference (TD) or Monte Carlo (MC) methods. Then, based on the value function, we update π with the following formula:
    $$\nabla \bar{R}_\theta \approx \frac{1}{N} \sum_{n=1}^{N} \sum_{t=1}^{T_n}\left(r_t^n + V_\pi\left(s_{t+1}^n\right) - V_\pi\left(s_t^n\right)\right) \nabla \log p_\theta\left(a_t^n \mid s_t^n\right)$$
    Once we have the new π, we interact with the environment again, collect new data, and estimate the value function once more; the new value function is then used to update the policy, i.e. the actor. That is how the whole advantage actor-critic algorithm operates.
    (Figure: flow of the advantage actor-critic algorithm)
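    To connect the formula above with code, here is a small illustrative sketch (my own, not from the post) of a single-transition advantage actor-critic update. The full code below works with whole episodes and bootstrapped returns; this sketch uses the one-step form, with a discount factor gamma in the bootstrap term as in the code:

    import torch
    from torch import nn

    torch.manual_seed(0)
    actor = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 2), nn.Softmax(dim=-1))
    critic = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 1))
    optimizer = torch.optim.Adam(list(actor.parameters()) + list(critic.parameters()), lr=1e-3)

    gamma = 0.99
    s = torch.randn(4)        # toy state s_t
    s_next = torch.randn(4)   # toy state s_{t+1}
    r = 1.0                   # reward r_t

    probs = actor(s)                                        # pi_theta(a | s_t)
    a = torch.multinomial(probs, 1).item()                  # sample an action
    # advantage estimate: r_t + gamma * V(s_{t+1}) - V(s_t)
    advantage = r + gamma * critic(s_next).detach() - critic(s)
    actor_loss = -torch.log(probs[a]) * advantage.detach()  # policy-gradient term from the formula
    critic_loss = advantage.pow(2)                          # pull V(s_t) toward the TD target
    optimizer.zero_grad()
    (actor_loss + critic_loss).sum().backward()
    optimizer.step()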

    3. Further Reading

    For a more detailed introduction to the advantage actor-critic algorithm, see my earlier post: [EasyRL Study Notes] Chapter 9: Actor-Critic.

    Before studying the advantage actor-critic algorithm, it helps to be familiar with the following topics (a small sketch of the TD and MC value targets follows this list):

    • Deep Q-network (DQN)
    • Temporal-difference (TD) methods
    • Monte Carlo (MC) methods
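    As mentioned in Section 2, the critic's value function can be estimated with either temporal-difference or Monte Carlo targets. A toy sketch of the two targets for a single state, using assumed numbers (not from the original post):

    gamma = 0.99
    rewards = [1.0, 1.0, 1.0]       # rewards observed from state s_t until the episode ends
    v_next = 0.8                    # current estimate of V(s_{t+1})

    # Monte Carlo target: the full discounted return until the episode ends
    mc_target = sum(gamma ** k * r for k, r in enumerate(rewards))

    # TD(0) target: one real reward plus the bootstrapped value of the next state
    td_target = rewards[0] + gamma * v_next

    print(mc_target, td_target)     # 2.9701 and 1.792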

    4. Hands-on Python Code

    4.1 Pre-run setup

    Prepare an RL_Utils.py file; its contents can be obtained from one of my other posts: [RL utilities] Common reinforcement-learning utility functions (Python code).

    This step is important: the code below imports this RL_Utils.py file.

    (Figure: location of the RL_Utils.py file in the project)
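    The two scripts below import it as from Python.ReinforcementLearning.EasyRL.RL_Utils import *, which reflects where the file sits in my project; adjust the import to match your own layout. For example, under a hypothetical layout where RL_Utils.py is elsewhere:

    # if RL_Utils.py sits next to the script:
    from RL_Utils import *

    # or, if it lives in another folder, add that folder to sys.path first:
    import sys
    sys.path.append("/path/to/folder/containing/RL_Utils")   # placeholder path, replace with yours
    from RL_Utils import *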

    4.2 Main code

    4.2.1 Version without shared network parameters

    import argparse
    import datetime
    import time
    from collections import deque
    import torch.nn.functional as F
    import gym
    from torch import nn
    
    # Change this to the path of your own RL_Utils.py file
    from Python.ReinforcementLearning.EasyRL.RL_Utils import *
    
    # Rollout buffer that stores one episode of transitions (cleared after each update)
    class PGReplay:
        def __init__(self):
            self.buffer = deque()
    
        def push(self, transitions):
            self.buffer.append(transitions)
    
        def sample(self):
            batch = list(self.buffer)
            return zip(*batch)
    
        def clear(self):
            self.buffer.clear()
    
        def __len__(self):
            return len(self.buffer)
    
    # Actor: for discrete actions, output a softmax probability distribution over actions; for continuous actions, output the action directly (sigmoid)
    class ActorSoftmax(nn.Module):
        def __init__(self, input_dim, output_dim, hidden_dim=256):
            super(ActorSoftmax, self).__init__()
            self.fc1 = nn.Linear(input_dim, hidden_dim)
            self.fc2 = nn.Linear(hidden_dim, output_dim)
    
        def forward(self, state):
            dist = F.relu(self.fc1(state))
            dist = F.softmax(self.fc2(dist), dim=1)
            return dist
    
    # Critic: outputs V_{\pi}(s)
    class Critic(nn.Module):
        def __init__(self, input_dim, output_dim, hidden_dim=256):
            super(Critic, self).__init__()
            assert output_dim == 1
            self.fc1 = nn.Linear(input_dim, hidden_dim)
            self.fc2 = nn.Linear(hidden_dim, output_dim)
    
        def forward(self, state):
            value = F.relu(self.fc1(state))
            value = self.fc2(value)
            return value
    
    
    # A2C agent
    class A2C:
        def __init__(self, models, memory, arg_dict):
            self.n_actions = arg_dict['n_actions']
            self.gamma = arg_dict['gamma']
            self.device = torch.device(arg_dict['device'])
            self.memory = memory
            self.actor = models['Actor'].to(self.device)
            self.critic = models['Critic'].to(self.device)
            self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=arg_dict['actor_lr'])
            self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=arg_dict['critic_lr'])
    
        def sample_action(self, state):
            # unsqueeze(): add a batch dimension
            state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
            # Pass the current state to the actor, which returns a probability distribution over actions
            dist = self.actor(state)
            # Pass the current state to the critic, which returns the state value V(s)
            value = self.critic(state)  # note that 'dist' needs requires_grad=True
            value = value.detach().numpy().squeeze(0)[0]
            # squeeze(): remove the batch dimension
            # Sample an action according to the probability distribution
            action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0))  # p has shape (n_actions,)
            return action, value, dist

        # We sample from the distribution here (rather than always taking the most probable action)
        # to keep some randomness in the policy, so that it cannot be easily exploited
        def predict_action(self, state):
            state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
            dist = self.actor(state)
            value = self.critic(state)  # note that 'dist' needs requires_grad=True
            value = value.detach().numpy().squeeze(0)[0]
            action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0))  # p has shape (n_actions,)
            return action, value, dist
    
        def update(self, next_state, entropy):
            value_pool, log_prob_pool, reward_pool = self.memory.sample()
            next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
            next_value = self.critic(next_state)
            returns = np.zeros_like(reward_pool)
            for t in reversed(range(len(reward_pool))):
                next_value = reward_pool[t] + self.gamma * next_value  # bootstrapped return: G_t = r_t + gamma * G_{t+1}, starting from V(s_{T+1})
                returns[t] = next_value
            returns = torch.tensor(returns, device=self.device)
            value_pool = torch.tensor(value_pool, device=self.device)
            advantages = returns - value_pool
            log_prob_pool = torch.stack(log_prob_pool)
            actor_loss = (-log_prob_pool * advantages).mean()
            critic_loss = 0.5 * advantages.pow(2).mean()
            tot_loss = actor_loss + critic_loss + 0.001 * entropy
            self.actor_optim.zero_grad()
            self.critic_optim.zero_grad()
            tot_loss.backward()
            self.actor_optim.step()
            self.critic_optim.step()
            self.memory.clear()
    
        def save_model(self, path):
            Path(path).mkdir(parents=True, exist_ok=True)
            torch.save(self.actor.state_dict(), f"{path}/actor_checkpoint.pt")
            torch.save(self.critic.state_dict(), f"{path}/critic_checkpoint.pt")
    
        def load_model(self, path):
            self.actor.load_state_dict(torch.load(f"{path}/actor_checkpoint.pt"))
            self.critic.load_state_dict(torch.load(f"{path}/critic_checkpoint.pt"))
    
    
    # Training function
    def train(arg_dict, env, agent):
        # Start the timer
        startTime = time.time()
        print(f"Env: {arg_dict['env_name']}, Algorithm: {arg_dict['algo_name']}, Device: {arg_dict['device']}")
        print("Start training the agent......")
        rewards = []
        steps = []
        for i_ep in range(arg_dict['train_eps']):
            ep_reward = 0
            ep_step = 0
            ep_entropy = 0
            state = env.reset()
            # Sample at most ep_max_steps transitions, then update the model
            for _ in range(arg_dict['ep_max_steps']):
                # Render
                if arg_dict['train_render']:
                    env.render()
                # Sample an action (exploration)
                action, value, dist = agent.sample_action(state)
                # Take the action and get the next state and reward (experience)
                next_state, reward, done, _ = env.step(action)
                log_prob = torch.log(dist.squeeze(0)[action])
                entropy = -np.sum(np.mean(dist.detach().numpy()) * np.log(dist.detach().numpy()))  # policy entropy (accumulated as a plain float, so the 0.001*entropy term in update() only shifts the loss value)
                # Store the experience
                agent.memory.push((value, log_prob, reward))
                # Update the state
                state = next_state
                ep_reward += reward
                ep_entropy += entropy
                ep_step += 1
                if done:
                    break
            # Update the agent's parameters
            agent.update(next_state, ep_entropy)
            rewards.append(ep_reward)
            steps.append(ep_step)
            if (i_ep + 1) % 10 == 0:
                print(f'Episode: {i_ep + 1}/{arg_dict["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}')
        print('Training finished, time elapsed: ' + str(time.time() - startTime) + " s")
        # Close the environment
        env.close()
        return {'episodes': range(len(rewards)), 'rewards': rewards}
    
    
    # Test function
    def test(arg_dict, env, agent):
        startTime = time.time()
        print("开始测试智能体......")
        print(f"环境名: {arg_dict['env_name']}, 算法名: {arg_dict['algo_name']}, Device: {arg_dict['device']}")
        rewards = []
        steps = []
        for i_ep in range(arg_dict['test_eps']):
            ep_reward = 0
            ep_step = 0
            state = env.reset()
            for _ in range(arg_dict['ep_max_steps']):
                # Render
                if arg_dict['test_render']:
                    env.render()
                # Predict an action
                action, _, _ = agent.predict_action(state)
                next_state, reward, done, _ = env.step(action)
                state = next_state
                ep_reward += reward
                ep_step += 1
                if done:
                    break
            rewards.append(ep_reward)
            steps.append(ep_step)
            print(f"Episode: {i_ep + 1}/{arg_dict['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
        print("测试结束 , 用时: " + str(time.time() - startTime) + " s")
        env.close()
        return {'episodes': range(len(rewards)), 'rewards': rewards}
    
    
    # Create the environment and the agent
    def create_env_agent(arg_dict):
        # Create the environment
        env = gym.make(arg_dict['env_name'])
        # Set the random seed
        all_seed(env, seed=arg_dict["seed"])
        # Get the dimension of the state space
        try:
            n_states = env.observation_space.n
        except AttributeError:
            n_states = env.observation_space.shape[0]
        # Get the number of actions
        n_actions = env.action_space.n
        print(f"Number of states: {n_states}, number of actions: {n_actions}")
        # Add the state and action dimensions to the parameter dict
        arg_dict.update({"n_states": n_states, "n_actions": n_actions})
        # Instantiate the agent
        models = {
            'Actor': ActorSoftmax(arg_dict['n_states'], arg_dict['n_actions'], hidden_dim=arg_dict['actor_hidden_dim']),
            'Critic': Critic(arg_dict['n_states'], 1, hidden_dim=arg_dict['critic_hidden_dim'])}
        # Rollout buffer
        memory = PGReplay()
        agent = A2C(models, memory, arg_dict)
        # Return the environment and the agent
        return env, agent
    
    
    if __name__ == '__main__':
        # Prevent the error: OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized.
        os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
        # Get the current path
        curr_path = os.path.dirname(os.path.abspath(__file__))
        # Get the current time
        curr_time = datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
        # Hyper-parameter settings
        parser = argparse.ArgumentParser(description="hyper parameters")
        parser.add_argument('--algo_name', default='A2C', type=str, help="name of algorithm")
        parser.add_argument('--env_name', default='CartPole-v0', type=str, help="name of environment")
        parser.add_argument('--train_eps', default=1600, type=int, help="episodes of training")
        parser.add_argument('--test_eps', default=20, type=int, help="episodes of testing")
        parser.add_argument('--ep_max_steps', default=100000, type=int,
                            help="steps per episode, much larger value can simulate infinite steps")
        parser.add_argument('--gamma', default=0.99, type=float, help="discounted factor")
        parser.add_argument('--actor_lr', default=3e-4, type=float, help="learning rate of actor")
        parser.add_argument('--critic_lr', default=1e-3, type=float, help="learning rate of critic")
        parser.add_argument('--actor_hidden_dim', default=256, type=int, help="hidden of actor net")
        parser.add_argument('--critic_hidden_dim', default=256, type=int, help="hidden of critic net")
        parser.add_argument('--device', default='cpu', type=str, help="cpu or cuda")
        parser.add_argument('--seed', default=520, type=int, help="seed")
        parser.add_argument('--show_fig', default=False, type=bool, help="if show figure or not")
        parser.add_argument('--save_fig', default=True, type=bool, help="if save figure or not")
        parser.add_argument('--train_render', default=False, type=bool,
                            help="Whether to render the environment during training")
        parser.add_argument('--test_render', default=True, type=bool,
                            help="Whether to render the environment during testing")
        args = parser.parse_args()
        default_args = {'result_path': f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
                        'model_path': f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
                        }
        # Convert the parameters into a dict
        arg_dict = {**vars(args), **default_args}
        print("Algorithm parameter dict:", arg_dict)
    
        # Create the environment and the agent
        env, agent = create_env_agent(arg_dict)
        # Pass in the parameters, the environment and the agent, then start training
        res_dic = train(arg_dict, env, agent)
        print("Result dict returned by the algorithm:", res_dic)
        # Save the results
        agent.save_model(path=arg_dict['model_path'])
        save_args(arg_dict, path=arg_dict['result_path'])
        save_results(res_dic, tag='train', path=arg_dict['result_path'])
        plot_rewards(res_dic['rewards'], arg_dict, path=arg_dict['result_path'], tag="train")
    
        # =================================================================================================
        # Create a fresh environment and agent for testing
        print("=" * 300)
        env, agent = create_env_agent(arg_dict)
        # Load the saved agent
        agent.load_model(path=arg_dict['model_path'])
        res_dic = test(arg_dict, env, agent)
        save_results(res_dic, tag='test', path=arg_dict['result_path'])
        plot_rewards(res_dic['rewards'], arg_dict, path=arg_dict['result_path'], tag="test")
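    To make the return computation inside update() concrete, here is a small standalone sketch (not part of the original script) that applies the same backward recursion to a toy reward list, bootstrapping from an assumed critic estimate of the final next state:

    import numpy as np

    gamma = 0.99
    rewards = [1.0, 1.0, 1.0]                # toy episode rewards r_0, r_1, r_2
    next_value = 0.5                         # assumed V(s_3) from the critic

    returns = np.zeros(len(rewards))
    g = next_value
    for t in reversed(range(len(rewards))):
        g = rewards[t] + gamma * g           # G_t = r_t + gamma * G_{t+1}
        returns[t] = g
    values = np.array([3.0, 2.0, 1.0])       # assumed critic estimates V(s_t)
    advantages = returns - values            # what the actor loss is weighted by
    print(returns)                           # approx. [3.455 2.480 1.495]
    print(advantages)                        # approx. [0.455 0.480 0.495]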
    

    Results

    Since some of the output is quite long, only part of it is shown below.

    Number of states: 4, number of actions: 2
    Env: CartPole-v0, Algorithm: A2C, Device: cpu
    Start training the agent......
    Episode: 10/1600, Reward: 25.00, Steps:25
    Episode: 20/1600, Reward: 12.00, Steps:12
    Episode: 30/1600, Reward: 20.00, Steps:20
    Episode: 40/1600, Reward: 14.00, Steps:14
    Episode: 50/1600, Reward: 24.00, Steps:24
    Episode: 60/1600, Reward: 37.00, Steps:37
    Episode: 70/1600, Reward: 40.00, Steps:40
    Episode: 80/1600, Reward: 13.00, Steps:13
    Episode: 90/1600, Reward: 23.00, Steps:23
    Episode: 100/1600, Reward: 14.00, Steps:14
    Episode: 110/1600, Reward: 25.00, Steps:25
    Episode: 120/1600, Reward: 25.00, Steps:25
    Episode: 130/1600, Reward: 22.00, Steps:22
    Episode: 140/1600, Reward: 20.00, Steps:20
    Episode: 150/1600, Reward: 94.00, Steps:94
    Episode: 160/1600, Reward: 19.00, Steps:19
    Episode: 170/1600, Reward: 25.00, Steps:25
    Episode: 180/1600, Reward: 11.00, Steps:11
    Episode: 190/1600, Reward: 36.00, Steps:36
    Episode: 200/1600, Reward: 33.00, Steps:33
    Episode: 210/1600, Reward: 20.00, Steps:20
    Episode: 220/1600, Reward: 17.00, Steps:17
    Episode: 230/1600, Reward: 12.00, Steps:12
    Episode: 240/1600, Reward: 15.00, Steps:15
    Episode: 250/1600, Reward: 31.00, Steps:31
    Episode: 260/1600, Reward: 12.00, Steps:12
    Episode: 270/1600, Reward: 27.00, Steps:27
    Episode: 280/1600, Reward: 40.00, Steps:40
    Episode: 290/1600, Reward: 20.00, Steps:20
    Episode: 300/1600, Reward: 60.00, Steps:60
    Episode: 310/1600, Reward: 38.00, Steps:38
    Episode: 320/1600, Reward: 10.00, Steps:10
    Episode: 330/1600, Reward: 23.00, Steps:23
    Episode: 340/1600, Reward: 34.00, Steps:34
    Episode: 350/1600, Reward: 55.00, Steps:55
    Episode: 360/1600, Reward: 24.00, Steps:24
    Episode: 370/1600, Reward: 45.00, Steps:45
    Episode: 380/1600, Reward: 24.00, Steps:24
    Episode: 390/1600, Reward: 32.00, Steps:32
    Episode: 400/1600, Reward: 92.00, Steps:92
    Episode: 410/1600, Reward: 53.00, Steps:53
    Episode: 420/1600, Reward: 40.00, Steps:40
    Episode: 430/1600, Reward: 77.00, Steps:77
    Episode: 440/1600, Reward: 44.00, Steps:44
    Episode: 450/1600, Reward: 32.00, Steps:32
    Episode: 460/1600, Reward: 51.00, Steps:51
    Episode: 470/1600, Reward: 91.00, Steps:91
    Episode: 480/1600, Reward: 51.00, Steps:51
    Episode: 490/1600, Reward: 66.00, Steps:66
    Episode: 500/1600, Reward: 27.00, Steps:27
    Episode: 510/1600, Reward: 66.00, Steps:66
    Episode: 520/1600, Reward: 37.00, Steps:37
    Episode: 530/1600, Reward: 29.00, Steps:29
    Episode: 540/1600, Reward: 38.00, Steps:38
    Episode: 550/1600, Reward: 82.00, Steps:82
    Episode: 560/1600, Reward: 33.00, Steps:33
    Episode: 570/1600, Reward: 79.00, Steps:79
    Episode: 580/1600, Reward: 78.00, Steps:78
    Episode: 590/1600, Reward: 26.00, Steps:26
    Episode: 600/1600, Reward: 80.00, Steps:80
    Episode: 610/1600, Reward: 85.00, Steps:85
    Episode: 620/1600, Reward: 92.00, Steps:92
    Episode: 630/1600, Reward: 35.00, Steps:35
    Episode: 640/1600, Reward: 88.00, Steps:88
    Episode: 650/1600, Reward: 157.00, Steps:157
    Episode: 660/1600, Reward: 35.00, Steps:35
    Episode: 670/1600, Reward: 60.00, Steps:60
    Episode: 680/1600, Reward: 42.00, Steps:42
    Episode: 690/1600, Reward: 55.00, Steps:55
    Episode: 700/1600, Reward: 51.00, Steps:51
    Episode: 710/1600, Reward: 65.00, Steps:65
    Episode: 720/1600, Reward: 61.00, Steps:61
    Episode: 730/1600, Reward: 125.00, Steps:125
    Episode: 740/1600, Reward: 162.00, Steps:162
    Episode: 750/1600, Reward: 19.00, Steps:19
    Episode: 760/1600, Reward: 120.00, Steps:120
    Episode: 770/1600, Reward: 34.00, Steps:34
    Episode: 780/1600, Reward: 115.00, Steps:115
    Episode: 790/1600, Reward: 66.00, Steps:66
    Episode: 800/1600, Reward: 114.00, Steps:114
    Episode: 810/1600, Reward: 130.00, Steps:130
    Episode: 820/1600, Reward: 71.00, Steps:71
    Episode: 830/1600, Reward: 52.00, Steps:52
    Episode: 840/1600, Reward: 128.00, Steps:128
    Episode: 850/1600, Reward: 24.00, Steps:24
    Episode: 860/1600, Reward: 101.00, Steps:101
    Episode: 870/1600, Reward: 39.00, Steps:39
    Episode: 880/1600, Reward: 33.00, Steps:33
    Episode: 890/1600, Reward: 111.00, Steps:111
    Episode: 900/1600, Reward: 159.00, Steps:159
    Episode: 910/1600, Reward: 131.00, Steps:131
    Episode: 920/1600, Reward: 73.00, Steps:73
    Episode: 930/1600, Reward: 54.00, Steps:54
    Episode: 940/1600, Reward: 178.00, Steps:178
    Episode: 950/1600, Reward: 200.00, Steps:200
    Episode: 960/1600, Reward: 82.00, Steps:82
    Episode: 970/1600, Reward: 63.00, Steps:63
    Episode: 980/1600, Reward: 113.00, Steps:113
    Episode: 990/1600, Reward: 68.00, Steps:68
    Episode: 1000/1600, Reward: 151.00, Steps:151
    Episode: 1010/1600, Reward: 160.00, Steps:160
    Episode: 1020/1600, Reward: 135.00, Steps:135
    Episode: 1030/1600, Reward: 135.00, Steps:135
    Episode: 1040/1600, Reward: 200.00, Steps:200
    Episode: 1050/1600, Reward: 200.00, Steps:200
    Episode: 1060/1600, Reward: 141.00, Steps:141
    Episode: 1070/1600, Reward: 101.00, Steps:101
    Episode: 1080/1600, Reward: 200.00, Steps:200
    Episode: 1090/1600, Reward: 191.00, Steps:191
    Episode: 1100/1600, Reward: 200.00, Steps:200
    Episode: 1110/1600, Reward: 89.00, Steps:89
    Episode: 1120/1600, Reward: 198.00, Steps:198
    Episode: 1130/1600, Reward: 162.00, Steps:162
    Episode: 1140/1600, Reward: 175.00, Steps:175
    Episode: 1150/1600, Reward: 149.00, Steps:149
    Episode: 1160/1600, Reward: 110.00, Steps:110
    Episode: 1170/1600, Reward: 200.00, Steps:200
    Episode: 1180/1600, Reward: 129.00, Steps:129
    Episode: 1190/1600, Reward: 161.00, Steps:161
    Episode: 1200/1600, Reward: 137.00, Steps:137
    Episode: 1210/1600, Reward: 200.00, Steps:200
    Episode: 1220/1600, Reward: 200.00, Steps:200
    Episode: 1230/1600, Reward: 200.00, Steps:200
    Episode: 1240/1600, Reward: 190.00, Steps:190
    Episode: 1250/1600, Reward: 166.00, Steps:166
    Episode: 1260/1600, Reward: 163.00, Steps:163
    Episode: 1270/1600, Reward: 127.00, Steps:127
    Episode: 1280/1600, Reward: 137.00, Steps:137
    Episode: 1290/1600, Reward: 60.00, Steps:60
    Episode: 1300/1600, Reward: 156.00, Steps:156
    Episode: 1310/1600, Reward: 97.00, Steps:97
    Episode: 1320/1600, Reward: 115.00, Steps:115
    Episode: 1330/1600, Reward: 200.00, Steps:200
    Episode: 1340/1600, Reward: 200.00, Steps:200
    Episode: 1350/1600, Reward: 200.00, Steps:200
    Episode: 1360/1600, Reward: 200.00, Steps:200
    Episode: 1370/1600, Reward: 200.00, Steps:200
    Episode: 1380/1600, Reward: 200.00, Steps:200
    Episode: 1390/1600, Reward: 200.00, Steps:200
    Episode: 1400/1600, Reward: 154.00, Steps:154
    Episode: 1410/1600, Reward: 174.00, Steps:174
    Episode: 1420/1600, Reward: 114.00, Steps:114
    Episode: 1430/1600, Reward: 157.00, Steps:157
    Episode: 1440/1600, Reward: 191.00, Steps:191
    Episode: 1450/1600, Reward: 65.00, Steps:65
    Episode: 1460/1600, Reward: 200.00, Steps:200
    Episode: 1470/1600, Reward: 200.00, Steps:200
    Episode: 1480/1600, Reward: 155.00, Steps:155
    Episode: 1490/1600, Reward: 107.00, Steps:107
    Episode: 1500/1600, Reward: 27.00, Steps:27
    Episode: 1510/1600, Reward: 200.00, Steps:200
    Episode: 1520/1600, Reward: 200.00, Steps:200
    Episode: 1530/1600, Reward: 132.00, Steps:132
    Episode: 1540/1600, Reward: 142.00, Steps:142
    Episode: 1550/1600, Reward: 99.00, Steps:99
    Episode: 1560/1600, Reward: 171.00, Steps:171
    Episode: 1570/1600, Reward: 172.00, Steps:172
    Episode: 1580/1600, Reward: 147.00, Steps:147
    Episode: 1590/1600, Reward: 182.00, Steps:182
    Episode: 1600/1600, Reward: 200.00, Steps:200
    Training finished, time elapsed: 81.30708861351013 s
    ============================================================================================================================================================================================================================================================================================================
    Number of states: 4, number of actions: 2
    Start testing the agent......
    Env: CartPole-v0, Algorithm: A2C, Device: cpu
    Episode: 1/20, Steps:161, Reward: 161.00
    Episode: 2/20, Steps:150, Reward: 150.00
    Episode: 3/20, Steps:93, Reward: 93.00
    Episode: 4/20, Steps:169, Reward: 169.00
    Episode: 5/20, Steps:200, Reward: 200.00
    Episode: 6/20, Steps:168, Reward: 168.00
    Episode: 7/20, Steps:25, Reward: 25.00
    Episode: 8/20, Steps:171, Reward: 171.00
    Episode: 9/20, Steps:200, Reward: 200.00
    Episode: 10/20, Steps:200, Reward: 200.00
    Episode: 11/20, Steps:188, Reward: 188.00
    Episode: 12/20, Steps:200, Reward: 200.00
    Episode: 13/20, Steps:87, Reward: 87.00
    Episode: 14/20, Steps:200, Reward: 200.00
    Episode: 15/20, Steps:200, Reward: 200.00
    Episode: 16/20, Steps:200, Reward: 200.00
    Episode: 17/20, Steps:200, Reward: 200.00
    Episode: 18/20, Steps:200, Reward: 200.00
    Episode: 19/20, Steps:198, Reward: 198.00
    Episode: 20/20, Steps:200, Reward: 200.00
    Testing finished, time elapsed: 28.915676593780518 s
    

    (Figures: training and test reward curves)

    4.2.2 Version with shared network parameters

    import argparse
    import datetime
    import time
    from collections import deque
    import torch.nn.functional as F
    import gym
    from torch import nn
    
    # Change this to the path of your own RL_Utils.py file
    from Python.ReinforcementLearning.EasyRL.RL_Utils import *
    
    
    # Rollout buffer that stores one episode of transitions (cleared after each update)
    class PGReplay:
        def __init__(self):
            self.buffer = deque()
    
        def push(self, transitions):
            self.buffer.append(transitions)
    
        def sample(self):
            batch = list(self.buffer)
            return zip(*batch)
    
        def clear(self):
            self.buffer.clear()
    
        def __len__(self):
            return len(self.buffer)
    
    
    # Actor: for discrete actions, output a softmax probability distribution over actions; for continuous actions, output the action directly (sigmoid)
    # Critic: outputs V_{\pi}(s)
    # Actor-critic network with shared parameters
    class ActorCriticSoftMax(nn.Module):
        def __init__(self, input_dim, output_dim, actor_hidden_dim=256, critic_hidden_dim=256):
            super(ActorCriticSoftMax, self).__init__()
    
            self.critic_fc1 = nn.Linear(input_dim, critic_hidden_dim)
            self.critic_fc2 = nn.Linear(critic_hidden_dim, 1)
    
            self.actor_fc1 = nn.Linear(input_dim, actor_hidden_dim)
            self.actor_fc2 = nn.Linear(actor_hidden_dim, output_dim)
    
        def forward(self, state):
            value = F.relu(self.critic_fc1(state))
            value = self.critic_fc2(value)
    
            policy_dist = F.relu(self.actor_fc1(state))
            policy_dist = F.softmax(self.actor_fc2(policy_dist), dim=1)
    
            return value, policy_dist
    
    
    # A2C agent
    class A2C:
        def __init__(self, models, memory, cfg):
            self.n_actions = cfg['n_actions']
            self.gamma = cfg['gamma']
            self.device = torch.device(cfg['device'])
            self.memory = memory
            self.ac_net = models['ActorCritic'].to(self.device)
            self.ac_optimizer = torch.optim.Adam(self.ac_net.parameters(), lr=cfg['lr'])
    
        def sample_action(self, state):
            state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
            value, dist = self.ac_net(state)  # note that 'dist' needs requires_grad=True
            value = value.detach().numpy().squeeze(0)[0]
            action = np.random.choice(self.n_actions, p=dist.detach().numpy().squeeze(0))  # p has shape (n_actions,)
            return action, value, dist
    
        def predict_action(self, state):
            with torch.no_grad():
                state = torch.tensor(state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
                value, dist = self.ac_net(state)
                value = value.numpy().squeeze(0)[0]  # shape(value) = (1,)
                action = np.random.choice(self.n_actions, p=dist.numpy().squeeze(0))  # p has shape (n_actions,)
            return action, value, dist
    
        def update(self, next_state, entropy):
            value_pool, log_prob_pool, reward_pool = self.memory.sample()
            next_state = torch.tensor(next_state, device=self.device, dtype=torch.float32).unsqueeze(dim=0)
            next_value, _ = self.ac_net(next_state)
            returns = np.zeros_like(reward_pool)
            for t in reversed(range(len(reward_pool))):
                next_value = reward_pool[t] + self.gamma * next_value  # bootstrapped return: G_t = r_t + gamma * G_{t+1}, starting from V(s_{T+1})
                returns[t] = next_value
            returns = torch.tensor(returns, device=self.device)
            value_pool = torch.tensor(value_pool, device=self.device)
            advantages = returns - value_pool
            log_prob_pool = torch.stack(log_prob_pool)
            actor_loss = (-log_prob_pool * advantages).mean()
            critic_loss = 0.5 * advantages.pow(2).mean()
            ac_loss = actor_loss + critic_loss + 0.001 * entropy
            self.ac_optimizer.zero_grad()
            ac_loss.backward()
            self.ac_optimizer.step()
            self.memory.clear()
    
        def save_model(self, path):
            Path(path).mkdir(parents=True, exist_ok=True)
            torch.save(self.ac_net.state_dict(), f"{path}/a2c_checkpoint.pt")
    
        def load_model(self, path):
            self.ac_net.load_state_dict(torch.load(f"{path}/a2c_checkpoint.pt"))
    
    
    # Training function
    def train(arg_dict, env, agent):
        # Start the timer
        startTime = time.time()
        print(f"Env: {arg_dict['env_name']}, Algorithm: {arg_dict['algo_name']}, Device: {arg_dict['device']}")
        print("Start training the agent......")
        rewards = []
        steps = []
        for i_ep in range(arg_dict['train_eps']):
            ep_reward = 0
            ep_step = 0
            ep_entropy = 0
            state = env.reset()
            # Sample at most ep_max_steps transitions, then update the model
            for _ in range(arg_dict['ep_max_steps']):
                # Render
                if arg_dict['train_render']:
                    env.render()
                # Sample an action (exploration)
                action, value, dist = agent.sample_action(state)
                # Take the action and get the next state and reward (experience)
                next_state, reward, done, _ = env.step(action)
                log_prob = torch.log(dist.squeeze(0)[action])
                entropy = -np.sum(np.mean(dist.detach().numpy()) * np.log(dist.detach().numpy()))  # policy entropy (accumulated as a plain float, so the 0.001*entropy term in update() only shifts the loss value)
                # Store the experience
                agent.memory.push((value, log_prob, reward))
                # Update the state
                state = next_state
                ep_reward += reward
                ep_entropy += entropy
                ep_step += 1
                if done:
                    break
            # Update the agent's parameters
            agent.update(next_state, ep_entropy)
            rewards.append(ep_reward)
            steps.append(ep_step)
            if (i_ep + 1) % 10 == 0:
                print(f'Episode: {i_ep + 1}/{arg_dict["train_eps"]}, Reward: {ep_reward:.2f}, Steps:{ep_step}')
        print('Training finished, time elapsed: ' + str(time.time() - startTime) + " s")
        # Close the environment
        env.close()
        return {'episodes': range(len(rewards)), 'rewards': rewards}
    
    
    # Test function
    def test(arg_dict, env, agent):
        startTime = time.time()
        print("开始测试智能体......")
        print(f"环境名: {arg_dict['env_name']}, 算法名: {arg_dict['algo_name']}, Device: {arg_dict['device']}")
        rewards = []
        steps = []
        for i_ep in range(arg_dict['test_eps']):
            ep_reward = 0
            ep_step = 0
            state = env.reset()
            for _ in range(arg_dict['ep_max_steps']):
                # Render
                if arg_dict['test_render']:
                    env.render()
                # Predict an action
                action, _, _ = agent.predict_action(state)
                next_state, reward, done, _ = env.step(action)
                state = next_state
                ep_reward += reward
                ep_step += 1
                if done:
                    break
            rewards.append(ep_reward)
            steps.append(ep_step)
            print(f"Episode: {i_ep + 1}/{arg_dict['test_eps']}, Steps:{ep_step}, Reward: {ep_reward:.2f}")
        print("测试结束 , 用时: " + str(time.time() - startTime) + " s")
        env.close()
        return {'episodes': range(len(rewards)), 'rewards': rewards}
    
    
    # Create the environment and the agent
    def create_env_agent(arg_dict):
        # Create the environment
        env = gym.make(arg_dict['env_name'])
        # Set the random seed
        all_seed(env, seed=arg_dict["seed"])
        # Get the dimension of the state space
        try:
            n_states = env.observation_space.n
        except AttributeError:
            n_states = env.observation_space.shape[0]
        # Get the number of actions
        n_actions = env.action_space.n
        print(f"Number of states: {n_states}, number of actions: {n_actions}")
        # Add the state and action dimensions to the parameter dict
        arg_dict.update({"n_states": n_states, "n_actions": n_actions})
        # Instantiate the agent
        models = {
            'ActorCritic': ActorCriticSoftMax(arg_dict['n_states'], arg_dict['n_actions'],
                                              actor_hidden_dim=arg_dict['actor_hidden_dim'],
                                              critic_hidden_dim=arg_dict['critic_hidden_dim'])}
        # Rollout buffer
        memory = PGReplay()
        agent = A2C(models, memory, arg_dict)
        # Return the environment and the agent
        return env, agent
    
    
    if __name__ == '__main__':
        # Prevent the error: OMP: Error #15: Initializing libiomp5md.dll, but found libiomp5md.dll already initialized.
        os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
        # Get the current path
        curr_path = os.path.dirname(os.path.abspath(__file__))
        # Get the current time
        curr_time = datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S")
        # Hyper-parameter settings
        parser = argparse.ArgumentParser(description="hyper parameters")
        parser.add_argument('--algo_name', default='A2C', type=str, help="name of algorithm")
        parser.add_argument('--env_name', default='CartPole-v0', type=str, help="name of environment")
        parser.add_argument('--train_eps', default=2000, type=int, help="episodes of training")
        parser.add_argument('--test_eps', default=20, type=int, help="episodes of testing")
        parser.add_argument('--ep_max_steps', default=100000, type=int,
                            help="steps per episode, much larger value can simulate infinite steps")
        parser.add_argument('--gamma', default=0.99, type=float, help="discounted factor")
        parser.add_argument('--lr', default=3e-4, type=float, help="learning rate")
        parser.add_argument('--actor_hidden_dim', default=256, type=int)
        parser.add_argument('--critic_hidden_dim', default=256, type=int)
        parser.add_argument('--device', default='cpu', type=str, help="cpu or cuda")
        parser.add_argument('--seed', default=520, type=int, help="seed")
        parser.add_argument('--show_fig', default=False, type=bool, help="if show figure or not")
        parser.add_argument('--save_fig', default=True, type=bool, help="if save figure or not")
        parser.add_argument('--train_render', default=False, type=bool,
                            help="Whether to render the environment during training")
        parser.add_argument('--test_render', default=True, type=bool,
                            help="Whether to render the environment during testing")
        args = parser.parse_args()
        default_args = {'result_path': f"{curr_path}/outputs/{args.env_name}/{curr_time}/results/",
                        'model_path': f"{curr_path}/outputs/{args.env_name}/{curr_time}/models/",
                        }
        # Convert the parameters into a dict
        arg_dict = {**vars(args), **default_args}
        print("Algorithm parameter dict:", arg_dict)
    
        # Create the environment and the agent
        env, agent = create_env_agent(arg_dict)
        # Pass in the parameters, the environment and the agent, then start training
        res_dic = train(arg_dict, env, agent)
        print("Result dict returned by the algorithm:", res_dic)
        # Save the results
        agent.save_model(path=arg_dict['model_path'])
        save_args(arg_dict, path=arg_dict['result_path'])
        save_results(res_dic, tag='train', path=arg_dict['result_path'])
        plot_rewards(res_dic['rewards'], arg_dict, path=arg_dict['result_path'], tag="train")
    
        # =================================================================================================
        # Create a fresh environment and agent for testing
        print("=" * 300)
        env, agent = create_env_agent(arg_dict)
        # Load the saved agent
        agent.load_model(path=arg_dict['model_path'])
        res_dic = test(arg_dict, env, agent)
        save_results(res_dic, tag='test', path=arg_dict['result_path'])
        plot_rewards(res_dic['rewards'], arg_dict, path=arg_dict['result_path'], tag="test")
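    Compared with 4.2.1, this version wraps the actor and critic in a single nn.Module, trained with one optimizer and one combined loss. Note that the two heads above still use separate hidden layers; only the module and the optimizer are shared. If you wanted the hidden features themselves to be shared, a hypothetical variant (my sketch, not from the original post) could look like this:

    import torch
    from torch import nn
    import torch.nn.functional as F

    # Hypothetical variant: a trunk whose parameters are actually shared by both heads
    class SharedTrunkActorCritic(nn.Module):
        def __init__(self, input_dim, output_dim, hidden_dim=256):
            super().__init__()
            self.trunk = nn.Linear(input_dim, hidden_dim)      # shared feature layer
            self.actor_head = nn.Linear(hidden_dim, output_dim)
            self.critic_head = nn.Linear(hidden_dim, 1)

        def forward(self, state):
            x = F.relu(self.trunk(state))
            value = self.critic_head(x)                          # V(s)
            policy_dist = F.softmax(self.actor_head(x), dim=1)   # pi(a|s)
            return value, policy_dist

    # quick check
    net = SharedTrunkActorCritic(4, 2)
    v, pi = net(torch.randn(1, 4))
    print(v.shape, pi.shape)   # torch.Size([1, 1]) torch.Size([1, 2])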
    

    Results

    Since some of the output is quite long, only part of it is shown below.

    Number of states: 4, number of actions: 2
    Env: CartPole-v0, Algorithm: A2C, Device: cpu
    Start training the agent......
    Episode: 10/2000, Reward: 12.00, Steps:12
    Episode: 20/2000, Reward: 21.00, Steps:21
    Episode: 30/2000, Reward: 13.00, Steps:13
    Episode: 40/2000, Reward: 14.00, Steps:14
    Episode: 50/2000, Reward: 19.00, Steps:19
    Episode: 60/2000, Reward: 22.00, Steps:22
    Episode: 70/2000, Reward: 50.00, Steps:50
    Episode: 80/2000, Reward: 19.00, Steps:19
    Episode: 90/2000, Reward: 18.00, Steps:18
    Episode: 100/2000, Reward: 28.00, Steps:28
    Episode: 110/2000, Reward: 20.00, Steps:20
    Episode: 120/2000, Reward: 28.00, Steps:28
    Episode: 130/2000, Reward: 76.00, Steps:76
    Episode: 140/2000, Reward: 22.00, Steps:22
    Episode: 150/2000, Reward: 70.00, Steps:70
    Episode: 160/2000, Reward: 20.00, Steps:20
    Episode: 170/2000, Reward: 85.00, Steps:85
    Episode: 180/2000, Reward: 17.00, Steps:17
    Episode: 190/2000, Reward: 49.00, Steps:49
    Episode: 200/2000, Reward: 21.00, Steps:21
    Episode: 210/2000, Reward: 65.00, Steps:65
    Episode: 220/2000, Reward: 54.00, Steps:54
    Episode: 230/2000, Reward: 85.00, Steps:85
    Episode: 240/2000, Reward: 48.00, Steps:48
    Episode: 250/2000, Reward: 22.00, Steps:22
    Episode: 260/2000, Reward: 34.00, Steps:34
    Episode: 270/2000, Reward: 22.00, Steps:22
    Episode: 280/2000, Reward: 29.00, Steps:29
    Episode: 290/2000, Reward: 77.00, Steps:77
    Episode: 300/2000, Reward: 30.00, Steps:30
    Episode: 310/2000, Reward: 115.00, Steps:115
    Episode: 320/2000, Reward: 62.00, Steps:62
    Episode: 330/2000, Reward: 45.00, Steps:45
    Episode: 340/2000, Reward: 102.00, Steps:102
    Episode: 350/2000, Reward: 93.00, Steps:93
    Episode: 360/2000, Reward: 27.00, Steps:27
    Episode: 370/2000, Reward: 31.00, Steps:31
    Episode: 380/2000, Reward: 27.00, Steps:27
    Episode: 390/2000, Reward: 30.00, Steps:30
    Episode: 400/2000, Reward: 30.00, Steps:30
    Episode: 410/2000, Reward: 61.00, Steps:61
    Episode: 420/2000, Reward: 61.00, Steps:61
    Episode: 430/2000, Reward: 56.00, Steps:56
    Episode: 440/2000, Reward: 120.00, Steps:120
    Episode: 450/2000, Reward: 87.00, Steps:87
    Episode: 460/2000, Reward: 66.00, Steps:66
    Episode: 470/2000, Reward: 30.00, Steps:30
    Episode: 480/2000, Reward: 65.00, Steps:65
    Episode: 490/2000, Reward: 72.00, Steps:72
    Episode: 500/2000, Reward: 64.00, Steps:64
    Episode: 510/2000, Reward: 93.00, Steps:93
    Episode: 520/2000, Reward: 159.00, Steps:159
    Episode: 530/2000, Reward: 21.00, Steps:21
    Episode: 540/2000, Reward: 31.00, Steps:31
    Episode: 550/2000, Reward: 126.00, Steps:126
    Episode: 560/2000, Reward: 176.00, Steps:176
    Episode: 570/2000, Reward: 116.00, Steps:116
    Episode: 580/2000, Reward: 131.00, Steps:131
    Episode: 590/2000, Reward: 156.00, Steps:156
    Episode: 600/2000, Reward: 158.00, Steps:158
    Episode: 610/2000, Reward: 125.00, Steps:125
    Episode: 620/2000, Reward: 39.00, Steps:39
    Episode: 630/2000, Reward: 52.00, Steps:52
    Episode: 640/2000, Reward: 67.00, Steps:67
    Episode: 650/2000, Reward: 110.00, Steps:110
    Episode: 660/2000, Reward: 95.00, Steps:95
    Episode: 670/2000, Reward: 33.00, Steps:33
    Episode: 680/2000, Reward: 188.00, Steps:188
    Episode: 690/2000, Reward: 29.00, Steps:29
    Episode: 700/2000, Reward: 58.00, Steps:58
    Episode: 710/2000, Reward: 60.00, Steps:60
    Episode: 720/2000, Reward: 131.00, Steps:131
    Episode: 730/2000, Reward: 132.00, Steps:132
    Episode: 740/2000, Reward: 169.00, Steps:169
    Episode: 750/2000, Reward: 189.00, Steps:189
    Episode: 760/2000, Reward: 109.00, Steps:109
    Episode: 770/2000, Reward: 70.00, Steps:70
    Episode: 780/2000, Reward: 200.00, Steps:200
    Episode: 790/2000, Reward: 157.00, Steps:157
    Episode: 800/2000, Reward: 178.00, Steps:178
    Episode: 810/2000, Reward: 181.00, Steps:181
    Episode: 820/2000, Reward: 112.00, Steps:112
    Episode: 830/2000, Reward: 28.00, Steps:28
    Episode: 840/2000, Reward: 184.00, Steps:184
    Episode: 850/2000, Reward: 80.00, Steps:80
    Episode: 860/2000, Reward: 25.00, Steps:25
    Episode: 870/2000, Reward: 148.00, Steps:148
    Episode: 880/2000, Reward: 111.00, Steps:111
    Episode: 890/2000, Reward: 121.00, Steps:121
    Episode: 900/2000, Reward: 130.00, Steps:130
    Episode: 910/2000, Reward: 190.00, Steps:190
    Episode: 920/2000, Reward: 124.00, Steps:124
    Episode: 930/2000, Reward: 140.00, Steps:140
    Episode: 940/2000, Reward: 200.00, Steps:200
    Episode: 950/2000, Reward: 86.00, Steps:86
    Episode: 960/2000, Reward: 82.00, Steps:82
    Episode: 970/2000, Reward: 186.00, Steps:186
    Episode: 980/2000, Reward: 66.00, Steps:66
    Episode: 990/2000, Reward: 200.00, Steps:200
    Episode: 1000/2000, Reward: 193.00, Steps:193
    Episode: 1010/2000, Reward: 200.00, Steps:200
    Episode: 1020/2000, Reward: 157.00, Steps:157
    Episode: 1030/2000, Reward: 150.00, Steps:150
    Episode: 1040/2000, Reward: 200.00, Steps:200
    Episode: 1050/2000, Reward: 200.00, Steps:200
    Episode: 1060/2000, Reward: 115.00, Steps:115
    Episode: 1070/2000, Reward: 108.00, Steps:108
    Episode: 1080/2000, Reward: 189.00, Steps:189
    Episode: 1090/2000, Reward: 126.00, Steps:126
    Episode: 1100/2000, Reward: 200.00, Steps:200
    Episode: 1110/2000, Reward: 200.00, Steps:200
    Episode: 1120/2000, Reward: 200.00, Steps:200
    Episode: 1130/2000, Reward: 200.00, Steps:200
    Episode: 1140/2000, Reward: 200.00, Steps:200
    Episode: 1150/2000, Reward: 200.00, Steps:200
    Episode: 1160/2000, Reward: 131.00, Steps:131
    Episode: 1170/2000, Reward: 191.00, Steps:191
    Episode: 1180/2000, Reward: 200.00, Steps:200
    Episode: 1190/2000, Reward: 200.00, Steps:200
    Episode: 1200/2000, Reward: 171.00, Steps:171
    Episode: 1210/2000, Reward: 200.00, Steps:200
    Episode: 1220/2000, Reward: 180.00, Steps:180
    Episode: 1230/2000, Reward: 127.00, Steps:127
    Episode: 1240/2000, Reward: 94.00, Steps:94
    Episode: 1250/2000, Reward: 113.00, Steps:113
    Episode: 1260/2000, Reward: 150.00, Steps:150
    Episode: 1270/2000, Reward: 200.00, Steps:200
    Episode: 1280/2000, Reward: 148.00, Steps:148
    Episode: 1290/2000, Reward: 111.00, Steps:111
    Episode: 1300/2000, Reward: 200.00, Steps:200
    Episode: 1310/2000, Reward: 77.00, Steps:77
    Episode: 1320/2000, Reward: 158.00, Steps:158
    Episode: 1330/2000, Reward: 200.00, Steps:200
    Episode: 1340/2000, Reward: 180.00, Steps:180
    Episode: 1350/2000, Reward: 142.00, Steps:142
    Episode: 1360/2000, Reward: 142.00, Steps:142
    Episode: 1370/2000, Reward: 147.00, Steps:147
    Episode: 1380/2000, Reward: 196.00, Steps:196
    Episode: 1390/2000, Reward: 200.00, Steps:200
    Episode: 1400/2000, Reward: 163.00, Steps:163
    Episode: 1410/2000, Reward: 159.00, Steps:159
    Episode: 1420/2000, Reward: 170.00, Steps:170
    Episode: 1430/2000, Reward: 200.00, Steps:200
    Episode: 1440/2000, Reward: 200.00, Steps:200
    Episode: 1450/2000, Reward: 200.00, Steps:200
    Episode: 1460/2000, Reward: 200.00, Steps:200
    Episode: 1470/2000, Reward: 200.00, Steps:200
    Episode: 1480/2000, Reward: 200.00, Steps:200
    Episode: 1490/2000, Reward: 200.00, Steps:200
    Episode: 1500/2000, Reward: 200.00, Steps:200
    Episode: 1510/2000, Reward: 200.00, Steps:200
    Episode: 1520/2000, Reward: 75.00, Steps:75
    Episode: 1530/2000, Reward: 200.00, Steps:200
    Episode: 1540/2000, Reward: 200.00, Steps:200
    Episode: 1550/2000, Reward: 200.00, Steps:200
    Episode: 1560/2000, Reward: 189.00, Steps:189
    Episode: 1570/2000, Reward: 194.00, Steps:194
    Episode: 1580/2000, Reward: 200.00, Steps:200
    Episode: 1590/2000, Reward: 164.00, Steps:164
    Episode: 1600/2000, Reward: 200.00, Steps:200
    Episode: 1610/2000, Reward: 200.00, Steps:200
    Episode: 1620/2000, Reward: 161.00, Steps:161
    Episode: 1630/2000, Reward: 200.00, Steps:200
    Episode: 1640/2000, Reward: 135.00, Steps:135
    Episode: 1650/2000, Reward: 159.00, Steps:159
    Episode: 1660/2000, Reward: 115.00, Steps:115
    Episode: 1670/2000, Reward: 197.00, Steps:197
    Episode: 1680/2000, Reward: 200.00, Steps:200
    Episode: 1690/2000, Reward: 200.00, Steps:200
    Episode: 1700/2000, Reward: 157.00, Steps:157
    Episode: 1710/2000, Reward: 190.00, Steps:190
    Episode: 1720/2000, Reward: 127.00, Steps:127
    Episode: 1730/2000, Reward: 64.00, Steps:64
    Episode: 1740/2000, Reward: 178.00, Steps:178
    Episode: 1750/2000, Reward: 130.00, Steps:130
    Episode: 1760/2000, Reward: 142.00, Steps:142
    Episode: 1770/2000, Reward: 108.00, Steps:108
    Episode: 1780/2000, Reward: 99.00, Steps:99
    Episode: 1790/2000, Reward: 130.00, Steps:130
    Episode: 1800/2000, Reward: 147.00, Steps:147
    Episode: 1810/2000, Reward: 200.00, Steps:200
    Episode: 1820/2000, Reward: 60.00, Steps:60
    Episode: 1830/2000, Reward: 200.00, Steps:200
    Episode: 1840/2000, Reward: 93.00, Steps:93
    Episode: 1850/2000, Reward: 163.00, Steps:163
    Episode: 1860/2000, Reward: 189.00, Steps:189
    Episode: 1870/2000, Reward: 200.00, Steps:200
    Episode: 1880/2000, Reward: 200.00, Steps:200
    Episode: 1890/2000, Reward: 200.00, Steps:200
    Episode: 1900/2000, Reward: 200.00, Steps:200
    Episode: 1910/2000, Reward: 200.00, Steps:200
    Episode: 1920/2000, Reward: 200.00, Steps:200
    Episode: 1930/2000, Reward: 200.00, Steps:200
    Episode: 1940/2000, Reward: 102.00, Steps:102
    Episode: 1950/2000, Reward: 106.00, Steps:106
    Episode: 1960/2000, Reward: 200.00, Steps:200
    Episode: 1970/2000, Reward: 200.00, Steps:200
    Episode: 1980/2000, Reward: 200.00, Steps:200
    Episode: 1990/2000, Reward: 200.00, Steps:200
    Episode: 2000/2000, Reward: 200.00, Steps:200
    Training finished, time elapsed: 129.54206490516663 s
    Number of states: 4, number of actions: 2
    Start testing the agent......
    Env: CartPole-v0, Algorithm: A2C, Device: cpu
    Episode: 1/20, Steps:130, Reward: 130.00
    Episode: 2/20, Steps:200, Reward: 200.00
    Episode: 3/20, Steps:200, Reward: 200.00
    Episode: 4/20, Steps:200, Reward: 200.00
    Episode: 5/20, Steps:200, Reward: 200.00
    Episode: 6/20, Steps:200, Reward: 200.00
    Episode: 7/20, Steps:87, Reward: 87.00
    Episode: 8/20, Steps:200, Reward: 200.00
    Episode: 9/20, Steps:68, Reward: 68.00
    Episode: 10/20, Steps:200, Reward: 200.00
    Episode: 11/20, Steps:62, Reward: 62.00
    Episode: 12/20, Steps:200, Reward: 200.00
    Episode: 13/20, Steps:200, Reward: 200.00
    Episode: 14/20, Steps:200, Reward: 200.00
    Episode: 15/20, Steps:200, Reward: 200.00
    Episode: 16/20, Steps:200, Reward: 200.00
    Episode: 17/20, Steps:200, Reward: 200.00
    Episode: 18/20, Steps:200, Reward: 200.00
    Episode: 19/20, Steps:200, Reward: 200.00
    Episode: 20/20, Steps:200, Reward: 200.00
    Testing finished, time elapsed: 27.40801215171814 s
    

    (Figures: training and test reward curves)

    4.4 Rendering (visualization) settings

    If you find rendering too time-consuming, you can turn it off through the corresponding settings; conversely, if you want to watch the training process being rendered, you can enable that as well (see the note and sketch after the figure below).

    (Figure: the rendering-related arguments train_render and test_render)
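    In the code above, rendering is controlled by the train_render and test_render arguments. One caveat (my note, not from the original post): with argparse, type=bool does not parse "False" from the command line the way you might expect, because bool('False') is True in Python. The simplest options are to change the defaults directly in the code, or to switch to store_true flags, as in this hypothetical alternative:

    import argparse

    parser = argparse.ArgumentParser(description="hyper parameters")
    # Hypothetical alternative: boolean flags instead of type=bool arguments
    parser.add_argument('--train_render', action='store_true',
                        help="render the environment during training")
    parser.add_argument('--no_test_render', action='store_true',
                        help="disable rendering during testing")
    args = parser.parse_args([])               # parse an empty list just for illustration
    print(args.train_render)                   # False unless --train_render is passed
    print(not args.no_test_render)             # True unless --no_test_render is passed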

  • Original article: https://blog.csdn.net/weixin_51545953/article/details/127486106