• 强化学习--多维动作状态空间的设计



    免责声明:以下代码部分来自网络,部分来自ChatGPT,部分来自个人的理解。如有其他观点,欢迎讨论!

    一、离散动作

    注意:本文均以PPO算法为例。

    # time: 2023/11/22 21:04
    # author: YanJP
    
    
    import torch
    import torch
    import torch.nn as nn
    from torch.distributions import Categorical
    
    class MultiDimensionalActor(nn.Module):
        def __init__(self, input_dim, output_dims):
            super(MultiDimensionalActor, self).__init__()
    
            # Define a shared feature extraction network
            self.feature_extractor = nn.Sequential(
                nn.Linear(input_dim, 128),
                nn.ReLU(),
                nn.Linear(128, 64),
                nn.ReLU()
            )
    
            # Define individual output layers for each action dimension
            self.output_layers = nn.ModuleList([
                nn.Linear(64, num_actions) for num_actions in output_dims
            ])
    
        def forward(self, state):
            # Feature extraction
            features = self.feature_extractor(state)
    
            # Generate Categorical objects for each action dimension
            categorical_objects = [Categorical(logits=output_layer(features)) for output_layer in self.output_layers]
    
            return categorical_objects
    
    # 定义主函数
    def main():
        # 定义输入状态维度和每个动作维度的动作数
        input_dim = 10
        output_dims = [5, 8]  # 两个动作维度,分别有 3 和 4 个可能的动作
    
        # 创建 MultiDimensionalActor 实例
        actor_network = MultiDimensionalActor(input_dim, output_dims)
    
        # 生成输入状态(这里使用随机数据作为示例)
        state = torch.randn(1, input_dim)
    
        # 调用 actor 网络
        categorical_objects = actor_network(state)
    
        # 输出每个动作维度的采样动作和对应的对数概率
        for i, categorical in enumerate(categorical_objects):
            sampled_action = categorical.sample()
            log_prob = categorical.log_prob(sampled_action)
            print(f"Sampled action for dimension {i+1}: {sampled_action.item()}, Log probability: {log_prob.item()}")
    
    if __name__ == "__main__":
        main()
    
    #Sampled action for dimension 1: 1, Log probability: -1.4930928945541382
    #Sampled action for dimension 2: 3, Log probability: -2.1875085830688477
    
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41
    • 42
    • 43
    • 44
    • 45
    • 46
    • 47
    • 48
    • 49
    • 50
    • 51
    • 52
    • 53
    • 54
    • 55
    • 56
    • 57
    • 58
    • 59
    • 60
    • 61
    • 62

    注意代码中categorical函数的两个不同传入参数的区别:参考链接
    简单来说,logits是计算softmax的,probs直接就是已知概率的时候传进去就行。

    二、连续动作

    参考链接:github知乎
    为什么取对数概率?参考回答
    在这里插入图片描述

    1、例子1

    先看如下的代码:

    # time: 2023/11/21 21:33
    # author: YanJP
    #这是对应多维连续变量的例子:
    # 参考链接:https://github.com/XinJingHao/PPO-Continuous-Pytorch/blob/main/utils.py
    # https://www.zhihu.com/question/417161289
    import torch.nn as nn
    import torch
    class Policy(nn.Module):
        def __init__(self, in_dim, n_hidden_1, n_hidden_2, num_outputs):
            super(Policy, self).__init__()
            self.layer = nn.Sequential(
                nn.Linear(in_dim, n_hidden_1),
                nn.ReLU(True),
                nn.Linear(n_hidden_1, n_hidden_2),
                nn.ReLU(True),
                nn.Linear(n_hidden_2, num_outputs)
            )
    
    class Normal(nn.Module):
        def __init__(self, num_outputs):
            super().__init__()
            self.stds = nn.Parameter(torch.zeros(num_outputs))  #创建一个可学习的参数 
        def forward(self, x):
            dist = torch.distributions.Normal(loc=x, scale=self.stds.exp())
            action = dist.sample((every_dimention_output,))  #这里我觉得是最重要的,不填sample的参数的话,默认每个分布只采样一个值!!!!!!!!
            return action
    
    if __name__ == '__main__':
        policy = Policy(4,20,20,5)
        normal = Normal(5) #设置5个维度
        every_dimention_output=10  #每个维度10个输出
        observation = torch.Tensor(4)
        action = normal.forward(policy.layer( observation))
        print("action: ",action)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • self.stds.exp(),表示求指数,因为正态分布的标准差都是正数。
    • action = dist.sample((every_dimention_output,))这里最重要!!!

    2、知乎给出的示例

    
    class Agent(nn.Module):
        def __init__(self, envs):
            super(Agent, self).__init__()
            self.actor_mean = nn.Sequential(
                layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
                nn.Tanh(),
                layer_init(nn.Linear(64, 64)),
                nn.Tanh(),
                layer_init(nn.Linear(64, np.prod(envs.single_action_space.shape)), std=0.01),
            )
            self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape)))
    
        def get_action_and_value(self, x, action=None):
            action_mean = self.actor_mean(x)
            action_logstd = self.actor_logstd.expand_as(action_mean)
            action_std = torch.exp(action_logstd)
            probs = Normal(action_mean, action_std)
            if action is None:
                action = probs.sample()
            return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(x)
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21

    这里的np.prod(envs.single_action_space.shape),表示每个维度的动作数相乘,然后初始化这么多个actor网络的标准差和均值,最后action里面的sample就是采样这么多个数据。(感觉还是拉成了一维计算)

    2、github里面的代码

    github

    import torch
    import torch.nn as nn
    import torch.nn.functional as F
    from torch.distributions import Beta,Normal
    
    class GaussianActor_musigma(nn.Module):
    	def __init__(self, state_dim, action_dim, net_width):
    		super(GaussianActor_musigma, self).__init__()
    
    		self.l1 = nn.Linear(state_dim, net_width)
    		self.l2 = nn.Linear(net_width, net_width)
    		self.mu_head = nn.Linear(net_width, action_dim)
    		self.sigma_head = nn.Linear(net_width, action_dim)
    
    	def forward(self, state):
    		a = torch.tanh(self.l1(state))
    		a = torch.tanh(self.l2(a))
    		mu = torch.sigmoid(self.mu_head(a))
    		sigma = F.softplus( self.sigma_head(a) )
    		return mu,sigma
    
    	def get_dist(self, state):
    		mu,sigma = self.forward(state)
    		dist = Normal(mu,sigma)
    		return dist
    
    	def deterministic_act(self, state):
    		mu, sigma = self.forward(state)
    		return mu
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29

    上述代码主要是通过设置mu_head 和sigma_head的个数,来实现多维动作。

    class GaussianActor_mu(nn.Module):
    	def __init__(self, state_dim, action_dim, net_width, log_std=0):
    		super(GaussianActor_mu, self).__init__()
    
    		self.l1 = nn.Linear(state_dim, net_width)
    		self.l2 = nn.Linear(net_width, net_width)
    		self.mu_head = nn.Linear(net_width, action_dim)
    		self.mu_head.weight.data.mul_(0.1)
    		self.mu_head.bias.data.mul_(0.0)
    
    		self.action_log_std = nn.Parameter(torch.ones(1, action_dim) * log_std)
    
    	def forward(self, state):
    		a = torch.relu(self.l1(state))
    		a = torch.relu(self.l2(a))
    		mu = torch.sigmoid(self.mu_head(a))
    		return mu
    
    	def get_dist(self,state):
    		mu = self.forward(state)
    		action_log_std = self.action_log_std.expand_as(mu)
    		action_std = torch.exp(action_log_std)
    
    		dist = Normal(mu, action_std)
    		return dist
    
    	def deterministic_act(self, state):
    		return self.forward(state)
    class Critic(nn.Module):
    	def __init__(self, state_dim,net_width):
    		super(Critic, self).__init__()
    
    		self.C1 = nn.Linear(state_dim, net_width)
    		self.C2 = nn.Linear(net_width, net_width)
    		self.C3 = nn.Linear(net_width, 1)
    
    	def forward(self, state):
    		v = torch.tanh(self.C1(state))
    		v = torch.tanh(self.C2(v))
    		v = self.C3(v)
    		return v
    
    • 1
    • 2
    • 3
    • 4
    • 5
    • 6
    • 7
    • 8
    • 9
    • 10
    • 11
    • 12
    • 13
    • 14
    • 15
    • 16
    • 17
    • 18
    • 19
    • 20
    • 21
    • 22
    • 23
    • 24
    • 25
    • 26
    • 27
    • 28
    • 29
    • 30
    • 31
    • 32
    • 33
    • 34
    • 35
    • 36
    • 37
    • 38
    • 39
    • 40
    • 41

    上述代码只定义了mu的个数与维度数一样,std作为可学习的参数之一。

  • 相关阅读:
    【实用】Linux系统知识小汇
    js书写规范
    MySQL实现的一点总结(五)
    最新总结MySQL核心知识点
    debian无法使用root用户登陆系统
    Boot Rom和Bootloader
    【Spring】事务管理
    【数据结构】排序--插入排序(希尔排序)
    Hbase 之KeyValue结构详解
    Unity关于无法新建项目的可能解决办法
  • 原文地址:https://blog.csdn.net/qq_45889056/article/details/134562556