强化学习--多维动作状态空间的设计

一、离散动作

注意：本文均以PPO算法为例。

# time: 2023/11/22 21:04
# author: YanJP


import torch
import torch
import torch.nn as nn
from torch.distributions import Categorical

class MultiDimensionalActor(nn.Module):
    def __init__(self, input_dim, output_dims):
        super(MultiDimensionalActor, self).__init__()

        # Define a shared feature extraction network
        self.feature_extractor = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU()
        )

        # Define individual output layers for each action dimension
        self.output_layers = nn.ModuleList([
            nn.Linear(64, num_actions) for num_actions in output_dims
        ])

    def forward(self, state):
        # Feature extraction
        features = self.feature_extractor(state)

        # Generate Categorical objects for each action dimension
        categorical_objects = [Categorical(logits=output_layer(features)) for output_layer in self.output_layers]

        return categorical_objects

# 定义主函数
def main():
    # 定义输入状态维度和每个动作维度的动作数
    input_dim = 10
    output_dims = [5, 8]  # 两个动作维度，分别有 3 和 4 个可能的动作

    # 创建 MultiDimensionalActor 实例
    actor_network = MultiDimensionalActor(input_dim, output_dims)

    # 生成输入状态（这里使用随机数据作为示例）
    state = torch.randn(1, input_dim)

    # 调用 actor 网络
    categorical_objects = actor_network(state)

    # 输出每个动作维度的采样动作和对应的对数概率
    for i, categorical in enumerate(categorical_objects):
        sampled_action = categorical.sample()
        log_prob = categorical.log_prob(sampled_action)
        print(f"Sampled action for dimension {i+1}: {sampled_action.item()}, Log probability: {log_prob.item()}")

if __name__ == "__main__":
    main()

#Sampled action for dimension 1: 1, Log probability: -1.4930928945541382
#Sampled action for dimension 2: 3, Log probability: -2.1875085830688477

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

注意代码中categorical函数的两个不同传入参数的区别：参考链接
简单来说，logits是计算softmax的，probs直接就是已知概率的时候传进去就行。

二、连续动作

参考链接：github、知乎
为什么取对数概率？参考回答
在这里插入图片描述

1、例子1

先看如下的代码：

# time: 2023/11/21 21:33
# author: YanJP
#这是对应多维连续变量的例子：
# 参考链接：https://github.com/XinJingHao/PPO-Continuous-Pytorch/blob/main/utils.py
# https://www.zhihu.com/question/417161289
import torch.nn as nn
import torch
class Policy(nn.Module):
    def __init__(self, in_dim, n_hidden_1, n_hidden_2, num_outputs):
        super(Policy, self).__init__()
        self.layer = nn.Sequential(
            nn.Linear(in_dim, n_hidden_1),
            nn.ReLU(True),
            nn.Linear(n_hidden_1, n_hidden_2),
            nn.ReLU(True),
            nn.Linear(n_hidden_2, num_outputs)
        )

class Normal(nn.Module):
    def __init__(self, num_outputs):
        super().__init__()
        self.stds = nn.Parameter(torch.zeros(num_outputs))  #创建一个可学习的参数 
    def forward(self, x):
        dist = torch.distributions.Normal(loc=x, scale=self.stds.exp())
        action = dist.sample((every_dimention_output,))  #这里我觉得是最重要的，不填sample的参数的话，默认每个分布只采样一个值！！！！！！！！
        return action

if __name__ == '__main__':
    policy = Policy(4,20,20,5)
    normal = Normal(5) #设置5个维度
    every_dimention_output=10  #每个维度10个输出
    observation = torch.Tensor(4)
    action = normal.forward(policy.layer( observation))
    print("action: ",action)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34

self.stds.exp()，表示求指数，因为正态分布的标准差都是正数。
action = dist.sample((every_dimention_output,))这里最重要！！！

2、知乎给出的示例


class Agent(nn.Module):
    def __init__(self, envs):
        super(Agent, self).__init__()
        self.actor_mean = nn.Sequential(
            layer_init(nn.Linear(np.array(envs.single_observation_space.shape).prod(), 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, 64)),
            nn.Tanh(),
            layer_init(nn.Linear(64, np.prod(envs.single_action_space.shape)), std=0.01),
        )
        self.actor_logstd = nn.Parameter(torch.zeros(1, np.prod(envs.single_action_space.shape)))

    def get_action_and_value(self, x, action=None):
        action_mean = self.actor_mean(x)
        action_logstd = self.actor_logstd.expand_as(action_mean)
        action_std = torch.exp(action_logstd)
        probs = Normal(action_mean, action_std)
        if action is None:
            action = probs.sample()
        return action, probs.log_prob(action).sum(1), probs.entropy().sum(1), self.critic(x)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21

这里的np.prod(envs.single_action_space.shape)，表示每个维度的动作数相乘，然后初始化这么多个actor网络的标准差和均值，最后action里面的sample就是采样这么多个数据。（感觉还是拉成了一维计算）

2、github里面的代码

github

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Beta,Normal

class GaussianActor_musigma(nn.Module):
	def __init__(self, state_dim, action_dim, net_width):
		super(GaussianActor_musigma, self).__init__()

		self.l1 = nn.Linear(state_dim, net_width)
		self.l2 = nn.Linear(net_width, net_width)
		self.mu_head = nn.Linear(net_width, action_dim)
		self.sigma_head = nn.Linear(net_width, action_dim)

	def forward(self, state):
		a = torch.tanh(self.l1(state))
		a = torch.tanh(self.l2(a))
		mu = torch.sigmoid(self.mu_head(a))
		sigma = F.softplus( self.sigma_head(a) )
		return mu,sigma

	def get_dist(self, state):
		mu,sigma = self.forward(state)
		dist = Normal(mu,sigma)
		return dist

	def deterministic_act(self, state):
		mu, sigma = self.forward(state)
		return mu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29

上述代码主要是通过设置mu_head 和sigma_head的个数，来实现多维动作。

class GaussianActor_mu(nn.Module):
	def __init__(self, state_dim, action_dim, net_width, log_std=0):
		super(GaussianActor_mu, self).__init__()

		self.l1 = nn.Linear(state_dim, net_width)
		self.l2 = nn.Linear(net_width, net_width)
		self.mu_head = nn.Linear(net_width, action_dim)
		self.mu_head.weight.data.mul_(0.1)
		self.mu_head.bias.data.mul_(0.0)

		self.action_log_std = nn.Parameter(torch.ones(1, action_dim) * log_std)

	def forward(self, state):
		a = torch.relu(self.l1(state))
		a = torch.relu(self.l2(a))
		mu = torch.sigmoid(self.mu_head(a))
		return mu

	def get_dist(self,state):
		mu = self.forward(state)
		action_log_std = self.action_log_std.expand_as(mu)
		action_std = torch.exp(action_log_std)

		dist = Normal(mu, action_std)
		return dist

	def deterministic_act(self, state):
		return self.forward(state)
class Critic(nn.Module):
	def __init__(self, state_dim,net_width):
		super(Critic, self).__init__()

		self.C1 = nn.Linear(state_dim, net_width)
		self.C2 = nn.Linear(net_width, net_width)
		self.C3 = nn.Linear(net_width, 1)

	def forward(self, state):
		v = torch.tanh(self.C1(state))
		v = torch.tanh(self.C2(v))
		v = self.C3(v)
		return v
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41

上述代码只定义了mu的个数与维度数一样，std作为可学习的参数之一。

相关阅读:
React Redux
webpack常用loader和pluglin
【无标题】51单片机人体反应速度测试仪带万年历功能LCD1602 DS1302
Cocos Creator 3.8.x 后效处理（前向渲染）
javaEE基于springboot民宿推荐系统springmvc+mybatis+jsp]
从零开始学习Dubbo4——让模块独立运行
 自然语言处理 Paddle NLP - 预训练语言模型及应用
 深度学习开发环境搭建
 阿里云交互式建模（PAI-DSW）训练并微调推理ChatGLM模型
 C++ 的typedef详解
原文地址：https://blog.csdn.net/qq_45889056/article/details/134562556