• Deep Reinforcement Learning in Practice: Finding a Robot's Optimal Policy with Q-Learning and Expected Sarsa (Very Detailed, with Source Code)


    If you need the source code and the environment setup, please like, follow, and bookmark, then leave your QQ in the comments~~~

    1. The Q-Learning Algorithm

    In Q-Learning, the action-value function Q is updated toward the optimal action-value function q*, independently of the behavior policy the Agent actually follows. When evaluating Q, the update target is a direct approximation of q*, so forming the target requires enumerating all actions available in the state being backed up and taking the maximum. Provided every state can be visited infinitely often, Q-Learning converges with probability 1 to the optimal action-value function and the optimal policy.

    The figure below shows the flowchart of the Q-Learning algorithm for estimating the optimal policy.

    Although Q-Learning is an off-policy method, its value-function update rule shows that it does not use importance sampling.
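
    For reference, the one-step backup that the flowchart and the code below implement is

        Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \left[ R_{t+1} + \gamma \max_{a} Q(S_{t+1}, a) - Q(S_t, A_t) \right]

    Because the target always uses \max_a Q(S_{t+1}, a), no matter which action the ε-greedy behavior policy actually executes next, no importance-sampling ratio is needed to correct for the mismatch between the behavior policy and the target policy.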

     

     We now use Q-Learning to solve the sweeping-robot problem in the deterministic environment. The parameter settings are the same as before (the call at the end of the listing uses 25,000 episodes with α = 0.05, γ = 0.8, and an initial ε = 0.5, both ε and α decaying linearly), and an ε-greedy behavior policy is used.

    Robot background and environment setup

    The output is as follows:

     

     

     The code is as follows:

    # Q-Learning algorithm
    from 扫地机器人gym环境 import GridWorldEnv   # the sweeping-robot gridworld environment
    import numpy as np

    np.random.seed(1)
    env = GridWorldEnv()

    # valid actions in state s (0 = left, 1 = right, 2 = up, 3 = down)
    def vilid_action_space(s):
        action_sacpe = []
        if s % 5 != 0:   # left
            action_sacpe.append(0)
        if s % 5 != 4:   # right
            action_sacpe.append(1)
        if s <= 19:      # up
            action_sacpe.append(2)
        if s >= 5:       # down
            action_sacpe.append(3)
        return action_sacpe

    # epsilon-greedy behavior policy over the valid actions
    def policy_epsilon_greedy(s, Q, epsilon):
        Q_s = Q[s]
        action = vilid_action_space(s)
        if np.random.rand() < epsilon:
            a = np.random.choice(action)
        else:
            index_a = np.argmax([Q_s[i] for i in action])
            a = action[index_a]
        return a

    # reorder the action values from (left, right, up, down) to (up, down, left, right) for printing
    def trans1(Q_S):
        return [Q_S[2], Q_S[3], Q_S[0], Q_S[1]]

    # same reordering, rounded to three decimals
    def trans(Q_S):
        return [round(Q_S[2], 3), round(Q_S[3], 3), round(Q_S[0], 3), round(Q_S[1], 3)]

    # print the details of a single update for the transition s=15, a=down, next_s=10
    # at the milestone episodes print_len and 1.1 * print_len
    def print_dd(s, a, next_s, print_len, episode_i, Q, e_k, a_k):
        for i in range(2):
            if episode_i == int(print_len * (0.1 * i + 1)):
                if s == 15 and a == 3 and next_s == 10:
                    print("********************* single-step update *********************")
                    print("alpha:" + str(a_k))
                    print("epsilon:" + str(e_k))
                    print("episode:" + str(int(print_len * (0.1 * i + 1))))
                    print("Q(%d,%d)" % (s, a))
                    print(Q[s][a])
                    print("Q(%d,*)" % (next_s))
                    print(trans1(Q[next_s]))
                    # preview of the updated value (gamma = 0.8; the reward on this transition is taken as 0)
                    print('output:' + str(Q[s][a] + a_k * (0.8 * np.max(Q[next_s]) - Q[s, a])))

    # at selected episodes, print the whole Q table and the epsilon-greedy action probabilities
    def print_ff(list_q, Q, episode_i, epsilon_k, alpha_k):
        list_s = range(0, 25)
        for em in list_q:
            if em == episode_i:
                print("*********************** episode: %s ***********************" % (str(em)))
                for state in list_s:
                    print("Q(%d,*)" % (state) + str(trans(Q[state])))
                    action = vilid_action_space(state)
                    len_a = len(action)
                    e_p = epsilon_k / float(len_a)
                    prob = []
                    index_a = np.argmax([Q[state][i] for i in action])
                    for i in range(4):   # epsilon-greedy probability of each of the four actions
                        if i not in action:
                            prob.append(0.0)
                        else:
                            if i == action[index_a]:
                                prob.append(1 - epsilon_k + e_p)
                            else:
                                prob.append(e_p)
                    print('action probabilities:' + str(trans(prob)))
                print("epsilon_k: {}".format(epsilon_k))
                print("alpha_k:{}".format(alpha_k))

    # linear decay of epsilon and alpha over the course of training
    def Attenuation(epsilon, alpha, episode_sum, episode):
        epsilon = (float(episode_sum) - float(episode)) / float(episode_sum) * epsilon
        alpha = (float(episode_sum) - float(episode)) / float(episode_sum) * alpha
        return epsilon, alpha

    def Q_Learning(env, episode_num, alpha, gamma, epsilon):
        # NOTE: the head of this function was cut off in the original post; everything down to
        # the "while not done" loop is reconstructed by analogy with Expectation_sarsa() below
        # and should be read as an assumption, not the author's exact code.
        Q = np.zeros((env.n_width * env.n_height, env.action_space.n))
        # episodes at which print_ff() dumps the whole Q table (milestones chosen for illustration)
        list_q = [0, 1, 2, 5000, 10000, 20000, episode_num - 1]
        for episode_i in range(episode_num):
            s = env.reset()   # assumes reset() returns the initial state, as in the standard gym API
            epsilon_k, alpha_k = Attenuation(epsilon, alpha, episode_num, episode_i)
            print_ff(list_q, Q, episode_i, epsilon_k, alpha_k)
            done = False
            while not done:
                a = policy_epsilon_greedy(s, Q, epsilon_k)
                next_s, r, done, _ = env.step(a)
                print_dd(s, a, next_s, 10000, episode_i, Q, epsilon_k, alpha_k)
                # off-policy TD target: r + gamma * max_a Q(next_s, a)
                Q[s, a] += alpha_k * (r + gamma * np.max(Q[next_s]) - Q[s, a])
                s = next_s
        return Q

    Q = Q_Learning(env, 25000, 0.05, 0.8, 0.5)
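
    Once training finishes, the greedy policy can be read directly off the learned Q table. The helper below is not part of the original post; it is a minimal sketch that reuses vilid_action_space and assumes the 5x5 state numbering that function implies (action 0 = left, 1 = right, 2 = up, 3 = down, and state s + 5 lies directly above state s):

    # Hypothetical helper (not in the original post): print the greedy action for each state.
    def extract_greedy_policy(Q):
        arrows = {0: 'L', 1: 'R', 2: 'U', 3: 'D'}
        policy = []
        for s in range(25):
            actions = vilid_action_space(s)                      # restrict to valid moves
            best = actions[int(np.argmax([Q[s][a] for a in actions]))]
            policy.append(arrows[best])
        for row in range(4, -1, -1):                             # top row (states 20-24) first
            print(' '.join(policy[row * 5: row * 5 + 5]))

    extract_greedy_policy(Q)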

    2. The Expected Sarsa Algorithm

    By modifying Sarsa we obtain an off-policy TD algorithm that considers every action possible under the current policy and updates its estimate with the expected value of the action-value function instead of the value of one particular sampled action; this algorithm is called Expected Sarsa.

    Compared with Sarsa, Expected Sarsa is computationally more expensive, but that extra computation effectively eliminates the variance caused by randomly sampling the next action, so in most cases Expected Sarsa clearly outperforms Sarsa. In addition, Expected Sarsa can also be used off-policy, in which case it generalizes Q-Learning and can improve its performance.
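
    Concretely, where Sarsa bootstraps from the single sampled next action and Q-Learning from the maximum, Expected Sarsa bootstraps from the expectation of the next action value under the current ε-greedy policy π:

        Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha \left[ R_{t+1} + \gamma \sum_{a} \pi(a \mid S_{t+1}) Q(S_{t+1}, a) - Q(S_t, A_t) \right]

    In the listing below, compute_epsion builds π(a | S_{t+1}) over the valid actions and compute_e_q forms the weighted sum.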

    Below we use Expected Sarsa to solve the sweeping-robot problem in the same deterministic environment; the background is the same as before and is not repeated here.

    After about 20,000 episodes the Q-values have essentially converged.
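
    The post does not show how convergence was checked. One simple way to verify such a claim, sketched here as an assumption rather than the author's method, is to compare snapshots of the Q table taken some episodes apart and watch the largest entry-wise change shrink:

    # Hypothetical convergence monitor (not from the original post).
    import numpy as np

    def max_q_change(Q_prev, Q_curr):
        # largest absolute change between two snapshots of the Q table
        return float(np.max(np.abs(Q_curr - Q_prev)))

    # usage sketch inside the training loop, e.g. every 1000 episodes:
    #   if episode_i % 1000 == 0:
    #       if Q_prev is not None and max_q_change(Q_prev, Q) < 1e-3:
    #           print("Q table has essentially converged at episode", episode_i)
    #       Q_prev = Q.copy()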

     

     The code is as follows:

    # Expected Sarsa algorithm
    from 扫地机器人gym环境 import GridWorldEnv   # the sweeping-robot gridworld environment
    import numpy as np
    from queue import Queue

    np.random.seed(1)
    env = GridWorldEnv()

    # valid actions in state s (0 = left, 1 = right, 2 = up, 3 = down)
    def vilid_action_space(s):
        action_sacpe = []
        if s % 5 != 0:   # left
            action_sacpe.append(0)
        if s % 5 != 4:   # right
            action_sacpe.append(1)
        if s <= 19:      # up
            action_sacpe.append(2)
        if s >= 5:       # down
            action_sacpe.append(3)
        return action_sacpe

    # epsilon-greedy behavior policy over the valid actions
    def policy_epsilon_greedy(s, Q, epsilon):
        Q_s = Q[s]
        action = vilid_action_space(s)
        if np.random.rand() < epsilon:
            a = np.random.choice(action)
        else:
            index_a = np.argmax([Q_s[i] for i in action])
            a = action[index_a]
        return a

    # probability of each of the four actions in state s under the epsilon-greedy policy
    # (invalid actions get probability 0)
    def compute_epsion(s, Q, epsilon):
        max_a = np.argmax(Q[s])
        action = vilid_action_space(s)
        len_all_a = len(action)
        prob_l = [0.0, 0.0, 0.0, 0.0]
        for index_a in action:
            if index_a == max_a:
                prob_l[index_a] = 1.0 - epsilon + (epsilon / len_all_a)
            else:
                prob_l[index_a] = epsilon / len_all_a
        return prob_l

    # expectation of the next-state action values under the policy probabilities
    def compute_e_q(prob, q_n):
        total = 0.0
        for i in range(4):
            total += prob[i] * q_n[i]
        return total

    # reorder the action values from (left, right, up, down) to (up, down, left, right) for printing
    def trans1(Q_S):
        return [Q_S[2], Q_S[3], Q_S[0], Q_S[1]]

    # same reordering, rounded to three decimals
    def trans(Q_S):
        return [round(Q_S[2], 3), round(Q_S[3], 3), round(Q_S[0], 3), round(Q_S[1], 3)]

    # print the details of a single update for the transition s=15, a=down, next_s=10
    # at the milestone episodes print_len, 1.02 * print_len, ..., 1.98 * print_len
    def print_dd(s, a, next_s, print_len, episode_i, Q, e_k, a_k):
        for i in range(50):
            if episode_i == int(print_len * ((0.02 * i) + 1)):
                if s == 15 and a == 3 and next_s == 10:
                    print("********************* single-step update *********************")
                    print("alpha:" + str(a_k))
                    print("epsilon:" + str(e_k))
                    print("episode:" + str(int(print_len * (1 + (0.02 * i)))))
                    print("Q(%d,%d)" % (s, a))
                    print(Q[s][a])
                    print("Q(%d,*)" % (next_s))
                    print(trans1(Q[next_s]))
                    prob_l = compute_epsion(next_s, Q, e_k)
                    print('probabilities:' + str(trans1(prob_l)))
                    Q_e = compute_e_q(prob_l, Q[next_s])
                    # preview of the updated value (gamma = 0.8; the reward on this transition is taken as 0)
                    print('update:' + str(Q[s, a] + a_k * (0.8 * Q_e - Q[s, a])))

    # at selected episodes, print the whole Q table and the epsilon-greedy action probabilities
    def print_ff(list_q, Q, episode_i, epsilon_k, alpha_k):
        list_s = range(0, 25)
        for em in list_q:
            if em == episode_i:
                print("*********************** episode: %s ***********************" % (str(em)))
                for state in list_s:
                    print("Q(%d,*) " % (state) + str(trans(Q[state])))
                    action = vilid_action_space(state)
                    len_a = len(action)
                    e_p = epsilon_k / float(len_a)
                    prob = []
                    index_a = np.argmax([Q[state][i] for i in action])
                    for i in range(4):   # epsilon-greedy probability of each of the four actions
                        if i not in action:
                            prob.append(0.0)
                        else:
                            if i == action[index_a]:
                                prob.append(1 - epsilon_k + e_p)
                            else:
                                prob.append(e_p)
                    print('action probabilities:' + str(trans(prob)))
                print("epsilon_k: {}".format(epsilon_k))
                print("alpha_k:{}".format(alpha_k))

    # linear decay of epsilon and alpha over the course of training
    def Attenuation(epsilon, alpha, episode_sum, episode):
        epsilon = (float(episode_sum) - float(episode)) / float(episode_sum) * epsilon
        alpha = (float(episode_sum) - float(episode)) / float(episode_sum) * alpha
        return epsilon, alpha

    def Expectation_sarsa(env, episode_num, alpha, gamma, epsilon):
        Q = np.zeros((env.n_width * env.n_height, env.action_space.n))
        Q_queue = Queue(maxsize=11)   # declared in the original post; the code that used it was cut off
        # NOTE: the episode loop below was truncated in the original post; it is reconstructed
        # by analogy with the Q-Learning version above and should be read as an assumption.
        list_q = [0, 1, 2, 5000, 10000, 15000, episode_num - 1]   # print milestones, chosen for illustration
        for episode_i in range(episode_num):
            s = env.reset()   # assumes reset() returns the initial state, as in the standard gym API
            epsilon_k, alpha_k = Attenuation(epsilon, alpha, episode_num, episode_i)
            print_ff(list_q, Q, episode_i, epsilon_k, alpha_k)
            done = False
            while not done:
                a = policy_epsilon_greedy(s, Q, epsilon_k)
                next_s, r, done, _ = env.step(a)
                print_dd(s, a, next_s, 10000, episode_i, Q, epsilon_k, alpha_k)
                # expected-Sarsa target: expectation of Q(next_s, .) under the epsilon-greedy policy
                prob_l = compute_epsion(next_s, Q, epsilon_k)
                Q_e = compute_e_q(prob_l, Q[next_s])
                Q[s, a] += alpha_k * (r + gamma * Q_e - Q[s, a])
                s = next_s
        return Q

    Q = Expectation_sarsa(env, 20000, 0.05, 0.8, 0.5)

    Writing this took real effort. If you found it helpful, please like, follow, and bookmark~~~

• Original article: https://blog.csdn.net/jiebaoshayebuhui/article/details/128012217