
Reinforcement Learning: Implementing PPO on Pendulum


The code is all adapted from other people's work, but I want to share a few big pitfalls I ran into.

1. Monte Carlo V values

The example in the book takes one trajectory at a time and computes the state values step by step with v = r + gamma * v. That is essentially a pure Monte Carlo estimate of the state value, and it does not average over multiple trajectories; I think this approach is very poor.
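
For reference, here is a minimal sketch (mine, not the book's code) of that backward recursion, plus an average over several trajectories to reduce variance; `trajectories` is a hypothetical list of per-episode reward lists used only for illustration:

import numpy as np

def discounted_returns(rewards, gamma=0.99):
    # Backward recursion v = r + gamma * v over a single trajectory.
    v = 0.0
    returns = []
    for r in reversed(rewards):
        v = r + gamma * v
        returns.append(v)
    return list(reversed(returns))

# `trajectories` is a hypothetical list of per-episode reward lists.
trajectories = [[1.0, 0.0, 1.0], [0.0, 1.0, 1.0]]
all_returns = [discounted_returns(rs) for rs in trajectories]
# Averaging the return of the initial state across trajectories reduces variance.
v_start = np.mean([ret[0] for ret in all_returns])
print(v_start)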

2. The samples are not i.i.d.

Because of the issue in point 1, the collected samples are not independent and identically distributed; training on such samples can significantly degrade the results.

(Note added 2022/12/29: clearly I had not understood this properly at the time. PPO works with on-policy samples, and what I described in point 1 is not Monte Carlo either; it is still a TD error.)
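
For contrast, here is a minimal sketch of the bootstrapped one-step (TD(0)) target that the code below actually uses instead of a full Monte Carlo return; the tensors are illustrative placeholders, not real data:

import torch

gamma = 0.99
# Illustrative placeholders standing in for a sampled mini-batch.
rewards = torch.tensor([[1.0], [0.5]])
v_next = torch.tensor([[2.0], [1.5]])   # V(s') from the value network
v_now = torch.tensor([[1.8], [1.2]])    # V(s) from the value network

td_target = rewards + gamma * v_next    # bootstrapped one-step target
advantage = td_target - v_now           # TD error, used as the advantage
print(td_target, advantage)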

3. The code was too convoluted.

As the saying goes, keep it simple. Wrapping too many steps into separate functions at this early stage is not a good habit: it makes debugging very inconvenient, and those wrappers should all be removed.

4. The network very easily outputs [nan]

This is probably because I used torch.Tensor() to convert the vectors and ended up with double-precision tensors, which also use more memory; switching to torch.FloatTensor() brought a clear improvement. This point is extremely important: without it, training may simply not work at all.
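
A minimal sketch of the dtype pitfall, assuming the observations arrive as float64 NumPy arrays (not necessarily the exact failure I hit): converting them with torch.from_numpy keeps them double precision, which no longer matches the float32 network weights, while an explicit cast to float32 avoids the problem.

import numpy as np
import torch

obs = np.array([0.1, -0.2, 0.3])                      # NumPy defaults to float64
t_double = torch.from_numpy(obs)                      # stays float64 (DoubleTensor)
t_float = torch.FloatTensor(obs)                      # cast down to float32
t_float2 = torch.as_tensor(obs, dtype=torch.float32)  # equivalent, more explicit

print(t_double.dtype, t_float.dtype, t_float2.dtype)
# torch.float64 torch.float32 torch.float32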

[Figure: training results]

The code is as follows:

  1. """
  2. """
  3. import torch.nn.functional as F
  4. import torchvision.models as models
  5. import retro
  6. import hiddenlayer as hl
  7. import torch
  8. # import retro
  9. import pandas as pd
  10. import numpy as np
  11. import gym
  12. import torch.nn as nn
  13. from torch.distributions import Normal
  14. class DQBReplayer:
  15. def __init__(self,capacity):
  16. self.memory = pd.DataFrame(index=range(capacity),columns=['observation','action','reward','next_observation','done','step'])
  17. self.i=0
  18. self.count=0
  19. self.capacity=capacity
  20. def store(self,*args):
  21. self.memory.loc[self.i]=args
  22. self.i=(self.i+1)%self.capacity
  23. self.count=min(self.count+1,self.capacity)
  24. def sample(self,size=32):
  25. indics=np.random.choice(self.count,size=size)
  26. return (np.stack(self.memory.loc[indics,field]) for field in self.memory.columns)#为什么#是第indics行和feild列
  27. def clear(self):
  28. self.memory.drop(self.memory.index,inplace=True)
  29. self.count=0
  30. self.i=0
  31. #
  32. class PolicyNetwork(nn.Module):
  33. def __init__(self):
  34. super(PolicyNetwork, self).__init__()
  35. self.relu = nn.ReLU()
  36. self.fc1 = nn.Linear(3, 64)
  37. self.fc2 = nn.Linear(64, 256)
  38. self.fc_mu = nn.Linear(256, 1)
  39. self.fc_std = nn.Linear(256, 1)
  40. self.tanh = nn.Tanh()
  41. self.softplus = nn.Softplus()
  42. def forward(self, x):
  43. x = self.relu(self.fc1(x))
  44. x = self.relu(self.fc2(x))
  45. mu = 2 * self.tanh(self.fc_mu(x))
  46. std = self.softplus(self.fc_std(x)) + 1e-3
  47. return mu, std
  48. def select_action(self, state):
  49. with torch.no_grad():
  50. mu, std = self.forward(state)
  51. n = Normal(mu, std)
  52. action = n.sample()
  53. # print(" ac{:.1f},mu{},std{}".format( float(action),mu,std), end=" ")
  54. return np.clip(action.item(), -2., 2.)
  55. class ValueNetwork(nn.Module):
  56. def __init__(self):
  57. super(ValueNetwork, self).__init__()
  58. self.relu = nn.ReLU()
  59. self.fc1 = nn.Linear(3, 64)
  60. self.fc2 = nn.Linear(64, 256)
  61. self.fc3 = nn.Linear(256, 1)
  62. def forward(self, x):
  63. x = self.relu(self.fc1(x))
  64. x = self.relu(self.fc2(x))
  65. x = self.fc3(x)
  66. return x
  67. class PPO(nn.Module):
  68. def __init__(self):
  69. super(PPO,self).__init__()
  70. self.replayer=DQBReplayer(capacity=1000)
  71. self.gamma=0.99
  72. self.policy = PolicyNetwork().to(device)
  73. self.old_policy = PolicyNetwork().to(device)
  74. self.value = ValueNetwork().to(device)
  75. self.learn_step=0
  76. self.canvasl = hl.Canvas()
  77. self.history = hl.History()
  78. if __name__ == "__main__":
  79. device=torch.device("cuda" if torch.cuda.is_available() else"cpu")
  80. env=gym.make("Pendulum-v0").unwrapped
  81. net=PPO().to(device)
  82. optim = torch.optim.Adam(net.policy.parameters(), lr=0.001)
  83. value_optim= torch.optim.Adam(net.value.parameters(), lr=0.001)
  84. for i in range(200000):
  85. state = env.reset()
  86. epoch_reward=0#每局游戏的累计奖励
  87. for step in range(200):
  88. # env.render()
  89. state_tensor = torch.FloatTensor(state).to(device)
  90. action=net.policy.select_action(state_tensor)
  91. next_state,r,done,info=env.step([action])
  92. reward = (r + 8.1) / 8.1
  93. epoch_reward+=reward
  94. net.replayer.store(state, action, reward, next_state, done,step)
  95. net.learn_step += 1
  96. state = next_state
  97. net.old_policy.load_state_dict(net.policy.state_dict())
  98. for K in range(10):
  99. sample_n = net.replayer.count
  100. states, actions, rewards, next_states, dones, steps = net.replayer.sample(32)
  101. states = torch.FloatTensor(states).to(device)
  102. next_states = torch.FloatTensor(next_states).to(device)
  103. actions = torch.FloatTensor(actions).unsqueeze(1).to(device)
  104. rewards = torch.FloatTensor(rewards).unsqueeze(1).to(device)
  105. with torch.no_grad(): # 为什么
  106. old_mu, old_std = net.old_policy(states)
  107. old_n = Normal(old_mu, old_std)
  108. value_target = rewards + net.gamma * net.value(next_states)
  109. advantage = value_target - net.value(states)
  110. mu, std = net.policy(states)
  111. n = Normal(mu, std)
  112. log_prob = n.log_prob(actions)
  113. old_log_prob = old_n.log_prob(actions)
  114. ratio = torch.exp(log_prob - old_log_prob)
  115. L1 = ratio * advantage
  116. L2 = torch.clamp(ratio, 0.8, 1.2) * advantage
  117. loss = torch.min(L1, L2)
  118. loss = - loss.mean()
  119. # writer.add_scalar('action loss', loss.item(), steps)
  120. optim.zero_grad()
  121. loss.backward()
  122. optim.step()
  123. #clear
  124. value_loss = F.mse_loss(value_target, net.value(states))
  125. value_optim.zero_grad()
  126. value_loss.backward()
  127. value_optim.step()
  128. net.replayer.clear()
  129. # writer.add_scalar('value loss', value_loss.item(), steps)
  130. if i % 10 == 0 and i!=0:
  131. print('Epoch:{}, episode reward is {}'.format(i, epoch_reward))
  132. torch.save(net.policy.state_dict(), "pendulun_para\\reward"+str(epoch_reward//10)+'ppo-policy.para')
  133. # net.history.log((i * 200), avg_reward=epoch_reward/10)
  134. # with net.canvasl:
  135. # net.canvasl.draw_plot(net.history["avg_reward"])
  136. epoch_reward = 0
