
Deep Reinforcement Learning: Academic Frontiers and Practical Applications - DQN


(1) Training the network

  # one training episode: `observation` comes from env.reset(), `step` counts
  # environment steps, and x / y control when and how often learning happens
  while True:
      env.render()
      # choose an action for the current observation (epsilon-greedy)
      action = RL.choose_action(observation)
      # act in the environment and observe the resulting transition
      observation_, reward, done = env.step(action)
      # store the transition (s, a, r, s_) in the replay memory
      RL.store_transition(observation, action, reward, observation_)
      # start learning after x warm-up steps, then learn every y steps
      if (step > x) and (step % y == 0):
          RL.learn()
      observation = observation_
      if done:
          break
      step += 1
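
The loop above assumes that `env`, `RL`, `observation`, `step`, `x` and `y` already exist. A minimal setup sketch, assuming an environment object with a Gym-like interface whose `step()` returns `(observation_, reward, done)` as in the listing, and a `DeepQNetwork` class exposing the methods shown below (the class name, the constructor arguments and the values of x and y are illustrative, not from the original text):

  # hypothetical setup for the training loop above
  env = make_environment()                       # assumed factory for the environment
  RL = DeepQNetwork(n_actions=env.n_actions,     # assumed constructor signature
                    n_features=env.n_features,
                    learning_rate=0.01, reward_decay=0.9,
                    e_greedy=0.9, memory_size=2000, batch_size=32)

  step = 0          # total environment steps across all episodes
  x, y = 200, 5     # example values: warm-up steps and learning interval
  for episode in range(100):
      observation = env.reset()                  # initial observation of the episode
      # ... run the while-loop from the listing above here ...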

(2) Updating the network parameters

  def choose_action(self, observation):
      # add a batch dimension so the observation matches the placeholder shape
      observation = observation[np.newaxis, :]
      if np.random.uniform() < self.epsilon:
          # greedy action: pick the action with the largest estimated Q-value
          actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
          action = np.argmax(actions_value)
      else:
          # exploratory action: pick a random action
          action = np.random.randint(0, self.n_actions)
      return action
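
Note that in this convention `self.epsilon` is the probability of acting greedily, so a larger value means less exploration. Implementations of this class usually anneal it upward as training progresses; a minimal sketch of that idea, assuming attributes `epsilon_max` and `epsilon_increment` that are not shown in the text:

  # hypothetical epsilon schedule: raise the greedy probability a little after
  # every learning step until it reaches its upper bound (e.g. 0.9)
  def _update_epsilon(self):
      if self.epsilon < self.epsilon_max:
          self.epsilon = min(self.epsilon + self.epsilon_increment, self.epsilon_max)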

  def store_transition(self, s, a, r, s_):
      if not hasattr(self, 'memory_counter'):
          self.memory_counter = 0
      # flatten the transition into one row: [s, a, r, s_]
      transition = np.hstack((s, [a, r], s_))
      # overwrite the oldest row once the memory is full (ring buffer)
      index = self.memory_counter % self.memory_size
      self.memory[index, :] = transition
      self.memory_counter += 1
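
`store_transition` treats `self.memory` as a ring buffer, so the buffer itself, and the batch that `learn` operates on, have to be created elsewhere. A minimal sketch, assuming the memory is a NumPy array whose rows are the stacked `[s, a, r, s_]` vectors built above:

  import numpy as np

  # in __init__: one row per transition, of length n_features + 2 + n_features
  self.memory = np.zeros((self.memory_size, self.n_features * 2 + 2))

  # before a learning step: sample a random batch of stored transitions
  valid_rows = min(self.memory_counter, self.memory_size)
  sample_index = np.random.choice(valid_rows, size=self.batch_size)
  batch_memory = self.memory[sample_index, :]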

  

  def learn(self):
      # batch_memory is a batch sampled from the replay memory (see the sketch above);
      # q_eval and q_next are the evaluation- and target-network outputs for that batch
      q_target = q_eval.copy()
      batch_index = np.arange(self.batch_size, dtype=np.int32)
      eval_act_index = batch_memory[:, self.n_features].astype(int)
      reward = batch_memory[:, self.n_features + 1]
      # only the Q-value of the action actually taken is moved towards the TD target
      q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
      # note: this variant expects self.q_target to be a [None, n_actions] placeholder;
      # with the graph in _build_net below, which builds the target inside the graph,
      # learn() would instead feed the s, a, r and s_ columns of batch_memory directly
      _, self.cost = self.sess.run([self._train_op, self.loss],
                                   feed_dict={self.s: batch_memory[:, :self.n_features],
                                              self.q_target: q_target})
      self.cost_his.append(self.cost)
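
The in-place update of `q_target` implements the standard DQN regression target: for each sampled transition (s_i, a_i, r_i, s'_i) only the Q-value of the action that was actually taken is changed, which corresponds to

  y_i = r_i + \gamma \max_{a'} Q_{\theta^-}(s'_i, a'),
  \qquad L(\theta) = \frac{1}{N} \sum_{i=1}^{N} \bigl( y_i - Q_{\theta}(s_i, a_i) \bigr)^2,

where \theta are the evaluation-network weights being trained and \theta^- the (periodically copied) target-network weights.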

  

  def _build_net(self):
      # ---- all inputs ----
      self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')    # input State
      self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input Next State
      self.r = tf.placeholder(tf.float32, [None, ], name='r')                   # input Reward
      self.a = tf.placeholder(tf.int32, [None, ], name='a')                     # input Action
      w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)

      # ---- build evaluate_net ----
      with tf.variable_scope('online_net'):
          e1 = tf.layers.dense(self.s, 20, tf.nn.relu, kernel_initializer=w_initializer,
                               bias_initializer=b_initializer, name='e1')
          self.q_eval = tf.layers.dense(e1, self.n_actions, kernel_initializer=w_initializer,
                                        bias_initializer=b_initializer, name='q')

      # ---- build target_net ----
      with tf.variable_scope('target_net'):
          t1 = tf.layers.dense(self.s_, 20, tf.nn.relu, kernel_initializer=w_initializer,
                               bias_initializer=b_initializer, name='t1')
          self.q_next = tf.layers.dense(t1, self.n_actions, kernel_initializer=w_initializer,
                                        bias_initializer=b_initializer, name='t2')

      with tf.variable_scope('q_target'):
          # TD target built inside the graph; stop_gradient keeps it out of backprop
          q_target = self.r + self.gamma * tf.reduce_max(self.q_next, axis=1, name='Qmax_s_')  # shape = (None,)
          self.q_target = tf.stop_gradient(q_target)
      with tf.variable_scope('q_eval'):
          # pick, for every sample, the Q-value of the action that was actually taken
          a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
          self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices)  # shape = (None,)
      with tf.variable_scope('loss'):
          self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval_wrt_a, name='TD_error'))
      with tf.variable_scope('train'):
          self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
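
The listing above creates two separate variable scopes, 'online_net' and 'target_net', but does not show how the target network is kept up to date. In the usual DQN training scheme the online weights are copied into the target network every fixed number of learning steps; a minimal TF1-style sketch under that assumption (the op name, `learn_step_counter` and the `replace_target_iter` hyperparameter are illustrative, not from the original text):

  # collect the variables of both networks by the scope names used above
  t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
  e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='online_net')

  # op that copies every online-net variable into its target-net counterpart
  with tf.variable_scope('hard_replacement'):
      self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

  # inside learn(), before the gradient step:
  if self.learn_step_counter % self.replace_target_iter == 0:
      self.sess.run(self.target_replace_op)
  self.learn_step_counter += 1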
