def obtain_episode(self, policy, start_state, start_action, length):
    """Roll out a fixed-length trajectory from (start_state, start_action).

    The agent is placed at ``start_state`` and forced to take
    ``start_action`` on the first step; every later action is sampled
    from ``policy``. Exactly ``length`` transitions are recorded.

    NOTE(review): the ``done`` flag returned by ``env.step`` is ignored,
    so sampling continues past any episode termination — presumably
    intentional for fixed-length Monte-Carlo sampling; confirm against
    the intended algorithm.

    :param policy: per-state action distributions, indexable by state;
        ``policy[s]`` is a probability vector over actions.
    :param start_state: state index the rollout starts from.
    :param start_action: action forced at the first transition.
    :param length: number of transitions to collect.
    :return: list of dicts with keys ``state``, ``action``, ``reward``,
        ``next_state``, ``next_action``.
    """
    self.env.agent_location = self.env.state2pos(start_state)
    trajectory = []
    state, action = start_state, start_action
    for _ in range(length):
        _, reward, done, _, _ = self.env.step(action)
        successor = self.env.pos2state(self.env.agent_location)
        action_probs = policy[successor]
        successor_action = np.random.choice(np.arange(len(action_probs)),
                                            p=action_probs)
        trajectory.append({"state": state, "action": action, "reward": reward,
                           "next_state": successor, "next_action": successor_action})
        state, action = successor, successor_action
    return trajectory
您好:
请问 obtain_episode 函数在采样 episode 时,是按固定长度(length)采样后返回,而不是在一个 episode 自然结束(done)时返回。这与赵老师原书中的做法不一致。
这样写会对收敛效率有影响吗?