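"""Tabular reinforcement-learning algorithms for a gridworld.

Implements random policy/value initialization, iterative policy evaluation,
greedy policy improvement, value iteration, and first-visit Monte Carlo
prediction. The environment comes from the local grid module; GAMMA (the
discount factor) and EPSILON (the convergence threshold) come from constants.
"""
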
import numpy as np
from grid import Grid
from constants import GAMMA, EPSILON  # discount factor, convergence threshold
from actions import *

def init_random_policy(grid, policy=None):
    # Assign a uniformly random action to every state the grid exposes.
    if policy is None:
        policy = {}
    for s in grid.actions.keys():
        policy[s] = np.random.choice(grid.actions[s])
    return policy

def init_random_value_function(grid, V=None):
    # Random initial values for non-terminal states; terminal states stay at 0.
    if V is None:
        V = {}
    for s in grid.all_states():
        if not grid.is_terminal(s):
            V[s] = np.random.random()
        else:
            V[s] = 0
    return V

def iterative_policy_evaluation(grid, policy, V=None):
    # Policy evaluation: sweep the Bellman expectation update
    # V(s) = r + GAMMA * V(s') until the largest change falls below EPSILON.
    if V is None:
        V = {}
    S = grid.all_states()
    while True:
        delta = 0
        for s in S:
            old_v = V.get(s, 0)
            if not grid.is_terminal(s):
                grid.set_state(s)
                a = policy[s]
                r = grid.move(a)
                next_v = V.get(grid.get_state(), 0)
                V[s] = r + GAMMA * next_v
                delta = max(delta, abs(V[s] - old_v))
            else:
                V[s] = 0
        if delta < EPSILON:
            break
    return V

def policy_iteration(grid, V, policy=None):
    # Policy improvement: make the policy greedy with respect to V and report
    # whether it changed anywhere (unchanged means the policy has converged).
    if policy is None:
        policy = init_random_policy(grid)
    is_policy_converged = True
    for s in grid.all_states():
        if s in policy:
            old_a = policy[s]
            new_a = None
            best_value = float('-inf')
            for a in grid.actions[s]:
                grid.set_state(s)
                r = grid.move(a)
                v = r + GAMMA * V.get(grid.get_state(), 0)
                if v > best_value:
                    best_value = v
                    new_a = a
            policy[s] = new_a
            if new_a != old_a:
                is_policy_converged = False
    return policy, is_policy_converged

def value_iteration(grid, V=None):
    # Value iteration: sweep the Bellman optimality update
    # V(s) = max_a [r + GAMMA * V(s')] until the largest change
    # falls below EPSILON.
    if V is None:
        V = {}
    while True:
        delta = 0
        for s in grid.all_states():
            if grid.is_terminal(s):
                V[s] = 0
            else:
                old_v = V.get(s, 0)
                new_v = float('-inf')
                for a in grid.actions[s]:
                    grid.set_state(s)
                    r = grid.move(a)
                    v = r + GAMMA * V.get(grid.get_state(), 0)
                    if v > new_v:
                        new_v = v
                V[s] = new_v
                delta = max(delta, abs(new_v - old_v))
        if delta < EPSILON:
            break
    return V

def monte_carlo(grid, policy, V=None, iterations=1000):
    # First-visit Monte Carlo prediction: estimate V(s) as the average of the
    # returns G observed on the first visit to s across sampled episodes.
    if V is None:
        V = {}
    returns = {}
    for _ in range(iterations):
        grid.random_start_position()
        states_and_returns = grid.play(policy, delay=0, log=False)
        seen_states = set()
        for s, G in states_and_returns:
            if s not in seen_states:
                if s in returns:
                    returns[s].append(G)
                else:
                    returns[s] = [G]
                V[s] = np.mean(returns[s])
                seen_states.add(s)
    return V
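
# Minimal driver sketch: alternate policy evaluation and greedy improvement
# until the policy is stable, then compare the result against a Monte Carlo
# estimate under the same policy. This assumes Grid() can be constructed
# without arguments; adapt to however the grid module actually builds one.
if __name__ == "__main__":
    g = Grid()  # assumption: no-argument constructor
    policy = init_random_policy(g)
    V = init_random_value_function(g)
    while True:
        V = iterative_policy_evaluation(g, policy, V)
        policy, converged = policy_iteration(g, V, policy)
        if converged:
            break
    print("Greedy policy:", policy)
    print("V from policy iteration:", V)
    print("V from Monte Carlo rollouts:", monte_carlo(g, policy))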