q_learning_exploration.py
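"""Q-learning with epsilon-greedy exploration on a small two-state environment.

An Agent learns a tabular Q-function over states {'A', 'B'} and actions {'1', '2'}
while interacting with a stochastic Environment, decaying its exploration rate
epsilon from 0.5 to 0.3 over the first few steps.
"""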
import random
MAX_STEPS = 10000            # total number of agent-environment interactions
MAX_EPSILON_RED_STEPS = 10   # steps over which epsilon decays from its initial to final value
class Environment:
    """Two-state MDP with stochastic transitions and noisy rewards."""

    def __init__(self):
        self.s = 'A'   # current state
        self.r = 0     # last reward

    def transition(self, a):
        if self.s == 'A':
            if a == '1':
                # Action '1' keeps the agent in A for a small immediate reward.
                self.s = 'A'
                self.r = 10 + random.randint(-3, 3)
            if a == '2':
                # Action '2' usually moves the agent to B, at an immediate cost.
                if random.uniform(0, 1) <= 0.99:
                    self.s = 'B'
                else:
                    self.s = 'A'
                self.r = -10 + random.randint(-3, 3)
        elif self.s == 'B':
            # From B the agent usually returns to A; both actions pay well here.
            if random.uniform(0, 1) <= 0.99:
                self.s = 'A'
            else:
                self.s = 'B'
            if a == '1':
                self.r = 40 + random.randint(-3, 3)
            if a == '2':
                self.r = 20 + random.randint(-3, 3)

    def act(self, a):
        self.transition(a)
        return self.r, self.s

class Agent:
    """Tabular Q-learning agent with a linearly decaying epsilon-greedy policy."""

    def __init__(self, eta, gamma, env):
        self.q_table = {'A': {'1': 0, '2': 0}, 'B': {'1': 0, '2': 0}}
        self.eta = eta        # learning rate
        self.gamma = gamma    # discount factor
        self.actions = ['1', '2']
        self.env = env
        self.epsilon = 0.5
        self.epsilon_init = 0.5
        self.epsilon_final = 0.3

    def execute_action(self, action):
        return self.env.act(action)

    def maxQ(self, s):
        # Highest Q-value available in state s.
        return max(self.q_table[s].values())

    def argmaxQ(self, s):
        # Greedy action: the action with the highest Q-value in state s.
        return max(self.q_table[s], key=lambda a: self.q_table[s][a])

    def updateQ(self, s, a, r, new_s):
        # Q-learning update: Q(s,a) += eta * (r + gamma * max_a' Q(s',a') - Q(s,a)).
        self.q_table[s][a] += self.eta * (r + self.gamma * self.maxQ(new_s) - self.q_table[s][a])

    def update_epsilon(self, t):
        # Decay epsilon linearly from epsilon_init to epsilon_final over MAX_EPSILON_RED_STEPS calls.
        if self.epsilon > self.epsilon_final:
            self.epsilon -= (self.epsilon_init - self.epsilon_final) / MAX_EPSILON_RED_STEPS

    def policy(self, s, t):
        # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
        if random.uniform(0, 1) <= self.epsilon:
            action = random.choice(self.actions)
        else:
            action = self.argmaxQ(s)
        self.update_epsilon(t)
        return action

env = Environment()
q_agent = Agent(0.1, 0.9, env)
s = env.s
for t in range(MAX_STEPS):
    a = q_agent.policy(s, t)
    print("time=" + str(t) + ", state=" + s + ", action=" + a)
    r, new_s = q_agent.execute_action(a)
    print("\n\t\t\t\t reward=" + str(r))
    q_agent.updateQ(s, a, r, new_s)
    s = new_s
print(q_agent.q_table)