From 348985ed79078ee2d903b08a60e0109b27722a46 Mon Sep 17 00:00:00 2001 From: Neal McBurnett Date: Sun, 29 Jul 2012 13:25:05 -0600 Subject: [PATCH 01/19] Add test for Queues: ensure member is not there after being popped --- utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/utils.py b/utils.py index c167589..dce9dfe 100644 --- a/utils.py +++ b/utils.py @@ -855,7 +855,13 @@ def fixup(test): ... q.extend(nums) ... for num in nums: assert num in q ... assert 42 not in q -... return [q.pop() for i in range(len(q))] +... result = [] +... for i in range(len(q)): +... num = q.pop() +... assert num not in q +... result.append(num) +... return result + >>> qtest(Stack()) [0, 3, 4, 99, -99, 6, 5, 7, 2, 8, 1] From 6c0eef780a024cbbc7f814c2407baadbaf20d906 Mon Sep 17 00:00:00 2001 From: Neal McBurnett Date: Sun, 29 Jul 2012 14:08:16 -0600 Subject: [PATCH 02/19] Speed up PriorityQueue.__contains__() from linear to constant time, fixing Issue 31 (quadratic search times). Note that __getitem__() and __delitem__() are still linear time, but could easily be improved also if necessary. --- utils.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/utils.py b/utils.py index dce9dfe..8356337 100644 --- a/utils.py +++ b/utils.py @@ -734,17 +734,22 @@ class PriorityQueue(Queue): Also supports dict-like lookup.""" def __init__(self, order=min, f=lambda x: x): update(self, A=[], order=order, f=f) + self.membership = {} def append(self, item): bisect.insort(self.A, (self.f(item), item)) + hashval = hash(item) + self.membership[hashval] = self.membership.get(hashval, 0) + 1 def __len__(self): return len(self.A) def pop(self): if self.order == min: - return self.A.pop(0)[1] + item = self.A.pop(0)[1] else: - return self.A.pop()[1] + item = self.A.pop()[1] + self._remove_(item) + return item def __contains__(self, item): - return some(lambda (_, x): x == item, self.A) + return hash(item) in self.membership def __getitem__(self, key): for _, item in self.A: if item == key: @@ -752,8 +757,15 @@ def __getitem__(self, key): def __delitem__(self, key): for i, (value, item) in enumerate(self.A): if item == key: - self.A.pop(i) + item = self.A.pop(i) + self._remove_(item) return + def _remove_(self, item): + hashval = hash(item) + self.membership[hashval] -= 1 + if self.membership[hashval] == 0: + del self.membership[hashval] + ## Fig: The idea is we can define things like Fig[3,10] later. ## Alas, it is Fig[3,10] not Fig[3.10], because that would be the same @@ -858,8 +870,9 @@ def fixup(test): ... result = [] ... for i in range(len(q)): ... num = q.pop() -... assert num not in q +... assert num not in q # num could appear multiple times, in which case this would fail ... result.append(num) +... ... return result >>> qtest(Stack()) From 3b81f1776fac34b7659caf8e8055c815a2a95864 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Thu, 6 Dec 2012 13:16:01 +0800 Subject: [PATCH 03/19] fixed bug in value iteration when gamma == 1 --- mdp.py | 343 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 172 insertions(+), 171 deletions(-) diff --git a/mdp.py b/mdp.py index e5142c1..d8057ce 100644 --- a/mdp.py +++ b/mdp.py @@ -1,171 +1,172 @@ -"""Markov Decision Processes (Chapter 17) - -First we define an MDP, and the special case of a GridMDP, in which -states are laid out in a 2-dimensional grid. We also represent a policy -as a dictionary of {state:action} pairs, and a Utility function as a -dictionary of {state:number} pairs. 
We then define the value_iteration -and policy_iteration algorithms.""" - -from utils import * - -class MDP: - """A Markov Decision Process, defined by an initial state, transition model, - and reward function. We also keep track of a gamma value, for use by - algorithms. The transition model is represented somewhat differently from - the text. Instead of P(s' | s, a) being a probability number for each - state/state/action triplet, we instead have T(s, a) return a list of (p, s') - pairs. We also keep track of the possible states, terminal states, and - actions for each state. [page 646]""" - - def __init__(self, init, actlist, terminals, gamma=.9): - update(self, init=init, actlist=actlist, terminals=terminals, - gamma=gamma, states=set(), reward={}) - - def R(self, state): - "Return a numeric reward for this state." - return self.reward[state] - - def T(self, state, action): - """Transition model. From a state and an action, return a list - of (probability, result-state) pairs.""" - abstract - - def actions(self, state): - """Set of actions that can be performed in this state. By default, a - fixed list of actions, except for terminal states. Override this - method if you need to specialize by state.""" - if state in self.terminals: - return [None] - else: - return self.actlist - -class GridMDP(MDP): - """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is - specify the grid as a list of lists of rewards; use None for an obstacle - (unreachable state). Also, you should specify the terminal states. - An action is an (x, y) unit vector; e.g. (1, 0) means move east.""" - def __init__(self, grid, terminals, init=(0, 0), gamma=.9): - grid.reverse() ## because we want row 0 on bottom, not on top - MDP.__init__(self, init, actlist=orientations, - terminals=terminals, gamma=gamma) - update(self, grid=grid, rows=len(grid), cols=len(grid[0])) - for x in range(self.cols): - for y in range(self.rows): - self.reward[x, y] = grid[y][x] - if grid[y][x] is not None: - self.states.add((x, y)) - - def T(self, state, action): - if action is None: - return [(0.0, state)] - else: - return [(0.8, self.go(state, action)), - (0.1, self.go(state, turn_right(action))), - (0.1, self.go(state, turn_left(action)))] - - def go(self, state, direction): - "Return the state that results from going in this direction." - state1 = vector_add(state, direction) - return if_(state1 in self.states, state1, state) - - def to_grid(self, mapping): - """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid.""" - return list(reversed([[mapping.get((x,y), None) - for x in range(self.cols)] - for y in range(self.rows)])) - - def to_arrows(self, policy): - chars = {(1, 0):'>', (0, 1):'^', (-1, 0):'<', (0, -1):'v', None: '.'} - return self.to_grid(dict([(s, chars[a]) for (s, a) in policy.items()])) - -#______________________________________________________________________________ - -Fig[17,1] = GridMDP([[-0.04, -0.04, -0.04, +1], - [-0.04, None, -0.04, -1], - [-0.04, -0.04, -0.04, -0.04]], - terminals=[(3, 2), (3, 1)]) - -#______________________________________________________________________________ - -def value_iteration(mdp, epsilon=0.001): - "Solving an MDP by value iteration. [Fig. 
17.4]" - U1 = dict([(s, 0) for s in mdp.states]) - R, T, gamma = mdp.R, mdp.T, mdp.gamma - while True: - U = U1.copy() - delta = 0 - for s in mdp.states: - U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)]) - for a in mdp.actions(s)]) - delta = max(delta, abs(U1[s] - U[s])) - if delta < epsilon * (1 - gamma) / gamma: - return U - -def best_policy(mdp, U): - """Given an MDP and a utility function U, determine the best policy, - as a mapping from state to action. (Equation 17.4)""" - pi = {} - for s in mdp.states: - pi[s] = argmax(mdp.actions(s), lambda a:expected_utility(a, s, U, mdp)) - return pi - -def expected_utility(a, s, U, mdp): - "The expected utility of doing a in state s, according to the MDP and U." - return sum([p * U[s1] for (p, s1) in mdp.T(s, a)]) - -#______________________________________________________________________________ - -def policy_iteration(mdp): - "Solve an MDP by policy iteration [Fig. 17.7]" - U = dict([(s, 0) for s in mdp.states]) - pi = dict([(s, random.choice(mdp.actions(s))) for s in mdp.states]) - while True: - U = policy_evaluation(pi, U, mdp) - unchanged = True - for s in mdp.states: - a = argmax(mdp.actions(s), lambda a: expected_utility(a,s,U,mdp)) - if a != pi[s]: - pi[s] = a - unchanged = False - if unchanged: - return pi - -def policy_evaluation(pi, U, mdp, k=20): - """Return an updated utility mapping U from each state in the MDP to its - utility, using an approximation (modified policy iteration).""" - R, T, gamma = mdp.R, mdp.T, mdp.gamma - for i in range(k): - for s in mdp.states: - U[s] = R(s) + gamma * sum([p * U[s1] for (p, s1) in T(s, pi[s])]) - return U - -__doc__ += """ ->>> pi = best_policy(Fig[17,1], value_iteration(Fig[17,1], .01)) - ->>> Fig[17,1].to_arrows(pi) -[['>', '>', '>', '.'], ['^', None, '^', '.'], ['^', '>', '^', '<']] - ->>> print_table(Fig[17,1].to_arrows(pi)) -> > > . -^ None ^ . -^ > ^ < - ->>> print_table(Fig[17,1].to_arrows(policy_iteration(Fig[17,1]))) -> > > . -^ None ^ . -^ > ^ < -""" - -__doc__ += random_tests(""" ->>> pi -{(3, 2): None, (3, 1): None, (3, 0): (-1, 0), (2, 1): (0, 1), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (0, 1), (1, 2): (1, 0), (2, 0): (0, 1), (0, 1): (0, 1), (2, 2): (1, 0)} - ->>> value_iteration(Fig[17,1], .01) -{(3, 2): 1.0, (3, 1): -1.0, (3, 0): 0.12958868267972745, (0, 1): 0.39810203830605462, (0, 2): 0.50928545646220924, (1, 0): 0.25348746162470537, (0, 0): 0.29543540628363629, (1, 2): 0.64958064617168676, (2, 0): 0.34461306281476806, (2, 1): 0.48643676237737926, (2, 2): 0.79536093684710951} - ->>> policy_iteration(Fig[17,1]) -{(3, 2): None, (3, 1): None, (3, 0): (0, -1), (2, 1): (-1, 0), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (1, 0), (1, 2): (1, 0), (2, 0): (1, 0), (0, 1): (1, 0), (2, 2): (1, 0)} - -""") - - +"""Markov Decision Processes (Chapter 17) + +First we define an MDP, and the special case of a GridMDP, in which +states are laid out in a 2-dimensional grid. We also represent a policy +as a dictionary of {state:action} pairs, and a Utility function as a +dictionary of {state:number} pairs. We then define the value_iteration +and policy_iteration algorithms.""" + +from utils import * + +class MDP: + """A Markov Decision Process, defined by an initial state, transition model, + and reward function. We also keep track of a gamma value, for use by + algorithms. The transition model is represented somewhat differently from + the text. 
Instead of P(s' | s, a) being a probability number for each + state/state/action triplet, we instead have T(s, a) return a list of (p, s') + pairs. We also keep track of the possible states, terminal states, and + actions for each state. [page 646]""" + + def __init__(self, init, actlist, terminals, gamma=.9): + update(self, init=init, actlist=actlist, terminals=terminals, + gamma=gamma, states=set(), reward={}) + + def R(self, state): + "Return a numeric reward for this state." + return self.reward[state] + + def T(self, state, action): + """Transition model. From a state and an action, return a list + of (probability, result-state) pairs.""" + abstract + + def actions(self, state): + """Set of actions that can be performed in this state. By default, a + fixed list of actions, except for terminal states. Override this + method if you need to specialize by state.""" + if state in self.terminals: + return [None] + else: + return self.actlist + +class GridMDP(MDP): + """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is + specify the grid as a list of lists of rewards; use None for an obstacle + (unreachable state). Also, you should specify the terminal states. + An action is an (x, y) unit vector; e.g. (1, 0) means move east.""" + def __init__(self, grid, terminals, init=(0, 0), gamma=.9): + grid.reverse() ## because we want row 0 on bottom, not on top + MDP.__init__(self, init, actlist=orientations, + terminals=terminals, gamma=gamma) + update(self, grid=grid, rows=len(grid), cols=len(grid[0])) + for x in range(self.cols): + for y in range(self.rows): + self.reward[x, y] = grid[y][x] + if grid[y][x] is not None: + self.states.add((x, y)) + + def T(self, state, action): + if action is None: + return [(0.0, state)] + else: + return [(0.8, self.go(state, action)), + (0.1, self.go(state, turn_right(action))), + (0.1, self.go(state, turn_left(action)))] + + def go(self, state, direction): + "Return the state that results from going in this direction." + state1 = vector_add(state, direction) + return if_(state1 in self.states, state1, state) + + def to_grid(self, mapping): + """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid.""" + return list(reversed([[mapping.get((x,y), None) + for x in range(self.cols)] + for y in range(self.rows)])) + + def to_arrows(self, policy): + chars = {(1, 0):'>', (0, 1):'^', (-1, 0):'<', (0, -1):'v', None: '.'} + return self.to_grid(dict([(s, chars[a]) for (s, a) in policy.items()])) + +#______________________________________________________________________________ + +Fig[17,1] = GridMDP([[-0.04, -0.04, -0.04, +1], + [-0.04, None, -0.04, -1], + [-0.04, -0.04, -0.04, -0.04]], + terminals=[(3, 2), (3, 1)]) + +#______________________________________________________________________________ + +def value_iteration(mdp, epsilon=0.001): + "Solving an MDP by value iteration. [Fig. 17.4]" + U1 = dict([(s, 0) for s in mdp.states]) + R, T, gamma = mdp.R, mdp.T, mdp.gamma + while True: + U = U1.copy() + delta = 0 + for s in mdp.states: + U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)]) + for a in mdp.actions(s)]) + delta = max(delta, abs(U1[s] - U[s])) + if (((gamma < 1) and (delta < epsilon * (1 - gamma) / gamma)) or + ((gamma == 1) and (delta < epsilon))): # allows for gamma to be 1 + return U + +def best_policy(mdp, U): + """Given an MDP and a utility function U, determine the best policy, + as a mapping from state to action. 
(Equation 17.4)""" + pi = {} + for s in mdp.states: + pi[s] = argmax(mdp.actions(s), lambda a:expected_utility(a, s, U, mdp)) + return pi + +def expected_utility(a, s, U, mdp): + "The expected utility of doing a in state s, according to the MDP and U." + return sum([p * U[s1] for (p, s1) in mdp.T(s, a)]) + +#______________________________________________________________________________ + +def policy_iteration(mdp): + "Solve an MDP by policy iteration [Fig. 17.7]" + U = dict([(s, 0) for s in mdp.states]) + pi = dict([(s, random.choice(mdp.actions(s))) for s in mdp.states]) + while True: + U = policy_evaluation(pi, U, mdp) + unchanged = True + for s in mdp.states: + a = argmax(mdp.actions(s), lambda a: expected_utility(a,s,U,mdp)) + if a != pi[s]: + pi[s] = a + unchanged = False + if unchanged: + return pi + +def policy_evaluation(pi, U, mdp, k=20): + """Return an updated utility mapping U from each state in the MDP to its + utility, using an approximation (modified policy iteration).""" + R, T, gamma = mdp.R, mdp.T, mdp.gamma + for i in range(k): + for s in mdp.states: + U[s] = R(s) + gamma * sum([p * U[s1] for (p, s1) in T(s, pi[s])]) + return U + +__doc__ += """ +>>> pi = best_policy(Fig[17,1], value_iteration(Fig[17,1], .01)) + +>>> Fig[17,1].to_arrows(pi) +[['>', '>', '>', '.'], ['^', None, '^', '.'], ['^', '>', '^', '<']] + +>>> print_table(Fig[17,1].to_arrows(pi)) +> > > . +^ None ^ . +^ > ^ < + +>>> print_table(Fig[17,1].to_arrows(policy_iteration(Fig[17,1]))) +> > > . +^ None ^ . +^ > ^ < +""" + +__doc__ += random_tests(""" +>>> pi +{(3, 2): None, (3, 1): None, (3, 0): (-1, 0), (2, 1): (0, 1), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (0, 1), (1, 2): (1, 0), (2, 0): (0, 1), (0, 1): (0, 1), (2, 2): (1, 0)} + +>>> value_iteration(Fig[17,1], .01) +{(3, 2): 1.0, (3, 1): -1.0, (3, 0): 0.12958868267972745, (0, 1): 0.39810203830605462, (0, 2): 0.50928545646220924, (1, 0): 0.25348746162470537, (0, 0): 0.29543540628363629, (1, 2): 0.64958064617168676, (2, 0): 0.34461306281476806, (2, 1): 0.48643676237737926, (2, 2): 0.79536093684710951} + +>>> policy_iteration(Fig[17,1]) +{(3, 2): None, (3, 1): None, (3, 0): (0, -1), (2, 1): (-1, 0), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (1, 0), (1, 2): (1, 0), (2, 0): (1, 0), (0, 1): (1, 0), (2, 2): (1, 0)} + +""") + + From 14eaf6ee54bf77202c92c2a5b83da7f5478baf4b Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Fri, 7 Dec 2012 19:48:14 +0800 Subject: [PATCH 04/19] added code for passive adp original code from Steve Klebanoff at https://github.com/steveklebanoff/AIMA-Python-Reinforcement-Learning/blob/master/passive_adp.py --- passive_adp.py | 318 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 318 insertions(+) create mode 100644 passive_adp.py diff --git a/passive_adp.py b/passive_adp.py new file mode 100644 index 0000000..bb55f25 --- /dev/null +++ b/passive_adp.py @@ -0,0 +1,318 @@ +import logging + +from mdp import GridMDP, MDP, value_iteration, policy_evaluation +from utils import turn_left, turn_right +from optparse import OptionParser +from random import randint +from time import time +from itertools import product + +class GridMDP(GridMDP): + + char_switch = { + '>' : (1,0), + '^' : (0,1), + '<' : (-1, 0), + '.' 
: None + } + + # TODO: this and the next should be static methods + def char_to_tuple(self, direction): + return self.char_switch[direction] + + def tuple_to_char(self, tuple): + for k,v in self.char_switch.items(): + if v == tuple: + return k + + return None + + def simulate_move(self, state, action): + # TODO: get percentages from T + random_number = randint(0, 100) + if (random_number >= 0) and (random_number <= 9): + return self.go(state, turn_right(action)) + elif (random_number >= 10) and (random_number <= 20): + return self.go(state, turn_left(action)) + else: + return self.go(state, action) + + +class MyMDP(MDP): + """ Extends MDP class to use a dictionary transistion model """ + + def __init__(self, init, actlist, terminals, gamma=.9): + MDP.__init__(self,init, actlist, terminals, gamma) + self.model = { } + + def R(self, state): + " Return a numeric reward for this state. " + if state in self.reward: + return self.reward[state] + else: + # TODO: this should really return zero? or return False beause we + # don't know. Returns 0 for now as it makes the value iteration + # function work + return 0 + #raise Exception('tried to get reward of state we dont have yet %s' % str(state)) + + def T(self, state, action): + " Returns a list of tuples with probabilities for states " + try: + possible_results_and_probabilities = self.model[state][action] + except KeyError: + return [] + + l = [] + for result_state, probability in possible_results_and_probabilities.items(): + l.append((probability, result_state)) + return l + + def T_add(self, state, action, result_state, probability): + " Adds a value to the transistion model " + if (state in self.model) and (action in self.model[state]): + self.model[state][action][result_state] = probability + elif (state in self.model): + self.model[state][action] = { result_state : probability } + else: + self.model[state] = {action : { result_state : probability} } + +class PassiveADPAgent(object): + + def __init__(self, action_mdp, policy): + self.mdp = MyMDP(init=(0, 0), + actlist=[(1,0), (0, 1), (-1, 0), (0, -1)], + terminals=action_mdp.terminals, + gamma = 0.9) + self.action_mdp = action_mdp + self.utility, self.outcome_freq = { }, { } + self.reached_states = set([]) + self.previous_state, self.previous_action = None, None + self.create_policy_and_states(policy) + self.create_empty_sa_freq() + + def create_empty_sa_freq(self): + " Creates state action frequences with inital values of 0 " + self.sa_freq = { } + for state in self.mdp.states: + self.sa_freq[state] = { } + for action in self.mdp.actlist: + self.sa_freq[state][action] = 0.0 + + def create_policy_and_states(self, policy): + " Sets the initial policy, and also sets the mdp's states " + self.policy = {} + self.mdp.states = set() + + ## Reverse because we want row 0 on bottom, not on top + policy.reverse() + self.rows, self.cols = len(policy), len(policy[0]) + for x in range(self.cols): + for y in range(self.rows): + # Convert arrows to numbers + if policy[y][x] == None: + self.policy[x, y] = None + else: + self.policy[x, y] = self.action_mdp.char_to_tuple(policy[y][x]) + + # States are all non-none values + if policy[y][x] is not None: + self.mdp.states.add((x, y)) + + def add_state_action_pair_frequency(self, state, action): + self.sa_freq[state][action] += 1 + + def get_state_action_pair_frequency(self, state, action): + return self.sa_freq[state][action] + + def add_outcome_frequency(self, state, action, outcome): + # We haven't seen this state yet + if state not in self.outcome_freq: + 
self.outcome_freq[state] = {action : {outcome : 1}} + return + + # We've seen the state but not the action + if action not in self.outcome_freq[state]: + self.outcome_freq[state][action] = {outcome : 1} + return + + # We've seen the state and the action, but not the outcome + if outcome not in self.outcome_freq[state][action]: + self.outcome_freq[state][action][outcome] = 1 + return + + # We've seen the state, action, and outcome, add 1 + self.outcome_freq[state][action][outcome] += 1 + + def get_outcome_frequency(self, state, action, outcome): + try: + return self.outcome_freq[state][action][outcome] + except KeyError: + return 0 + + def print_outcome_frequency(self): + for state in agent.outcome_freq: + for action in agent.outcome_freq[state]: + for result_state, result_frequency in agent.outcome_freq[state][action].items(): + print 'state', state, '\t action', action, \ + '\t result state',result_state, '\t frequency', result_frequency + + + def get_move_from_policy(self, state_x, state_y): + return self.policy[state_x][state_y] + + def next_action(self, current_state, current_reward): + # policy = self.policy computed by constructor + # MDP = mdp object. self.mdp + # MDP.T - transistion model (initially empty), + # MDP.reward - reward + # MDP gamma in initializer + # utility = dictionary [(0,0)] = 0.57 etc + # state action frequencies = sa_freq (dict) initially empty + # outcome frequencies given state outcome and state-action pairs = outcome_freq initially empty + # dict with key being new state, value being another dict with keys being + # state, action pairs and values being that percentage + # previous state, previous action = s,a + + # if s' is new then: + if (current_state not in self.reached_states): + # U[s'] <- r' + self.utility[current_state] = current_reward + + # R[s'] <- r' + self.mdp.reward[current_state] = current_reward + + # Make sure we know we have seen it before + self.reached_states.add(current_state) + + # if s is not null + if self.previous_state is not None: + # increment Nsa[s,a] and Ns'|sa[s', s, a] + self.add_state_action_pair_frequency(self.previous_state, self.previous_action) + self.add_outcome_frequency(self.previous_state, self.previous_action, current_state) + + # for each t such that Ns'|sa[t,s,a] is nonzero: + for state in agent.outcome_freq: + for action in agent.outcome_freq[state]: + for result_state, result_frequency in agent.outcome_freq[state][action].items(): + if result_frequency > 0: + # P (t, s, a) <- Ns'|sa[t, s, a] / Nsa[s,a] + # Update the model to be: + # ((freq of this action happening with this state action pair) + # / (total freq of this state action pair combo)) + probability = result_frequency / self.get_state_action_pair_frequency(state, action) + self.mdp.T_add(state, action, result_state, probability) + + self.utility = policy_evaluation(self.policy, self.utility, self.mdp) + + # if s'.TERMINAL? 
+ # If we're at a terminal we don't want a next move + if current_state in self.mdp.terminals: + logging.info('Reached terminal state %s' % str(current_state)) + # s,a <- null + self.previous_state, self.previous_action = None, None + return False + else: + # s,a <- s', policy[s'] + next_action = self.policy[current_state] + self.previous_state, self.previous_action = current_state, next_action + # Return the next action that the policy dictates + return next_action + + + def execute_trial(self): + # Start at initial state + current_state = self.mdp.init + + # Keep going until we get to a terminal state + while True: + logging.info('--------------------------') + + # Get reward for current state + current_reward = self.action_mdp.R(current_state) + + # Calculate move from current state + next_action = self.next_action(current_state, current_reward) + + logging.info('Current State: %s ' % str(current_state)) + logging.info('Current Reward: %s ' % current_reward) + logging.info('Next action: %s' % self.action_mdp.tuple_to_char(next_action)) + + if next_action == False: + # End because next_action told us to + logging.info('Next_action returned false, stopping') + break + + # Get new current_state + current_state = self.action_mdp.simulate_move(current_state, next_action) + +if __name__ == '__main__': + ''' Parses options from command line, creates Fig 17,1, runs the passive + adp agent on it certain amount of times, outputs info and utilities ''' + + # Setup file options + parser = OptionParser() + parser.add_option("-t", "--times", dest="times", type="int", default = 100, + help="times to run") + parser.add_option("-d", "--debug", action='store_true', dest="debug", + default=False, help="debug mode?") + parser.add_option("-i", "--info", action='store_true', dest="info", + default=False, help="info mode?") + parser.add_option("-f", "--file", dest="log_file", + default=False, help="file to log to") + (options, args) = parser.parse_args() + + if options.debug: + level = logging.DEBUG + elif options.info: + level = logging.INFO + else: + level = logging.CRITICAL + + format = '%(levelname)s: %(message)s' + if options.log_file: + logging.basicConfig(level=level, + filename=options.log_file, + filemode='w', + format=format) + else: + logging.basicConfig(level=level, + format=format) + + # Set up grid MDP to act on + Fig = {} + Fig[17,1] = GridMDP([[-0.04, -0.04, -0.04, +1.0], + [-0.04, None, -0.04, -1.0], + [-0.04, -0.04, -0.04, -0.04]], + terminals=[(3, 2), (3, 1)]) + + # Setup values + policy = [['>', '>', '>', '.'], + ['^', None, '^', '.'], + ['^', '<', '<', '<']] + + # Create agent + agent = PassiveADPAgent(Fig[17,1], policy) + + # Start timing + time_start = time() + logging.info('Start at %s' % time_start) + + # Execute a bunch of trials + trials = options.times + for i in range (0,trials): + agent.execute_trial() + + # End timing + time_end = time() + logging.info('End at %s' % time_end) + + seconds_elapsed = time_end - time_start + minutes_elapsed = seconds_elapsed / 60.0 + + # Print and log final results + final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ + ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.utility))) + for result in final_results: + logging.info(result) + print result From 36237b40cd6603310b9badfc6db49e57f2cf8b72 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Fri, 7 Dec 2012 22:40:21 +0800 Subject: [PATCH 05/19] refactored the code for passive_adp to look more like the pseudo-code, and removed the logging --- 
passive_adp.py | 378 ++++++++++++------------------------------------- 1 file changed, 94 insertions(+), 284 deletions(-) diff --git a/passive_adp.py b/passive_adp.py index bb55f25..626b9c0 100644 --- a/passive_adp.py +++ b/passive_adp.py @@ -1,54 +1,24 @@ -import logging - -from mdp import GridMDP, MDP, value_iteration, policy_evaluation -from utils import turn_left, turn_right -from optparse import OptionParser -from random import randint +from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig +from utils import turn_left, turn_right, update +from random import random from time import time -from itertools import product - -class GridMDP(GridMDP): - - char_switch = { - '>' : (1,0), - '^' : (0,1), - '<' : (-1, 0), - '.' : None - } - - # TODO: this and the next should be static methods - def char_to_tuple(self, direction): - return self.char_switch[direction] - - def tuple_to_char(self, tuple): - for k,v in self.char_switch.items(): - if v == tuple: - return k - - return None - - def simulate_move(self, state, action): - # TODO: get percentages from T - random_number = randint(0, 100) - if (random_number >= 0) and (random_number <= 9): - return self.go(state, turn_right(action)) - elif (random_number >= 10) and (random_number <= 20): - return self.go(state, turn_left(action)) - else: - return self.go(state, action) - class MyMDP(MDP): """ Extends MDP class to use a dictionary transistion model """ - - def __init__(self, init, actlist, terminals, gamma=.9): - MDP.__init__(self,init, actlist, terminals, gamma) - self.model = { } - - def R(self, state): - " Return a numeric reward for this state. " - if state in self.reward: - return self.reward[state] + def __init__(self, mdp): + MDP.__init__(self, + mdp.init, + mdp.actlist, + mdp.terminals, + mdp.gamma) + update(self, + P = {}, + states = mdp.states) + + def R(self, s): + """Return a numeric reward for the state s""" + if s in self.reward: + return self.reward[s] else: # TODO: this should really return zero? or return False beause we # don't know. 
Returns 0 for now as it makes the value iteration @@ -56,263 +26,103 @@ def R(self, state): return 0 #raise Exception('tried to get reward of state we dont have yet %s' % str(state)) - def T(self, state, action): - " Returns a list of tuples with probabilities for states " + def T(self, s, a): + """Returns a list of tuples with probabilities for states""" try: - possible_results_and_probabilities = self.model[state][action] + return [(p,s) for (s,p) in self.P[s][a].items()] except KeyError: - return [] - - l = [] - for result_state, probability in possible_results_and_probabilities.items(): - l.append((probability, result_state)) - return l + return [] # return an empty list - def T_add(self, state, action, result_state, probability): + def T_add(self, (s,a,t), p): " Adds a value to the transistion model " - if (state in self.model) and (action in self.model[state]): - self.model[state][action][result_state] = probability - elif (state in self.model): - self.model[state][action] = { result_state : probability } + if (s in self.P) and (a in self.P[s]): + self.P[s][a][t] = p + elif (s in self.P): + self.P[s][a] = {t:p} else: - self.model[state] = {action : { result_state : probability} } + self.P[s] = {a:{t:p}} class PassiveADPAgent(object): - def __init__(self, action_mdp, policy): - self.mdp = MyMDP(init=(0, 0), - actlist=[(1,0), (0, 1), (-1, 0), (0, -1)], - terminals=action_mdp.terminals, - gamma = 0.9) - self.action_mdp = action_mdp - self.utility, self.outcome_freq = { }, { } - self.reached_states = set([]) - self.previous_state, self.previous_action = None, None - self.create_policy_and_states(policy) - self.create_empty_sa_freq() - - def create_empty_sa_freq(self): - " Creates state action frequences with inital values of 0 " - self.sa_freq = { } - for state in self.mdp.states: - self.sa_freq[state] = { } - for action in self.mdp.actlist: - self.sa_freq[state][action] = 0.0 - - def create_policy_and_states(self, policy): - " Sets the initial policy, and also sets the mdp's states " - self.policy = {} - self.mdp.states = set() - - ## Reverse because we want row 0 on bottom, not on top - policy.reverse() - self.rows, self.cols = len(policy), len(policy[0]) - for x in range(self.cols): - for y in range(self.rows): - # Convert arrows to numbers - if policy[y][x] == None: - self.policy[x, y] = None - else: - self.policy[x, y] = self.action_mdp.char_to_tuple(policy[y][x]) - - # States are all non-none values - if policy[y][x] is not None: - self.mdp.states.add((x, y)) - - def add_state_action_pair_frequency(self, state, action): - self.sa_freq[state][action] += 1 - - def get_state_action_pair_frequency(self, state, action): - return self.sa_freq[state][action] - - def add_outcome_frequency(self, state, action, outcome): - # We haven't seen this state yet - if state not in self.outcome_freq: - self.outcome_freq[state] = {action : {outcome : 1}} - return - - # We've seen the state but not the action - if action not in self.outcome_freq[state]: - self.outcome_freq[state][action] = {outcome : 1} - return - - # We've seen the state and the action, but not the outcome - if outcome not in self.outcome_freq[state][action]: - self.outcome_freq[state][action][outcome] = 1 - return - - # We've seen the state, action, and outcome, add 1 - self.outcome_freq[state][action][outcome] += 1 - - def get_outcome_frequency(self, state, action, outcome): - try: - return self.outcome_freq[state][action][outcome] - except KeyError: - return 0 - - def print_outcome_frequency(self): - for state in 
agent.outcome_freq: - for action in agent.outcome_freq[state]: - for result_state, result_frequency in agent.outcome_freq[state][action].items(): - print 'state', state, '\t action', action, \ - '\t result state',result_state, '\t frequency', result_frequency - - - def get_move_from_policy(self, state_x, state_y): - return self.policy[state_x][state_y] - - def next_action(self, current_state, current_reward): - # policy = self.policy computed by constructor - # MDP = mdp object. self.mdp - # MDP.T - transistion model (initially empty), - # MDP.reward - reward - # MDP gamma in initializer - # utility = dictionary [(0,0)] = 0.57 etc - # state action frequencies = sa_freq (dict) initially empty - # outcome frequencies given state outcome and state-action pairs = outcome_freq initially empty - # dict with key being new state, value being another dict with keys being - # state, action pairs and values being that percentage - # previous state, previous action = s,a - - # if s' is new then: - if (current_state not in self.reached_states): - # U[s'] <- r' - self.utility[current_state] = current_reward - - # R[s'] <- r' - self.mdp.reward[current_state] = current_reward - - # Make sure we know we have seen it before - self.reached_states.add(current_state) - - # if s is not null - if self.previous_state is not None: - # increment Nsa[s,a] and Ns'|sa[s', s, a] - self.add_state_action_pair_frequency(self.previous_state, self.previous_action) - self.add_outcome_frequency(self.previous_state, self.previous_action, current_state) - - # for each t such that Ns'|sa[t,s,a] is nonzero: - for state in agent.outcome_freq: - for action in agent.outcome_freq[state]: - for result_state, result_frequency in agent.outcome_freq[state][action].items(): - if result_frequency > 0: - # P (t, s, a) <- Ns'|sa[t, s, a] / Nsa[s,a] - # Update the model to be: - # ((freq of this action happening with this state action pair) - # / (total freq of this state action pair combo)) - probability = result_frequency / self.get_state_action_pair_frequency(state, action) - self.mdp.T_add(state, action, result_state, probability) + def __init__(self, action_mdp, pi): + update(self, + pi = pi, + mdp = MyMDP(action_mdp), + action_mdp = action_mdp, + U = {}, + Ns_sa = {s:{a:{t:0 for (p,t) in action_mdp.T(s,a)} + for a in action_mdp.actlist} + for s in action_mdp.states}, + Nsa = {s:{a:0. for a in action_mdp.actlist} + for s in action_mdp.states}, + s = None, + a = None) - self.utility = policy_evaluation(self.policy, self.utility, self.mdp) - - # if s'.TERMINAL? 
- # If we're at a terminal we don't want a next move - if current_state in self.mdp.terminals: - logging.info('Reached terminal state %s' % str(current_state)) - # s,a <- null - self.previous_state, self.previous_action = None, None + def program(self, s1, r1): + mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa + if s1 not in mdp.reward: # mdp.reward also tracks the visited states + U[s1] = r1 + mdp.reward[s1] = r1 + if s is not None: + Nsa[s][a] += 1 + Ns_sa[s][a][s1] += 1 + for t in Ns_sa[s][a]: + if Ns_sa[s][a][t] > 0: + self.mdp.T_add((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) + U = policy_evaluation(self.pi, U, mdp) + if s1 in mdp.terminals: + self.s, self.a = None, None return False else: - # s,a <- s', policy[s'] - next_action = self.policy[current_state] - self.previous_state, self.previous_action = current_state, next_action - # Return the next action that the policy dictates - return next_action - - - def execute_trial(self): - # Start at initial state - current_state = self.mdp.init - - # Keep going until we get to a terminal state - while True: - logging.info('--------------------------') - - # Get reward for current state - current_reward = self.action_mdp.R(current_state) - - # Calculate move from current state - next_action = self.next_action(current_state, current_reward) - - logging.info('Current State: %s ' % str(current_state)) - logging.info('Current Reward: %s ' % current_reward) - logging.info('Next action: %s' % self.action_mdp.tuple_to_char(next_action)) - - if next_action == False: - # End because next_action told us to - logging.info('Next_action returned false, stopping') - break - - # Get new current_state - current_state = self.action_mdp.simulate_move(current_state, next_action) - -if __name__ == '__main__': - ''' Parses options from command line, creates Fig 17,1, runs the passive - adp agent on it certain amount of times, outputs info and utilities ''' - - # Setup file options - parser = OptionParser() - parser.add_option("-t", "--times", dest="times", type="int", default = 100, - help="times to run") - parser.add_option("-d", "--debug", action='store_true', dest="debug", - default=False, help="debug mode?") - parser.add_option("-i", "--info", action='store_true', dest="info", - default=False, help="info mode?") - parser.add_option("-f", "--file", dest="log_file", - default=False, help="file to log to") - (options, args) = parser.parse_args() - - if options.debug: - level = logging.DEBUG - elif options.info: - level = logging.INFO - else: - level = logging.CRITICAL - - format = '%(levelname)s: %(message)s' - if options.log_file: - logging.basicConfig(level=level, - filename=options.log_file, - filemode='w', - format=format) - else: - logging.basicConfig(level=level, - format=format) - - # Set up grid MDP to act on - Fig = {} - Fig[17,1] = GridMDP([[-0.04, -0.04, -0.04, +1.0], - [-0.04, None, -0.04, -1.0], - [-0.04, -0.04, -0.04, -0.04]], - terminals=[(3, 2), (3, 1)]) - + self.s, self.a = s1, self.pi[s1] + return self.a + +def simulate(mdp,(s,a)): + r = random() # 0 <= r <= 1 + p,s1 = zip(*(mdp.T(s,a))) + for i in range(len(p)): + if sum(p[:i+1]) >= r: + return s1[i] + +def execute_trial(agent,mdp): + current_state = agent.mdp.init + while True: + current_reward = mdp.R(current_state) + next_action = agent.program(current_state, current_reward) + if next_action == False: + break + current_state = simulate(mdp,(current_state, next_action)) + +def demoPassiveADPAgent(): + print 'DEMO PassiveADPAgent' + print '--------------------' # Setup 
values - policy = [['>', '>', '>', '.'], - ['^', None, '^', '.'], - ['^', '<', '<', '<']] + policy = {(0, 1): (0, 1), + (1, 2): (1, 0), + (3, 2): None, + (0, 0): (0, 1), + (3, 0): (-1, 0), + (3, 1): None, + (2, 1): (0, 1), + (2, 0): (0, 1), + (2, 2): (1, 0), + (1, 0): (1, 0), + (0, 2): (1, 0)} # Create agent - agent = PassiveADPAgent(Fig[17,1], policy) - - # Start timing time_start = time() - logging.info('Start at %s' % time_start) - - # Execute a bunch of trials - trials = options.times + trials = 100 + agent = PassiveADPAgent(Fig[17,1], policy) for i in range (0,trials): - agent.execute_trial() - - # End timing + execute_trial(agent,Fig[17,1]) time_end = time() - logging.info('End at %s' % time_end) seconds_elapsed = time_end - time_start minutes_elapsed = seconds_elapsed / 60.0 - - # Print and log final results final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.utility))) + ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) for result in final_results: - logging.info(result) print result + +if __name__ == '__main__': + demoPassiveADPAgent() From 27d1745017addc5e126dd4a91b9229e994d59b6c Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Fri, 7 Dec 2012 23:14:46 +0800 Subject: [PATCH 06/19] moved the LearntMDP class into PassiveADP since it's a model of the mdp that is internal to the passive adp agent. --- passive_adp.py | 78 ++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 44 deletions(-) diff --git a/passive_adp.py b/passive_adp.py index 626b9c0..f89ae86 100644 --- a/passive_adp.py +++ b/passive_adp.py @@ -1,53 +1,43 @@ from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig -from utils import turn_left, turn_right, update +from utils import update from random import random from time import time +import agents -class MyMDP(MDP): - """ Extends MDP class to use a dictionary transistion model """ - def __init__(self, mdp): - MDP.__init__(self, - mdp.init, - mdp.actlist, - mdp.terminals, - mdp.gamma) - update(self, - P = {}, - states = mdp.states) - - def R(self, s): - """Return a numeric reward for the state s""" - if s in self.reward: - return self.reward[s] - else: - # TODO: this should really return zero? or return False beause we - # don't know. Returns 0 for now as it makes the value iteration - # function work - return 0 - #raise Exception('tried to get reward of state we dont have yet %s' % str(state)) - - def T(self, s, a): - """Returns a list of tuples with probabilities for states""" - try: - return [(p,s) for (s,p) in self.P[s][a].items()] - except KeyError: - return [] # return an empty list +class PassiveADPAgent(agents.Agent): + """Passive (non-learning) agent that uses adaptive dynamic programming + on a given MDP and policy. [Fig. 21.2]""" + class LearntMDP: + def __init__(self, states, gamma, terminals): + update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) + + def R(self, s): + """Return a numeric reward for the state s""" + if s in self.reward: + return self.reward[s] + else: + return 0. # we don't know the value of the reward. 
+ + def T(self, s, a): + """Returns a list of tuples with probabilities for states""" + try: + return [(p,s) for (s,p) in self.P[s][a].items()] + except KeyError: + return [] + + def T_add(self, (s,a,t), p): + " Adds a value to the transistion model " + if (s in self.P) and (a in self.P[s]): + self.P[s][a][t] = p + elif (s in self.P): + self.P[s][a] = {t:p} + else: + self.P[s] = {a:{t:p}} - def T_add(self, (s,a,t), p): - " Adds a value to the transistion model " - if (s in self.P) and (a in self.P[s]): - self.P[s][a][t] = p - elif (s in self.P): - self.P[s][a] = {t:p} - else: - self.P[s] = {a:{t:p}} - -class PassiveADPAgent(object): - def __init__(self, action_mdp, pi): update(self, pi = pi, - mdp = MyMDP(action_mdp), + mdp = self.LearntMDP(action_mdp.states,action_mdp.gamma,action_mdp.terminals), action_mdp = action_mdp, U = {}, Ns_sa = {s:{a:{t:0 for (p,t) in action_mdp.T(s,a)} @@ -60,7 +50,7 @@ def __init__(self, action_mdp, pi): def program(self, s1, r1): mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa - if s1 not in mdp.reward: # mdp.reward also tracks the visited states + if s1 not in mdp.reward: # mdp.R also tracks the visited states U[s1] = r1 mdp.reward[s1] = r1 if s is not None: @@ -85,7 +75,7 @@ def simulate(mdp,(s,a)): return s1[i] def execute_trial(agent,mdp): - current_state = agent.mdp.init + current_state = mdp.init while True: current_reward = mdp.R(current_state) next_action = agent.program(current_state, current_reward) From ec27f8cfffc29ae56222ad5c32a0573cbfdee1fe Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Fri, 7 Dec 2012 23:23:33 +0800 Subject: [PATCH 07/19] moved the code for passive_adp into rl.py --- rl.py | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 130 insertions(+), 15 deletions(-) diff --git a/rl.py b/rl.py index fc0e2c9..89ab75b 100644 --- a/rl.py +++ b/rl.py @@ -1,15 +1,130 @@ -"""Reinforcement Learning (Chapter 21) -""" - -from utils import * -import agents - -class PassiveADPAgent(agents.Agent): - """Passive (non-learning) agent that uses adaptive dynamic programming - on a given MDP and policy. [Fig. 21.2]""" - NotImplemented - -class PassiveTDAgent(agents.Agent): - """Passive (non-learning) agent that uses temporal differences to learn - utility estimates. [Fig. 21.4]""" - NotImplemented +"""Reinforcement Learning (Chapter 21) +""" + +from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig +from utils import update +from random import random +from time import time +import agents + +class PassiveADPAgent(agents.Agent): + """Passive (non-learning) agent that uses adaptive dynamic programming + on a given MDP and policy. [Fig. 21.2]""" + class LearntMDP: + """a model of the original mdp that the PassiveADP is trying to learn""" + def __init__(self, states, gamma, terminals): + update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) + + def R(self, s): + """Return a numeric reward for the state s""" + if s in self.reward: + return self.reward[s] + else: + return 0. # we don't know the value of the reward. 
+ + def T(self, s, a): + """Returns a list of tuples with probabilities for states""" + try: + return [(p,s) for (s,p) in self.P[s][a].items()] + except KeyError: + return [] + + def T_set(self, (s,a,t), p): + " Adds a value to the transistion model " + if (s in self.P) and (a in self.P[s]): + self.P[s][a][t] = p + elif (s in self.P): + self.P[s][a] = {t:p} + else: + self.P[s] = {a:{t:p}} + + def __init__(self, action_mdp, pi): + update(self, + pi = pi, + mdp = self.LearntMDP(action_mdp.states,action_mdp.gamma,action_mdp.terminals), + action_mdp = action_mdp, + U = {}, + Ns_sa = {s:{a:{t:0 for (p,t) in action_mdp.T(s,a)} + for a in action_mdp.actlist} + for s in action_mdp.states}, + Nsa = {s:{a:0. for a in action_mdp.actlist} + for s in action_mdp.states}, + s = None, + a = None) + + def program(self, s1, r1): + mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa + if s1 not in mdp.reward: # mdp.R also tracks the visited states + U[s1] = r1 + mdp.reward[s1] = r1 + if s is not None: + Nsa[s][a] += 1 + Ns_sa[s][a][s1] += 1 + for t in Ns_sa[s][a]: + if Ns_sa[s][a][t] > 0: + self.mdp.T_set((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) + U = policy_evaluation(self.pi, U, mdp) + if s1 in mdp.terminals: + self.s, self.a = None, None + return False + else: + self.s, self.a = s1, self.pi[s1] + return self.a + +def simulate(mdp,(s,a)): + r = random() # 0 <= r <= 1 + p,s1 = zip(*(mdp.T(s,a))) + for i in range(len(p)): + if sum(p[:i+1]) >= r: + return s1[i] + +def execute_trial(agent,mdp): + current_state = mdp.init + while True: + current_reward = mdp.R(current_state) + next_action = agent.program(current_state, current_reward) + if next_action == False: + break + current_state = simulate(mdp,(current_state, next_action)) + +def demoPassiveADPAgent(): + print 'DEMO PassiveADPAgent' + print '--------------------' + # Setup values + policy = {(0, 1): (0, 1), + (1, 2): (1, 0), + (3, 2): None, + (0, 0): (0, 1), + (3, 0): (-1, 0), + (3, 1): None, + (2, 1): (0, 1), + (2, 0): (0, 1), + (2, 2): (1, 0), + (1, 0): (1, 0), + (0, 2): (1, 0)} + + # Create agent + time_start = time() + trials = 100 + agent = PassiveADPAgent(Fig[17,1], policy) + for i in range (0,trials): + execute_trial(agent,Fig[17,1]) + time_end = time() + + seconds_elapsed = time_end - time_start + minutes_elapsed = seconds_elapsed / 60.0 + final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ + ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) + for result in final_results: + print result + + print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print value_iteration(Fig[17,1]) + +class PassiveTDAgent(agents.Agent): + """Passive (non-learning) agent that uses temporal differences to learn + utility estimates. [Fig. 21.4]""" + NotImplemented + +if __name__ == '__main__': + demoPassiveADPAgent() From 5002382c448086d9bc2bb2c7dce5d147583a9c84 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Fri, 7 Dec 2012 23:27:20 +0800 Subject: [PATCH 08/19] added docstring for LearntMDP --- passive_adp.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/passive_adp.py b/passive_adp.py index f89ae86..eb64f8b 100644 --- a/passive_adp.py +++ b/passive_adp.py @@ -8,6 +8,7 @@ class PassiveADPAgent(agents.Agent): """Passive (non-learning) agent that uses adaptive dynamic programming on a given MDP and policy. [Fig. 
21.2]""" class LearntMDP: + """a model of the original mdp that the PassiveADP is trying to learn""" def __init__(self, states, gamma, terminals): update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) @@ -25,7 +26,7 @@ def T(self, s, a): except KeyError: return [] - def T_add(self, (s,a,t), p): + def T_set(self, (s,a,t), p): " Adds a value to the transistion model " if (s in self.P) and (a in self.P[s]): self.P[s][a][t] = p @@ -58,7 +59,7 @@ def program(self, s1, r1): Ns_sa[s][a][s1] += 1 for t in Ns_sa[s][a]: if Ns_sa[s][a][t] > 0: - self.mdp.T_add((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) + self.mdp.T_set((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) U = policy_evaluation(self.pi, U, mdp) if s1 in mdp.terminals: self.s, self.a = None, None @@ -114,5 +115,8 @@ def demoPassiveADPAgent(): for result in final_results: print result + print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print value_iteration(Fig[17,1]) + if __name__ == '__main__': demoPassiveADPAgent() From e7cbcafdcff2fdc6397dc53ec0ccc4d8562be639 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Fri, 7 Dec 2012 23:32:56 +0800 Subject: [PATCH 09/19] removed passive_adp.py and merged it into rl.py --- passive_adp.py | 122 ------------------------------------------------- 1 file changed, 122 deletions(-) delete mode 100644 passive_adp.py diff --git a/passive_adp.py b/passive_adp.py deleted file mode 100644 index eb64f8b..0000000 --- a/passive_adp.py +++ /dev/null @@ -1,122 +0,0 @@ -from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig -from utils import update -from random import random -from time import time -import agents - -class PassiveADPAgent(agents.Agent): - """Passive (non-learning) agent that uses adaptive dynamic programming - on a given MDP and policy. [Fig. 21.2]""" - class LearntMDP: - """a model of the original mdp that the PassiveADP is trying to learn""" - def __init__(self, states, gamma, terminals): - update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) - - def R(self, s): - """Return a numeric reward for the state s""" - if s in self.reward: - return self.reward[s] - else: - return 0. # we don't know the value of the reward. - - def T(self, s, a): - """Returns a list of tuples with probabilities for states""" - try: - return [(p,s) for (s,p) in self.P[s][a].items()] - except KeyError: - return [] - - def T_set(self, (s,a,t), p): - " Adds a value to the transistion model " - if (s in self.P) and (a in self.P[s]): - self.P[s][a][t] = p - elif (s in self.P): - self.P[s][a] = {t:p} - else: - self.P[s] = {a:{t:p}} - - def __init__(self, action_mdp, pi): - update(self, - pi = pi, - mdp = self.LearntMDP(action_mdp.states,action_mdp.gamma,action_mdp.terminals), - action_mdp = action_mdp, - U = {}, - Ns_sa = {s:{a:{t:0 for (p,t) in action_mdp.T(s,a)} - for a in action_mdp.actlist} - for s in action_mdp.states}, - Nsa = {s:{a:0. 
for a in action_mdp.actlist} - for s in action_mdp.states}, - s = None, - a = None) - - def program(self, s1, r1): - mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa - if s1 not in mdp.reward: # mdp.R also tracks the visited states - U[s1] = r1 - mdp.reward[s1] = r1 - if s is not None: - Nsa[s][a] += 1 - Ns_sa[s][a][s1] += 1 - for t in Ns_sa[s][a]: - if Ns_sa[s][a][t] > 0: - self.mdp.T_set((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) - U = policy_evaluation(self.pi, U, mdp) - if s1 in mdp.terminals: - self.s, self.a = None, None - return False - else: - self.s, self.a = s1, self.pi[s1] - return self.a - -def simulate(mdp,(s,a)): - r = random() # 0 <= r <= 1 - p,s1 = zip(*(mdp.T(s,a))) - for i in range(len(p)): - if sum(p[:i+1]) >= r: - return s1[i] - -def execute_trial(agent,mdp): - current_state = mdp.init - while True: - current_reward = mdp.R(current_state) - next_action = agent.program(current_state, current_reward) - if next_action == False: - break - current_state = simulate(mdp,(current_state, next_action)) - -def demoPassiveADPAgent(): - print 'DEMO PassiveADPAgent' - print '--------------------' - # Setup values - policy = {(0, 1): (0, 1), - (1, 2): (1, 0), - (3, 2): None, - (0, 0): (0, 1), - (3, 0): (-1, 0), - (3, 1): None, - (2, 1): (0, 1), - (2, 0): (0, 1), - (2, 2): (1, 0), - (1, 0): (1, 0), - (0, 2): (1, 0)} - - # Create agent - time_start = time() - trials = 100 - agent = PassiveADPAgent(Fig[17,1], policy) - for i in range (0,trials): - execute_trial(agent,Fig[17,1]) - time_end = time() - - seconds_elapsed = time_end - time_start - minutes_elapsed = seconds_elapsed / 60.0 - final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) - for result in final_results: - print result - - print '\nCorrect Utilities (estimated by value iteration, for comparison):' - print value_iteration(Fig[17,1]) - -if __name__ == '__main__': - demoPassiveADPAgent() From 2e4d9a4e68376b82a1e59ba641eeea6393505ec7 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sat, 8 Dec 2012 09:15:55 +0800 Subject: [PATCH 10/19] added code for PassiveTDAgent --- rl.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 69 insertions(+), 12 deletions(-) diff --git a/rl.py b/rl.py index 89ab75b..9147ff4 100644 --- a/rl.py +++ b/rl.py @@ -38,21 +38,21 @@ def T_set(self, (s,a,t), p): else: self.P[s] = {a:{t:p}} - def __init__(self, action_mdp, pi): + def __init__(self, mdp, pi): update(self, pi = pi, - mdp = self.LearntMDP(action_mdp.states,action_mdp.gamma,action_mdp.terminals), - action_mdp = action_mdp, + mdp = self.LearntMDP(mdp.states,mdp.gamma,mdp.terminals), U = {}, - Ns_sa = {s:{a:{t:0 for (p,t) in action_mdp.T(s,a)} - for a in action_mdp.actlist} - for s in action_mdp.states}, - Nsa = {s:{a:0. for a in action_mdp.actlist} - for s in action_mdp.states}, + Ns_sa = {s:{a:{t:0 for (p,t) in mdp.T(s,a)} + for a in mdp.actlist} + for s in mdp.states}, + Nsa = {s:{a:0. 
for a in mdp.actlist} + for s in mdp.states}, s = None, a = None) - def program(self, s1, r1): + def program(self, percept): + s1,r1 = percept mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa if s1 not in mdp.reward: # mdp.R also tracks the visited states U[s1] = r1 @@ -82,7 +82,7 @@ def execute_trial(agent,mdp): current_state = mdp.init while True: current_reward = mdp.R(current_state) - next_action = agent.program(current_state, current_reward) + next_action = agent.program((current_state, current_reward)) if next_action == False: break current_state = simulate(mdp,(current_state, next_action)) @@ -124,7 +124,64 @@ def demoPassiveADPAgent(): class PassiveTDAgent(agents.Agent): """Passive (non-learning) agent that uses temporal differences to learn utility estimates. [Fig. 21.4]""" - NotImplemented + def __init__(self,mdp,pi,alpha=None): + update(self, + pi = pi, + U = {s:0. for s in mdp.states}, + Ns = {s:0 for s in mdp.states}, + s = None, + a = None, + r = None, + gamma = mdp.gamma, + terminals = mdp.terminals) + if alpha is None: + alpha = lambda n: 60./(59+n) # page 837 + def program(self,percept): + s1,r1 = percept + pi,U,Ns,s,a,r = self.pi,self.U,self.Ns,self.s,self.a,self.r + alpha,gamma = self.alpha,self.gamma + if s1 not in U: U[s1] = r1 + if s is not None: + Ns[s] += 1 + U[s] = U[s] + alpha(Ns[s])*(r+gamma*U[s1]-U[s]) + if s in self.terminals: self.s,self.a,self.r = None,None,None + else: self.s,self.a,self.r = s1, pi[s1],r1 + return self.a + +def demoPassiveTDAgent(): + print 'DEMO PassiveTDAgent' + print '--------------------' + # Setup values + policy = {(0, 1): (0, 1), + (1, 2): (1, 0), + (3, 2): None, + (0, 0): (0, 1), + (3, 0): (-1, 0), + (3, 1): None, + (2, 1): (0, 1), + (2, 0): (0, 1), + (2, 2): (1, 0), + (1, 0): (1, 0), + (0, 2): (1, 0)} + + # Create agent + time_start = time() + trials = 100 + agent = PassiveADPAgent(Fig[17,1], policy) + for i in range (0,trials): + execute_trial(agent,Fig[17,1]) + time_end = time() + + seconds_elapsed = time_end - time_start + minutes_elapsed = seconds_elapsed / 60.0 + final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ + ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) + for result in final_results: + print result + + print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print value_iteration(Fig[17,1]) if __name__ == '__main__': - demoPassiveADPAgent() + #demoPassiveADPAgent() + demoPassiveTDAgent() From 72f3e2c66228561b5aa12f87883f779030b177b7 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sat, 8 Dec 2012 09:44:23 +0800 Subject: [PATCH 11/19] include a new class: QLearningAgent --- .gitattributes | 22 +++ .gitignore | 163 +++++++++++++++++++++ mdp.py | 344 ++++++++++++++++++++++---------------------- rl.py | 379 +++++++++++++++++++++++++------------------------ 4 files changed, 549 insertions(+), 359 deletions(-) create mode 100644 .gitattributes create mode 100644 .gitignore diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..412eeda --- /dev/null +++ b/.gitattributes @@ -0,0 +1,22 @@ +# Auto detect text files and perform LF normalization +* text=auto + +# Custom for Visual Studio +*.cs diff=csharp +*.sln merge=union +*.csproj merge=union +*.vbproj merge=union +*.fsproj merge=union +*.dbproj merge=union + +# Standard to msysgit +*.doc diff=astextplain +*.DOC diff=astextplain +*.docx diff=astextplain +*.DOCX diff=astextplain +*.dot diff=astextplain +*.DOT diff=astextplain +*.pdf 
diff=astextplain +*.PDF diff=astextplain +*.rtf diff=astextplain +*.RTF diff=astextplain diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5ebd21a --- /dev/null +++ b/.gitignore @@ -0,0 +1,163 @@ +################# +## Eclipse +################# + +*.pydevproject +.project +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.classpath +.settings/ +.loadpath + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# CDT-specific +.cproject + +# PDT-specific +.buildpath + + +################# +## Visual Studio +################# + +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. + +# User-specific files +*.suo +*.user +*.sln.docstates + +# Build results +[Dd]ebug/ +[Rr]elease/ +*_i.c +*_p.c +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.vspscc +.builds +*.dotCover + +## TODO: If you have NuGet Package Restore enabled, uncomment this +#packages/ + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opensdf +*.sdf + +# Visual Studio profiler +*.psess +*.vsp + +# ReSharper is a .NET coding add-in +_ReSharper* + +# Installshield output folder +[Ee]xpress + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish + +# Others +[Bb]in +[Oo]bj +sql +TestResults +*.Cache +ClientBin +stylecop.* +~$* +*.dbmdl +Generated_Code #added for RIA/Silverlight projects + +# Backup & report files from converting an old project file to a newer +# Visual Studio version. Backup files are not needed, because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML + + + +############ +## Windows +############ + +# Windows image file caches +Thumbs.db + +# Folder config file +Desktop.ini + + +############# +## Python +############# + +*.py[co] + +# Packages +*.egg +*.egg-info +dist +build +eggs +parts +bin +var +sdist +develop-eggs +.installed.cfg + +# Installer logs +pip-log.txt + +# Unit test / coverage reports +.coverage +.tox + +#Translations +*.mo + +#Mr Developer +.mr.developer.cfg + +# Mac crap +.DS_Store diff --git a/mdp.py b/mdp.py index d8057ce..0048843 100644 --- a/mdp.py +++ b/mdp.py @@ -1,172 +1,172 @@ -"""Markov Decision Processes (Chapter 17) - -First we define an MDP, and the special case of a GridMDP, in which -states are laid out in a 2-dimensional grid. We also represent a policy -as a dictionary of {state:action} pairs, and a Utility function as a -dictionary of {state:number} pairs. We then define the value_iteration -and policy_iteration algorithms.""" - -from utils import * - -class MDP: - """A Markov Decision Process, defined by an initial state, transition model, - and reward function. We also keep track of a gamma value, for use by - algorithms. The transition model is represented somewhat differently from - the text. Instead of P(s' | s, a) being a probability number for each - state/state/action triplet, we instead have T(s, a) return a list of (p, s') - pairs. We also keep track of the possible states, terminal states, and - actions for each state. 
[page 646]""" - - def __init__(self, init, actlist, terminals, gamma=.9): - update(self, init=init, actlist=actlist, terminals=terminals, - gamma=gamma, states=set(), reward={}) - - def R(self, state): - "Return a numeric reward for this state." - return self.reward[state] - - def T(self, state, action): - """Transition model. From a state and an action, return a list - of (probability, result-state) pairs.""" - abstract - - def actions(self, state): - """Set of actions that can be performed in this state. By default, a - fixed list of actions, except for terminal states. Override this - method if you need to specialize by state.""" - if state in self.terminals: - return [None] - else: - return self.actlist - -class GridMDP(MDP): - """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is - specify the grid as a list of lists of rewards; use None for an obstacle - (unreachable state). Also, you should specify the terminal states. - An action is an (x, y) unit vector; e.g. (1, 0) means move east.""" - def __init__(self, grid, terminals, init=(0, 0), gamma=.9): - grid.reverse() ## because we want row 0 on bottom, not on top - MDP.__init__(self, init, actlist=orientations, - terminals=terminals, gamma=gamma) - update(self, grid=grid, rows=len(grid), cols=len(grid[0])) - for x in range(self.cols): - for y in range(self.rows): - self.reward[x, y] = grid[y][x] - if grid[y][x] is not None: - self.states.add((x, y)) - - def T(self, state, action): - if action is None: - return [(0.0, state)] - else: - return [(0.8, self.go(state, action)), - (0.1, self.go(state, turn_right(action))), - (0.1, self.go(state, turn_left(action)))] - - def go(self, state, direction): - "Return the state that results from going in this direction." - state1 = vector_add(state, direction) - return if_(state1 in self.states, state1, state) - - def to_grid(self, mapping): - """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid.""" - return list(reversed([[mapping.get((x,y), None) - for x in range(self.cols)] - for y in range(self.rows)])) - - def to_arrows(self, policy): - chars = {(1, 0):'>', (0, 1):'^', (-1, 0):'<', (0, -1):'v', None: '.'} - return self.to_grid(dict([(s, chars[a]) for (s, a) in policy.items()])) - -#______________________________________________________________________________ - -Fig[17,1] = GridMDP([[-0.04, -0.04, -0.04, +1], - [-0.04, None, -0.04, -1], - [-0.04, -0.04, -0.04, -0.04]], - terminals=[(3, 2), (3, 1)]) - -#______________________________________________________________________________ - -def value_iteration(mdp, epsilon=0.001): - "Solving an MDP by value iteration. [Fig. 17.4]" - U1 = dict([(s, 0) for s in mdp.states]) - R, T, gamma = mdp.R, mdp.T, mdp.gamma - while True: - U = U1.copy() - delta = 0 - for s in mdp.states: - U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)]) - for a in mdp.actions(s)]) - delta = max(delta, abs(U1[s] - U[s])) - if (((gamma < 1) and (delta < epsilon * (1 - gamma) / gamma)) or - ((gamma == 1) and (delta < epsilon))): # allows for gamma to be 1 - return U - -def best_policy(mdp, U): - """Given an MDP and a utility function U, determine the best policy, - as a mapping from state to action. (Equation 17.4)""" - pi = {} - for s in mdp.states: - pi[s] = argmax(mdp.actions(s), lambda a:expected_utility(a, s, U, mdp)) - return pi - -def expected_utility(a, s, U, mdp): - "The expected utility of doing a in state s, according to the MDP and U." 
- return sum([p * U[s1] for (p, s1) in mdp.T(s, a)]) - -#______________________________________________________________________________ - -def policy_iteration(mdp): - "Solve an MDP by policy iteration [Fig. 17.7]" - U = dict([(s, 0) for s in mdp.states]) - pi = dict([(s, random.choice(mdp.actions(s))) for s in mdp.states]) - while True: - U = policy_evaluation(pi, U, mdp) - unchanged = True - for s in mdp.states: - a = argmax(mdp.actions(s), lambda a: expected_utility(a,s,U,mdp)) - if a != pi[s]: - pi[s] = a - unchanged = False - if unchanged: - return pi - -def policy_evaluation(pi, U, mdp, k=20): - """Return an updated utility mapping U from each state in the MDP to its - utility, using an approximation (modified policy iteration).""" - R, T, gamma = mdp.R, mdp.T, mdp.gamma - for i in range(k): - for s in mdp.states: - U[s] = R(s) + gamma * sum([p * U[s1] for (p, s1) in T(s, pi[s])]) - return U - -__doc__ += """ ->>> pi = best_policy(Fig[17,1], value_iteration(Fig[17,1], .01)) - ->>> Fig[17,1].to_arrows(pi) -[['>', '>', '>', '.'], ['^', None, '^', '.'], ['^', '>', '^', '<']] - ->>> print_table(Fig[17,1].to_arrows(pi)) -> > > . -^ None ^ . -^ > ^ < - ->>> print_table(Fig[17,1].to_arrows(policy_iteration(Fig[17,1]))) -> > > . -^ None ^ . -^ > ^ < -""" - -__doc__ += random_tests(""" ->>> pi -{(3, 2): None, (3, 1): None, (3, 0): (-1, 0), (2, 1): (0, 1), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (0, 1), (1, 2): (1, 0), (2, 0): (0, 1), (0, 1): (0, 1), (2, 2): (1, 0)} - ->>> value_iteration(Fig[17,1], .01) -{(3, 2): 1.0, (3, 1): -1.0, (3, 0): 0.12958868267972745, (0, 1): 0.39810203830605462, (0, 2): 0.50928545646220924, (1, 0): 0.25348746162470537, (0, 0): 0.29543540628363629, (1, 2): 0.64958064617168676, (2, 0): 0.34461306281476806, (2, 1): 0.48643676237737926, (2, 2): 0.79536093684710951} - ->>> policy_iteration(Fig[17,1]) -{(3, 2): None, (3, 1): None, (3, 0): (0, -1), (2, 1): (-1, 0), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (1, 0), (1, 2): (1, 0), (2, 0): (1, 0), (0, 1): (1, 0), (2, 2): (1, 0)} - -""") - - +"""Markov Decision Processes (Chapter 17) + +First we define an MDP, and the special case of a GridMDP, in which +states are laid out in a 2-dimensional grid. We also represent a policy +as a dictionary of {state:action} pairs, and a Utility function as a +dictionary of {state:number} pairs. We then define the value_iteration +and policy_iteration algorithms.""" + +from utils import * + +class MDP: + """A Markov Decision Process, defined by an initial state, transition model, + and reward function. We also keep track of a gamma value, for use by + algorithms. The transition model is represented somewhat differently from + the text. Instead of P(s' | s, a) being a probability number for each + state/state/action triplet, we instead have T(s, a) return a list of (p, s') + pairs. We also keep track of the possible states, terminal states, and + actions for each state. [page 646]""" + + def __init__(self, init, actlist, terminals, gamma=.9): + update(self, init=init, actlist=actlist, terminals=terminals, + gamma=gamma, states=set(), reward={}) + + def R(self, state): + "Return a numeric reward for this state." + return self.reward[state] + + def T(self, state, action): + """Transition model. From a state and an action, return a list + of (probability, result-state) pairs.""" + abstract + + def actions(self, state): + """Set of actions that can be performed in this state. By default, a + fixed list of actions, except for terminal states. 
Override this + method if you need to specialize by state.""" + if state in self.terminals: + return [None] + else: + return self.actlist + +class GridMDP(MDP): + """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is + specify the grid as a list of lists of rewards; use None for an obstacle + (unreachable state). Also, you should specify the terminal states. + An action is an (x, y) unit vector; e.g. (1, 0) means move east.""" + def __init__(self, grid, terminals, init=(0, 0), gamma=.9): + grid.reverse() ## because we want row 0 on bottom, not on top + MDP.__init__(self, init, actlist=orientations, + terminals=terminals, gamma=gamma) + update(self, grid=grid, rows=len(grid), cols=len(grid[0])) + for x in range(self.cols): + for y in range(self.rows): + self.reward[x, y] = grid[y][x] + if grid[y][x] is not None: + self.states.add((x, y)) + + def T(self, state, action): + if action is None: + return [(0.0, state)] + else: + return [(0.8, self.go(state, action)), + (0.1, self.go(state, turn_right(action))), + (0.1, self.go(state, turn_left(action)))] + + def go(self, state, direction): + "Return the state that results from going in this direction." + state1 = vector_add(state, direction) + return if_(state1 in self.states, state1, state) + + def to_grid(self, mapping): + """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid.""" + return list(reversed([[mapping.get((x,y), None) + for x in range(self.cols)] + for y in range(self.rows)])) + + def to_arrows(self, policy): + chars = {(1, 0):'>', (0, 1):'^', (-1, 0):'<', (0, -1):'v', None: '.'} + return self.to_grid(dict([(s, chars[a]) for (s, a) in policy.items()])) + +#______________________________________________________________________________ + +Fig[17,1] = GridMDP([[-0.04, -0.04, -0.04, +1], + [-0.04, None, -0.04, -1], + [-0.04, -0.04, -0.04, -0.04]], + terminals=[(3, 2), (3, 1)]) + +#______________________________________________________________________________ + +def value_iteration(mdp, epsilon=0.001): + "Solving an MDP by value iteration. [Fig. 17.4]" + U1 = dict([(s, 0) for s in mdp.states]) + R, T, gamma = mdp.R, mdp.T, mdp.gamma + while True: + U = U1.copy() + delta = 0 + for s in mdp.states: + U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)]) + for a in mdp.actions(s)]) + delta = max(delta, abs(U1[s] - U[s])) + if (((gamma < 1) and (delta < epsilon * (1 - gamma) / gamma)) or + ((gamma == 1) and (delta < epsilon))): # allows for gamma to be 1 + return U + +def best_policy(mdp, U): + """Given an MDP and a utility function U, determine the best policy, + as a mapping from state to action. (Equation 17.4)""" + pi = {} + for s in mdp.states: + pi[s] = argmax(mdp.actions(s), lambda a:expected_utility(a, s, U, mdp)) + return pi + +def expected_utility(a, s, U, mdp): + "The expected utility of doing a in state s, according to the MDP and U." + return sum([p * U[s1] for (p, s1) in mdp.T(s, a)]) + +#______________________________________________________________________________ + +def policy_iteration(mdp): + "Solve an MDP by policy iteration [Fig. 
17.7]" + U = dict([(s, 0) for s in mdp.states]) + pi = dict([(s, random.choice(mdp.actions(s))) for s in mdp.states]) + while True: + U = policy_evaluation(pi, U, mdp) + unchanged = True + for s in mdp.states: + a = argmax(mdp.actions(s), lambda a: expected_utility(a,s,U,mdp)) + if a != pi[s]: + pi[s] = a + unchanged = False + if unchanged: + return pi + +def policy_evaluation(pi, U, mdp, k=20): + """Return an updated utility mapping U from each state in the MDP to its + utility, using an approximation (modified policy iteration).""" + R, T, gamma = mdp.R, mdp.T, mdp.gamma + for i in range(k): + for s in mdp.states: + U[s] = R(s) + gamma * sum([p * U[s1] for (p, s1) in T(s, pi[s])]) + return U + +__doc__ += """ +>>> pi = best_policy(Fig[17,1], value_iteration(Fig[17,1], .01)) + +>>> Fig[17,1].to_arrows(pi) +[['>', '>', '>', '.'], ['^', None, '^', '.'], ['^', '>', '^', '<']] + +>>> print_table(Fig[17,1].to_arrows(pi)) +> > > . +^ None ^ . +^ > ^ < + +>>> print_table(Fig[17,1].to_arrows(policy_iteration(Fig[17,1]))) +> > > . +^ None ^ . +^ > ^ < +""" + +__doc__ += random_tests(""" +>>> pi +{(3, 2): None, (3, 1): None, (3, 0): (-1, 0), (2, 1): (0, 1), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (0, 1), (1, 2): (1, 0), (2, 0): (0, 1), (0, 1): (0, 1), (2, 2): (1, 0)} + +>>> value_iteration(Fig[17,1], .01) +{(3, 2): 1.0, (3, 1): -1.0, (3, 0): 0.12958868267972745, (0, 1): 0.39810203830605462, (0, 2): 0.50928545646220924, (1, 0): 0.25348746162470537, (0, 0): 0.29543540628363629, (1, 2): 0.64958064617168676, (2, 0): 0.34461306281476806, (2, 1): 0.48643676237737926, (2, 2): 0.79536093684710951} + +>>> policy_iteration(Fig[17,1]) +{(3, 2): None, (3, 1): None, (3, 0): (0, -1), (2, 1): (-1, 0), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (1, 0), (1, 2): (1, 0), (2, 0): (1, 0), (0, 1): (1, 0), (2, 2): (1, 0)} + +""") + + diff --git a/rl.py b/rl.py index 9147ff4..82520df 100644 --- a/rl.py +++ b/rl.py @@ -1,187 +1,192 @@ -"""Reinforcement Learning (Chapter 21) -""" - -from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig -from utils import update -from random import random -from time import time -import agents - -class PassiveADPAgent(agents.Agent): - """Passive (non-learning) agent that uses adaptive dynamic programming - on a given MDP and policy. [Fig. 21.2]""" - class LearntMDP: - """a model of the original mdp that the PassiveADP is trying to learn""" - def __init__(self, states, gamma, terminals): - update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) - - def R(self, s): - """Return a numeric reward for the state s""" - if s in self.reward: - return self.reward[s] - else: - return 0. # we don't know the value of the reward. - - def T(self, s, a): - """Returns a list of tuples with probabilities for states""" - try: - return [(p,s) for (s,p) in self.P[s][a].items()] - except KeyError: - return [] - - def T_set(self, (s,a,t), p): - " Adds a value to the transistion model " - if (s in self.P) and (a in self.P[s]): - self.P[s][a][t] = p - elif (s in self.P): - self.P[s][a] = {t:p} - else: - self.P[s] = {a:{t:p}} - - def __init__(self, mdp, pi): - update(self, - pi = pi, - mdp = self.LearntMDP(mdp.states,mdp.gamma,mdp.terminals), - U = {}, - Ns_sa = {s:{a:{t:0 for (p,t) in mdp.T(s,a)} - for a in mdp.actlist} - for s in mdp.states}, - Nsa = {s:{a:0. 
for a in mdp.actlist} - for s in mdp.states}, - s = None, - a = None) - - def program(self, percept): - s1,r1 = percept - mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa - if s1 not in mdp.reward: # mdp.R also tracks the visited states - U[s1] = r1 - mdp.reward[s1] = r1 - if s is not None: - Nsa[s][a] += 1 - Ns_sa[s][a][s1] += 1 - for t in Ns_sa[s][a]: - if Ns_sa[s][a][t] > 0: - self.mdp.T_set((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) - U = policy_evaluation(self.pi, U, mdp) - if s1 in mdp.terminals: - self.s, self.a = None, None - return False - else: - self.s, self.a = s1, self.pi[s1] - return self.a - -def simulate(mdp,(s,a)): - r = random() # 0 <= r <= 1 - p,s1 = zip(*(mdp.T(s,a))) - for i in range(len(p)): - if sum(p[:i+1]) >= r: - return s1[i] - -def execute_trial(agent,mdp): - current_state = mdp.init - while True: - current_reward = mdp.R(current_state) - next_action = agent.program((current_state, current_reward)) - if next_action == False: - break - current_state = simulate(mdp,(current_state, next_action)) - -def demoPassiveADPAgent(): - print 'DEMO PassiveADPAgent' - print '--------------------' - # Setup values - policy = {(0, 1): (0, 1), - (1, 2): (1, 0), - (3, 2): None, - (0, 0): (0, 1), - (3, 0): (-1, 0), - (3, 1): None, - (2, 1): (0, 1), - (2, 0): (0, 1), - (2, 2): (1, 0), - (1, 0): (1, 0), - (0, 2): (1, 0)} - - # Create agent - time_start = time() - trials = 100 - agent = PassiveADPAgent(Fig[17,1], policy) - for i in range (0,trials): - execute_trial(agent,Fig[17,1]) - time_end = time() - - seconds_elapsed = time_end - time_start - minutes_elapsed = seconds_elapsed / 60.0 - final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) - for result in final_results: - print result - - print '\nCorrect Utilities (estimated by value iteration, for comparison):' - print value_iteration(Fig[17,1]) - -class PassiveTDAgent(agents.Agent): - """Passive (non-learning) agent that uses temporal differences to learn - utility estimates. [Fig. 21.4]""" - def __init__(self,mdp,pi,alpha=None): - update(self, - pi = pi, - U = {s:0. 
for s in mdp.states}, - Ns = {s:0 for s in mdp.states}, - s = None, - a = None, - r = None, - gamma = mdp.gamma, - terminals = mdp.terminals) - if alpha is None: - alpha = lambda n: 60./(59+n) # page 837 - def program(self,percept): - s1,r1 = percept - pi,U,Ns,s,a,r = self.pi,self.U,self.Ns,self.s,self.a,self.r - alpha,gamma = self.alpha,self.gamma - if s1 not in U: U[s1] = r1 - if s is not None: - Ns[s] += 1 - U[s] = U[s] + alpha(Ns[s])*(r+gamma*U[s1]-U[s]) - if s in self.terminals: self.s,self.a,self.r = None,None,None - else: self.s,self.a,self.r = s1, pi[s1],r1 - return self.a - -def demoPassiveTDAgent(): - print 'DEMO PassiveTDAgent' - print '--------------------' - # Setup values - policy = {(0, 1): (0, 1), - (1, 2): (1, 0), - (3, 2): None, - (0, 0): (0, 1), - (3, 0): (-1, 0), - (3, 1): None, - (2, 1): (0, 1), - (2, 0): (0, 1), - (2, 2): (1, 0), - (1, 0): (1, 0), - (0, 2): (1, 0)} - - # Create agent - time_start = time() - trials = 100 - agent = PassiveADPAgent(Fig[17,1], policy) - for i in range (0,trials): - execute_trial(agent,Fig[17,1]) - time_end = time() - - seconds_elapsed = time_end - time_start - minutes_elapsed = seconds_elapsed / 60.0 - final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) - for result in final_results: - print result - - print '\nCorrect Utilities (estimated by value iteration, for comparison):' - print value_iteration(Fig[17,1]) - -if __name__ == '__main__': - #demoPassiveADPAgent() - demoPassiveTDAgent() +"""Reinforcement Learning (Chapter 21) +""" + +from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig +from utils import update +from random import random +from time import time +import agents + +class PassiveADPAgent(agents.Agent): + """Passive (non-learning) agent that uses adaptive dynamic programming + on a given MDP and policy. [Fig. 21.2]""" + class LearntMDP: + """a model of the original mdp that the PassiveADP is trying to learn""" + def __init__(self, states, gamma, terminals): + update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) + + def R(self, s): + """Return a numeric reward for the state s""" + if s in self.reward: + return self.reward[s] + else: + return 0. # we don't know the value of the reward. + + def T(self, s, a): + """Returns a list of tuples with probabilities for states""" + try: + return [(p,s) for (s,p) in self.P[s][a].items()] + except KeyError: + return [] + + def T_set(self, (s,a,t), p): + " Adds a value to the transistion model " + if (s in self.P) and (a in self.P[s]): + self.P[s][a][t] = p + elif (s in self.P): + self.P[s][a] = {t:p} + else: + self.P[s] = {a:{t:p}} + + def __init__(self, mdp, pi): + update(self, + pi = pi, + mdp = self.LearntMDP(mdp.states,mdp.gamma,mdp.terminals), + U = {}, + Ns_sa = {s:{a:{t:0 for (p,t) in mdp.T(s,a)} + for a in mdp.actlist} + for s in mdp.states}, + Nsa = {s:{a:0. 
for a in mdp.actlist} + for s in mdp.states}, + s = None, + a = None) + + def program(self, percept): + s1,r1 = percept + mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa + if s1 not in mdp.reward: # mdp.R also tracks the visited states + U[s1] = r1 + mdp.reward[s1] = r1 + if s is not None: + Nsa[s][a] += 1 + Ns_sa[s][a][s1] += 1 + for t in Ns_sa[s][a]: + if Ns_sa[s][a][t] > 0: + self.mdp.T_set((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) + U = policy_evaluation(self.pi, U, mdp) + if s1 in mdp.terminals: + self.s, self.a = None, None + return False + else: + self.s, self.a = s1, self.pi[s1] + return self.a + +def simulate(mdp,(s,a)): + r = random() # 0 <= r <= 1 + p,s1 = zip(*(mdp.T(s,a))) + for i in range(len(p)): + if sum(p[:i+1]) >= r: + return s1[i] + +def execute_trial(agent,mdp): + current_state = mdp.init + while True: + current_reward = mdp.R(current_state) + next_action = agent.program((current_state, current_reward)) + if next_action == False: + break + current_state = simulate(mdp,(current_state, next_action)) + +def demoPassiveADPAgent(): + print 'DEMO PassiveADPAgent' + print '--------------------' + # Setup values + policy = {(0, 1): (0, 1), + (1, 2): (1, 0), + (3, 2): None, + (0, 0): (0, 1), + (3, 0): (-1, 0), + (3, 1): None, + (2, 1): (0, 1), + (2, 0): (0, 1), + (2, 2): (1, 0), + (1, 0): (1, 0), + (0, 2): (1, 0)} + + # Create agent + time_start = time() + trials = 100 + agent = PassiveADPAgent(Fig[17,1], policy) + for i in range (0,trials): + execute_trial(agent,Fig[17,1]) + time_end = time() + + seconds_elapsed = time_end - time_start + minutes_elapsed = seconds_elapsed / 60.0 + final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ + ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) + for result in final_results: + print result + + print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print value_iteration(Fig[17,1]) + +class PassiveTDAgent(agents.Agent): + """Passive (non-learning) agent that uses temporal differences to learn + utility estimates. [Fig. 21.4]""" + def __init__(self,mdp,pi,alpha=None): + update(self, + pi = pi, + U = {s:0. 
for s in mdp.states}, + Ns = {s:0 for s in mdp.states}, + s = None, + a = None, + r = None, + gamma = mdp.gamma, + terminals = mdp.terminals) + if alpha is None: + alpha = lambda n: 60./(59+n) # page 837 + def program(self,percept): + s1,r1 = percept + pi,U,Ns,s,a,r = self.pi,self.U,self.Ns,self.s,self.a,self.r + alpha,gamma = self.alpha,self.gamma + if s1 not in U: U[s1] = r1 + if s is not None: + Ns[s] += 1 + U[s] = U[s] + alpha(Ns[s])*(r+gamma*U[s1]-U[s]) + if s in self.terminals: self.s,self.a,self.r = None,None,None + else: self.s,self.a,self.r = s1, pi[s1],r1 + return self.a + +def demoPassiveTDAgent(): + print 'DEMO PassiveTDAgent' + print '--------------------' + # Setup values + policy = {(0, 1): (0, 1), + (1, 2): (1, 0), + (3, 2): None, + (0, 0): (0, 1), + (3, 0): (-1, 0), + (3, 1): None, + (2, 1): (0, 1), + (2, 0): (0, 1), + (2, 2): (1, 0), + (1, 0): (1, 0), + (0, 2): (1, 0)} + + # Create agent + time_start = time() + trials = 100 + agent = PassiveADPAgent(Fig[17,1], policy) + for i in range (0,trials): + execute_trial(agent,Fig[17,1]) + time_end = time() + + seconds_elapsed = time_end - time_start + minutes_elapsed = seconds_elapsed / 60.0 + final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ + ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) + for result in final_results: + print result + + print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print value_iteration(Fig[17,1]) + +class QLearningAgent(agents.Agent): + """Active TD agent that uses temporal differences to learn an + action-utility representation. [Fig. 21.8]""" + NotImplemented + +if __name__ == '__main__': + #demoPassiveADPAgent() + demoPassiveTDAgent() From 8e90dc4081087e8cf766219e0041575b4cf12cfb Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sat, 8 Dec 2012 12:58:44 +0800 Subject: [PATCH 12/19] added code for QLearningAgent --- rl.py | 159 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 122 insertions(+), 37 deletions(-) diff --git a/rl.py b/rl.py index 82520df..6aaa065 100644 --- a/rl.py +++ b/rl.py @@ -1,10 +1,12 @@ """Reinforcement Learning (Chapter 21) """ -from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig -from utils import update +from mdp import value_iteration, policy_evaluation, policy_iteration, \ + GridMDP, MDP, Fig +from utils import update, argmax from random import random from time import time +from itertools import product import agents class PassiveADPAgent(agents.Agent): @@ -71,6 +73,84 @@ def program(self, percept): self.s, self.a = s1, self.pi[s1] return self.a +class PassiveTDAgent(agents.Agent): + """Passive (non-learning) agent that uses temporal differences to learn + utility estimates. [Fig. 21.4]""" + def __init__(self,mdp,pi,alpha=None): + update(self, + pi = pi, + U = {s:0. 
for s in mdp.states}, + Ns = {s:0 for s in mdp.states}, + s = None, + a = None, + r = None, + gamma = mdp.gamma, + terminals = mdp.terminals) + if alpha is None: + alpha = lambda n: 60./(59+n) # page 837 + else: + self.alpha = alpha + def program(self,percept): + s1,r1 = percept + pi,U,Ns,s,a,r = self.pi,self.U,self.Ns,self.s,self.a,self.r + alpha,gamma = self.alpha,self.gamma + if s1 not in U: U[s1] = r1 + if s is not None: + Ns[s] += 1 + U[s] += alpha(Ns[s])*(r+gamma*U[s1]-U[s]) + if s in self.terminals: self.s,self.a,self.r = None,None,None + else: self.s,self.a,self.r = s1, pi[s1],r1 + return self.a + +class QLearningAgent(agents.Agent): + """Active TD agent that uses temporal differences to learn an + action-utility representation. [Fig. 21.8]""" + def __init__(self,mdp,alpha=None,Ne=5,Rplus=2): + update(self, + Q = {s:{a:0. for a in mdp.actlist} + for s in mdp.states if s not in mdp.terminals}, + Nsa = {s:{a:0. for a in mdp.actlist} + for s in mdp.states}, + s = None, + a = None, + r = None, + Ne = Ne, + Rplus = Rplus, + gamma = mdp.gamma, + terminals = mdp.terminals) + + for s in mdp.terminals: self.Q[s] = {None:0.} + + if alpha is None: + self.alpha = lambda n: 60./(59+n) # page 837 + else: + self.alpha = alpha + + def f(self,u,n): # the exploration function in AIMA(3rd ed), pg 842 + if n < self.Ne: + return self.Rplus + else: + return u + + def program(self,percept): + s1,r1 = percept + Q, Nsa, s, a, r = self.Q, self.Nsa, self.s, self.a, self.r + alpha, gamma, f = self.alpha, self.gamma, self.f + if s1 in self.terminals: + Q[s1][None] = r1 + if s is not None: + Nsa[s][a] += 1 + Q[s][a] += alpha(Nsa[s][a])*(r+gamma*max(Q[s1].values())-Q[s][a]) + if s1 in self.terminals: + self.s,self.a,self.r = None, None, None + return False + else: + self.s,self.r = s1,r1 + self.a = argmax(Q[s1].keys(),lambda a1: f(Q[s1][a1],Nsa[s1][a1])) + return self.a + +# --- + def simulate(mdp,(s,a)): r = random() # 0 <= r <= 1 p,s1 = zip(*(mdp.T(s,a))) @@ -88,9 +168,9 @@ def execute_trial(agent,mdp): current_state = simulate(mdp,(current_state, next_action)) def demoPassiveADPAgent(): + print '--------------------' print 'DEMO PassiveADPAgent' print '--------------------' - # Setup values policy = {(0, 1): (0, 1), (1, 2): (1, 0), (3, 2): None, @@ -103,7 +183,6 @@ def demoPassiveADPAgent(): (1, 0): (1, 0), (0, 2): (1, 0)} - # Create agent time_start = time() trials = 100 agent = PassiveADPAgent(Fig[17,1], policy) @@ -121,34 +200,8 @@ def demoPassiveADPAgent(): print '\nCorrect Utilities (estimated by value iteration, for comparison):' print value_iteration(Fig[17,1]) -class PassiveTDAgent(agents.Agent): - """Passive (non-learning) agent that uses temporal differences to learn - utility estimates. [Fig. 21.4]""" - def __init__(self,mdp,pi,alpha=None): - update(self, - pi = pi, - U = {s:0. 
for s in mdp.states}, - Ns = {s:0 for s in mdp.states}, - s = None, - a = None, - r = None, - gamma = mdp.gamma, - terminals = mdp.terminals) - if alpha is None: - alpha = lambda n: 60./(59+n) # page 837 - def program(self,percept): - s1,r1 = percept - pi,U,Ns,s,a,r = self.pi,self.U,self.Ns,self.s,self.a,self.r - alpha,gamma = self.alpha,self.gamma - if s1 not in U: U[s1] = r1 - if s is not None: - Ns[s] += 1 - U[s] = U[s] + alpha(Ns[s])*(r+gamma*U[s1]-U[s]) - if s in self.terminals: self.s,self.a,self.r = None,None,None - else: self.s,self.a,self.r = s1, pi[s1],r1 - return self.a - def demoPassiveTDAgent(): + print '--------------------' print 'DEMO PassiveTDAgent' print '--------------------' # Setup values @@ -164,7 +217,6 @@ def demoPassiveTDAgent(): (1, 0): (1, 0), (0, 2): (1, 0)} - # Create agent time_start = time() trials = 100 agent = PassiveADPAgent(Fig[17,1], policy) @@ -182,11 +234,44 @@ def demoPassiveTDAgent(): print '\nCorrect Utilities (estimated by value iteration, for comparison):' print value_iteration(Fig[17,1]) -class QLearningAgent(agents.Agent): - """Active TD agent that uses temporal differences to learn an - action-utility representation. [Fig. 21.8]""" - NotImplemented +def demoQLearningAgent(): + print '--------------------' + print 'DEMO PassiveTDAgent' + print '--------------------' + # Setup values + policy = {(0, 1): (0, 1), + (1, 2): (1, 0), + (3, 2): None, + (0, 0): (0, 1), + (3, 0): (-1, 0), + (3, 1): None, + (2, 1): (0, 1), + (2, 0): (0, 1), + (2, 2): (1, 0), + (1, 0): (1, 0), + (0, 2): (1, 0)} + + time_start = time() + trials = 1000 + agent = QLearningAgent(Fig[17,1]) + for i in range (0,trials): + execute_trial(agent,Fig[17,1]) + time_end = time() + + seconds_elapsed = time_end - time_start + minutes_elapsed = seconds_elapsed / 60.0 + final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ + ('Executed %i trials' % (trials)), + ('Utilities: %s' % {s:max(agent.Q[s].values()) for s in agent.Q})) + for result in final_results: + print result + + print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print value_iteration(Fig[17,1]) + +# --- if __name__ == '__main__': - #demoPassiveADPAgent() + demoPassiveADPAgent() demoPassiveTDAgent() + demoQLearningAgent() From 79811e02dfef49226bc487b43abf9325e8dc54f7 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sat, 8 Dec 2012 15:11:07 +0800 Subject: [PATCH 13/19] fixed a bug with PassiveTDAgent where alpha (rather than self.alpha) was called --- rl.py | 72 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/rl.py b/rl.py index 6aaa065..4313948 100644 --- a/rl.py +++ b/rl.py @@ -6,7 +6,6 @@ from utils import update, argmax from random import random from time import time -from itertools import product import agents class PassiveADPAgent(agents.Agent): @@ -15,13 +14,19 @@ class PassiveADPAgent(agents.Agent): class LearntMDP: """a model of the original mdp that the PassiveADP is trying to learn""" def __init__(self, states, gamma, terminals): - update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) + update(self, + P={}, + reward={}, + states=states, + gamma=gamma, + terminals=terminals) def R(self, s): """Return a numeric reward for the state s""" if s in self.reward: return self.reward[s] else: + # not specified in AIMA(3rd ed) return 0. # we don't know the value of the reward. 
def T(self, s, a): @@ -57,21 +62,19 @@ def program(self, percept): s1,r1 = percept mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa if s1 not in mdp.reward: # mdp.R also tracks the visited states - U[s1] = r1 - mdp.reward[s1] = r1 + U[s1] = mdp.reward[s1] = r1 if s is not None: Nsa[s][a] += 1 Ns_sa[s][a][s1] += 1 for t in Ns_sa[s][a]: if Ns_sa[s][a][t] > 0: - self.mdp.T_set((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) + self.mdp.T_set((s,a,t), Ns_sa[s][a][t]/Nsa[s][a]) U = policy_evaluation(self.pi, U, mdp) if s1 in mdp.terminals: - self.s, self.a = None, None - return False + self.s = self.a = None else: self.s, self.a = s1, self.pi[s1] - return self.a + return self.a class PassiveTDAgent(agents.Agent): """Passive (non-learning) agent that uses temporal differences to learn @@ -86,20 +89,24 @@ def __init__(self,mdp,pi,alpha=None): r = None, gamma = mdp.gamma, terminals = mdp.terminals) + if alpha is None: - alpha = lambda n: 60./(59+n) # page 837 + self.alpha = lambda n: 60./(59+n) # page 837 else: self.alpha = alpha - def program(self,percept): - s1,r1 = percept - pi,U,Ns,s,a,r = self.pi,self.U,self.Ns,self.s,self.a,self.r - alpha,gamma = self.alpha,self.gamma + + def program(self, percept): + s1, r1 = percept + pi, U, Ns, s, a, r = self.pi, self.U, self.Ns, self.s, self.a, self.r + alpha, gamma = self.alpha, self.gamma if s1 not in U: U[s1] = r1 if s is not None: Ns[s] += 1 - U[s] += alpha(Ns[s])*(r+gamma*U[s1]-U[s]) - if s in self.terminals: self.s,self.a,self.r = None,None,None - else: self.s,self.a,self.r = s1, pi[s1],r1 + U[s] += alpha(Ns[s]) * (r + gamma * U[s1] - U[s]) + if s in self.terminals: + self.s = self.a = self.r = None + else: + self.s, self.a, self.r = s1, pi[s1], r1 return self.a class QLearningAgent(agents.Agent): @@ -107,8 +114,8 @@ class QLearningAgent(agents.Agent): action-utility representation. [Fig. 21.8]""" def __init__(self,mdp,alpha=None,Ne=5,Rplus=2): update(self, - Q = {s:{a:0. for a in mdp.actlist} - for s in mdp.states if s not in mdp.terminals}, + Q = {s:{a:0. for a in mdp.actlist} if s not in mdp.terminals + else {None:0.} for s in mdp.states}, Nsa = {s:{a:0. 
for a in mdp.actlist} for s in mdp.states}, s = None, @@ -118,15 +125,13 @@ def __init__(self,mdp,alpha=None,Ne=5,Rplus=2): Rplus = Rplus, gamma = mdp.gamma, terminals = mdp.terminals) - - for s in mdp.terminals: self.Q[s] = {None:0.} if alpha is None: self.alpha = lambda n: 60./(59+n) # page 837 else: self.alpha = alpha - def f(self,u,n): # the exploration function in AIMA(3rd ed), pg 842 + def f(self,u,n): # exploration function in AIMA(3rd ed), pg 842 if n < self.Ne: return self.Rplus else: @@ -142,16 +147,15 @@ def program(self,percept): Nsa[s][a] += 1 Q[s][a] += alpha(Nsa[s][a])*(r+gamma*max(Q[s1].values())-Q[s][a]) if s1 in self.terminals: - self.s,self.a,self.r = None, None, None - return False + self.s = self.a = self.r = None else: - self.s,self.r = s1,r1 - self.a = argmax(Q[s1].keys(),lambda a1: f(Q[s1][a1],Nsa[s1][a1])) - return self.a + self.s, self.r = s1, r1 + self.a = argmax(Q[s1].keys(), lambda a1: f(Q[s1][a1],Nsa[s1][a1])) + return self.a # --- -def simulate(mdp,(s,a)): +def simulate_move(mdp,(s,a)): r = random() # 0 <= r <= 1 p,s1 = zip(*(mdp.T(s,a))) for i in range(len(p)): @@ -163,9 +167,9 @@ def execute_trial(agent,mdp): while True: current_reward = mdp.R(current_state) next_action = agent.program((current_state, current_reward)) - if next_action == False: + if next_action == None: break - current_state = simulate(mdp,(current_state, next_action)) + current_state = simulate_move(mdp,(current_state, next_action)) def demoPassiveADPAgent(): print '--------------------' @@ -219,7 +223,7 @@ def demoPassiveTDAgent(): time_start = time() trials = 100 - agent = PassiveADPAgent(Fig[17,1], policy) + agent = PassiveTDAgent(Fig[17,1], policy) for i in range (0,trials): execute_trial(agent,Fig[17,1]) time_end = time() @@ -236,7 +240,7 @@ def demoPassiveTDAgent(): def demoQLearningAgent(): print '--------------------' - print 'DEMO PassiveTDAgent' + print 'DEMO QLearningAgent' print '--------------------' # Setup values policy = {(0, 1): (0, 1), @@ -252,7 +256,7 @@ def demoQLearningAgent(): (0, 2): (1, 0)} time_start = time() - trials = 1000 + trials = 100 agent = QLearningAgent(Fig[17,1]) for i in range (0,trials): execute_trial(agent,Fig[17,1]) @@ -272,6 +276,6 @@ def demoQLearningAgent(): # --- if __name__ == '__main__': - demoPassiveADPAgent() + #demoPassiveADPAgent() demoPassiveTDAgent() - demoQLearningAgent() + #demoQLearningAgent() From 8890b6950d679f56fa064ea3e14a143aa171dde8 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sat, 8 Dec 2012 15:17:12 +0800 Subject: [PATCH 14/19] fixed a bug in PassiveTDAgent added an attribute: reached_states to track new states. 
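
The utility table U is initialised in __init__ with every state mapped to 0.,
so the old first-visit test "if s1 not in U" could never fire and the reward
observed at a newly reached state was never written into U; the end-of-trial
reset also checked the previous state s rather than the state s1 just reached.
The new reached_states set makes the first-visit bookkeeping explicit
(excerpt of the updated program(), using only names already defined in rl.py):

    if s1 not in self.reached_states:   # first time this state is observed
        self.reached_states.add(s1)
        U[s1] = r1                      # seed the estimate with the observed reward

and the reset now tests "if s1 in self.terminals", so a trial is properly
terminated once a terminal state is reached.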
--- rl.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/rl.py b/rl.py index 4313948..0d6bc92 100644 --- a/rl.py +++ b/rl.py @@ -88,7 +88,8 @@ def __init__(self,mdp,pi,alpha=None): a = None, r = None, gamma = mdp.gamma, - terminals = mdp.terminals) + terminals = mdp.terminals, + reached_states = set()) if alpha is None: self.alpha = lambda n: 60./(59+n) # page 837 @@ -99,11 +100,13 @@ def program(self, percept): s1, r1 = percept pi, U, Ns, s, a, r = self.pi, self.U, self.Ns, self.s, self.a, self.r alpha, gamma = self.alpha, self.gamma - if s1 not in U: U[s1] = r1 + if s1 not in self.reached_states: + self.reached_states.add(s1) + U[s1] = r1 if s is not None: Ns[s] += 1 U[s] += alpha(Ns[s]) * (r + gamma * U[s1] - U[s]) - if s in self.terminals: + if s1 in self.terminals: self.s = self.a = self.r = None else: self.s, self.a, self.r = s1, pi[s1], r1 From 9956cbaacc85c38955c12c783f14629d1d126cc5 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sat, 8 Dec 2012 23:14:53 +0800 Subject: [PATCH 15/19] refactored the code in the demo_functions --- rl.py | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/rl.py b/rl.py index 0d6bc92..438e4a1 100644 --- a/rl.py +++ b/rl.py @@ -197,14 +197,10 @@ def demoPassiveADPAgent(): execute_trial(agent,Fig[17,1]) time_end = time() - seconds_elapsed = time_end - time_start - minutes_elapsed = seconds_elapsed / 60.0 - final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) - for result in final_results: - print result - - print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print 'Executed %i trials' % trials + print 'Took %d seconds' % (time_end - time_start) + print 'Utilities: %s' % agent.U + print '\nCorrect Utilities (estimated by value iteration):' print value_iteration(Fig[17,1]) def demoPassiveTDAgent(): @@ -231,14 +227,10 @@ def demoPassiveTDAgent(): execute_trial(agent,Fig[17,1]) time_end = time() - seconds_elapsed = time_end - time_start - minutes_elapsed = seconds_elapsed / 60.0 - final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) - for result in final_results: - print result - - print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print 'Executed %i trials' % trials + print 'Took %d seconds' % (time_end - time_start) + print 'Utilities: %s' % agent.U + print '\nCorrect Utilities (estimated by value iteration):' print value_iteration(Fig[17,1]) def demoQLearningAgent(): @@ -264,21 +256,16 @@ def demoQLearningAgent(): for i in range (0,trials): execute_trial(agent,Fig[17,1]) time_end = time() - - seconds_elapsed = time_end - time_start - minutes_elapsed = seconds_elapsed / 60.0 - final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), - ('Utilities: %s' % {s:max(agent.Q[s].values()) for s in agent.Q})) - for result in final_results: - print result - print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print 'Executed %i trials' % trials + print 'Took %d seconds' % (time_end - time_start) + print 'Utilities: %s' % {s:max(agent.Q[s].values()) for s in agent.Q} + print '\nCorrect Utilities (estimated by value iteration):' print value_iteration(Fig[17,1]) # --- if __name__ == '__main__': - 
#demoPassiveADPAgent() + demoPassiveADPAgent() demoPassiveTDAgent() - #demoQLearningAgent() + demoQLearningAgent() From bb91e37da73ab23ef6fb05ebd6a46cfa18b2eb95 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sun, 12 May 2013 11:52:06 +0800 Subject: [PATCH 16/19] used get() for dict look-ups --- rl.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/rl.py b/rl.py index 438e4a1..8ddcb7f 100644 --- a/rl.py +++ b/rl.py @@ -23,11 +23,7 @@ def __init__(self, states, gamma, terminals): def R(self, s): """Return a numeric reward for the state s""" - if s in self.reward: - return self.reward[s] - else: - # not specified in AIMA(3rd ed) - return 0. # we don't know the value of the reward. + return self.reward.get(s, 0.) def T(self, s, a): """Returns a list of tuples with probabilities for states""" @@ -91,10 +87,10 @@ def __init__(self,mdp,pi,alpha=None): terminals = mdp.terminals, reached_states = set()) - if alpha is None: - self.alpha = lambda n: 60./(59+n) # page 837 - else: + if alpha: self.alpha = alpha + else: + self.alpha = lambda n: 60./(59+n) # page 837 def program(self, percept): s1, r1 = percept From 0713821c11fe26fb91d42a50f864711a4022bd22 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sun, 12 May 2013 12:22:22 +0800 Subject: [PATCH 17/19] normalize the files specified on commit and always convert them to LF on checkout --- .gitattributes | 2 +- images/dirt05-icon.jpg | Bin 1772 -> 1771 bytes 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 412eeda..930429e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,5 +1,5 @@ # Auto detect text files and perform LF normalization -* text=auto +* text eol=lf # Custom for Visual Studio *.cs diff=csharp diff --git a/images/dirt05-icon.jpg b/images/dirt05-icon.jpg index 38d02e97f84973f98b2b98ad5ca3a7a752986174..262c2b7cee5c34b08d466c3dc64120e4ded44892 100644 GIT binary patch delta 10 RcmaFE` Date: Sun, 12 May 2013 12:31:33 +0800 Subject: [PATCH 18/19] revert back to auto setting --- .gitattributes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 930429e..412eeda 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,5 +1,5 @@ # Auto detect text files and perform LF normalization -* text eol=lf +* text=auto # Custom for Visual Studio *.cs diff=csharp From 2ffc7ab2df89a21c205d5809bd46699d4fe613ea Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sun, 12 May 2013 12:39:37 +0800 Subject: [PATCH 19/19] normalize and convert .py files --- .gitattributes | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.gitattributes b/.gitattributes index 412eeda..f7c2ff5 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,6 +1,19 @@ # Auto detect text files and perform LF normalization * text=auto +# Explicitly declare text files we want to always be normalized and converted +# to native line endings on checkout. +*.c text +*.h text +*.py text + +# Declare files that will always have CRLF line endings on checkout. +*.sln text eol=crlf + +# Denote all files that are truly binary and should not be modified. +*.png binary +*.jpg binary + # Custom for Visual Studio *.cs diff=csharp *.sln merge=union
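
For quick verification of the learners added in this series, a minimal driver
along the following lines can be used. This is a sketch only: it assumes the
patched rl.py and mdp.py are importable and relies solely on names defined in
them (Fig[17,1], QLearningAgent, execute_trial, value_iteration).

    from mdp import Fig, value_iteration
    from rl import QLearningAgent, execute_trial

    agent = QLearningAgent(Fig[17,1], Ne=5, Rplus=2)   # defaults from rl.py
    for _ in range(100):                               # same trial count as the demos
        execute_trial(agent, Fig[17,1])

    # Derive state utilities from the learnt Q-values and compare them with
    # the value-iteration solution, as the demo functions do.
    print {s: max(agent.Q[s].values()) for s in agent.Q}
    print value_iteration(Fig[17,1], .01)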