From 348985ed79078ee2d903b08a60e0109b27722a46 Mon Sep 17 00:00:00 2001 From: Neal McBurnett Date: Sun, 29 Jul 2012 13:25:05 -0600 Subject: [PATCH 01/19] Add test for Queues: ensure member is not there after being popped --- utils.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/utils.py b/utils.py index c167589..dce9dfe 100644 --- a/utils.py +++ b/utils.py @@ -855,7 +855,13 @@ def fixup(test): ... q.extend(nums) ... for num in nums: assert num in q ... assert 42 not in q -... return [q.pop() for i in range(len(q))] +... result = [] +... for i in range(len(q)): +... num = q.pop() +... assert num not in q +... result.append(num) +... return result + >>> qtest(Stack()) [0, 3, 4, 99, -99, 6, 5, 7, 2, 8, 1] From 6c0eef780a024cbbc7f814c2407baadbaf20d906 Mon Sep 17 00:00:00 2001 From: Neal McBurnett Date: Sun, 29 Jul 2012 14:08:16 -0600 Subject: [PATCH 02/19] Speed up PriorityQueue.__contains__() from linear to constant time, fixing Issue 31 (quadratic search times). Note that __getitem__() and __delitem__() are still linear time, but could easily be improved also if necessary. --- utils.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/utils.py b/utils.py index dce9dfe..8356337 100644 --- a/utils.py +++ b/utils.py @@ -734,17 +734,22 @@ class PriorityQueue(Queue): Also supports dict-like lookup.""" def __init__(self, order=min, f=lambda x: x): update(self, A=[], order=order, f=f) + self.membership = {} def append(self, item): bisect.insort(self.A, (self.f(item), item)) + hashval = hash(item) + self.membership[hashval] = self.membership.get(hashval, 0) + 1 def __len__(self): return len(self.A) def pop(self): if self.order == min: - return self.A.pop(0)[1] + item = self.A.pop(0)[1] else: - return self.A.pop()[1] + item = self.A.pop()[1] + self._remove_(item) + return item def __contains__(self, item): - return some(lambda (_, x): x == item, self.A) + return hash(item) in self.membership def __getitem__(self, key): for _, item in self.A: if item == key: @@ -752,8 +757,15 @@ def __getitem__(self, key): def __delitem__(self, key): for i, (value, item) in enumerate(self.A): if item == key: - self.A.pop(i) + item = self.A.pop(i) + self._remove_(item) return + def _remove_(self, item): + hashval = hash(item) + self.membership[hashval] -= 1 + if self.membership[hashval] == 0: + del self.membership[hashval] + ## Fig: The idea is we can define things like Fig[3,10] later. ## Alas, it is Fig[3,10] not Fig[3.10], because that would be the same @@ -858,8 +870,9 @@ def fixup(test): ... result = [] ... for i in range(len(q)): ... num = q.pop() -... assert num not in q +... assert num not in q # num could appear multiple times, in which case this would fail ... result.append(num) +... ... return result >>> qtest(Stack()) From 3b81f1776fac34b7659caf8e8055c815a2a95864 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Thu, 6 Dec 2012 13:16:01 +0800 Subject: [PATCH 03/19] fixed bug in value iteration when gamma == 1 --- mdp.py | 343 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 172 insertions(+), 171 deletions(-) diff --git a/mdp.py b/mdp.py index e5142c1..d8057ce 100644 --- a/mdp.py +++ b/mdp.py @@ -1,171 +1,172 @@ -"""Markov Decision Processes (Chapter 17) - -First we define an MDP, and the special case of a GridMDP, in which -states are laid out in a 2-dimensional grid. We also represent a policy -as a dictionary of {state:action} pairs, and a Utility function as a -dictionary of {state:number} pairs. 
We then define the value_iteration -and policy_iteration algorithms.""" - -from utils import * - -class MDP: - """A Markov Decision Process, defined by an initial state, transition model, - and reward function. We also keep track of a gamma value, for use by - algorithms. The transition model is represented somewhat differently from - the text. Instead of P(s' | s, a) being a probability number for each - state/state/action triplet, we instead have T(s, a) return a list of (p, s') - pairs. We also keep track of the possible states, terminal states, and - actions for each state. [page 646]""" - - def __init__(self, init, actlist, terminals, gamma=.9): - update(self, init=init, actlist=actlist, terminals=terminals, - gamma=gamma, states=set(), reward={}) - - def R(self, state): - "Return a numeric reward for this state." - return self.reward[state] - - def T(self, state, action): - """Transition model. From a state and an action, return a list - of (probability, result-state) pairs.""" - abstract - - def actions(self, state): - """Set of actions that can be performed in this state. By default, a - fixed list of actions, except for terminal states. Override this - method if you need to specialize by state.""" - if state in self.terminals: - return [None] - else: - return self.actlist - -class GridMDP(MDP): - """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is - specify the grid as a list of lists of rewards; use None for an obstacle - (unreachable state). Also, you should specify the terminal states. - An action is an (x, y) unit vector; e.g. (1, 0) means move east.""" - def __init__(self, grid, terminals, init=(0, 0), gamma=.9): - grid.reverse() ## because we want row 0 on bottom, not on top - MDP.__init__(self, init, actlist=orientations, - terminals=terminals, gamma=gamma) - update(self, grid=grid, rows=len(grid), cols=len(grid[0])) - for x in range(self.cols): - for y in range(self.rows): - self.reward[x, y] = grid[y][x] - if grid[y][x] is not None: - self.states.add((x, y)) - - def T(self, state, action): - if action is None: - return [(0.0, state)] - else: - return [(0.8, self.go(state, action)), - (0.1, self.go(state, turn_right(action))), - (0.1, self.go(state, turn_left(action)))] - - def go(self, state, direction): - "Return the state that results from going in this direction." - state1 = vector_add(state, direction) - return if_(state1 in self.states, state1, state) - - def to_grid(self, mapping): - """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid.""" - return list(reversed([[mapping.get((x,y), None) - for x in range(self.cols)] - for y in range(self.rows)])) - - def to_arrows(self, policy): - chars = {(1, 0):'>', (0, 1):'^', (-1, 0):'<', (0, -1):'v', None: '.'} - return self.to_grid(dict([(s, chars[a]) for (s, a) in policy.items()])) - -#______________________________________________________________________________ - -Fig[17,1] = GridMDP([[-0.04, -0.04, -0.04, +1], - [-0.04, None, -0.04, -1], - [-0.04, -0.04, -0.04, -0.04]], - terminals=[(3, 2), (3, 1)]) - -#______________________________________________________________________________ - -def value_iteration(mdp, epsilon=0.001): - "Solving an MDP by value iteration. [Fig. 
17.4]" - U1 = dict([(s, 0) for s in mdp.states]) - R, T, gamma = mdp.R, mdp.T, mdp.gamma - while True: - U = U1.copy() - delta = 0 - for s in mdp.states: - U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)]) - for a in mdp.actions(s)]) - delta = max(delta, abs(U1[s] - U[s])) - if delta < epsilon * (1 - gamma) / gamma: - return U - -def best_policy(mdp, U): - """Given an MDP and a utility function U, determine the best policy, - as a mapping from state to action. (Equation 17.4)""" - pi = {} - for s in mdp.states: - pi[s] = argmax(mdp.actions(s), lambda a:expected_utility(a, s, U, mdp)) - return pi - -def expected_utility(a, s, U, mdp): - "The expected utility of doing a in state s, according to the MDP and U." - return sum([p * U[s1] for (p, s1) in mdp.T(s, a)]) - -#______________________________________________________________________________ - -def policy_iteration(mdp): - "Solve an MDP by policy iteration [Fig. 17.7]" - U = dict([(s, 0) for s in mdp.states]) - pi = dict([(s, random.choice(mdp.actions(s))) for s in mdp.states]) - while True: - U = policy_evaluation(pi, U, mdp) - unchanged = True - for s in mdp.states: - a = argmax(mdp.actions(s), lambda a: expected_utility(a,s,U,mdp)) - if a != pi[s]: - pi[s] = a - unchanged = False - if unchanged: - return pi - -def policy_evaluation(pi, U, mdp, k=20): - """Return an updated utility mapping U from each state in the MDP to its - utility, using an approximation (modified policy iteration).""" - R, T, gamma = mdp.R, mdp.T, mdp.gamma - for i in range(k): - for s in mdp.states: - U[s] = R(s) + gamma * sum([p * U[s1] for (p, s1) in T(s, pi[s])]) - return U - -__doc__ += """ ->>> pi = best_policy(Fig[17,1], value_iteration(Fig[17,1], .01)) - ->>> Fig[17,1].to_arrows(pi) -[['>', '>', '>', '.'], ['^', None, '^', '.'], ['^', '>', '^', '<']] - ->>> print_table(Fig[17,1].to_arrows(pi)) -> > > . -^ None ^ . -^ > ^ < - ->>> print_table(Fig[17,1].to_arrows(policy_iteration(Fig[17,1]))) -> > > . -^ None ^ . -^ > ^ < -""" - -__doc__ += random_tests(""" ->>> pi -{(3, 2): None, (3, 1): None, (3, 0): (-1, 0), (2, 1): (0, 1), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (0, 1), (1, 2): (1, 0), (2, 0): (0, 1), (0, 1): (0, 1), (2, 2): (1, 0)} - ->>> value_iteration(Fig[17,1], .01) -{(3, 2): 1.0, (3, 1): -1.0, (3, 0): 0.12958868267972745, (0, 1): 0.39810203830605462, (0, 2): 0.50928545646220924, (1, 0): 0.25348746162470537, (0, 0): 0.29543540628363629, (1, 2): 0.64958064617168676, (2, 0): 0.34461306281476806, (2, 1): 0.48643676237737926, (2, 2): 0.79536093684710951} - ->>> policy_iteration(Fig[17,1]) -{(3, 2): None, (3, 1): None, (3, 0): (0, -1), (2, 1): (-1, 0), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (1, 0), (1, 2): (1, 0), (2, 0): (1, 0), (0, 1): (1, 0), (2, 2): (1, 0)} - -""") - - +"""Markov Decision Processes (Chapter 17) + +First we define an MDP, and the special case of a GridMDP, in which +states are laid out in a 2-dimensional grid. We also represent a policy +as a dictionary of {state:action} pairs, and a Utility function as a +dictionary of {state:number} pairs. We then define the value_iteration +and policy_iteration algorithms.""" + +from utils import * + +class MDP: + """A Markov Decision Process, defined by an initial state, transition model, + and reward function. We also keep track of a gamma value, for use by + algorithms. The transition model is represented somewhat differently from + the text. 
Instead of P(s' | s, a) being a probability number for each + state/state/action triplet, we instead have T(s, a) return a list of (p, s') + pairs. We also keep track of the possible states, terminal states, and + actions for each state. [page 646]""" + + def __init__(self, init, actlist, terminals, gamma=.9): + update(self, init=init, actlist=actlist, terminals=terminals, + gamma=gamma, states=set(), reward={}) + + def R(self, state): + "Return a numeric reward for this state." + return self.reward[state] + + def T(self, state, action): + """Transition model. From a state and an action, return a list + of (probability, result-state) pairs.""" + abstract + + def actions(self, state): + """Set of actions that can be performed in this state. By default, a + fixed list of actions, except for terminal states. Override this + method if you need to specialize by state.""" + if state in self.terminals: + return [None] + else: + return self.actlist + +class GridMDP(MDP): + """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is + specify the grid as a list of lists of rewards; use None for an obstacle + (unreachable state). Also, you should specify the terminal states. + An action is an (x, y) unit vector; e.g. (1, 0) means move east.""" + def __init__(self, grid, terminals, init=(0, 0), gamma=.9): + grid.reverse() ## because we want row 0 on bottom, not on top + MDP.__init__(self, init, actlist=orientations, + terminals=terminals, gamma=gamma) + update(self, grid=grid, rows=len(grid), cols=len(grid[0])) + for x in range(self.cols): + for y in range(self.rows): + self.reward[x, y] = grid[y][x] + if grid[y][x] is not None: + self.states.add((x, y)) + + def T(self, state, action): + if action is None: + return [(0.0, state)] + else: + return [(0.8, self.go(state, action)), + (0.1, self.go(state, turn_right(action))), + (0.1, self.go(state, turn_left(action)))] + + def go(self, state, direction): + "Return the state that results from going in this direction." + state1 = vector_add(state, direction) + return if_(state1 in self.states, state1, state) + + def to_grid(self, mapping): + """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid.""" + return list(reversed([[mapping.get((x,y), None) + for x in range(self.cols)] + for y in range(self.rows)])) + + def to_arrows(self, policy): + chars = {(1, 0):'>', (0, 1):'^', (-1, 0):'<', (0, -1):'v', None: '.'} + return self.to_grid(dict([(s, chars[a]) for (s, a) in policy.items()])) + +#______________________________________________________________________________ + +Fig[17,1] = GridMDP([[-0.04, -0.04, -0.04, +1], + [-0.04, None, -0.04, -1], + [-0.04, -0.04, -0.04, -0.04]], + terminals=[(3, 2), (3, 1)]) + +#______________________________________________________________________________ + +def value_iteration(mdp, epsilon=0.001): + "Solving an MDP by value iteration. [Fig. 17.4]" + U1 = dict([(s, 0) for s in mdp.states]) + R, T, gamma = mdp.R, mdp.T, mdp.gamma + while True: + U = U1.copy() + delta = 0 + for s in mdp.states: + U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)]) + for a in mdp.actions(s)]) + delta = max(delta, abs(U1[s] - U[s])) + if (((gamma < 1) and (delta < epsilon * (1 - gamma) / gamma)) or + ((gamma == 1) and (delta < epsilon))): # allows for gamma to be 1 + return U + +def best_policy(mdp, U): + """Given an MDP and a utility function U, determine the best policy, + as a mapping from state to action. 
(Equation 17.4)""" + pi = {} + for s in mdp.states: + pi[s] = argmax(mdp.actions(s), lambda a:expected_utility(a, s, U, mdp)) + return pi + +def expected_utility(a, s, U, mdp): + "The expected utility of doing a in state s, according to the MDP and U." + return sum([p * U[s1] for (p, s1) in mdp.T(s, a)]) + +#______________________________________________________________________________ + +def policy_iteration(mdp): + "Solve an MDP by policy iteration [Fig. 17.7]" + U = dict([(s, 0) for s in mdp.states]) + pi = dict([(s, random.choice(mdp.actions(s))) for s in mdp.states]) + while True: + U = policy_evaluation(pi, U, mdp) + unchanged = True + for s in mdp.states: + a = argmax(mdp.actions(s), lambda a: expected_utility(a,s,U,mdp)) + if a != pi[s]: + pi[s] = a + unchanged = False + if unchanged: + return pi + +def policy_evaluation(pi, U, mdp, k=20): + """Return an updated utility mapping U from each state in the MDP to its + utility, using an approximation (modified policy iteration).""" + R, T, gamma = mdp.R, mdp.T, mdp.gamma + for i in range(k): + for s in mdp.states: + U[s] = R(s) + gamma * sum([p * U[s1] for (p, s1) in T(s, pi[s])]) + return U + +__doc__ += """ +>>> pi = best_policy(Fig[17,1], value_iteration(Fig[17,1], .01)) + +>>> Fig[17,1].to_arrows(pi) +[['>', '>', '>', '.'], ['^', None, '^', '.'], ['^', '>', '^', '<']] + +>>> print_table(Fig[17,1].to_arrows(pi)) +> > > . +^ None ^ . +^ > ^ < + +>>> print_table(Fig[17,1].to_arrows(policy_iteration(Fig[17,1]))) +> > > . +^ None ^ . +^ > ^ < +""" + +__doc__ += random_tests(""" +>>> pi +{(3, 2): None, (3, 1): None, (3, 0): (-1, 0), (2, 1): (0, 1), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (0, 1), (1, 2): (1, 0), (2, 0): (0, 1), (0, 1): (0, 1), (2, 2): (1, 0)} + +>>> value_iteration(Fig[17,1], .01) +{(3, 2): 1.0, (3, 1): -1.0, (3, 0): 0.12958868267972745, (0, 1): 0.39810203830605462, (0, 2): 0.50928545646220924, (1, 0): 0.25348746162470537, (0, 0): 0.29543540628363629, (1, 2): 0.64958064617168676, (2, 0): 0.34461306281476806, (2, 1): 0.48643676237737926, (2, 2): 0.79536093684710951} + +>>> policy_iteration(Fig[17,1]) +{(3, 2): None, (3, 1): None, (3, 0): (0, -1), (2, 1): (-1, 0), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (1, 0), (1, 2): (1, 0), (2, 0): (1, 0), (0, 1): (1, 0), (2, 2): (1, 0)} + +""") + + From 14eaf6ee54bf77202c92c2a5b83da7f5478baf4b Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Fri, 7 Dec 2012 19:48:14 +0800 Subject: [PATCH 04/19] added code for passive adp original code from Steve Klebanoff at https://github.com/steveklebanoff/AIMA-Python-Reinforcement-Learning/blob/master/passive_adp.py --- passive_adp.py | 318 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 318 insertions(+) create mode 100644 passive_adp.py diff --git a/passive_adp.py b/passive_adp.py new file mode 100644 index 0000000..bb55f25 --- /dev/null +++ b/passive_adp.py @@ -0,0 +1,318 @@ +import logging + +from mdp import GridMDP, MDP, value_iteration, policy_evaluation +from utils import turn_left, turn_right +from optparse import OptionParser +from random import randint +from time import time +from itertools import product + +class GridMDP(GridMDP): + + char_switch = { + '>' : (1,0), + '^' : (0,1), + '<' : (-1, 0), + '.' 
: None + } + + # TODO: this and the next should be static methods + def char_to_tuple(self, direction): + return self.char_switch[direction] + + def tuple_to_char(self, tuple): + for k,v in self.char_switch.items(): + if v == tuple: + return k + + return None + + def simulate_move(self, state, action): + # TODO: get percentages from T + random_number = randint(0, 100) + if (random_number >= 0) and (random_number <= 9): + return self.go(state, turn_right(action)) + elif (random_number >= 10) and (random_number <= 20): + return self.go(state, turn_left(action)) + else: + return self.go(state, action) + + +class MyMDP(MDP): + """ Extends MDP class to use a dictionary transistion model """ + + def __init__(self, init, actlist, terminals, gamma=.9): + MDP.__init__(self,init, actlist, terminals, gamma) + self.model = { } + + def R(self, state): + " Return a numeric reward for this state. " + if state in self.reward: + return self.reward[state] + else: + # TODO: this should really return zero? or return False beause we + # don't know. Returns 0 for now as it makes the value iteration + # function work + return 0 + #raise Exception('tried to get reward of state we dont have yet %s' % str(state)) + + def T(self, state, action): + " Returns a list of tuples with probabilities for states " + try: + possible_results_and_probabilities = self.model[state][action] + except KeyError: + return [] + + l = [] + for result_state, probability in possible_results_and_probabilities.items(): + l.append((probability, result_state)) + return l + + def T_add(self, state, action, result_state, probability): + " Adds a value to the transistion model " + if (state in self.model) and (action in self.model[state]): + self.model[state][action][result_state] = probability + elif (state in self.model): + self.model[state][action] = { result_state : probability } + else: + self.model[state] = {action : { result_state : probability} } + +class PassiveADPAgent(object): + + def __init__(self, action_mdp, policy): + self.mdp = MyMDP(init=(0, 0), + actlist=[(1,0), (0, 1), (-1, 0), (0, -1)], + terminals=action_mdp.terminals, + gamma = 0.9) + self.action_mdp = action_mdp + self.utility, self.outcome_freq = { }, { } + self.reached_states = set([]) + self.previous_state, self.previous_action = None, None + self.create_policy_and_states(policy) + self.create_empty_sa_freq() + + def create_empty_sa_freq(self): + " Creates state action frequences with inital values of 0 " + self.sa_freq = { } + for state in self.mdp.states: + self.sa_freq[state] = { } + for action in self.mdp.actlist: + self.sa_freq[state][action] = 0.0 + + def create_policy_and_states(self, policy): + " Sets the initial policy, and also sets the mdp's states " + self.policy = {} + self.mdp.states = set() + + ## Reverse because we want row 0 on bottom, not on top + policy.reverse() + self.rows, self.cols = len(policy), len(policy[0]) + for x in range(self.cols): + for y in range(self.rows): + # Convert arrows to numbers + if policy[y][x] == None: + self.policy[x, y] = None + else: + self.policy[x, y] = self.action_mdp.char_to_tuple(policy[y][x]) + + # States are all non-none values + if policy[y][x] is not None: + self.mdp.states.add((x, y)) + + def add_state_action_pair_frequency(self, state, action): + self.sa_freq[state][action] += 1 + + def get_state_action_pair_frequency(self, state, action): + return self.sa_freq[state][action] + + def add_outcome_frequency(self, state, action, outcome): + # We haven't seen this state yet + if state not in self.outcome_freq: + 
self.outcome_freq[state] = {action : {outcome : 1}} + return + + # We've seen the state but not the action + if action not in self.outcome_freq[state]: + self.outcome_freq[state][action] = {outcome : 1} + return + + # We've seen the state and the action, but not the outcome + if outcome not in self.outcome_freq[state][action]: + self.outcome_freq[state][action][outcome] = 1 + return + + # We've seen the state, action, and outcome, add 1 + self.outcome_freq[state][action][outcome] += 1 + + def get_outcome_frequency(self, state, action, outcome): + try: + return self.outcome_freq[state][action][outcome] + except KeyError: + return 0 + + def print_outcome_frequency(self): + for state in agent.outcome_freq: + for action in agent.outcome_freq[state]: + for result_state, result_frequency in agent.outcome_freq[state][action].items(): + print 'state', state, '\t action', action, \ + '\t result state',result_state, '\t frequency', result_frequency + + + def get_move_from_policy(self, state_x, state_y): + return self.policy[state_x][state_y] + + def next_action(self, current_state, current_reward): + # policy = self.policy computed by constructor + # MDP = mdp object. self.mdp + # MDP.T - transistion model (initially empty), + # MDP.reward - reward + # MDP gamma in initializer + # utility = dictionary [(0,0)] = 0.57 etc + # state action frequencies = sa_freq (dict) initially empty + # outcome frequencies given state outcome and state-action pairs = outcome_freq initially empty + # dict with key being new state, value being another dict with keys being + # state, action pairs and values being that percentage + # previous state, previous action = s,a + + # if s' is new then: + if (current_state not in self.reached_states): + # U[s'] <- r' + self.utility[current_state] = current_reward + + # R[s'] <- r' + self.mdp.reward[current_state] = current_reward + + # Make sure we know we have seen it before + self.reached_states.add(current_state) + + # if s is not null + if self.previous_state is not None: + # increment Nsa[s,a] and Ns'|sa[s', s, a] + self.add_state_action_pair_frequency(self.previous_state, self.previous_action) + self.add_outcome_frequency(self.previous_state, self.previous_action, current_state) + + # for each t such that Ns'|sa[t,s,a] is nonzero: + for state in agent.outcome_freq: + for action in agent.outcome_freq[state]: + for result_state, result_frequency in agent.outcome_freq[state][action].items(): + if result_frequency > 0: + # P (t, s, a) <- Ns'|sa[t, s, a] / Nsa[s,a] + # Update the model to be: + # ((freq of this action happening with this state action pair) + # / (total freq of this state action pair combo)) + probability = result_frequency / self.get_state_action_pair_frequency(state, action) + self.mdp.T_add(state, action, result_state, probability) + + self.utility = policy_evaluation(self.policy, self.utility, self.mdp) + + # if s'.TERMINAL? 
+ # If we're at a terminal we don't want a next move + if current_state in self.mdp.terminals: + logging.info('Reached terminal state %s' % str(current_state)) + # s,a <- null + self.previous_state, self.previous_action = None, None + return False + else: + # s,a <- s', policy[s'] + next_action = self.policy[current_state] + self.previous_state, self.previous_action = current_state, next_action + # Return the next action that the policy dictates + return next_action + + + def execute_trial(self): + # Start at initial state + current_state = self.mdp.init + + # Keep going until we get to a terminal state + while True: + logging.info('--------------------------') + + # Get reward for current state + current_reward = self.action_mdp.R(current_state) + + # Calculate move from current state + next_action = self.next_action(current_state, current_reward) + + logging.info('Current State: %s ' % str(current_state)) + logging.info('Current Reward: %s ' % current_reward) + logging.info('Next action: %s' % self.action_mdp.tuple_to_char(next_action)) + + if next_action == False: + # End because next_action told us to + logging.info('Next_action returned false, stopping') + break + + # Get new current_state + current_state = self.action_mdp.simulate_move(current_state, next_action) + +if __name__ == '__main__': + ''' Parses options from command line, creates Fig 17,1, runs the passive + adp agent on it certain amount of times, outputs info and utilities ''' + + # Setup file options + parser = OptionParser() + parser.add_option("-t", "--times", dest="times", type="int", default = 100, + help="times to run") + parser.add_option("-d", "--debug", action='store_true', dest="debug", + default=False, help="debug mode?") + parser.add_option("-i", "--info", action='store_true', dest="info", + default=False, help="info mode?") + parser.add_option("-f", "--file", dest="log_file", + default=False, help="file to log to") + (options, args) = parser.parse_args() + + if options.debug: + level = logging.DEBUG + elif options.info: + level = logging.INFO + else: + level = logging.CRITICAL + + format = '%(levelname)s: %(message)s' + if options.log_file: + logging.basicConfig(level=level, + filename=options.log_file, + filemode='w', + format=format) + else: + logging.basicConfig(level=level, + format=format) + + # Set up grid MDP to act on + Fig = {} + Fig[17,1] = GridMDP([[-0.04, -0.04, -0.04, +1.0], + [-0.04, None, -0.04, -1.0], + [-0.04, -0.04, -0.04, -0.04]], + terminals=[(3, 2), (3, 1)]) + + # Setup values + policy = [['>', '>', '>', '.'], + ['^', None, '^', '.'], + ['^', '<', '<', '<']] + + # Create agent + agent = PassiveADPAgent(Fig[17,1], policy) + + # Start timing + time_start = time() + logging.info('Start at %s' % time_start) + + # Execute a bunch of trials + trials = options.times + for i in range (0,trials): + agent.execute_trial() + + # End timing + time_end = time() + logging.info('End at %s' % time_end) + + seconds_elapsed = time_end - time_start + minutes_elapsed = seconds_elapsed / 60.0 + + # Print and log final results + final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ + ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.utility))) + for result in final_results: + logging.info(result) + print result From 36237b40cd6603310b9badfc6db49e57f2cf8b72 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Fri, 7 Dec 2012 22:40:21 +0800 Subject: [PATCH 05/19] refactored the code for passive_adp to look more like the pseudo-code, and removed the logging --- 
passive_adp.py | 378 ++++++++++++------------------------------------- 1 file changed, 94 insertions(+), 284 deletions(-) diff --git a/passive_adp.py b/passive_adp.py index bb55f25..626b9c0 100644 --- a/passive_adp.py +++ b/passive_adp.py @@ -1,54 +1,24 @@ -import logging - -from mdp import GridMDP, MDP, value_iteration, policy_evaluation -from utils import turn_left, turn_right -from optparse import OptionParser -from random import randint +from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig +from utils import turn_left, turn_right, update +from random import random from time import time -from itertools import product - -class GridMDP(GridMDP): - - char_switch = { - '>' : (1,0), - '^' : (0,1), - '<' : (-1, 0), - '.' : None - } - - # TODO: this and the next should be static methods - def char_to_tuple(self, direction): - return self.char_switch[direction] - - def tuple_to_char(self, tuple): - for k,v in self.char_switch.items(): - if v == tuple: - return k - - return None - - def simulate_move(self, state, action): - # TODO: get percentages from T - random_number = randint(0, 100) - if (random_number >= 0) and (random_number <= 9): - return self.go(state, turn_right(action)) - elif (random_number >= 10) and (random_number <= 20): - return self.go(state, turn_left(action)) - else: - return self.go(state, action) - class MyMDP(MDP): """ Extends MDP class to use a dictionary transistion model """ - - def __init__(self, init, actlist, terminals, gamma=.9): - MDP.__init__(self,init, actlist, terminals, gamma) - self.model = { } - - def R(self, state): - " Return a numeric reward for this state. " - if state in self.reward: - return self.reward[state] + def __init__(self, mdp): + MDP.__init__(self, + mdp.init, + mdp.actlist, + mdp.terminals, + mdp.gamma) + update(self, + P = {}, + states = mdp.states) + + def R(self, s): + """Return a numeric reward for the state s""" + if s in self.reward: + return self.reward[s] else: # TODO: this should really return zero? or return False beause we # don't know. 
Returns 0 for now as it makes the value iteration @@ -56,263 +26,103 @@ def R(self, state): return 0 #raise Exception('tried to get reward of state we dont have yet %s' % str(state)) - def T(self, state, action): - " Returns a list of tuples with probabilities for states " + def T(self, s, a): + """Returns a list of tuples with probabilities for states""" try: - possible_results_and_probabilities = self.model[state][action] + return [(p,s) for (s,p) in self.P[s][a].items()] except KeyError: - return [] - - l = [] - for result_state, probability in possible_results_and_probabilities.items(): - l.append((probability, result_state)) - return l + return [] # return an empty list - def T_add(self, state, action, result_state, probability): + def T_add(self, (s,a,t), p): " Adds a value to the transistion model " - if (state in self.model) and (action in self.model[state]): - self.model[state][action][result_state] = probability - elif (state in self.model): - self.model[state][action] = { result_state : probability } + if (s in self.P) and (a in self.P[s]): + self.P[s][a][t] = p + elif (s in self.P): + self.P[s][a] = {t:p} else: - self.model[state] = {action : { result_state : probability} } + self.P[s] = {a:{t:p}} class PassiveADPAgent(object): - def __init__(self, action_mdp, policy): - self.mdp = MyMDP(init=(0, 0), - actlist=[(1,0), (0, 1), (-1, 0), (0, -1)], - terminals=action_mdp.terminals, - gamma = 0.9) - self.action_mdp = action_mdp - self.utility, self.outcome_freq = { }, { } - self.reached_states = set([]) - self.previous_state, self.previous_action = None, None - self.create_policy_and_states(policy) - self.create_empty_sa_freq() - - def create_empty_sa_freq(self): - " Creates state action frequences with inital values of 0 " - self.sa_freq = { } - for state in self.mdp.states: - self.sa_freq[state] = { } - for action in self.mdp.actlist: - self.sa_freq[state][action] = 0.0 - - def create_policy_and_states(self, policy): - " Sets the initial policy, and also sets the mdp's states " - self.policy = {} - self.mdp.states = set() - - ## Reverse because we want row 0 on bottom, not on top - policy.reverse() - self.rows, self.cols = len(policy), len(policy[0]) - for x in range(self.cols): - for y in range(self.rows): - # Convert arrows to numbers - if policy[y][x] == None: - self.policy[x, y] = None - else: - self.policy[x, y] = self.action_mdp.char_to_tuple(policy[y][x]) - - # States are all non-none values - if policy[y][x] is not None: - self.mdp.states.add((x, y)) - - def add_state_action_pair_frequency(self, state, action): - self.sa_freq[state][action] += 1 - - def get_state_action_pair_frequency(self, state, action): - return self.sa_freq[state][action] - - def add_outcome_frequency(self, state, action, outcome): - # We haven't seen this state yet - if state not in self.outcome_freq: - self.outcome_freq[state] = {action : {outcome : 1}} - return - - # We've seen the state but not the action - if action not in self.outcome_freq[state]: - self.outcome_freq[state][action] = {outcome : 1} - return - - # We've seen the state and the action, but not the outcome - if outcome not in self.outcome_freq[state][action]: - self.outcome_freq[state][action][outcome] = 1 - return - - # We've seen the state, action, and outcome, add 1 - self.outcome_freq[state][action][outcome] += 1 - - def get_outcome_frequency(self, state, action, outcome): - try: - return self.outcome_freq[state][action][outcome] - except KeyError: - return 0 - - def print_outcome_frequency(self): - for state in 
agent.outcome_freq: - for action in agent.outcome_freq[state]: - for result_state, result_frequency in agent.outcome_freq[state][action].items(): - print 'state', state, '\t action', action, \ - '\t result state',result_state, '\t frequency', result_frequency - - - def get_move_from_policy(self, state_x, state_y): - return self.policy[state_x][state_y] - - def next_action(self, current_state, current_reward): - # policy = self.policy computed by constructor - # MDP = mdp object. self.mdp - # MDP.T - transistion model (initially empty), - # MDP.reward - reward - # MDP gamma in initializer - # utility = dictionary [(0,0)] = 0.57 etc - # state action frequencies = sa_freq (dict) initially empty - # outcome frequencies given state outcome and state-action pairs = outcome_freq initially empty - # dict with key being new state, value being another dict with keys being - # state, action pairs and values being that percentage - # previous state, previous action = s,a - - # if s' is new then: - if (current_state not in self.reached_states): - # U[s'] <- r' - self.utility[current_state] = current_reward - - # R[s'] <- r' - self.mdp.reward[current_state] = current_reward - - # Make sure we know we have seen it before - self.reached_states.add(current_state) - - # if s is not null - if self.previous_state is not None: - # increment Nsa[s,a] and Ns'|sa[s', s, a] - self.add_state_action_pair_frequency(self.previous_state, self.previous_action) - self.add_outcome_frequency(self.previous_state, self.previous_action, current_state) - - # for each t such that Ns'|sa[t,s,a] is nonzero: - for state in agent.outcome_freq: - for action in agent.outcome_freq[state]: - for result_state, result_frequency in agent.outcome_freq[state][action].items(): - if result_frequency > 0: - # P (t, s, a) <- Ns'|sa[t, s, a] / Nsa[s,a] - # Update the model to be: - # ((freq of this action happening with this state action pair) - # / (total freq of this state action pair combo)) - probability = result_frequency / self.get_state_action_pair_frequency(state, action) - self.mdp.T_add(state, action, result_state, probability) + def __init__(self, action_mdp, pi): + update(self, + pi = pi, + mdp = MyMDP(action_mdp), + action_mdp = action_mdp, + U = {}, + Ns_sa = {s:{a:{t:0 for (p,t) in action_mdp.T(s,a)} + for a in action_mdp.actlist} + for s in action_mdp.states}, + Nsa = {s:{a:0. for a in action_mdp.actlist} + for s in action_mdp.states}, + s = None, + a = None) - self.utility = policy_evaluation(self.policy, self.utility, self.mdp) - - # if s'.TERMINAL? 
- # If we're at a terminal we don't want a next move - if current_state in self.mdp.terminals: - logging.info('Reached terminal state %s' % str(current_state)) - # s,a <- null - self.previous_state, self.previous_action = None, None + def program(self, s1, r1): + mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa + if s1 not in mdp.reward: # mdp.reward also tracks the visited states + U[s1] = r1 + mdp.reward[s1] = r1 + if s is not None: + Nsa[s][a] += 1 + Ns_sa[s][a][s1] += 1 + for t in Ns_sa[s][a]: + if Ns_sa[s][a][t] > 0: + self.mdp.T_add((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) + U = policy_evaluation(self.pi, U, mdp) + if s1 in mdp.terminals: + self.s, self.a = None, None return False else: - # s,a <- s', policy[s'] - next_action = self.policy[current_state] - self.previous_state, self.previous_action = current_state, next_action - # Return the next action that the policy dictates - return next_action - - - def execute_trial(self): - # Start at initial state - current_state = self.mdp.init - - # Keep going until we get to a terminal state - while True: - logging.info('--------------------------') - - # Get reward for current state - current_reward = self.action_mdp.R(current_state) - - # Calculate move from current state - next_action = self.next_action(current_state, current_reward) - - logging.info('Current State: %s ' % str(current_state)) - logging.info('Current Reward: %s ' % current_reward) - logging.info('Next action: %s' % self.action_mdp.tuple_to_char(next_action)) - - if next_action == False: - # End because next_action told us to - logging.info('Next_action returned false, stopping') - break - - # Get new current_state - current_state = self.action_mdp.simulate_move(current_state, next_action) - -if __name__ == '__main__': - ''' Parses options from command line, creates Fig 17,1, runs the passive - adp agent on it certain amount of times, outputs info and utilities ''' - - # Setup file options - parser = OptionParser() - parser.add_option("-t", "--times", dest="times", type="int", default = 100, - help="times to run") - parser.add_option("-d", "--debug", action='store_true', dest="debug", - default=False, help="debug mode?") - parser.add_option("-i", "--info", action='store_true', dest="info", - default=False, help="info mode?") - parser.add_option("-f", "--file", dest="log_file", - default=False, help="file to log to") - (options, args) = parser.parse_args() - - if options.debug: - level = logging.DEBUG - elif options.info: - level = logging.INFO - else: - level = logging.CRITICAL - - format = '%(levelname)s: %(message)s' - if options.log_file: - logging.basicConfig(level=level, - filename=options.log_file, - filemode='w', - format=format) - else: - logging.basicConfig(level=level, - format=format) - - # Set up grid MDP to act on - Fig = {} - Fig[17,1] = GridMDP([[-0.04, -0.04, -0.04, +1.0], - [-0.04, None, -0.04, -1.0], - [-0.04, -0.04, -0.04, -0.04]], - terminals=[(3, 2), (3, 1)]) - + self.s, self.a = s1, self.pi[s1] + return self.a + +def simulate(mdp,(s,a)): + r = random() # 0 <= r <= 1 + p,s1 = zip(*(mdp.T(s,a))) + for i in range(len(p)): + if sum(p[:i+1]) >= r: + return s1[i] + +def execute_trial(agent,mdp): + current_state = agent.mdp.init + while True: + current_reward = mdp.R(current_state) + next_action = agent.program(current_state, current_reward) + if next_action == False: + break + current_state = simulate(mdp,(current_state, next_action)) + +def demoPassiveADPAgent(): + print 'DEMO PassiveADPAgent' + print '--------------------' # Setup 
values - policy = [['>', '>', '>', '.'], - ['^', None, '^', '.'], - ['^', '<', '<', '<']] + policy = {(0, 1): (0, 1), + (1, 2): (1, 0), + (3, 2): None, + (0, 0): (0, 1), + (3, 0): (-1, 0), + (3, 1): None, + (2, 1): (0, 1), + (2, 0): (0, 1), + (2, 2): (1, 0), + (1, 0): (1, 0), + (0, 2): (1, 0)} # Create agent - agent = PassiveADPAgent(Fig[17,1], policy) - - # Start timing time_start = time() - logging.info('Start at %s' % time_start) - - # Execute a bunch of trials - trials = options.times + trials = 100 + agent = PassiveADPAgent(Fig[17,1], policy) for i in range (0,trials): - agent.execute_trial() - - # End timing + execute_trial(agent,Fig[17,1]) time_end = time() - logging.info('End at %s' % time_end) seconds_elapsed = time_end - time_start minutes_elapsed = seconds_elapsed / 60.0 - - # Print and log final results final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.utility))) + ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) for result in final_results: - logging.info(result) print result + +if __name__ == '__main__': + demoPassiveADPAgent() From 27d1745017addc5e126dd4a91b9229e994d59b6c Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Fri, 7 Dec 2012 23:14:46 +0800 Subject: [PATCH 06/19] moved the LearntMDP class into PassiveADP since it's a model of the mdp that is internal to the passive adp agent. --- passive_adp.py | 78 ++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 44 deletions(-) diff --git a/passive_adp.py b/passive_adp.py index 626b9c0..f89ae86 100644 --- a/passive_adp.py +++ b/passive_adp.py @@ -1,53 +1,43 @@ from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig -from utils import turn_left, turn_right, update +from utils import update from random import random from time import time +import agents -class MyMDP(MDP): - """ Extends MDP class to use a dictionary transistion model """ - def __init__(self, mdp): - MDP.__init__(self, - mdp.init, - mdp.actlist, - mdp.terminals, - mdp.gamma) - update(self, - P = {}, - states = mdp.states) - - def R(self, s): - """Return a numeric reward for the state s""" - if s in self.reward: - return self.reward[s] - else: - # TODO: this should really return zero? or return False beause we - # don't know. Returns 0 for now as it makes the value iteration - # function work - return 0 - #raise Exception('tried to get reward of state we dont have yet %s' % str(state)) - - def T(self, s, a): - """Returns a list of tuples with probabilities for states""" - try: - return [(p,s) for (s,p) in self.P[s][a].items()] - except KeyError: - return [] # return an empty list +class PassiveADPAgent(agents.Agent): + """Passive (non-learning) agent that uses adaptive dynamic programming + on a given MDP and policy. [Fig. 21.2]""" + class LearntMDP: + def __init__(self, states, gamma, terminals): + update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) + + def R(self, s): + """Return a numeric reward for the state s""" + if s in self.reward: + return self.reward[s] + else: + return 0. # we don't know the value of the reward. 
+ + def T(self, s, a): + """Returns a list of tuples with probabilities for states""" + try: + return [(p,s) for (s,p) in self.P[s][a].items()] + except KeyError: + return [] + + def T_add(self, (s,a,t), p): + " Adds a value to the transistion model " + if (s in self.P) and (a in self.P[s]): + self.P[s][a][t] = p + elif (s in self.P): + self.P[s][a] = {t:p} + else: + self.P[s] = {a:{t:p}} - def T_add(self, (s,a,t), p): - " Adds a value to the transistion model " - if (s in self.P) and (a in self.P[s]): - self.P[s][a][t] = p - elif (s in self.P): - self.P[s][a] = {t:p} - else: - self.P[s] = {a:{t:p}} - -class PassiveADPAgent(object): - def __init__(self, action_mdp, pi): update(self, pi = pi, - mdp = MyMDP(action_mdp), + mdp = self.LearntMDP(action_mdp.states,action_mdp.gamma,action_mdp.terminals), action_mdp = action_mdp, U = {}, Ns_sa = {s:{a:{t:0 for (p,t) in action_mdp.T(s,a)} @@ -60,7 +50,7 @@ def __init__(self, action_mdp, pi): def program(self, s1, r1): mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa - if s1 not in mdp.reward: # mdp.reward also tracks the visited states + if s1 not in mdp.reward: # mdp.R also tracks the visited states U[s1] = r1 mdp.reward[s1] = r1 if s is not None: @@ -85,7 +75,7 @@ def simulate(mdp,(s,a)): return s1[i] def execute_trial(agent,mdp): - current_state = agent.mdp.init + current_state = mdp.init while True: current_reward = mdp.R(current_state) next_action = agent.program(current_state, current_reward) From ec27f8cfffc29ae56222ad5c32a0573cbfdee1fe Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Fri, 7 Dec 2012 23:23:33 +0800 Subject: [PATCH 07/19] moved the code for passive_adp into rl.py --- rl.py | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 130 insertions(+), 15 deletions(-) diff --git a/rl.py b/rl.py index fc0e2c9..89ab75b 100644 --- a/rl.py +++ b/rl.py @@ -1,15 +1,130 @@ -"""Reinforcement Learning (Chapter 21) -""" - -from utils import * -import agents - -class PassiveADPAgent(agents.Agent): - """Passive (non-learning) agent that uses adaptive dynamic programming - on a given MDP and policy. [Fig. 21.2]""" - NotImplemented - -class PassiveTDAgent(agents.Agent): - """Passive (non-learning) agent that uses temporal differences to learn - utility estimates. [Fig. 21.4]""" - NotImplemented +"""Reinforcement Learning (Chapter 21) +""" + +from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig +from utils import update +from random import random +from time import time +import agents + +class PassiveADPAgent(agents.Agent): + """Passive (non-learning) agent that uses adaptive dynamic programming + on a given MDP and policy. [Fig. 21.2]""" + class LearntMDP: + """a model of the original mdp that the PassiveADP is trying to learn""" + def __init__(self, states, gamma, terminals): + update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) + + def R(self, s): + """Return a numeric reward for the state s""" + if s in self.reward: + return self.reward[s] + else: + return 0. # we don't know the value of the reward. 
+ + def T(self, s, a): + """Returns a list of tuples with probabilities for states""" + try: + return [(p,s) for (s,p) in self.P[s][a].items()] + except KeyError: + return [] + + def T_set(self, (s,a,t), p): + " Adds a value to the transistion model " + if (s in self.P) and (a in self.P[s]): + self.P[s][a][t] = p + elif (s in self.P): + self.P[s][a] = {t:p} + else: + self.P[s] = {a:{t:p}} + + def __init__(self, action_mdp, pi): + update(self, + pi = pi, + mdp = self.LearntMDP(action_mdp.states,action_mdp.gamma,action_mdp.terminals), + action_mdp = action_mdp, + U = {}, + Ns_sa = {s:{a:{t:0 for (p,t) in action_mdp.T(s,a)} + for a in action_mdp.actlist} + for s in action_mdp.states}, + Nsa = {s:{a:0. for a in action_mdp.actlist} + for s in action_mdp.states}, + s = None, + a = None) + + def program(self, s1, r1): + mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa + if s1 not in mdp.reward: # mdp.R also tracks the visited states + U[s1] = r1 + mdp.reward[s1] = r1 + if s is not None: + Nsa[s][a] += 1 + Ns_sa[s][a][s1] += 1 + for t in Ns_sa[s][a]: + if Ns_sa[s][a][t] > 0: + self.mdp.T_set((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) + U = policy_evaluation(self.pi, U, mdp) + if s1 in mdp.terminals: + self.s, self.a = None, None + return False + else: + self.s, self.a = s1, self.pi[s1] + return self.a + +def simulate(mdp,(s,a)): + r = random() # 0 <= r <= 1 + p,s1 = zip(*(mdp.T(s,a))) + for i in range(len(p)): + if sum(p[:i+1]) >= r: + return s1[i] + +def execute_trial(agent,mdp): + current_state = mdp.init + while True: + current_reward = mdp.R(current_state) + next_action = agent.program(current_state, current_reward) + if next_action == False: + break + current_state = simulate(mdp,(current_state, next_action)) + +def demoPassiveADPAgent(): + print 'DEMO PassiveADPAgent' + print '--------------------' + # Setup values + policy = {(0, 1): (0, 1), + (1, 2): (1, 0), + (3, 2): None, + (0, 0): (0, 1), + (3, 0): (-1, 0), + (3, 1): None, + (2, 1): (0, 1), + (2, 0): (0, 1), + (2, 2): (1, 0), + (1, 0): (1, 0), + (0, 2): (1, 0)} + + # Create agent + time_start = time() + trials = 100 + agent = PassiveADPAgent(Fig[17,1], policy) + for i in range (0,trials): + execute_trial(agent,Fig[17,1]) + time_end = time() + + seconds_elapsed = time_end - time_start + minutes_elapsed = seconds_elapsed / 60.0 + final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ + ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) + for result in final_results: + print result + + print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print value_iteration(Fig[17,1]) + +class PassiveTDAgent(agents.Agent): + """Passive (non-learning) agent that uses temporal differences to learn + utility estimates. [Fig. 21.4]""" + NotImplemented + +if __name__ == '__main__': + demoPassiveADPAgent() From 5002382c448086d9bc2bb2c7dce5d147583a9c84 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Fri, 7 Dec 2012 23:27:20 +0800 Subject: [PATCH 08/19] added docstring for LearntMDP --- passive_adp.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/passive_adp.py b/passive_adp.py index f89ae86..eb64f8b 100644 --- a/passive_adp.py +++ b/passive_adp.py @@ -8,6 +8,7 @@ class PassiveADPAgent(agents.Agent): """Passive (non-learning) agent that uses adaptive dynamic programming on a given MDP and policy. [Fig. 
21.2]""" class LearntMDP: + """a model of the original mdp that the PassiveADP is trying to learn""" def __init__(self, states, gamma, terminals): update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) @@ -25,7 +26,7 @@ def T(self, s, a): except KeyError: return [] - def T_add(self, (s,a,t), p): + def T_set(self, (s,a,t), p): " Adds a value to the transistion model " if (s in self.P) and (a in self.P[s]): self.P[s][a][t] = p @@ -58,7 +59,7 @@ def program(self, s1, r1): Ns_sa[s][a][s1] += 1 for t in Ns_sa[s][a]: if Ns_sa[s][a][t] > 0: - self.mdp.T_add((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) + self.mdp.T_set((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) U = policy_evaluation(self.pi, U, mdp) if s1 in mdp.terminals: self.s, self.a = None, None @@ -114,5 +115,8 @@ def demoPassiveADPAgent(): for result in final_results: print result + print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print value_iteration(Fig[17,1]) + if __name__ == '__main__': demoPassiveADPAgent() From e7cbcafdcff2fdc6397dc53ec0ccc4d8562be639 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Fri, 7 Dec 2012 23:32:56 +0800 Subject: [PATCH 09/19] removed passive_adp.py and merged it into rl.py --- passive_adp.py | 122 ------------------------------------------------- 1 file changed, 122 deletions(-) delete mode 100644 passive_adp.py diff --git a/passive_adp.py b/passive_adp.py deleted file mode 100644 index eb64f8b..0000000 --- a/passive_adp.py +++ /dev/null @@ -1,122 +0,0 @@ -from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig -from utils import update -from random import random -from time import time -import agents - -class PassiveADPAgent(agents.Agent): - """Passive (non-learning) agent that uses adaptive dynamic programming - on a given MDP and policy. [Fig. 21.2]""" - class LearntMDP: - """a model of the original mdp that the PassiveADP is trying to learn""" - def __init__(self, states, gamma, terminals): - update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) - - def R(self, s): - """Return a numeric reward for the state s""" - if s in self.reward: - return self.reward[s] - else: - return 0. # we don't know the value of the reward. - - def T(self, s, a): - """Returns a list of tuples with probabilities for states""" - try: - return [(p,s) for (s,p) in self.P[s][a].items()] - except KeyError: - return [] - - def T_set(self, (s,a,t), p): - " Adds a value to the transistion model " - if (s in self.P) and (a in self.P[s]): - self.P[s][a][t] = p - elif (s in self.P): - self.P[s][a] = {t:p} - else: - self.P[s] = {a:{t:p}} - - def __init__(self, action_mdp, pi): - update(self, - pi = pi, - mdp = self.LearntMDP(action_mdp.states,action_mdp.gamma,action_mdp.terminals), - action_mdp = action_mdp, - U = {}, - Ns_sa = {s:{a:{t:0 for (p,t) in action_mdp.T(s,a)} - for a in action_mdp.actlist} - for s in action_mdp.states}, - Nsa = {s:{a:0. 
for a in action_mdp.actlist} - for s in action_mdp.states}, - s = None, - a = None) - - def program(self, s1, r1): - mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa - if s1 not in mdp.reward: # mdp.R also tracks the visited states - U[s1] = r1 - mdp.reward[s1] = r1 - if s is not None: - Nsa[s][a] += 1 - Ns_sa[s][a][s1] += 1 - for t in Ns_sa[s][a]: - if Ns_sa[s][a][t] > 0: - self.mdp.T_set((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) - U = policy_evaluation(self.pi, U, mdp) - if s1 in mdp.terminals: - self.s, self.a = None, None - return False - else: - self.s, self.a = s1, self.pi[s1] - return self.a - -def simulate(mdp,(s,a)): - r = random() # 0 <= r <= 1 - p,s1 = zip(*(mdp.T(s,a))) - for i in range(len(p)): - if sum(p[:i+1]) >= r: - return s1[i] - -def execute_trial(agent,mdp): - current_state = mdp.init - while True: - current_reward = mdp.R(current_state) - next_action = agent.program(current_state, current_reward) - if next_action == False: - break - current_state = simulate(mdp,(current_state, next_action)) - -def demoPassiveADPAgent(): - print 'DEMO PassiveADPAgent' - print '--------------------' - # Setup values - policy = {(0, 1): (0, 1), - (1, 2): (1, 0), - (3, 2): None, - (0, 0): (0, 1), - (3, 0): (-1, 0), - (3, 1): None, - (2, 1): (0, 1), - (2, 0): (0, 1), - (2, 2): (1, 0), - (1, 0): (1, 0), - (0, 2): (1, 0)} - - # Create agent - time_start = time() - trials = 100 - agent = PassiveADPAgent(Fig[17,1], policy) - for i in range (0,trials): - execute_trial(agent,Fig[17,1]) - time_end = time() - - seconds_elapsed = time_end - time_start - minutes_elapsed = seconds_elapsed / 60.0 - final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) - for result in final_results: - print result - - print '\nCorrect Utilities (estimated by value iteration, for comparison):' - print value_iteration(Fig[17,1]) - -if __name__ == '__main__': - demoPassiveADPAgent() From 2e4d9a4e68376b82a1e59ba641eeea6393505ec7 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sat, 8 Dec 2012 09:15:55 +0800 Subject: [PATCH 10/19] added code for PassiveTDAgent --- rl.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 69 insertions(+), 12 deletions(-) diff --git a/rl.py b/rl.py index 89ab75b..9147ff4 100644 --- a/rl.py +++ b/rl.py @@ -38,21 +38,21 @@ def T_set(self, (s,a,t), p): else: self.P[s] = {a:{t:p}} - def __init__(self, action_mdp, pi): + def __init__(self, mdp, pi): update(self, pi = pi, - mdp = self.LearntMDP(action_mdp.states,action_mdp.gamma,action_mdp.terminals), - action_mdp = action_mdp, + mdp = self.LearntMDP(mdp.states,mdp.gamma,mdp.terminals), U = {}, - Ns_sa = {s:{a:{t:0 for (p,t) in action_mdp.T(s,a)} - for a in action_mdp.actlist} - for s in action_mdp.states}, - Nsa = {s:{a:0. for a in action_mdp.actlist} - for s in action_mdp.states}, + Ns_sa = {s:{a:{t:0 for (p,t) in mdp.T(s,a)} + for a in mdp.actlist} + for s in mdp.states}, + Nsa = {s:{a:0. 
for a in mdp.actlist} + for s in mdp.states}, s = None, a = None) - def program(self, s1, r1): + def program(self, percept): + s1,r1 = percept mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa if s1 not in mdp.reward: # mdp.R also tracks the visited states U[s1] = r1 @@ -82,7 +82,7 @@ def execute_trial(agent,mdp): current_state = mdp.init while True: current_reward = mdp.R(current_state) - next_action = agent.program(current_state, current_reward) + next_action = agent.program((current_state, current_reward)) if next_action == False: break current_state = simulate(mdp,(current_state, next_action)) @@ -124,7 +124,64 @@ def demoPassiveADPAgent(): class PassiveTDAgent(agents.Agent): """Passive (non-learning) agent that uses temporal differences to learn utility estimates. [Fig. 21.4]""" - NotImplemented + def __init__(self,mdp,pi,alpha=None): + update(self, + pi = pi, + U = {s:0. for s in mdp.states}, + Ns = {s:0 for s in mdp.states}, + s = None, + a = None, + r = None, + gamma = mdp.gamma, + terminals = mdp.terminals) + if alpha is None: + alpha = lambda n: 60./(59+n) # page 837 + def program(self,percept): + s1,r1 = percept + pi,U,Ns,s,a,r = self.pi,self.U,self.Ns,self.s,self.a,self.r + alpha,gamma = self.alpha,self.gamma + if s1 not in U: U[s1] = r1 + if s is not None: + Ns[s] += 1 + U[s] = U[s] + alpha(Ns[s])*(r+gamma*U[s1]-U[s]) + if s in self.terminals: self.s,self.a,self.r = None,None,None + else: self.s,self.a,self.r = s1, pi[s1],r1 + return self.a + +def demoPassiveTDAgent(): + print 'DEMO PassiveTDAgent' + print '--------------------' + # Setup values + policy = {(0, 1): (0, 1), + (1, 2): (1, 0), + (3, 2): None, + (0, 0): (0, 1), + (3, 0): (-1, 0), + (3, 1): None, + (2, 1): (0, 1), + (2, 0): (0, 1), + (2, 2): (1, 0), + (1, 0): (1, 0), + (0, 2): (1, 0)} + + # Create agent + time_start = time() + trials = 100 + agent = PassiveADPAgent(Fig[17,1], policy) + for i in range (0,trials): + execute_trial(agent,Fig[17,1]) + time_end = time() + + seconds_elapsed = time_end - time_start + minutes_elapsed = seconds_elapsed / 60.0 + final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ + ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) + for result in final_results: + print result + + print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print value_iteration(Fig[17,1]) if __name__ == '__main__': - demoPassiveADPAgent() + #demoPassiveADPAgent() + demoPassiveTDAgent() From 72f3e2c66228561b5aa12f87883f779030b177b7 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sat, 8 Dec 2012 09:44:23 +0800 Subject: [PATCH 11/19] include a new class: QLearningAgent --- .gitattributes | 22 +++ .gitignore | 163 +++++++++++++++++++++ mdp.py | 344 ++++++++++++++++++++++---------------------- rl.py | 379 +++++++++++++++++++++++++------------------------ 4 files changed, 549 insertions(+), 359 deletions(-) create mode 100644 .gitattributes create mode 100644 .gitignore diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..412eeda --- /dev/null +++ b/.gitattributes @@ -0,0 +1,22 @@ +# Auto detect text files and perform LF normalization +* text=auto + +# Custom for Visual Studio +*.cs diff=csharp +*.sln merge=union +*.csproj merge=union +*.vbproj merge=union +*.fsproj merge=union +*.dbproj merge=union + +# Standard to msysgit +*.doc diff=astextplain +*.DOC diff=astextplain +*.docx diff=astextplain +*.DOCX diff=astextplain +*.dot diff=astextplain +*.DOT diff=astextplain +*.pdf 
diff=astextplain +*.PDF diff=astextplain +*.rtf diff=astextplain +*.RTF diff=astextplain diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..5ebd21a --- /dev/null +++ b/.gitignore @@ -0,0 +1,163 @@ +################# +## Eclipse +################# + +*.pydevproject +.project +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.classpath +.settings/ +.loadpath + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# CDT-specific +.cproject + +# PDT-specific +.buildpath + + +################# +## Visual Studio +################# + +## Ignore Visual Studio temporary files, build results, and +## files generated by popular Visual Studio add-ons. + +# User-specific files +*.suo +*.user +*.sln.docstates + +# Build results +[Dd]ebug/ +[Rr]elease/ +*_i.c +*_p.c +*.ilk +*.meta +*.obj +*.pch +*.pdb +*.pgc +*.pgd +*.rsp +*.sbr +*.tlb +*.tli +*.tlh +*.tmp +*.vspscc +.builds +*.dotCover + +## TODO: If you have NuGet Package Restore enabled, uncomment this +#packages/ + +# Visual C++ cache files +ipch/ +*.aps +*.ncb +*.opensdf +*.sdf + +# Visual Studio profiler +*.psess +*.vsp + +# ReSharper is a .NET coding add-in +_ReSharper* + +# Installshield output folder +[Ee]xpress + +# DocProject is a documentation generator add-in +DocProject/buildhelp/ +DocProject/Help/*.HxT +DocProject/Help/*.HxC +DocProject/Help/*.hhc +DocProject/Help/*.hhk +DocProject/Help/*.hhp +DocProject/Help/Html2 +DocProject/Help/html + +# Click-Once directory +publish + +# Others +[Bb]in +[Oo]bj +sql +TestResults +*.Cache +ClientBin +stylecop.* +~$* +*.dbmdl +Generated_Code #added for RIA/Silverlight projects + +# Backup & report files from converting an old project file to a newer +# Visual Studio version. Backup files are not needed, because we have git ;-) +_UpgradeReport_Files/ +Backup*/ +UpgradeLog*.XML + + + +############ +## Windows +############ + +# Windows image file caches +Thumbs.db + +# Folder config file +Desktop.ini + + +############# +## Python +############# + +*.py[co] + +# Packages +*.egg +*.egg-info +dist +build +eggs +parts +bin +var +sdist +develop-eggs +.installed.cfg + +# Installer logs +pip-log.txt + +# Unit test / coverage reports +.coverage +.tox + +#Translations +*.mo + +#Mr Developer +.mr.developer.cfg + +# Mac crap +.DS_Store diff --git a/mdp.py b/mdp.py index d8057ce..0048843 100644 --- a/mdp.py +++ b/mdp.py @@ -1,172 +1,172 @@ -"""Markov Decision Processes (Chapter 17) - -First we define an MDP, and the special case of a GridMDP, in which -states are laid out in a 2-dimensional grid. We also represent a policy -as a dictionary of {state:action} pairs, and a Utility function as a -dictionary of {state:number} pairs. We then define the value_iteration -and policy_iteration algorithms.""" - -from utils import * - -class MDP: - """A Markov Decision Process, defined by an initial state, transition model, - and reward function. We also keep track of a gamma value, for use by - algorithms. The transition model is represented somewhat differently from - the text. Instead of P(s' | s, a) being a probability number for each - state/state/action triplet, we instead have T(s, a) return a list of (p, s') - pairs. We also keep track of the possible states, terminal states, and - actions for each state. 
[page 646]""" - - def __init__(self, init, actlist, terminals, gamma=.9): - update(self, init=init, actlist=actlist, terminals=terminals, - gamma=gamma, states=set(), reward={}) - - def R(self, state): - "Return a numeric reward for this state." - return self.reward[state] - - def T(self, state, action): - """Transition model. From a state and an action, return a list - of (probability, result-state) pairs.""" - abstract - - def actions(self, state): - """Set of actions that can be performed in this state. By default, a - fixed list of actions, except for terminal states. Override this - method if you need to specialize by state.""" - if state in self.terminals: - return [None] - else: - return self.actlist - -class GridMDP(MDP): - """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is - specify the grid as a list of lists of rewards; use None for an obstacle - (unreachable state). Also, you should specify the terminal states. - An action is an (x, y) unit vector; e.g. (1, 0) means move east.""" - def __init__(self, grid, terminals, init=(0, 0), gamma=.9): - grid.reverse() ## because we want row 0 on bottom, not on top - MDP.__init__(self, init, actlist=orientations, - terminals=terminals, gamma=gamma) - update(self, grid=grid, rows=len(grid), cols=len(grid[0])) - for x in range(self.cols): - for y in range(self.rows): - self.reward[x, y] = grid[y][x] - if grid[y][x] is not None: - self.states.add((x, y)) - - def T(self, state, action): - if action is None: - return [(0.0, state)] - else: - return [(0.8, self.go(state, action)), - (0.1, self.go(state, turn_right(action))), - (0.1, self.go(state, turn_left(action)))] - - def go(self, state, direction): - "Return the state that results from going in this direction." - state1 = vector_add(state, direction) - return if_(state1 in self.states, state1, state) - - def to_grid(self, mapping): - """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid.""" - return list(reversed([[mapping.get((x,y), None) - for x in range(self.cols)] - for y in range(self.rows)])) - - def to_arrows(self, policy): - chars = {(1, 0):'>', (0, 1):'^', (-1, 0):'<', (0, -1):'v', None: '.'} - return self.to_grid(dict([(s, chars[a]) for (s, a) in policy.items()])) - -#______________________________________________________________________________ - -Fig[17,1] = GridMDP([[-0.04, -0.04, -0.04, +1], - [-0.04, None, -0.04, -1], - [-0.04, -0.04, -0.04, -0.04]], - terminals=[(3, 2), (3, 1)]) - -#______________________________________________________________________________ - -def value_iteration(mdp, epsilon=0.001): - "Solving an MDP by value iteration. [Fig. 17.4]" - U1 = dict([(s, 0) for s in mdp.states]) - R, T, gamma = mdp.R, mdp.T, mdp.gamma - while True: - U = U1.copy() - delta = 0 - for s in mdp.states: - U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)]) - for a in mdp.actions(s)]) - delta = max(delta, abs(U1[s] - U[s])) - if (((gamma < 1) and (delta < epsilon * (1 - gamma) / gamma)) or - ((gamma == 1) and (delta < epsilon))): # allows for gamma to be 1 - return U - -def best_policy(mdp, U): - """Given an MDP and a utility function U, determine the best policy, - as a mapping from state to action. (Equation 17.4)""" - pi = {} - for s in mdp.states: - pi[s] = argmax(mdp.actions(s), lambda a:expected_utility(a, s, U, mdp)) - return pi - -def expected_utility(a, s, U, mdp): - "The expected utility of doing a in state s, according to the MDP and U." 
- return sum([p * U[s1] for (p, s1) in mdp.T(s, a)]) - -#______________________________________________________________________________ - -def policy_iteration(mdp): - "Solve an MDP by policy iteration [Fig. 17.7]" - U = dict([(s, 0) for s in mdp.states]) - pi = dict([(s, random.choice(mdp.actions(s))) for s in mdp.states]) - while True: - U = policy_evaluation(pi, U, mdp) - unchanged = True - for s in mdp.states: - a = argmax(mdp.actions(s), lambda a: expected_utility(a,s,U,mdp)) - if a != pi[s]: - pi[s] = a - unchanged = False - if unchanged: - return pi - -def policy_evaluation(pi, U, mdp, k=20): - """Return an updated utility mapping U from each state in the MDP to its - utility, using an approximation (modified policy iteration).""" - R, T, gamma = mdp.R, mdp.T, mdp.gamma - for i in range(k): - for s in mdp.states: - U[s] = R(s) + gamma * sum([p * U[s1] for (p, s1) in T(s, pi[s])]) - return U - -__doc__ += """ ->>> pi = best_policy(Fig[17,1], value_iteration(Fig[17,1], .01)) - ->>> Fig[17,1].to_arrows(pi) -[['>', '>', '>', '.'], ['^', None, '^', '.'], ['^', '>', '^', '<']] - ->>> print_table(Fig[17,1].to_arrows(pi)) -> > > . -^ None ^ . -^ > ^ < - ->>> print_table(Fig[17,1].to_arrows(policy_iteration(Fig[17,1]))) -> > > . -^ None ^ . -^ > ^ < -""" - -__doc__ += random_tests(""" ->>> pi -{(3, 2): None, (3, 1): None, (3, 0): (-1, 0), (2, 1): (0, 1), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (0, 1), (1, 2): (1, 0), (2, 0): (0, 1), (0, 1): (0, 1), (2, 2): (1, 0)} - ->>> value_iteration(Fig[17,1], .01) -{(3, 2): 1.0, (3, 1): -1.0, (3, 0): 0.12958868267972745, (0, 1): 0.39810203830605462, (0, 2): 0.50928545646220924, (1, 0): 0.25348746162470537, (0, 0): 0.29543540628363629, (1, 2): 0.64958064617168676, (2, 0): 0.34461306281476806, (2, 1): 0.48643676237737926, (2, 2): 0.79536093684710951} - ->>> policy_iteration(Fig[17,1]) -{(3, 2): None, (3, 1): None, (3, 0): (0, -1), (2, 1): (-1, 0), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (1, 0), (1, 2): (1, 0), (2, 0): (1, 0), (0, 1): (1, 0), (2, 2): (1, 0)} - -""") - - +"""Markov Decision Processes (Chapter 17) + +First we define an MDP, and the special case of a GridMDP, in which +states are laid out in a 2-dimensional grid. We also represent a policy +as a dictionary of {state:action} pairs, and a Utility function as a +dictionary of {state:number} pairs. We then define the value_iteration +and policy_iteration algorithms.""" + +from utils import * + +class MDP: + """A Markov Decision Process, defined by an initial state, transition model, + and reward function. We also keep track of a gamma value, for use by + algorithms. The transition model is represented somewhat differently from + the text. Instead of P(s' | s, a) being a probability number for each + state/state/action triplet, we instead have T(s, a) return a list of (p, s') + pairs. We also keep track of the possible states, terminal states, and + actions for each state. [page 646]""" + + def __init__(self, init, actlist, terminals, gamma=.9): + update(self, init=init, actlist=actlist, terminals=terminals, + gamma=gamma, states=set(), reward={}) + + def R(self, state): + "Return a numeric reward for this state." + return self.reward[state] + + def T(self, state, action): + """Transition model. From a state and an action, return a list + of (probability, result-state) pairs.""" + abstract + + def actions(self, state): + """Set of actions that can be performed in this state. By default, a + fixed list of actions, except for terminal states. 
Override this + method if you need to specialize by state.""" + if state in self.terminals: + return [None] + else: + return self.actlist + +class GridMDP(MDP): + """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is + specify the grid as a list of lists of rewards; use None for an obstacle + (unreachable state). Also, you should specify the terminal states. + An action is an (x, y) unit vector; e.g. (1, 0) means move east.""" + def __init__(self, grid, terminals, init=(0, 0), gamma=.9): + grid.reverse() ## because we want row 0 on bottom, not on top + MDP.__init__(self, init, actlist=orientations, + terminals=terminals, gamma=gamma) + update(self, grid=grid, rows=len(grid), cols=len(grid[0])) + for x in range(self.cols): + for y in range(self.rows): + self.reward[x, y] = grid[y][x] + if grid[y][x] is not None: + self.states.add((x, y)) + + def T(self, state, action): + if action is None: + return [(0.0, state)] + else: + return [(0.8, self.go(state, action)), + (0.1, self.go(state, turn_right(action))), + (0.1, self.go(state, turn_left(action)))] + + def go(self, state, direction): + "Return the state that results from going in this direction." + state1 = vector_add(state, direction) + return if_(state1 in self.states, state1, state) + + def to_grid(self, mapping): + """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid.""" + return list(reversed([[mapping.get((x,y), None) + for x in range(self.cols)] + for y in range(self.rows)])) + + def to_arrows(self, policy): + chars = {(1, 0):'>', (0, 1):'^', (-1, 0):'<', (0, -1):'v', None: '.'} + return self.to_grid(dict([(s, chars[a]) for (s, a) in policy.items()])) + +#______________________________________________________________________________ + +Fig[17,1] = GridMDP([[-0.04, -0.04, -0.04, +1], + [-0.04, None, -0.04, -1], + [-0.04, -0.04, -0.04, -0.04]], + terminals=[(3, 2), (3, 1)]) + +#______________________________________________________________________________ + +def value_iteration(mdp, epsilon=0.001): + "Solving an MDP by value iteration. [Fig. 17.4]" + U1 = dict([(s, 0) for s in mdp.states]) + R, T, gamma = mdp.R, mdp.T, mdp.gamma + while True: + U = U1.copy() + delta = 0 + for s in mdp.states: + U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)]) + for a in mdp.actions(s)]) + delta = max(delta, abs(U1[s] - U[s])) + if (((gamma < 1) and (delta < epsilon * (1 - gamma) / gamma)) or + ((gamma == 1) and (delta < epsilon))): # allows for gamma to be 1 + return U + +def best_policy(mdp, U): + """Given an MDP and a utility function U, determine the best policy, + as a mapping from state to action. (Equation 17.4)""" + pi = {} + for s in mdp.states: + pi[s] = argmax(mdp.actions(s), lambda a:expected_utility(a, s, U, mdp)) + return pi + +def expected_utility(a, s, U, mdp): + "The expected utility of doing a in state s, according to the MDP and U." + return sum([p * U[s1] for (p, s1) in mdp.T(s, a)]) + +#______________________________________________________________________________ + +def policy_iteration(mdp): + "Solve an MDP by policy iteration [Fig. 
17.7]" + U = dict([(s, 0) for s in mdp.states]) + pi = dict([(s, random.choice(mdp.actions(s))) for s in mdp.states]) + while True: + U = policy_evaluation(pi, U, mdp) + unchanged = True + for s in mdp.states: + a = argmax(mdp.actions(s), lambda a: expected_utility(a,s,U,mdp)) + if a != pi[s]: + pi[s] = a + unchanged = False + if unchanged: + return pi + +def policy_evaluation(pi, U, mdp, k=20): + """Return an updated utility mapping U from each state in the MDP to its + utility, using an approximation (modified policy iteration).""" + R, T, gamma = mdp.R, mdp.T, mdp.gamma + for i in range(k): + for s in mdp.states: + U[s] = R(s) + gamma * sum([p * U[s1] for (p, s1) in T(s, pi[s])]) + return U + +__doc__ += """ +>>> pi = best_policy(Fig[17,1], value_iteration(Fig[17,1], .01)) + +>>> Fig[17,1].to_arrows(pi) +[['>', '>', '>', '.'], ['^', None, '^', '.'], ['^', '>', '^', '<']] + +>>> print_table(Fig[17,1].to_arrows(pi)) +> > > . +^ None ^ . +^ > ^ < + +>>> print_table(Fig[17,1].to_arrows(policy_iteration(Fig[17,1]))) +> > > . +^ None ^ . +^ > ^ < +""" + +__doc__ += random_tests(""" +>>> pi +{(3, 2): None, (3, 1): None, (3, 0): (-1, 0), (2, 1): (0, 1), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (0, 1), (1, 2): (1, 0), (2, 0): (0, 1), (0, 1): (0, 1), (2, 2): (1, 0)} + +>>> value_iteration(Fig[17,1], .01) +{(3, 2): 1.0, (3, 1): -1.0, (3, 0): 0.12958868267972745, (0, 1): 0.39810203830605462, (0, 2): 0.50928545646220924, (1, 0): 0.25348746162470537, (0, 0): 0.29543540628363629, (1, 2): 0.64958064617168676, (2, 0): 0.34461306281476806, (2, 1): 0.48643676237737926, (2, 2): 0.79536093684710951} + +>>> policy_iteration(Fig[17,1]) +{(3, 2): None, (3, 1): None, (3, 0): (0, -1), (2, 1): (-1, 0), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (1, 0), (1, 2): (1, 0), (2, 0): (1, 0), (0, 1): (1, 0), (2, 2): (1, 0)} + +""") + + diff --git a/rl.py b/rl.py index 9147ff4..82520df 100644 --- a/rl.py +++ b/rl.py @@ -1,187 +1,192 @@ -"""Reinforcement Learning (Chapter 21) -""" - -from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig -from utils import update -from random import random -from time import time -import agents - -class PassiveADPAgent(agents.Agent): - """Passive (non-learning) agent that uses adaptive dynamic programming - on a given MDP and policy. [Fig. 21.2]""" - class LearntMDP: - """a model of the original mdp that the PassiveADP is trying to learn""" - def __init__(self, states, gamma, terminals): - update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) - - def R(self, s): - """Return a numeric reward for the state s""" - if s in self.reward: - return self.reward[s] - else: - return 0. # we don't know the value of the reward. - - def T(self, s, a): - """Returns a list of tuples with probabilities for states""" - try: - return [(p,s) for (s,p) in self.P[s][a].items()] - except KeyError: - return [] - - def T_set(self, (s,a,t), p): - " Adds a value to the transistion model " - if (s in self.P) and (a in self.P[s]): - self.P[s][a][t] = p - elif (s in self.P): - self.P[s][a] = {t:p} - else: - self.P[s] = {a:{t:p}} - - def __init__(self, mdp, pi): - update(self, - pi = pi, - mdp = self.LearntMDP(mdp.states,mdp.gamma,mdp.terminals), - U = {}, - Ns_sa = {s:{a:{t:0 for (p,t) in mdp.T(s,a)} - for a in mdp.actlist} - for s in mdp.states}, - Nsa = {s:{a:0. 
for a in mdp.actlist} - for s in mdp.states}, - s = None, - a = None) - - def program(self, percept): - s1,r1 = percept - mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa - if s1 not in mdp.reward: # mdp.R also tracks the visited states - U[s1] = r1 - mdp.reward[s1] = r1 - if s is not None: - Nsa[s][a] += 1 - Ns_sa[s][a][s1] += 1 - for t in Ns_sa[s][a]: - if Ns_sa[s][a][t] > 0: - self.mdp.T_set((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) - U = policy_evaluation(self.pi, U, mdp) - if s1 in mdp.terminals: - self.s, self.a = None, None - return False - else: - self.s, self.a = s1, self.pi[s1] - return self.a - -def simulate(mdp,(s,a)): - r = random() # 0 <= r <= 1 - p,s1 = zip(*(mdp.T(s,a))) - for i in range(len(p)): - if sum(p[:i+1]) >= r: - return s1[i] - -def execute_trial(agent,mdp): - current_state = mdp.init - while True: - current_reward = mdp.R(current_state) - next_action = agent.program((current_state, current_reward)) - if next_action == False: - break - current_state = simulate(mdp,(current_state, next_action)) - -def demoPassiveADPAgent(): - print 'DEMO PassiveADPAgent' - print '--------------------' - # Setup values - policy = {(0, 1): (0, 1), - (1, 2): (1, 0), - (3, 2): None, - (0, 0): (0, 1), - (3, 0): (-1, 0), - (3, 1): None, - (2, 1): (0, 1), - (2, 0): (0, 1), - (2, 2): (1, 0), - (1, 0): (1, 0), - (0, 2): (1, 0)} - - # Create agent - time_start = time() - trials = 100 - agent = PassiveADPAgent(Fig[17,1], policy) - for i in range (0,trials): - execute_trial(agent,Fig[17,1]) - time_end = time() - - seconds_elapsed = time_end - time_start - minutes_elapsed = seconds_elapsed / 60.0 - final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) - for result in final_results: - print result - - print '\nCorrect Utilities (estimated by value iteration, for comparison):' - print value_iteration(Fig[17,1]) - -class PassiveTDAgent(agents.Agent): - """Passive (non-learning) agent that uses temporal differences to learn - utility estimates. [Fig. 21.4]""" - def __init__(self,mdp,pi,alpha=None): - update(self, - pi = pi, - U = {s:0. 
for s in mdp.states}, - Ns = {s:0 for s in mdp.states}, - s = None, - a = None, - r = None, - gamma = mdp.gamma, - terminals = mdp.terminals) - if alpha is None: - alpha = lambda n: 60./(59+n) # page 837 - def program(self,percept): - s1,r1 = percept - pi,U,Ns,s,a,r = self.pi,self.U,self.Ns,self.s,self.a,self.r - alpha,gamma = self.alpha,self.gamma - if s1 not in U: U[s1] = r1 - if s is not None: - Ns[s] += 1 - U[s] = U[s] + alpha(Ns[s])*(r+gamma*U[s1]-U[s]) - if s in self.terminals: self.s,self.a,self.r = None,None,None - else: self.s,self.a,self.r = s1, pi[s1],r1 - return self.a - -def demoPassiveTDAgent(): - print 'DEMO PassiveTDAgent' - print '--------------------' - # Setup values - policy = {(0, 1): (0, 1), - (1, 2): (1, 0), - (3, 2): None, - (0, 0): (0, 1), - (3, 0): (-1, 0), - (3, 1): None, - (2, 1): (0, 1), - (2, 0): (0, 1), - (2, 2): (1, 0), - (1, 0): (1, 0), - (0, 2): (1, 0)} - - # Create agent - time_start = time() - trials = 100 - agent = PassiveADPAgent(Fig[17,1], policy) - for i in range (0,trials): - execute_trial(agent,Fig[17,1]) - time_end = time() - - seconds_elapsed = time_end - time_start - minutes_elapsed = seconds_elapsed / 60.0 - final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) - for result in final_results: - print result - - print '\nCorrect Utilities (estimated by value iteration, for comparison):' - print value_iteration(Fig[17,1]) - -if __name__ == '__main__': - #demoPassiveADPAgent() - demoPassiveTDAgent() +"""Reinforcement Learning (Chapter 21) +""" + +from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig +from utils import update +from random import random +from time import time +import agents + +class PassiveADPAgent(agents.Agent): + """Passive (non-learning) agent that uses adaptive dynamic programming + on a given MDP and policy. [Fig. 21.2]""" + class LearntMDP: + """a model of the original mdp that the PassiveADP is trying to learn""" + def __init__(self, states, gamma, terminals): + update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) + + def R(self, s): + """Return a numeric reward for the state s""" + if s in self.reward: + return self.reward[s] + else: + return 0. # we don't know the value of the reward. + + def T(self, s, a): + """Returns a list of tuples with probabilities for states""" + try: + return [(p,s) for (s,p) in self.P[s][a].items()] + except KeyError: + return [] + + def T_set(self, (s,a,t), p): + " Adds a value to the transistion model " + if (s in self.P) and (a in self.P[s]): + self.P[s][a][t] = p + elif (s in self.P): + self.P[s][a] = {t:p} + else: + self.P[s] = {a:{t:p}} + + def __init__(self, mdp, pi): + update(self, + pi = pi, + mdp = self.LearntMDP(mdp.states,mdp.gamma,mdp.terminals), + U = {}, + Ns_sa = {s:{a:{t:0 for (p,t) in mdp.T(s,a)} + for a in mdp.actlist} + for s in mdp.states}, + Nsa = {s:{a:0. 
for a in mdp.actlist} + for s in mdp.states}, + s = None, + a = None) + + def program(self, percept): + s1,r1 = percept + mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa + if s1 not in mdp.reward: # mdp.R also tracks the visited states + U[s1] = r1 + mdp.reward[s1] = r1 + if s is not None: + Nsa[s][a] += 1 + Ns_sa[s][a][s1] += 1 + for t in Ns_sa[s][a]: + if Ns_sa[s][a][t] > 0: + self.mdp.T_set((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) + U = policy_evaluation(self.pi, U, mdp) + if s1 in mdp.terminals: + self.s, self.a = None, None + return False + else: + self.s, self.a = s1, self.pi[s1] + return self.a + +def simulate(mdp,(s,a)): + r = random() # 0 <= r <= 1 + p,s1 = zip(*(mdp.T(s,a))) + for i in range(len(p)): + if sum(p[:i+1]) >= r: + return s1[i] + +def execute_trial(agent,mdp): + current_state = mdp.init + while True: + current_reward = mdp.R(current_state) + next_action = agent.program((current_state, current_reward)) + if next_action == False: + break + current_state = simulate(mdp,(current_state, next_action)) + +def demoPassiveADPAgent(): + print 'DEMO PassiveADPAgent' + print '--------------------' + # Setup values + policy = {(0, 1): (0, 1), + (1, 2): (1, 0), + (3, 2): None, + (0, 0): (0, 1), + (3, 0): (-1, 0), + (3, 1): None, + (2, 1): (0, 1), + (2, 0): (0, 1), + (2, 2): (1, 0), + (1, 0): (1, 0), + (0, 2): (1, 0)} + + # Create agent + time_start = time() + trials = 100 + agent = PassiveADPAgent(Fig[17,1], policy) + for i in range (0,trials): + execute_trial(agent,Fig[17,1]) + time_end = time() + + seconds_elapsed = time_end - time_start + minutes_elapsed = seconds_elapsed / 60.0 + final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ + ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) + for result in final_results: + print result + + print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print value_iteration(Fig[17,1]) + +class PassiveTDAgent(agents.Agent): + """Passive (non-learning) agent that uses temporal differences to learn + utility estimates. [Fig. 21.4]""" + def __init__(self,mdp,pi,alpha=None): + update(self, + pi = pi, + U = {s:0. 
for s in mdp.states}, + Ns = {s:0 for s in mdp.states}, + s = None, + a = None, + r = None, + gamma = mdp.gamma, + terminals = mdp.terminals) + if alpha is None: + alpha = lambda n: 60./(59+n) # page 837 + def program(self,percept): + s1,r1 = percept + pi,U,Ns,s,a,r = self.pi,self.U,self.Ns,self.s,self.a,self.r + alpha,gamma = self.alpha,self.gamma + if s1 not in U: U[s1] = r1 + if s is not None: + Ns[s] += 1 + U[s] = U[s] + alpha(Ns[s])*(r+gamma*U[s1]-U[s]) + if s in self.terminals: self.s,self.a,self.r = None,None,None + else: self.s,self.a,self.r = s1, pi[s1],r1 + return self.a + +def demoPassiveTDAgent(): + print 'DEMO PassiveTDAgent' + print '--------------------' + # Setup values + policy = {(0, 1): (0, 1), + (1, 2): (1, 0), + (3, 2): None, + (0, 0): (0, 1), + (3, 0): (-1, 0), + (3, 1): None, + (2, 1): (0, 1), + (2, 0): (0, 1), + (2, 2): (1, 0), + (1, 0): (1, 0), + (0, 2): (1, 0)} + + # Create agent + time_start = time() + trials = 100 + agent = PassiveADPAgent(Fig[17,1], policy) + for i in range (0,trials): + execute_trial(agent,Fig[17,1]) + time_end = time() + + seconds_elapsed = time_end - time_start + minutes_elapsed = seconds_elapsed / 60.0 + final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ + ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) + for result in final_results: + print result + + print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print value_iteration(Fig[17,1]) + +class QLearningAgent(agents.Agent): + """Active TD agent that uses temporal differences to learn an + action-utility representation. [Fig. 21.8]""" + NotImplemented + +if __name__ == '__main__': + #demoPassiveADPAgent() + demoPassiveTDAgent() From 8e90dc4081087e8cf766219e0041575b4cf12cfb Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sat, 8 Dec 2012 12:58:44 +0800 Subject: [PATCH 12/19] added code for QLearningAgent --- rl.py | 159 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 122 insertions(+), 37 deletions(-) diff --git a/rl.py b/rl.py index 82520df..6aaa065 100644 --- a/rl.py +++ b/rl.py @@ -1,10 +1,12 @@ """Reinforcement Learning (Chapter 21) """ -from mdp import GridMDP, MDP, value_iteration, policy_evaluation, Fig -from utils import update +from mdp import value_iteration, policy_evaluation, policy_iteration, \ + GridMDP, MDP, Fig +from utils import update, argmax from random import random from time import time +from itertools import product import agents class PassiveADPAgent(agents.Agent): @@ -71,6 +73,84 @@ def program(self, percept): self.s, self.a = s1, self.pi[s1] return self.a +class PassiveTDAgent(agents.Agent): + """Passive (non-learning) agent that uses temporal differences to learn + utility estimates. [Fig. 21.4]""" + def __init__(self,mdp,pi,alpha=None): + update(self, + pi = pi, + U = {s:0. 
for s in mdp.states}, + Ns = {s:0 for s in mdp.states}, + s = None, + a = None, + r = None, + gamma = mdp.gamma, + terminals = mdp.terminals) + if alpha is None: + alpha = lambda n: 60./(59+n) # page 837 + else: + self.alpha = alpha + def program(self,percept): + s1,r1 = percept + pi,U,Ns,s,a,r = self.pi,self.U,self.Ns,self.s,self.a,self.r + alpha,gamma = self.alpha,self.gamma + if s1 not in U: U[s1] = r1 + if s is not None: + Ns[s] += 1 + U[s] += alpha(Ns[s])*(r+gamma*U[s1]-U[s]) + if s in self.terminals: self.s,self.a,self.r = None,None,None + else: self.s,self.a,self.r = s1, pi[s1],r1 + return self.a + +class QLearningAgent(agents.Agent): + """Active TD agent that uses temporal differences to learn an + action-utility representation. [Fig. 21.8]""" + def __init__(self,mdp,alpha=None,Ne=5,Rplus=2): + update(self, + Q = {s:{a:0. for a in mdp.actlist} + for s in mdp.states if s not in mdp.terminals}, + Nsa = {s:{a:0. for a in mdp.actlist} + for s in mdp.states}, + s = None, + a = None, + r = None, + Ne = Ne, + Rplus = Rplus, + gamma = mdp.gamma, + terminals = mdp.terminals) + + for s in mdp.terminals: self.Q[s] = {None:0.} + + if alpha is None: + self.alpha = lambda n: 60./(59+n) # page 837 + else: + self.alpha = alpha + + def f(self,u,n): # the exploration function in AIMA(3rd ed), pg 842 + if n < self.Ne: + return self.Rplus + else: + return u + + def program(self,percept): + s1,r1 = percept + Q, Nsa, s, a, r = self.Q, self.Nsa, self.s, self.a, self.r + alpha, gamma, f = self.alpha, self.gamma, self.f + if s1 in self.terminals: + Q[s1][None] = r1 + if s is not None: + Nsa[s][a] += 1 + Q[s][a] += alpha(Nsa[s][a])*(r+gamma*max(Q[s1].values())-Q[s][a]) + if s1 in self.terminals: + self.s,self.a,self.r = None, None, None + return False + else: + self.s,self.r = s1,r1 + self.a = argmax(Q[s1].keys(),lambda a1: f(Q[s1][a1],Nsa[s1][a1])) + return self.a + +# --- + def simulate(mdp,(s,a)): r = random() # 0 <= r <= 1 p,s1 = zip(*(mdp.T(s,a))) @@ -88,9 +168,9 @@ def execute_trial(agent,mdp): current_state = simulate(mdp,(current_state, next_action)) def demoPassiveADPAgent(): + print '--------------------' print 'DEMO PassiveADPAgent' print '--------------------' - # Setup values policy = {(0, 1): (0, 1), (1, 2): (1, 0), (3, 2): None, @@ -103,7 +183,6 @@ def demoPassiveADPAgent(): (1, 0): (1, 0), (0, 2): (1, 0)} - # Create agent time_start = time() trials = 100 agent = PassiveADPAgent(Fig[17,1], policy) @@ -121,34 +200,8 @@ def demoPassiveADPAgent(): print '\nCorrect Utilities (estimated by value iteration, for comparison):' print value_iteration(Fig[17,1]) -class PassiveTDAgent(agents.Agent): - """Passive (non-learning) agent that uses temporal differences to learn - utility estimates. [Fig. 21.4]""" - def __init__(self,mdp,pi,alpha=None): - update(self, - pi = pi, - U = {s:0. 
for s in mdp.states}, - Ns = {s:0 for s in mdp.states}, - s = None, - a = None, - r = None, - gamma = mdp.gamma, - terminals = mdp.terminals) - if alpha is None: - alpha = lambda n: 60./(59+n) # page 837 - def program(self,percept): - s1,r1 = percept - pi,U,Ns,s,a,r = self.pi,self.U,self.Ns,self.s,self.a,self.r - alpha,gamma = self.alpha,self.gamma - if s1 not in U: U[s1] = r1 - if s is not None: - Ns[s] += 1 - U[s] = U[s] + alpha(Ns[s])*(r+gamma*U[s1]-U[s]) - if s in self.terminals: self.s,self.a,self.r = None,None,None - else: self.s,self.a,self.r = s1, pi[s1],r1 - return self.a - def demoPassiveTDAgent(): + print '--------------------' print 'DEMO PassiveTDAgent' print '--------------------' # Setup values @@ -164,7 +217,6 @@ def demoPassiveTDAgent(): (1, 0): (1, 0), (0, 2): (1, 0)} - # Create agent time_start = time() trials = 100 agent = PassiveADPAgent(Fig[17,1], policy) @@ -182,11 +234,44 @@ def demoPassiveTDAgent(): print '\nCorrect Utilities (estimated by value iteration, for comparison):' print value_iteration(Fig[17,1]) -class QLearningAgent(agents.Agent): - """Active TD agent that uses temporal differences to learn an - action-utility representation. [Fig. 21.8]""" - NotImplemented +def demoQLearningAgent(): + print '--------------------' + print 'DEMO PassiveTDAgent' + print '--------------------' + # Setup values + policy = {(0, 1): (0, 1), + (1, 2): (1, 0), + (3, 2): None, + (0, 0): (0, 1), + (3, 0): (-1, 0), + (3, 1): None, + (2, 1): (0, 1), + (2, 0): (0, 1), + (2, 2): (1, 0), + (1, 0): (1, 0), + (0, 2): (1, 0)} + + time_start = time() + trials = 1000 + agent = QLearningAgent(Fig[17,1]) + for i in range (0,trials): + execute_trial(agent,Fig[17,1]) + time_end = time() + + seconds_elapsed = time_end - time_start + minutes_elapsed = seconds_elapsed / 60.0 + final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ + ('Executed %i trials' % (trials)), + ('Utilities: %s' % {s:max(agent.Q[s].values()) for s in agent.Q})) + for result in final_results: + print result + + print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print value_iteration(Fig[17,1]) + +# --- if __name__ == '__main__': - #demoPassiveADPAgent() + demoPassiveADPAgent() demoPassiveTDAgent() + demoQLearningAgent() From 79811e02dfef49226bc487b43abf9325e8dc54f7 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sat, 8 Dec 2012 15:11:07 +0800 Subject: [PATCH 13/19] fixed a bug with PassiveTDAgent where alpha (rather than self.alpha) was called --- rl.py | 72 +++++++++++++++++++++++++++++++---------------------------- 1 file changed, 38 insertions(+), 34 deletions(-) diff --git a/rl.py b/rl.py index 6aaa065..4313948 100644 --- a/rl.py +++ b/rl.py @@ -6,7 +6,6 @@ from utils import update, argmax from random import random from time import time -from itertools import product import agents class PassiveADPAgent(agents.Agent): @@ -15,13 +14,19 @@ class PassiveADPAgent(agents.Agent): class LearntMDP: """a model of the original mdp that the PassiveADP is trying to learn""" def __init__(self, states, gamma, terminals): - update(self, P={}, reward={}, states=states, gamma=gamma, terminals=terminals) + update(self, + P={}, + reward={}, + states=states, + gamma=gamma, + terminals=terminals) def R(self, s): """Return a numeric reward for the state s""" if s in self.reward: return self.reward[s] else: + # not specified in AIMA(3rd ed) return 0. # we don't know the value of the reward. 
def T(self, s, a): @@ -57,21 +62,19 @@ def program(self, percept): s1,r1 = percept mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa if s1 not in mdp.reward: # mdp.R also tracks the visited states - U[s1] = r1 - mdp.reward[s1] = r1 + U[s1] = mdp.reward[s1] = r1 if s is not None: Nsa[s][a] += 1 Ns_sa[s][a][s1] += 1 for t in Ns_sa[s][a]: if Ns_sa[s][a][t] > 0: - self.mdp.T_set((s,a,t), Ns_sa[s][a][t] / Nsa[s][a]) + self.mdp.T_set((s,a,t), Ns_sa[s][a][t]/Nsa[s][a]) U = policy_evaluation(self.pi, U, mdp) if s1 in mdp.terminals: - self.s, self.a = None, None - return False + self.s = self.a = None else: self.s, self.a = s1, self.pi[s1] - return self.a + return self.a class PassiveTDAgent(agents.Agent): """Passive (non-learning) agent that uses temporal differences to learn @@ -86,20 +89,24 @@ def __init__(self,mdp,pi,alpha=None): r = None, gamma = mdp.gamma, terminals = mdp.terminals) + if alpha is None: - alpha = lambda n: 60./(59+n) # page 837 + self.alpha = lambda n: 60./(59+n) # page 837 else: self.alpha = alpha - def program(self,percept): - s1,r1 = percept - pi,U,Ns,s,a,r = self.pi,self.U,self.Ns,self.s,self.a,self.r - alpha,gamma = self.alpha,self.gamma + + def program(self, percept): + s1, r1 = percept + pi, U, Ns, s, a, r = self.pi, self.U, self.Ns, self.s, self.a, self.r + alpha, gamma = self.alpha, self.gamma if s1 not in U: U[s1] = r1 if s is not None: Ns[s] += 1 - U[s] += alpha(Ns[s])*(r+gamma*U[s1]-U[s]) - if s in self.terminals: self.s,self.a,self.r = None,None,None - else: self.s,self.a,self.r = s1, pi[s1],r1 + U[s] += alpha(Ns[s]) * (r + gamma * U[s1] - U[s]) + if s in self.terminals: + self.s = self.a = self.r = None + else: + self.s, self.a, self.r = s1, pi[s1], r1 return self.a class QLearningAgent(agents.Agent): @@ -107,8 +114,8 @@ class QLearningAgent(agents.Agent): action-utility representation. [Fig. 21.8]""" def __init__(self,mdp,alpha=None,Ne=5,Rplus=2): update(self, - Q = {s:{a:0. for a in mdp.actlist} - for s in mdp.states if s not in mdp.terminals}, + Q = {s:{a:0. for a in mdp.actlist} if s not in mdp.terminals + else {None:0.} for s in mdp.states}, Nsa = {s:{a:0. 
for a in mdp.actlist} for s in mdp.states}, s = None, @@ -118,15 +125,13 @@ def __init__(self,mdp,alpha=None,Ne=5,Rplus=2): Rplus = Rplus, gamma = mdp.gamma, terminals = mdp.terminals) - - for s in mdp.terminals: self.Q[s] = {None:0.} if alpha is None: self.alpha = lambda n: 60./(59+n) # page 837 else: self.alpha = alpha - def f(self,u,n): # the exploration function in AIMA(3rd ed), pg 842 + def f(self,u,n): # exploration function in AIMA(3rd ed), pg 842 if n < self.Ne: return self.Rplus else: @@ -142,16 +147,15 @@ def program(self,percept): Nsa[s][a] += 1 Q[s][a] += alpha(Nsa[s][a])*(r+gamma*max(Q[s1].values())-Q[s][a]) if s1 in self.terminals: - self.s,self.a,self.r = None, None, None - return False + self.s = self.a = self.r = None else: - self.s,self.r = s1,r1 - self.a = argmax(Q[s1].keys(),lambda a1: f(Q[s1][a1],Nsa[s1][a1])) - return self.a + self.s, self.r = s1, r1 + self.a = argmax(Q[s1].keys(), lambda a1: f(Q[s1][a1],Nsa[s1][a1])) + return self.a # --- -def simulate(mdp,(s,a)): +def simulate_move(mdp,(s,a)): r = random() # 0 <= r <= 1 p,s1 = zip(*(mdp.T(s,a))) for i in range(len(p)): @@ -163,9 +167,9 @@ def execute_trial(agent,mdp): while True: current_reward = mdp.R(current_state) next_action = agent.program((current_state, current_reward)) - if next_action == False: + if next_action == None: break - current_state = simulate(mdp,(current_state, next_action)) + current_state = simulate_move(mdp,(current_state, next_action)) def demoPassiveADPAgent(): print '--------------------' @@ -219,7 +223,7 @@ def demoPassiveTDAgent(): time_start = time() trials = 100 - agent = PassiveADPAgent(Fig[17,1], policy) + agent = PassiveTDAgent(Fig[17,1], policy) for i in range (0,trials): execute_trial(agent,Fig[17,1]) time_end = time() @@ -236,7 +240,7 @@ def demoPassiveTDAgent(): def demoQLearningAgent(): print '--------------------' - print 'DEMO PassiveTDAgent' + print 'DEMO QLearningAgent' print '--------------------' # Setup values policy = {(0, 1): (0, 1), @@ -252,7 +256,7 @@ def demoQLearningAgent(): (0, 2): (1, 0)} time_start = time() - trials = 1000 + trials = 100 agent = QLearningAgent(Fig[17,1]) for i in range (0,trials): execute_trial(agent,Fig[17,1]) @@ -272,6 +276,6 @@ def demoQLearningAgent(): # --- if __name__ == '__main__': - demoPassiveADPAgent() + #demoPassiveADPAgent() demoPassiveTDAgent() - demoQLearningAgent() + #demoQLearningAgent() From 8890b6950d679f56fa064ea3e14a143aa171dde8 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sat, 8 Dec 2012 15:17:12 +0800 Subject: [PATCH 14/19] fixed a bug in PassiveTDAgent added an attribute: reached_states to track new states. 
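
The utility table U is initialised in __init__ with every state mapped to 0.,
so the old first-visit test "if s1 not in U" could never fire and the reward
observed at a newly reached state was never written into U; the end-of-trial
reset also checked the previous state s rather than the state s1 just reached.
The new reached_states set makes the first-visit bookkeeping explicit
(excerpt of the updated program(), using only names already defined in rl.py):

    if s1 not in self.reached_states:   # first time this state is observed
        self.reached_states.add(s1)
        U[s1] = r1                      # seed the estimate with the observed reward

and the reset now tests "if s1 in self.terminals", so a trial is properly
terminated once a terminal state is reached.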
--- rl.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/rl.py b/rl.py index 4313948..0d6bc92 100644 --- a/rl.py +++ b/rl.py @@ -88,7 +88,8 @@ def __init__(self,mdp,pi,alpha=None): a = None, r = None, gamma = mdp.gamma, - terminals = mdp.terminals) + terminals = mdp.terminals, + reached_states = set()) if alpha is None: self.alpha = lambda n: 60./(59+n) # page 837 @@ -99,11 +100,13 @@ def program(self, percept): s1, r1 = percept pi, U, Ns, s, a, r = self.pi, self.U, self.Ns, self.s, self.a, self.r alpha, gamma = self.alpha, self.gamma - if s1 not in U: U[s1] = r1 + if s1 not in self.reached_states: + self.reached_states.add(s1) + U[s1] = r1 if s is not None: Ns[s] += 1 U[s] += alpha(Ns[s]) * (r + gamma * U[s1] - U[s]) - if s in self.terminals: + if s1 in self.terminals: self.s = self.a = self.r = None else: self.s, self.a, self.r = s1, pi[s1], r1 From 9956cbaacc85c38955c12c783f14629d1d126cc5 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sat, 8 Dec 2012 23:14:53 +0800 Subject: [PATCH 15/19] refactored the code in the demo_functions --- rl.py | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/rl.py b/rl.py index 0d6bc92..438e4a1 100644 --- a/rl.py +++ b/rl.py @@ -197,14 +197,10 @@ def demoPassiveADPAgent(): execute_trial(agent,Fig[17,1]) time_end = time() - seconds_elapsed = time_end - time_start - minutes_elapsed = seconds_elapsed / 60.0 - final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) - for result in final_results: - print result - - print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print 'Executed %i trials' % trials + print 'Took %d seconds' % (time_end - time_start) + print 'Utilities: %s' % agent.U + print '\nCorrect Utilities (estimated by value iteration):' print value_iteration(Fig[17,1]) def demoPassiveTDAgent(): @@ -231,14 +227,10 @@ def demoPassiveTDAgent(): execute_trial(agent,Fig[17,1]) time_end = time() - seconds_elapsed = time_end - time_start - minutes_elapsed = seconds_elapsed / 60.0 - final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), ('Utilities: %s' % (agent.U))) - for result in final_results: - print result - - print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print 'Executed %i trials' % trials + print 'Took %d seconds' % (time_end - time_start) + print 'Utilities: %s' % agent.U + print '\nCorrect Utilities (estimated by value iteration):' print value_iteration(Fig[17,1]) def demoQLearningAgent(): @@ -264,21 +256,16 @@ def demoQLearningAgent(): for i in range (0,trials): execute_trial(agent,Fig[17,1]) time_end = time() - - seconds_elapsed = time_end - time_start - minutes_elapsed = seconds_elapsed / 60.0 - final_results = (('Took %d seconds, which is %d minutes' % (seconds_elapsed, minutes_elapsed)),\ - ('Executed %i trials' % (trials)), - ('Utilities: %s' % {s:max(agent.Q[s].values()) for s in agent.Q})) - for result in final_results: - print result - print '\nCorrect Utilities (estimated by value iteration, for comparison):' + print 'Executed %i trials' % trials + print 'Took %d seconds' % (time_end - time_start) + print 'Utilities: %s' % {s:max(agent.Q[s].values()) for s in agent.Q} + print '\nCorrect Utilities (estimated by value iteration):' print value_iteration(Fig[17,1]) # --- if __name__ == '__main__': - 
#demoPassiveADPAgent() + demoPassiveADPAgent() demoPassiveTDAgent() - #demoQLearningAgent() + demoQLearningAgent() From bb91e37da73ab23ef6fb05ebd6a46cfa18b2eb95 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sun, 12 May 2013 11:52:06 +0800 Subject: [PATCH 16/19] used get() for dict look-ups --- rl.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/rl.py b/rl.py index 438e4a1..8ddcb7f 100644 --- a/rl.py +++ b/rl.py @@ -23,11 +23,7 @@ def __init__(self, states, gamma, terminals): def R(self, s): """Return a numeric reward for the state s""" - if s in self.reward: - return self.reward[s] - else: - # not specified in AIMA(3rd ed) - return 0. # we don't know the value of the reward. + return self.reward.get(s, 0.) def T(self, s, a): """Returns a list of tuples with probabilities for states""" @@ -91,10 +87,10 @@ def __init__(self,mdp,pi,alpha=None): terminals = mdp.terminals, reached_states = set()) - if alpha is None: - self.alpha = lambda n: 60./(59+n) # page 837 - else: + if alpha: self.alpha = alpha + else: + self.alpha = lambda n: 60./(59+n) # page 837 def program(self, percept): s1, r1 = percept From 0713821c11fe26fb91d42a50f864711a4022bd22 Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sun, 12 May 2013 12:22:22 +0800 Subject: [PATCH 17/19] normalize the files specified on commit and always convert them to LF on checkout --- .gitattributes | 2 +- images/dirt05-icon.jpg | Bin 1772 -> 1771 bytes 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 412eeda..930429e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,5 +1,5 @@ # Auto detect text files and perform LF normalization -* text=auto +* text eol=lf # Custom for Visual Studio *.cs diff=csharp diff --git a/images/dirt05-icon.jpg b/images/dirt05-icon.jpg index 38d02e97f84973f98b2b98ad5ca3a7a752986174..262c2b7cee5c34b08d466c3dc64120e4ded44892 100644 GIT binary patch delta 10 RcmaFE` Date: Sun, 12 May 2013 12:31:33 +0800 Subject: [PATCH 18/19] revert back to auto setting --- .gitattributes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 930429e..412eeda 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,5 +1,5 @@ # Auto detect text files and perform LF normalization -* text eol=lf +* text=auto # Custom for Visual Studio *.cs diff=csharp From 2ffc7ab2df89a21c205d5809bd46699d4fe613ea Mon Sep 17 00:00:00 2001 From: Ng Yee Sian Date: Sun, 12 May 2013 12:39:37 +0800 Subject: [PATCH 19/19] normalize and convert .py files --- .gitattributes | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/.gitattributes b/.gitattributes index 412eeda..f7c2ff5 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,6 +1,19 @@ # Auto detect text files and perform LF normalization * text=auto +# Explicitly declare text files we want to always be normalized and converted +# to native line endings on checkout. +*.c text +*.h text +*.py text + +# Declare files that will always have CRLF line endings on checkout. +*.sln text eol=crlf + +# Denote all files that are truly binary and should not be modified. +*.png binary +*.jpg binary + # Custom for Visual Studio *.cs diff=csharp *.sln merge=union
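
For quick verification of the learners added in this series, a minimal driver
along the following lines can be used. This is a sketch only: it assumes the
patched rl.py and mdp.py are importable and relies solely on names defined in
them (Fig[17,1], QLearningAgent, execute_trial, value_iteration).

    from mdp import Fig, value_iteration
    from rl import QLearningAgent, execute_trial

    agent = QLearningAgent(Fig[17,1], Ne=5, Rplus=2)   # defaults from rl.py
    for _ in range(100):                               # same trial count as the demos
        execute_trial(agent, Fig[17,1])

    # Derive state utilities from the learnt Q-values and compare them with
    # the value-iteration solution, as the demo functions do.
    print {s: max(agent.Q[s].values()) for s in agent.Q}
    print value_iteration(Fig[17,1], .01)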