diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..f7c2ff5
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+# Auto detect text files and perform LF normalization
+* text=auto
+
+# Explicitly declare text files we want to always be normalized and converted
+# to native line endings on checkout.
+*.c text
+*.h text
+*.py text
+
+# Declare files that will always have CRLF line endings on checkout.
+*.sln text eol=crlf
+
+# Denote all files that are truly binary and should not be modified.
+*.png binary
+*.jpg binary
+
+# Custom for Visual Studio
+*.cs diff=csharp
+*.sln merge=union
+*.csproj merge=union
+*.vbproj merge=union
+*.fsproj merge=union
+*.dbproj merge=union
+
+# Standard to msysgit
+*.doc diff=astextplain
+*.DOC diff=astextplain
+*.docx diff=astextplain
+*.DOCX diff=astextplain
+*.dot diff=astextplain
+*.DOT diff=astextplain
+*.pdf diff=astextplain
+*.PDF diff=astextplain
+*.rtf diff=astextplain
+*.RTF diff=astextplain
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..5ebd21a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,163 @@
+#################
+## Eclipse
+#################
+
+*.pydevproject
+.project
+.metadata
+bin/
+tmp/
+*.tmp
+*.bak
+*.swp
+*~.nib
+local.properties
+.classpath
+.settings/
+.loadpath
+
+# External tool builders
+.externalToolBuilders/
+
+# Locally stored "Eclipse launch configurations"
+*.launch
+
+# CDT-specific
+.cproject
+
+# PDT-specific
+.buildpath
+
+
+#################
+## Visual Studio
+#################
+
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+
+# User-specific files
+*.suo
+*.user
+*.sln.docstates
+
+# Build results
+[Dd]ebug/
+[Rr]elease/
+*_i.c
+*_p.c
+*.ilk
+*.meta
+*.obj
+*.pch
+*.pdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.vspscc
+.builds
+*.dotCover
+
+## TODO: If you have NuGet Package Restore enabled, uncomment this
+#packages/
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opensdf
+*.sdf
+
+# Visual Studio profiler
+*.psess
+*.vsp
+
+# ReSharper is a .NET coding add-in
+_ReSharper*
+
+# Installshield output folder
+[Ee]xpress
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish
+
+# Others
+[Bb]in
+[Oo]bj
+sql
+TestResults
+*.Cache
+ClientBin
+stylecop.*
+~$*
+*.dbmdl
+Generated_Code #added for RIA/Silverlight projects
+
+# Backup & report files from converting an old project file to a newer
+# Visual Studio version. Backup files are not needed, because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+
+
+
+############
+## Windows
+############
+
+# Windows image file caches
+Thumbs.db
+
+# Folder config file
+Desktop.ini
+
+
+#############
+## Python
+#############
+
+*.py[co]
+
+# Packages
+*.egg
+*.egg-info
+dist
+build
+eggs
+parts
+bin
+var
+sdist
+develop-eggs
+.installed.cfg
+
+# Installer logs
+pip-log.txt
+
+# Unit test / coverage reports
+.coverage
+.tox
+
+#Translations
+*.mo
+
+#Mr Developer
+.mr.developer.cfg
+
+# Mac crap
+.DS_Store
diff --git a/images/dirt05-icon.jpg b/images/dirt05-icon.jpg
index 38d02e9..262c2b7 100644
Binary files a/images/dirt05-icon.jpg and b/images/dirt05-icon.jpg differ
diff --git a/mdp.py b/mdp.py
index e5142c1..0048843 100644
--- a/mdp.py
+++ b/mdp.py
@@ -98,7 +98,8 @@ def value_iteration(mdp, epsilon=0.001):
             U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)])
                                         for a in mdp.actions(s)])
             delta = max(delta, abs(U1[s] - U[s]))
-        if delta < epsilon * (1 - gamma) / gamma:
+        if (((gamma < 1) and (delta < epsilon * (1 - gamma) / gamma)) or
+            ((gamma == 1) and (delta < epsilon))): # allows for gamma to be 1
             return U
 
 def best_policy(mdp, U):
diff --git a/rl.py b/rl.py
index fc0e2c9..8ddcb7f 100644
--- a/rl.py
+++ b/rl.py
@@ -1,15 +1,267 @@
 """Reinforcement Learning (Chapter 21)
 """
 
-from utils import *
+from mdp import value_iteration, policy_evaluation, policy_iteration, \
+    GridMDP, MDP, Fig
+from utils import update, argmax
+from random import random
+from time import time
 import agents
 
 class PassiveADPAgent(agents.Agent):
     """Passive (non-learning) agent that uses adaptive dynamic programming
     on a given MDP and policy. [Fig. 21.2]"""
-    NotImplemented
+    class LearntMDP:
+        """A model of the original MDP that the PassiveADPAgent is trying to learn."""
+        def __init__(self, states, gamma, terminals):
+            update(self,
+                   P={},
+                   reward={},
+                   states=states,
+                   gamma=gamma,
+                   terminals=terminals)
+
+        def R(self, s):
+            """Return a numeric reward for the state s"""
+            return self.reward.get(s, 0.)
+
+        def T(self, s, a):
+            """Return a list of (probability, result-state) pairs for doing a in s"""
+            try:
+                return [(p,s1) for (s1,p) in self.P[s][a].items()]
+            except KeyError:
+                return []
+
+        def T_set(self, (s,a,t), p):
+            "Record probability p for the transition (s, a) -> t in the model"
+            if (s in self.P) and (a in self.P[s]):
+                self.P[s][a][t] = p
+            elif (s in self.P):
+                self.P[s][a] = {t:p}
+            else:
+                self.P[s] = {a:{t:p}}
+
+    def __init__(self, mdp, pi):
+        update(self,
+               pi = pi,
+               mdp = self.LearntMDP(mdp.states,mdp.gamma,mdp.terminals),
+               U = {},
+               Ns_sa = {s:{a:{t:0 for (p,t) in mdp.T(s,a)}
+                           for a in mdp.actlist}
+                        for s in mdp.states},
+               Nsa = {s:{a:0. for a in mdp.actlist}
+                      for s in mdp.states},
+               s = None,
+               a = None)
+
+    def program(self, percept):
+        s1,r1 = percept
+        mdp,U,s,a,Nsa,Ns_sa = self.mdp,self.U,self.s,self.a,self.Nsa,self.Ns_sa
+        if s1 not in mdp.reward: # mdp.reward also tracks the visited states
+            U[s1] = mdp.reward[s1] = r1
+        if s is not None:
+            Nsa[s][a] += 1
+            Ns_sa[s][a][s1] += 1
+            for t in Ns_sa[s][a]:
+                if Ns_sa[s][a][t] > 0:
+                    self.mdp.T_set((s,a,t), Ns_sa[s][a][t]/Nsa[s][a])
+        U = policy_evaluation(self.pi, U, mdp)
+        if s1 in mdp.terminals:
+            self.s = self.a = None
+        else:
+            self.s, self.a = s1, self.pi[s1]
+        return self.a
 
 class PassiveTDAgent(agents.Agent):
     """Passive (non-learning) agent that uses temporal differences to learn
     utility estimates. [Fig. 21.4]"""
-    NotImplemented
+    def __init__(self,mdp,pi,alpha=None):
+        update(self,
+               pi = pi,
+               U = {s:0. for s in mdp.states},
+               Ns = {s:0 for s in mdp.states},
+               s = None,
+               a = None,
+               r = None,
+               gamma = mdp.gamma,
+               terminals = mdp.terminals,
+               reached_states = set())
+
+        if alpha:
+            self.alpha = alpha
+        else:
+            self.alpha = lambda n: 60./(59+n) # page 837
+
+    def program(self, percept):
+        s1, r1 = percept
+        pi, U, Ns, s, a, r = self.pi, self.U, self.Ns, self.s, self.a, self.r
+        alpha, gamma = self.alpha, self.gamma
+        if s1 not in self.reached_states:
+            self.reached_states.add(s1)
+            U[s1] = r1
+        if s is not None:
+            Ns[s] += 1
+            U[s] += alpha(Ns[s]) * (r + gamma * U[s1] - U[s])
+        if s1 in self.terminals:
+            self.s = self.a = self.r = None
+        else:
+            self.s, self.a, self.r = s1, pi[s1], r1
+        return self.a
+
+class QLearningAgent(agents.Agent):
+    """Active TD agent that uses temporal differences to learn an
+    action-utility representation. [Fig. 21.8]"""
+    def __init__(self,mdp,alpha=None,Ne=5,Rplus=2):
+        update(self,
+               Q = {s:{a:0. for a in mdp.actlist} if s not in mdp.terminals
+                    else {None:0.} for s in mdp.states},
+               Nsa = {s:{a:0. for a in mdp.actlist}
+                      for s in mdp.states},
+               s = None,
+               a = None,
+               r = None,
+               Ne = Ne,
+               Rplus = Rplus,
+               gamma = mdp.gamma,
+               terminals = mdp.terminals)
+
+        if alpha is None:
+            self.alpha = lambda n: 60./(59+n) # page 837
+        else:
+            self.alpha = alpha
+
+    def f(self,u,n): # exploration function in AIMA (3rd ed.), p. 842
+        if n < self.Ne:
+            return self.Rplus
+        else:
+            return u
+
+    def program(self,percept):
+        s1,r1 = percept
+        Q, Nsa, s, a, r = self.Q, self.Nsa, self.s, self.a, self.r
+        alpha, gamma, f = self.alpha, self.gamma, self.f
+        if s1 in self.terminals:
+            Q[s1][None] = r1
+        if s is not None:
+            Nsa[s][a] += 1
+            Q[s][a] += alpha(Nsa[s][a])*(r+gamma*max(Q[s1].values())-Q[s][a])
+        if s1 in self.terminals:
+            self.s = self.a = self.r = None
+        else:
+            self.s, self.r = s1, r1
+            self.a = argmax(Q[s1].keys(), lambda a1: f(Q[s1][a1],Nsa[s1][a1]))
+        return self.a
+
+# ---
+
+def simulate_move(mdp,(s,a)):
+    r = random() # 0 <= r < 1
+    p,s1 = zip(*(mdp.T(s,a)))
+    for i in range(len(p)):
+        if sum(p[:i+1]) >= r:
+            return s1[i]
+
+def execute_trial(agent,mdp):
+    current_state = mdp.init
+    while True:
+        current_reward = mdp.R(current_state)
+        next_action = agent.program((current_state, current_reward))
+        if next_action is None:
+            break
+        current_state = simulate_move(mdp,(current_state, next_action))
+
+def demoPassiveADPAgent():
+    print '--------------------'
+    print 'DEMO PassiveADPAgent'
+    print '--------------------'
+    policy = {(0, 1): (0, 1),
+              (1, 2): (1, 0),
+              (3, 2): None,
+              (0, 0): (0, 1),
+              (3, 0): (-1, 0),
+              (3, 1): None,
+              (2, 1): (0, 1),
+              (2, 0): (0, 1),
+              (2, 2): (1, 0),
+              (1, 0): (1, 0),
+              (0, 2): (1, 0)}
+
+    time_start = time()
+    trials = 100
+    agent = PassiveADPAgent(Fig[17,1], policy)
+    for i in range(trials):
+        execute_trial(agent,Fig[17,1])
+    time_end = time()
+
+    print 'Executed %i trials' % trials
+    print 'Took %d seconds' % (time_end - time_start)
+    print 'Utilities: %s' % agent.U
+    print '\nCorrect Utilities (estimated by value iteration):'
+    print value_iteration(Fig[17,1])
+
+def demoPassiveTDAgent():
+    print '--------------------'
+    print 'DEMO PassiveTDAgent'
+    print '--------------------'
+    # Setup values
+    policy = {(0, 1): (0, 1),
+              (1, 2): (1, 0),
+              (3, 2): None,
+              (0, 0): (0, 1),
+              (3, 0): (-1, 0),
+              (3, 1): None,
+              (2, 1): (0, 1),
+              (2, 0): (0, 1),
+              (2, 2): (1, 0),
+              (1, 0): (1, 0),
+              (0, 2): (1, 0)}
+
+    time_start = time()
+    trials = 100
+    agent = PassiveTDAgent(Fig[17,1], policy)
+    for i in range(trials):
+        execute_trial(agent,Fig[17,1])
+    time_end = time()
+
+    print 'Executed %i trials' % trials
+    print 'Took %d seconds' % (time_end - time_start)
+    print 'Utilities: %s' % agent.U
+    print '\nCorrect Utilities (estimated by value iteration):'
+    print value_iteration(Fig[17,1])
+
+def demoQLearningAgent():
+    print '--------------------'
+    print 'DEMO QLearningAgent'
+    print '--------------------'
+    # Setup values
+    policy = {(0, 1): (0, 1),
+              (1, 2): (1, 0),
+              (3, 2): None,
+              (0, 0): (0, 1),
+              (3, 0): (-1, 0),
+              (3, 1): None,
+              (2, 1): (0, 1),
+              (2, 0): (0, 1),
+              (2, 2): (1, 0),
+              (1, 0): (1, 0),
+              (0, 2): (1, 0)}
+
+    time_start = time()
+    trials = 100
+    agent = QLearningAgent(Fig[17,1])
+    for i in range(trials):
+        execute_trial(agent,Fig[17,1])
+    time_end = time()
+
+    print 'Executed %i trials' % trials
+    print 'Took %d seconds' % (time_end - time_start)
+    print 'Utilities: %s' % {s:max(agent.Q[s].values()) for s in agent.Q}
+    print '\nCorrect Utilities (estimated by value iteration):'
+    print value_iteration(Fig[17,1])
+
+# ---
+
+if __name__ == '__main__':
+    demoPassiveADPAgent()
+    demoPassiveTDAgent()
+    demoQLearningAgent()
diff --git a/utils.py b/utils.py
index c167589..8356337 100644
--- a/utils.py
+++ b/utils.py
@@ -734,17 +734,22 @@ class PriorityQueue(Queue):
     Also supports dict-like lookup."""
     def __init__(self, order=min, f=lambda x: x):
         update(self, A=[], order=order, f=f)
+        self.membership = {}
     def append(self, item):
         bisect.insort(self.A, (self.f(item), item))
+        hashval = hash(item)
+        self.membership[hashval] = self.membership.get(hashval, 0) + 1
    def __len__(self):
        return len(self.A)
    def pop(self):
        if self.order == min:
-            return self.A.pop(0)[1]
+            item = self.A.pop(0)[1]
        else:
-            return self.A.pop()[1]
+            item = self.A.pop()[1]
+        self._remove_(item)
+        return item
    def __contains__(self, item):
-        return some(lambda (_, x): x == item, self.A)
+        return hash(item) in self.membership
    def __getitem__(self, key):
        for _, item in self.A:
            if item == key:
@@ -752,8 +757,15 @@ def __getitem__(self, key):
    def __delitem__(self, key):
        for i, (value, item) in enumerate(self.A):
            if item == key:
-                self.A.pop(i)
+                item = self.A.pop(i)
+                self._remove_(item)
                 return
+    def _remove_(self, item):
+        hashval = hash(item)
+        self.membership[hashval] -= 1
+        if self.membership[hashval] == 0:
+            del self.membership[hashval]
+
 
 ## Fig: The idea is we can define things like Fig[3,10] later.
 ## Alas, it is Fig[3,10] not Fig[3.10], because that would be the same
@@ -855,7 +867,14 @@ def fixup(test):
 ...     q.extend(nums)
 ...     for num in nums: assert num in q
 ...     assert 42 not in q
-...     return [q.pop() for i in range(len(q))]
+...     result = []
+...     for i in range(len(q)):
+...         num = q.pop()
+...         assert num not in q # would fail if num appeared in q more than once
+...         result.append(num)
+...
+...     return result
+
 >>> qtest(Stack())
 [0, 3, 4, 99, -99, 6, 5, 7, 2, 8, 1]
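
Reviewer note on the mdp.py hunk (not part of the patch): the old termination test delta < epsilon * (1 - gamma) / gamma comes from the contraction bound for discounted value iteration and can never be satisfied when gamma == 1 (its right-hand side is 0), so undiscounted MDPs looped forever; the new branch simply stops once delta < epsilon. A minimal sketch of exercising the gamma == 1 path, assuming the GridMDP constructor from mdp.py; the undiscounted 4x3 grid below is illustrative, not something the patch adds:

    from mdp import GridMDP, value_iteration

    # Hypothetical undiscounted copy of the 4x3 grid of Fig[17,1] (gamma = 1).
    undiscounted = GridMDP([[-0.04, -0.04, -0.04, +1],
                            [-0.04, None,  -0.04, -1],
                            [-0.04, -0.04, -0.04, -0.04]],
                           terminals=[(3, 2), (3, 1)],
                           gamma=1.)
    # With the old test this call never returned; it now stops once delta < epsilon.
    U = value_iteration(undiscounted, epsilon=0.001)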
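
Reviewer note on the utils.py hunk (not part of the patch): PriorityQueue.__contains__ now answers membership from a dict of hash counts maintained by append, pop, and __delitem__, replacing the old linear scan of q.A with a constant-time lookup; the trade-off is that `in` now answers by hash, so two distinct items with equal hashes would be conflated. A small sketch of the expected behaviour, assuming the queued items are hashable:

    from utils import PriorityQueue

    q = PriorityQueue()                 # default order=min, f=identity
    q.append(3); q.append(3); q.append(7)
    assert 3 in q and 7 in q            # dict lookup, no scan of q.A
    assert q.pop() == 3                 # pop decrements the count for hash(3)
    assert 3 in q                       # the second 3 is still counted
    q.pop(); q.pop()
    assert 3 not in q and 7 not in q    # counts hit zero and the keys are removed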