From b5c20af80453698f7dfda189dc530efe8b47f363 Mon Sep 17 00:00:00 2001
From: Justin Russell
Date: Tue, 21 Nov 2017 22:45:46 -0500
Subject: [PATCH 1/3] Track num iterations for value/policy iteration

---
 mdp.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/mdp.py b/mdp.py
index 6637108e5..75dc91e11 100644
--- a/mdp.py
+++ b/mdp.py
@@ -35,6 +35,7 @@ def __init__(self, init, actlist, terminals, transitions={}, states=None, gamma=
         self.transitions = transitions
         self.gamma = gamma
         self.reward = {}
+        self.iters_ = 0
 
     def R(self, state):
         """Return a numeric reward for this state."""
@@ -121,7 +122,9 @@ def value_iteration(mdp, epsilon=0.001):
     """Solving an MDP by value iteration. [Figure 17.4]"""
     U1 = {s: 0 for s in mdp.states}
     R, T, gamma = mdp.R, mdp.T, mdp.gamma
+    mdp.iters_ = 0
     while True:
+        mdp.iters_ = mdp.iters_ + 1
         U = U1.copy()
         delta = 0
         for s in mdp.states:
@@ -152,7 +155,9 @@ def policy_iteration(mdp):
     """Solve an MDP by policy iteration [Figure 17.7]"""
     U = {s: 0 for s in mdp.states}
     pi = {s: random.choice(mdp.actions(s)) for s in mdp.states}
+    mdp.iters_ = 0
     while True:
+        mdp.iters_ = mdp.iters_ + 1
         U = policy_evaluation(pi, U, mdp)
         unchanged = True
         for s in mdp.states:

From 3f0e3f45b619f21c6db00d395072d21fdf28c926 Mon Sep 17 00:00:00 2001
From: Justin Russell
Date: Wed, 22 Nov 2017 22:10:55 -0500
Subject: [PATCH 2/3] Track utilities for value/policy iteration

---
 mdp.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/mdp.py b/mdp.py
index 75dc91e11..0fb7794ed 100644
--- a/mdp.py
+++ b/mdp.py
@@ -6,6 +6,7 @@
 dictionary of {state:number} pairs. We then define the value_iteration
 and policy_iteration algorithms."""
 
+from time import clock as now
 from utils import argmax, vector_add, orientations, turn_right, turn_left
 
 import random
@@ -118,14 +119,15 @@ def to_arrows(self, policy):
 # ______________________________________________________________________________
 
 
-def value_iteration(mdp, epsilon=0.001):
+def value_iteration(mdp, epsilon=0.001, max_iters=5000):
     """Solving an MDP by value iteration. [Figure 17.4]"""
     U1 = {s: 0 for s in mdp.states}
     R, T, gamma = mdp.R, mdp.T, mdp.gamma
     mdp.iters_ = 0
-    while True:
+    mdp.U_ = U = None
+    while mdp.iters_ < max_iters:
         mdp.iters_ = mdp.iters_ + 1
-        U = U1.copy()
+        mdp.U_ = U = U1.copy()
         delta = 0
         for s in mdp.states:
             U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)])
@@ -133,7 +135,7 @@ def value_iteration(mdp, epsilon=0.001):
             delta = max(delta, abs(U1[s] - U[s]))
         if delta < epsilon * (1 - gamma) / gamma:
             return U
-
+    return U
 
 def best_policy(mdp, U):
     """Given an MDP and a utility function U, determine the best policy,
@@ -151,14 +153,15 @@ def expected_utility(a, s, U, mdp):
 # ______________________________________________________________________________
 
 
-def policy_iteration(mdp):
+def policy_iteration(mdp, max_iters=2000):
     """Solve an MDP by policy iteration [Figure 17.7]"""
     U = {s: 0 for s in mdp.states}
     pi = {s: random.choice(mdp.actions(s)) for s in mdp.states}
     mdp.iters_ = 0
-    while True:
+    mdp.U_ = U
+    while mdp.iters_ < max_iters:
         mdp.iters_ = mdp.iters_ + 1
-        U = policy_evaluation(pi, U, mdp)
+        mdp.U_ = U = policy_evaluation(pi, U, mdp)
         unchanged = True
         for s in mdp.states:
             a = argmax(mdp.actions(s), key=lambda a: expected_utility(a, s, U, mdp))
@@ -167,6 +170,7 @@ def policy_iteration(mdp):
             unchanged = False
     if unchanged:
         return pi
+    return pi
 
 
 def policy_evaluation(pi, U, mdp, k=20):

From 577b033356b2454edbeb933e2924771d166647aa Mon Sep 17 00:00:00 2001
From: Justin Russell
Date: Thu, 23 Nov 2017 10:30:03 -0500
Subject: [PATCH 3/3] Add method to get utilities for qlearningagent

---
 rl.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/rl.py b/rl.py
index 20a392592..59b2a3f0e 100644
--- a/rl.py
+++ b/rl.py
@@ -173,6 +173,13 @@ def update_state(self, percept):
         assumes the percept to be of type (state, reward)'''
         return percept
 
+    def get_utilities(self):
+        U = defaultdict(lambda: -1000.)
+        for state_action, value in self.Q.items():
+            state, action = state_action
+            if U[state] < value:
+                U[state] = value
+        return U
 
 def run_single_trial(agent_program, mdp):
     ''' Execute trial for given agent_program
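
Usage sketch (not part of the patches above, only an illustration of my reading of them): with these changes applied, value_iteration and policy_iteration record their loop count in mdp.iters_ and the most recent utility estimate in mdp.U_, and a trained QLearningAgent can report a per-state utility (the maximum Q(s, a) over actions) via get_utilities(). The sketch assumes aima-python's sequential_decision_environment (the 4x3 GridMDP) and the QLearningAgent constructor arguments Ne, Rplus and alpha; treat those names and values as assumptions rather than guarantees.

from mdp import sequential_decision_environment, value_iteration, policy_iteration
from rl import QLearningAgent, run_single_trial

grid = sequential_decision_environment  # assumed: the 4x3 GridMDP defined in mdp.py

# Value iteration still returns the utilities; the patched version also
# stores the sweep count and the last utility snapshot on the MDP object.
U = value_iteration(grid, epsilon=0.001, max_iters=5000)
print(grid.iters_)  # number of sweeps performed
print(grid.U_)      # utility snapshot taken at the start of the final sweep

# Policy iteration records the same attributes.
pi = policy_iteration(grid)
print(grid.iters_)  # number of policy-improvement rounds

# Q-learning: after some trials, get_utilities() reports max_a Q(s, a) per state.
q_agent = QLearningAgent(grid, Ne=5, Rplus=2,
                         alpha=lambda n: 60. / (59 + n))  # assumed constructor arguments
for _ in range(200):
    run_single_trial(q_agent, grid)
print(dict(q_agent.get_utilities()))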