diff --git a/mdp.py b/mdp.py
index 6637108e5..0fb7794ed 100644
--- a/mdp.py
+++ b/mdp.py
@@ -6,6 +6,7 @@
 dictionary of {state:number} pairs. We then define the value_iteration
 and policy_iteration algorithms."""
 
+from time import perf_counter as now
 from utils import argmax, vector_add, orientations, turn_right, turn_left
 
 import random
@@ -35,6 +36,7 @@ def __init__(self, init, actlist, terminals, transitions={}, states=None, gamma=
         self.transitions = transitions
         self.gamma = gamma
         self.reward = {}
+        self.iters_ = 0
 
     def R(self, state):
         """Return a numeric reward for this state."""
@@ -117,12 +119,15 @@ def to_arrows(self, policy):
 # ______________________________________________________________________________
 
 
-def value_iteration(mdp, epsilon=0.001):
+def value_iteration(mdp, epsilon=0.001, max_iters=5000):
     """Solving an MDP by value iteration. [Figure 17.4]"""
     U1 = {s: 0 for s in mdp.states}
     R, T, gamma = mdp.R, mdp.T, mdp.gamma
-    while True:
-        U = U1.copy()
+    mdp.iters_ = 0
+    mdp.U_ = U = None
+    while mdp.iters_ < max_iters:
+        mdp.iters_ = mdp.iters_ + 1
+        mdp.U_ = U = U1.copy()
         delta = 0
         for s in mdp.states:
             U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)])
@@ -130,7 +135,7 @@ def value_iteration(mdp, epsilon=0.001):
             delta = max(delta, abs(U1[s] - U[s]))
         if delta < epsilon * (1 - gamma) / gamma:
             return U
-
+    return U
 
 def best_policy(mdp, U):
     """Given an MDP and a utility function U, determine the best policy,
@@ -148,12 +153,15 @@ def expected_utility(a, s, U, mdp):
 # ______________________________________________________________________________
 
 
-def policy_iteration(mdp):
+def policy_iteration(mdp, max_iters=2000):
     """Solve an MDP by policy iteration [Figure 17.7]"""
     U = {s: 0 for s in mdp.states}
     pi = {s: random.choice(mdp.actions(s)) for s in mdp.states}
-    while True:
-        U = policy_evaluation(pi, U, mdp)
+    mdp.iters_ = 0
+    mdp.U_ = U
+    while mdp.iters_ < max_iters:
+        mdp.iters_ = mdp.iters_ + 1
+        mdp.U_ = U = policy_evaluation(pi, U, mdp)
         unchanged = True
         for s in mdp.states:
             a = argmax(mdp.actions(s), key=lambda a: expected_utility(a, s, U, mdp))
@@ -162,6 +170,7 @@ def policy_iteration(mdp):
                 unchanged = False
         if unchanged:
             return pi
+    return pi
 
 
 def policy_evaluation(pi, U, mdp, k=20):
diff --git a/rl.py b/rl.py
index 20a392592..59b2a3f0e 100644
--- a/rl.py
+++ b/rl.py
@@ -173,6 +173,13 @@ def update_state(self, percept):
         assumes the percept to be of type (state, reward)'''
         return percept
 
+    def get_utilities(self):
+        U = defaultdict(lambda: -1000.)
+        for state_action, value in self.Q.items():
+            state, action = state_action
+            if U[state] < value:
+                U[state] = value
+        return U
 
 def run_single_trial(agent_program, mdp):
     ''' Execute trial for given agent_program
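
A minimal usage sketch (not part of the patch) of how the new max_iters argument and the iters_/U_ bookkeeping attributes might be exercised once the diff is applied; it assumes the GridMDP instance sequential_decision_environment that aima-python's mdp.py defines, so treat the example as illustrative only.

    # Illustrative only: exercises the iteration-cap hooks added by the patch above.
    # Assumes aima-python's mdp.py, including sequential_decision_environment (a GridMDP).
    from mdp import sequential_decision_environment, value_iteration, policy_iteration, best_policy

    grid = sequential_decision_environment

    # Value iteration now stops after max_iters sweeps even if it has not converged;
    # the last utility estimate and the sweep count are exposed on the MDP object.
    U = value_iteration(grid, epsilon=0.001, max_iters=100)
    print(grid.iters_, "value-iteration sweeps")
    print(best_policy(grid, U))

    # Policy iteration records the same bookkeeping via grid.iters_ and grid.U_.
    pi = policy_iteration(grid, max_iters=50)
    print(grid.iters_, "policy-iteration steps")
    print(pi)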