
Commit 3f0e3f4

Track utilities for value/policy iteration

1 parent b5c20af · commit 3f0e3f4

1 file changed: 11 additions, 7 deletions

mdp.py

@@ -6,6 +6,7 @@
 dictionary of {state:number} pairs. We then define the value_iteration
 and policy_iteration algorithms."""
 
+from time import clock as now
 from utils import argmax, vector_add, orientations, turn_right, turn_left
 
 import random
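
The new `now` alias is imported here but never called anywhere in this diff. Below is a minimal sketch of how it might be used to time a solver run: a hypothetical harness, not part of the commit. Note that time.clock was removed in Python 3.8, so the sketch substitutes time.perf_counter, and it assumes the module's 4x3 grid world sequential_decision_environment.

    # Hypothetical timing harness: not part of this commit.  time.clock was
    # removed in Python 3.8, so perf_counter stands in for the clock alias.
    from time import perf_counter as now
    from mdp import value_iteration, sequential_decision_environment

    world = sequential_decision_environment  # assumed: the module's 4x3 grid world
    start = now()
    U = value_iteration(world)
    print('solved in %.3fs over %d sweeps' % (now() - start, world.iters_))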
@@ -118,22 +119,23 @@ def to_arrows(self, policy):
 # ______________________________________________________________________________
 
 
-def value_iteration(mdp, epsilon=0.001):
+def value_iteration(mdp, epsilon=0.001, max_iters=5000):
     """Solving an MDP by value iteration. [Figure 17.4]"""
     U1 = {s: 0 for s in mdp.states}
     R, T, gamma = mdp.R, mdp.T, mdp.gamma
     mdp.iters_ = 0
-    while True:
+    mdp.U_ = U = None
+    while mdp.iters_ < max_iters:
         mdp.iters_ = mdp.iters_ + 1
-        U = U1.copy()
+        mdp.U_ = U = U1.copy()
         delta = 0
         for s in mdp.states:
             U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)])
                                         for a in mdp.actions(s)])
             delta = max(delta, abs(U1[s] - U[s]))
         if delta < epsilon * (1 - gamma) / gamma:
             return U
-
+    return U
 
 def best_policy(mdp, U):
     """Given an MDP and a utility function U, determine the best policy,
@@ -151,14 +153,15 @@ def expected_utility(a, s, U, mdp):
 # ______________________________________________________________________________
 
 
-def policy_iteration(mdp):
+def policy_iteration(mdp, max_iters=2000):
     """Solve an MDP by policy iteration [Figure 17.7]"""
     U = {s: 0 for s in mdp.states}
     pi = {s: random.choice(mdp.actions(s)) for s in mdp.states}
     mdp.iters_ = 0
-    while True:
+    mdp.U_ = U
+    while mdp.iters_ < max_iters:
         mdp.iters_ = mdp.iters_ + 1
-        U = policy_evaluation(pi, U, mdp)
+        mdp.U_ = U = policy_evaluation(pi, U, mdp)
         unchanged = True
         for s in mdp.states:
             a = argmax(mdp.actions(s), key=lambda a: expected_utility(a, s, U, mdp))
@@ -167,6 +170,7 @@ def policy_iteration(mdp):
                 unchanged = False
         if unchanged:
             return pi
+    return pi
 
 
 def policy_evaluation(pi, U, mdp, k=20):
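
Since policy_iteration can now return before the policy stabilizes, the iteration counter is how a caller tells convergence from hitting the cap. A sketch under the same assumptions; iters_ == max_iters is ambiguous (the final sweep may or may not have stabilized), so the conservative check is strict inequality:

    from mdp import policy_iteration, sequential_decision_environment

    world = sequential_decision_environment  # assumed 4x3 grid world
    pi = policy_iteration(world, max_iters=2000)
    if world.iters_ < 2000:
        print('policy stabilized after %d improvement steps' % world.iters_)
    else:
        # Hitting the cap exactly is ambiguous: the final sweep may or may not
        # have converged, so treat pi as an approximation.
        print('iteration cap reached')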
