dictionary of {state:number} pairs. We then define the value_iteration
and policy_iteration algorithms."""

+from time import perf_counter as now  # time.clock was removed in Python 3.8
from utils import argmax, vector_add, orientations, turn_right, turn_left

import random
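
As the truncated module docstring notes, utilities in this module are plain {state: number} dicts. A minimal illustration (the states and values below are hypothetical, not taken from the patch):

    # A utility function U maps each MDP state to its estimated utility.
    # For a grid MDP the states are (x, y) tuples; values here are made up.
    U = {(0, 0): 0.705, (1, 0): 0.655, (2, 0): 0.611}
    best = max(U, key=U.get)  # state with the highest estimated utility
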
@@ -118,22 +119,23 @@ def to_arrows(self, policy):
# ______________________________________________________________________________


-def value_iteration(mdp, epsilon=0.001):
+def value_iteration(mdp, epsilon=0.001, max_iters=5000):
    """Solving an MDP by value iteration. [Figure 17.4]"""
    U1 = {s: 0 for s in mdp.states}
    R, T, gamma = mdp.R, mdp.T, mdp.gamma
    mdp.iters_ = 0
-    while True:
+    mdp.U_ = U = None
+    while mdp.iters_ < max_iters:
        mdp.iters_ = mdp.iters_ + 1
-        U = U1.copy()
+        mdp.U_ = U = U1.copy()
        delta = 0
        for s in mdp.states:
            U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)])
                                        for a in mdp.actions(s)])
            delta = max(delta, abs(U1[s] - U[s]))
        if delta < epsilon * (1 - gamma) / gamma:
            return U
-
+    return U


def best_policy(mdp, U):
    """Given an MDP and a utility function U, determine the best policy,
@@ -151,14 +153,15 @@ def expected_utility(a, s, U, mdp):
# ______________________________________________________________________________


-def policy_iteration(mdp):
+def policy_iteration(mdp, max_iters=2000):
    """Solve an MDP by policy iteration [Figure 17.7]"""
    U = {s: 0 for s in mdp.states}
    pi = {s: random.choice(mdp.actions(s)) for s in mdp.states}
    mdp.iters_ = 0
-    while True:
+    mdp.U_ = U
+    while mdp.iters_ < max_iters:
        mdp.iters_ = mdp.iters_ + 1
-        U = policy_evaluation(pi, U, mdp)
+        mdp.U_ = U = policy_evaluation(pi, U, mdp)
        unchanged = True
        for s in mdp.states:
            a = argmax(mdp.actions(s), key=lambda a: expected_utility(a, s, U, mdp))
@@ -167,6 +170,7 @@ def policy_iteration(mdp):
                unchanged = False
        if unchanged:
            return pi
+    return pi


def policy_evaluation(pi, U, mdp, k=20):
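
With this patch both solvers bound their main loops and mirror the latest utilities onto the MDP object, so a run that exhausts max_iters still returns something inspectable instead of looping forever. A sketch under the same assumption that the aima-python 4x3 grid environment is available:

    from mdp import policy_iteration, sequential_decision_environment

    env = sequential_decision_environment
    pi = policy_iteration(env, max_iters=2000)  # returns the last policy even if not converged
    print(env.iters_)  # iterations actually performed
    print(env.U_)      # utilities from the most recent policy_evaluation call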