From 5857f42a6a266aea440b3955630c623d7e72ce4e Mon Sep 17 00:00:00 2001
From: Antonis Maronikolakis
Date: Thu, 30 Mar 2017 15:09:03 +0300
Subject: [PATCH 1/2] Update test_mdp.py

---
 tests/test_mdp.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tests/test_mdp.py b/tests/test_mdp.py
index f5cb40510..e992d263c 100644
--- a/tests/test_mdp.py
+++ b/tests/test_mdp.py
@@ -25,3 +25,17 @@ def test_best_policy():
     assert sequential_decision_environment.to_arrows(pi) == [['>', '>', '>', '.'],
                                                              ['^', None, '^', '.'],
                                                              ['^', '>', '^', '<']]
+
+
+def test_transition_model():
+    transition_model = {
+        "A": {"a1": (0.3, "B"), "a2": (0.7, "C")},
+        "B": {"a1": (0.5, "B"), "a2": (0.5, "A")},
+        "C": {"a1": (0.9, "A"), "a2": (0.1, "B")},
+    }
+
+    mdp = MDP(init="A", actlist={"a1", "a2"}, terminals={"C"}, states={"A", "B", "C"}, transitions=transition_model)
+
+    assert mdp.T("A", "a1") == (0.3, "B")
+    assert mdp.T("B", "a2") == (0.5, "A")
+    assert mdp.T("C", "a1") == (0.9, "A")

From 66ed25f2e17ab200fde83d000afb2b096bc20f7d Mon Sep 17 00:00:00 2001
From: Antonis Maronikolakis
Date: Thu, 30 Mar 2017 15:10:54 +0300
Subject: [PATCH 2/2] Update mdp.py

---
 mdp.py | 33 +++++++++++++++++++--------------
 1 file changed, 19 insertions(+), 14 deletions(-)

diff --git a/mdp.py b/mdp.py
index 902582b19..24bbb2a8d 100644
--- a/mdp.py
+++ b/mdp.py
@@ -1,9 +1,9 @@
 """Markov Decision Processes (Chapter 17)
 
 First we define an MDP, and the special case of a GridMDP, in which
-states are laid out in a 2-dimensional grid.  We also represent a policy
+states are laid out in a 2-dimensional grid. We also represent a policy
 as a dictionary of {state:action} pairs, and a Utility function as a
-dictionary of {state:number} pairs.  We then define the value_iteration
+dictionary of {state:number} pairs. We then define the value_iteration
 and policy_iteration algorithms."""
 
 from utils import argmax, vector_add, print_table  # noqa
@@ -17,32 +17,37 @@ class MDP:
     """A Markov Decision Process, defined by an initial state, transition model,
     and reward function. We also keep track of a gamma value, for use by
     algorithms. The transition model is represented somewhat differently from
-    the text.  Instead of P(s' | s, a) being a probability number for each
+    the text. Instead of P(s' | s, a) being a probability number for each
     state/state/action triplet, we instead have T(s, a) return a
-    list of (p, s') pairs.  We also keep track of the possible states,
+    list of (p, s') pairs. We also keep track of the possible states,
     terminal states, and actions for each state. [page 646]"""
 
-    def __init__(self, init, actlist, terminals, gamma=.9):
+    def __init__(self, init, actlist, terminals, transitions={}, states=set(), gamma=.9):
+        if not (0 <= gamma < 1):
+            raise ValueError("An MDP must have 0 <= gamma < 1")
+
         self.init = init
         self.actlist = actlist
         self.terminals = terminals
-        if not (0 <= gamma < 1):
-            raise ValueError("An MDP must have 0 <= gamma < 1")
+        self.transitions = transitions
+        self.states = states
         self.gamma = gamma
-        self.states = set()
         self.reward = {}
 
     def R(self, state):
-        "Return a numeric reward for this state."
+        """Return a numeric reward for this state."""
        return self.reward[state]
 
     def T(self, state, action):
-        """Transition model.  From a state and an action, return a list
+        """Transition model. From a state and an action, return a list
         of (probability, result-state) pairs."""
-        raise NotImplementedError
+        if(self.transitions == {}):
+            raise ValueError("Transition model is missing")
+        else:
+            return self.transitions[state][action]
 
     def actions(self, state):
-        """Set of actions that can be performed in this state.  By default, a
+        """Set of actions that can be performed in this state. By default, a
         fixed list of actions, except for terminal states. Override this
         method if you need to specialize by state."""
         if state in self.terminals:
@@ -53,9 +58,9 @@ def actions(self, state):
 
 class GridMDP(MDP):
 
-    """A two-dimensional grid MDP, as in [Figure 17.1].  All you have to do is
+    """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is
     specify the grid as a list of lists of rewards; use None for an obstacle
-    (unreachable state).  Also, you should specify the terminal states.
+    (unreachable state). Also, you should specify the terminal states.
     An action is an (x, y) unit vector; e.g. (1, 0) means move east."""
 
     def __init__(self, grid, terminals, init=(0, 0), gamma=.9):
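The patch makes MDP.T() usable directly when a transition table is passed to the constructor, and makes it fail loudly when none is supplied. A minimal sketch of the new behaviour, assuming the patched mdp.py from this PR is importable; the state and action names below are illustrative, not part of the library:

    from mdp import MDP

    # Transition table in the format the patched class expects:
    # T(s, a) returns whatever is stored under transitions[s][a].
    transition_model = {
        "A": {"a1": (0.3, "B"), "a2": (0.7, "C")},
        "B": {"a1": (0.5, "B"), "a2": (0.5, "A")},
    }

    mdp = MDP(init="A", actlist={"a1", "a2"}, terminals={"B"},
              states={"A", "B"}, transitions=transition_model)
    print(mdp.T("A", "a1"))   # (0.3, 'B'), read straight from the table

    # Without a transition model, T() now raises ValueError
    # ("Transition model is missing") instead of NotImplementedError.
    empty = MDP(init="A", actlist={"a1"}, terminals=set(), states={"A"})
    try:
        empty.T("A", "a1")
    except ValueError as err:
        print(err)            # Transition model is missing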