
Commit f44631d

bakerwho authored and norvig committed
Fix MDP class and add POMDP subclass and notebook (#781)
* Fixed typos and added inline LaTeX * Fixed backslash for inline LaTeX * Fixed more backslashes * generalised MDP class and created POMDP notebook * Fixed consistency issues with base MDP class * Small fix on CustomMDP * Set default args to pass tests * Added TableDrivenAgentProgram tests (#777) * Add tests for TableDrivenAgentProgram * Add tests for TableDrivenAgentProgram * Check environment status at every step * Check environment status at every step of TableDrivenAgentProgram * Fixing tests * fixed test_rl * removed redundant code, fixed a comment
1 parent 3f88880 commit f44631d

File tree

7 files changed, +761 -1329 lines


mdp.ipynb

Lines changed: 333 additions & 1240 deletions
Large diffs are not rendered by default.

mdp.py

Lines changed: 88 additions & 12 deletions
@@ -21,20 +21,36 @@ class MDP:
     list of (p, s') pairs. We also keep track of the possible states,
     terminal states, and actions for each state. [page 646]"""

-    def __init__(self, init, actlist, terminals, transitions={}, states=None, gamma=.9):
+    def __init__(self, init, actlist, terminals, transitions={}, reward=None, states=None, gamma=.9):
         if not (0 < gamma <= 1):
             raise ValueError("An MDP must have 0 < gamma <= 1")

         if states:
             self.states = states
         else:
-            self.states = set()
+            ## collect states from transitions table
+            self.states = self.get_states_from_transitions(transitions)

         self.init = init
-        self.actlist = actlist
+
+        if isinstance(actlist, list):
+            ## if actlist is a list, all states have the same actions
+            self.actlist = actlist
+        elif isinstance(actlist, dict):
+            ## if actlist is a dict, different actions for each state
+            self.actlist = actlist
+
         self.terminals = terminals
         self.transitions = transitions
+        if self.transitions == {}:
+            print("Warning: Transition table is empty.")
         self.gamma = gamma
-        self.reward = {}
+        if reward:
+            self.reward = reward
+        else:
+            self.reward = {s: 0 for s in self.states}
+        #self.check_consistency()

     def R(self, state):
         """Return a numeric reward for this state."""
@@ -57,6 +73,34 @@ def actions(self, state):
         else:
             return self.actlist

+    def get_states_from_transitions(self, transitions):
+        if isinstance(transitions, dict):
+            s1 = set(transitions.keys())
+            s2 = set([tr[1] for actions in transitions.values()
+                      for effects in actions.values() for tr in effects])
+            return s1.union(s2)
+        else:
+            print('Could not retrieve states from transitions')
+            return None
+
+    def check_consistency(self):
+        # check that all states in transitions are valid
+        assert set(self.states) == self.get_states_from_transitions(self.transitions)
+        # check that init is a valid state
+        assert self.init in self.states
+        # check reward for each state
+        assert set(self.reward.keys()) == set(self.states)
+        # check that all terminals are valid states
+        assert all([t in self.states for t in self.terminals])
+        # check that probability distributions for all actions sum to 1
+        for s1, actions in self.transitions.items():
+            for a in actions.keys():
+                s = 0
+                for o in actions[a]:
+                    s += o[0]
+                assert abs(s - 1) < 0.001
+

 class GridMDP(MDP):
@@ -67,25 +111,41 @@ class GridMDP(MDP):

     def __init__(self, grid, terminals, init=(0, 0), gamma=.9):
         grid.reverse()  # because we want row 0 on bottom, not on top
-        MDP.__init__(self, init, actlist=orientations,
-                     terminals=terminals, gamma=gamma)
-        self.grid = grid
+        reward = {}
+        states = set()
         self.rows = len(grid)
         self.cols = len(grid[0])
+        self.grid = grid
         for x in range(self.cols):
             for y in range(self.rows):
-                self.reward[x, y] = grid[y][x]
                 if grid[y][x] is not None:
-                    self.states.add((x, y))
-
-    def T(self, state, action):
+                    states.add((x, y))
+                    reward[(x, y)] = grid[y][x]
+        self.states = states
+        actlist = orientations
+        transitions = {}
+        for s in states:
+            transitions[s] = {}
+            for a in actlist:
+                transitions[s][a] = self.calculate_T(s, a)
+        MDP.__init__(self, init, actlist=actlist,
+                     terminals=terminals, transitions=transitions,
+                     reward=reward, states=states, gamma=gamma)
+
+    def calculate_T(self, state, action):
         if action is None:
             return [(0.0, state)]
         else:
             return [(0.8, self.go(state, action)),
                     (0.1, self.go(state, turn_right(action))),
                     (0.1, self.go(state, turn_left(action)))]
+
+    def T(self, state, action):
+        if action is None:
+            return [(0.0, state)]
+        else:
+            return self.transitions[state][action]
+
     def go(self, state, direction):
         """Return the state that results from going in this direction."""
         state1 = vector_add(state, direction)
@@ -192,3 +252,19 @@ def policy_evaluation(pi, U, mdp, k=20):
 ^ None ^ .
 ^ > ^ <
 """ # noqa
+
+"""
+s = { 'a' : { 'plan1' : [(0.2, 'a'), (0.3, 'b'), (0.3, 'c'), (0.2, 'd')],
+              'plan2' : [(0.4, 'a'), (0.15, 'b'), (0.45, 'c')],
+              'plan3' : [(0.2, 'a'), (0.5, 'b'), (0.3, 'c')],
+              },
+      'b' : { 'plan1' : [(0.2, 'a'), (0.6, 'b'), (0.2, 'c'), (0.1, 'd')],
+              'plan2' : [(0.6, 'a'), (0.2, 'b'), (0.1, 'c'), (0.1, 'd')],
+              'plan3' : [(0.3, 'a'), (0.3, 'b'), (0.4, 'c')],
+              },
+      'c' : { 'plan1' : [(0.3, 'a'), (0.5, 'b'), (0.1, 'c'), (0.1, 'd')],
+              'plan2' : [(0.5, 'a'), (0.3, 'b'), (0.1, 'c'), (0.1, 'd')],
+              'plan3' : [(0.1, 'a'), (0.3, 'b'), (0.1, 'c'), (0.5, 'd')],
+              },
+      }
+"""

pomdp.ipynb

Lines changed: 240 additions & 0 deletions
@@ -0,0 +1,240 @@
# Partially Observable Markov Decision Processes (POMDPs)

This Jupyter notebook acts as supporting material for POMDPs, covered in **Chapter 17 Making Complex Decisions** of the book *Artificial Intelligence: A Modern Approach*. We make use of the implementation of POMDPs in the mdp.py module. This notebook is kept separate from `mdp.ipynb` because the topics are considerably more advanced.

**Note that it is essential to work through and understand the `mdp.ipynb` notebook before diving into this one.**

Let us import everything from the mdp module to get started.
In [1]:

from mdp import *
from notebook import psource, pseudocode
## CONTENTS

1. Overview of MDPs
2. POMDPs - a conceptual outline
3. POMDPs - a rigorous outline
4. Value Iteration
   - Value Iteration Visualization
## 1. OVERVIEW

We first review the Markov property and MDPs as in [Section 17.1] of the book.

- A stochastic process is said to have the **Markov property**, or to have a **Markovian transition model**, if the conditional probability distribution of future states of the process (conditional on both past and present states) depends only on the present state, not on the sequence of events that preceded it.

  -- (Source: [Wikipedia](https://en.wikipedia.org/wiki/Markov_property))

A Markov decision process, or MDP, is defined as:
- a sequential decision problem for a fully observable, stochastic environment with a Markovian transition model and additive rewards.

An MDP consists of a set of states (with an initial state $s_0$); a set $A(s)$ of actions in each state; a transition model $P(s' | s, a)$; and a reward function $R(s)$.

Solving an MDP means making sequential decisions about which states to occupy so as to maximise some combination (typically the discounted sum) of the rewards $R(s)$ received along the way.

The characteristic problem of the MDP is hence to identify the optimal policy function $\pi^*(s)$ that provides the _utility-maximising_ action $a$ to be taken when the current state is $s$.

### Belief vector

**Note**: The book refers to the _belief vector_ as the _belief state_. We use the former terminology here so that we can still refer to the belief vector as a _probability distribution over states_.

The solution of an MDP is subject to certain properties of the problem which are assumed and justified in [Section 17.1]. One critical assumption is that the agent is **fully aware of its current state at all times**.

A tedious (but rewarding, as we will see) way of expressing this is in terms of the **belief vector** $b$ of the agent. The belief vector is a function mapping states to the probabilities, or certainties, of being in those states.

Consider an agent that is fully aware that it is in state $s_i$ in the state space $(s_1, s_2, ... s_n)$ at the current time.

Its belief vector is the vector $(b(s_1), b(s_2), ... b(s_n))$ given by the function $b(s)$:
\begin{align*}
b(s) &= 0 \quad \text{if } s \neq s_i \\ &= 1 \quad \text{if } s = s_i
\end{align*}

Note that $b(s)$ is a probability distribution that necessarily sums to $1$ over all $s$.
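As a quick illustration of the definition above (not part of the notebook's code; the state names are made up), a fully-certain belief vector is simply a one-hot distribution over the states:

# Belief vector of an agent that is certain it is in state s_i.
states = ['s1', 's2', 's3', 's4']
s_i = 's3'
b = {s: (1.0 if s == s_i else 0.0) for s in states}
assert abs(sum(b.values()) - 1.0) < 1e-9   # b sums to 1 over all s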
## 2. POMDPs - a conceptual outline

The POMDP really has only two modifications to the **problem formulation** compared to the MDP.

- **Belief state** - In the real world, the current state of an agent is often not known with complete certainty. This makes the concept of a belief vector extremely relevant. It allows the agent to represent different degrees of certainty with which it _believes_ it is in each state.

- **Evidence percepts** - In the real world, agents often have certain kinds of evidence, collected from sensors. They can use the probability distribution of observed evidence, conditional on state, to consolidate their information. This is a known distribution $P(e\ |\ s)$, with $e$ being the evidence and $s$ the state it is conditioned on.

Consider the world we used for the MDP.

![title](images/grid_mdp.jpg)

#### Using the belief vector
An agent beginning at $(1, 1)$ may not be certain that it is indeed in $(1, 1)$. Consider a belief vector $b$ such that:
\begin{align*}
 b((1,1)) &= 0.8 \\
 b((2,1)) &= 0.1 \\
 b((1,2)) &= 0.1 \\
 b(s) &= 0 \quad \quad \forall \text{ other } s
\end{align*}

By concatenating the rows horizontally, we can represent this as an 11-dimensional vector (omitting the obstacle $(2, 2)$).

Thus, taking $s_1 = (1, 1)$, $s_2 = (2, 1)$, ... $s_{11} = (4, 3)$, we have $b$:

$b = (0.8, 0.1, 0, 0, 0.1, 0, 0, 0, 0, 0, 0)$

This fully represents the certainty to which the agent is aware of its state.

#### Using evidence
The evidence observed here could be the number of adjacent 'walls' or 'dead ends' observed by the agent. We assume that the agent cannot 'orient' the walls - only count them.

In this case, $e$ can take only two values, 1 and 2. This gives $P(e\ |\ s)$ as:
\begin{align*}
 P(e=2\ |\ s) &= \frac{1}{7} \quad \forall \quad s \in \{s_1, s_2, s_4, s_5, s_8, s_9, s_{11}\}\\
 P(e=1\ |\ s) &= \frac{1}{4} \quad \forall \quad s \in \{s_3, s_6, s_7, s_{10}\} \\
 P(e\ |\ s) &= 0 \quad \forall \quad \text{ other } s, e
\end{align*}

Note that the implications of the evidence on the state must be known **a priori** to the agent. Ways of reliably learning this distribution from percepts are beyond the scope of this notebook.
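The worked example above translates directly into arrays. The following is a small sketch using only the numbers from the text; the row-major state ordering (bottom row first, with the obstacle (2, 2) omitted) and the variable names are choices made here for illustration, not part of mdp.py:

import numpy as np

# Belief vector from the example: states ordered row by row from the bottom,
# s1 = (1,1), s2 = (2,1), ..., s11 = (4,3), omitting the obstacle (2,2).
b = np.array([0.8, 0.1, 0, 0, 0.1, 0, 0, 0, 0, 0, 0])
assert abs(b.sum() - 1.0) < 1e-9        # b is a probability distribution

# Sensor model P(e | s) from the example: one row per evidence value
# (row 0 for e = 1, row 1 for e = 2); column i corresponds to state s_{i+1}.
two_wall_states = [0, 1, 3, 4, 7, 8, 10]   # s1, s2, s4, s5, s8, s9, s11
one_wall_states = [2, 5, 6, 9]             # s3, s6, s7, s10
P_e_given_s = np.zeros((2, 11))
P_e_given_s[1, two_wall_states] = 1 / 7    # P(e = 2 | s)
P_e_given_s[0, one_wall_states] = 1 / 4    # P(e = 1 | s)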
## 3. POMDPs - a rigorous outline

A POMDP is thus a sequential decision problem for a *partially* observable, stochastic environment with a Markovian transition model, a known 'sensor model' for inferring state from observation, and additive rewards.

Practically, a POMDP has the following, which an MDP also has:
- a set of states, each denoted by $s$
- a set of actions available in each state, $A(s)$
- a reward accrued on attaining some state, $R(s)$
- a transition probability $P(s'\ |\ s, a)$ of action $a$ changing the state from $s$ to $s'$

And the following, which an MDP does not:
- a sensor model $P(e\ |\ s)$ on evidence conditional on states

Additionally, since the agent is no longer certain of its current state, it also maintains:
- a belief vector $b$ representing the certainty of being in each state (as a probability distribution)


#### New uncertainties

It is useful to intuitively appreciate the new uncertainties that have arisen in the agent's awareness of its own state.

- At any point, the agent has belief vector $b$, the distribution of its believed likelihood of being in each state $s$.
- For each of these states $s$ that the agent may **actually** be in, it has some set of actions given by $A(s)$.
- Each of these actions may transport it to some other state $s'$, assuming an initial state $s$, with probability $P(s'\ |\ s, a)$.
- Once the action is performed, the agent receives a percept $e$. $P(e\ |\ s)$ now tells it the chances of having perceived $e$ for each state $s$. The agent must use this information to update its new belief state appropriately.

#### Evolution of the belief vector - the `FORWARD` function

The new belief vector $b'(s')$ after an action $a$ on the belief vector $b(s)$ and the observation of evidence $e$ is:
$$ b'(s') = \alpha P(e\ |\ s') \sum_s P(s'\ |\ s, a) b(s)$$

where $\alpha$ is a normalising constant (to retain the interpretation of $b'$ as a probability distribution).

This equation simply sums, over every possible previous state $s$, the likelihood of moving from $s$ to $s'$ weighted by the initial likelihood of being in $s$, and then multiplies by the likelihood that the known evidence actually implies the new state $s'$.

This function is represented as `b' = FORWARD(b, a, e)`

#### Probability distribution of the evolving belief vector

The goal here is to find $P(b'\ |\ b, a)$ - the probability that action $a$ transforms belief vector $b$ into belief vector $b'$. The following steps illustrate this.

The probability of observing evidence $e$ when action $a$ is enacted on belief vector $b$ can be distributed over each possible new state $s'$ resulting from it:
\begin{align*}
 P(e\ |\ b, a) &= \sum_{s'} P(e\ |\ b, a, s') P(s'\ |\ b, a) \\
 &= \sum_{s'} P(e\ |\ s') P(s'\ |\ b, a) \\
 &= \sum_{s'} P(e\ |\ s') \sum_s P(s'\ |\ s, a) b(s)
\end{align*}

The probability of getting belief vector $b'$ from $b$ by application of action $a$ can thus be summed over all possible evidences $e$:
\begin{align*}
 P(b'\ |\ b, a) &= \sum_{e} P(b'\ |\ b, a, e) P(e\ |\ b, a) \\
 &= \sum_{e} P(b'\ |\ b, a, e) \sum_{s'} P(e\ |\ s') \sum_s P(s'\ |\ s, a) b(s)
\end{align*}

where $P(b'\ |\ b, a, e) = 1$ if $b' = $ `FORWARD(b, a, e)` and $= 0$ otherwise.

Given initial and final belief states $b$ and $b'$, the transition probabilities still depend on the action $a$ and the observed evidence $e$: some belief states may be reachable by certain actions yet assign non-zero probability to states ruled out by the evidence $e$. The condition above ensures that only valid combinations of $(b', b, a, e)$ are considered.

#### A modified reward space

For MDPs, the reward space was simple - one reward per available state. However, for a belief vector $b(s)$, the expected reward is now:
$$\rho(b) = \sum_s b(s) R(s)$$

Thus, as the belief vector ranges over the infinitely many possible distributions over states, the reward $\rho(b)$ varies with it; being a linear combination of the components of $b$, it traces out a hyperplane in belief space.
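The belief update and the reward over beliefs are easy to prototype. The sketch below is not code from mdp.py (which does not yet implement POMDPs in this commit); forward, expected_reward and the sensor argument are names chosen here for illustration, with T(s, a) returning (probability, next_state) pairs in the same format used by the MDP class:

def forward(b, a, e, T, sensor, states):
    """One belief update b' = FORWARD(b, a, e).

    b      : dict mapping state -> probability (the current belief vector)
    T      : T(s, a) -> list of (probability, next_state) pairs, as in mdp.py
    sensor : sensor(e, s) -> P(e | s)
    Assumes the evidence e is possible under at least one state.
    """
    b_new = {}
    for s_next in states:
        # sum_s P(s' | s, a) b(s), then weight by the evidence likelihood P(e | s')
        pred = sum(p * b[s] for s in states
                   for (p, s2) in T(s, a) if s2 == s_next)
        b_new[s_next] = sensor(e, s_next) * pred
    alpha = 1.0 / sum(b_new.values())      # normalise so that b' sums to 1
    return {s: alpha * p for s, p in b_new.items()}


def expected_reward(b, R):
    """rho(b) = sum_s b(s) R(s) for a belief vector b and reward function R."""
    return sum(b[s] * R(s) for s in b)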
In [ ]:

In [ ]:

(two empty code cells)

Notebook metadata: Python 3 kernel, Python 3.6.1, nbformat 4.2.

0 commit comments
