Fix MDP class and add POMDP subclass and notebook #781

Merged: 14 commits, Mar 2, 2018

1,573 changes: 333 additions & 1,240 deletions mdp.ipynb

Large diffs are not rendered by default.

100 changes: 88 additions & 12 deletions mdp.py
@@ -21,20 +21,36 @@ class MDP:
list of (p, s') pairs. We also keep track of the possible states,
terminal states, and actions for each state. [page 646]"""

def __init__(self, init, actlist, terminals, transitions={}, states=None, gamma=.9):
def __init__(self, init, actlist, terminals, transitions = {}, reward = None, states=None, gamma=.9):
Collaborator review comment: No spaces around = for keyword arguments.

if not (0 < gamma <= 1):
raise ValueError("An MDP must have 0 < gamma <= 1")

if states:
self.states = states
else:
self.states = set()
## collect states from transitions table
self.states = self.get_states_from_transitions(transitions)


self.init = init
self.actlist = actlist

if isinstance(actlist, list):
## if actlist is a list, all states have the same actions
self.actlist = actlist
elif isinstance(actlist, dict):
## if actlist is a dict, different actions for each state
self.actlist = actlist

self.terminals = terminals
self.transitions = transitions
if self.transitions == {}:
print("Warning: Transition table is empty.")
self.gamma = gamma
self.reward = {}
if reward:
self.reward = reward
else:
self.reward = {s : 0 for s in self.states}
#self.check_consistency()

def R(self, state):
"""Return a numeric reward for this state."""
@@ -57,6 +73,34 @@ def actions(self, state):
else:
return self.actlist

def get_states_from_transitions(self, transitions):
if isinstance(transitions, dict):
s1 = set(transitions.keys())
s2 = set([tr[1] for actions in transitions.values()
for effects in actions.values() for tr in effects])
return s1.union(s2)
else:
print('Could not retrieve states from transitions')
return None

def check_consistency(self):
# check that all states in transitions are valid
assert set(self.states) == self.get_states_from_transitions(self.transitions)
# check that init is a valid state
assert self.init in self.states
# check reward for each state
#assert set(self.reward.keys()) == set(self.states)
assert set(self.reward.keys()) == set(self.states)
# check that all terminals are valid states
assert all([t in self.states for t in self.terminals])
# check that probability distributions for all actions sum to 1
for s1, actions in self.transitions.items():
for a in actions.keys():
s = 0
for o in actions[a]:
s += o[0]
assert abs(s - 1) < 0.001


class GridMDP(MDP):

@@ -67,25 +111,41 @@ class GridMDP(MDP):

def __init__(self, grid, terminals, init=(0, 0), gamma=.9):
grid.reverse() # because we want row 0 on bottom, not on top
MDP.__init__(self, init, actlist=orientations,
terminals=terminals, gamma=gamma)
self.grid = grid
reward = {}
states = set()
self.rows = len(grid)
self.cols = len(grid[0])
self.grid = grid
for x in range(self.cols):
for y in range(self.rows):
self.reward[x, y] = grid[y][x]
if grid[y][x] is not None:
self.states.add((x, y))

def T(self, state, action):
states.add((x, y))
reward[(x, y)] = grid[y][x]
self.states = states
actlist = orientations
transitions = {}
for s in states:
transitions[s] = {}
for a in actlist:
transitions[s][a] = self.calculate_T(s, a)
MDP.__init__(self, init, actlist=actlist,
terminals=terminals, transitions = transitions,
reward = reward, states = states, gamma=gamma)

def calculate_T(self, state, action):
if action is None:
return [(0.0, state)]
else:
return [(0.8, self.go(state, action)),
(0.1, self.go(state, turn_right(action))),
(0.1, self.go(state, turn_left(action)))]


def T(self, state, action):
if action is None:
return [(0.0, state)]
else:
return self.transitions[state][action]

def go(self, state, direction):
"""Return the state that results from going in this direction."""
state1 = vector_add(state, direction)
@@ -192,3 +252,19 @@ def policy_evaluation(pi, U, mdp, k=20):
^ None ^ .
^ > ^ <
""" # noqa

"""
s = { 'a' : { 'plan1' : [(0.2, 'a'), (0.3, 'b'), (0.3, 'c'), (0.2, 'd')],
'plan2' : [(0.4, 'a'), (0.15, 'b'), (0.45, 'c')],
'plan3' : [(0.2, 'a'), (0.5, 'b'), (0.3, 'c')],
},
'b' : { 'plan1' : [(0.2, 'a'), (0.6, 'b'), (0.1, 'c'), (0.1, 'd')],
'plan2' : [(0.6, 'a'), (0.2, 'b'), (0.1, 'c'), (0.1, 'd')],
'plan3' : [(0.3, 'a'), (0.3, 'b'), (0.4, 'c')],
},
'c' : { 'plan1' : [(0.3, 'a'), (0.5, 'b'), (0.1, 'c'), (0.1, 'd')],
'plan2' : [(0.5, 'a'), (0.3, 'b'), (0.1, 'c'), (0.1, 'd')],
'plan3' : [(0.1, 'a'), (0.3, 'b'), (0.1, 'c'), (0.5, 'd')],
},
}
"""
240 changes: 240 additions & 0 deletions pomdp.ipynb
@@ -0,0 +1,240 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Partially Observable Markov decision processes (POMDPs)\n",
"\n",
"This Jupyter notebook acts as supporting material for POMDPs, covered in **Chapter 17 Making Complex Decisions** of the book* Artificial Intelligence: A Modern Approach*. We make use of the implementations of POMPDPs in mdp.py module. This notebook has been separated from the notebook `mdp.py` as the topics are considerably more advanced.\n",
"\n",
"**Note that it is essential to work through and understand the mdp.ipynb notebook before diving into this one.**\n",
"\n",
"Let us import everything from the mdp module to get started."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from mdp import *\n",
"from notebook import psource, pseudocode"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CONTENTS\n",
"\n",
"1. Overview of MDPs\n",
"2. POMDPs - a conceptual outline\n",
"3. POMDPs - a rigorous outline\n",
"4. Value Iteration\n",
" - Value Iteration Visualization"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. OVERVIEW\n",
"\n",
"We first review Markov property and MDPs as in [Section 17.1] of the book.\n",
"\n",
"- A stochastic process is said to have the **Markov property**, or to have a **Markovian transition model** if the conditional probability distribution of future states of the process (conditional on both past and present states) depends only on the present state, not on the sequence of events that preceded it.\n",
"\n",
" -- (Source: [Wikipedia](https://en.wikipedia.org/wiki/Markov_property))\n",
"\n",
"A Markov decision process or MDP is defined as:\n",
"- a sequential decision problem for a fully observable, stochastic environment with a Markovian transition model and additive rewards.\n",
"\n",
"An MDP consists of a set of states (with an initial state $s_0$); a set $A(s)$ of actions\n",
"in each state; a transition model $P(s' | s, a)$; and a reward function $R(s)$.\n",
"\n",
"The MDP seeks to make sequential decisions to occupy states so as to maximise some combination of the reward function $R(s)$.\n",
"\n",
"The characteristic problem of the MDP is hence to identify the optimal policy function $\\pi^*(s)$ that provides the _utility-maximising_ action $a$ to be taken when the current state is $s$.\n",
"\n",
"### Belief vector\n",
"\n",
"**Note**: The book refers to the _belief vector_ as the _belief state_. We use the latter terminology here to retain our ability to refer to the belief vector as a _probability distribution over states_.\n",
"\n",
"The solution of an MDP is subject to certain properties of the problem which are assumed and justified in [Section 17.1]. One critical assumption is that the agent is **fully aware of its current state at all times**.\n",
"\n",
"A tedious (but rewarding, as we will see) way of expressing this is in terms of the **belief vector** $b$ of the agent. The belief vector is a function mapping states to probabilities or certainties of being in those states.\n",
"\n",
"Consider an agent that is fully aware that it is in state $s_i$ in the statespace $(s_1, s_2, ... s_n)$ at the current time.\n",
"\n",
"Its belief vector is the vector $(b(s_1), b(s_2), ... b(s_n))$ given by the function $b(s)$:\n",
"\\begin{align*}\n",
"b(s) &= 0 \\quad \\text{if }s \\neq s_i \\\\ &= 1 \\quad \\text{if } s = s_i\n",
"\\end{align*}\n",
"\n",
"Note that $b(s)$ is a probability distribution that necessarily sums to $1$ over all $s$.\n",
"\n"
]
},
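{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Illustrative sketch (editor's addition, not part of the original notebook):\n",
"# the belief vector of a fully aware agent, represented as a plain dict over\n",
"# a hypothetical state space s1 ... s5. All names here are made up.\n",
"states = ['s1', 's2', 's3', 's4', 's5']\n",
"known_state = 's3'  # the agent is certain it is in s3\n",
"b = {s: (1.0 if s == known_state else 0.0) for s in states}\n",
"print(b)\n",
"# b is a valid probability distribution over the states\n",
"assert abs(sum(b.values()) - 1) < 1e-9"
]
},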
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## 2. POMDPs - a conceptual outline\n",
"\n",
"The POMDP really has only two modifications to the **problem formulation** compared to the MDP.\n",
"\n",
"- **Belief state** - In the real world, the current state of an agent is often not known with complete certainty. This makes the concept of a belief vector extremely relevant. It allows the agent to represent different degrees of certainty with which it _believes_ it is in each state.\n",
"\n",
"- **Evidence percepts** - In the real world, agents often have certain kinds of evidence, collected from sensors. They can use the probability distribution of observed evidence, conditional on state, to consolidate their information. This is a known distribution $P(e\\ |\\ s)$ - $e$ being an evidence, and $s$ being the state it is conditional on.\n",
"\n",
"Consider the world we used for the MDP. \n",
"\n",
"![title](images/grid_mdp.jpg)\n",
"\n",
"#### Using the belief vector\n",
"An agent beginning at $(1, 1)$ may not be certain that it is indeed in $(1, 1)$. Consider a belief vector $b$ such that:\n",
"\\begin{align*}\n",
" b((1,1)) &= 0.8 \\\\\n",
" b((2,1)) &= 0.1 \\\\\n",
" b((1,2)) &= 0.1 \\\\\n",
" b(s) &= 0 \\quad \\quad \\forall \\text{ other } s\n",
"\\end{align*}\n",
"\n",
"By horizontally catenating each row, we can represent this as an 11-dimensional vector (omitting $(2, 2)$).\n",
"\n",
"Thus, taking $s_1 = (1, 1)$, $s_2 = (1, 2)$, ... $s_{11} = (4,3)$, we have $b$:\n",
"\n",
"$b = (0.8, 0.1, 0, 0, 0.1, 0, 0, 0, 0, 0, 0)$ \n",
"\n",
"This fully represents the certainty to which the agent is aware of its state.\n",
"\n",
"#### Using evidence\n",
"The evidence observed here could be the number of adjacent 'walls' or 'dead ends' observed by the agent. We assume that the agent cannot 'orient' the walls - only count them.\n",
"\n",
"In this case, $e$ can take only two values, 1 and 2. This gives $P(e\\ |\\ s)$ as:\n",
"\\begin{align*}\n",
" P(e=2\\ |\\ s) &= \\frac{1}{7} \\quad \\forall \\quad s \\in \\{s_1, s_2, s_4, s_5, s_8, s_9, s_{11}\\}\\\\\n",
" P(e=1\\ |\\ s) &= \\frac{1}{4} \\quad \\forall \\quad s \\in \\{s_3, s_6, s_7, s_{10}\\} \\\\\n",
" P(e\\ |\\ s) &= 0 \\quad \\forall \\quad \\text{ other } s, e\n",
"\\end{align*}\n",
"\n",
"Note that the implications of the evidence on the state must be known **a priori** to the agent. Ways of reliably learning this distribution from percepts are beyond the scope of this notebook."
]
},
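{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Illustrative sketch (editor's addition, not part of the original notebook):\n",
"# the example belief vector over the 11 grid states and a deterministic\n",
"# wall-count sensor model, using the abstract state labels s1 ... s11 above.\n",
"states = ['s' + str(i) for i in range(1, 12)]\n",
"\n",
"# b = (0.8, 0.1, 0, 0, 0.1, 0, 0, 0, 0, 0, 0)\n",
"belief = dict(zip(states, [0.8, 0.1, 0, 0, 0.1, 0, 0, 0, 0, 0, 0]))\n",
"assert abs(sum(belief.values()) - 1) < 1e-9\n",
"\n",
"# P(e | s): states in two_walls always yield the percept e = 2, the rest e = 1\n",
"two_walls = {'s1', 's2', 's4', 's5', 's8', 's9', 's11'}\n",
"sensor = {s: ({2: 1.0, 1: 0.0} if s in two_walls else {2: 0.0, 1: 1.0})\n",
"          for s in states}\n",
"\n",
"# Probability of perceiving each wall count under this belief:\n",
"# P(e | b) = sum_s P(e | s) b(s)\n",
"for e in (1, 2):\n",
"    print(e, sum(sensor[s][e] * belief[s] for s in states))"
]
},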
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. POMDPs - a rigorous outline\n",
"\n",
"A POMDP is thus a sequential decision problem for for a *partially* observable, stochastic environment with a Markovian transition model, a known 'sensor model' for inferring state from observation, and additive rewards. \n",
"\n",
"Practically, a POMDP has the following, which an MDP also has:\n",
"- a set of states, each denoted by $s$\n",
"- a set of actions available in each state, $A(s)$\n",
"- a reward accrued on attaining some state, $R(s)$\n",
"- a transition probability $P(s'\\ |\\ s, a)$ of action $a$ changing the state from $s$ to $s'$\n",
"\n",
"And the following, which an MDP does not:\n",
"- a sensor model $P(e\\ |\\ s)$ on evidence conditional on states\n",
"\n",
"Additionally, the POMDP is now uncertain of its current state hence has:\n",
"- a belief vector $b$ representing the certainty of being in each state (as a probability distribution)\n",
"\n",
"\n",
"#### New uncertainties\n",
"\n",
"It is useful to intuitively appreciate the new uncertainties that have arisen in the agent's awareness of its own state.\n",
"\n",
"- At any point, the agent has belief vector $b$, the distribution of its believed likelihood of being in each state $s$.\n",
"- For each of these states $s$ that the agent may **actually** be in, it has some set of actions given by $A(s)$.\n",
"- Each of these actions may transport it to some other state $s'$, assuming an initial state $s$, with probability $P(s'\\ |\\ s, a)$\n",
"- Once the action is performed, the agent receives a percept $e$. $P(e\\ |\\ s)$ now tells it the chances of having perceived $e$ for each state $s$. The agent must use this information to update its new belief state appropriately.\n",
"\n",
"#### Evolution of the belief vector - the `FORWARD` function\n",
"\n",
"The new belief vector $b'(s')$ after an action $a$ on the belief vector $b(s)$ and the noting of evidence $e$ is:\n",
"$$ b'(s') = \\alpha P(e\\ |\\ s') \\sum_s P(s'\\ | s, a) b(s)$$ \n",
"\n",
"where $\\alpha$ is a normalising constant (to retain the interpretation of $b$ as a probability distribution.\n",
"\n",
"This equation is just counts the sum of likelihoods of going to a state $s'$ from every possible state $s$, times the initial likelihood of being in each $s$. This is multiplied by the likelihood that the known evidence actually implies the new state $s'$. \n",
"\n",
"This function is represented as `b' = FORWARD(b, a, e)`\n",
"\n",
"#### Probability distribution of the evolving belief vector\n",
"\n",
"The goal here is to find $P(b'\\ |\\ b, a)$ - the probability that action $a$ transforms belief vector $b$ into belief vector $b'$. The following steps illustrate this -\n",
"\n",
"The probability of observing evidence $e$ when action $a$ is enacted on belief vector $b$ can be distributed over each possible new state $s'$ resulting from it:\n",
"\\begin{align*}\n",
" P(e\\ |\\ b, a) &= \\sum_{s'} P(e\\ |\\ b, a, s') P(s'\\ |\\ b, a) \\\\\n",
" &= \\sum_{s'} P(e\\ |\\ s') P(s'\\ |\\ b, a) \\\\\n",
" &= \\sum_{s'} P(e\\ |\\ s') \\sum_s P(s'\\ |\\ s, a) b(s)\n",
"\\end{align*}\n",
"\n",
"The probability of getting belief vector $b'$ from $b$ by application of action $a$ can thus be summed over all possible evidences $e$:\n",
"\\begin{align*}\n",
" P(b'\\ |\\ b, a) &= \\sum_{e} P(b'\\ |\\ b, a, e) P(e\\ |\\ b, a) \\\\\n",
" &= \\sum_{e} P(b'\\ |\\ b, a, e) \\sum_{s'} P(e\\ |\\ s') \\sum_s P(s'\\ |\\ s, a) b(s)\n",
"\\end{align*}\n",
"\n",
"where $P(b'\\ |\\ b, a, e) = 1$ if $b' = $ `FORWARD(b, a, e)` and $= 0$ otherwise.\n",
"\n",
"Given initial and final belief states $b$ and $b'$, the transition probabilities still depend on the action $a$ and observed evidence $e$. Some belief states may be achievable by certain actions, but have non-zero probabilities for states prohibited by the evidence $e$. Thus, the above condition thus ensures that only valid combinations of $(b', b, a, e)$ are considered.\n",
"\n",
"#### A modified rewardspace\n",
"\n",
"For MDPs, the reward space was simple - one reward per available state. However, for a belief vector $b(s)$, the expected reward is now:\n",
"$$\\rho(b) = \\sum_s b(s) R(s)$$\n",
"\n",
"Thus, as the belief vector can take infinite values of the distribution over states, so can the reward for each belief vector vary over a hyperplane in the belief space, or space of states (planes in an $N$-dimensional space are formed by a linear combination of the axes)."
]
},
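{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"# Illustrative sketch (editor's addition, not part of the original notebook):\n",
"# the FORWARD belief update and the expected reward rho(b), written against\n",
"# hypothetical dictionaries T[s][a] = [(p, s'), ...], sensor[s][e] = P(e | s)\n",
"# and R[s].\n",
"def forward(b, a, e, T, sensor):\n",
"    # b'(s') = alpha * P(e | s') * sum_s P(s' | s, a) b(s)\n",
"    new_b = {}\n",
"    for s1 in b:                     # s' ranges over the same state set\n",
"        total = 0.0\n",
"        for s, prob_s in b.items():  # sum over possible prior states s\n",
"            total += sum(p for (p, dest) in T[s][a] if dest == s1) * prob_s\n",
"        new_b[s1] = sensor[s1].get(e, 0.0) * total\n",
"    alpha = sum(new_b.values())      # normalise so b' is a distribution\n",
"    return {s: v / alpha for s, v in new_b.items()} if alpha > 0 else new_b\n",
"\n",
"def rho(b, R):\n",
"    # expected reward of belief b: sum_s b(s) R(s)\n",
"    return sum(b[s] * R[s] for s in b)\n",
"\n",
"# Tiny two-state example; all names and numbers are made up.\n",
"T = {'s1': {'go': [(0.9, 's2'), (0.1, 's1')]},\n",
"     's2': {'go': [(1.0, 's2')]}}\n",
"sensor = {'s1': {'beep': 0.2}, 's2': {'beep': 0.9}}  # P(e='beep' | s)\n",
"R = {'s1': 0.0, 's2': 1.0}\n",
"b = {'s1': 0.5, 's2': 0.5}\n",
"b_new = forward(b, 'go', 'beep', T, sensor)\n",
"print(b_new, rho(b_new, R))"
]
},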
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}