diff --git a/mdp.ipynb b/mdp.ipynb
index 910b49040..4c44ff9d8 100644
--- a/mdp.ipynb
+++ b/mdp.ipynb
@@ -1,7 +1,7 @@
 {
 "cells": [
 {
- "cell_type": "markdown",
+ "cell_type": "raw",
 "metadata": {},
 "source": [
 "# Markov decision processes (MDPs)\n",
@@ -10,19 +10,24 @@
 ]
 },
 {
- "cell_type": "code",
- "execution_count": 1,
+<<<<<<< HEAD
+ "cell_type": "raw",
 "metadata": {
 "collapsed": true
 },
+=======
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
 "outputs": [],
+>>>>>>> 3fed6614295b7270ca1226415beff7305e387eeb
 "source": [
 "from mdp import *\n",
 "from notebook import psource, pseudocode"
 ]
 },
 {
- "cell_type": "markdown",
+ "cell_type": "raw",
 "metadata": {},
 "source": [
 "## CONTENTS\n",
@@ -36,7 +41,7 @@
 ]
 },
 {
- "cell_type": "markdown",
+ "cell_type": "raw",
 "metadata": {},
 "source": [
 "## OVERVIEW\n",
@@ -56,7 +61,7 @@
 ]
 },
 {
- "cell_type": "markdown",
+ "cell_type": "raw",
 "metadata": {},
 "source": [
 "## MDP\n",
@@ -65,162 +70,21 @@
 ]
 },
 {
+<<<<<<< HEAD
+ "cell_type": "raw",
+ "metadata": {},
+=======
 "cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
 "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "class MDP:\n",
- "\n",
- " """A Markov Decision Process, defined by an initial state, transition model,\n",
- " and reward function. We also keep track of a gamma value, for use by\n",
- " algorithms. The transition model is represented somewhat differently from\n",
- " the text. Instead of P(s' | s, a) being a probability number for each\n",
- " state/state/action triplet, we instead have T(s, a) return a\n",
- " list of (p, s') pairs. We also keep track of the possible states,\n",
- " terminal states, and actions for each state. [page 646]"""\n",
- "\n",
- " def __init__(self, init, actlist, terminals, transitions={}, states=None, gamma=.9):\n",
- " if not (0 < gamma <= 1):\n",
- " raise ValueError("An MDP must have 0 < gamma <= 1")\n",
- "\n",
- " if states:\n",
- " self.states = states\n",
- " else:\n",
- " self.states = set()\n",
- " self.init = init\n",
- " self.actlist = actlist\n",
- " self.terminals = terminals\n",
- " self.transitions = transitions\n",
- " self.gamma = gamma\n",
- " self.reward = {}\n",
- "\n",
- " def R(self, state):\n",
- " """Return a numeric reward for this state."""\n",
- " return self.reward[state]\n",
- "\n",
- " def T(self, state, action):\n",
- " """Transition model. From a state and an action, return a list\n",
- " of (probability, result-state) pairs."""\n",
- " if(self.transitions == {}):\n",
- " raise ValueError("Transition model is missing")\n",
- " else:\n",
- " return self.transitions[state][action]\n",
- "\n",
- " def actions(self, state):\n",
- " """Set of actions that can be performed in this state. By default, a\n",
- " fixed list of actions, except for terminal states. Override this\n",
- " method if you need to specialize by state."""\n",
- " if state in self.terminals:\n",
- " return [None]\n",
- " else:\n",
- " return self.actlist\n",
- "
class GridMDP(MDP):\n",
- "\n",
- " """A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is\n",
- " specify the grid as a list of lists of rewards; use None for an obstacle\n",
- " (unreachable state). Also, you should specify the terminal states.\n",
- " An action is an (x, y) unit vector; e.g. (1, 0) means move east."""\n",
- "\n",
- " def __init__(self, grid, terminals, init=(0, 0), gamma=.9):\n",
- " grid.reverse() # because we want row 0 on bottom, not on top\n",
- " MDP.__init__(self, init, actlist=orientations,\n",
- " terminals=terminals, gamma=gamma)\n",
- " self.grid = grid\n",
- " self.rows = len(grid)\n",
- " self.cols = len(grid[0])\n",
- " for x in range(self.cols):\n",
- " for y in range(self.rows):\n",
- " self.reward[x, y] = grid[y][x]\n",
- " if grid[y][x] is not None:\n",
- " self.states.add((x, y))\n",
- "\n",
- " def T(self, state, action):\n",
- " if action is None:\n",
- " return [(0.0, state)]\n",
- " else:\n",
- " return [(0.8, self.go(state, action)),\n",
- " (0.1, self.go(state, turn_right(action))),\n",
- " (0.1, self.go(state, turn_left(action)))]\n",
- "\n",
- " def go(self, state, direction):\n",
- " """Return the state that results from going in this direction."""\n",
- " state1 = vector_add(state, direction)\n",
- " return state1 if state1 in self.states else state\n",
- "\n",
- " def to_grid(self, mapping):\n",
- " """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid."""\n",
- " return list(reversed([[mapping.get((x, y), None)\n",
- " for x in range(self.cols)]\n",
- " for y in range(self.rows)]))\n",
- "\n",
- " def to_arrows(self, policy):\n",
- " chars = {\n",
- " (1, 0): '>', (0, 1): '^', (-1, 0): '<', (0, -1): 'v', None: '.'}\n",
- " return self.to_grid({s: chars[a] for (s, a) in policy.items()})\n",
- "
def value_iteration(mdp, epsilon=0.001):\n",
- " """Solving an MDP by value iteration. [Figure 17.4]"""\n",
- " U1 = {s: 0 for s in mdp.states}\n",
- " R, T, gamma = mdp.R, mdp.T, mdp.gamma\n",
- " while True:\n",
- " U = U1.copy()\n",
- " delta = 0\n",
- " for s in mdp.states:\n",
- " U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)])\n",
- " for a in mdp.actions(s)])\n",
- " delta = max(delta, abs(U1[s] - U[s]))\n",
- " if delta < epsilon * (1 - gamma) / gamma:\n",
- " return U\n",
- "
def expected_utility(a, s, U, mdp):\n",
- " """The expected utility of doing a in state s, according to the MDP and U."""\n",
- " return sum([p * U[s1] for (p, s1) in mdp.T(s, a)])\n",
- "
def policy_iteration(mdp):\n",
- " """Solve an MDP by policy iteration [Figure 17.7]"""\n",
- " U = {s: 0 for s in mdp.states}\n",
- " pi = {s: random.choice(mdp.actions(s)) for s in mdp.states}\n",
- " while True:\n",
- " U = policy_evaluation(pi, U, mdp)\n",
- " unchanged = True\n",
- " for s in mdp.states:\n",
- " a = argmax(mdp.actions(s), key=lambda a: expected_utility(a, s, U, mdp))\n",
- " if a != pi[s]:\n",
- " pi[s] = a\n",
- " unchanged = False\n",
- " if unchanged:\n",
- " return pi\n",
- "
def policy_evaluation(pi, U, mdp, k=20):\n",
- " """Return an updated utility mapping U from each state in the MDP to its\n",
- " utility, using an approximation (modified policy iteration)."""\n",
- " R, T, gamma = mdp.R, mdp.T, mdp.gamma\n",
- " for i in range(k):\n",
- " for s in mdp.states:\n",
- " U[s] = R(s) + gamma * sum([p * U[s1] for (p, s1) in T(s, pi[s])])\n",
- " return U\n",
- "
def T(self, state, action):\n",
- " if action is None:\n",
- " return [(0.0, state)]\n",
- " else:\n",
- " return [(0.8, self.go(state, action)),\n",
- " (0.1, self.go(state, turn_right(action))),\n",
- " (0.1, self.go(state, turn_left(action)))]\n",
- "
def to_arrows(self, policy):\n",
- " chars = {\n",
- " (1, 0): '>', (0, 1): '^', (-1, 0): '<', (0, -1): 'v', None: '.'}\n",
- " return self.to_grid({s: chars[a] for (s, a) in policy.items()})\n",
- "
def to_grid(self, mapping):\n",
- " """Convert a mapping from (x, y) to v into a [[..., v, ...]] grid."""\n",
- " return list(reversed([[mapping.get((x, y), None)\n",
- " for x in range(self.cols)]\n",
- " for y in range(self.rows)]))\n",
- "